3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
18 #define ATOMIC_SIZE 32
// Per-compiler alignment and atomic helpers. Every branch supplies ALIGN/ATOMIC
// (16- and 32-byte alignment), MEMORY_BARRIER (an SSE store fence) and an
// atomic counter type with increment, decrement and add operations.
// Apple toolchains: counters go through the OSAtomic *Barrier routines.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: counters use the Win32 Interlocked* functions.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Other GCC targets: counters use the __sync builtins.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(32)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec alignment and the Interlocked* functions without the cast.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(32)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment, no barrier, and plain (non-atomic)
// integer arithmetic for the counter operations.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Shim for GCC releases older than 4.6 (clang excluded): _mm_cvtss_f32 is
// emulated with the vector-extract builtin, presumably because those releases
// do not ship the intrinsic.
// The version test must spell __GNUC__ in full (a bare __GNUC is undefined and
// evaluates to 0) and must only look at the minor number when the major is 4;
// otherwise the shim is applied to every GCC release.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Release routine that pairs with the _mm_malloc based MM_MALLOC/MM_CALLOC.
103 #define MM_FREE _mm_free
// Alternative definitions that map the same names onto the C library allocator.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, colour and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the array count and is
// used to size per-array tables such as DPSOFTRAST_State_Triangle.attribs.
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One texture object stored in the dpsoftrast.texture array.
126 typedef struct DPSOFTRAST_Texture_s
// sampling filter, assigned by DPSOFTRAST_Texture_Filter
133 DPSOFTRAST_TEXTURE_FILTER filter;
// bind counter, declared with the platform's atomic counter type
136 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated in DPSOFTRAST_Texture_New
137 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are rounded up to COMMAND_SIZE bytes (see
// DPSOFTRAST_ALIGNCOMMAND) and declared with the ALIGN attribute.
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every command record: the opcode and the record size.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode stored by DPSOFTRAST_AllocateCommand when the tail of the pool is
// too small to hold the next command.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// Declares one command type: the constant DPSOFTRAST_OPCODE_<name> with value
// opcodeval, and an aligned struct type DPSOFTRAST_Command_<name> that starts
// with the common opcode/commandsize header.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool.
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound on a single command's size; its use is not visible here.
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Pool that holds the queued command records.
166 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw storage for the command records, addressed by byte offset
170 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle data referenced by the spans generated from that triangle.
174 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per vertex array, three rows of 4 floats: row [0] is scaled by the span's x,
// row [1] by the span's y, row [2] is the constant term (see DPSOFTRAST_CALCATTRIB)
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues. The rows are read with _mm_load_ps;
// attribs is declared with ALIGN in DPSOFTRAST_State_Triangle.
182 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
183 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
184 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
185 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
186 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB for float[4] outputs:
//   slope[k] = attribs[arrayindex][0][k]
//   data[k]  = attribs[arrayindex][2][k] + span->x * slope[k] + span->y * attribs[arrayindex][1][k]
// triangle and span are pointer expressions; data and slope are arrays of at
// least four floats. Every argument is parenthesized so that expressions such
// as `spans + i` can be passed, matching DPSOFTRAST_CALCATTRIB.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
// NOTE(review): the use of this limit is not visible in this part of the file.
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels on a single framebuffer row, generated from a triangle.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread span, triangle and pixel-mask arrays in DPSOFTRAST_State_Thread.
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bit flags kept in DPSOFTRAST_State_Thread.validate; each names a group of
// derived values that DPSOFTRAST_Validate recomputes when the bit is set.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc selects
// one from the source/destination blend factors and the blend-subtract flag;
// factor pairs it does not list resolve to DPSOFTRAST_BLENDMODE_OPAQUE.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread. The Interpret_* functions write the
// raw state; the DPSOFTRAST_Recalc* functions fill in the derived fb_* values.
239 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
258 float polygonoffset[2];
// clip plane rescaled by DPSOFTRAST_RecalcClipPlane with the viewport scale and center
260 ALIGN(float fb_clipplane[4]);
263 int shader_permutation;
264 int shader_exactspecularmath;
// textures bound per unit; DPSOFTRAST_Texture_Grow rebases these pointers when the texture array is reallocated
266 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
268 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
269 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
271 // DPSOFTRAST_VALIDATE_ flags
274 // derived values (DPSOFTRAST_VALIDATE_FB)
// written by DPSOFTRAST_RecalcViewport
277 ALIGN(float fb_viewportcenter[4]);
278 ALIGN(float fb_viewportscale[4]);
280 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
283 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// command pool offset for this thread; DPSOFTRAST_Draw_FreeCommandPool measures its distance from the pool's free offset
292 ATOMIC(volatile int commandoffset);
// handshake flags used together with drawmutex/drawcond/waitcond in DPSOFTRAST_Draw_FreeCommandPool
294 volatile bool waiting;
295 volatile bool starving;
// per-thread working arrays sized by the DPSOFTRAST_DRAW_MAX* limits
302 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
303 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
304 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
306 DPSOFTRAST_State_Thread);
// Global rasterizer state used by the public DPSOFTRAST_* entry points.
308 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer targets: one depth buffer and up to four colour buffers (NULL entries are tested for when clearing)
312 unsigned int *fb_depthpixels;
313 unsigned int *fb_colorpixels[4];
// viewport transform, refreshed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
316 ALIGN(float fb_viewportcenter[4]);
317 ALIGN(float fb_viewportscale[4]);
320 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
321 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex array pointers
323 const float *pointer_vertex3f;
324 const float *pointer_color4f;
325 const unsigned char *pointer_color4ub;
326 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
329 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
330 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// bound textures; rebased by DPSOFTRAST_Texture_Grow when the texture array moves
331 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
335 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
336 float *screencoord4f;
342 int shader_permutation;
343 int shader_exactspecularmath;
// texture table: texture_firstfree is where DPSOFTRAST_Texture_New starts looking for an unused slot
347 int texture_firstfree;
348 DPSOFTRAST_Texture *texture;
// message describing the most recent rejected request
353 const char *errorstring;
// array of numthreads per-thread states
358 DPSOFTRAST_State_Thread *threads;
// pool offset published by DPSOFTRAST_Draw_SyncCommands
360 ATOMIC(volatile int drawcommand);
362 DPSOFTRAST_State_Command_Pool commandpool;
// the single global rasterizer instance
366 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating-point depth to the integer depth
// buffer format (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
// NOTE(review): depth offset constant; its use is not visible in this part of the file.
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float colour components (nominally 0..1) into a 32-bit BGRA8
// value: blue in bits 0-7, green in 8-15, red in 16-23, alpha in 24-31, each
// rounded to the nearest integer. Arguments are parenthesized so that
// expressions (e.g. `a + b`) are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth format: DEPTHSCALE * (1 - d),
// truncated toward zero.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations of the span depth test and depth write routines; their
// definitions are not in this part of the file.
373 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
374 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
376 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
378 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
379 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
380 fb_viewportcenter[3] = 0.5f;
381 fb_viewportcenter[0] = 0.0f;
382 fb_viewportscale[1] = 0.5f * viewport[2];
383 fb_viewportscale[2] = -0.5f * viewport[3];
384 fb_viewportscale[3] = 0.5f;
385 fb_viewportscale[0] = 1.0f;
388 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
390 if (dpsoftrast.interlace)
392 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
394 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
395 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
399 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
400 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
404 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
406 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
407 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
408 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
409 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
410 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything that depends on framebuffer geometry for one thread:
// the scissor rectangle in framebuffer coordinates, the viewport transform,
// the clip plane and the thread's row ranges.
413 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
415 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
416 // and viewport projection values
// the scissor's y range is flipped against the framebuffer height
419 x1 = thread->scissor[0];
420 x2 = thread->scissor[0] + thread->scissor[2];
421 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
422 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the rectangle is the whole framebuffer
423 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
425 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
427 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
428 thread->fb_scissor[0] = x1;
429 thread->fb_scissor[1] = y1;
430 thread->fb_scissor[2] = x2 - x1;
431 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport values, so it is recalculated after them
433 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
434 DPSOFTRAST_RecalcClipPlane(thread);
435 DPSOFTRAST_RecalcThread(thread);
438 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
440 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's blend factors (blendfunc[0] = source, blendfunc[1] =
// destination) and blend-subtract flag onto a DPSOFTRAST_BLENDMODE value in
// thread->fb_blendmode. The two factors are packed into one integer,
// (source<<16)|destination, so a single switch can match the pair.
// Unlisted pairs resolve to DPSOFTRAST_BLENDMODE_OPAQUE.
443 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA/GL_ONE is recognised
445 if (thread->blendsubtract)
447 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// NOTE(review): no #undef for BLENDFUNC is visible, so the macro appears to stay defined after this function.
449 #define BLENDFUNC(sfactor, dfactor, blendmode) \
450 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
451 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
452 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
457 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
459 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
460 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
461 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
462 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
463 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs share the MUL mode
464 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
465 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
466 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
467 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
468 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
469 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate. The expression always evaluates to 0.
// Both arguments are parenthesized so that pointer expressions such as
// `threads + i` can be passed.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state groups selected by `mask` that are currently
// flagged in thread->validate. Each flag is cleared before its Recalc routine runs.
476 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually pending
478 mask &= thread->validate;
481 if (mask & DPSOFTRAST_VALIDATE_FB)
483 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
484 DPSOFTRAST_RecalcFB(thread);
486 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
488 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
489 DPSOFTRAST_RecalcDepthFunc(thread);
491 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
493 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
494 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by handle. A handle is accepted when it is at least 1,
// below dpsoftrast.texture_end, and its slot has pixel storage allocated.
498 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
500 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
501 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every texture pointer
// held in the global state and in each thread's state onto the new array.
505 static void DPSOFTRAST_Texture_Grow(void)
// remembered so that bound pointers can be turned back into array indices
507 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
508 DPSOFTRAST_State_Thread *thread;
512 // expand texture array as needed
513 if (dpsoftrast.texture_max < 1024)
514 dpsoftrast.texture_max = 1024;
516 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the array and
// is not checked in the lines shown; a failed realloc would lose the old block.
517 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the globally bound textures: new base + old element index
518 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
519 if (dpsoftrast.texbound[i])
520 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// same for the textures bound in every thread's state
521 for (j = 0; j < dpsoftrast.numthreads; j++)
523 thread = &dpsoftrast.threads[j];
524 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
525 if (thread->texbound[i])
526 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture. The request is validated first (a message is stored in
// dpsoftrast.errorstring when it is rejected), then a slot in dpsoftrast.texture
// is claimed, the mipmap table is filled in and zeroed pixel storage is allocated.
// flags: DPSOFTRAST_TEXTURE_FLAG_* bits combined with a DPSOFTRAST_TEXTURE_FORMAT_* value.
// width, height, depth: dimensions; each must be a power of two and at most DPSOFTRAST_TEXTURE_MAXSIZE.
530 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces
539 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
540 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
541 DPSOFTRAST_Texture *texture;
542 if (width*height*depth < 1)
544 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
547 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
549 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
554 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
556 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
558 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
559 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
561 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
569 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
571 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures (depth != 1) and cubemaps
576 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
578 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
581 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
583 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this test and message repeat the check directly above.
586 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
588 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
591 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
593 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
596 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
598 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
601 // find first empty slot in texture array
602 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
603 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being claimed
605 dpsoftrast.texture_firstfree = texnum + 1;
606 if (dpsoftrast.texture_max <= texnum)
607 DPSOFTRAST_Texture_Grow();
608 if (dpsoftrast.texture_end <= texnum)
609 dpsoftrast.texture_end = texnum + 1;
610 texture = &dpsoftrast.texture[texnum];
611 memset(texture, 0, sizeof(*texture));
612 texture->flags = flags;
613 texture->width = width;
614 texture->height = height;
615 texture->depth = depth;
616 texture->sides = sides;
// one table entry per mip level: byte offset, byte size and dimensions (4 bytes per texel)
628 s = w * h * d * sides * 4;
629 texture->mipmap[mipmaps][0] = size;
630 texture->mipmap[mipmaps][1] = s;
631 texture->mipmap[mipmaps][2] = w;
632 texture->mipmap[mipmaps][3] = h;
633 texture->mipmap[mipmaps][4] = d;
// tested once the level is 1x1x1 or when mipmaps were not requested
636 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
642 texture->mipmaps = mipmaps;
643 texture->size = size;
645 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible in the lines shown.
646 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and clears its slot. Invalid handles are ignored.
650 void DPSOFTRAST_Texture_Free(int index)
652 DPSOFTRAST_Texture *texture;
653 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
657 MM_FREE(texture->bytes);
658 texture->bytes = NULL;
// a zeroed slot (bytes == NULL) counts as free for GetByIndex and Texture_New
659 memset(texture, 0, sizeof(*texture));
660 // adjust the free range and used range
661 if (dpsoftrast.texture_firstfree > index)
662 dpsoftrast.texture_firstfree = index;
// drop trailing free slots from the used range
663 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
664 dpsoftrast.texture_end--;
// Rebuilds mip levels 1..mipmaps-1 of a texture, each from the level before
// it, by averaging neighbouring texels with rounding. Texels are 4 bytes each.
666 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
668 int i, x, y, z, w, layer0, layer1, row0, row1;
669 unsigned char *o, *i0, *i1, *i2, *i3;
670 DPSOFTRAST_Texture *texture;
671 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// textures with a single level are tested for here
672 if (texture->mipmaps <= 1)
674 for (i = 1;i < texture->mipmaps;i++)
676 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
680 if (layer1 >= texture->mipmap[i-1][4])
681 layer1 = texture->mipmap[i-1][4]-1;
682 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
686 if (row1 >= texture->mipmap[i-1][3])
687 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: source rows in level i-1 for (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1)
688 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
689 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
690 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
691 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
692 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
693 w = texture->mipmap[i][2];
696 if (texture->mipmap[i-1][2] > 1)
698 // average 3D texture
// eight source texels per output texel: sum + 4, then >> 3 rounds to nearest
699 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
701 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
702 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
703 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
704 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
709 // average 3D mipmap with parent width == 1
// four source texels per output texel
710 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
712 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
713 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
714 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
715 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
721 if (texture->mipmap[i-1][2] > 1)
723 // average 2D texture (common case)
// 2x2 box filter over rows i0 and i1
724 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
726 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
727 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
728 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
729 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
734 // 2D texture with parent width == 1
// two source texels, one from each row
735 o[0] = (i0[0] + i1[0] + 1) >> 1;
736 o[1] = (i0[1] + i1[1] + 1) >> 1;
737 o[2] = (i0[2] + i1[2] + 1) >> 1;
738 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangle of 4-byte pixels into a texture row by row, then
// regenerates its mipmaps. `pixels` holds blockheight tightly packed rows of
// blockwidth pixels; (blockx, blocky) is the destination origin.
// NOTE(review): the lines shown always address mip level 0; `mip` is not referenced in them.
745 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
747 DPSOFTRAST_Texture *texture;
749 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel; mipmap[0][2] is the level-0 width
754 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
755 while (blockheight > 0)
757 memcpy(dst, pixels, blockwidth * 4);
758 pixels += blockwidth * 4;
// advance one full texture row in the destination
759 dst += texture->mipmap[0][2] * 4;
763 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with `pixels` and regenerates the mipmaps.
// `pixels` must supply mipmap[0][1] bytes, the byte size of level 0.
765 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
767 DPSOFTRAST_Texture *texture;
768 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
772 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
773 DPSOFTRAST_Texture_CalculateMipmaps(index);
775 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
777 DPSOFTRAST_Texture *texture;
778 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
779 return texture->mipmap[mip][2];
781 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
783 DPSOFTRAST_Texture *texture;
784 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785 return texture->mipmap[mip][3];
787 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 when the handle is invalid.
// NOTE(review): `mip` is used as a table index without a range check in the lines shown.
793 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
795 DPSOFTRAST_Texture *texture;
796 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset
799 return texture->bytes + texture->mipmap[mip][0];
// Sets a texture's sampling filter. Filters above DPSOFTRAST_TEXTURE_FILTER_LINEAR
// are tested against the texture's mipmap flag, with a message stored in
// dpsoftrast.errorstring for textures created without mipmaps.
801 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
803 DPSOFTRAST_Texture *texture;
804 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
805 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
807 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
812 texture->filter = filter;
// Forward declaration; called by DPSOFTRAST_AllocateCommand when the pool is full. The definition is not in this part of the file.
815 static void DPSOFTRAST_Draw_FlushThreads(void);
817 static void DPSOFTRAST_Draw_SyncCommands(void)
819 if(dpsoftrast.usethreads) MEMORY_BARRIER;
820 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Works towards having `space` bytes available in the command pool: re-measures
// how far each thread lags behind the pool's free offset and, when needed,
// waits on a lagging thread. The measured usage is stored in commandpool.usedcommands.
823 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
825 DPSOFTRAST_State_Thread *thread;
827 int freecommand = dpsoftrast.commandpool.freecommand;
828 int usedcommands = dpsoftrast.commandpool.usedcommands;
// tested first: does the current usage already leave enough room?
829 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything queued so far visible to the threads
831 DPSOFTRAST_Draw_SyncCommands();
837 for (i = 0; i < dpsoftrast.numthreads; i++)
839 thread = &dpsoftrast.threads[i];
// distance from this thread's offset to the free offset, wrapped into the pool
840 commandoffset = freecommand - thread->commandoffset;
841 if (commandoffset < 0)
842 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// track the largest lag seen
843 if (commandoffset > usedcommands)
846 usedcommands = commandoffset;
849 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread
851 thread = &dpsoftrast.threads[waitindex];
852 Thread_LockMutex(thread->drawmutex);
// only wait while that thread still has published commands ahead of it
853 if (thread->commandoffset != dpsoftrast.drawcommand)
855 thread->waiting = true;
// signal a starving thread before sleeping on waitcond
856 if (thread->starving) Thread_CondSignal(thread->drawcond);
857 Thread_CondWait(thread->waitcond, thread->drawmutex);
858 thread->waiting = false;
860 Thread_UnlockMutex(thread->drawmutex);
862 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE. The bit masking
// requires COMMAND_SIZE to be a power of two.
865 #define DPSOFTRAST_ALIGNCOMMAND(size) \
866 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command record of type DPSOFTRAST_Command_<name> in the pool,
// tagged with DPSOFTRAST_OPCODE_<name>, and yields a typed pointer to it.
867 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
868 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes for a command in the pool and writes its header.
// opcode: DPSOFTRAST_OPCODE_* value stored in the header.
// size: record size in bytes, already rounded by DPSOFTRAST_ALIGNCOMMAND.
870 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
872 DPSOFTRAST_Command *command;
873 int freecommand = dpsoftrast.commandpool.freecommand;
874 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra header (used for the Reset marker below)
875 int extra = sizeof(DPSOFTRAST_Command);
// if the record does not fit in the tail of the pool, the tail counts as extra space too
876 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
877 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim some (threaded) or flush (unthreaded), then reload the offsets
878 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
880 if (dpsoftrast.usethreads)
881 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
883 DPSOFTRAST_Draw_FlushThreads();
884 freecommand = dpsoftrast.commandpool.freecommand;
885 usedcommands = dpsoftrast.commandpool.usedcommands;
// tail too small: store a Reset opcode there and account for the skipped tail bytes
887 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
889 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
890 command->opcode = DPSOFTRAST_OPCODE_Reset;
891 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new record
894 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
895 command->opcode = opcode;
896 command->commandsize = size;
898 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
900 dpsoftrast.commandpool.freecommand = freecommand;
901 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Adjusts the pool bookkeeping for `size` bytes: usedcommands is reduced by
// size, and both offsets are written back to the pool.
// NOTE(review): presumably this takes back the most recent allocation; the
// statement that moves freecommand back is not among the lines shown.
905 static void DPSOFTRAST_UndoCommand(int size)
907 int freecommand = dpsoftrast.commandpool.freecommand;
908 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap the offset by one pool length
911 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
912 usedcommands -= size;
913 dpsoftrast.commandpool.freecommand = freecommand;
914 dpsoftrast.commandpool.usedcommands = usedcommands;
917 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
918 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
920 thread->viewport[0] = command->x;
921 thread->viewport[1] = command->y;
922 thread->viewport[2] = command->width;
923 thread->viewport[3] = command->height;
924 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Viewport command and updates the copy of the
// viewport kept in the global state.
926 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
928 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
931 command->width = width;
932 command->height = height;
// the global state keeps its own viewport and transform, refreshed immediately
934 dpsoftrast.viewport[0] = x;
935 dpsoftrast.viewport[1] = y;
936 dpsoftrast.viewport[2] = width;
937 dpsoftrast.viewport[3] = height;
938 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: the colour used to clear the colour buffers.
941 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Executes a ClearColor command on one thread: walks the rows owned by
// `thread` inside the scissor rectangle, for each allocated colour buffer.
942 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
944 int i, x1, y1, x2, y2, w, h, x, y;
945 int miny1, maxy1, miny2, maxy2;
// bring fb_scissor and the thread's row ranges up to date
949 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
950 miny1 = thread->miny1;
951 maxy1 = thread->maxy1;
952 miny2 = thread->miny2;
953 maxy2 = thread->maxy2;
// scissor rectangle as [x1,x2) x [y1,y2)
954 x1 = thread->fb_scissor[0];
955 y1 = thread->fb_scissor[1];
956 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
957 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clamp the rows to those owned by this thread
958 if (y1 < miny1) y1 = miny1;
959 if (y2 > maxy2) y2 = maxy2;
964 // FIXME: honor fb_colormask?
// clear colour packed as BGRA8
965 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
966 for (i = 0;i < 4;i++)
// buffers that are not allocated are tested for here
968 if (!dpsoftrast.fb_colorpixels[i])
// first band ends at maxy1; the second starts at miny2 and ends at maxy2
970 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in colour buffer i
973 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
974 for (x = x1;x < x2;x++)
// Public entry point: allocates a ClearColor command in the command pool.
979 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
981 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: the depth value used to clear the depth buffer.
988 DEFCOMMAND(3, ClearDepth, float depth;)
// Executes a ClearDepth command on one thread: walks the rows owned by
// `thread` inside the scissor rectangle of the depth buffer.
989 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
991 int x1, y1, x2, y2, w, h, x, y;
992 int miny1, maxy1, miny2, maxy2;
// bring fb_scissor and the thread's row ranges up to date
996 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
997 miny1 = thread->miny1;
998 maxy1 = thread->maxy1;
999 miny2 = thread->miny2;
1000 maxy2 = thread->maxy2;
// scissor rectangle as [x1,x2) x [y1,y2)
1001 x1 = thread->fb_scissor[0];
1002 y1 = thread->fb_scissor[1];
1003 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1004 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clamp the rows to those owned by this thread
1005 if (y1 < miny1) y1 = miny1;
1006 if (y2 > maxy2) y2 = maxy2;
// depth value converted to the integer depth buffer format
1011 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first band ends at maxy1; the second starts at miny2 and ends at maxy2
1012 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1013 for (;y < bandy;y++)
// start of row y in the depth buffer
1015 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1016 for (x = x1;x < x2;x++)
// Public entry point: allocates a ClearDepth command in the command pool.
1020 void DPSOFTRAST_ClearDepth(float d)
1022 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1026 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1027 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1029 thread->colormask[0] = command->r != 0;
1030 thread->colormask[1] = command->g != 0;
1031 thread->colormask[2] = command->b != 0;
1032 thread->colormask[3] = command->a != 0;
1033 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: allocates a ColorMask command in the command pool.
1035 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1037 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1044 DEFCOMMAND(5, DepthTest, int enable;)
1045 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1047 thread->depthtest = command->enable;
1048 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1050 void DPSOFTRAST_DepthTest(int enable)
1052 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1053 command->enable = enable;
1056 DEFCOMMAND(6, ScissorTest, int enable;)
1057 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1059 thread->scissortest = command->enable;
1060 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: obtains a ScissorTest command record and stores the
// requested enable flag in it.
1062 void DPSOFTRAST_ScissorTest(int enable)
1064 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1065 command->enable = enable;
// Scissor command: carries the scissor rectangle as x, y, width, height.
// The interpreter copies the four values into thread->scissor[0..3] in that
// order and sets DPSOFTRAST_VALIDATE_FB in thread->validate.
1068 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1069 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1071 thread->scissor[0] = command->x;
1072 thread->scissor[1] = command->y;
1073 thread->scissor[2] = command->width;
1074 thread->scissor[3] = command->height;
1075 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: obtains a Scissor command record and fills in the
// rectangle.
// NOTE(review): only the width/height stores are visible in this excerpt; the
// x/y stores are presumably on the elided lines - confirm.
1077 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1079 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1082 command->width = width;
1083 command->height = height;
// BlendFunc command: carries the source and destination blend factors.
// The interpreter stores them in thread->blendfunc[0] (source) and [1]
// (destination) and sets DPSOFTRAST_VALIDATE_BLENDFUNC in thread->validate.
1086 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1087 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1089 thread->blendfunc[0] = command->sfactor;
1090 thread->blendfunc[1] = command->dfactor;
1091 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: obtains a BlendFunc command record and stores both
// blend factors in it.
1093 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1095 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1096 command->sfactor = sfactor;
1097 command->dfactor = dfactor;
// BlendSubtract command: carries a single enable flag.
// The interpreter copies it into thread->blendsubtract and sets
// DPSOFTRAST_VALIDATE_BLENDFUNC in thread->validate (the same flag BlendFunc
// sets).
1100 DEFCOMMAND(9, BlendSubtract, int enable;)
1101 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1103 thread->blendsubtract = command->enable;
1104 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: obtains a BlendSubtract command record and stores the
// requested enable flag in it.
1106 void DPSOFTRAST_BlendSubtract(int enable)
1108 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1109 command->enable = enable;
// DepthMask command: carries a single enable flag, which the interpreter
// copies into thread->depthmask. No validate flag is raised in the visible
// lines.
1112 DEFCOMMAND(10, DepthMask, int enable;)
1113 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1115 thread->depthmask = command->enable;
// Public entry point: obtains a DepthMask command record and stores the
// requested enable flag in it.
1117 void DPSOFTRAST_DepthMask(int enable)
1119 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1120 command->enable = enable;
// DepthFunc command: carries the depth comparison selector, which the
// interpreter copies into thread->depthfunc.
1123 DEFCOMMAND(11, DepthFunc, int func;)
1124 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1126 thread->depthfunc = command->func;
// Public entry point: obtains a DepthFunc command record and stores the
// requested comparison selector in it.
1128 void DPSOFTRAST_DepthFunc(int func)
1130 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1131 command->func = func;
// DepthRange command: carries the near and far depth-range values.
// The interpreter stores them in thread->depthrange[0] (near) and [1] (far).
1134 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1135 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1137 thread->depthrange[0] = command->nearval;
1138 thread->depthrange[1] = command->farval;
// Public entry point: obtains a DepthRange command record and stores the
// near/far values in it.
1140 void DPSOFTRAST_DepthRange(float nearval, float farval)
1142 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1143 command->nearval = nearval;
1144 command->farval = farval;
// PolygonOffset command: carries two offset values.
// The interpreter stores alongnormal in thread->polygonoffset[0] and intoview
// in thread->polygonoffset[1].
1147 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1148 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1150 thread->polygonoffset[0] = command->alongnormal;
1151 thread->polygonoffset[1] = command->intoview;
// Public entry point: obtains a PolygonOffset command record and stores both
// offset values in it.
1153 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1155 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1156 command->alongnormal = alongnormal;
1157 command->intoview = intoview;
// CullFace command: carries the culling mode, which the interpreter copies
// into thread->cullface.
1160 DEFCOMMAND(14, CullFace, int mode;)
1161 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1163 thread->cullface = command->mode;
// Public entry point: obtains a CullFace command record and stores the
// requested mode in it.
1165 void DPSOFTRAST_CullFace(int mode)
1167 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1168 command->mode = mode;
// AlphaTest command: carries a single enable flag, which the interpreter
// copies into thread->alphatest.
1171 DEFCOMMAND(15, AlphaTest, int enable;)
1172 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1174 thread->alphatest = command->enable;
// Public entry point: obtains an AlphaTest command record and stores the
// requested enable flag in it.
1176 void DPSOFTRAST_AlphaTest(int enable)
1178 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1179 command->enable = enable;
// AlphaFunc command: carries the comparison selector and reference value.
// The interpreter stores them in thread->alphafunc and thread->alphavalue.
1182 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1183 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1185 thread->alphafunc = command->func;
1186 thread->alphavalue = command->ref;
// Public entry point: obtains an AlphaFunc command record and stores the
// comparison selector in it.
// NOTE(review): the store of ref into the command is not visible in this
// excerpt - presumably command->ref = ref; confirm.
1188 void DPSOFTRAST_AlphaFunc(int func, float ref)
1190 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1191 command->func = func;
// Sets the current constant color: writes r, g, b, a straight into
// dpsoftrast.color[0..3]. Unlike the neighbouring state setters, no command
// record is allocated in the visible lines.
1195 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1197 dpsoftrast.color[0] = r;
1198 dpsoftrast.color[1] = g;
1199 dpsoftrast.color[2] = b;
1200 dpsoftrast.color[3] = a;
// Reads back a rectangle of the color framebuffer (fb_colorpixels[0]) into
// outpixels, 4 bytes per pixel.
//   blockx, blocky, blockwidth, blockheight - requested rectangle
//   outpixels - destination; rows are blockwidth*4 bytes apart
// The rectangle is clamped to the framebuffer bounds before copying.
// NOTE(review): the declarations of bx1/by1, x, y, b and o and the per-pixel /
// per-row copy statements are not visible in this excerpt.
1203 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides of one output row and one framebuffer row
1205 int outstride = blockwidth * 4;
1206 int instride = dpsoftrast.fb_width * 4;
1209 int bx2 = blockx + blockwidth;
1210 int by2 = blocky + blockheight;
1214 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer
1218 if (bx1 < 0) bx1 = 0;
1219 if (by1 < 0) by1 = 0;
1220 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1221 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1223 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk each row pixel by pixel (presumably to reorder bytes -
// the loop body is not visible here)
1224 if (dpsoftrast.bigendian)
1226 for (y = by1;y < by2;y++)
// source row index is fb_height-1-y, i.e. rows are read in vertically flipped
// order relative to the output
1228 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1229 o = (unsigned char *)outpixels + (y - by1) * outstride;
1230 for (x = bx1;x < bx2;x++)
// other hosts: same row addressing, handled one row at a time
1243 for (y = by1;y < by2;y++)
1245 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1246 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer
// (fb_colorpixels[0]) into one mip level of a texture.
//   index, mip     - texture handle and mip level; silently returns when the
//                    handle does not resolve or mip is out of range
//   tx, ty         - destination origin within the mip level
//   sx, sy         - source origin within the framebuffer
//   width, height  - rectangle size
// Both rectangles are clamped to their surfaces and the copy size is the
// smaller of the two clamped sizes.
// NOTE(review): the declarations/initialisation of tx1, ty1, sx1, sy1, tw, th,
// sw, sh and the statement following the "tw < 1 || th < 1" test are not
// visible in this excerpt (presumably an early return).
1252 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1256 int tx2 = tx + width;
1257 int ty2 = ty + height;
1260 int sx2 = sx + width;
1261 int sy2 = sy + height;
1271 unsigned int *spixels;
1272 unsigned int *tpixels;
1273 DPSOFTRAST_Texture *texture;
1274 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1275 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1277 spixels = dpsoftrast.fb_colorpixels[0];
1278 swidth = dpsoftrast.fb_width;
1279 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] and [3] as the level's width and height
1280 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1281 twidth = texture->mipmap[mip][2];
1282 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1283 if (tx1 < 0) tx1 = 0;
1284 if (ty1 < 0) ty1 = 0;
1285 if (tx2 > twidth) tx2 = twidth;
1286 if (ty2 > theight) ty2 = theight;
1287 if (sx1 < 0) sx1 = 0;
1288 if (sy1 < 0) sy1 = 0;
1289 if (sx2 > swidth) sx2 = swidth;
1290 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source rectangle provides
1295 if (tw > sw) tw = sw;
1296 if (th > sh) th = sh;
1297 if (tw < 1 || th < 1)
// flip the source start row: framebuffer rows are walked downward (sy1 - y)
// while texture rows are walked upward (ty1 + y)
1299 sy1 = sheight - 1 - sy1;
1300 for (y = 0;y < th;y++)
1301 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// regenerate the mip chain when the texture has more than one level
1302 if (texture->mipmaps > 1)
1303 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command: carries a texture unit number and a texture pointer.
// The interpreter atomically decrements the binds counter of whatever texture
// this thread previously had bound on that unit (if any), then records the new
// texture pointer in thread->texbound[].
1306 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1307 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1309 if (thread->texbound[command->unitnum])
1310 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1311 thread->texbound[command->unitnum] = command->texture;
// Binds texture handle `index` to texture unit `unitnum`.
// Sets dpsoftrast.errorstring when unitnum is outside
// [0, DPSOFTRAST_MAXTEXTUREUNITS) or when a non-zero index does not resolve to
// a texture (the statements following each errorstring assignment are not
// visible in this excerpt - presumably early returns).
// On success a SetTexture command record is filled in, the binding is mirrored
// in dpsoftrast.texbound[], and the texture's binds counter is raised by
// dpsoftrast.numthreads (each thread's interpreter decrements it once when the
// binding is later replaced).
1313 void DPSOFTRAST_SetTexture(int unitnum, int index)
1315 DPSOFTRAST_Command_SetTexture *command;
1316 DPSOFTRAST_Texture *texture;
1317 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1319 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1322 texture = DPSOFTRAST_Texture_GetByIndex(index);
// index 0 with a NULL texture is deliberately let through here
1323 if (index && !texture)
1325 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1329 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1330 command->unitnum = unitnum;
1331 command->texture = texture;
1333 dpsoftrast.texbound[unitnum] = texture;
// NOTE(review): when index == 0 the check above accepts texture == NULL, and
// this line then dereferences texture->binds. Unless
// DPSOFTRAST_Texture_GetByIndex(0) returns a valid object this is a null
// dereference - confirm and guard with if (texture).
1334 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its byte stride in the global state.
// Only the pointer is stored; the vertex data itself is not copied here.
1337 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1339 dpsoftrast.pointer_vertex3f = vertex3f;
1340 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: records the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1342 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1344 dpsoftrast.pointer_color4f = color4f;
1345 dpsoftrast.pointer_color4ub = NULL;
1346 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array: records the pointer and stride
// and clears the float color pointer so only one color source is active.
1348 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1350 dpsoftrast.pointer_color4f = NULL;
1351 dpsoftrast.pointer_color4ub = color4ub;
1352 dpsoftrast.stride_color = stride;
// Records the texcoord array pointer, component count and byte stride for
// texture coordinate set `unitnum`.
// NOTE(review): unitnum indexes three global arrays and no range check is
// visible in this function - confirm callers guarantee a valid unit.
1354 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1356 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1357 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1358 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command: carries the shader mode, permutation and the
// exact-specular-math flag. The interpreter copies all three into the thread
// state.
1361 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1362 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1364 thread->shader_mode = command->mode;
1365 thread->shader_permutation = command->permutation;
1366 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: fills a SetShader command record for the worker threads
// and also mirrors the same three values in the global dpsoftrast state.
1368 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1370 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1371 command->mode = mode;
1372 command->permutation = permutation;
1373 command->exactspecularmath = exactspecularmath;
// global copy of the shader selection
1375 dpsoftrast.shader_mode = mode;
1376 dpsoftrast.shader_permutation = permutation;
1377 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command: carries a uniform index and four floats.
// The interpreter copies the four floats into the thread's uniform4f array at
// float offset index*4 (each uniform slot is 4 floats wide).
1380 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1381 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1383 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-float uniform from individual components: fills a Uniform4f command
// record and mirrors the same values in the global uniform4f array at float
// offset index*4.
1385 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1387 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388 command->index = index;
1389 command->val[0] = v0;
1390 command->val[1] = v1;
1391 command->val[2] = v2;
1392 command->val[3] = v3;
// global copy of the uniform
1394 dpsoftrast.uniform4f[index*4+0] = v0;
1395 dpsoftrast.uniform4f[index*4+1] = v1;
1396 dpsoftrast.uniform4f[index*4+2] = v2;
1397 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: v must point to at least four floats.
// Fills a Uniform4f command record and mirrors the values in the global
// uniform4f array at float offset index*4.
1399 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1401 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1402 command->index = index;
1403 memcpy(command->val, v, sizeof(command->val));
1405 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command: carries a uniform index and 16 floats (declared via
// ALIGN so the array can be written with aligned SSE stores).
// The interpreter copies all 16 floats into the thread's uniform4f array at
// float offset index*4, i.e. the matrix spans four consecutive 4-float slots.
1408 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1409 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1411 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices starting at uniform slot
// `uniform`. One UniformMatrix4f command record is filled per matrix and the
// same data is mirrored in the global uniform4f array.
//   v - 16 floats per matrix; may be unaligned (checked per matrix)
// NOTE(review): the declarations of i and index, the else between the two load
// groups and the condition guarding the transpose sequence are not visible in
// this excerpt (presumably "if (transpose)" - confirm).
1413 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix occupies 4 uniform slots (index += 4) and 16 input floats
1417 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1419 __m128 m0, m1, m2, m3;
1420 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1421 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the input address
1422 if (((size_t)v)&(ALIGN_SIZE-1))
1424 m0 = _mm_loadu_ps(v);
1425 m1 = _mm_loadu_ps(v+4);
1426 m2 = _mm_loadu_ps(v+8);
1427 m3 = _mm_loadu_ps(v+12);
1431 m0 = _mm_load_ps(v);
1432 m1 = _mm_load_ps(v+4);
1433 m2 = _mm_load_ps(v+8);
1434 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the halves so that m0..m3
// end up holding what were the columns of the input
1438 __m128 t0, t1, t2, t3;
1439 t0 = _mm_unpacklo_ps(m0, m1);
1440 t1 = _mm_unpacklo_ps(m2, m3);
1441 t2 = _mm_unpackhi_ps(m0, m1);
1442 t3 = _mm_unpackhi_ps(m2, m3);
1443 m0 = _mm_movelh_ps(t0, t1);
1444 m1 = _mm_movehl_ps(t1, t0);
1445 m2 = _mm_movelh_ps(t2, t3);
1446 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both the command payload and the global uniform array must be
// 16-byte aligned at these offsets
1448 _mm_store_ps(command->val, m0);
1449 _mm_store_ps(command->val+4, m1);
1450 _mm_store_ps(command->val+8, m2);
1451 _mm_store_ps(command->val+12, m3);
1452 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1453 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1454 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1455 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command: carries a uniform index and one integer value, which the
// interpreter stores in thread->uniform1i[index].
1460 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1461 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1463 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: fills a Uniform1i command record and mirrors the
// value in the global uniform1i array.
// NOTE(review): the store of i0 into command->val is not visible in this
// excerpt - presumably on an elided line; confirm.
1465 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1467 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1468 command->index = index;
1471 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command: carries the four plane coefficients.
// The interpreter copies them into thread->clipplane and sets
// DPSOFTRAST_VALIDATE_FB in thread->validate.
1474 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1475 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1477 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1478 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: obtains a ClipPlane command record and stores the plane
// coefficients x, y, z, w in clipplane[0..3].
1480 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1482 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1483 command->clipplane[0] = x;
1484 command->clipplane[1] = y;
1485 command->clipplane[2] = z;
1486 command->clipplane[3] = w;
// Copies `size` 4-float vectors from a strided source into a packed,
// 16-byte-aligned destination (dst is written with aligned stores).
//   src, stride - first source element and byte distance between elements
// If either the source address or the stride is not a multiple of ALIGN_SIZE,
// unaligned loads are used; otherwise aligned loads.
// NOTE(review): the loop headers and pointer increments are not visible in this
// excerpt.
1490 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1492 float *end = dst + size*4;
1493 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1497 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1506 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float vectors from a strided source into packed 4-float
// vectors (x, y, z, 1) in a 16-byte-aligned destination.
// Tightly packed input (stride == 12 bytes) is processed four vertices at a
// time: three 16-byte loads cover 12 floats, which are reshuffled into four
// outputs. Any other stride is handled one vertex per 16-byte load.
// The recurring idiom "shuffle(2,1,0,3) / move_ss(1.0f) / shuffle(0,3,2,1)"
// rotates the vector so the unwanted lane sits in lane 0, overwrites it with
// 1.0, and rotates back, giving (x, y, z, 1).
// NOTE(review): the loop headers, dst increments and the tail handling after
// end4 are not visible in this excerpt.
// NOTE(review): the one-vertex paths load 16 bytes per 12-byte vertex, so the
// final vertex reads 4 bytes past its own data - confirm the source buffers
// tolerate this.
1513 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1515 float *end = dst + size*4;
1516 if (stride == sizeof(float[3]))
// end of the portion that can be processed in groups of four vertices
1518 float *end4 = dst + (size&~3)*4;
1519 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed path: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1523 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1524 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1525 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1526 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1527 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1528 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1531 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1532 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1535 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// four source vertices consumed
1537 src += 4*sizeof(float[3]);
// aligned packed path: identical shuffles, aligned loads
1544 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1545 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1546 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1547 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1548 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1549 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1550 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1551 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1552 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1553 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1554 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1555 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1556 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1558 src += 4*sizeof(float[3]);
// one vertex at a time: unaligned loads if the address or stride is unaligned
1562 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1566 __m128 v = _mm_loadu_ps((const float *)src);
1567 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1568 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1569 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1570 _mm_store_ps(dst, v);
1579 __m128 v = _mm_load_ps((const float *)src);
1580 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1581 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1582 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1583 _mm_store_ps(dst, v);
// Expands `size` 2-float vectors from a strided source into packed 4-float
// vectors (a, b, 0, 1) in a 16-byte-aligned destination.
// Tightly packed input (stride == 8 bytes) is processed two elements per
// 16-byte load; any other stride uses an 8-byte load per element.
// NOTE(review): the loop headers, dst increments and tail handling after end2
// are not visible in this excerpt.
1590 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1592 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output vector
1593 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1594 if (stride == sizeof(float[2]))
// end of the portion that can be processed in pairs
1596 float *end2 = dst + (size&~1)*4;
1597 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = a0 b0 a1 b1; first output takes the low pair, second the high pair
1601 __m128 v = _mm_loadu_ps((const float *)src);
1602 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1603 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1605 src += 2*sizeof(float[2]);
1612 __m128 v = _mm_load_ps((const float *)src);
1613 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1614 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1616 src += 2*sizeof(float[2]);
// single element: load two floats into the low half, keep (0, 1) in the high half
1622 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte vectors from a strided source into packed 4-float
// vectors in a 16-byte-aligned destination, scaling each byte by 1/255 so that
// 0..255 maps to 0.0..1.0.
// Tightly packed input (stride == 4 bytes) is processed four elements per
// 16-byte load; any other stride reads one 32-bit word per element.
// NOTE(review): the loop headers, dst increments and tail handling after end4
// are not visible in this excerpt.
1628 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1630 float *end = dst + size*4;
1631 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1632 if (stride == sizeof(unsigned char[4]))
1634 float *end4 = dst + (size&~3)*4;
1635 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16-bit (v1 = low 8 bytes, v2 = high 8 bytes), then to 32-bit by
// interleaving with zero, convert to float and scale
1639 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1640 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1641 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1642 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1643 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1645 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same conversion
1652 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1653 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1654 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1655 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1656 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1658 src += 4*sizeof(unsigned char[4]);
// single element: the 4 bytes are read through an int pointer
// NOTE(review): this assumes src may be dereferenced as int at this address -
// confirm for strides that are not a multiple of 4.
1664 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1665 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes one 4-float value to dst using aligned stores; `end` marks
// dst + 4*size, i.e. `size` vectors.
//   src - 4 floats, read once with an unaligned load
// NOTE(review): the loop header and dst increment are not visible in this
// excerpt.
1671 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1673 float *end = dst + 4*size;
1674 __m128 v = _mm_loadu_ps(src);
1677 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors by a 4x4 matrix:
//   out = in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3
// where row0..row3 are the four consecutive groups of 4 floats in inmatrix16f.
// When the matrix compares byte-equal to identity the data is simply copied
// (skipped entirely for in-place calls).
// NOTE(review): the loop headers, the store target of the sum and the return
// after the identity fast path are not visible in this excerpt.
1683 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1686 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1687 __m128 m0, m1, m2, m3;
1689 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1691 // fast case for identity matrix
1692 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1695 end = out4f + numitems*4;
// the matrix may be unaligned, so always use unaligned loads for it
1696 m0 = _mm_loadu_ps(inmatrix16f);
1697 m1 = _mm_loadu_ps(inmatrix16f + 4);
1698 m2 = _mm_loadu_ps(inmatrix16f + 8);
1699 m3 = _mm_loadu_ps(inmatrix16f + 12);
1700 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path: broadcast each component and accumulate the rows
1704 __m128 v = _mm_loadu_ps(in4f);
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1708 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1709 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input path: same arithmetic with an aligned load
1718 __m128 v = _mm_load_ps(in4f);
1720 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1721 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1722 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1723 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1731 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1733 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides and viewport-maps one __m128 vertex.
// The input (x, y, z, w) is rotated to (w, x, y, z) and lane 0 is replaced by
// 1.0; every lane is then scaled by viewportscale, divided by the original w
// and offset by viewportcenter, and the result is rotated back. The output is
// therefore (x', y', z', c0 + s0/w): the last lane carries a 1/w term rather
// than a projected coordinate, and viewportcenter/viewportscale are consumed in
// the rotated lane order (lane 0 pairs with the 1/w term).
1737 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1739 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1740 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1741 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1742 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same visible body as DPSOFTRAST_PROJECTVERTEX: rotates the vertex to
// (w, x, y, z), replaces lane 0 with 1.0, applies
// viewportcenter + viewportscale * p / w to all lanes and rotates back.
// NOTE(review): despite the name, nothing in the visible lines restricts the
// work to the y component - confirm whether the duplication is intentional.
1745 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1747 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1748 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1749 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1750 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one __m128 vertex by a 4x4 matrix given as four row registers:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// NOTE(review): the line that defines p is not visible in this excerpt -
// presumably a local copy of (in), which would make in-place use (out == in)
// safe; confirm.
1753 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1756 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1757 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1758 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1759 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a conservative screen-space row range for an axis-aligned bounding
// box given by minposf/maxposf (both 16-byte aligned), after transforming its
// eight corners by inmatrix16f.
//   starty, endy - receive the first row and one-past-the-last row
// How it works, as far as the visible lines show:
//   * clipmask starts with one bit set per corner (0xFF).
//   * BBFRONT(k, pos) transforms corner k, computes clipdist = z + w, and when
//     that is >= 0 clears bit k and folds the corner's y/w into minproj/maxproj.
//   * BBCLIP(k) handles corners whose bit is still set: for each of the three
//     neighbouring corners (k^1, k^2, k^4) whose bit is clear, it interpolates
//     along the connecting edge to the point where clipdist reaches zero and
//     folds that point's y/w into minproj/maxproj.
//   * The range is clamped to [-2, 2], mapped through the viewport's y centre
//     and scale, and truncated to integers.
// NOTE(review): the BBCLIP #define line, the BBFRONT(0)/BBFRONT(7) invocations,
// the declaration of proj inside BBFRONT and the return statement are not
// visible in this excerpt (the return value is presumably clipmask - confirm).
1762 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1764 int clipmask = 0xFF;
1765 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start inverted (2 / -2) so the first real sample replaces them
1766 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1767 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1768 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix row so the transformed y lands in lane 0,
// where the scalar (_ss) operations below can reach it
1769 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1770 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1771 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1772 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT: transform corner k and, when clipdist >= 0, clear its clipmask bit and
// accumulate its projected y
1773 #define BBFRONT(k, pos) \
1775 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1776 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1777 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1780 clipmask &= ~(1<<k); \
1781 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1782 minproj = _mm_min_ss(minproj, proj); \
1783 maxproj = _mm_max_ss(maxproj, proj); \
1787 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1788 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1789 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1790 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1791 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1792 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1796 if (clipmask&(1<<k)) \
1798 if (!(clipmask&(1<<(k^1)))) \
1800 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1801 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1802 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1803 minproj = _mm_min_ss(minproj, proj); \
1804 maxproj = _mm_max_ss(maxproj, proj); \
1806 if (!(clipmask&(1<<(k^2)))) \
1808 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1809 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1810 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1811 minproj = _mm_min_ss(minproj, proj); \
1812 maxproj = _mm_max_ss(maxproj, proj); \
1814 if (!(clipmask&(1<<(k^4)))) \
1816 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1817 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1818 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1819 minproj = _mm_min_ss(minproj, proj); \
1820 maxproj = _mm_max_ss(maxproj, proj); \
1824 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move element 2 of the viewport vectors into lane 0 for the scalar math below
// (presumably the y term - confirm against how fb_viewportcenter is laid out)
1825 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1826 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping to pixels
1827 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1828 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1829 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1830 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj - presumably because the y
// viewport scale is negative (screen y grows downward); confirm
1831 *starty = _mm_cvttss_si32(maxproj);
1832 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed vertices to screen space.
// For each vertex: the input is copied unchanged to out4f, its projected form
// (DPSOFTRAST_PROJECTVERTEX) is written to screen4f, and a running
// component-wise min/max of the inputs is kept. The min/max box is then handed
// to DPSOFTRAST_Vertex_BoundY with an identity matrix to obtain the row range
// and the function's return value.
// out4f and screen4f are written with aligned stores; in4f may be unaligned.
// NOTE(review): the loop headers, pointer increments and any condition guarding
// the BoundY call are not visible in this excerpt.
1836 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1838 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1839 float *end = out4f + numitems*4;
1840 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1841 __m128 minpos, maxpos;
1842 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded from the first vertex
1844 minpos = maxpos = _mm_loadu_ps(in4f);
1847 __m128 v = _mm_loadu_ps(in4f);
1848 minpos = _mm_min_ps(minpos, v);
1849 maxpos = _mm_max_ps(maxpos, v);
1850 _mm_store_ps(out4f, v);
1851 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1852 _mm_store_ps(screen4f, v);
// aligned input path; same work with aligned loads
1860 minpos = maxpos = _mm_load_ps(in4f);
1863 __m128 v = _mm_load_ps(in4f);
1864 minpos = _mm_min_ps(minpos, v);
1865 maxpos = _mm_max_ps(maxpos, v);
1866 _mm_store_ps(out4f, v);
1867 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1868 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries for the BoundY call
1876 ALIGN(float minposf[4]);
1877 ALIGN(float maxposf[4]);
1878 _mm_store_ps(minposf, minpos);
1879 _mm_store_ps(maxposf, maxpos);
1880 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` vertices by inmatrix16f and projects them to screen
// space. An identity matrix is detected with memcmp and delegated to
// DPSOFTRAST_Vertex_Project.
// For each vertex: the running min/max is updated from the UNtransformed input,
// the transformed vertex is written to out4f and its projected form to
// screen4f. The untransformed bounds plus the matrix are then handed to
// DPSOFTRAST_Vertex_BoundY, which performs its own corner transforms.
// out4f and screen4f are written with aligned stores; in4f may be unaligned.
// NOTE(review): the declaration of end, the loop headers, pointer increments
// and any condition guarding the BoundY call are not visible in this excerpt.
1885 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1887 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1888 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1890 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1891 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1892 end = out4f + numitems*4;
1893 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1894 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix may be unaligned, so always use unaligned loads for it
1895 m0 = _mm_loadu_ps(inmatrix16f);
1896 m1 = _mm_loadu_ps(inmatrix16f + 4);
1897 m2 = _mm_loadu_ps(inmatrix16f + 8);
1898 m3 = _mm_loadu_ps(inmatrix16f + 12);
1899 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded from the first vertex
1901 minpos = maxpos = _mm_loadu_ps(in4f);
1904 __m128 v = _mm_loadu_ps(in4f);
1905 minpos = _mm_min_ps(minpos, v);
1906 maxpos = _mm_max_ps(maxpos, v);
1907 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1908 _mm_store_ps(out4f, v);
1909 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1910 _mm_store_ps(screen4f, v);
// aligned input path; same work with aligned loads
1918 minpos = maxpos = _mm_load_ps(in4f);
1921 __m128 v = _mm_load_ps(in4f);
1922 minpos = _mm_min_ps(minpos, v);
1923 maxpos = _mm_max_ps(maxpos, v);
1924 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1925 _mm_store_ps(out4f, v);
1926 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1927 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries for the BoundY call
1935 ALIGN(float minposf[4]);
1936 ALIGN(float maxposf[4]);
1937 _mm_store_ps(minposf, minpos);
1938 _mm_store_ps(maxposf, maxpos);
1939 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Converts one client vertex attribute array into packed 4-float vectors in
// dpsoftrast.post_array4f[outarray], covering numvertices vertices starting at
// firstvertex.
//   POSITION - 3 floats per vertex, expanded with DPSOFTRAST_Load3fTo4f
//   COLOR    - float RGBA if pointer_color4f is set, else byte RGBA if
//              pointer_color4ub is set, else the constant dpsoftrast.color is
//              replicated for every vertex
//   texcoord - selected by inarray - DPSOFTRAST_ARRAY_TEXCOORD0; loaded with the
//              2-, 3- or 4-float loader according to components_texcoord
// NOTE(review): the switch header, the inner case labels, the break/default
// handling, the behaviour when a texcoord pointer is NULL and the return
// statement are not visible in this excerpt.
1945 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1948 float *outf = dpsoftrast.post_array4f[outarray];
1949 const unsigned char *inb;
1950 int firstvertex = dpsoftrast.firstvertex;
1951 int numvertices = dpsoftrast.numvertices;
1955 case DPSOFTRAST_ARRAY_POSITION:
1956 stride = dpsoftrast.stride_vertex;
// byte address of the first vertex to load
1957 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1958 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1960 case DPSOFTRAST_ARRAY_COLOR:
1961 stride = dpsoftrast.stride_color;
1962 if (dpsoftrast.pointer_color4f)
1964 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1965 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1967 else if (dpsoftrast.pointer_color4ub)
1969 stride = dpsoftrast.stride_color;
1970 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1971 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: use the constant color
1975 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1979 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1980 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1982 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1983 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1986 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1989 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1992 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Loads attribute array `inarray` into post_array4f[outarray] (or, when inarray
// is negative, reuses whatever is already in post_array4f[outarray]) and
// transforms it in place by inmatrix16f.
// NOTE(review): the return statement is not visible in this excerpt -
// presumably returns data; confirm.
2004 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
2006 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2007 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Loads attribute array `inarray` into post_array4f[outarray] (or reuses the
// existing contents when inarray is negative) and projects it in place:
// screen coordinates go to dpsoftrast.screencoord4f, the covered row range to
// drawstarty/drawendy, and the projector's return value to drawclipped.
// NOTE(review): the return statement is not visible in this excerpt -
// presumably returns data; confirm.
2012 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2015 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2016 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Loads attribute array `inarray` into post_array4f[outarray] (or reuses the
// existing contents when inarray is negative), then transforms it in place by
// inmatrix16f and projects it: screen coordinates go to
// dpsoftrast.screencoord4f, the covered row range to drawstarty/drawendy, and
// the projector's return value to drawclipped.
// NOTE(review): the return statement is not visible in this excerpt -
// presumably returns data; confirm.
2024 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2027 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2028 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for one span, pixels startx..endx-1.
// w is evaluated from the triangle's plane equation at the span origin
// (w[2] + x*w[0] + y*w[1]); wslope (w[0]) is its change per pixel along x.
//   * wslope == 0: w is constant across the span, so a simple loop is used.
//   * otherwise: the span is cut into sub-spans of DPSOFTRAST_DRAW_MAXSUBSPAN
//     pixels; an exact 1/w is computed at each sub-span boundary and values in
//     between are linearly interpolated (z += dz).
// NOTE(review): the declarations of x, z and dz and every statement that writes
// to zf are not visible in this excerpt - presumably zf[x] receives the
// reciprocal; confirm.
2035 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2038 int startx = span->startx;
2039 int endx = span->endx;
2040 float wslope = triangle->w[0];
2041 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel of the span
2042 float endz = 1.0f / (w + wslope * startx);
2043 if (triangle->w[0] == 0)
2045 // LordHavoc: fast flat polygons (HUD/menu)
2046 for (x = startx;x < endx;x++)
2050 for (x = startx;x < endx;)
2052 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: stop at the final pixel of the span
2054 if (nextsub >= endx) nextsub = endsub = endx-1;
2055 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step toward the next exact value; zero when the sub-span is a
// single pixel (avoids dividing by zero)
2056 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2057 for (; x <= endsub; x++, z += dz)
// Writes one span of 4-byte colors (in4ub, alpha in byte 3, indexed by span-relative x) into
// the color buffer dpsoftrast.fb_colorpixels[0], combining them with the existing pixels
// according to thread->fb_blendmode.
// span->pixelmask selects which pixels are touched; this function also clears mask entries
// itself (alpha test, fully transparent pixels in the alpha blend modes) and writes two
// sentinel bytes at pixelmask[endx] and pixelmask[endx+1], so the mask buffer must have room
// for at least endx+2 entries.
// NOTE(review): several lines of this function are not visible in this listing (the embedded
// line numbers skip); the comments below describe only what the visible lines show.
2062 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2066 int startx = span->startx;
2067 int endx = span->endx;
2070 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2071 unsigned char * RESTRICT pixelmask = span->pixelmask;
2072 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2073 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both framebuffer views to the span origin so they can be indexed by span-relative x
2076 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2077 pixeli += span->y * dpsoftrast.fb_width + span->x;
2078 // handle alphatest now (this affects depth writes too)
2079 if (thread->alphatest)
2080 for (x = startx;x < endx;x++)
2081 if (in4ub[x*4+3] < 128)
2082 pixelmask[x] = false;
2083 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2084 // helps sprites, text and hud artwork
2085 switch(thread->fb_blendmode)
// alpha-weighted modes: pixels whose alpha byte is 0 are masked off
2087 case DPSOFTRAST_BLENDMODE_ALPHA:
2088 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2089 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2091 for (x = startx;x < endx;x++)
2093 if (in4ub[x*4+3] >= 1)
// skip over the run of pixels with nonzero alpha
2098 while (++x < endx && in4ub[x*4+3] >= 1) ;
2100 if (x >= endx) break;
// clear the mask for the following run of zero-alpha pixels
2102 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2103 if (x >= endx) break;
2110 case DPSOFTRAST_BLENDMODE_OPAQUE:
2111 case DPSOFTRAST_BLENDMODE_ADD:
2112 case DPSOFTRAST_BLENDMODE_INVMOD:
2113 case DPSOFTRAST_BLENDMODE_MUL:
2114 case DPSOFTRAST_BLENDMODE_MUL2:
2115 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2116 case DPSOFTRAST_BLENDMODE_INVADD:
2119 // put some special values at the end of the mask to ensure the loops end
2120 pixelmask[endx] = 1;
2121 pixelmask[endx+1] = 0;
2122 // LordHavoc: use a double loop to identify subspans, this helps the
2123 // optimized copy/blend loops to perform at their best, most triangles
2124 // have only one run of pixels, and do the search using wide reads...
2128 // if this pixel is masked off, it's probably not alone...
2135 // the 4-item search must be aligned or else it stalls badly
2136 if ((x & 3) && !pixelmask[x])
2138 if(pixelmask[x]) goto endmasked;
2142 if(pixelmask[x]) goto endmasked;
2146 if(pixelmask[x]) goto endmasked;
// skip four masked-off pixels per iteration by reading the mask as one 32-bit word
2151 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2155 for (;!pixelmask[x];x++)
2157 // rather than continue the loop, just check the end variable
2162 // find length of subspan
2165 if (subx + 8 < endx)
2169 if(!pixelmask[subx]) goto endunmasked;
2173 if(!pixelmask[subx]) goto endunmasked;
2177 if(!pixelmask[subx]) goto endunmasked;
// skip four enabled pixels per iteration; this relies on enabled mask bytes being exactly 1
2182 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2186 for (;pixelmask[subx];subx++)
2188 // the checks can overshoot, so make sure to clip it...
2192 // now that we know the subspan length... process!
2193 switch(thread->fb_blendmode)
// opaque: plain copy of the sub-span [x, subx)
2195 case DPSOFTRAST_BLENDMODE_OPAQUE:
2199 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2204 while (x + 16 <= subx)
2206 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2207 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2208 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2209 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2214 while (x + 4 <= subx)
2216 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2222 pixeli[x+1] = ini[x+1];
// FINISHBLEND(blend2, blend1): handles the sub-span two pixels at a time with blend2 and a
// trailing single pixel with blend1; src and dst are the colors widened to 16 bits per channel
// and the result is packed back to bytes with unsigned saturation.
// alpha: dst += (src - dst) * alpha / 256, alpha taken from channel 3 of src
2232 case DPSOFTRAST_BLENDMODE_ALPHA:
2233 #define FINISHBLEND(blend2, blend1) \
2234 for (;x + 1 < subx;x += 2) \
2237 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2238 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2240 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2245 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2246 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2248 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2252 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2253 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2255 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2256 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// add-alpha: dst += (src * alpha) >> 8
2259 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// add: dst = src + dst (saturated when packed back to bytes)
2268 case DPSOFTRAST_BLENDMODE_ADD:
2269 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2271 case DPSOFTRAST_BLENDMODE_INVMOD:
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2275 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// multiply: dst = (src * dst) >> 8
2278 case DPSOFTRAST_BLENDMODE_MUL:
2279 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// multiply x2: dst = (src * dst) >> 7
2281 case DPSOFTRAST_BLENDMODE_MUL2:
2282 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtract-alpha: dst -= (src * alpha) >> 8
2284 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2286 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2287 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2289 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2290 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudo-alpha: dst = src + dst - ((dst * alpha) >> 8)
2293 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2295 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2296 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2298 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2299 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse add: dst += ((255 - dst) * src) / 256
2302 case DPSOFTRAST_BLENDMODE_INVADD:
2304 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2306 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples one texel of mip level mip of texture at coordinates (x, y), which are scaled by the
// mip's width and height (so 0..1 spans the texture), and produces a 4-byte color in c.
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR set, four neighbouring texels are blended bilinearly;
// otherwise a single texel is selected. DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE selects clamping
// of the texel indices to [0, size-1]; without it the indices are masked with size-1, which
// only wraps correctly for power-of-two sizes.
// NOTE(review): the stores into c for the unfiltered path are not visible in this listing.
2314 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2315 // warning: this is SLOW, only use if the optimized per-span functions won't do
2317 const unsigned char * RESTRICT pixelbase;
2318 const unsigned char * RESTRICT pixel[4];
// mipmap[mip][2] and [3] are the level's width and height, mipmap[mip][0] its byte offset in texture->bytes
2319 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2320 int wrapmask[2] = { width-1, height-1 };
2321 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// 20.12 fixed-point texel coordinates, shifted back by half a texel (2048/4096)
// NOTE(review): tc is unsigned, so a negative coordinate is converted out of range here and
// the ">= 0" clamp tests below can never fail after the >>12 - confirm whether negative
// inputs are expected.
2324 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2325 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2326 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four bilinear weights; each is a product of two 12-bit factors and they sum to 1<<24
2327 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2328 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2329 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2330 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2332 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2333 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2334 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2335 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2339 tci[0] &= wrapmask[0];
2340 tci[1] &= wrapmask[1];
2341 tci1[0] &= wrapmask[0];
2342 tci1[1] &= wrapmask[1];
// the 2x2 neighbourhood: (x0,y0), (x1,y0), (x0,y1), (x1,y1), 4 bytes per texel
2344 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2345 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2346 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2347 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
// weighted sum per channel; >>24 removes the weight scale
2348 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2349 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2350 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2351 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// unfiltered path: truncate the scaled coordinates to a texel index
2355 int tci[2] = { x * width, y * height };
2356 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2358 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2359 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2363 tci[0] &= wrapmask[0];
2364 tci[1] &= wrapmask[1];
2366 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Samples the texture bound to texunitindex along one span and writes float colors to out4f
// (4 floats per pixel, indexed by span-relative x in [startx, endx)).
// Texture coordinates come from vertex attribute arrayindex (via DPSOFTRAST_CALCATTRIB4F) and
// are multiplied by zf[x] at sub-span ends; between those points they are stepped linearly
// in 20.12 fixed point.
// Texel bytes 2,1,0,3 are written to output channels 0,1,2,3, scaled to 0..1.
// Special cases: no texture bound -> white; a mip level of only 4 bytes (one texel) -> that
// texel's color for the whole span.
// NOTE(review): several lines of this function are not visible in this listing (the embedded
// line numbers skip), e.g. the initialisation of tcimin and the tc = endtc hand-over.
2374 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2377 int startx = span->startx;
2378 int endx = span->endx;
2383 float tc[2], endtc[2];
2385 unsigned int tci[2];
2386 unsigned int tci1[2];
2387 unsigned int tcimin[2];
2388 unsigned int tcimax[2];
2393 const unsigned char * RESTRICT pixelbase;
2394 const unsigned char * RESTRICT pixel[4];
2395 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2396 // if no texture is bound, just fill it with white
2399 for (x = startx;x < endx;x++)
2401 out4f[x*4+0] = 1.0f;
2402 out4f[x*4+1] = 1.0f;
2403 out4f[x*4+2] = 1.0f;
2404 out4f[x*4+3] = 1.0f;
2408 mip = triangle->mip[texunitindex];
// start of the selected mip level inside the texture's byte storage
2409 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2410 // if this mipmap of the texture is 1 pixel, just fill it with that color
2411 if (texture->mipmap[mip][1] == 4)
// read the single texel of the selected mip level (pixelbase), not the first texel of the
// whole texture allocation; this matches the BGRA8 variant of this function
2413 c[0] = pixelbase[2] * (1.0f/255.0f);
2414 c[1] = pixelbase[1] * (1.0f/255.0f);
2415 c[2] = pixelbase[0] * (1.0f/255.0f);
2416 c[3] = pixelbase[3] * (1.0f/255.0f);
2417 for (x = startx;x < endx;x++)
2419 out4f[x*4+0] = c[0];
2420 out4f[x*4+1] = c[1];
2421 out4f[x*4+2] = c[2];
2422 out4f[x*4+3] = c[3];
2426 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2427 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2428 flags = texture->flags;
// mipmap[mip][2] / [3] are the level's width / height in texels
2429 tcscale[0] = texture->mipmap[mip][2];
2430 tcscale[1] = texture->mipmap[mip][3];
2431 tciwidth = texture->mipmap[mip][2];
2434 tcimax[0] = texture->mipmap[mip][2]-1;
2435 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as an AND mask for repeat addressing (only wraps correctly for power-of-two sizes)
2436 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2437 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinates at the first pixel of the span
2438 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2439 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2445 for (x = startx;x < endx;)
2447 unsigned int subtc[2];
2448 unsigned int substep[2];
2449 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2450 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2451 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to its actual length
2453 nextsub = endsub = endx-1;
2454 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2458 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2459 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// 20.12 fixed-point start coordinate and per-pixel step for this sub-span
2465 substep[0] = (endtc[0] - tc[0]) * subscale;
2466 substep[1] = (endtc[1] - tc[1]) * subscale;
2467 subtc[0] = tc[0] * (1<<12);
2468 subtc[1] = tc[1] * (1<<12);
// bilinear filtering, clamp-to-edge addressing
2471 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2473 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2475 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2476 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four bilinear weights; they sum to 1<<24
2477 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2478 tci[0] = subtc[0]>>12;
2479 tci[1] = subtc[1]>>12;
2480 tci1[0] = tci[0] + 1;
2481 tci1[1] = tci[1] + 1;
2482 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2483 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2484 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2485 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2486 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2487 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2488 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2489 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * (1<<24): removes both the weight scale and the byte range
2490 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2491 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2492 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2493 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2494 out4f[x*4+0] = c[0];
2495 out4f[x*4+1] = c[1];
2496 out4f[x*4+2] = c[2];
2497 out4f[x*4+3] = c[3];
// bilinear filtering, repeat addressing via AND mask
2502 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2504 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2505 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2506 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2507 tci[0] = subtc[0]>>12;
2508 tci[1] = subtc[1]>>12;
2509 tci1[0] = tci[0] + 1;
2510 tci1[1] = tci[1] + 1;
2511 tci[0] &= tciwrapmask[0];
2512 tci[1] &= tciwrapmask[1];
2513 tci1[0] &= tciwrapmask[0];
2514 tci1[1] &= tciwrapmask[1];
2515 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2516 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2517 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2518 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2519 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2520 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2521 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2522 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2523 out4f[x*4+0] = c[0];
2524 out4f[x*4+1] = c[1];
2525 out4f[x*4+2] = c[2];
2526 out4f[x*4+3] = c[3];
// single-texel sampling, clamp-to-edge addressing
2530 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2532 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2534 tci[0] = subtc[0]>>12;
2535 tci[1] = subtc[1]>>12;
2536 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2537 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2538 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2539 c[0] = pixel[0][2] * (1.0f / 255.0f);
2540 c[1] = pixel[0][1] * (1.0f / 255.0f);
2541 c[2] = pixel[0][0] * (1.0f / 255.0f);
2542 c[3] = pixel[0][3] * (1.0f / 255.0f);
2543 out4f[x*4+0] = c[0];
2544 out4f[x*4+1] = c[1];
2545 out4f[x*4+2] = c[2];
2546 out4f[x*4+3] = c[3];
// single-texel sampling, repeat addressing via AND mask
2551 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2553 tci[0] = subtc[0]>>12;
2554 tci[1] = subtc[1]>>12;
2555 tci[0] &= tciwrapmask[0];
2556 tci[1] &= tciwrapmask[1];
2557 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2558 c[0] = pixel[0][2] * (1.0f / 255.0f);
2559 c[1] = pixel[0][1] * (1.0f / 255.0f);
2560 c[2] = pixel[0][0] * (1.0f / 255.0f);
2561 c[3] = pixel[0][3] * (1.0f / 255.0f);
2562 out4f[x*4+0] = c[0];
2563 out4f[x*4+1] = c[1];
2564 out4f[x*4+2] = c[2];
2565 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: samples the texture bound to texunitindex along
// one span and writes packed 4-byte texels to out4ub (indexed by span-relative x).
// Texture coordinates come from vertex attribute arrayindex (via DPSOFTRAST_CALCATTRIB), are
// scaled by the mip level's size and by zf[x] at sub-span ends, and are stepped in 16.16 fixed
// point in between. Two pixels are processed per loop iteration, with a single-pixel tail.
// Special cases: no texture bound -> all bytes 255; a mip level of only 4 bytes (one texel) ->
// that texel for the whole span; zf equal at both span ends -> the whole span is one sub-span.
// NOTE(review): several lines of this function are not visible in this listing (the embedded
// line numbers skip), including the conditions that select the filtered paths.
2571 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2575 int startx = span->startx;
2576 int endx = span->endx;
2578 __m128 data, slope, tcscale;
2579 __m128i tcsize, tcmask, tcoffset, tcmax;
2581 __m128i subtc, substep, endsubtc;
2584 int affine; // LordHavoc: optimized affine texturing case
2585 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2586 const unsigned char * RESTRICT pixelbase;
2587 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2588 // if no texture is bound, just fill it with white
2591 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2594 mip = triangle->mip[texunitindex];
// start of the selected mip level inside the texture's byte storage
2595 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2596 // if this mipmap of the texture is 1 pixel, just fill it with that color
2597 if (texture->mipmap[mip][1] == 4)
2599 unsigned int k = *((const unsigned int *)pixelbase);
2600 for (x = startx;x < endx;x++)
// equal 1/w at both ends of the span: coordinates are linear, so one sub-span suffices
2604 affine = zf[startx] == zf[endx-1];
2605 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2606 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2607 flags = texture->flags;
// tcsize = (width, height, width, height) taken from mipmap[mip][2] and [3]
2608 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2609 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2610 tcscale = _mm_cvtepi32_ps(tcsize);
2611 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2612 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2613 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift (NOTE(review): presumably applied only when filtering; the guarding line is not visible here)
2615 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2616 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane holds the 16-bit pair (4, width*4), so _mm_madd_epi16 of an (x, y) pair with
// it yields the byte offset x*4 + y*width*4
2617 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 packed to 16 bits: upper clamp bound and AND mask for 16-bit texel indices
2618 tcmax = _mm_packs_epi32(tcmask, tcmask);
2619 for (x = startx;x < endx;)
2621 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2622 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2623 if (nextsub >= endx || affine)
// last (or only) sub-span: end on the final pixel and rescale the step to its actual length
2625 nextsub = endsub = endx-1;
2626 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2630 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2632 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2633 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2634 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1; step by two pixels
2635 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2636 substep = _mm_slli_epi32(substep, 1);
// fast filtered path: if the integer coordinates at both ends of the sub-span are >= 0 and
// < size-1, every 2x2 fetch is in range and no clamping or wrapping is needed
2639 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2640 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// byte distance between texel rows (width*4)
2642 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2643 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2645 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2646 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2647 tci = _mm_madd_epi16(tci, tcoffset);
2648 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; the +stride load is the row below
2650 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2651 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2652 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2653 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// 15-bit fractions; interpolate vertically first, then horizontally
2654 fracm = _mm_srli_epi16(subtc, 1);
2655 pix1 = _mm_add_epi16(pix1,
2656 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2658 pix3 = _mm_add_epi16(pix3,
2659 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2660 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2661 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2662 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2663 pix2 = _mm_add_epi16(pix2,
2664 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2665 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2666 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single trailing pixel of the fast filtered path
2670 const unsigned char * RESTRICT ptr1;
2671 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2672 tci = _mm_madd_epi16(tci, tcoffset);
2673 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2674 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2675 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2676 fracm = _mm_srli_epi16(subtc, 1);
2677 pix1 = _mm_add_epi16(pix1,
2678 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2679 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2680 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2681 pix1 = _mm_add_epi16(pix1,
2682 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2683 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2684 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered path with clamp-to-edge addressing: the four corner indices of each pixel are
// built as (x,y), (x+1,y), (x,y+1), (x+1,y+1) and clamped to [0, size-1] before the fetch
2688 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2690 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2692 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2693 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2694 tci = _mm_madd_epi16(tci, tcoffset);
2695 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2696 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2697 _mm_setzero_si128());
2698 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2699 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2700 _mm_setzero_si128());
2701 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// the second pixel of the pair must be clamped exactly like the first one (and like the
// single-pixel tail below); masking here would wrap instead of clamping at the edges
2702 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2703 tci = _mm_madd_epi16(tci, tcoffset);
2704 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2705 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2706 _mm_setzero_si128());
2707 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2708 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2709 _mm_setzero_si128());
2710 fracm = _mm_srli_epi16(subtc, 1);
2711 pix1 = _mm_add_epi16(pix1,
2712 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714 pix3 = _mm_add_epi16(pix3,
2715 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2716 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2717 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2718 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2719 pix2 = _mm_add_epi16(pix2,
2720 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2721 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2722 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single trailing pixel, clamp-to-edge
2726 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2727 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2728 tci = _mm_madd_epi16(tci, tcoffset);
2729 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2730 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2731 _mm_setzero_si128());
2732 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2733 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2734 _mm_setzero_si128());
2735 fracm = _mm_srli_epi16(subtc, 1);
2736 pix1 = _mm_add_epi16(pix1,
2737 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2738 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2739 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2740 pix1 = _mm_add_epi16(pix1,
2741 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2742 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2743 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered path with repeat addressing: corner indices are wrapped with the size-1 mask
2749 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2751 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2752 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2753 tci = _mm_madd_epi16(tci, tcoffset);
2754 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2755 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2756 _mm_setzero_si128());
2757 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2758 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2759 _mm_setzero_si128());
2760 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2761 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762 tci = _mm_madd_epi16(tci, tcoffset);
2763 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2764 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765 _mm_setzero_si128());
2766 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768 _mm_setzero_si128());
2769 fracm = _mm_srli_epi16(subtc, 1);
2770 pix1 = _mm_add_epi16(pix1,
2771 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773 pix3 = _mm_add_epi16(pix3,
2774 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2775 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2776 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2777 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2778 pix2 = _mm_add_epi16(pix2,
2779 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2780 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2781 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single trailing pixel, repeat addressing
2785 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2786 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2787 tci = _mm_madd_epi16(tci, tcoffset);
2788 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2789 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2790 _mm_setzero_si128());
2791 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2792 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2793 _mm_setzero_si128());
2794 fracm = _mm_srli_epi16(subtc, 1);
2795 pix1 = _mm_add_epi16(pix1,
2796 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2797 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2798 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2799 pix1 = _mm_add_epi16(pix1,
2800 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2801 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2802 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling (no interpolation): copy one texel per pixel, clamp-to-edge first
2809 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2811 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2813 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2814 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2815 tci = _mm_madd_epi16(tci, tcoffset);
2816 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2817 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2821 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2822 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2823 tci = _mm_madd_epi16(tci, tcoffset);
2824 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling, repeat addressing
2830 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2832 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2833 tci = _mm_and_si128(tci, tcmax);
2834 tci = _mm_madd_epi16(tci, tcoffset);
2835 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2836 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2840 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2841 tci = _mm_and_si128(tci, tcmax);
2842 tci = _mm_madd_epi16(tci, tcoffset);
2843 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map span sampler stand-in: the visible code performs no texture lookup and simply sets
// every byte of the span's output pixels (4 bytes per pixel, [startx, endx)) to 255.
// triangle, texunitindex, arrayindex and zf are not used by the visible code.
2852 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count must be (endx - startx)*4; the reversed subtraction is negative and would be
// converted to a huge size_t, writing far beyond the span buffer
2855 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Takes a pointer to float data (vector) and returns a float.
// NOTE(review): the body of this function is not visible in this listing, so what it computes
// cannot be documented from here - presumably a shadow map lookup, going by the name; confirm.
2858 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a span of float colors by an interpolated 4-component vertex attribute:
// out4f[x] = in4f[x] * ((data + slope*x) * z), component-wise, for x in [startx, endx).
// data and slope are set up by DPSOFTRAST_CALCATTRIB4F for attribute arrayindex.
// NOTE(review): the assignment of z is not visible in this listing - presumably z = zf[x]; confirm.
2864 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2867 int startx = span->startx;
2868 int endx = span->endx;
2873 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2874 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2877 c[0] = (data[0] + slope[0]*x) * z;
2878 c[1] = (data[1] + slope[1]*x) * z;
2879 c[2] = (data[2] + slope[2]*x) * z;
2880 c[3] = (data[3] + slope[3]*x) * z;
2881 out4f[x*4+0] = in4f[x*4+0] * c[0];
2882 out4f[x*4+1] = in4f[x*4+1] * c[1];
2883 out4f[x*4+2] = in4f[x*4+2] * c[2];
2884 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Float span op: writes an interpolated vertex attribute as the output color.
// For every x in [span->startx, span->endx) the four components are evaluated
// as (data[i] + slope[i]*x) * z and stored to out4f[x*4+i].
// data/slope are produced by DPSOFTRAST_CALCATTRIB4F for 'arrayindex'.
// NOTE(review): z is presumably the per-pixel value zf[x] - confirm.
2888 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2891 int startx = span->startx;
2892 int endx = span->endx;
2897 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2898 for (x = startx;x < endx;x++)
// attribute value at this pixel, scaled by z
2901 c[0] = (data[0] + slope[0]*x) * z;
2902 c[1] = (data[1] + slope[1]*x) * z;
2903 c[2] = (data[2] + slope[2]*x) * z;
2904 c[3] = (data[3] + slope[3]*x) * z;
2905 out4f[x*4+0] = c[0];
2906 out4f[x*4+1] = c[1];
2907 out4f[x*4+2] = c[2];
2908 out4f[x*4+3] = c[3];
2912 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2914 int x, startx = span->startx, endx = span->endx;
2915 float c[4], localcolor[4];
2916 localcolor[0] = subcolor[0];
2917 localcolor[1] = subcolor[1];
2918 localcolor[2] = subcolor[2];
2919 localcolor[3] = subcolor[3];
2920 for (x = startx;x < endx;x++)
2922 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2923 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2924 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2925 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2926 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2927 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2928 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2929 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2933 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2935 int x, startx = span->startx, endx = span->endx;
2936 for (x = startx;x < endx;x++)
2938 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2939 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2940 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2941 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2945 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2947 int x, startx = span->startx, endx = span->endx;
2948 for (x = startx;x < endx;x++)
2950 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2951 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2952 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2953 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span op: blends two buffers per pixel over [span->startx, span->endx).
// out4f = ina4f * a + inb4f * b for all four components, where a is one minus
// component 3 of the inb4f pixel.
// NOTE(review): b is presumably component 3 of the inb4f pixel (so the result
// is a standard lerp from ina to inb) - confirm its assignment.
2957 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2959 int x, startx = span->startx, endx = span->endx;
2961 for (x = startx;x < endx;x++)
// weight of the first buffer
2963 a = 1.0f - inb4f[x*4+3];
2965 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2966 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2967 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2968 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2972 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2974 int x, startx = span->startx, endx = span->endx;
2975 float localcolor[4], ilerp, lerp;
2976 localcolor[0] = color[0];
2977 localcolor[1] = color[1];
2978 localcolor[2] = color[2];
2979 localcolor[3] = color[3];
2980 ilerp = 1.0f - localcolor[3];
2981 lerp = localcolor[3];
2982 for (x = startx;x < endx;x++)
2984 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2985 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2986 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2987 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span op: multiplies in4ub by an interpolated vertex attribute and
// writes the result to out4ub (4 bytes per pixel) for [span->startx, span->endx).
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at
// sub-span end points (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or at endx-1),
// converted to integers with a scale of 256, and stepped linearly in between.
// NOTE(review): mod/submod are presumably set from the previous end point
// (endmod/endsubmod) at the top of each sub-span - confirm.
2993 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2997 int startx = span->startx;
2998 int endx = span->endx;
3001 __m128i submod, substep, endsubmod;
3002 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 (1 and 3 stay in place); presumably RGBA -> BGRA
// to match the byte order of the BGRA8 buffers
3003 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3004 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, also kept as integer scaled by 256
3005 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3006 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3007 for (x = startx; x < endx;)
3009 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3010 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end exactly on the final pixel
3011 if (nextsub >= endx)
3013 nextsub = endsub = endx-1;
3014 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the end of this sub-span and the per-pixel integer step
3018 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3019 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3020 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low four 16-bit lanes: modulator for pixel x, high four: pixel x+1
3021 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3022 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 but submod is advanced by a single substep
// per iteration - confirm this is the intended step size.
3023 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high half of each 16-bit lane (value << 8) so the
// high-word multiply below yields (pixel * modulator) >> 8
3025 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3026 pix = _mm_mulhi_epu16(pix, submod);
3027 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3031 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3032 pix = _mm_mulhi_epu16(pix, submod);
3033 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op: writes an interpolated vertex attribute as the output color
// to out4ub (4 bytes per pixel) for [span->startx, span->endx).
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at
// sub-span end points (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or at endx-1),
// converted to integers with a scale of 4095, and stepped linearly in between;
// the final >> 4 brings the value to the 0..255 range before saturating pack.
// NOTE(review): mod/submod are presumably set from the previous end point
// (endmod/endsubmod) at the top of each sub-span - confirm.
3040 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3044 int startx = span->startx;
3045 int endx = span->endx;
3048 __m128i submod, substep, endsubmod;
3049 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 (1 and 3 stay in place); presumably RGBA -> BGRA
// to match the byte order of the BGRA8 buffer
3050 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3051 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, also kept as integer scaled by 4095
3052 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3053 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3054 for (x = startx; x < endx;)
3056 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3057 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end exactly on the final pixel
3058 if (nextsub >= endx)
3060 nextsub = endsub = endx-1;
3061 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the end of this sub-span and the per-pixel integer step
3065 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3066 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3067 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low four 16-bit lanes: value for pixel x, high four: pixel x+1
3068 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3069 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 but submod is advanced by a single substep
// per iteration - confirm this is the intended step size.
3070 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3072 __m128i pix = _mm_srai_epi16(submod, 4);
3073 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3077 __m128i pix = _mm_srai_epi16(submod, 4);
3078 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine for the pixels [span->startx, span->endx):
// out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0)) per byte.
// subcolor is read as four floats; components 0 and 2 are swapped by the
// shuffle (presumably RGBA -> BGRA to match the buffers - confirm).
3085 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3088 int x, startx = span->startx, endx = span->endx;
3089 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replicate the four 16-bit components so they line up with two pixels
3090 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3091 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes (zero-extended)
3093 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3094 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps the bloom contribution at zero
3095 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3096 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3100 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3101 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3102 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3103 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: out4ub = (ina4ub * inb4ub) >> 8 per byte for the pixels
// [span->startx, span->endx) (4 bytes per pixel).
3108 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3111 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3112 for (x = startx;x+2 <= endx;x+=2)
// pix1 is zero-extended; pix2 is placed in the high byte of each lane
// (value << 8) so the high-word multiply gives (pix1 * pix2) >> 8
3114 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3116 pix1 = _mm_mulhi_epu16(pix1, pix2);
3117 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3121 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3122 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3123 pix1 = _mm_mulhi_epu16(pix1, pix2);
3124 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: out4ub = saturate(ina4ub + inb4ub) per byte for the pixels
// [span->startx, span->endx) (4 bytes per pixel).
3129 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3132 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3133 for (x = startx;x+2 <= endx;x+=2)
// widen to 16-bit lanes, add, then pack back with unsigned saturation
3135 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3136 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3137 pix1 = _mm_add_epi16(pix1, pix2);
3138 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3142 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3143 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3144 pix1 = _mm_add_epi16(pix1, pix2);
3145 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: out4ub = saturate(ina4ub + inb4ub * tint) per byte for the
// pixels [span->startx, span->endx) (4 bytes per pixel).
// The tint is read as four floats from inbtintbgra, scaled by 256 and used in
// the order given (no component reordering is applied here).
3150 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3153 int x, startx = span->startx, endx = span->endx;
3154 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// replicate the four 16-bit components so they line up with two pixels
3155 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3156 for (x = startx;x+2 <= endx;x+=2)
// pix2 is placed in the high byte of each lane (value << 8) so the
// high-word multiply gives (tint * pix2) >> 8
3158 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3159 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3160 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3161 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3165 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3166 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3167 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3168 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: blends ina4ub toward inb4ub using byte 3 of each inb4ub pixel
// as the weight, for the pixels [span->startx, span->endx):
// out = ina + ((inb - ina) * weight) / 256, packed with unsigned saturation.
3173 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3176 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3177 for (x = startx;x+2 <= endx;x+=2)
3179 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3180 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth byte) to all four lanes of that pixel
3181 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-word multiply
// yields (difference * weight) >> 8
3182 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3183 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3187 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3188 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3189 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3190 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3191 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: blends in4ub toward a constant color for the pixels
// [span->startx, span->endx): out = in + ((color - in) * weight) / 256,
// where color is the four floats at 'color' scaled by 255 and the weight is
// the fourth component of that scaled color.
// Components 0 and 2 of the color are swapped by the shuffle (presumably
// RGBA -> BGRA to match the buffers - confirm).
3196 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3199 int x, startx = span->startx, endx = span->endx;
3200 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// replicate the four 16-bit components so they line up with two pixels
3201 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast lane 3 of the color to every lane, pre-shifted left by 4 for
// the high-word multiply below
3202 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3203 for (x = startx;x+2 <= endx;x+=2)
3205 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3206 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3207 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
// NOTE(review): presumably guarded so it only runs for a leftover pixel - confirm.
3211 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3212 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3213 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex shader for the generic mode: transform-projects the position array
// with the ModelViewProjectionMatrix uniform, then loads the color and
// texcoord0 arrays; texcoord1 is loaded only for the SPECULAR permutation.
3220 void DPSOFTRAST_VertexShader_Generic(void)
3222 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3223 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3224 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3225 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3226 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3229 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3231 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3232 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3234 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3235 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3236 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3238 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3239 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3240 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3242 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3243 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3246 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3248 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3251 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3253 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3256 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3261 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3262 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the post-process mode: transform-projects the position
// array with the ModelViewProjectionMatrix uniform and sets up two texcoord
// arrays via DPSOFTRAST_Array_Load.
// NOTE(review): the second load pairs TEXCOORD1 with TEXCOORD4; presumably the
// arguments are (output array, input array) - confirm against
// DPSOFTRAST_Array_Load.
3267 void DPSOFTRAST_VertexShader_PostProcess(void)
3269 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3270 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3271 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the post-process mode.  Samples the GL20TU_FIRST texture
// into buffer_FragColorbgra8, optionally adds bloom from GL20TU_SECOND (BLOOM
// permutation, using the BloomColorSubtract uniform), blends toward the
// ViewTintColor uniform, and finishes the span.
// The SATURATION and GAMMARAMPS permutations are checked but not implemented.
3274 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3276 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3277 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3278 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3279 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3280 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// the frame texture is sampled with attribute index 2
3281 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3282 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// the bloom texture is sampled with attribute index 3 and combined in place
3284 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3285 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3287 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3288 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3290 // TODO: implement saturation
3292 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3294 // TODO: implement gammaramps
3296 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the depth/shadow mode: only transform-projects the
// position array with the ModelViewProjectionMatrix uniform.
3301 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3303 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the depth/shadow mode: fills the span's pixels
// [span->startx, span->endx) of a temporary color buffer with zero bytes and
// finishes the span.
3306 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3308 // this is never called (because colormask is off when this shader is used)
3309 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3310 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3311 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
3312 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3313 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the flat-color mode: transform-projects the position
// array with the ModelViewProjectionMatrix uniform and transforms texcoord0
// with the TexMatrix uniform.
3318 void DPSOFTRAST_VertexShader_FlatColor(void)
3320 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3321 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the flat-color mode: output = texture(GL20TU_COLOR) *
// Color_Ambient, with the alpha factor taken from the Alpha uniform.
// Pixels are written straight into the framebuffer row unless alpha test is
// on or the blend mode is not opaque; in that case they go to a temporary
// buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3324 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3327 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's position in color buffer 0 (4 bytes per pixel)
3328 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3329 int x, startx = span->startx, endx = span->endx;
3330 __m128i Color_Ambientm;
3331 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3332 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3333 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3334 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3335 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render to the temp buffer
3336 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3337 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256, components 0 and 2 swapped by the shuffle
3338 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace the fourth lane with the Alpha uniform scaled by 255
3339 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3340 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3341 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3342 for (x = startx;x < endx;x++)
// fast path: four pixels at once when the next four mask bytes are all 1
// NOTE(review): x is presumably advanced past the extra three pixels after
// the store - confirm.
3345 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3348 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each 16-bit lane, so the high-word
// multiply gives (factor * texel) >> 8
3349 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3350 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3351 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3357 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3358 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3359 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temp-buffer case needs the finish pass
3361 if (pixel == buffer_FragColorbgra8)
3362 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the vertex-color mode: transform-projects the position
// array with the ModelViewProjectionMatrix uniform, loads the color array and
// transforms texcoord0 with the TexMatrix uniform.
3368 void DPSOFTRAST_VertexShader_VertexColor(void)
3370 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3371 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3372 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the vertex-color mode:
// output = texture(GL20TU_COLOR) * (Color_Diffuse * vertexcolor + Color_Ambient),
// where vertexcolor is the interpolated DPSOFTRAST_ARRAY_COLOR attribute
// multiplied by the per-pixel value in buffer_z.  The diffuse term has its
// fourth lane cleared; the fourth lane of the ambient term is the Alpha uniform.
// Pixels are written straight into the framebuffer row unless alpha test is
// on or the blend mode is not opaque; in that case they go to a temporary
// buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3375 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3378 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's position in color buffer 0 (4 bytes per pixel)
3379 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3380 int x, startx = span->startx, endx = span->endx;
3381 __m128i Color_Ambientm, Color_Diffusem;
3383 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3384 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3386 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3387 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3388 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render to the temp buffer
3389 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3390 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 (components 0 and 2 swapped), fourth lane
// replaced with the Alpha uniform scaled by 255
3391 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3392 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3393 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3394 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 (components 0 and 2 swapped), fourth lane cleared
3395 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3396 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3397 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3398 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// same component swap for the interpolated color attribute
3399 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3400 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// start the attribute at startx and scale value and step by 4096; with the
// 4096-scaled diffuse color the high-word multiply leaves a 256-scaled
// product that can be added to the 256-scaled ambient color
3401 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3402 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3403 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances one pixel per iteration via the loop increment
3404 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3406 __m128i color, mod, pix;
// fast path: four pixels at once when the next four mask bytes are all 1
// NOTE(review): data is stepped three times inside this path; x is presumably
// advanced by the matching three pixels after the store - confirm.
3407 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3410 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3411 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// mod holds the attribute for pixels x and x+1, mod2 for x+2 and x+3
3412 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3413 data = _mm_add_ps(data, slope);
3414 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3415 data = _mm_add_ps(data, slope);
3416 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3417 data = _mm_add_ps(data, slope);
3418 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, texel held in the high byte
3419 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3420 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3421 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3422 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3423 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3429 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3430 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3431 mod = _mm_packs_epi32(mod, mod);
3432 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3433 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temp-buffer case needs the finish pass
3435 if (pixel == buffer_FragColorbgra8)
3436 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the lightmap mode: transform-projects the position array
// with the ModelViewProjectionMatrix uniform, transforms texcoord0 with the
// TexMatrix uniform and loads texcoord4.
3442 void DPSOFTRAST_VertexShader_Lightmap(void)
3444 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3445 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3446 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the lightmap mode:
// output = color * (Color_Diffuse * lightmap + Color_Ambient), plus
// Color_Glow * glow when the GLOW permutation is active.
// color is sampled from GL20TU_COLOR with texcoord0, lightmap from
// GL20TU_LIGHTMAP with texcoord4, glow from GL20TU_GLOW with texcoord0.
// The diffuse and glow factors have their fourth lane cleared; the fourth lane
// of the ambient factor is the Alpha uniform.
// Pixels are written straight into the framebuffer row unless alpha test is
// on or the blend mode is not opaque; in that case they go to a temporary
// buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3449 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3452 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's position in color buffer 0 (4 bytes per pixel)
3453 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3454 int x, startx = span->startx, endx = span->endx;
3455 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3456 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3457 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3459 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3460 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3461 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3462 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3463 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finish pass, so render to the temp buffer
3464 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3465 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 (components 0 and 2 swapped), fourth lane
// replaced with the Alpha uniform scaled by 255
3466 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3467 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3468 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3469 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 256 (components 0 and 2 swapped), fourth lane cleared
3470 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3471 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3472 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// variant with glow
3473 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3475 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color scaled by 256 (components 0 and 2 swapped), fourth lane cleared
3476 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3477 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3478 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits: ambient factor, high 64 bits: glow factor (single-pixel path)
3479 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3480 for (x = startx;x < endx;x++)
3482 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when the next four mask bytes are all 1
// NOTE(review): x is presumably advanced past the extra three pixels after
// the store - confirm.
3483 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3486 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3487 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3488 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pix covers pixels x and x+1, pix2 covers x+2 and x+3
3489 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3490 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3491 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3492 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3493 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3494 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3495 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color is computed in the low 64 bits and the
// glow term in the high 64 bits of one register, then the halves are added
3501 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3502 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3503 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3504 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3505 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3506 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3511 for (x = startx;x < endx;x++)
3513 __m128i color, lightmap, pix;
// fast path: four pixels at once when the next four mask bytes are all 1
3514 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3517 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3518 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3519 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3520 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3521 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3522 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3523 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3529 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3530 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3531 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3532 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temp-buffer case needs the finish pass
3535 if (pixel == buffer_FragColorbgra8)
3536 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap wrappers that follow
// call these two functions before their definitions appear later in the file.
3541 void DPSOFTRAST_VertexShader_LightDirection(void);
3542 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader: does no work of its own, it only forwards to
// DPSOFTRAST_VertexShader_LightDirection().
3544 void DPSOFTRAST_VertexShader_FakeLight(void)
3546 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader: forwards its three arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which selects its FAKELIGHT
// handling by testing thread->shader_mode.
3549 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3551 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space light-direction-map vertex shader: runs the common
// LightDirection vertex work and then also loads TEXCOORD4.
3556 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3558 DPSOFTRAST_VertexShader_LightDirection();
// TEXCOORD4 is the coordinate set the LightDirection pixel shader uses to
// sample the lightmap and deluxemap textures.
3559 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which branches on
// thread->shader_mode to apply the model-space deluxemap handling.
3562 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3564 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space light-direction-map vertex shader: runs the common
// LightDirection vertex work and then also loads TEXCOORD4.
3569 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3571 DPSOFTRAST_VertexShader_LightDirection();
// TEXCOORD4 is the coordinate set the LightDirection pixel shader uses to
// sample the lightmap and deluxemap textures.
3572 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which branches on
// thread->shader_mode to apply the tangent-space deluxemap handling.
3575 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3577 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader shared by the LightDirection, FakeLight and LightDirectionMap
// modes. For every vertex it expresses the light direction and the
// vertex-to-eye vector in the basis formed by TEXCOORD1/2/3 (svector, tvector,
// normal), stores them in TEXCOORD5 and TEXCOORD6 for the pixel shader, and
// finally transforms/projects the positions.
// Takes no parameters and returns nothing; all inputs and outputs go through
// the global dpsoftrast state (uniform4f, post_array4f, numvertices).
3582 void DPSOFTRAST_VertexShader_LightDirection(void)
3585 int numvertices = dpsoftrast.numvertices;
3587 float LightVector[4];
3588 float EyePosition[4];
3589 float EyeVectorModelSpace[4];
// Copy the two uniforms used below into locals (all four components, although
// only the first three are used in the per-vertex math).
3595 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3596 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3597 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3598 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3599 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3600 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3601 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3602 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays read from post_array4f in the loop below.
// TEXCOORD0 goes through the TexMatrixM1 uniform; the others are loaded as-is.
// NOTE(review): presumably Array_Load/Array_Transform fill post_array4f from
// the input arrays - confirm against their definitions.
3603 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3604 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3605 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3606 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3607 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3608 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its basis vectors:
// TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal.
3610 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3611 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3612 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3613 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3614 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3615 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3616 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3617 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3618 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3619 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3620 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3621 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction in the (s, t, normal) basis: one dot product per axis.
3622 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3623 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3624 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// Output in TEXCOORD5 with the fourth component zeroed.
3625 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3626 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3627 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3628 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye position, then the same change of basis.
3629 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3630 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3631 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3632 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3633 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3634 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Output in TEXCOORD6 with the fourth component zeroed.
3635 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3636 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3637 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3638 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Done last: the loop above needs the untransformed positions.
3640 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helper macros used by the pixel shaders below.
// Min/Max evaluate their arguments more than once, so avoid passing
// expressions with side effects. The Vector3* macros index elements [0]..[2]
// of their arguments; Length goes through sqrt().
3643 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3644 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3645 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3646 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3647 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line statement macro; it starts by computing the vector's length.
// NOTE(review): presumably it then scales v to unit length in place - the
// remainder of the macro body should be checked before relying on that.
3648 #define DPSOFTRAST_Vector3Normalize(v)\
3651 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, FakeLight and LightDirectionMap
// (model-space / tangent-space) modes; the wrappers for those modes call this
// function and it branches on thread->shader_mode.
//
// For each pixel x in [span->startx, span->endx) it writes a BGRA8 color into
// buffer_FragColorbgra8 and then hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8().
//
// Parameters:
//   thread   - supplies uniform4f, shader_permutation, shader_mode and
//              shader_exactspecularmath.
//   triangle - passed through to the span/texture/attribute helpers.
//   span     - provides startx/endx and is passed through to the helpers.
//
// Three lighting paths are chosen from thread->shader_permutation:
//   SPECULAR - ambient + diffuse + specular (+ glow), with colormapping;
//   DIFFUSE  - ambient + diffuse (+ glow);
//   neither  - ambient (+ glow) only.
3662 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one z value and one BGRA8 texel per pixel.
3664 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3665 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674 int x, startx = span->startx, endx = span->endx;
3675 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Interpolated attributes are kept as a (data, slope) pair and evaluated
// per pixel as data[i] + slope[i] * x.
3676 float LightVectordata[4];
3677 float LightVectorslope[4];
3678 float EyeVectordata[4];
3679 float EyeVectorslope[4];
3680 float VectorSdata[4];
3681 float VectorSslope[4];
3682 float VectorTdata[4];
3683 float VectorTslope[4];
3684 float VectorRdata[4];
3685 float VectorRslope[4];
3687 float diffusetex[4];
3689 float surfacenormal[4];
3690 float lightnormal[4];
3691 float lightnormal_modelspace[4];
3693 float specularnormal[4];
3696 float SpecularPower;
// Color uniforms are stored with their first three components reversed
// (uniform [0] -> index 2, [2] -> index 0) so that index i lines up with byte
// i of the *bgra8 buffers (presumably the uniforms are in RGB order).
3698 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3699 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3700 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3701 Color_Glow[3] = 0.0f;
3702 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3703 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3704 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient alpha slot carries the Alpha uniform; it scales the output alpha.
3705 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3706 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3707 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3708 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3709 Color_Pants[3] = 0.0f;
3710 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3711 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3712 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3713 Color_Shirt[3] = 0.0f;
// Sample the textures needed by every path; optional ones only when their
// permutation bit is set. All of these use TEXCOORD0.
3714 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3715 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3718 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3719 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3721 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular) ----
3725 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3727 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3728 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3729 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3730 Color_Diffuse[3] = 0.0f;
// Default light color from the uniform; the lightmap and FakeLight modes
// overwrite the first three components per pixel inside the loop.
3731 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3732 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3733 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3734 LightColor[3] = 0.0f;
3735 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3736 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3737 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3738 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3739 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the raw 0..255 gloss
// alpha byte to form the pow() exponent.
3740 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector comes from TEXCOORD6.
3741 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3742 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup: model-space needs the S/T/R basis (TEXCOORD1..3) plus
// lightmap and deluxemap samples (TEXCOORD4); tangent-space needs only the
// two texture samples.
3744 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3746 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3747 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3748 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3749 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3750 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3752 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3754 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3757 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3759 // nothing of this needed
// The light vector comes from TEXCOORD5 (presumably this is the final else
// branch, i.e. plain light-direction mode).
3763 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3766 for (x = startx;x < endx;x++)
// Diffuse texel as floats in the 0..255 range.
3769 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3770 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3771 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3772 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the pants/shirt layers tinted by their colors. The alpha
// terms add nothing because Color_Pants[3] and Color_Shirt[3] are 0.
3773 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3775 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3776 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3777 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3778 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3780 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3781 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3782 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3783 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte * (1/128) - 1, taking x from byte 2 and z from
// byte 0 (reversed byte order), then normalize.
3784 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3785 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3786 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3787 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Determine lightnormal (and, for the lightmapped/fake modes, LightColor).
3789 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3791 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3792 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3793 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3794 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3796 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3797 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3798 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3799 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3801 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3802 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3803 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3804 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3806 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3807 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3808 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3809 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3811 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3812 DPSOFTRAST_Vector3Normalize(lightnormal);
3814 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes scaled by 1/256 and divided by lightnormal z, with that
// divisor floored at 0.25 (so the boost is at most 4x).
3816 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3817 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3818 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3819 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded vector is used directly as lightnormal
// (no normalize call is visible in this branch); lightmap scaled by 1/256.
3822 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3824 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3825 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3826 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3828 float f = 1.0f / 256.0f;
3829 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3830 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3831 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the light direction is the normalized eye vector and the light
// color is white.
// NOTE(review): z is presumably this pixel's value from buffer_z - confirm.
3834 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3836 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3837 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3838 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3839 DPSOFTRAST_Vector3Normalize(lightnormal);
3841 LightColor[0] = 1.0;
3842 LightColor[1] = 1.0;
3843 LightColor[2] = 1.0;
// Plain light-direction mode (presumably the final else): normalized
// interpolated light vector; LightColor keeps the uniform value set above.
3847 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3848 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3849 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3850 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped below at zero.
3853 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, exact variant: reflect the light vector about the surface
// normal and dot the result with the normalized eye vector.
3855 if(thread->shader_exactspecularmath)
3857 // reflect lightnormal at surfacenormal, take the negative of that
3858 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3860 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3861 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3862 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3863 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3865 // dot of this and normalize(EyeVectorFogDepth.xyz)
3866 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3867 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3868 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3869 DPSOFTRAST_Vector3Normalize(eyenormal);
3871 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Specular term, approximate variant: normalize(light + eye) is the half
// vector, which is dotted with the surface normal.
3875 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3876 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3877 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3878 DPSOFTRAST_Vector3Normalize(eyenormal);
3880 specularnormal[0] = lightnormal[0] + eyenormal[0];
3881 specularnormal[1] = lightnormal[1] + eyenormal[1];
3882 specularnormal[2] = lightnormal[2] + eyenormal[2];
3883 DPSOFTRAST_Vector3Normalize(specularnormal);
3885 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform * gloss alpha byte / 255.
3888 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine: [glow +] ambient + (diffuse + specular) * light color, per channel.
// Values are clamped to at most 255 (no lower clamp is applied here).
// Alpha is the diffuse texel alpha times Color_Ambient[3] (the Alpha uniform).
3889 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3891 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3892 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3893 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3894 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3898 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3899 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3900 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3901 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3904 buffer_FragColorbgra8[x*4+0] = d[0];
3905 buffer_FragColorbgra8[x*4+1] = d[1];
3906 buffer_FragColorbgra8[x*4+2] = d[2];
3907 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation (ambient + diffuse, no specular) ----
// Same light-direction handling as path 1, but no gloss/specular inputs and
// no colormapping step in this loop.
3910 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3912 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3913 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3914 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3915 Color_Diffuse[3] = 0.0f;
3916 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3917 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3918 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3919 LightColor[3] = 0.0f;
3920 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3922 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3924 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3925 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3926 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3927 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3928 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3930 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3932 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Unlike path 1, the eye vector is only needed here for FAKELIGHT, where it
// doubles as the light direction.
3935 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3937 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3941 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3944 for (x = startx;x < endx;x++)
3947 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3948 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3949 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3950 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal-map texel (reversed byte order, as above).
3951 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3952 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3953 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3954 DPSOFTRAST_Vector3Normalize(surfacenormal);
3956 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3958 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3959 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3960 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3961 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3963 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3964 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3965 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3966 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3968 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3969 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3970 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3971 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3973 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3974 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3975 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3976 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3978 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3979 DPSOFTRAST_Vector3Normalize(lightnormal);
3981 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap scaled by 1/256 and divided by lightnormal z (floored at 0.25).
3983 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3984 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3985 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3986 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3989 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3991 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3992 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3993 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3995 float f = 1.0f / 256.0f;
3996 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3997 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3998 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: normalized eye vector as light direction, white light color.
4001 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4003 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4004 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4005 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4006 DPSOFTRAST_Vector3Normalize(lightnormal);
4008 LightColor[0] = 1.0;
4009 LightColor[1] = 1.0;
4010 LightColor[2] = 1.0;
// Plain light-direction mode (presumably the final else branch).
4014 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4015 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4016 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4017 DPSOFTRAST_Vector3Normalize(lightnormal);
// Combine: [glow +] diffuse texel * (ambient + diffuse * N.L * light color).
// Values are clamped to at most 255 only.
4020 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4021 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4023 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4024 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4025 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4026 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4030 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4031 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4032 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4033 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4035 buffer_FragColorbgra8[x*4+0] = d[0];
4036 buffer_FragColorbgra8[x*4+1] = d[1];
4037 buffer_FragColorbgra8[x*4+2] = d[2];
4038 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ----
// Presumably the final else branch, taken when neither SPECULAR nor DIFFUSE is
// set. No normal map or light terms: output is [glow +] diffuse texel * ambient.
4043 for (x = startx;x < endx;x++)
4046 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4047 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4048 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4049 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4051 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4053 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4054 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4055 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4056 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4060 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4061 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4062 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4063 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4065 buffer_FragColorbgra8[x*4+0] = d[0];
4066 buffer_FragColorbgra8[x*4+1] = d[1];
4067 buffer_FragColorbgra8[x*4+2] = d[2];
4068 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colors to the common span finisher.
4071 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// LightSource vertex shader. For every vertex it computes the vertex-to-light
// and vertex-to-eye vectors, expresses both in the basis formed by
// TEXCOORD1/2/3 (svector, tvector, normal), and writes them back into
// TEXCOORD1 and TEXCOORD2 respectively, replacing the svector/tvector inputs.
// Positions are transformed/projected at the end.
// Takes no parameters and returns nothing; all inputs and outputs go through
// the global dpsoftrast state (uniform4f, post_array4f, numvertices).
4076 void DPSOFTRAST_VertexShader_LightSource(void)
4079 int numvertices = dpsoftrast.numvertices;
4080 float LightPosition[4];
4081 float LightVector[4];
4082 float LightVectorModelSpace[4];
4083 float EyePosition[4];
4084 float EyeVectorModelSpace[4];
// Copy the light and eye position uniforms into locals (all four components,
// although only the first three are used in the per-vertex math).
4090 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4091 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4092 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4093 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4094 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4095 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4096 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4097 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays. TEXCOORD0 goes through the TexMatrixM1
// uniform; TEXCOORD4 is loaded as well but not read in the loop below.
// NOTE(review): presumably Array_Load/Array_Transform fill post_array4f from
// the input arrays - confirm against their definitions.
4098 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4099 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4100 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4101 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4102 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4103 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4104 for (i = 0;i < numvertices;i++)
// Read position and basis vectors into locals first: TEXCOORD1 and TEXCOORD2
// are overwritten with the results further down.
4106 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4107 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4108 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4109 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4110 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4111 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4112 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4113 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4114 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4115 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4116 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4117 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, then one dot product per basis axis.
4118 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4119 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4120 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4121 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4122 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4123 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// Output the light vector in TEXCOORD1 with the fourth component zeroed.
4124 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4125 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4126 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4127 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, same change of basis (uses the local
// copies of svector/tvector, not the just-overwritten TEXCOORD1).
4128 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4129 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4130 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4131 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4132 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4133 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Output the eye vector in TEXCOORD2 with the fourth component zeroed.
4134 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4135 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4136 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4137 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Done last: the loop above needs the untransformed positions.
4139 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4140 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the LightSource mode.
// Shades the pixels startx..endx-1 of one span into buffer_FragColorbgra8 and
// hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8. Depending on
// thread->shader_permutation one of three loops runs: SPECULAR (ambient +
// diffuse + specular), DIFFUSE (ambient + diffuse), or a last loop that only
// uses the ambient term.
// NOTE(review): this listing does not show every line of the function (braces,
// the declarations of z, attenuation, diffuse, specular, glosstex, eyenormal,
// f and d, and the statements guarded by the attenuation tests are not
// visible); the comments below describe only what can be seen here.
4143 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: one float per pixel for buffer_z, four bytes per pixel for the BGRA8 buffers
4146 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4147 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4150 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154 int x, startx = span->startx, endx = span->endx;
// local copies of the color uniforms
4155 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation base/slope pairs filled in by DPSOFTRAST_CALCATTRIB4F below
4156 float CubeVectordata[4];
4157 float CubeVectorslope[4];
4158 float LightVectordata[4];
4159 float LightVectorslope[4];
4160 float EyeVectordata[4];
4161 float EyeVectorslope[4];
// per-pixel working values
4163 float diffusetex[4];
4165 float surfacenormal[4];
4166 float lightnormal[4];
4168 float specularnormal[4];
4171 float SpecularPower;
4172 float CubeVector[4];
// Color uniforms: components 0,1,2 of each uniform are stored at indices 2,1,0,
// i.e. in reversed channel order (presumably to line up with the BGRA byte
// order of the texture buffers they are multiplied with - confirm).
4175 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4176 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4177 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4178 Color_Glow[3] = 0.0f;
4179 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4180 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4181 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the Alpha uniform rather than from Color_Ambient
4182 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4183 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4184 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4185 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4186 Color_Diffuse[3] = 0.0f;
4187 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4188 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4189 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4190 Color_Specular[3] = 0.0f;
4191 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4192 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4193 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4194 Color_Pants[3] = 0.0f;
4195 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4196 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4197 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4198 Color_Shirt[3] = 0.0f;
4199 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4200 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4201 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4202 LightColor[3] = 0.0f;
// SpecularPower is pre-divided by 255 because it is later multiplied by glosstex[3], which is loaded from a byte of the gloss buffer
4203 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// light vector comes from TEXCOORD1, eye vector from TEXCOORD2, cube vector from TEXCOORD3
4204 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4205 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4206 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4207 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4208 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// color texture is always sampled; pants/shirt and cube textures only for their permutations
4209 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4210 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4212 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4216 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- variant 1: SPECULAR permutation (needs the normal and gloss textures) ----
4217 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4219 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4221 for (x = startx;x < endx;x++)
// attenuation is 1 minus the squared length of the interpolated cube vector
// NOTE(review): the assignment of z is not visible in this listing - presumably z = buffer_z[x]; confirm
4224 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4225 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4226 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4227 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by each attenuation test is not visible here; presumably the pixel is skipped
// (left at the zero written by the memset above) - confirm
4228 if (attenuation < 0.01f)
4230 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4232 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4233 if (attenuation < 0.01f)
// base color as floats in byte range, with the tinted pants/shirt layers added for colormapped surfaces
4237 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4238 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4239 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4240 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4241 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4243 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4244 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4245 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4246 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4248 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4249 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4250 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4251 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: bytes 2,1,0 become x,y,z, each mapped as byte/128 - 1, then normalized
4252 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4253 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4254 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4255 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated, normalized light direction
4257 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4258 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4259 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4260 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surface normal, light direction), clamped at zero
4262 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact specular: dot of the eye direction with the light direction reflected about the surface normal
4264 if(thread->shader_exactspecularmath)
4266 // reflect lightnormal at surfacenormal, take the negative of that
4267 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4269 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4270 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4271 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4272 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4274 // dot of this and normalize(EyeVectorFogDepth.xyz)
4275 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4276 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4277 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4278 DPSOFTRAST_Vector3Normalize(eyenormal);
4280 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// otherwise (presumably the else branch - the else line is not visible here): half-vector specular,
// dot of the surface normal with normalize(light direction + eye direction)
4284 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4285 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4286 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4287 DPSOFTRAST_Vector3Normalize(eyenormal);
4289 specularnormal[0] = lightnormal[0] + eyenormal[0];
4290 specularnormal[1] = lightnormal[1] + eyenormal[1];
4291 specularnormal[2] = lightnormal[2] + eyenormal[2];
4292 DPSOFTRAST_Vector3Normalize(specularnormal);
4294 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// the exponent is the (pre-scaled) SpecularPower times the gloss texture's fourth byte
4296 specular = pow(specular, SpecularPower * glosstex[3]);
// final color per channel: (diffusetex * (ambient + diffuse color * diffuse) + glosstex * specular color * specular)
// * light color * attenuation, additionally multiplied by the cube texture byte when CUBEFILTER is set;
// results are clamped to 255 from above only
4298 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4300 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4301 attenuation *= (1.0f / 255.0f);
4302 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4303 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4304 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4305 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube texture factor (presumably the else branch of the CUBEFILTER test)
4309 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4310 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4311 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4312 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// store the shaded pixel; the fourth byte carries the (clamped) diffuse texture alpha
4314 buffer_FragColorbgra8[x*4+0] = d[0];
4315 buffer_FragColorbgra8[x*4+1] = d[1];
4316 buffer_FragColorbgra8[x*4+2] = d[2];
4317 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 2: DIFFUSE permutation (normal texture only, no specular term) ----
4320 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4322 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4323 for (x = startx;x < endx;x++)
// attenuation and shadow test, same sequence as in the specular loop
4326 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4327 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4328 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4329 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4330 if (attenuation < 0.01f)
4332 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4334 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4335 if (attenuation < 0.01f)
4339 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4340 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4341 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4342 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4343 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4345 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4346 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4347 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4348 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4350 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4351 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4352 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4353 DPSOFTRAST_Vector3Normalize(surfacenormal);
4355 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4356 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4357 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4358 DPSOFTRAST_Vector3Normalize(lightnormal);
4360 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// final color per channel: diffusetex * (ambient + diffuse color * diffuse) * light color * attenuation [* cube byte]
4361 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4363 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4364 attenuation *= (1.0f / 255.0f);
4365 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4366 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4367 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4368 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4372 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4373 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4374 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4375 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4377 buffer_FragColorbgra8[x*4+0] = d[0];
4378 buffer_FragColorbgra8[x*4+1] = d[1];
4379 buffer_FragColorbgra8[x*4+2] = d[2];
4380 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 3: neither SPECULAR nor DIFFUSE (presumably the final else branch): ambient term only ----
4385 for (x = startx;x < endx;x++)
4388 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4389 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4390 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4391 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4392 if (attenuation < 0.01f)
4394 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4396 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4397 if (attenuation < 0.01f)
4401 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4402 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4403 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4404 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4405 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4407 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4408 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4409 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4410 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// final color per channel: diffusetex * ambient * light color * attenuation [* cube byte]
4412 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4414 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4415 attenuation *= (1.0f / 255.0f);
4416 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4417 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4418 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4419 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4423 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4424 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4425 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4426 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4428 buffer_FragColorbgra8[x*4+0] = d[0];
4429 buffer_FragColorbgra8[x*4+1] = d[1];
4430 buffer_FragColorbgra8[x*4+2] = d[2];
4431 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colors to the span finisher
4434 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4440 void DPSOFTRAST_VertexShader_Refraction(void)
4442 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4443 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4444 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4447 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4449 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4451 int x, startx = span->startx, endx = span->endx;
4454 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4455 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4458 float ModelViewProjectionPositiondata[4];
4459 float ModelViewProjectionPositionslope[4];
4462 float ScreenScaleRefractReflect[2];
4463 float ScreenCenterRefractReflect[2];
4464 float DistortScaleRefractReflect[2];
4465 float RefractColor[4];
4467 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4468 if(!texture) return;
4471 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4472 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4475 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4478 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4479 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4480 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4481 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4482 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4483 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4484 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4485 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4486 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4487 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4490 for (x = startx;x < endx;x++)
4492 float SafeScreenTexCoord[2];
4493 float ScreenTexCoord[2];
4500 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4501 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4503 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4504 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4505 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4507 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4508 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4509 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4510 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4511 DPSOFTRAST_Vector3Normalize(v);
4512 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4513 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4515 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4516 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4518 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4519 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4520 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4521 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4524 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4529 void DPSOFTRAST_VertexShader_Water(void)
4532 int numvertices = dpsoftrast.numvertices;
4533 float EyePosition[4];
4534 float EyeVectorModelSpace[4];
4540 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4541 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4542 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4543 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4544 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4545 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4546 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4547 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4548 for (i = 0;i < numvertices;i++)
4550 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4551 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4552 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4553 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4554 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4555 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4556 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4557 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4558 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4559 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4560 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4561 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4562 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4563 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4564 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4565 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4566 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4567 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4568 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4569 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4570 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4571 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4573 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4574 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4575 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4579 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4581 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4583 int x, startx = span->startx, endx = span->endx;
4586 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4587 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4590 float ModelViewProjectionPositiondata[4];
4591 float ModelViewProjectionPositionslope[4];
4592 float EyeVectordata[4];
4593 float EyeVectorslope[4];
4596 float ScreenScaleRefractReflect[2];
4597 float ScreenCenterRefractReflect[2];
4598 float DistortScaleRefractReflect[2];
4599 float RefractColor[4];
4600 float ReflectColor[4];
4601 float ReflectFactor;
4602 float ReflectOffset;
4604 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4605 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4606 if(!texture_refraction || !texture_reflection) return;
4609 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4610 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4613 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4614 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4617 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4618 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4619 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4620 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4621 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4622 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4623 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4624 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4625 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4626 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4627 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4628 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4629 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4630 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4631 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4632 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4633 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4634 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4635 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4636 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4637 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4638 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4641 for (x = startx;x < endx;x++)
4643 float SafeScreenTexCoord[4];
4644 float ScreenTexCoord[4];
4647 unsigned char c1[4];
4648 unsigned char c2[4];
4653 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4654 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4656 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4657 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4658 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4659 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4660 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4662 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4663 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4664 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4665 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4666 DPSOFTRAST_Vector3Normalize(v);
4667 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4668 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4669 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4670 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4672 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4673 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4674 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4675 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4676 DPSOFTRAST_Vector3Normalize(v);
4677 Fresnel = 1.0f - v[2];
4678 Fresnel = min(1.0f, Fresnel);
4679 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4681 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4682 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4683 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4684 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4686 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4687 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4688 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4689 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4692 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Runs the position array through DPSOFTRAST_Array_TransformProject in place
// (input and output are both DPSOFTRAST_ARRAY_POSITION), using the matrix
// stored at the ModelViewProjectionMatrixM1 uniform slot.
4697 void DPSOFTRAST_VertexShader_ShowDepth(void)
4699 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode.
// thread, triangle and span are forwarded unchanged to the span helpers.
// The colour bytes for pixels [span->startx, span->endx) are zero-filled
// (4 bytes per pixel) and then handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): this looks like a placeholder - no depth-derived colour is
// produced by the statements here; confirm before relying on this mode.
4702 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest span a single call may receive
4705 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4706 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4707 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// output colour is all zero bytes for the covered pixel range
4708 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4709 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Transforms/projects DPSOFTRAST_ARRAY_POSITION in place with the matrix
// stored at the ModelViewProjectionMatrixM1 uniform slot; no other vertex
// array is touched here.
4714 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4716 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Draw_Span_Begin with a local z buffer, zero-fills the
// BGRA8 output for pixels [span->startx, span->endx) and passes it to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): appears to be a stub - nothing geometry-related is written
// by the statements here; confirm.
4719 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4722 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4723 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4724 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all cleared to zero
4725 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4726 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Only the position array is processed: it is transformed/projected in place
// with the matrix at the ModelViewProjectionMatrixM1 uniform slot.
4731 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4733 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode.
// Same shape as the other stub span shaders: begin the span, clear the BGRA8
// output for pixels [span->startx, span->endx) to zero, finish the span.
// NOTE(review): no lighting is evaluated by the statements here - presumably
// unimplemented; confirm.
4736 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4739 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4740 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4741 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero colour for every covered pixel (4 bytes each)
4742 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4743 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; one entry per mode lives in
// DPSOFTRAST_ShaderModeTable and is indexed by the current shader_mode.
4748 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; invoked with no arguments by DPSOFTRAST_DrawTriangles
4751 void (*Vertex)(void);
// span (pixel) stage; invoked once per surviving span by DPSOFTRAST_Draw_ProcessSpans
4752 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices used by this mode; consumers test each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table marks the end of the list with ~0
4753 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by this mode; consumers test each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table marks the end of the list with ~0
4754 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4756 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with one row per shader mode (SHADERMODE_COUNT rows), indexed
// by shader_mode. Each row gives: a leading integer, the vertex stage, the span
// stage, the ~0-terminated list of vertex arrays, and the ~0-terminated list of
// texture units.
// NOTE(review): the leading integer is presumably the lodarrayindex member that
// DPSOFTRAST_Interpret_Draw compares against - confirm against the struct.
// NOTE(review): the last three rows (ShowDepth, DeferredGeometry,
// DeferredLightSource) supply no texunits initializer, so C zero-fills that
// member instead of terminating it with ~0 as the Depth_Or_Shadow row does.
// Consumers that stop at an entry >= DPSOFTRAST_MAXTEXTUREUNITS would then see
// texture unit 0 in every slot - verify whether "{~0}, {~0}" was intended.
4758 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4760 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4761 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4762 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4763 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4764 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4765 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4766 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4767 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4768 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4769 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4770 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4771 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4772 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4773 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4774 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4775 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel visibility mask for one span.
//   thread - supplies the depth test state (depthtest, fb_depthfunc), the
//            triangle list and the pixelmaskarray scratch buffer
//   span   - span to test; its pixelmask and startx members are written back
// When depth testing is enabled and a depth buffer exists, every pixel's
// stored depth is compared with the span's linearly stepped depth value using
// the operator selected by fb_depthfunc. Otherwise every pixel in the span is
// marked visible.
4778 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4783 unsigned int *depthpixel;
4787 unsigned char *pixelmask;
4788 DPSOFTRAST_State_Triangle *triangle;
4789 triangle = &thread->triangles[span->triangle];
// depthpixel is based at the span's own (x, y) position in the depth buffer,
// so it is indexed below with span-relative x values
4790 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4791 startx = span->startx;
4793 depth = span->depthbase;
4794 depthslope = span->depthslope;
4795 pixelmask = thread->pixelmaskarray;
4796 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4798 switch(thread->fb_depthfunc)
// each case walks x over [startx, endx) while stepping d by depthslope;
// the comparison is written as "stored depth <op> span depth"
4801 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4802 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4803 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4804 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4805 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4806 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4807 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// tighten the span: scan past masked-out pixels at the left end...
4809 while (startx < endx && !pixelmask[startx])
// ...and at the right end
4811 while (endx > startx && !pixelmask[endx-1])
4816 // no depth testing means we're just dealing with color...
4817 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the range start on the span
4819 span->pixelmask = pixelmask;
4820 span->startx = startx;
// Depth-buffer update pass for one span.
//   thread - supplies the depthmask and depthtest state
//   span   - supplies depthbase/depthslope, pixelmask and the pixel range
// Does nothing unless depth writes (depthmask) and depth testing are both
// enabled and a depth buffer is present.
// NOTE(review): presumably only pixels still set in span->pixelmask receive
// the interpolated depth - confirm against the loop body.
4824 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4826 int x, d, depth, depthslope, startx, endx;
4827 const unsigned char *pixelmask;
4828 unsigned int *depthpixel;
4829 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4831 depth = span->depthbase;
4832 depthslope = span->depthslope;
4833 pixelmask = span->pixelmask;
4834 startx = span->startx;
// depth row pointer based at the span's own (x, y), indexed with span-relative x
4836 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// d starts at the depth for startx and advances by depthslope per pixel
4837 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes every span queued on the thread, then empties the span queue.
// Per span: depth test, span shader for the current shader_mode (only when
// colour target 0 exists and fb_colormask is non-zero), then depth write.
//   thread - owner of the spans[] / triangles[] queues being processed
4843 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4846 DPSOFTRAST_State_Triangle *triangle;
4847 DPSOFTRAST_State_Span *span;
4848 for (i = 0; i < thread->numspans; i++)
4850 span = &thread->spans[i];
4851 triangle = &thread->triangles[span->triangle];
// may narrow span->startx and sets span->pixelmask
4852 DPSOFTRAST_Draw_DepthTest(thread, span);
// span has no pixels left after the depth test
4853 if (span->startx >= span->endx)
4855 // run pixel shader if appropriate
4856 // do this before running depthmask code, to allow the pixelshader
4857 // to clear pixelmask values for alpha testing
4858 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4859 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4860 DPSOFTRAST_Draw_DepthWrite(thread, span);
// all queued spans consumed
4862 thread->numspans = 0;
// Command record for opcode 22 (Draw); the member list is handed to DEFCOMMAND.
// refcount is an ATOMIC_COUNTER because every thread releases the command once.
4865 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);

// Executes one queued Draw command on behalf of a single thread.
//   thread  - rasterizer thread state (scissor, row bands, span/triangle queues)
//   command - the Draw command; its vertex data lives in command->arrays
// For every triangle: clip against the near plane, compute screen bounds,
// derive the interpolation plane equations for w and for each vertex array
// used by the current shader mode, pick a mip level per texture unit, and
// emit spans for the rows inside this thread's two row bands.
// Finally the thread drops its reference on the command; the thread that
// brings refcount to zero frees command->arrays when the data was allocated
// separately from the command.
4867 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4870 int cullface = thread->cullface;
4871 int minx, maxx, miny, maxy;
4872 int miny1, maxy1, miny2, maxy2;
4873 __m128i fbmin, fbmax;
4874 __m128 viewportcenter, viewportscale;
4875 int firstvertex = command->firstvertex;
4876 int numvertices = command->numvertices;
4877 int numtriangles = command->numtriangles;
4878 const int *element3i = command->element3i;
4879 const unsigned short *element3s = command->element3s;
4880 int clipped = command->clipped;
4887 int starty, endy, bandy;
4891 float clip0origin, clip0slope;
4893 __m128 triangleedge1, triangleedge2, trianglenormal;
4896 DPSOFTRAST_State_Triangle *triangle;
4897 DPSOFTRAST_Texture *texture;
4898 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical limits come from the scissor rectangle; the thread's two row bands
// (miny1..maxy1 and miny2..maxy2) are clamped to those limits
4899 miny = thread->fb_scissor[1];
4900 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4901 miny1 = bound(miny, thread->miny1, maxy);
4902 maxy1 = bound(miny, thread->maxy1, maxy);
4903 miny2 = bound(miny, thread->miny2, maxy);
4904 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither of this thread's bands
4905 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// release this thread's reference; a zero result means no thread still uses the command
4907 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare aligned struct has its vertex data in a separate allocation
4909 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4910 MM_FREE(command->arrays);
4914 minx = thread->fb_scissor[0];
4915 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// 16-bit (x, y) limits replicated across the register for the packed min/max below;
// fbmax is inclusive, hence the subtraction of 1
4916 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4917 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4918 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4919 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4920 screen[3] = _mm_setzero_ps();
4921 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4922 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen-space float4 entries; the
// block of numvertices float4 entries that follows is what "arrays" points at
4924 const float *screencoord4f = command->arrays;
4925 const float *arrays = screencoord4f + numvertices*4;
4927 // generate the 3 edges of this triangle
4928 // generate spans for the triangle - switch based on left split or right split classification of triangle
// element indices are rebased so that firstvertex maps to entry 0 (16-bit list)
4931 e[0] = element3s[i*3+0] - firstvertex;
4932 e[1] = element3s[i*3+1] - firstvertex;
4933 e[2] = element3s[i*3+2] - firstvertex;
// same rebasing for the 32-bit index list
4937 e[0] = element3i[i*3+0] - firstvertex;
4938 e[1] = element3i[i*3+1] - firstvertex;
4939 e[2] = element3i[i*3+2] - firstvertex;
// helper macros used by the clipping code below; they rely on the local
// variables screen[], clipfrac[], clipdist[], e[] and arrays
4948 #define SKIPBACKFACE \
4949 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4950 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4951 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4952 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4953 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4957 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4961 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4966 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4967 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4969 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4970 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4972 #define CLIPPEDVERTEXCOPY(k,p1) \
4973 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4975 #define GENATTRIBCOPY(attrib, p1) \
4976 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4977 #define GENATTRIBLERP(attrib, p1, p2) \
4979 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4980 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4982 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4986 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4987 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4988 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4989 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4990 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4991 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4992 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4998 // calculate distance from nearplane
4999 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5000 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5001 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// the nested tests on the sign of clipdist[0..2] choose which corners are
// copied as-is and which are replaced by points interpolated along an edge
5002 if (clipdist[0] >= 0.0f)
5004 if (clipdist[1] >= 0.0f)
5006 if (clipdist[2] >= 0.0f)
5009 // triangle is entirely in front of nearplane
5010 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5017 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5025 if (clipdist[2] >= 0.0f)
5027 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5034 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5041 else if (clipdist[1] >= 0.0f)
5043 if (clipdist[2] >= 0.0f)
5045 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5052 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5058 else if (clipdist[2] >= 0.0f)
5060 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5065 else continue; // triangle is entirely behind nearplane
5068 // calculate integer y coords for triangle points
5069 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5070 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5071 screenmin = _mm_min_epi16(screeni, screenir),
5072 screenmax = _mm_max_epi16(screeni, screenir);
// finish the horizontal reduction, then clamp the bounds to fbmin/fbmax
5073 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5074 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5075 screenmin = _mm_max_epi16(screenmin, fbmin);
5076 screenmax = _mm_min_epi16(screenmax, fbmax);
5077 // skip offscreen triangles
5078 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit element 1 holds the y bound; endy is exclusive
5080 starty = _mm_extract_epi16(screenmin, 1);
5081 endy = _mm_extract_epi16(screenmax, 1)+1;
// triangle rows lie entirely between the end of band 1 and the start of band 2
5082 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps the upper 16 bits (the y value) of each packed (x, y) pair
5084 screeny = _mm_srai_epi32(screeni, 16);
5087 triangle = &thread->triangles[thread->numtriangles];
5089 // calculate attribute plans for triangle data...
5090 // okay, this triangle is going to produce spans, we'd better project
5091 // the interpolants now (this is what gives perspective texturing),
5092 // this consists of simply multiplying all arrays by the W coord
5093 // (which is basically 1/Z), which will be undone per-pixel
5094 // (multiplying by Z again) to get the perspective-correct array
5097 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5098 __m128 mipedgescale, mipdensity;
// edge vectors divided by the scalar held in element 0 of trianglenormal
5099 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5100 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5101 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5102 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5103 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast element 3 (w) of each screen-space corner
5104 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5105 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5106 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5107 attribedge1 = _mm_sub_ss(w0, w1);
5108 attribedge2 = _mm_sub_ss(w2, w1);
5109 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5110 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5111 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5112 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5113 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane equation for w: w(x, y) = w[2] + x*w[0] + y*w[1]
5114 _mm_store_ss(&triangle->w[0], attribxslope);
5115 _mm_store_ss(&triangle->w[1], attribyslope);
5116 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only evaluated when one of the first three plane coefficients is non-zero
5121 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5123 float cliporigin, clipxslope, clipyslope;
// same plane fit as above, applied to element 2 of the screen-space corners
5124 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5125 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5126 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5127 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5128 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5129 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5130 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5131 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5132 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// clip0origin/clip0slope describe, per row, the x position at which the clip expression crosses zero
5135 clip0origin = -cliporigin/clipxslope;
5136 clip0slope = -clipyslope/clipxslope;
5137 clip0dir = clipxslope > 0 ? 1 : -1;
5139 else if(clipyslope > 0)
5141 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5142 clip0slope = dpsoftrast.fb_width;
5145 else if(clipyslope < 0)
5147 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5148 clip0slope = -dpsoftrast.fb_width;
5151 else if(clip0origin < 0) continue;
5154 mipedgescale = _mm_setzero_ps();
// one interpolation plane per vertex array listed for the current shader mode
5155 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5157 __m128 attrib0, attrib1, attrib2;
5158 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an out-of-range entry marks the end of the list
5159 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next per-vertex float4 block inside command->arrays
5161 arrays += numvertices*4;
5162 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are multiplied by w before fitting the plane (see the perspective note above)
5163 attriborigin = _mm_mul_ps(attrib1, w1);
5164 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5165 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5166 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5167 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5168 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// stored as [0] = x slope, [1] = y slope, [2] = origin
5169 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5170 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5171 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array selected by lodarrayindex drives mip level selection
5172 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
// attribute change along each edge scaled by the reciprocal screen-space edge length
5174 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5175 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5176 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5177 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level defaults to 0 for every texture unit
5181 memset(triangle->mip, 0, sizeof(triangle->mip));
5182 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5184 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// an out-of-range entry marks the end of the list
5185 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5187 texture = thread->texbound[texunit];
// a mip level is only computed for filter modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR
5188 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5190 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5191 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5192 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5193 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5194 // this will be multiplied in the texturing routine by the texture resolution
5195 y = _mm_cvtss_si32(mipdensity);
// log(y)*0.5/ln(2) == log2(sqrt(y)): the squared density becomes a mip index
5198 y = (int)(log((float)y)*0.5f/M_LN2);
5199 if (y > texture->mipmaps - 1)
5200 y = texture->mipmaps - 1;
5201 triangle->mip[texunit] = y;
// row walk: bandy starts as the end of band 1; the loop's update expression
// then switches bandy to the end of band 2 and moves y up to at least miny2
5207 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5210 __m128 xcoords, xslope;
// per-point flag "y > point's integer y"; movemask yields 4 mask bits per point
5211 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5212 int yccmask = _mm_movemask_epi8(ycc);
5213 int edge0p, edge0n, edge1p, edge1n;
// edge selection for a 4-point (clipped) polygon; edgeNp/edgeNn are the two
// end points of each of the two active edges
5222 case 0xFFFF: /*0000*/ y = endy; continue;
5223 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5224 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5225 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5226 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5227 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5228 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5229 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5230 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5231 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5232 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5233 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5234 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5235 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5236 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5237 case 0x0000: /*1111*/ y++; continue;
// edge selection for an unclipped 3-point triangle
5245 case 0xFFFF: /*000*/ y = endy; continue;
5246 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5247 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5248 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5249 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5250 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5251 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5252 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can be drawn with the current edge pair, capped to the band
5255 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5256 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5257 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5258 nexty = _mm_extract_epi16(ycc, 0);
5259 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both active edges, then the x of each edge at row y (plus 0.5)
5260 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5261 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5262 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5263 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5264 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in element 0 so it acts as the left edge
5265 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5267 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5268 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5270 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
// one iteration per row; edge x positions and the clip crossing step incrementally
5271 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5273 int startx, endx, offset;
5274 startx = _mm_cvtss_si32(xcoords);
5275 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row to the horizontal scissor range and drop empty rows
5276 if (startx < minx) startx = minx;
5277 if (endx > maxx) endx = maxx;
5278 if (startx >= endx) continue;
// user clip plane: drop the row or trim it at the crossing position clip0
5286 if(endx <= clip0) continue;
5287 startx = (int)clip0;
5290 else if (endx > clip0)
5292 if(startx >= clip0) continue;
// rows longer than DPSOFTRAST_DRAW_MAXSPANLENGTH are emitted as several spans
5297 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5299 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5300 span->triangle = thread->numtriangles;
5304 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5305 if (span->startx >= span->endx)
// evaluate the w plane at the span origin and convert it to the integer depth format
5307 wslope = triangle->w[0];
5308 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5309 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5310 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: process what has been collected so far
5311 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5312 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process pending spans and restart the triangle list
5317 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5319 DPSOFTRAST_Draw_ProcessSpans(thread);
5320 thread->numtriangles = 0;
// release this thread's reference; the thread that reaches zero frees separately allocated vertex data
5324 if (!ATOMIC_DECREMENT(command->refcount))
5326 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5327 MM_FREE(command->arrays);
// process whatever is still queued before returning
5330 if (thread->numspans > 0 || thread->numtriangles > 0)
5332 DPSOFTRAST_Draw_ProcessSpans(thread);
5333 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its per-vertex data and index list.
//   firstvertex, numvertices, numtriangles - copied into the command
//   element3i / element3s - 32-bit / 16-bit index lists; the one in use is
//                           copied into the command's data area
// Data layout, in order: numvertices float4 screen coordinates, numvertices
// float4 positions, one numvertices float4 block per array listed for the
// current shader mode, then the index list.
// Side effects: resets dpsoftrast.post_array4f and points its entries (and
// dpsoftrast.screencoord4f) into the new data area; updates
// dpsoftrast.firstvertex and dpsoftrast.numvertices.
// NOTE(review): the MM_CALLOC result is used without a visible NULL check.
5338 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5342 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 blocks are always present: screen coordinates and positions
5343 int datasize = 2*numvertices*sizeof(float[4]);
5344 DPSOFTRAST_Command_Draw *command;
5345 unsigned char *data;
// add one float4 block for each array the current shader mode lists
5346 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5348 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// an out-of-range entry marks the end of the list
5349 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5351 datasize += numvertices*sizeof(float[4]);
// room for the copied index list (16-bit or 32-bit elements)
5354 datasize += numtriangles*sizeof(unsigned short[3]);
5356 datasize += numtriangles*sizeof(int[3]);
5357 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: the command holds only the struct and the data is allocated separately
5358 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5360 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5361 data = (unsigned char *)MM_CALLOC(datasize, 1);
// small enough: the data is placed directly behind the command struct
5365 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5366 data = (unsigned char *)command + commandsize;
5368 command->firstvertex = firstvertex;
5369 command->numvertices = numvertices;
5370 command->numtriangles = numtriangles;
5371 command->arrays = (float *)data;
// clear all output array pointers before assigning the ones in use
5372 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5373 dpsoftrast.firstvertex = firstvertex;
5374 dpsoftrast.numvertices = numvertices;
// carve the data area: screen coordinates first, then positions
5375 dpsoftrast.screencoord4f = (float *)data;
5376 data += numvertices*sizeof(float[4]);
5377 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5378 data += numvertices*sizeof(float[4]);
// then one block per shader-mode array, in table order
5379 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5381 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5382 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5384 dpsoftrast.post_array4f[j] = (float *)data;
5385 data += numvertices*sizeof(float[4]);
// the index list is copied to the end of the data area; the unused pointer stays NULL
5387 command->element3i = NULL;
5388 command->element3s = NULL;
5391 command->element3s = (unsigned short *)data;
5392 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5396 command->element3i = (int *)data;
5397 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point for drawing indexed triangles.
//   firstvertex, numvertices - vertex range referenced by the indices
//   numtriangles             - number of triangles (3 indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists
// Allocates a Draw command, runs the current shader mode's vertex stage, and
// clamps the draw's row range to the framebuffer height. A draw with an empty
// row range is discarded again; otherwise the command is published with one
// reference per thread.
5402 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5404 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// the vertex stage runs here, on the calling thread, before the command is published
5405 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5406 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5407 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: free separately allocated data and take the command back out of the queue
5408 if (command->starty >= command->endy)
5410 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5411 MM_FREE(command->arrays);
5412 DPSOFTRAST_UndoCommand(command->commandsize);
5415 command->clipped = dpsoftrast.drawclipped;
// every thread decrements refcount once in DPSOFTRAST_Interpret_Draw
5416 command->refcount = dpsoftrast.numthreads;
5418 if (dpsoftrast.usethreads)
5421 DPSOFTRAST_Draw_SyncCommands();
// wake starving threads whose row bands overlap the command's rows
5422 for (i = 0; i < dpsoftrast.numthreads; i++)
5424 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5425 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5426 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the path taken when threads are not in use - confirm
5431 DPSOFTRAST_Draw_FlushThreads();
// Command 23: SetRenderTargets, carrying the new framebuffer width and height.
// NOTE(review): DEFCOMMAND is defined outside this view; presumably it declares the
// DPSOFTRAST_Command_SetRenderTargets struct and its opcode - confirm.
5435 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Per-thread handler for SetRenderTargets: the command's fields are not read here,
// the thread is only flagged so its framebuffer-dependent state gets revalidated.
5436 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5438 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Select the buffers that subsequent drawing writes to.
//   width, height    - dimensions of the new targets
//   depthpixels      - depth buffer
//   colorpixels0..3  - color buffers, stored in dpsoftrast.fb_colorpixels[0..3]
5440 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5442 DPSOFTRAST_Command_SetRenderTargets *command;
// detect whether anything differs from the currently bound targets
// NOTE(review): the statement controlled by this condition is not visible in this listing
5443 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5444 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5445 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
// record the new targets in the global rasterizer state
5447 dpsoftrast.fb_width = width;
5448 dpsoftrast.fb_height = height;
5449 dpsoftrast.fb_depthpixels = depthpixels;
5450 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5451 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5452 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5453 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// viewport center/scale are recomputed for the current viewport after the size change
5454 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue a SetRenderTargets command; only the dimensions travel with the command
5455 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5456 command->width = width;
5457 command->height = height;
// Execute queued commands on behalf of one thread.
//   thread    - thread state; its commandoffset is where interpretation starts
//   endoffset - command pool offset at which interpretation stops
// Each command is dispatched on its opcode to the matching DPSOFTRAST_Interpret_* handler.
5460 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5462 int commandoffset = thread->commandoffset;
5463 while (commandoffset != endoffset)
// the command pool is addressed by offset; the command header supplies the opcode
5465 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5466 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call the handler,
// advance by the aligned size of that command's struct, and wrap the offset to 0 once it
// reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5468 #define INTERPCOMMAND(name) \
5469 case DPSOFTRAST_OPCODE_##name : \
5470 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5471 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5472 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5473 commandoffset = 0; \
5475 INTERPCOMMAND(Viewport)
5476 INTERPCOMMAND(ClearColor)
5477 INTERPCOMMAND(ClearDepth)
5478 INTERPCOMMAND(ColorMask)
5479 INTERPCOMMAND(DepthTest)
5480 INTERPCOMMAND(ScissorTest)
5481 INTERPCOMMAND(Scissor)
5482 INTERPCOMMAND(BlendFunc)
5483 INTERPCOMMAND(BlendSubtract)
5484 INTERPCOMMAND(DepthMask)
5485 INTERPCOMMAND(DepthFunc)
5486 INTERPCOMMAND(DepthRange)
5487 INTERPCOMMAND(PolygonOffset)
5488 INTERPCOMMAND(CullFace)
5489 INTERPCOMMAND(AlphaTest)
5490 INTERPCOMMAND(AlphaFunc)
5491 INTERPCOMMAND(SetTexture)
5492 INTERPCOMMAND(SetShader)
5493 INTERPCOMMAND(Uniform4f)
5494 INTERPCOMMAND(UniformMatrix4f)
5495 INTERPCOMMAND(Uniform1i)
5496 INTERPCOMMAND(SetRenderTargets)
5497 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so the offset advances by the size stored in the command
5499 case DPSOFTRAST_OPCODE_Draw:
5500 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5501 commandoffset += command->commandsize;
// NOTE(review): the statement controlled by this bound check is not visible in this listing
5502 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// progress is written back to the thread state after a Draw, not only at the end of the loop
5504 thread->commandoffset = commandoffset;
// NOTE(review): the body of the Reset case is not visible in this listing
5507 case DPSOFTRAST_OPCODE_Reset:
// final position reached by this call
5512 thread->commandoffset = commandoffset;
// Entry point of a rasterizer worker thread (passed to Thread_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread this worker owns
// The worker keeps running while thread->index is non-negative.
5515 static int DPSOFTRAST_Draw_Thread(void *data)
5517 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5518 while(thread->index >= 0)
// behind the producer: interpret everything up to the current drawcommand offset
5520 if (thread->commandoffset != dpsoftrast.drawcommand)
5522 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// caught up: re-check under drawmutex before going to sleep
5526 Thread_LockMutex(thread->drawmutex);
5527 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// notify a flusher that flagged itself as waiting on this thread
5529 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving is set for exactly the duration of the wait on drawcond
5530 thread->starving = true;
5531 Thread_CondWait(thread->drawcond, thread->drawmutex);
5532 thread->starving = false;
5534 Thread_UnlockMutex(thread->drawmutex);
// Bring every thread's commandoffset up to dpsoftrast.drawcommand, then mark the command pool as empty.
// NOTE(review): braces/else lines are not visible in this listing; the last loop is presumably
// the non-threaded alternative to the two loops above it - confirm against the full file.
5540 static void DPSOFTRAST_Draw_FlushThreads(void)
5542 DPSOFTRAST_State_Thread *thread;
5544 DPSOFTRAST_Draw_SyncCommands();
5545 if (dpsoftrast.usethreads)
// pass 1: wake every thread that is both behind and asleep on drawcond
5547 for (i = 0; i < dpsoftrast.numthreads; i++)
5549 thread = &dpsoftrast.threads[i];
// unlocked pre-check, repeated under the mutex before signalling
5550 if (thread->commandoffset != dpsoftrast.drawcommand)
5552 Thread_LockMutex(thread->drawmutex);
5553 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5554 Thread_CondSignal(thread->drawcond);
5555 Thread_UnlockMutex(thread->drawmutex);
// pass 2: for each thread still behind, sleep on waitcond with waiting set while asleep
5558 for (i = 0; i < dpsoftrast.numthreads; i++)
5560 thread = &dpsoftrast.threads[i];
5561 if (thread->commandoffset != dpsoftrast.drawcommand)
5563 Thread_LockMutex(thread->drawmutex);
5564 if (thread->commandoffset != dpsoftrast.drawcommand)
5566 thread->waiting = true;
5567 Thread_CondWait(thread->waitcond, thread->drawmutex);
5568 thread->waiting = false;
5570 Thread_UnlockMutex(thread->drawmutex);
// interpret pending commands directly on the calling thread for each thread state
5576 for (i = 0; i < dpsoftrast.numthreads; i++)
5578 thread = &dpsoftrast.threads[i];
5579 if (thread->commandoffset != dpsoftrast.drawcommand)
5580 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// all commands have been consumed
5583 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: delegates to DPSOFTRAST_Draw_FlushThreads.
5586 void DPSOFTRAST_Flush(void)
5588 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point.
// NOTE(review): the body is not visible in this listing; presumably it flushes pending work - confirm.
5591 void DPSOFTRAST_Finish(void)
// Initialize the global rasterizer state and start the worker threads.
//   width, height - framebuffer dimensions; also used for the initial viewport and scissor
//   numthreads    - requested worker count; threading is used only if > 0 and Thread_HasThreads()
//   interlace     - bounded to 0..1 when threading is used, forced to 0 otherwise
//   colorpixels   - color buffer bound as color target 0
//   depthpixels   - depth buffer
// NOTE(review): local declarations and the return statement are not visible in this listing.
5596 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero state; everything below overrides selected fields
5606 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is declared outside the visible lines; presumably a byte-order probe - confirm
5607 dpsoftrast.bigendian = u.b[3];
5608 dpsoftrast.fb_width = width;
5609 dpsoftrast.fb_height = height;
5610 dpsoftrast.fb_depthpixels = depthpixels;
// only color target 0 is bound initially; targets 1..3 are explicitly cleared
// (previously index 1 was cleared three times and 2/3 relied on the memset above)
5611 dpsoftrast.fb_colorpixels[0] = colorpixels;
5612 dpsoftrast.fb_colorpixels[1] = NULL;
5613 dpsoftrast.fb_colorpixels[2] = NULL;
5614 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
5615 dpsoftrast.viewport[0] = 0;
5616 dpsoftrast.viewport[1] = 0;
5617 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5618 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5619 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no texture storage allocated yet
5620 dpsoftrast.texture_firstfree = 1;
5621 dpsoftrast.texture_end = 1;
5622 dpsoftrast.texture_max = 0;
// default color is opaque white
5623 dpsoftrast.color[0] = 1;
5624 dpsoftrast.color[1] = 1;
5625 dpsoftrast.color[2] = 1;
5626 dpsoftrast.color[3] = 1;
// threading needs both a positive request and platform support
5627 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5628 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
// between 1 and 64 thread states with threading, exactly one without
5629 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5630 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread state the same default render state
5631 for (i = 0; i < dpsoftrast.numthreads; i++)
5633 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5635 thread->cullface = GL_BACK;
5636 thread->colormask[0] = 1;
5637 thread->colormask[1] = 1;
5638 thread->colormask[2] = 1;
5639 thread->colormask[3] = 1;
5640 thread->blendfunc[0] = GL_ONE;
5641 thread->blendfunc[1] = GL_ZERO;
5642 thread->depthmask = true;
5643 thread->depthtest = true;
5644 thread->depthfunc = GL_LEQUAL;
5645 thread->scissortest = false;
5646 thread->alphatest = false;
5647 thread->alphafunc = GL_GREATER;
5648 thread->alphavalue = 0.5f;
// per-thread viewport and scissor both default to the full framebuffer
5649 thread->viewport[0] = 0;
5650 thread->viewport[1] = 0;
5651 thread->viewport[2] = dpsoftrast.fb_width;
5652 thread->viewport[3] = dpsoftrast.fb_height;
5653 thread->scissor[0] = 0;
5654 thread->scissor[1] = 0;
5655 thread->scissor[2] = dpsoftrast.fb_width;
5656 thread->scissor[3] = dpsoftrast.fb_height;
5657 thread->depthrange[0] = 0;
5658 thread->depthrange[1] = 1;
5659 thread->polygonoffset[0] = 0;
5660 thread->polygonoffset[1] = 0;
5661 thread->clipplane[0] = 0;
5662 thread->clipplane[1] = 0;
5663 thread->clipplane[2] = 0;
5664 thread->clipplane[3] = 1;
// nothing queued or in flight for this thread yet
5666 thread->numspans = 0;
5667 thread->numtriangles = 0;
5668 thread->commandoffset = 0;
5669 thread->waiting = false;
5670 thread->starving = false;
// mark all state dirty and validate it once up front
5672 thread->validate = -1;
5673 DPSOFTRAST_Validate(thread, -1);
// synchronization objects and the worker itself exist only in threaded mode
5675 if (dpsoftrast.usethreads)
5677 thread->waitcond = Thread_CreateCond();
5678 thread->drawcond = Thread_CreateCond();
5679 thread->drawmutex = Thread_CreateMutex();
5680 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5686 void DPSOFTRAST_Shutdown(void)
5689 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5691 DPSOFTRAST_State_Thread *thread;
5692 for (i = 0; i < dpsoftrast.numthreads; i++)
5694 thread = &dpsoftrast.threads[i];
5695 Thread_LockMutex(thread->drawmutex);
5697 Thread_CondSignal(thread->drawcond);
5698 Thread_UnlockMutex(thread->drawmutex);
5699 Thread_WaitThread(thread->thread, 0);
5700 Thread_DestroyCond(thread->waitcond);
5701 Thread_DestroyCond(thread->drawcond);
5702 Thread_DestroyMutex(thread->drawmutex);
5705 for (i = 0;i < dpsoftrast.texture_end;i++)
5706 if (dpsoftrast.texture[i].bytes)
5707 MM_FREE(dpsoftrast.texture[i].bytes);
5708 if (dpsoftrast.texture)
5709 free(dpsoftrast.texture);
5710 if (dpsoftrast.threads)
5711 MM_FREE(dpsoftrast.threads);
5712 memset(&dpsoftrast, 0, sizeof(dpsoftrast));