3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
13 typedef qboolean bool;
// Byte alignment handed to _mm_malloc by the MM_MALLOC / MM_CALLOC wrappers.
17 #define ATOMIC_SIZE 32
// GCC attribute spelling: ALIGN asks for 16-byte alignment, ATOMIC for 32-byte alignment.
21 #define ALIGN(var) var __attribute__((__aligned__(16)))
22 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// Counter helpers for this variant: an SSE store fence plus the __sync builtins.
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
26 #define ATOMIC_COUNTER volatile int
// Both evaluate to the counter's new value (__sync_add_and_fetch).
27 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
// The result is cast to void, so ATOMIC_ADD can only be used as a statement.
29 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
31 #elif defined(_MSC_VER)
// MSVC spelling of the same two alignment requests.
32 #define ALIGN(var) __declspec(align(16)) var
33 #define ATOMIC(var) __declspec(align(32)) var
35 #define MEMORY_BARRIER (_mm_sfence())
// The Interlocked* helpers work on LONG, hence the different counter type.
37 #define ATOMIC_COUNTER volatile LONG
38 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// NOTE(review): unlike the other two variants this ATOMIC_ADD is not cast to void.
40 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallback spelling: no alignment is requested at all.
49 #define ALIGN(var) var
50 #define ATOMIC(var) var
55 #include <SDL_thread.h>
// Non-atomic variant: no barrier, plain int counter, ordinary arithmetic.
57 #define MEMORY_BARRIER ((void)0)
58 #define ATOMIC_COUNTER int
59 #define ATOMIC_INCREMENT(counter) (++(counter))
60 #define ATOMIC_DECREMENT(counter) (--(counter))
61 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// In this variant the SDL handle types are declared as plain void.
62 typedef void SDL_Thread;
63 typedef void SDL_cond;
64 typedef void SDL_mutex;
68 #include <emmintrin.h>
// Allocation wrappers, first variant: _mm_malloc / _mm_free with ATOMIC_SIZE alignment.
70 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style helper: aligned allocation of nmemb*size bytes, zero-filled when
// the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow.
72 static void *MM_CALLOC(size_t nmemb, size_t size)
74 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
75 if(ptr != NULL) memset(ptr, 0, nmemb*size);
79 #define MM_FREE _mm_free
// Second variant: map straight onto the C library allocator.
81 #define MM_MALLOC(size) malloc(size)
82 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color, then eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// sizes attribs[] and post_array4f[] further down in this file.
86 typedef enum DPSOFTRAST_ARRAY_e
88 DPSOFTRAST_ARRAY_POSITION,
89 DPSOFTRAST_ARRAY_COLOR,
90 DPSOFTRAST_ARRAY_TEXCOORD0,
91 DPSOFTRAST_ARRAY_TEXCOORD1,
92 DPSOFTRAST_ARRAY_TEXCOORD2,
93 DPSOFTRAST_ARRAY_TEXCOORD3,
94 DPSOFTRAST_ARRAY_TEXCOORD4,
95 DPSOFTRAST_ARRAY_TEXCOORD5,
96 DPSOFTRAST_ARRAY_TEXCOORD6,
97 DPSOFTRAST_ARRAY_TEXCOORD7,
98 DPSOFTRAST_ARRAY_TOTAL
// One slot of the dpsoftrast.texture[] array.
102 typedef struct DPSOFTRAST_Texture_s
// Filter mode stored by DPSOFTRAST_Texture_Filter.
109 DPSOFTRAST_TEXTURE_FILTER filter;
// Bind counter, declared with the ATOMIC_COUNTER type.
112 ATOMIC_COUNTER binds;
// Pixel storage for all mip levels; a NULL pointer marks the slot as unused.
113 unsigned char *bytes;
// Per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth.
114 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are aligned with COMMAND_ALIGN and their sizes are rounded up to a
// multiple of COMMAND_SIZE by DPSOFTRAST_ALIGNCOMMAND.
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every command: the opcode followed by the command's size,
// both written by DPSOFTRAST_AllocateCommand.
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
123 unsigned char opcode;
124 unsigned short commandsize;
// Opcode 0: written by DPSOFTRAST_AllocateCommand when a command does not fit
// before the end of the pool.
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares, for one command kind, the
// constant DPSOFTRAST_OPCODE_<name> and the aligned struct type
// DPSOFTRAST_Command_<name>, which starts with the same opcode/commandsize
// header as DPSOFTRAST_Command.
// NOTE(review): the continuation line that pastes `fields` into the struct is
// not among the visible lines.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
134 unsigned char opcode; \
135 unsigned short commandsize; \
137 } DPSOFTRAST_Command_##name );
// Capacity of the command pool in bytes (it sizes commands[] below).
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): the use of MAXCOMMANDSIZE is outside the visible lines.
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer that queued commands are written into; DPSOFTRAST_AllocateCommand
// wraps to the start when a command does not fit before the end.
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
146 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
148 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data read by the DPSOFTRAST_CALCATTRIB macros.
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
152 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// Per attribute array, four floats each: [0] change per x, [1] change per y,
// [2] base value (this is how DPSOFTRAST_CALCATTRIB combines them).
154 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
156 DPSOFTRAST_State_Triangle);
// SSE form of the attribute setup for one span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are __m128 lvalues. _mm_load_ps is the aligned load, which is
// why attribs is declared with ALIGN.
158 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
159 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
160 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
161 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
162 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar form of DPSOFTRAST_CALCATTRIB: slope and data are 4-element float
// arrays, filled component by component with the same formula
//   data[k] = attribs[arrayindex][2][k] + span->x * slope[k] + span->y * attribs[arrayindex][1][k]
// NOTE(review): `span` is expanded as (span->x) rather than ((span)->x), so the
// argument has to be a plain pointer name.
164 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
165 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
166 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
167 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
168 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
169 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
170 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
171 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
172 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// NOTE(review): the use of MAXSUBSPAN is outside the visible lines.
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle; x and y are the
// coordinates the DPSOFTRAST_CALCATTRIB macros multiply the slopes by.
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
179 int triangle; // triangle this span was generated by
180 int x; // framebuffer x coord
181 int y; // framebuffer y coord
182 int length; // pixel count
183 int startx; // usable range (according to pixelmask)
184 int endx; // usable range (according to pixelmask)
185 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
189 #define DPSOFTRAST_DRAW_MAXSPANS 1024
190 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate. Each bit names a group of derived values
// that DPSOFTRAST_Validate recomputes: FB -> DPSOFTRAST_RecalcFB,
// DEPTHFUNC -> DPSOFTRAST_RecalcDepthFunc, BLENDFUNC -> DPSOFTRAST_RecalcBlendFunc.
192 #define DPSOFTRAST_VALIDATE_FB 1
193 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
194 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three groups together.
195 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the
// (sfactor, dfactor) pair and the blendsubtract flag; the result is stored in
// thread->fb_blendmode. OPAQUE doubles as the fallback for unlisted pairs.
197 typedef enum DPSOFTRAST_BLENDMODE_e
199 DPSOFTRAST_BLENDMODE_OPAQUE,
200 DPSOFTRAST_BLENDMODE_ALPHA,
201 DPSOFTRAST_BLENDMODE_ADDALPHA,
202 DPSOFTRAST_BLENDMODE_ADD,
203 DPSOFTRAST_BLENDMODE_INVMOD,
204 DPSOFTRAST_BLENDMODE_MUL,
205 DPSOFTRAST_BLENDMODE_MUL2,
206 DPSOFTRAST_BLENDMODE_SUBALPHA,
207 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
208 DPSOFTRAST_BLENDMODE_TOTAL
210 DPSOFTRAST_BLENDMODE;
// State of one entry of dpsoftrast.threads[]. The DPSOFTRAST_Interpret_*
// handlers update the raw state; DPSOFTRAST_Validate refreshes the derived
// fb_* values.
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] = alongnormal, [1] = intoview (see DPSOFTRAST_Interpret_PolygonOffset).
231 float polygonoffset[2];
234 int shader_permutation;
// Pointers into dpsoftrast.texture[]; DPSOFTRAST_Texture_Grow rebases them
// when that array is reallocated.
236 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// Four floats per uniform slot.
238 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
241 // DPSOFTRAST_VALIDATE_ flags
244 // derived values (DPSOFTRAST_VALIDATE_FB)
// x, y, width, height of the clear rectangle computed by DPSOFTRAST_RecalcFB.
246 int fb_clearscissor[4];
// Both filled in by DPSOFTRAST_RecalcViewport.
247 ALIGN(float fb_viewportcenter[4]);
248 ALIGN(float fb_viewportscale[4]);
250 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// Pool position this thread has reached; DPSOFTRAST_Draw_FreeCommandPool
// compares it against the write position and dpsoftrast.drawcommand.
256 ATOMIC(volatile int commandoffset);
// Set by DPSOFTRAST_Draw_FreeCommandPool for the duration of its wait on this thread.
258 volatile bool waiting;
// When set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before it waits.
259 volatile bool starving;
// Held around the waiting/starving handshake.
262 SDL_mutex *drawmutex;
266 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
267 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
269 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is `dpsoftrast`, defined right
// after the type.
271 typedef ATOMIC(struct DPSOFTRAST_State_s
// Render targets installed by DPSOFTRAST_SetRenderTargets.
275 unsigned int *fb_depthpixels;
276 unsigned int *fb_colorpixels[4];
// Refreshed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport.
279 ALIGN(float fb_viewportcenter[4]);
280 ALIGN(float fb_viewportscale[4]);
// Four floats per uniform slot.
283 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
284 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// Vertex array pointers; color has a float and an unsigned-byte variant.
286 const float *pointer_vertex3f;
287 const float *pointer_color4f;
288 const unsigned char *pointer_color4ub;
289 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
292 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// Pointers into texture[]; rebased by DPSOFTRAST_Texture_Grow.
294 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
298 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
299 float *screencoord4f;
305 int shader_permutation;
// Slot search in DPSOFTRAST_Texture_New starts at texture_firstfree.
309 int texture_firstfree;
// Texture slot array, grown with realloc by DPSOFTRAST_Texture_Grow.
310 DPSOFTRAST_Texture *texture;
// Message set by API functions when they reject a request.
315 const char *errorstring;
318 DPSOFTRAST_State_Thread *threads;
// Copy of commandpool.freecommand published by DPSOFTRAST_Draw_SyncCommands.
320 ATOMIC(volatile int drawcommand);
322 DPSOFTRAST_State_Command_Pool commandpool;
326 DPSOFTRAST_State dpsoftrast;
// Depth values are stored as integers scaled by DEPTHSCALE (1024 * 2^20 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit BGRA value: alpha in bits 24-31,
// red in 16-23, green in 8-15, blue in 0-7. Each channel is scaled by 255 and
// rounded to nearest. Inputs are not clamped.
// Every argument is parenthesized so that expression arguments such as
// (base + delta) are scaled as a whole instead of only their last term.
// NOTE(review): an alpha byte of 128 or more shifted left by 24 exceeds INT_MAX
// in a signed int; the code relies on the usual wrap-to-negative result.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer encoding DEPTHSCALE * (1 - d).
// The argument is parenthesized so that `1 - (x + y)` is not parsed as `1 - x + y`.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
337 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
338 fb_viewportcenter[3] = 0.5f;
339 fb_viewportcenter[0] = 0.0f;
340 fb_viewportscale[1] = 0.5f * viewport[2];
341 fb_viewportscale[2] = -0.5f * viewport[3];
342 fb_viewportscale[3] = 0.5f;
343 fb_viewportscale[0] = 1.0f;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB for one thread:
// the clear rectangle (fb_clearscissor) and the viewport transform.
346 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
349 // and viewport projection values
// Scissor is {x, y, width, height}; the y range is flipped against the
// framebuffer height.
352 x1 = thread->scissor[0];
353 x2 = thread->scissor[0] + thread->scissor[2];
354 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
355 y2 = dpsoftrast.fb_height - thread->scissor[1];
// With the scissor test off the rectangle is the whole framebuffer.
356 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// Clamp the far edges to the framebuffer.
// NOTE(review): clamps of x1/y1 are not among the visible lines — confirm they exist.
358 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// Stored as x, y, width, height.
361 thread->fb_clearscissor[0] = x1;
362 thread->fb_clearscissor[1] = y1;
363 thread->fb_clearscissor[2] = x2 - x1;
364 thread->fb_clearscissor[3] = y2 - y1;
366 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
369 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's (sfactor, dfactor) pair to a DPSOFTRAST_BLENDMODE and
// stores it in thread->fb_blendmode. Both switches key on
// (sfactor << 16) | dfactor; any pair that is not listed becomes OPAQUE.
374 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// Table used while blendsubtract is set.
376 if (thread->blendsubtract)
378 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380 #define BLENDFUNC(sfactor, dfactor, blendmode) \
381 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
// NOTE(review): SUBALPHA is keyed on GL_SRC_COLOR here while ADDALPHA below is
// keyed on GL_SRC_ALPHA — confirm the factor is intended.
382 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
383 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Second table over the same key.
388 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
391 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
393 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
394 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// Two factor pairs share the MUL mode.
395 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
396 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
397 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
398 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
// NOTE(review): this table also selects the subtractive mode — confirm that is intended.
399 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
400 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Inline pre-check for DPSOFTRAST_Validate: calls it only when at least one of
// the requested flags f is pending in thread->validate. Always evaluates to 0.
// `thread` is parenthesized so that pointer expressions (for example
// `threads + i`) expand correctly; note that it is evaluated twice.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state groups selected by `mask` that are currently
// marked dirty in thread->validate.
407 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// Only flags that are both requested and pending are handled.
409 mask &= thread->validate;
// Each group clears its dirty bit and then runs its recalculation.
412 if (mask & DPSOFTRAST_VALIDATE_FB)
414 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
415 DPSOFTRAST_RecalcFB(thread);
417 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
420 DPSOFTRAST_RecalcDepthFunc(thread);
422 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
425 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. The slot is handed back only when index
// lies in [1, texture_end) and the slot owns pixel storage; callers in this
// file treat a false result as "no such texture".
429 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
432 return &dpsoftrast.texture[index];
// Enlarges dpsoftrast.texture[] and rebases every bound-texture pointer (global
// and per thread) onto the reallocated array.
436 static void DPSOFTRAST_Texture_Grow(void)
// Remember the old base so bound pointers can be converted back to indices.
438 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
439 DPSOFTRAST_State_Thread *thread;
443 // expand texture array as needed
444 if (dpsoftrast.texture_max < 1024)
445 dpsoftrast.texture_max = 1024;
447 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check; on failure
// the old array pointer is lost and the rebasing below uses a NULL base.
448 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// new pointer = new base + (old pointer - old base)
449 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
450 if(dpsoftrast.texbound[i])
451 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// Same rebasing for each thread's bound textures.
452 for (j = 0; j < dpsoftrast.numthreads; j++)
454 thread = &dpsoftrast.threads[j];
455 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
456 if(thread->texbound[i])
457 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Validates a texture request, claims a slot in dpsoftrast.texture[], lays out
// its mip chain and allocates zero-filled pixel storage.
// Each rejected request records a message in dpsoftrast.errorstring.
// NOTE(review): the return statements are outside the visible lines — confirm
// which value callers receive on success and on failure.
461 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// A cubemap has six sides, everything else one.
470 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
471 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
472 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is formed before the size cap below is applied, so
// very large arguments can overflow int here.
473 if (width*height*depth < 1)
475 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
478 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// Format-specific restrictions.
485 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
486 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
487 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
490 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
497 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
500 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// Flag combinations that are rejected for 3D textures (depth != 1).
507 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
512 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): repeats the previous check verbatim; redundant.
517 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
522 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when more than one bit of x is set.
527 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
532 // find first empty slot in texture array
533 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
534 if (!dpsoftrast.texture[texnum].bytes)
// The next search starts after the slot just taken.
536 dpsoftrast.texture_firstfree = texnum + 1;
537 if (dpsoftrast.texture_max <= texnum)
538 DPSOFTRAST_Texture_Grow();
539 if (dpsoftrast.texture_end <= texnum)
540 dpsoftrast.texture_end = texnum + 1;
// Take the address only after a possible Grow, which may move the array.
541 texture = &dpsoftrast.texture[texnum];
542 memset(texture, 0, sizeof(*texture));
543 texture->flags = flags;
544 texture->width = width;
545 texture->height = height;
546 texture->depth = depth;
547 texture->sides = sides;
// Mip chain layout: each level takes w*h*d*sides texels of 4 bytes and records
// {offset, byte size, w, h, d}.
559 s = w * h * d * sides * 4;
560 texture->mipmap[mipmaps][0] = size;
561 texture->mipmap[mipmaps][1] = s;
562 texture->mipmap[mipmaps][2] = w;
563 texture->mipmap[mipmaps][3] = h;
564 texture->mipmap[mipmaps][4] = d;
// The chain ends at a single texel, or at once when mipmaps were not requested.
567 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
573 texture->mipmaps = mipmaps;
574 texture->size = size;
576 // allocate the pixels now
577 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the slot
// bookkeeping. Does nothing when index does not name a live texture.
581 void DPSOFTRAST_Texture_Free(int index)
583 DPSOFTRAST_Texture *texture;
584 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
588 MM_FREE(texture->bytes);
589 texture->bytes = NULL;
// Zeroing the slot also leaves bytes NULL, which marks the slot as free.
590 memset(texture, 0, sizeof(*texture));
591 // adjust the free range and used range
592 if (dpsoftrast.texture_firstfree > index)
593 dpsoftrast.texture_firstfree = index;
// Trim unused slots off the end of the used range.
594 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
595 dpsoftrast.texture_end--;
// Rebuilds mip levels 1 .. mipmaps-1 of a texture by averaging texels of the
// next larger level. Texels are 4 bytes each, averaged per byte with rounding.
597 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 int i, x, y, z, w, layer0, layer1, row0, row1;
600 unsigned char *o, *i0, *i1, *i2, *i3;
601 DPSOFTRAST_Texture *texture;
602 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// Only the base level exists.
603 if (texture->mipmaps <= 1)
605 for (i = 1;i < texture->mipmaps;i++)
607 for (z = 0;z < texture->mipmap[i][4];z++)
// Clamp the second source layer to the parent's last layer.
611 if (layer1 >= texture->mipmap[i-1][4])
612 layer1 = texture->mipmap[i-1][4]-1;
613 for (y = 0;y < texture->mipmap[i][3];y++)
// Same clamp for the second source row.
617 if (row1 >= texture->mipmap[i-1][3])
618 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i. i0..i3: source rows in level i-1 for
// (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1).
619 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
620 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
621 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
622 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
623 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
624 w = texture->mipmap[i][2];
627 if (texture->mipmap[i-1][2] > 1)
629 // average 3D texture
// Eight source texels per output texel: two neighbours from each of the four rows.
630 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
633 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
634 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
635 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
640 // average 3D mipmap with parent width == 1
// One texel from each of the four rows.
641 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
644 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
645 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
646 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
652 if (texture->mipmap[i-1][2] > 1)
654 // average 2D texture (common case)
// A 2x2 block taken from rows i0 and i1.
655 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
658 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
659 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
660 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
665 // 2D texture with parent width == 1
666 o[0] = (i0[0] + i1[0] + 1) >> 1;
667 o[1] = (i0[1] + i1[1] + 1) >> 1;
668 o[2] = (i0[2] + i1[2] + 1) >> 1;
669 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a block of 4-byte texels into a texture row by row, then regenerates
// its mipmaps. `pixels` is read as tightly packed rows of blockwidth texels.
// NOTE(review): within the visible lines `mip` is not used — the destination is
// always computed from level 0 — and the block is not checked against the
// texture bounds.
676 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 DPSOFTRAST_Texture *texture;
680 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// First destination texel: row blocky, column blockx of level 0.
683 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
684 while (blockheight > 0)
686 memcpy(dst, pixels, blockwidth * 4);
687 pixels += blockwidth * 4;
// Advance by one full texture row.
688 dst += texture->mipmap[0][2] * 4;
691 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of a texture and regenerates its mipmaps.
// `pixels` must provide mipmap[0][1] bytes, the byte size of level 0.
693 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 DPSOFTRAST_Texture *texture;
696 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
699 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
700 DPSOFTRAST_Texture_CalculateMipmaps(index);
702 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 DPSOFTRAST_Texture *texture;
705 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
706 return texture->mipmap[mip][2];
708 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 DPSOFTRAST_Texture *texture;
711 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
712 return texture->mipmap[mip][3];
714 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 DPSOFTRAST_Texture *texture;
717 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
718 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 when index does not name a live texture.
// NOTE(review): no range check on `mip` is visible here.
720 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 DPSOFTRAST_Texture *texture;
723 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset into bytes.
726 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filters ordered after
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported through dpsoftrast.errorstring
// when the texture was created without the MIPMAP flag.
728 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 DPSOFTRAST_Texture *texture;
731 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
739 texture->filter = filter;
// Installs the framebuffer size, the depth buffer and up to four color buffers
// in the global state.
// NOTE(review): the statement guarded by the change test below is not among
// the visible lines.
742 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// True when any argument differs from what is currently installed.
744 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
745 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
746 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748 dpsoftrast.fb_width = width;
749 dpsoftrast.fb_height = height;
750 dpsoftrast.fb_depthpixels = depthpixels;
751 dpsoftrast.fb_colorpixels[0] = colorpixels0;
752 dpsoftrast.fb_colorpixels[1] = colorpixels1;
753 dpsoftrast.fb_colorpixels[2] = colorpixels2;
754 dpsoftrast.fb_colorpixels[3] = colorpixels3;
757 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current write position of the command pool in
// dpsoftrast.drawcommand, the value DPSOFTRAST_Draw_FreeCommandPool compares
// each thread's commandoffset against.
759 static void DPSOFTRAST_Draw_SyncCommands(void)
762 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool by recomputing how
// much of it the threads still have to process and, when that is too much,
// waiting on a thread.
// NOTE(review): loop structure, early exits and the choice of waitindex are
// only partly visible here.
765 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
768 DPSOFTRAST_State_Thread *thread;
770 int freecommand = dpsoftrast.commandpool.freecommand;
771 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Compare the recorded usage against the room that was asked for.
772 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
774 DPSOFTRAST_Draw_SyncCommands();
780 for (i = 0; i < dpsoftrast.numthreads; i++)
782 thread = &dpsoftrast.threads[i];
// Bytes between this thread's position and the write position, wrapped into
// the pool size.
783 commandoffset = freecommand - thread->commandoffset;
784 if (commandoffset < 0)
785 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// Keep the largest backlog seen so far.
786 if (commandoffset > usedcommands)
789 usedcommands = commandoffset;
792 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// Wait on the chosen thread under its drawmutex.
794 thread = &dpsoftrast.threads[waitindex];
795 SDL_LockMutex(thread->drawmutex);
// Only wait when the thread has not yet reached the published position.
796 if (thread->commandoffset != dpsoftrast.drawcommand)
798 thread->waiting = true;
// Signal drawcond first when the thread reports starving.
799 if (thread->starving) SDL_CondSignal(thread->drawcond);
800 SDL_CondWait(thread->waitcond, thread->drawmutex);
801 thread->waiting = false;
803 SDL_UnlockMutex(thread->drawmutex);
805 dpsoftrast.commandpool.usedcommands = usedcommands;
807 DPSOFTRAST_Draw_FlushThreads();
// Rounds size up to the next multiple of COMMAND_SIZE, which has to be a power
// of two. (0 - size) & (COMMAND_SIZE-1) is the padding that is still missing.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((0 - (size)) & (COMMAND_SIZE-1)))
// Reserves pool space for one command of kind `name`, already stamped with its
// opcode and padded size, and yields it as a DPSOFTRAST_Command_<name> pointer.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and stamps the reserved header with
// `opcode` and `size`.
// NOTE(review): the updates of freecommand and the return statement are
// outside the visible lines.
816 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
818 DPSOFTRAST_Command *command;
819 int freecommand = dpsoftrast.commandpool.freecommand;
820 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Besides the command itself, demand room for one more header, plus the tail
// of the pool when the command does not fit before the end.
821 int extra = sizeof(DPSOFTRAST_Command);
822 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
823 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// Not enough room: have the pool freed up, then reload the counters it may
// have changed.
824 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
826 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
827 freecommand = dpsoftrast.commandpool.freecommand;
828 usedcommands = dpsoftrast.commandpool.usedcommands;
// The command does not fit before the end of the pool: mark the current
// position with a Reset opcode and count the skipped tail as used.
830 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
832 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833 command->opcode = DPSOFTRAST_OPCODE_Reset;
834 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// Stamp the header of the new command.
837 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838 command->opcode = opcode;
839 command->commandsize = size;
841 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
843 dpsoftrast.commandpool.freecommand = freecommand;
844 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives `size` bytes back to the command pool by adjusting the pool counters.
// NOTE(review): the lines that move freecommand back, and the condition that
// guards the wrap-around below, are not among the visible lines.
848 static void DPSOFTRAST_UndoCommand(int size)
850 int freecommand = dpsoftrast.commandpool.freecommand;
851 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Wrap the position back into the pool.
854 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855 usedcommands -= size;
856 dpsoftrast.commandpool.freecommand = freecommand;
857 dpsoftrast.commandpool.usedcommands = usedcommands;
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 thread->viewport[0] = command->x;
864 thread->viewport[1] = command->y;
865 thread->viewport[2] = command->width;
866 thread->viewport[3] = command->height;
867 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API entry point: queues a Viewport command and mirrors the rectangle into the
// global state, including its viewport transform.
// NOTE(review): only the width/height stores into the command are visible here
// — confirm x and y are stored as well.
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874 command->width = width;
875 command->height = height;
877 dpsoftrast.viewport[0] = x;
878 dpsoftrast.viewport[1] = y;
879 dpsoftrast.viewport[2] = width;
880 dpsoftrast.viewport[3] = height;
881 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Handler for the ClearColor command: fills the clear rectangle of every
// attached color buffer with the packed BGRA color.
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
// Make sure fb_clearscissor is current before reading it.
890 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
891 x1 = thread->fb_clearscissor[0];
892 y1 = thread->fb_clearscissor[1];
893 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
894 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2 is this thread's share of the framebuffer rows, split evenly by
// thread index.
// NOTE(review): how t1/t2 limit y1/y2 is not among the visible lines.
895 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
896 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
903 // FIXME: honor fb_colormask?
904 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// Up to four color buffers; entries that are not attached are tested for here.
905 for (i = 0;i < 4;i++)
907 if (!dpsoftrast.fb_colorpixels[i])
909 for (y = y1;y < y2;y++)
// p: start of row y in color buffer i.
911 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
912 for (x = x1;x < x2;x++)
// API entry point: queues a ClearColor command.
// NOTE(review): the stores of r, g, b, a into the command are not among the
// visible lines.
917 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
919 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
926 DEFCOMMAND(3, ClearDepth, float depth;)
// Handler for the ClearDepth command: fills the clear rectangle of the depth
// buffer with the integer encoding of command->depth.
927 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
929 int x1, y1, x2, y2, w, h, x, y, t1, t2;
// Make sure fb_clearscissor is current before reading it.
932 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
933 x1 = thread->fb_clearscissor[0];
934 y1 = thread->fb_clearscissor[1];
935 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
936 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2 is this thread's share of the framebuffer rows, split evenly by
// thread index.
// NOTE(review): how t1/t2 limit y1/y2 is not among the visible lines.
937 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
938 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
945 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
946 for (y = y1;y < y2;y++)
// p: start of row y in the depth buffer.
948 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
949 for (x = x1;x < x2;x++)
// API entry point: queues a ClearDepth command.
// NOTE(review): the store of d into the command is not among the visible lines.
953 void DPSOFTRAST_ClearDepth(float d)
955 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
959 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
960 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
962 thread->colormask[0] = command->r != 0;
963 thread->colormask[1] = command->g != 0;
964 thread->colormask[2] = command->b != 0;
965 thread->colormask[3] = command->a != 0;
966 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API entry point: queues a ColorMask command.
// NOTE(review): the stores of r, g, b, a into the command are not among the
// visible lines.
968 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
970 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
977 DEFCOMMAND(5, DepthTest, int enable;)
978 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
980 thread->depthtest = command->enable;
981 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
983 void DPSOFTRAST_DepthTest(int enable)
985 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
986 command->enable = enable;
989 DEFCOMMAND(6, ScissorTest, int enable;)
990 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
992 thread->scissortest = command->enable;
993 thread->validate |= DPSOFTRAST_VALIDATE_FB;
995 void DPSOFTRAST_ScissorTest(int enable)
997 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
998 command->enable = enable;
1001 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1002 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1004 thread->scissor[0] = command->x;
1005 thread->scissor[1] = command->y;
1006 thread->scissor[2] = command->width;
1007 thread->scissor[3] = command->height;
1008 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API entry point: queues a Scissor command.
// NOTE(review): only the width/height stores are visible here — confirm x and
// y are stored as well.
1010 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1012 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1015 command->width = width;
1016 command->height = height;
1019 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1020 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1022 thread->blendfunc[0] = command->sfactor;
1023 thread->blendfunc[1] = command->dfactor;
1024 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1026 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1028 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1029 command->sfactor = sfactor;
1030 command->dfactor = dfactor;
1033 DEFCOMMAND(9, BlendSubtract, int enable;)
1034 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1036 thread->blendsubtract = command->enable;
1037 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1039 void DPSOFTRAST_BlendSubtract(int enable)
1041 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1042 command->enable = enable;
1045 DEFCOMMAND(10, DepthMask, int enable;)
1046 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1048 thread->depthmask = command->enable;
1050 void DPSOFTRAST_DepthMask(int enable)
1052 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1053 command->enable = enable;
1056 DEFCOMMAND(11, DepthFunc, int func;)
1057 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1059 thread->depthfunc = command->func;
1061 void DPSOFTRAST_DepthFunc(int func)
1063 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1064 command->func = func;
1067 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1068 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1070 thread->depthrange[0] = command->nearval;
1071 thread->depthrange[1] = command->farval;
1073 void DPSOFTRAST_DepthRange(float nearval, float farval)
1075 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1076 command->nearval = nearval;
1077 command->farval = farval;
1080 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1081 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1083 thread->polygonoffset[0] = command->alongnormal;
1084 thread->polygonoffset[1] = command->intoview;
1086 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1088 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1089 command->alongnormal = alongnormal;
1090 command->intoview = intoview;
1093 DEFCOMMAND(14, CullFace, int mode;)
1094 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1096 thread->cullface = command->mode;
1098 void DPSOFTRAST_CullFace(int mode)
1100 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1101 command->mode = mode;
1104 DEFCOMMAND(15, AlphaTest, int enable;)
1105 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1107 thread->alphatest = command->enable;
1109 void DPSOFTRAST_AlphaTest(int enable)
1111 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1112 command->enable = enable;
1115 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1116 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1118 thread->alphafunc = command->func;
1119 thread->alphavalue = command->ref;
1121 void DPSOFTRAST_AlphaFunc(int func, float ref)
1123 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1124 command->func = func;
1128 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1130 dpsoftrast.color[0] = r;
1131 dpsoftrast.color[1] = g;
1132 dpsoftrast.color[2] = b;
1133 dpsoftrast.color[3] = a;
// Copies a rectangular block of the color framebuffer (fb_colorpixels[0])
// into outpixels.
//   blockx, blocky          - origin of the requested block
//   blockwidth, blockheight - size of the requested block in pixels
//   outpixels               - destination, rows are blockwidth*4 bytes apart
// The block is clipped to the framebuffer bounds before copying.
// NOTE(review): several declarations and the innermost copy statements are
// not visible in this excerpt; comments describe only the visible lines.
1136 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides: 4 bytes per pixel for both the output block and the framebuffer
1138 int outstride = blockwidth * 4;
1139 int instride = dpsoftrast.fb_width * 4;
1142 int bx2 = blockx + blockwidth;
1143 int by2 = blocky + blockheight;
1148 unsigned char *inpixels;
// clip the requested rectangle to the framebuffer
1152 if (bx1 < 0) bx1 = 0;
1153 if (by1 < 0) by1 = 0;
1154 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1155 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1158 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk the pixels one by one (per-pixel body not visible here)
1159 if (dpsoftrast.bigendian)
1161 for (y = by1;y < by2;y++)
// source rows are addressed from the bottom of the framebuffer
// (fb_height - 1 - y), so the copy is vertically flipped
1163 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// output rows are indexed relative to the clipped top edge; the output
// column is not shifted when bx1 was clipped
1164 o = (unsigned char *)outpixels + (y - by1) * outstride;
1165 for (x = bx1;x < bx2;x++)
// second row loop with the same addressing (presumably the little-endian
// path; the enclosing else is not visible here)
1178 for (y = by1;y < by2;y++)
1180 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1181 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle of framebuffer pixels starting at
// (sx, sy) into mip level `mip` of texture `index`, starting at (tx, ty).
// Does nothing if the texture lookup fails or mip is out of range.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1 and the computation of
// tw/th/sw/sh are on lines not visible in this excerpt.
1187 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1191 int tx2 = tx + width;
1192 int ty2 = ty + height;
1195 int sx2 = sx + width;
1196 int sy2 = sy + height;
1206 unsigned int *spixels;
1207 unsigned int *tpixels;
1208 DPSOFTRAST_Texture *texture;
1209 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1210 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1213 spixels = dpsoftrast.fb_colorpixels[0];
1214 swidth = dpsoftrast.fb_width;
1215 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] and [3] as the level's width and height
1216 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1217 twidth = texture->mipmap[mip][2];
1218 theight = texture->mipmap[mip][3];
// clip the destination rectangle to the texture level
1219 if (tx1 < 0) tx1 = 0;
1220 if (ty1 < 0) ty1 = 0;
1221 if (tx2 > twidth) tx2 = twidth;
1222 if (ty2 > theight) ty2 = theight;
// clip the source rectangle to the framebuffer
1223 if (sx1 < 0) sx1 = 0;
1224 if (sy1 < 0) sy1 = 0;
1225 if (sx2 > swidth) sx2 = swidth;
1226 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source provides
1231 if (tw > sw) tw = sw;
1232 if (th > sh) th = sh;
// degenerate rectangle (the statement taken on this condition is not visible
// here; presumably an early return)
1233 if (tw < 1 || th < 1)
// row-by-row copy, 4 bytes per pixel
1235 for (y = 0;y < th;y++)
1236 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// rebuild the mip chain when the texture has more than one level
1237 if (texture->mipmaps > 1)
1238 DPSOFTRAST_Texture_CalculateMipmaps(index);
1241 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1242 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1244 if (thread->texbound[command->unitnum])
1245 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1246 thread->texbound[command->unitnum] = command->texture;
1248 void DPSOFTRAST_SetTexture(int unitnum, int index)
1250 DPSOFTRAST_Command_SetTexture *command;
1251 DPSOFTRAST_Texture *texture;
1252 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1254 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1257 texture = DPSOFTRAST_Texture_GetByIndex(index);
1258 if (index && !texture)
1260 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1264 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1265 command->unitnum = unitnum;
1266 command->texture = texture;
1268 dpsoftrast.texbound[unitnum] = texture;
1269 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1272 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1274 dpsoftrast.pointer_vertex3f = vertex3f;
1275 dpsoftrast.stride_vertex = stride;
1277 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1279 dpsoftrast.pointer_color4f = color4f;
1280 dpsoftrast.pointer_color4ub = NULL;
1281 dpsoftrast.stride_color = stride;
1283 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1285 dpsoftrast.pointer_color4f = NULL;
1286 dpsoftrast.pointer_color4ub = color4ub;
1287 dpsoftrast.stride_color = stride;
1289 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1291 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1292 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1293 dpsoftrast.stride_texcoord[unitnum] = stride;
1296 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1297 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1299 thread->shader_mode = command->mode;
1300 thread->shader_permutation = command->permutation;
1302 void DPSOFTRAST_SetShader(int mode, int permutation)
1304 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1305 command->mode = mode;
1306 command->permutation = permutation;
1308 dpsoftrast.shader_mode = mode;
1309 dpsoftrast.shader_permutation = permutation;
1312 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1313 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1315 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1317 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1319 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1320 command->index = index;
1321 command->val[0] = v0;
1322 command->val[1] = v1;
1323 command->val[2] = v2;
1324 command->val[3] = v3;
1326 dpsoftrast.uniform4f[index*4+0] = v0;
1327 dpsoftrast.uniform4f[index*4+1] = v1;
1328 dpsoftrast.uniform4f[index*4+2] = v2;
1329 dpsoftrast.uniform4f[index*4+3] = v3;
1331 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1333 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1334 command->index = index;
1335 memcpy(command->val, v, sizeof(command->val));
1337 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1340 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1341 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1343 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices read from v.
// Each matrix gets its own UniformMatrix4f command and is also mirrored into
// dpsoftrast.uniform4f.
//   uniform   - first uniform index; advanced by 4 per matrix
//   arraysize - number of matrices to read from v
//   transpose - NOTE(review): the statement that tests this flag is not
//               visible here; presumably it guards the transpose block below
//   v         - source floats, 16 per matrix; any alignment is accepted
1345 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1349 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1351 __m128 m0, m1, m2, m3;
1352 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1353 command->index = index;
// pick unaligned or aligned loads depending on the source address
1354 if (((size_t)v)&(ALIGN_SIZE-1))
1356 m0 = _mm_loadu_ps(v);
1357 m1 = _mm_loadu_ps(v+4);
1358 m2 = _mm_loadu_ps(v+8);
1359 m3 = _mm_loadu_ps(v+12);
1363 m0 = _mm_load_ps(v);
1364 m1 = _mm_load_ps(v+4);
1365 m2 = _mm_load_ps(v+8);
1366 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack and movelh/movehl
1370 __m128 t0, t1, t2, t3;
1371 t0 = _mm_unpacklo_ps(m0, m1);
1372 t1 = _mm_unpacklo_ps(m2, m3);
1373 t2 = _mm_unpackhi_ps(m0, m1);
1374 t3 = _mm_unpackhi_ps(m2, m3);
1375 m0 = _mm_movelh_ps(t0, t1);
1376 m1 = _mm_movehl_ps(t1, t0);
1377 m2 = _mm_movelh_ps(t2, t3);
1378 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1380 _mm_store_ps(command->val, m0);
1381 _mm_store_ps(command->val+4, m1);
1382 _mm_store_ps(command->val+8, m2);
1383 _mm_store_ps(command->val+12, m3);
// aligned stores into the global uniform table: a matrix occupies four
// consecutive vec4 slots starting at index*4
1384 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1385 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1386 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1387 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1392 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1393 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1395 thread->uniform1i[command->index] = command->val;
1397 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1399 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1400 command->index = index;
1403 dpsoftrast.uniform1i[command->index] = i0;
// Loads 4-float vectors from a byte-addressed source into dst.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of vectors (end is dst + size*4 floats)
//   stride - byte distance between source elements
// NOTE(review): the loop statements and pointer increments are not visible
// in this excerpt.
1407 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1409 float *end = dst + size*4;
// unaligned loads are needed if either the start address or the stride is
// not a multiple of ALIGN_SIZE
1410 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1414 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1423 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float vectors into 4-float vectors with the fourth component set
// to 1.0.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of vectors
//   stride - byte distance between source elements
// Tightly packed input (stride == sizeof(float[3])) is handled four vectors
// at a time from three 16-byte loads; the remaining code handles one vector
// per 16-byte load.
// The recurring idiom shuffle(2,1,0,3) / move_ss(1.0) / shuffle(0,3,2,1)
// rotates the lanes so the unwanted value sits in lane 0, overwrites it with
// 1.0, and rotates back, yielding (x, y, z, 1).
// NOTE(review): loop statements and most pointer increments are not visible
// in this excerpt. The single-vector paths load 16 bytes per element, so
// they read one float past each 12-byte xyz triple - confirm the source
// buffers tolerate that for the last element.
1430 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1432 float *end = dst + size*4;
1433 if (stride == sizeof(float[3]))
// end of the part that can be processed in groups of four vectors
1435 float *end4 = dst + (size&~3)*4;
1436 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1440 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1441 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1442 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1443 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1444 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1445 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1446 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1447 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1448 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1449 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1450 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1451 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1452 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1454 src += 4*sizeof(float[3]);
// aligned variant of the same four-vector expansion
1461 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1462 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1463 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1464 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1465 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1466 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1469 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1470 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1473 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475 src += 4*sizeof(float[3]);
// one vector at a time; unaligned loads if address or stride is misaligned
1479 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1483 __m128 v = _mm_loadu_ps((const float *)src);
1484 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1485 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1486 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1487 _mm_store_ps(dst, v);
1496 __m128 v = _mm_load_ps((const float *)src);
1497 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1498 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1500 _mm_store_ps(dst, v);
// Expands 2-float vectors into 4-float vectors (a, b, 0, 1).
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of vectors
//   stride - byte distance between source elements
// Tightly packed input (stride == sizeof(float[2])) is handled two vectors
// per 16-byte load.
// NOTE(review): loop statements and most pointer increments are not visible
// in this excerpt.
1507 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1509 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output vector
1510 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1511 if (stride == sizeof(float[2]))
// end of the part that can be processed in pairs
1513 float *end2 = dst + (size&~1)*4;
1514 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1518 __m128 v = _mm_loadu_ps((const float *)src);
// first vector: low two lanes of v followed by (0, 1)
1519 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second vector: high two lanes of v followed by (0, 1)
1520 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1522 src += 2*sizeof(float[2]);
1529 __m128 v = _mm_load_ps((const float *)src);
1530 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1531 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1533 src += 2*sizeof(float[2]);
// single vector: an 8-byte load replaces the low half of v2
1539 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte vectors into 4-float vectors, scaling each byte by 1/255.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element
//   size   - number of vectors
//   stride - byte distance between source elements
// Tightly packed input (stride == 4) is handled four vectors per 16-byte
// load; otherwise one vector is read as a single 32-bit integer.
// NOTE(review): loop statements and most pointer increments are not visible
// in this excerpt.
1545 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1547 float *end = dst + size*4;
1548 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1549 if (stride == sizeof(unsigned char[4]))
// end of the part that can be processed in groups of four vectors
1551 float *end4 = dst + (size&~3)*4;
1552 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen 16 bytes to two vectors of eight 16-bit values, then each half to
// four 32-bit integers before converting to float
1556 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1557 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1558 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1559 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1560 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1562 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same conversion
1569 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1570 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1571 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1572 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1575 src += 4*sizeof(unsigned char[4]);
// single vector: the four bytes are read through an int pointer
1581 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1582 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Stores the 4-float vector at src into dst with aligned 16-byte stores.
//   dst  - destination; end is dst + 4*size floats
//   src  - four floats, read once with an unaligned load
//   size - number of vectors
// NOTE(review): the loop statement and the dst increment are not visible in
// this excerpt; presumably the store repeats until dst reaches end.
1588 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1590 float *end = dst + 4*size;
1591 __m128 v = _mm_loadu_ps(src);
1594 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors from in4f by the 4x4 matrix
// inmatrix16f. Each result is computed as x*m0 + y*m1 + z*m2 + w*m3, where
// m0..m3 are the four consecutive groups of four floats in inmatrix16f.
//   out4f       - destination (may equal in4f)
//   in4f        - source vectors; any alignment is accepted
//   numitems    - number of vectors
//   inmatrix16f - 16 floats, read with unaligned loads
// NOTE(review): the loop statements and the stores of the results are not
// visible in this excerpt.
1600 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1603 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1604 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1606 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1608 // fast case for identity matrix
1609 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1612 end = out4f + numitems*4;
1613 m0 = _mm_loadu_ps(inmatrix16f);
1614 m1 = _mm_loadu_ps(inmatrix16f + 4);
1615 m2 = _mm_loadu_ps(inmatrix16f + 8);
1616 m3 = _mm_loadu_ps(inmatrix16f + 12);
1617 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each component and accumulate the weighted rows
1621 __m128 v = _mm_loadu_ps(in4f);
1623 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1624 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1625 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1626 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same computation with aligned loads
1635 __m128 v = _mm_load_ps(in4f);
1637 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1640 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f. The regions must not
// overlap (plain memcpy).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t nbytes = numitems * sizeof(float[4]);

	memcpy(out4f, in4f, nbytes);
}
// Perspective-divides a clip-space vertex and maps it to the viewport.
// The input lanes are rotated to (1, x, y, z) - lane 0 is replaced by the
// constant 1.0 - then scaled, divided by w, offset by viewportcenter and
// rotated back. viewportcenter and viewportscale are therefore applied in
// the rotated lane order: their lane 0 pairs with the constant, lanes 1-3
// with x, y, z. The last output lane ends up as center[0] + scale[0] / w.
// NOTE(review): the opening and closing lines of the macro body are not
// visible in this excerpt.
1654 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1656 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1657 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1658 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1659 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro with the same parameter list as DPSOFTRAST_PROJECTVERTEX.
// The lines visible here are identical to that macro: rotate lanes to
// (1, x, y, z), scale, divide by w, add viewportcenter, rotate back.
// NOTE(review): the opening and closing lines of the macro body are not
// visible in this excerpt, so any difference from PROJECTVERTEX cannot be
// confirmed from here.
1662 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1664 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1665 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1666 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1667 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a 4-float vertex by a matrix given as four __m128 values:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the declaration of p is on a line not visible in this
// excerpt; presumably it is initialized from `in`.
1670 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1673 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1674 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1675 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1676 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space vertical range for the axis-aligned box spanned by
// minpos/maxpos after transformation by m0..m3.
//   starty, endy   - outputs: integer range derived from the projected
//                    extremes
//   minpos, maxpos - box corners before transformation
//   viewportcenter, viewportscale - viewport mapping, in the lane order used
//                    by DPSOFTRAST_PROJECTVERTEX
//   m0..m3         - matrix as four __m128 values
// BBFRONT transforms one box corner, computes clipdist = z + w, and for
// corners with clipdist >= 0 clears that corner's bit in clipmask and folds
// its y/w into minproj/maxproj. BBCLIP handles a corner whose bit is still
// set: for each of its three edge neighbours (k^1, k^2, k^4) that is not
// clipped, it interpolates along the edge to clipdist == 0, projects that
// point and folds it into minproj/maxproj.
// NOTE(review): the return statement, the BBCLIP #define line and the first
// and last BBFRONT invocations are not visible in this excerpt; presumably
// the function returns clipmask.
1679 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// all eight corners start out marked as clipped
1681 int clipmask = 0xFF;
1682 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix row so the transformed y lands in
// lane 0, where the scalar (_ss) operations below act
1683 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1684 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1685 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1686 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1687 #define BBFRONT(k, pos) \
1689 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1690 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1691 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1694 clipmask &= ~(1<<k); \
1695 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1696 minproj = _mm_min_ss(minproj, proj); \
1697 maxproj = _mm_max_ss(maxproj, proj); \
1701 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1702 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1703 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1704 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1705 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1706 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1710 if (clipmask&(1<<k)) \
1712 if (!(clipmask&(1<<(k^1)))) \
1714 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1715 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1716 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1717 minproj = _mm_min_ss(minproj, proj); \
1718 maxproj = _mm_max_ss(maxproj, proj); \
1720 if (!(clipmask&(1<<(k^2)))) \
1722 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1723 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1724 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1725 minproj = _mm_min_ss(minproj, proj); \
1726 maxproj = _mm_max_ss(maxproj, proj); \
1728 if (!(clipmask&(1<<(k^4)))) \
1730 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1731 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1732 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1733 minproj = _mm_min_ss(minproj, proj); \
1734 maxproj = _mm_max_ss(maxproj, proj); \
1738 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors (the y slot in the rotated lane
// order) into lane 0 for the scalar math below
1739 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1740 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// limit the projected range to [-2, 2] before mapping to the viewport
1741 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1742 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1743 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1744 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (truncated, +1);
// presumably the y viewport scale is negative - TODO confirm
1745 *starty = _mm_cvttss_si32(maxproj);
1746 *endy = _mm_cvttss_si32(minproj)+1;
// Copies numitems vertices from in4f to out4f unchanged, writes their
// viewport-projected positions to screen4f, and tracks the component-wise
// minimum and maximum of the inputs. Returns the result of
// DPSOFTRAST_Vertex_BoundY for that bounding box with an identity matrix,
// which also fills *starty and *endy.
// NOTE(review): the loop statements and pointer increments are not visible
// in this excerpt.
1751 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1754 float *end = out4f + numitems*4;
1755 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1756 __m128 minpos, maxpos;
1757 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1759 minpos = maxpos = _mm_loadu_ps(in4f);
1762 __m128 v = _mm_loadu_ps(in4f);
1763 minpos = _mm_min_ps(minpos, v);
1764 maxpos = _mm_max_ps(maxpos, v);
1765 _mm_store_ps(out4f, v);
1766 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1767 _mm_store_ps(screen4f, v);
// aligned input: same processing with aligned loads
1775 minpos = maxpos = _mm_load_ps(in4f);
1778 __m128 v = _mm_load_ps(in4f);
1779 minpos = _mm_min_ps(minpos, v);
1780 maxpos = _mm_max_ps(maxpos, v);
1781 _mm_store_ps(out4f, v);
1782 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1783 _mm_store_ps(screen4f, v);
// the vertices are not transformed here, so the box is bounded with an
// identity matrix
1790 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1791 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1792 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1793 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1794 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems vertices from in4f by inmatrix16f into out4f, writes
// their viewport-projected positions to screen4f, and returns the result of
// DPSOFTRAST_Vertex_BoundY for the bounding box of the untransformed inputs
// together with the matrix. An identity matrix is delegated to
// DPSOFTRAST_Vertex_Project.
// NOTE(review): the loop statements and pointer increments are not visible
// in this excerpt.
1799 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1802 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1803 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// bytewise comparison against the identity matrix
1805 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1806 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1807 end = out4f + numitems*4;
1808 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1809 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1810 m0 = _mm_loadu_ps(inmatrix16f);
1811 m1 = _mm_loadu_ps(inmatrix16f + 4);
1812 m2 = _mm_loadu_ps(inmatrix16f + 8);
1813 m3 = _mm_loadu_ps(inmatrix16f + 12);
1814 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: bounds are tracked on the untransformed vertex
1816 minpos = maxpos = _mm_loadu_ps(in4f);
1819 __m128 v = _mm_loadu_ps(in4f);
1820 minpos = _mm_min_ps(minpos, v);
1821 maxpos = _mm_max_ps(maxpos, v);
1822 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1823 _mm_store_ps(out4f, v);
1824 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1825 _mm_store_ps(screen4f, v);
// aligned input: same processing with aligned loads
1833 minpos = maxpos = _mm_load_ps(in4f);
1836 __m128 v = _mm_load_ps(in4f);
1837 minpos = _mm_min_ps(minpos, v);
1838 maxpos = _mm_max_ps(maxpos, v);
1839 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1840 _mm_store_ps(out4f, v);
1841 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1842 _mm_store_ps(screen4f, v);
1849 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads the client vertex array identified by inarray into the 4-float
// working array dpsoftrast.post_array4f[outarray], for numvertices vertices
// starting at firstvertex.
// NOTE(review): the switch statement, the texcoord case labels, the break
// statements and the return are not visible in this excerpt; presumably the
// function returns outf and the component switch selects 2, 3 or 4.
1854 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1856 float *outf = dpsoftrast.post_array4f[outarray];
1857 const unsigned char *inb;
1858 int firstvertex = dpsoftrast.firstvertex;
1859 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, expanded to 4 with w = 1
1863 case DPSOFTRAST_ARRAY_POSITION:
1864 stride = dpsoftrast.stride_vertex;
1865 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1866 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else byte array if set, else the constant
// dpsoftrast.color replicated for every vertex
1868 case DPSOFTRAST_ARRAY_COLOR:
1869 stride = dpsoftrast.stride_color;
1870 if (dpsoftrast.pointer_color4f)
1872 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1873 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1875 else if (dpsoftrast.pointer_color4ub)
1877 stride = dpsoftrast.stride_color;
1878 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1879 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1883 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0; the loader is
// chosen by the unit's component count
1887 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1888 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1890 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1891 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1894 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1897 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1900 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms the working array post_array4f[outarray] in place by
// inmatrix16f. If inarray >= 0 the array is first loaded from the client
// array inarray; otherwise the existing contents are used.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the function returns data.
1909 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1911 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1912 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects the working array post_array4f[outarray] into
// dpsoftrast.screencoord4f and records the clip result in
// dpsoftrast.drawclipped and the vertical range in drawstarty/drawendy.
// If inarray >= 0 the array is first loaded from the client array inarray.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the function returns data.
1917 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1919 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1920 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms the working array post_array4f[outarray] in place by
// inmatrix16f, projects it into dpsoftrast.screencoord4f, and records the
// clip result in dpsoftrast.drawclipped and the vertical range in
// drawstarty/drawendy. If inarray >= 0 the array is first loaded from the
// client array inarray.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the function returns data.
1925 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1927 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1928 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes per-pixel 1/w values for a span. The exact reciprocal is
// evaluated only at subspan boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels) and interpolated linearly in between.
//   triangle->w - plane coefficients: w[0] x-slope, w[1] y-slope, w[2] base
//   span        - supplies x, y, startx and endx
//   zf          - NOTE(review): the statement that writes the results is not
//                 visible here; presumably they go to zf[x]
1932 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1935 int startx = span->startx;
1936 int endx = span->endx;
1937 float wslope = triangle->w[0];
// w at the span origin; x offsets within the span are added below
1938 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1939 float endz = 1.0f / (w + wslope * startx);
1940 for (x = startx;x < endx;)
1942 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final subspan ends on the last pixel of the span
1944 if(nextsub >= endx) nextsub = endsub = endx-1;
1945 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step inside the subspan; zero for a single-pixel subspan
1946 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1947 for (; x <= endsub; x++, z += dz)
// Writes a span of float colors (in4f, four floats per pixel) into the 8-bit
// color framebuffer using the thread's current blend mode.
// Input component 2 goes to framebuffer byte 0, component 1 to byte 1,
// component 0 to byte 2 and component 3 to byte 3, i.e. the first and third
// components are swapped on output.
// Results are clamped to 255 from above in every mode; only SUBALPHA also
// clamps at 0.
// NOTE(review): the per-pixel pixelmask checks, loop braces, break
// statements and a few declarations are not visible in this excerpt.
1952 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1955 int startx = span->startx;
1956 int endx = span->endx;
1959 unsigned char * RESTRICT pixelmask = span->pixelmask;
1960 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// pixel now points at the span origin; x indexes are relative to it
1963 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1964 // handle alphatest now (this affects depth writes too)
// pixels whose fourth component is below 0.5 are removed from the mask
1965 if (thread->alphatest)
1966 for (x = startx;x < endx;x++)
1967 if (in4f[x*4+3] < 0.5f)
1968 pixelmask[x] = false;
1969 // FIXME: this does not handle bigendian
1970 switch(thread->fb_blendmode)
// OPAQUE: dst = src * 255
1972 case DPSOFTRAST_BLENDMODE_OPAQUE:
1973 for (x = startx;x < endx;x++)
1977 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1978 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1979 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1980 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1981 pixel[x*4+0] = d[0];
1982 pixel[x*4+1] = d[1];
1983 pixel[x*4+2] = d[2];
1984 pixel[x*4+3] = d[3];
// ALPHA: dst = src * alpha * 255 + dst * (1 - alpha)
1987 case DPSOFTRAST_BLENDMODE_ALPHA:
1988 for (x = startx;x < endx;x++)
1992 a = in4f[x*4+3] * 255.0f;
1993 b = 1.0f - in4f[x*4+3];
1994 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1995 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1996 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1997 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1998 pixel[x*4+0] = d[0];
1999 pixel[x*4+1] = d[1];
2000 pixel[x*4+2] = d[2];
2001 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src * alpha * 255 + dst
2004 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2005 for (x = startx;x < endx;x++)
2009 a = in4f[x*4+3] * 255.0f;
2010 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2011 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2012 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2013 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2014 pixel[x*4+0] = d[0];
2015 pixel[x*4+1] = d[1];
2016 pixel[x*4+2] = d[2];
2017 pixel[x*4+3] = d[3];
// ADD: dst = src * 255 + dst
2020 case DPSOFTRAST_BLENDMODE_ADD:
2021 for (x = startx;x < endx;x++)
2025 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
// INVMOD: dst = (1 - src) * dst
2035 case DPSOFTRAST_BLENDMODE_INVMOD:
2036 for (x = startx;x < endx;x++)
2040 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044 pixel[x*4+0] = d[0];
2045 pixel[x*4+1] = d[1];
2046 pixel[x*4+2] = d[2];
2047 pixel[x*4+3] = d[3];
// MUL: dst = src * dst
2050 case DPSOFTRAST_BLENDMODE_MUL:
2051 for (x = startx;x < endx;x++)
2055 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059 pixel[x*4+0] = d[0];
2060 pixel[x*4+1] = d[1];
2061 pixel[x*4+2] = d[2];
2062 pixel[x*4+3] = d[3];
// MUL2: dst = src * dst * 2
2065 case DPSOFTRAST_BLENDMODE_MUL2:
2066 for (x = startx;x < endx;x++)
2070 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2071 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2072 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2073 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2074 pixel[x*4+0] = d[0];
2075 pixel[x*4+1] = d[1];
2076 pixel[x*4+2] = d[2];
2077 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * alpha * 255, clamped to [0, 255]
2080 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2081 for (x = startx;x < endx;x++)
2085 a = in4f[x*4+3] * -255.0f;
2086 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2087 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2088 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2089 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2090 pixel[x*4+0] = d[0];
2091 pixel[x*4+1] = d[1];
2092 pixel[x*4+2] = d[2];
2093 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - alpha); the assignment of a is on
// a line not visible in this excerpt
2096 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2097 for (x = startx;x < endx;x++)
2102 b = 1.0f - in4f[x*4+3];
2103 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2104 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2105 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2106 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2107 pixel[x*4+0] = d[0];
2108 pixel[x*4+1] = d[1];
2109 pixel[x*4+2] = d[2];
2110 pixel[x*4+3] = d[3];
// Writes a span of 8-bit-per-channel source colors (in4ub) into color buffer 0 of the framebuffer.
// thread supplies the alphatest flag and the blend mode (fb_blendmode); span supplies the pixel
// range [startx, endx), the framebuffer position (x, y) and the per-pixel pixelmask.
// triangle is not referenced by any of the statements below.
2116 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2120 int startx = span->startx;
2121 int endx = span->endx;
// the source colors are also read as one packed 32 bit value per pixel
2122 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2123 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32 bit view of the same destination color buffer
2124 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2125 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to the first framebuffer pixel of this span (row span->y, column span->x)
2128 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2129 pixeli += span->y * dpsoftrast.fb_width + span->x;
2130 // handle alphatest now (this affects depth writes too)
2131 if (thread->alphatest)
2132 for (x = startx;x < endx;x++)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" is only true for alpha == 0;
// if a 50% threshold is intended this should compare against 128 - confirm
2133 if (in4ub[x*4+3] < 0.5f)
2134 pixelmask[x] = false;
2135 // FIXME: this does not handle bigendian
2136 switch(thread->fb_blendmode)
// opaque: when four consecutive mask bytes are all 1, copy four pixels with one unaligned 128 bit store
2138 case DPSOFTRAST_BLENDMODE_OPAQUE:
2139 for (x = startx;x + 4 <= endx;)
2141 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2143 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1) is the masked blend loop used by every remaining mode:
// it switches on two mask bytes at a time, widens source and destination bytes to 16 bit
// lanes (src, dst), and packs dst back to bytes with unsigned saturation before storing.
// Two pixels are loaded together via the 64 bit loads, single pixels via the 32 bit loads;
// presumably blend2 is the two-pixel blend and blend1 the one-pixel blend - confirm.
// alpha blend: dst += (src - dst) * srcalpha / 256
2157 case DPSOFTRAST_BLENDMODE_ALPHA:
2158 #define FINISHBLEND(blend2, blend1) \
2159 for (x = startx;x + 2 <= endx;x += 2) \
2162 switch (*(const unsigned short*)&pixelmask[x]) \
2165 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2166 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2168 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2171 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2172 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2174 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2177 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2178 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2180 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2185 for(;x < endx; x++) \
2188 if (!pixelmask[x]) \
2190 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2191 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2193 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2197 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2198 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2200 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2201 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst += (src * srcalpha) >> 8
2204 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2206 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2207 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2209 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2210 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: dst = src + dst (saturated when packed back to bytes)
2213 case DPSOFTRAST_BLENDMODE_ADD:
2214 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2216 case DPSOFTRAST_BLENDMODE_INVMOD:
2218 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2220 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2223 case DPSOFTRAST_BLENDMODE_MUL:
2224 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2226 case DPSOFTRAST_BLENDMODE_MUL2:
2227 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst -= (src * srcalpha) >> 8
2229 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2231 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2232 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2234 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudo alpha: dst = src + dst - ((dst * srcalpha) >> 8)
2238 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2240 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2241 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2243 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2244 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2251 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2254 int startx = span->startx;
2255 int endx = span->endx;
2260 float tc[2], endtc[2];
2262 unsigned int tci[2];
2263 unsigned int tci1[2];
2264 unsigned int tcimin[2];
2265 unsigned int tcimax[2];
2270 const unsigned char * RESTRICT pixelbase;
2271 const unsigned char * RESTRICT pixel[4];
2272 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2273 // if no texture is bound, just fill it with white
2276 for (x = startx;x < endx;x++)
2278 out4f[x*4+0] = 1.0f;
2279 out4f[x*4+1] = 1.0f;
2280 out4f[x*4+2] = 1.0f;
2281 out4f[x*4+3] = 1.0f;
2285 mip = triangle->mip[texunitindex];
2286 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2287 // if this mipmap of the texture is 1 pixel, just fill it with that color
2288 if (texture->mipmap[mip][1] == 4)
2290 c[0] = texture->bytes[2] * (1.0f/255.0f);
2291 c[1] = texture->bytes[1] * (1.0f/255.0f);
2292 c[2] = texture->bytes[0] * (1.0f/255.0f);
2293 c[3] = texture->bytes[3] * (1.0f/255.0f);
2294 for (x = startx;x < endx;x++)
2296 out4f[x*4+0] = c[0];
2297 out4f[x*4+1] = c[1];
2298 out4f[x*4+2] = c[2];
2299 out4f[x*4+3] = c[3];
2303 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2304 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2305 flags = texture->flags;
2306 tcscale[0] = texture->mipmap[mip][2];
2307 tcscale[1] = texture->mipmap[mip][3];
2308 tciwidth = texture->mipmap[mip][2];
2311 tcimax[0] = texture->mipmap[mip][2]-1;
2312 tcimax[1] = texture->mipmap[mip][3]-1;
2313 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2314 tciwrapmask[1] = texture->mipmap[mip][3]-1;
2315 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2316 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2317 for (x = startx;x < endx;)
2319 unsigned int subtc[2];
2320 unsigned int substep[2];
2321 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2322 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2325 nextsub = endsub = endx-1;
2326 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2330 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2331 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2332 substep[0] = (endtc[0] - tc[0]) * subscale;
2333 substep[1] = (endtc[1] - tc[1]) * subscale;
2334 subtc[0] = tc[0] * (1<<16);
2335 subtc[1] = tc[1] * (1<<16);
2338 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2340 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2342 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2343 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2344 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2345 tci[0] = subtc[0]>>16;
2346 tci[1] = subtc[1]>>16;
2347 tci1[0] = tci[0] + 1;
2348 tci1[1] = tci[1] + 1;
2349 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2350 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2351 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2352 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2353 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2354 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2355 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2356 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2357 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2358 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2359 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2360 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2361 out4f[x*4+0] = c[0];
2362 out4f[x*4+1] = c[1];
2363 out4f[x*4+2] = c[2];
2364 out4f[x*4+3] = c[3];
2369 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2371 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2372 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2373 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2374 tci[0] = subtc[0]>>16;
2375 tci[1] = subtc[1]>>16;
2376 tci1[0] = tci[0] + 1;
2377 tci1[1] = tci[1] + 1;
2378 tci[0] &= tciwrapmask[0];
2379 tci[1] &= tciwrapmask[1];
2380 tci1[0] &= tciwrapmask[0];
2381 tci1[1] &= tciwrapmask[1];
2382 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2383 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2384 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2385 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2386 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2387 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2388 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2389 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2390 out4f[x*4+0] = c[0];
2391 out4f[x*4+1] = c[1];
2392 out4f[x*4+2] = c[2];
2393 out4f[x*4+3] = c[3];
2397 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2399 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2401 tci[0] = subtc[0]>>16;
2402 tci[1] = subtc[1]>>16;
2403 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2404 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2405 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2406 c[0] = pixel[0][2] * (1.0f / 255.0f);
2407 c[1] = pixel[0][1] * (1.0f / 255.0f);
2408 c[2] = pixel[0][0] * (1.0f / 255.0f);
2409 c[3] = pixel[0][3] * (1.0f / 255.0f);
2410 out4f[x*4+0] = c[0];
2411 out4f[x*4+1] = c[1];
2412 out4f[x*4+2] = c[2];
2413 out4f[x*4+3] = c[3];
2418 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2420 tci[0] = subtc[0]>>16;
2421 tci[1] = subtc[1]>>16;
2422 tci[0] &= tciwrapmask[0];
2423 tci[1] &= tciwrapmask[1];
2424 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2425 c[0] = pixel[0][2] * (1.0f / 255.0f);
2426 c[1] = pixel[0][1] * (1.0f / 255.0f);
2427 c[2] = pixel[0][0] * (1.0f / 255.0f);
2428 c[3] = pixel[0][3] * (1.0f / 255.0f);
2429 out4f[x*4+0] = c[0];
2430 out4f[x*4+1] = c[1];
2431 out4f[x*4+2] = c[2];
2432 out4f[x*4+3] = c[3];
2438 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2442 int startx = span->startx;
2443 int endx = span->endx;
2445 __m128 data, slope, tcscale;
2446 __m128i tcsize, tcmask, tcoffset, tcmax;
2448 __m128i subtc, substep, endsubtc;
2451 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2452 const unsigned char * RESTRICT pixelbase;
2453 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2454 // if no texture is bound, just fill it with white
2457 memset(out4ub + startx*4, 255, span->length*4);
2460 mip = triangle->mip[texunitindex];
2461 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2462 // if this mipmap of the texture is 1 pixel, just fill it with that color
2463 if (texture->mipmap[mip][1] == 4)
2465 unsigned int k = *((const unsigned int *)pixelbase);
2466 for (x = startx;x < endx;x++)
2470 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2471 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2472 flags = texture->flags;
2473 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2474 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2475 tcscale = _mm_cvtepi32_ps(tcsize);
2476 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2477 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2478 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2479 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2480 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2481 tcmax = _mm_packs_epi32(tcmask, tcmask);
2482 for (x = startx;x < endx;)
2484 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2485 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2488 nextsub = endsub = endx-1;
2489 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2493 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2494 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2495 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2496 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2497 substep = _mm_slli_epi32(substep, 1);
2500 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2501 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2503 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2504 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2506 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2507 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2508 tci = _mm_madd_epi16(tci, tcoffset);
2509 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2510 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2511 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2512 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2513 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2514 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2515 fracm = _mm_srli_epi16(subtc, 1);
2516 pix1 = _mm_add_epi16(pix1,
2517 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2518 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2519 pix3 = _mm_add_epi16(pix3,
2520 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2521 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2522 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2523 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2524 pix2 = _mm_add_epi16(pix2,
2525 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2526 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2527 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2531 const unsigned char * RESTRICT ptr1;
2532 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2533 tci = _mm_madd_epi16(tci, tcoffset);
2534 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2535 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2536 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2537 fracm = _mm_srli_epi16(subtc, 1);
2538 pix1 = _mm_add_epi16(pix1,
2539 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2540 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2541 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2542 pix1 = _mm_add_epi16(pix1,
2543 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2544 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2545 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2549 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2551 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2553 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2554 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2555 tci = _mm_madd_epi16(tci, tcoffset);
2556 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2557 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2558 _mm_setzero_si128());
2559 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2560 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2561 _mm_setzero_si128());
2562 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2563 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2564 tci = _mm_madd_epi16(tci, tcoffset);
2565 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2566 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2567 _mm_setzero_si128());
2568 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2569 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2570 _mm_setzero_si128());
2571 fracm = _mm_srli_epi16(subtc, 1);
2572 pix1 = _mm_add_epi16(pix1,
2573 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2574 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2575 pix3 = _mm_add_epi16(pix3,
2576 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2577 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2578 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2579 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2580 pix2 = _mm_add_epi16(pix2,
2581 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2582 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2583 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2587 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2588 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2589 tci = _mm_madd_epi16(tci, tcoffset);
2590 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2591 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2592 _mm_setzero_si128());
2593 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2594 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2595 _mm_setzero_si128());
2596 fracm = _mm_srli_epi16(subtc, 1);
2597 pix1 = _mm_add_epi16(pix1,
2598 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2599 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2600 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2601 pix1 = _mm_add_epi16(pix1,
2602 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2603 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2604 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2610 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2612 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2613 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2614 tci = _mm_madd_epi16(tci, tcoffset);
2615 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2616 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2617 _mm_setzero_si128());
2618 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2619 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2620 _mm_setzero_si128());
2621 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2622 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2623 tci = _mm_madd_epi16(tci, tcoffset);
2624 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2625 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2626 _mm_setzero_si128());
2627 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2628 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2629 _mm_setzero_si128());
2630 fracm = _mm_srli_epi16(subtc, 1);
2631 pix1 = _mm_add_epi16(pix1,
2632 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2633 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2634 pix3 = _mm_add_epi16(pix3,
2635 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2636 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2637 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2638 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2639 pix2 = _mm_add_epi16(pix2,
2640 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2641 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2642 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2646 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2647 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2648 tci = _mm_madd_epi16(tci, tcoffset);
2649 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2650 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2651 _mm_setzero_si128());
2652 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2653 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2654 _mm_setzero_si128());
2655 fracm = _mm_srli_epi16(subtc, 1);
2656 pix1 = _mm_add_epi16(pix1,
2657 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2659 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2660 pix1 = _mm_add_epi16(pix1,
2661 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2663 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2670 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2672 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2674 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2675 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2676 tci = _mm_madd_epi16(tci, tcoffset);
2677 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2678 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2682 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2683 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2684 tci = _mm_madd_epi16(tci, tcoffset);
2685 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2691 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2693 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2694 tci = _mm_and_si128(tci, tcmax);
2695 tci = _mm_madd_epi16(tci, tcoffset);
2696 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2697 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2701 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2702 tci = _mm_and_si128(tci, tcmax);
2703 tci = _mm_madd_epi16(tci, tcoffset);
2704 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2713 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2716 memset(out4ub, 255, span->length*4);
// NOTE(review): only the declaration line of this function is visible here. Judging by the name
// it presumably returns a shadow map sample for the direction given in vector - confirm against
// the body before relying on this.
2719 float DPSOFTRAST_SampleShadowmap(const float *vector)
2725 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2728 int startx = span->startx;
2729 int endx = span->endx;
2734 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2735 for (x = startx;x < endx;x++)
2738 c[0] = (data[0] + slope[0]*x) * z;
2739 c[1] = (data[1] + slope[1]*x) * z;
2740 c[2] = (data[2] + slope[2]*x) * z;
2741 c[3] = (data[3] + slope[3]*x) * z;
2742 out4f[x*4+0] = in4f[x*4+0] * c[0];
2743 out4f[x*4+1] = in4f[x*4+1] * c[1];
2744 out4f[x*4+2] = in4f[x*4+2] * c[2];
2745 out4f[x*4+3] = in4f[x*4+3] * c[3];
2749 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2752 int startx = span->startx;
2753 int endx = span->endx;
2758 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2759 for (x = startx;x < endx;x++)
2762 c[0] = (data[0] + slope[0]*x) * z;
2763 c[1] = (data[1] + slope[1]*x) * z;
2764 c[2] = (data[2] + slope[2]*x) * z;
2765 c[3] = (data[3] + slope[3]*x) * z;
2766 out4f[x*4+0] = c[0];
2767 out4f[x*4+1] = c[1];
2768 out4f[x*4+2] = c[2];
2769 out4f[x*4+3] = c[3];
2773 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2775 int x, startx = span->startx, endx = span->endx;
2776 float c[4], localcolor[4];
2777 localcolor[0] = subcolor[0];
2778 localcolor[1] = subcolor[1];
2779 localcolor[2] = subcolor[2];
2780 localcolor[3] = subcolor[3];
2781 for (x = startx;x < endx;x++)
2783 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2784 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2785 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2786 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2787 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2788 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2789 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2790 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2794 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2796 int x, startx = span->startx, endx = span->endx;
2797 for (x = startx;x < endx;x++)
2799 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2800 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2801 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2802 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2806 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2808 int x, startx = span->startx, endx = span->endx;
2809 for (x = startx;x < endx;x++)
2811 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2812 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2813 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2814 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Mixes two float buffers per pixel over [span->startx, span->endx): every component of out4f is
// ina4f * a + inb4f * b, where a is one minus the fourth component of inb4f.
// NOTE(review): the statement that assigns b is not among the lines shown here; presumably b is
// the fourth component of inb4f itself (so a + b == 1) - confirm.
// triangle is not referenced by any of the statements below.
2818 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2820 int x, startx = span->startx, endx = span->endx;
2822 for (x = startx;x < endx;x++)
// a is the weight of ina4f
2824 a = 1.0f - inb4f[x*4+3];
// all four components, including the fourth one, are mixed with the same weights
2826 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2827 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2828 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2829 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2833 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2835 int x, startx = span->startx, endx = span->endx;
2836 float localcolor[4], ilerp, lerp;
2837 localcolor[0] = color[0];
2838 localcolor[1] = color[1];
2839 localcolor[2] = color[2];
2840 localcolor[3] = color[3];
2841 ilerp = 1.0f - localcolor[3];
2842 lerp = localcolor[3];
2843 for (x = startx;x < endx;x++)
2845 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2846 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2847 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2848 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies a BGRA8 span by an interpolated vertex attribute.
// For pixels [span->startx, span->endx): out4ub = in4ub * attribute(arrayindex),
// where the attribute at pixel x is (data + slope * x) * zf[x], converted to
// 16-bit fixed point with a scale of 256.
// NOTE(review): several short lines of this function (declarations of x, data,
// slope, mod and endmod, block guards, and the statements that carry
// endmod/endsubmod over into mod/submod) are not visible in this excerpt.
2854 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2858 int startx = span->startx;
2859 int endx = span->endx;
2862 __m128i submod, substep, endsubmod;
// DPSOFTRAST_CALCATTRIB is defined elsewhere; it is expected to fill data/slope with
// the attribute value and its per-pixel gradient for this span -- confirm against the macro
2863 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 and keep 1 and 3, i.e. RGBA order becomes BGRA order
2864 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2865 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as integer scaled by 256
2866 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2867 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// attribute is evaluated at the sub-span ends and stepped linearly in between
2868 for (x = startx; x < endx;)
2870 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2871 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last, possibly shorter sub-span: end on the final pixel and rescale the step
// (the guard around these two lines is not visible in this excerpt)
2874 nextsub = endsub = endx-1;
2875 if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the end of the sub-span, and the integer step per pixel toward it
2879 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2880 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2881 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16 bit: low half = modulator for pixel x, high half = modulator for pixel x+1
2882 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2883 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): each iteration consumes two pixels but advances submod by a single
// per-pixel substep, so the modulator appears to advance at half rate inside a
// sub-span -- confirm whether substep should be doubled here
2884 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes go into the high byte of each 16-bit lane (value*256), so the unsigned
// high multiply yields in * submod / 256; packus saturates back to 8 bit
2886 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2887 pix = _mm_mulhi_epu16(pix, submod);
2888 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single trailing pixel of the sub-span (its guard and the x increment are not visible here)
2892 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2893 pix = _mm_mulhi_epu16(pix, submod);
2894 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute directly as BGRA8 pixels.
// For pixels [span->startx, span->endx) the attribute (data + slope * x) * zf[x]
// is converted to 16-bit fixed point with a scale of 4095, shifted right by 4
// (giving 0..255 for inputs in 0..1) and stored with unsigned saturation.
// NOTE(review): several short lines of this function (declarations of x, data,
// slope, mod and endmod, block guards, and the statements that carry
// endmod/endsubmod over into mod/submod) are not visible in this excerpt.
2901 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2905 int startx = span->startx;
2906 int endx = span->endx;
2909 __m128i submod, substep, endsubmod;
// DPSOFTRAST_CALCATTRIB is defined elsewhere; it is expected to fill data/slope with
// the attribute value and its per-pixel gradient for this span -- confirm against the macro
2910 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 and keep 1 and 3, i.e. RGBA order becomes BGRA order
2911 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2912 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as integer scaled by 4095
2913 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2914 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// attribute is evaluated at the sub-span ends and stepped linearly in between
2915 for (x = startx; x < endx;)
2917 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2918 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last, possibly shorter sub-span: end on the final pixel and rescale the step
// (the guard around these two lines is not visible in this excerpt)
2921 nextsub = endsub = endx-1;
2922 if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the end of the sub-span, and the integer step per pixel toward it
2926 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2927 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2928 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16 bit: low half = value for pixel x, high half = value for pixel x+1
2929 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2930 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): each iteration consumes two pixels but advances submod by a single
// per-pixel substep, so the value appears to advance at half rate inside a
// sub-span -- confirm whether substep should be doubled here
2931 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to unsigned 8 bit
2933 __m128i pix = _mm_srai_epi16(submod, 4);
2934 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single trailing pixel of the sub-span (its guard and the x increment are not visible here)
2938 __m128i pix = _mm_srai_epi16(submod, 4);
2939 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Adds a second BGRA8 buffer onto a BGRA8 span after subtracting a uniform color:
//   out4ub = saturate_u8(ina4ub + (inb4ub - subcolor * 255)), per component,
// for pixels [span->startx, span->endx). subcolor is four floats (read unaligned)
// whose components 0 and 2 are swapped here to match the BGRA byte order.
// NOTE(review): inb - subcolor is a plain 16-bit subtract and is not clamped at
// zero before the add, so an inb sample below subcolor lowers the result; only
// the final sum is saturated. Confirm that this is intended.
2946 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2949 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 as integers, reordered RGBA -> BGRA
2950 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrowed to 16 bit, with the same four components in both halves (one copy per pixel)
2951 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// two pixels per iteration
2952 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16 bit per component
2954 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2955 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2956 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2957 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single trailing pixel (its guard is not visible in this excerpt)
2961 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2962 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2963 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2964 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Multiplies two BGRA8 span buffers component by component:
//   out4ub = (ina4ub * inb4ub) / 256, for pixels [span->startx, span->endx).
// triangle is not referenced in the visible lines.
2969 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2972 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
2973 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina zero-extended to 16 bit; pix2: inb in the high byte of each lane (value*256)
2975 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2976 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of the unsigned product = ina * inb / 256
2977 pix1 = _mm_mulhi_epu16(pix1, pix2);
2978 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single trailing pixel (its guard is not visible in this excerpt)
2982 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2983 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2984 pix1 = _mm_mulhi_epu16(pix1, pix2);
2985 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds two BGRA8 span buffers component by component with saturation at 255:
//   out4ub = saturate_u8(ina4ub + inb4ub), for pixels [span->startx, span->endx).
// triangle is not referenced in the visible lines.
2990 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2993 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
2994 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16 bit so the sum cannot wrap before the saturating pack
2996 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2997 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2998 pix1 = _mm_add_epi16(pix1, pix2);
2999 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single trailing pixel (its guard is not visible in this excerpt)
3003 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3004 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3005 pix1 = _mm_add_epi16(pix1, pix2);
3006 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb onto ina with saturation at 255:
//   out4ub = saturate_u8(ina4ub + inb4ub * tint), for pixels [span->startx, span->endx).
// inbtintbgra is four floats (read unaligned) used in the order given; no component
// swap is applied here, so the caller is expected to pass them in BGRA order as the
// parameter name says.
3011 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3014 int x, startx = span->startx, endx = span->endx;
// tint as integers scaled by 256, narrowed to 16 bit and repeated for both pixels
3015 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3016 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
// two pixels per iteration
3017 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina zero-extended; pix2: inb in the high byte of each lane (value*256)
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// (tint*256 * inb*256) >> 16 = tint * inb, added to ina
3021 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3022 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single trailing pixel (its guard is not visible in this excerpt)
3026 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3028 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3029 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha-blends inb over ina using the alpha byte of each inb pixel:
//   out4ub = ina4ub + (inb4ub - ina4ub) * inb_alpha / 256, per component,
// for pixels [span->startx, span->endx). The alpha component is blended the same way.
3034 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3037 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3038 for (x = startx;x+2 <= endx;x+=2)
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast component 3 (alpha) of each inb pixel across that pixel's four lanes
3042 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high multiply gives diff * alpha / 256
3043 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3044 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single trailing pixel (its guard is not visible in this excerpt); only the low half is used
3048 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3051 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3052 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span toward one uniform color, weighted by that color's alpha:
//   out4ub = in4ub + (color*255 - in4ub) * (color_alpha*255) / 256, per component,
// for pixels [span->startx, span->endx). color is four floats (read unaligned)
// whose components 0 and 2 are swapped here to match the BGRA byte order.
3057 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3060 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 as integers, reordered RGBA -> BGRA
3061 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrowed to 16 bit, with the same four components in both halves (one copy per pixel)
3062 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// alpha broadcast to every lane and pre-shifted by 4 for the high multiply below
3063 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3064 for (x = startx;x+2 <= endx;x+=2)
3066 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// ((color - pix) << 4) * (alpha << 4) >> 16 = (color - pix) * alpha / 256
3067 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3068 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single trailing pixel (its guard is not visible in this excerpt)
3072 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3073 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3074 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3081 void DPSOFTRAST_VertexShader_Generic(void)
3083 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3084 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3085 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3086 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3087 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3090 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3092 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3093 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3094 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3095 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3096 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3097 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3099 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3100 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3101 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3103 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3104 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3107 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3109 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3112 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3114 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3117 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3122 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3123 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3128 void DPSOFTRAST_VertexShader_PostProcess(void)
3130 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3131 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3132 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3135 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3137 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3138 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3139 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3143 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3145 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3146 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3148 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3149 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3151 // TODO: implement saturation
3153 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3155 // TODO: implement gammaramps
3157 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3162 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3164 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3167 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3169 // this is never called (because colormask is off when this shader is used)
3170 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3171 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3172 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3173 memset(buffer_FragColorbgra8, 0, span->length*4);
3174 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3179 void DPSOFTRAST_VertexShader_FlatColor(void)
3181 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3182 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3185 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3187 int x, startx = span->startx, endx = span->endx;
3188 int Color_Ambienti[4];
3189 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3190 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3191 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3192 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3193 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3194 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3195 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3196 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3197 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3198 for (x = startx;x < endx;x++)
3200 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3201 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3202 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3203 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3205 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3210 void DPSOFTRAST_VertexShader_VertexColor(void)
3212 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3213 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3214 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader:
//   pixel = texel * (Color_Diffuse * interpolated vertex color + Color_Ambient)
// for the BGR components; the alpha lane of Color_Diffuse is zeroed and the alpha
// lane of Color_Ambient is replaced by the Alpha uniform, so output alpha is
// texel alpha * Alpha.
// Output goes straight into the framebuffer when the span is opaque and not
// alpha-tested; otherwise it goes into a temporary buffer that is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short lines of this function (declarations of data, slope,
// mod2 and pix2, block braces, and the statements that step x after the 4-wide
// path or skip masked pixels) are not visible in this excerpt.
3217 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3220 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span in color buffer 0, four bytes per pixel
3221 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3222 int x, startx = span->startx, endx = span->endx;
3223 __m128i Color_Ambientm, Color_Diffusem;
3225 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3226 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3229 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3230 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer
3231 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3232 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 and reordered RGBA -> BGRA; alpha lane cleared and
// replaced by the Alpha uniform scaled by 255; narrowed to 16 bit for two pixels
3233 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3234 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3235 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3236 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096, reordered RGBA -> BGRA, alpha lane cleared
3237 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3238 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3239 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// DPSOFTRAST_CALCATTRIB is defined elsewhere; it is expected to fill data/slope with
// the color attribute and its per-pixel gradient -- confirm against the macro.
// Both are reordered to BGRA, advanced to startx and scaled by 4096.
3240 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3241 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3242 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3243 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3244 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3245 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// the loop increment steps the attribute by one pixel
3246 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3248 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3249 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3252 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3253 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// attribute * z for four consecutive pixels; data is stepped three times here,
// the fourth step is the loop increment
3254 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3255 data = _mm_add_ps(data, slope);
3256 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3257 data = _mm_add_ps(data, slope);
3258 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3259 data = _mm_add_ps(data, slope);
3260 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse*4096 * mod*4096) >> 16 is diffuse*mod scaled by 256; add ambient (scaled
// by 256), then multiply by the texel placed in the high byte of each lane
3261 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3262 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3263 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3264 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3265 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path, same arithmetic on one texel
3271 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3272 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3273 mod = _mm_packs_epi32(mod, mod);
3274 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3275 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still has to be finished into the framebuffer
3277 if(pixel == buffer_FragColorbgra8)
3278 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3284 void DPSOFTRAST_VertexShader_Lightmap(void)
3286 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3287 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3288 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
//   pixel = texel * (Color_Diffuse * lightmap + Color_Ambient) [+ Color_Glow * glow]
// for the BGR components; the alpha lanes of Color_Diffuse and Color_Glow are
// zeroed and the alpha lane of Color_Ambient is replaced by the Alpha uniform.
// The glow term is only computed for the GLOW permutation.
// Output goes straight into the framebuffer when the span is opaque and not
// alpha-tested; otherwise it goes into a temporary buffer that is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short lines of this function (block braces, the else
// between the glow and non-glow loops, the pix2 declarations, and the statements
// that step x after the 4-wide path or skip masked pixels) are not visible in
// this excerpt.
3291 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3294 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span in color buffer 0, four bytes per pixel
3295 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3296 int x, startx = span->startx, endx = span->endx;
3297 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3298 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3299 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3300 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3302 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3303 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap uses TEXCOORD4
3304 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3305 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer
3306 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3307 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 and reordered RGBA -> BGRA; alpha lane cleared and
// replaced by the Alpha uniform scaled by 255; narrowed to 16 bit for two pixels
3308 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3309 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3310 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3311 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 256, reordered RGBA -> BGRA, alpha lane cleared
3312 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3313 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3314 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3315 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture with TEXCOORD0 and prepare its color scale
3317 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3318 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3319 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3320 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow scale; used by the single-pixel path below
3321 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3322 for (x = startx;x < endx;x++)
3324 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3325 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3328 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3329 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3330 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// texels sit in the high byte of each 16-bit lane, so every unsigned high multiply
// divides by 65536: (diffuse*lightmap + ambient) * color + glowscale * glow
3331 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3332 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3333 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3334 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3335 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3336 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3337 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color is computed in the low half and the glow term in
// the high half of one register (the lightmap's high half is zero), then the halves are summed
3343 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3344 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3345 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3346 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3347 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3348 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow: same lighting arithmetic, no glow term
3353 for (x = startx;x < endx;x++)
3355 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3356 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3359 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3360 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3361 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3362 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3363 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3364 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3365 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3371 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3372 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3373 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3374 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still has to be finished into the framebuffer
3377 if(pixel == buffer_FragColorbgra8)
3378 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3384 void DPSOFTRAST_VertexShader_FakeLight(void)
3386 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3389 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3392 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3393 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3394 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3395 memset(buffer_FragColorbgra8, 0, span->length*4);
3396 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light direction maps.
// No dedicated implementation: forwards to the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3406 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3408 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light direction maps.
// No dedicated implementation: forwards to the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3419 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3421 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex shader for the LightDirection path.
// Takes no arguments and returns nothing; it works entirely on the global dpsoftrast state.
// For each of dpsoftrast.numvertices vertices it projects the LightDir uniform and the
// position-to-eye vector (EyePosition uniform minus vertex position) onto the three
// per-vertex vectors read from TEXCOORD1/TEXCOORD2/TEXCOORD3, then stores the results
// back into post_array4f[TEXCOORD1] (light vector) and post_array4f[TEXCOORD2] (eye vector).
3427 void DPSOFTRAST_VertexShader_LightDirection(void)
3430 int numvertices = dpsoftrast.numvertices;
3432 float LightVector[4];
3433 float EyePosition[4];
3434 float EyeVectorModelSpace[4];
// copy the LightDir and EyePosition uniforms (4 floats each) into locals;
// only elements 0..2 of each are used by the math below
3440 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3441 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3442 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3443 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3444 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3445 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3446 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3447 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// stage the arrays the loop reads from post_array4f; TEXCOORD0 is passed through the TexMatrixM1 uniform
// NOTE(review): presumably DPSOFTRAST_Array_Load fills post_array4f[out] from the matching input array - confirm against its definition
3448 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3449 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3450 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3451 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3452 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3453 for (i = 0;i < numvertices;i++)
// take local copies of position and of the svector/tvector/normal triple first,
// because TEXCOORD1 and TEXCOORD2 are overwritten further down
3455 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3456 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3457 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3458 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3459 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3460 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3461 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3462 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3463 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3464 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3465 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3466 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (svector . LightDir, tvector . LightDir, normal . LightDir)
3467 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3468 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3469 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// TEXCOORD1 now carries the light vector, with the fourth component set to 0
3470 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3471 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3472 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3473 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex position to the eye position
3474 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3475 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3476 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// EyeVector = the same three dot products, applied to the eye vector
3477 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3478 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3479 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector, with the fourth component set to 0
3480 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3481 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3482 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3483 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// last step: hand POSITION and the ModelViewProjectionMatrixM1 uniform to DPSOFTRAST_Array_TransformProject
// NOTE(review): the meaning of the -1 argument is defined by that helper - confirm there
3485 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar / 3-component vector helper macros.
// These are plain textual macros: an argument may be expanded (and evaluated) more than once,
// so do not pass expressions with side effects.
// smaller / larger of two values
3488 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3489 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product over elements [0], [1], [2] of a and b
3490 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length: dot product of v with itself
3491 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// length: sqrt() of the squared length
3492 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3493 #define DPSOFTRAST_Vector3Normalize(v)\
3496 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection path: shades the pixels startx..endx-1 of one span into
// buffer_FragColorbgra8 and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies shader_permutation and the uniform4f table read below
//   triangle - forwarded to the span/texture helpers and DPSOFTRAST_CALCATTRIB4F
//   span     - supplies startx/endx and is forwarded to the same helpers
// The per-pixel formula is picked from thread->shader_permutation:
// SPECULAR first, otherwise DIFFUSE, otherwise the ambient-only loop at the end.
3507 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: one float per pixel for z, four bytes per pixel for each sampled texture and for the output
3509 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3510 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3511 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3514 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3515 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3516 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3517 int x, startx = span->startx, endx = span->endx;
3518 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3519 float LightVectordata[4];
3520 float LightVectorslope[4];
3521 float EyeVectordata[4];
3522 float EyeVectorslope[4];
3524 float diffusetex[4];
3526 float surfacenormal[4];
3527 float lightnormal[4];
3529 float specularnormal[4];
3532 float SpecularPower;
// colour uniforms are copied with elements 0 and 2 swapped so their layout matches the
// bgra8 texel buffers they are multiplied with (presumably the uniforms are stored RGB - confirm)
3534 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3535 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3536 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3537 Color_Glow[3] = 0.0f;
3538 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3539 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3540 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component comes from the separate Alpha uniform; it scales the output alpha in every branch
3541 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3542 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3543 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3544 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3545 Color_Pants[3] = 0.0f;
3546 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3547 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3548 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3549 Color_Shirt[3] = 0.0f;
// buffer_z is filled here and then passed to every texture fetch below
3550 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// the colour texture is always sampled; pants/shirt and glow only when their permutation bits are set
3551 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3552 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3554 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3555 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3557 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3559 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- SPECULAR variant: ambient + diffuse + specular (+ optional glow) ----
3561 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3563 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3564 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3565 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3566 Color_Diffuse[3] = 0.0f;
3567 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3568 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3569 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3570 LightColor[3] = 0.0f;
// interpolation base/slope for TEXCOORD1 (presumably the light vector written by DPSOFTRAST_VertexShader_LightDirection - confirm)
3571 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3572 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3573 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3574 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3575 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3576 Color_Specular[3] = 0.0f;
// pre-scaled by 1/255 because it is multiplied by the 0..255 gloss alpha byte in the pow() call below
3577 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// interpolation base/slope for TEXCOORD2 (presumably the eye vector from the vertex shader - confirm)
3578 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3579 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3580 for (x = startx;x < endx;x++)
// texel values stay in the 0..255 byte range as floats
3583 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3584 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3585 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3586 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3587 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the tinted pants and shirt layers; Color_Pants[3] and Color_Shirt[3] are 0, so alpha is unchanged
3589 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3590 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3591 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3592 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3594 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3595 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3596 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3597 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1, reading byte 2 as x, byte 1 as y and byte 0 as z
3598 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3599 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3600 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3601 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolate the light vector at pixel x (base + slope*x), scale by z, then normalize
// NOTE(review): z is presumably this pixel's entry of buffer_z - confirm
3603 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3604 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3605 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3606 DPSOFTRAST_Vector3Normalize(lightnormal);
// same interpolation for the eye vector
3608 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3609 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3610 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3611 DPSOFTRAST_Vector3Normalize(eyenormal);
// normalized sum of the light and eye directions, used for the specular term
3613 specularnormal[0] = lightnormal[0] + eyenormal[0];
3614 specularnormal[1] = lightnormal[1] + eyenormal[1];
3615 specularnormal[2] = lightnormal[2] + eyenormal[2];
3616 DPSOFTRAST_Vector3Normalize(specularnormal);
// both dot products are clamped at 0 from below
3618 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3619 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// note: pow() returns 1 when the exponent is 0, even for a base of 0
3620 specular = pow(specular, SpecularPower * glosstex[3]);
3621 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow + ambient + (diffuse + specular) * LightColor; each result is clamped to 255 from above only
3623 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3624 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3625 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3626 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3630 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3631 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3632 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3633 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3635 buffer_FragColorbgra8[x*4+0] = d[0];
3636 buffer_FragColorbgra8[x*4+1] = d[1];
3637 buffer_FragColorbgra8[x*4+2] = d[2];
3638 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- DIFFUSE variant: ambient + diffuse (+ optional glow), no gloss texture or eye vector ----
3641 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3643 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3644 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3645 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3646 Color_Diffuse[3] = 0.0f;
3647 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3648 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3649 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3650 LightColor[3] = 0.0f;
3651 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3652 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3653 for (x = startx;x < endx;x++)
// unlike the specular loop, no pants/shirt contribution is added to diffusetex here
3656 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3657 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3658 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3659 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// normal map decode, identical to the specular loop
3660 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3661 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3662 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3663 DPSOFTRAST_Vector3Normalize(surfacenormal);
3665 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3666 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3667 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3668 DPSOFTRAST_Vector3Normalize(lightnormal);
3670 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3671 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow + diffusetex * (ambient + diffuse * LightColor), clamped to 255 from above
3673 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3674 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3675 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3676 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3680 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3681 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3682 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3683 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3685 buffer_FragColorbgra8[x*4+0] = d[0];
3686 buffer_FragColorbgra8[x*4+1] = d[1];
3687 buffer_FragColorbgra8[x*4+2] = d[2];
3688 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- ambient-only variant (presumably the final else branch - confirm): colour texture * ambient (+ optional glow) ----
3693 for (x = startx;x < endx;x++)
// no normal map, light vector or pants/shirt layers are used in this loop
3696 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3697 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3698 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3699 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3701 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3703 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3704 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3705 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3706 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3710 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3711 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3712 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3713 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3715 buffer_FragColorbgra8[x*4+0] = d[0];
3716 buffer_FragColorbgra8[x*4+1] = d[1];
3717 buffer_FragColorbgra8[x*4+2] = d[2];
3718 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finishing routine
3721 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource path.
// Takes no arguments and returns nothing; it works entirely on the global dpsoftrast state.
// For each of dpsoftrast.numvertices vertices it builds the vertex-to-light vector
// (LightPosition uniform minus position) and the vertex-to-eye vector (EyePosition uniform
// minus position), projects both onto the per-vertex vectors read from TEXCOORD1/2/3, and
// stores the results in post_array4f[TEXCOORD1] (light) and post_array4f[TEXCOORD2] (eye).
3726 void DPSOFTRAST_VertexShader_LightSource(void)
3729 int numvertices = dpsoftrast.numvertices;
3730 float LightPosition[4];
3731 float LightVector[4];
3732 float LightVectorModelSpace[4];
3733 float EyePosition[4];
3734 float EyeVectorModelSpace[4];
// copy the LightPosition and EyePosition uniforms (4 floats each) into locals;
// only elements 0..2 of each are used by the math below
3740 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3741 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3742 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3743 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3744 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3745 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3746 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3747 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// stage the arrays the loop reads from post_array4f; TEXCOORD0 is passed through the TexMatrixM1 uniform
// TEXCOORD4 is loaded as well but is not touched again in this function
// NOTE(review): presumably DPSOFTRAST_Array_Load fills post_array4f[out] from the matching input array - confirm against its definition
3748 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3749 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3750 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3751 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3752 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3753 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3754 for (i = 0;i < numvertices;i++)
// take local copies of position and of the svector/tvector/normal triple first,
// because TEXCOORD1 and TEXCOORD2 are overwritten further down
3756 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3757 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3758 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3759 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3760 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3761 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3762 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3763 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3764 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3765 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3766 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3767 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex position to the light position
3768 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3769 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3770 LightVectorModelSpace[2] = LightPosition[2] - position[2];
// LightVector = (svector . L, tvector . L, normal . L) with L = LightVectorModelSpace
3771 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3772 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3773 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector, with the fourth component set to 0
3774 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3775 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3776 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3777 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex position to the eye position
3778 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3779 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3780 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// EyeVector = the same three dot products, applied to the eye vector
3781 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3782 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3783 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector, with the fourth component set to 0
3784 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3785 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3786 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3787 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// hand POSITION and the ModelViewProjectionMatrixM1 uniform to DPSOFTRAST_Array_TransformProject
// NOTE(review): the meaning of the -1 argument is defined by that helper - confirm there
3789 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 is regenerated from POSITION using the ModelToLightM1 uniform, replacing the normal loaded earlier
// NOTE(review): whether this reads the original or the already-projected POSITION depends on DPSOFTRAST_Array_Transform - confirm
3790 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3793 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3796 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3797 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3798 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3799 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3800 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3801 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3802 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3803 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3804 int x, startx = span->startx, endx = span->endx;
3805 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3806 float CubeVectordata[4];
3807 float CubeVectorslope[4];
3808 float LightVectordata[4];
3809 float LightVectorslope[4];
3810 float EyeVectordata[4];
3811 float EyeVectorslope[4];
3813 float diffusetex[4];
3815 float surfacenormal[4];
3816 float lightnormal[4];
3818 float specularnormal[4];
3821 float SpecularPower;
3822 float CubeVector[4];
3825 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3826 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3827 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3828 Color_Glow[3] = 0.0f;
3829 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3830 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3831 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3832 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3833 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3834 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3835 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3836 Color_Diffuse[3] = 0.0f;
3837 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3838 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3839 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3840 Color_Specular[3] = 0.0f;
3841 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3842 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3843 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3844 Color_Pants[3] = 0.0f;
3845 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3846 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3847 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3848 Color_Shirt[3] = 0.0f;
3849 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3850 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3851 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3852 LightColor[3] = 0.0f;
3853 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3854 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3855 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3856 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3857 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3858 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3859 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3860 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3862 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3863 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3865 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3866 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3867 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3869 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3870 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3871 for (x = startx;x < endx;x++)
3874 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3875 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3876 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3877 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3878 if (attenuation < 0.01f)
3880 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3882 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3883 if (attenuation < 0.01f)
3887 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3888 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3889 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3890 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3891 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3893 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3894 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3895 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3896 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3898 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3899 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3900 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3901 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3902 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3903 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3904 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3905 DPSOFTRAST_Vector3Normalize(surfacenormal);
3907 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3908 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3909 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3910 DPSOFTRAST_Vector3Normalize(lightnormal);
3912 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3913 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3914 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3915 DPSOFTRAST_Vector3Normalize(eyenormal);
3917 specularnormal[0] = lightnormal[0] + eyenormal[0];
3918 specularnormal[1] = lightnormal[1] + eyenormal[1];
3919 specularnormal[2] = lightnormal[2] + eyenormal[2];
3920 DPSOFTRAST_Vector3Normalize(specularnormal);
3922 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3923 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3924 specular = pow(specular, SpecularPower * glosstex[3]);
3925 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3927 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3928 attenuation *= (1.0f / 255.0f);
3929 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3930 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3931 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3932 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3936 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3937 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3938 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3939 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3941 buffer_FragColorbgra8[x*4+0] = d[0];
3942 buffer_FragColorbgra8[x*4+1] = d[1];
3943 buffer_FragColorbgra8[x*4+2] = d[2];
3944 buffer_FragColorbgra8[x*4+3] = d[3];
3947 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3949 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3950 for (x = startx;x < endx;x++)
3953 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3954 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3955 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3956 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3957 if (attenuation < 0.01f)
3959 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3961 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3962 if (attenuation < 0.01f)
3966 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3967 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3968 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3969 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3970 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3972 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3973 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3974 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3975 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3977 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3978 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3979 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3980 DPSOFTRAST_Vector3Normalize(surfacenormal);
3982 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3983 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3984 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3985 DPSOFTRAST_Vector3Normalize(lightnormal);
3987 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3988 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3990 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3991 attenuation *= (1.0f / 255.0f);
3992 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3993 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3994 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3995 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3999 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4000 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4001 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4002 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4004 buffer_FragColorbgra8[x*4+0] = d[0];
4005 buffer_FragColorbgra8[x*4+1] = d[1];
4006 buffer_FragColorbgra8[x*4+2] = d[2];
4007 buffer_FragColorbgra8[x*4+3] = d[3];
4012 for (x = startx;x < endx;x++)
4015 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4016 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4017 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4018 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4019 if (attenuation < 0.01f)
4021 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4023 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4024 if (attenuation < 0.01f)
4028 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4029 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4030 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4031 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4032 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4034 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4035 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4036 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4037 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4039 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4041 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4042 attenuation *= (1.0f / 255.0f);
4043 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4044 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4045 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4046 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4050 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4051 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4052 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4053 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4055 buffer_FragColorbgra8[x*4+0] = d[0];
4056 buffer_FragColorbgra8[x*4+1] = d[1];
4057 buffer_FragColorbgra8[x*4+2] = d[2];
4058 buffer_FragColorbgra8[x*4+3] = d[3];
4061 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4067 void DPSOFTRAST_VertexShader_Refraction(void)
4069 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4072 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4075 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4076 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4077 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4078 memset(buffer_FragColorbgra8, 0, span->length*4);
4079 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4084 void DPSOFTRAST_VertexShader_Water(void)
4086 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4090 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4093 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4094 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4095 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4096 memset(buffer_FragColorbgra8, 0, span->length*4);
4097 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4102 void DPSOFTRAST_VertexShader_ShowDepth(void)
4104 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4107 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4110 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4111 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4112 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4113 memset(buffer_FragColorbgra8, 0, span->length*4);
4114 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4119 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4121 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4124 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4127 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4128 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4129 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4130 memset(buffer_FragColorbgra8, 0, span->length*4);
4131 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4136 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4138 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4141 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4144 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4145 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4147 memset(buffer_FragColorbgra8, 0, span->length*4);
4148 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex stage, its per-span pixel stage, and which
// vertex arrays and texture units the rasterizer has to set up for it.
// One entry per shader mode lives in DPSOFTRAST_ShaderModeTable.
4153 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage, called once per draw call from DPSOFTRAST_DrawTriangles
4156 void (*Vertex)(void);
// pixel stage, called once per span from DPSOFTRAST_Draw_ProcessSpans
4157 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// array indices this mode consumes; the table ends each list with ~0, which as an
// unsigned char compares >= DPSOFTRAST_ARRAY_TOTAL in the code that walks the list
4158 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture units this mode samples; walked the same way against DPSOFTRAST_MAXTEXTUREUNITS
4159 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4161 DPSOFTRAST_ShaderModeInfo;
4163 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4165 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4166 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4167 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4168 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4169 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4170 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4171 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4172 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4173 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4174 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4175 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4176 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4177 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4178 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4179 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4180 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Runs the depth test and the pixel stage for every span queued on this thread
// (thread->spans[0 .. numspans)), then empties the queue by setting numspans to 0.
// When depth testing is enabled and a depth buffer exists, pixelmask[] is filled from
// the depth comparison and the span is narrowed to [startx, endx) before the shader
// mode's Span callback runs; otherwise every pixel of the span is enabled.
// The Span callback is only invoked when there is a color buffer and a color mask.
4183 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4190 // unsigned int *colorpixel;
4191 unsigned int *depthpixel;
4197 DPSOFTRAST_State_Triangle *triangle;
4198 DPSOFTRAST_State_Span *span;
// one mask byte per pixel of the current span; handed to the pixel stage through span->pixelmask
4199 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4200 for (i = 0; i < thread->numspans; i++)
4202 span = &thread->spans[i];
4203 triangle = &thread->triangles[span->triangle];
4204 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = value at the origin)
// at the first pixel of the span
4206 wslope = triangle->w[0];
4207 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth and per-pixel depth step; the polygon offset terms bias the starting depth only
4208 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4209 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4210 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// each case fills pixelmask[x] by comparing the stored depth (left operand) with the
// interpolated depth d (right operand), stepping d by depthslope per pixel
4211 switch(thread->fb_depthfunc)
4214 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4215 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4216 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4217 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4218 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4219 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4220 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4222 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4223 //for (x = 0;x < span->length;x++)
4224 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4225 // if there is no color buffer, skip pixel shader
// narrow [startx, endx) so that it begins and ends on a pixel that passed the test;
// a span with no passing pixel is dropped
4227 endx = span->length;
4228 while (startx < endx && !pixelmask[startx])
4230 while (endx > startx && !pixelmask[endx-1])
4233 continue; // no pixels to fill
4234 span->pixelmask = pixelmask;
4235 span->startx = startx;
4237 // run pixel shader if appropriate
4238 // do this before running depthmask code, to allow the pixelshader
4239 // to clear pixelmask values for alpha testing
4240 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4241 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depthmask pass: revisit the narrowed range with d stepped exactly as in the test above
4242 if (thread->depthmask)
4243 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4249 // no depth testing means we're just dealing with color...
4250 // if there is no color buffer, skip pixel shader
4251 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// enable every pixel of the span
4253 memset(pixelmask, 1, span->length);
4254 span->pixelmask = pixelmask;
4256 span->endx = span->length;
4257 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4261 thread->numspans = 0;
// Declares the Draw command record through DEFCOMMAND. Its fields carry the row range
// (starty/endy), the per-thread reference count, the clipped flag, the vertex and
// triangle counts, and the pointers to the packed vertex arrays and element indices
// that DPSOFTRAST_DrawTriangles fills in and DPSOFTRAST_Interpret_Draw consumes.
// NOTE(review): the leading 22 is presumably the opcode number -- DEFCOMMAND is defined
// elsewhere in this file, confirm there.
4264 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on one worker thread.
// The thread only rasterizes the framebuffer rows [miny, maxy) that belong to it. For
// every triangle it fetches the three element indices, clips against the near plane
// (yielding three or four screen vertices), rejects triangles outside the thread's
// rectangle, computes the plane equations of w and of every array the current shader
// mode consumes, picks a mip level per texture unit, and then walks the edges row by
// row to queue spans. Spans and triangles are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever a queue fills up and once more at the end.
// Each thread drops one reference on the command when done; the thread that brings
// the count to zero frees separately allocated vertex data.
4266 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4269 int cullface = thread->cullface;
4270 int width = dpsoftrast.fb_width;
// rows [miny, maxy) are this thread's share of the framebuffer height
4271 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4272 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4273 __m128i fbmin, fbmax;
4274 __m128 viewportcenter, viewportscale;
4275 int firstvertex = command->firstvertex;
4276 int numvertices = command->numvertices;
4277 int numtriangles = command->numtriangles;
4278 const int *element3i = command->element3i;
4279 const unsigned short *element3s = command->element3s;
4280 int clipped = command->clipped;
4291 __m128 triangleedge1, triangleedge2, trianglenormal;
4294 DPSOFTRAST_State_Triangle *triangle;
4295 DPSOFTRAST_Texture *texture;
// the command's row range does not touch this thread's rows: only the reference is dropped
4296 if (command->starty >= maxy || command->endy <= miny)
4298 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare header has its vertex data in a separate
// allocation (see DPSOFTRAST_Draw_AllocateDrawCommand), which is released here
4300 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4301 MM_FREE(command->arrays);
4305 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp rectangle as packed 16-bit (x, y) pairs: x in [0, width-1], y in [miny, maxy-1]
4306 fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4307 fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4308 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4309 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4310 screen[3] = _mm_setzero_ps();
4311 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4312 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices projected screen coordinates; the float4
// arrays that follow start with the transformed positions
4314 const float *screencoord4f = command->arrays;
4315 const float *arrays = screencoord4f + numvertices*4;
4317 // generate the 3 edges of this triangle
4318 // generate spans for the triangle - switch based on left split or right split classification of triangle
// element indices come from the 16-bit or the 32-bit list and are rebased by firstvertex
4321 e[0] = element3s[i*3+0] - firstvertex;
4322 e[1] = element3s[i*3+1] - firstvertex;
4323 e[2] = element3s[i*3+2] - firstvertex;
4327 e[0] = element3i[i*3+0] - firstvertex;
4328 e[1] = element3i[i*3+1] - firstvertex;
4329 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros for the per-triangle setup below:
// SKIPBACKFACE builds the two screen-space edges that meet at screen[1], keeps the
// scalar cross-product term in the low lane of trianglenormal and tests its sign
// against zero.
// CLIPPEDVERTEXLERP(k,p1,p2) stores clipdist[p1]/(clipdist[p1]-clipdist[p2]) in
// clipfrac[p1], interpolates vertex p1 towards p2 by that fraction and projects the
// result into screen[k].
// CLIPPEDVERTEXCOPY(k,p1) loads the already projected coordinate of vertex p1 into screen[k].
4338 #define SKIPBACKFACE \
4339 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4340 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4341 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4342 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4343 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4347 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4351 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4356 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4357 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4359 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4360 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4362 #define CLIPPEDVERTEXCOPY(k,p1) \
4363 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY / GENATTRIBLERP do the same for whichever array `arrays` currently
// points at: copy the vertex value, or interpolate it with the clipfrac[] saved while
// clipping. GENATTRIBS selects the per-corner combination by case number
// (presumably the clip case chosen in the near-plane branches below -- confirm).
4365 #define GENATTRIBCOPY(attrib, p1) \
4366 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4367 #define GENATTRIBLERP(attrib, p1, p2) \
4369 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4370 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4372 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4376 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4377 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4378 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4379 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4380 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4381 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4382 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4388 // calculate distance from nearplane
// clipdist is z + w of the transformed position; a value >= 0 is treated as being in
// front of the near plane. Each branch below handles one inside/outside combination;
// the combinations with two vertices inside also produce a fourth vertex in screen[3].
4389 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4390 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4391 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4392 if (clipdist[0] >= 0.0f)
4394 if (clipdist[1] >= 0.0f)
4396 if (clipdist[2] >= 0.0f)
4399 // triangle is entirely in front of nearplane
4400 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4407 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4415 if (clipdist[2] >= 0.0f)
4417 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4424 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4431 else if (clipdist[1] >= 0.0f)
4433 if (clipdist[2] >= 0.0f)
4435 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4442 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4448 else if (clipdist[2] >= 0.0f)
4450 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4455 else continue; // triangle is entirely behind nearplane
4458 // calculate integer y coords for triangle points
// the truncated x,y of up to four vertices are packed into 16-bit lanes (the third
// vertex is repeated when there is no fourth), reduced to a min and a max pair and
// then clamped to this thread's rectangle
4459 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4460 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4461 screenmin = _mm_min_epi16(screeni, screenir),
4462 screenmax = _mm_max_epi16(screeni, screenir);
4463 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4464 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4465 screenmin = _mm_max_epi16(screenmin, fbmin);
4466 screenmax = _mm_min_epi16(screenmax, fbmax);
4467 // skip offscreen triangles
4468 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// rows to rasterize: [starty, endy); screeny holds each vertex's integer y in a 32-bit lane
4470 starty = _mm_extract_epi16(screenmin, 1);
4471 endy = _mm_extract_epi16(screenmax, 1)+1;
4472 screeny = _mm_srai_epi32(screeni, 16);
4475 triangle = &thread->triangles[thread->numtriangles];
4477 // calculate attribute plans for triangle data...
4478 // okay, this triangle is going to produce spans, we'd better project
4479 // the interpolants now (this is what gives perspective texturing),
4480 // this consists of simply multiplying all arrays by the W coord
4481 // (which is basically 1/Z), which will be undone per-pixel
4482 // (multiplying by Z again) to get the perspective-correct array
4485 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4486 __m128 mipedgescale, mipdensity;
// the edge vectors divided by the cross-product term give the factors that turn
// per-edge deltas into per-pixel x and y slopes
4487 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4488 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4489 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4490 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4491 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w0..w2 broadcast the w component of each screen vertex
4492 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4493 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4494 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4495 attribedge1 = _mm_sub_ss(w0, w1);
4496 attribedge2 = _mm_sub_ss(w2, w1);
4497 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4498 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4499 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4500 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4501 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane equation of w: x slope, y slope and the value at screen position (0, 0)
4502 _mm_store_ss(&triangle->w[0], attribxslope);
4503 _mm_store_ss(&triangle->w[1], attribyslope);
4504 _mm_store_ss(&triangle->w[2], attriborigin);
4505 mipedgescale = _mm_setzero_ps();
// one plane equation per array listed for the current shader mode; an entry
// >= DPSOFTRAST_ARRAY_TOTAL is the ~0 marker from DPSOFTRAST_ShaderModeTable
4506 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4508 __m128 attrib0, attrib1, attrib2;
4509 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4510 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next packed array (numvertices float4 values each)
4512 arrays += numvertices*4;
4513 GENATTRIBS(attrib0, attrib1, attrib2);
// every corner value is multiplied by that corner's w before the slopes are taken
4514 attriborigin = _mm_mul_ps(attrib1, w1);
4515 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4516 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4517 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4518 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4519 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4520 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4521 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4522 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array named by lodarrayindex, keep the attribute deltas along both edges
// scaled by the reciprocal screen-space edge lengths; used for mip selection below
4523 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4525 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4526 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4527 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4528 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit the shader mode lists; levels default to 0
4532 memset(triangle->mip, 0, sizeof(triangle->mip));
4533 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4535 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4536 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4538 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed level
4539 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale the edge deltas by two integers read from texture->mipmap[0][2..3]
// (NOTE(review): presumably the base level's width and height -- confirm),
// square and sum them per edge, and keep the smaller of the two edge densities
4541 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4542 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4543 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4544 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4545 // this will be multiplied in the texturing routine by the texture resolution
4546 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, clamped to the last available level
4549 y = (int)(log((float)y)*0.5f/M_LN2);
4550 if (y > texture->mipmaps - 1)
4551 y = texture->mipmaps - 1;
4552 triangle->mip[texunit] = y;
// walk the rows; y is advanced inside the loop body
4558 for (y = starty; y < endy;)
4560 __m128 xcoords, xslope;
// ycc has an all-ones 32-bit lane for every vertex whose integer y is below the
// current row; the byte movemask therefore contributes one nibble per vertex
4561 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4562 int yccmask = _mm_movemask_epi8(ycc);
// edge0p/edge0n and edge1p/edge1n index the start and end vertices of the two edges
// that bound the current rows
4563 int edge0p, edge0n, edge1p, edge1n;
// table for four vertices
4570 case 0xFFFF: /*0000*/ y = endy; continue;
4571 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4572 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4573 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4574 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4575 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4576 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4577 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4578 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4579 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4580 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4581 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4582 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4583 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4584 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4585 case 0x0000: /*1111*/ y++; continue;
// table for three vertices (the fourth packed lane repeats the third vertex)
4593 case 0xFFFF: /*000*/ y = endy; continue;
4594 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4595 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4596 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4597 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4598 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4599 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4600 case 0x0000: /*111*/ y++; continue;
// vertices already passed become 0x7FFF so that the min reduction returns the y of
// the nearest vertex still ahead; nexty is the last row that uses the current edge pair
4603 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4604 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4605 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4606 nexty = _mm_extract_epi16(ycc, 0);
4607 if(nexty >= endy) nexty = endy-1;
4608 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// per-row x step of both edges (dx/dy), and the x of both edges at the current row;
// lanes 0-1 belong to edge 0, lanes 2-3 to edge 1
4617 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4618 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4619 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4620 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
// half-pixel bias applied before the float to int conversion below
4621 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4622 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4624 int startx, endx, offset;
// startx comes from edge 0 (lane 0), endx from edge 1 (lane 2); both are clamped to the framebuffer width
4625 startx = _mm_cvtss_si32(xcoords);
4626 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4627 if (startx < 0) startx = 0;
4628 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4629 if (startx >= endx) continue;
// cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels and
// flush the span queue whenever it becomes full
4630 for (offset = startx; offset < endx;)
4632 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4633 span->triangle = thread->numtriangles;
4636 span->length = endx - offset;
4637 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4638 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4639 offset += span->length;
4640 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4641 DPSOFTRAST_Draw_ProcessSpans(thread);
// the triangle queue is full: process the pending spans and start over at slot 0
4646 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4648 DPSOFTRAST_Draw_ProcessSpans(thread);
4649 thread->numtriangles = 0;
// done with the command: drop this thread's reference; the thread that reaches zero
// frees vertex data that was allocated separately from the command
4653 if (!ATOMIC_DECREMENT(command->refcount))
4655 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4656 MM_FREE(command->arrays);
// flush whatever is still queued for this draw
4659 if (thread->numspans > 0 || thread->numtriangles > 0)
4661 DPSOFTRAST_Draw_ProcessSpans(thread);
4662 thread->numtriangles = 0;
// Reserves a Draw command together with the storage its vertex stage writes into.
// The data area is laid out as: numvertices screen coordinates, numvertices transformed
// positions, one numvertices*float4 array per array listed for the current shader mode,
// and finally a copy of the element indices (16-bit or 32-bit).
// If header plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, only the header goes
// into the command pool and the data lives in a separate zeroed MM_CALLOC block;
// otherwise the data directly follows the header inside the command.
// Side effects: dpsoftrast.firstvertex, numvertices, screencoord4f and post_array4f[]
// are pointed at the new storage.
// NOTE(review): no NULL check on the MM_CALLOC result is visible here.
4667 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4671 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 arrays are always present: screen coordinates and transformed positions
4672 int datasize = 2*numvertices*sizeof(float[4]);
4673 DPSOFTRAST_Command_Draw *command;
4674 unsigned char *data;
// plus one float4 array for every array the current shader mode lists
4675 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4677 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4678 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4680 datasize += numvertices*sizeof(float[4]);
// plus room for the element indices in whichever width was supplied
4683 datasize += numtriangles*sizeof(unsigned short[3]);
4685 datasize += numtriangles*sizeof(int[3]);
4686 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: header-only command with separately allocated, zeroed data
4687 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4689 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4690 data = (unsigned char *)MM_CALLOC(datasize, 1);
// small enough: the data follows the header inside the command itself
4694 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4695 data = (unsigned char *)command + commandsize;
4697 command->firstvertex = firstvertex;
4698 command->numvertices = numvertices;
4699 command->numtriangles = numtriangles;
4700 command->arrays = (float *)data;
// carve the data area up in the same order DPSOFTRAST_Interpret_Draw reads it back
4701 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4702 dpsoftrast.firstvertex = firstvertex;
4703 dpsoftrast.numvertices = numvertices;
4704 dpsoftrast.screencoord4f = (float *)data;
4705 data += numvertices*sizeof(float[4]);
4706 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4707 data += numvertices*sizeof(float[4]);
4708 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4710 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4711 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4713 dpsoftrast.post_array4f[j] = (float *)data;
4714 data += numvertices*sizeof(float[4]);
// the caller's indices are copied so the command owns them; the unused pointer stays NULL
4716 command->element3i = NULL;
4717 command->element3s = NULL;
4720 command->element3s = (unsigned short *)data;
4721 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4725 command->element3i = (int *)data;
4726 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues one indexed triangle batch.
// firstvertex/numvertices give the vertex range, numtriangles the triangle count, and
// element3i / element3s the 32-bit or 16-bit index list (three indices per triangle).
// The function allocates a Draw command, runs the current shader mode's vertex stage,
// stores the row range clamped to the framebuffer height and then either discards the
// command (empty row range) or publishes it with one reference per thread.
4731 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4733 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// the vertex stage runs before drawstarty/drawendy/drawclipped are read below
// (NOTE(review): presumably it is what updates them -- confirm in the transform code)
4734 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4735 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4736 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated vertex data and take the command back
4737 if (command->starty >= command->endy)
4739 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4740 MM_FREE(command->arrays);
4741 DPSOFTRAST_UndoCommand(command->commandsize);
4744 command->clipped = dpsoftrast.drawclipped;
// every thread drops one reference in DPSOFTRAST_Interpret_Draw
4745 command->refcount = dpsoftrast.numthreads;
4748 DPSOFTRAST_Draw_SyncCommands();
// signal only starving threads whose band of rows overlaps the command's row range
4752 for (i = 0; i < dpsoftrast.numthreads; i++)
4754 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4756 nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4757 if (command->starty < nexty && command->endy > y && thread->starving)
4758 SDL_CondSignal(thread->drawcond);
4762 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands on behalf of one thread. Starts at
// thread->commandoffset and keeps going until the offset equals endoffset,
// dispatching on each command's opcode.
//   thread    : per-thread state passed to every DPSOFTRAST_Interpret_* handler
//   endoffset : command pool offset at which interpretation stops
// NOTE(review): the embedded line numbers have gaps (brace lines and short
// statements are not shown); confirm details against the complete file.
4766 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4768 int commandoffset = thread->commandoffset;
4769 while (commandoffset != endoffset)
4771 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4772 switch (command->opcode)
// INTERPCOMMAND(name) generates the case for a fixed-size command: it calls
// DPSOFTRAST_Interpret_<name>, advances by the aligned size of
// DPSOFTRAST_Command_<name>, and wraps the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
4774 #define INTERPCOMMAND(name) \
4775 case DPSOFTRAST_OPCODE_##name : \
4776 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4777 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4778 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4779 commandoffset = 0; \
4781 INTERPCOMMAND(Viewport)
4782 INTERPCOMMAND(ClearColor)
4783 INTERPCOMMAND(ClearDepth)
4784 INTERPCOMMAND(ColorMask)
4785 INTERPCOMMAND(DepthTest)
4786 INTERPCOMMAND(ScissorTest)
4787 INTERPCOMMAND(Scissor)
4788 INTERPCOMMAND(BlendFunc)
4789 INTERPCOMMAND(BlendSubtract)
4790 INTERPCOMMAND(DepthMask)
4791 INTERPCOMMAND(DepthFunc)
4792 INTERPCOMMAND(DepthRange)
4793 INTERPCOMMAND(PolygonOffset)
4794 INTERPCOMMAND(CullFace)
4795 INTERPCOMMAND(AlphaTest)
4796 INTERPCOMMAND(AlphaFunc)
4797 INTERPCOMMAND(SetTexture)
4798 INTERPCOMMAND(SetShader)
4799 INTERPCOMMAND(Uniform4f)
4800 INTERPCOMMAND(UniformMatrix4f)
4801 INTERPCOMMAND(Uniform1i)
// Draw is handled by hand: it advances by the size recorded in the command
// itself (command->commandsize) rather than by a sizeof, and it writes the new
// offset back to thread->commandoffset right away instead of only at the end.
4803 case DPSOFTRAST_OPCODE_Draw:
4804 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4805 commandoffset += command->commandsize;
4806 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4808 thread->commandoffset = commandoffset;
// The statements handling the Reset opcode are not visible in this listing.
4811 case DPSOFTRAST_OPCODE_Reset:
// Record how far this thread has progressed.
4816 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to SDL_CreateThread by DPSOFTRAST_Init).
//   data : the DPSOFTRAST_State_Thread belonging to this worker
// Runs for as long as thread->index is non-negative: interprets any commands
// between its own offset and dpsoftrast.drawcommand, then sleeps on drawcond
// until signalled.
// NOTE(review): brace lines and the return statement are not shown in this
// listing; confirm details against the complete file.
4820 static int DPSOFTRAST_Draw_Thread(void *data)
4822 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4823 while(thread->index >= 0)
// Pending work: catch up to the current end of the command queue.
4825 if (thread->commandoffset != dpsoftrast.drawcommand)
4827 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Re-check under the mutex before sleeping, so new commands or a changed
// thread->index that arrived after the unlocked check above are seen first.
4831 SDL_LockMutex(thread->drawmutex);
4832 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Caught up: signal waitcond if a waiter has flagged itself, then mark this
// thread as starving while it is blocked in SDL_CondWait on drawcond.
4834 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4835 thread->starving = true;
4836 SDL_CondWait(thread->drawcond, thread->drawmutex);
4837 thread->starving = false;
4839 SDL_UnlockMutex(thread->drawmutex);
// Brings every thread up to dpsoftrast.drawcommand and then marks the command
// pool as empty (usedcommands = 0).
// NOTE(review): brace lines, short statements and preprocessor lines are not
// shown in this listing; in particular, which of the sections below belong to
// which build configuration cannot be seen here.
4846 static void DPSOFTRAST_Draw_FlushThreads(void)
4848 DPSOFTRAST_State_Thread *thread;
4850 DPSOFTRAST_Draw_SyncCommands();
// Pass 1: for every thread that is behind, signal drawcond if the thread is
// still behind and flagged as starving once the mutex is held.
4852 for (i = 0; i < dpsoftrast.numthreads; i++)
4854 thread = &dpsoftrast.threads[i];
4855 if (thread->commandoffset != dpsoftrast.drawcommand)
4857 SDL_LockMutex(thread->drawmutex);
4858 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4859 SDL_CondSignal(thread->drawcond);
4860 SDL_UnlockMutex(thread->drawmutex);
// Pass 2: for every thread that is still behind, flag this caller as waiting
// and block on waitcond (signalled by DPSOFTRAST_Draw_Thread once caught up).
// NOTE(review): the wait is guarded by `if`, not re-checked in a loop after
// waking; confirm that an early wakeup cannot leave a thread unfinished.
4864 for (i = 0; i < dpsoftrast.numthreads; i++)
4866 thread = &dpsoftrast.threads[i];
4868 if (thread->commandoffset != dpsoftrast.drawcommand)
4870 SDL_LockMutex(thread->drawmutex);
4871 if (thread->commandoffset != dpsoftrast.drawcommand)
4873 thread->waiting = true;
4874 SDL_CondWait(thread->waitcond, thread->drawmutex);
4875 thread->waiting = false;
4877 SDL_UnlockMutex(thread->drawmutex);
// Interprets the outstanding commands directly on the calling thread;
// presumably this is the path for builds without worker threads - TODO confirm.
4880 if (thread->commandoffset != dpsoftrast.drawcommand)
4881 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// All threads have consumed the queue; the pool's used-bytes counter is reset.
4884 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point that forwards to DPSOFTRAST_Draw_FlushThreads().
4887 void DPSOFTRAST_Flush(void)
4889 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments.
// NOTE(review): none of this function's body statements are visible in this
// listing; see the complete file for what it does.
4892 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and the per-thread state.
//   width, height : stored as fb_width / fb_height; also the default viewport and scissor size
//   numthreads    : requested thread count, clamped to 1..64 by bound()
//   colorpixels   : stored as fb_colorpixels[0]
//   depthpixels   : stored as fb_depthpixels
// The whole dpsoftrast struct is zeroed first, so every field not assigned
// below starts at zero.
// NOTE(review): brace lines, short statements, local declarations and
// preprocessor lines are not shown in this listing (the embedded line numbers
// have gaps); confirm details against the complete file.
4897 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4907 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// bigendian is byte 3 of u; the declaration and initialization of u are not visible here.
4908 dpsoftrast.bigendian = u.b[3];
4909 dpsoftrast.fb_width = width;
4910 dpsoftrast.fb_height = height;
4911 dpsoftrast.fb_depthpixels = depthpixels;
4912 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index [1] is assigned three times; this looks like a typo for
// [1], [2], [3]. The memset above already zeroed the struct, so it is likely
// harmless - confirm the array size before changing.
4913 dpsoftrast.fb_colorpixels[1] = NULL;
4914 dpsoftrast.fb_colorpixels[1] = NULL;
4915 dpsoftrast.fb_colorpixels[1] = NULL;
// Default viewport covers the whole framebuffer.
4916 dpsoftrast.viewport[0] = 0;
4917 dpsoftrast.viewport[1] = 0;
4918 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4919 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4920 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4921 dpsoftrast.texture_firstfree = 1;
4922 dpsoftrast.texture_end = 1;
4923 dpsoftrast.texture_max = 0;
4924 dpsoftrast.color[0] = 1;
4925 dpsoftrast.color[1] = 1;
4926 dpsoftrast.color[2] = 1;
4927 dpsoftrast.color[3] = 1;
// Presumably the two assignments below sit in alternative preprocessor
// branches (threaded: clamp to 1..64; otherwise: 1) - the directives are not visible here.
4929 dpsoftrast.numthreads = bound(1, numthreads, 64);
4931 dpsoftrast.numthreads = 1;
// NOTE(review): the allocation result is used by the loop below without a NULL check.
4933 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4934 for (i = 0; i < dpsoftrast.numthreads; i++)
4936 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4938 thread->cullface = GL_BACK;
// NOTE(review): only colormask[1..3] are set in the visible lines; colormask[0]
// keeps the zero from MM_CALLOC - confirm whether that is intended.
4939 thread->colormask[1] = 1;
4940 thread->colormask[2] = 1;
4941 thread->colormask[3] = 1;
4942 thread->blendfunc[0] = GL_ONE;
4943 thread->blendfunc[1] = GL_ZERO;
4944 thread->depthmask = true;
4945 thread->depthtest = true;
4946 thread->depthfunc = GL_LEQUAL;
4947 thread->scissortest = false;
4948 thread->alphatest = false;
4949 thread->alphafunc = GL_GREATER;
4950 thread->alphavalue = 0.5f;
// Per-thread viewport and scissor both default to the full framebuffer.
4951 thread->viewport[0] = 0;
4952 thread->viewport[1] = 0;
4953 thread->viewport[2] = dpsoftrast.fb_width;
4954 thread->viewport[3] = dpsoftrast.fb_height;
4955 thread->scissor[0] = 0;
4956 thread->scissor[1] = 0;
4957 thread->scissor[2] = dpsoftrast.fb_width;
4958 thread->scissor[3] = dpsoftrast.fb_height;
4959 thread->depthrange[0] = 0;
4960 thread->depthrange[1] = 1;
4961 thread->polygonoffset[0] = 0;
4962 thread->polygonoffset[1] = 0;
4964 thread->numspans = 0;
4965 thread->numtriangles = 0;
4966 thread->commandoffset = 0;
4967 thread->waiting = false;
4968 thread->starving = false;
// Synchronization objects used by DPSOFTRAST_Draw_Thread and DPSOFTRAST_Draw_FlushThreads.
4970 thread->waitcond = SDL_CreateCond();
4971 thread->drawcond = SDL_CreateCond();
4972 thread->drawmutex = SDL_CreateMutex();
// validate is set to -1 and DPSOFTRAST_Validate is called with the same mask;
// presumably this forces all derived per-thread state to be recomputed - TODO confirm.
4975 thread->validate = -1;
4976 DPSOFTRAST_Validate(thread, -1);
// Start the worker; it receives this thread's state as its data argument.
4978 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stops the worker threads, releases their synchronization objects, frees
// texture and thread storage, and zeroes the global dpsoftrast state.
// NOTE(review): brace lines, short statements and preprocessor lines are not
// shown in this listing (the embedded line numbers have gaps); confirm details
// against the complete file.
4983 void DPSOFTRAST_Shutdown(void)
4987 if(dpsoftrast.numthreads > 0)
4989 DPSOFTRAST_State_Thread *thread;
4990 for (i = 0; i < dpsoftrast.numthreads; i++)
4992 thread = &dpsoftrast.threads[i];
// Under the mutex, wake the worker via drawcond. One statement between the lock
// and the signal is not visible here; DPSOFTRAST_Draw_Thread only exits once
// thread->index is negative, so presumably index is set negative at that point - TODO confirm.
4993 SDL_LockMutex(thread->drawmutex);
4995 SDL_CondSignal(thread->drawcond);
4996 SDL_UnlockMutex(thread->drawmutex);
// Wait for the worker to exit before destroying the objects it uses.
4997 SDL_WaitThread(thread->thread, NULL);
4998 SDL_DestroyCond(thread->waitcond);
4999 SDL_DestroyCond(thread->drawcond);
5000 SDL_DestroyMutex(thread->drawmutex);
// Texture pixel storage is released with MM_FREE; the texture array itself with free().
// NOTE(review): this loop indexes dpsoftrast.texture before the NULL test that
// follows it. DPSOFTRAST_Init leaves texture_end at 1 after zeroing the state,
// so if the texture array was never allocated the first iteration would read
// through a NULL pointer - confirm, and consider guarding the loop.
5004 for (i = 0;i < dpsoftrast.texture_end;i++)
5005 if (dpsoftrast.texture[i].bytes)
5006 MM_FREE(dpsoftrast.texture[i].bytes);
5007 if (dpsoftrast.texture)
5008 free(dpsoftrast.texture);
5009 if (dpsoftrast.threads)
5010 MM_FREE(dpsoftrast.threads);
// Leave the global state zeroed.
5011 memset(&dpsoftrast, 0, sizeof(dpsoftrast));