3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
14 #include <SDL_thread.h>
18 typedef qboolean bool;
// Alignment, atomic-counter and memory-barrier helpers. Three alternative sets
// of definitions follow: GCC-style (__attribute__ / __sync builtins), MSVC
// (__declspec / Interlocked*), and a plain non-atomic fallback.
// NOTE(review): apart from the `#elif defined(_MSC_VER)` line, the conditionals
// that select between the sets are not visible in this excerpt.
22 #define ATOMIC_SIZE 32
// GCC-style variant: ALIGN requests 16-byte and ATOMIC 32-byte alignment.
26 #define ALIGN(var) var __attribute__((__aligned__(16)))
27 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// _mm_sfence() is a store fence only.
// NOTE(review): confirm that no load ordering is required where MEMORY_BARRIER is used.
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
// INCREMENT/DECREMENT evaluate to the updated value; ADD discards its result.
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC variant of the same helpers.
36 #elif defined(_MSC_VER)
37 #define ALIGN(var) __declspec(align(16)) var
38 #define ATOMIC(var) __declspec(align(32)) var
40 #define MEMORY_BARRIER (_mm_sfence())
42 #define ATOMIC_COUNTER volatile LONG
43 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
44 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// NOTE(review): unlike the other two variants this ATOMIC_ADD is not cast to void.
45 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallback variant: no alignment request, no barrier, plain (non-atomic) integer arithmetic.
54 #define ALIGN(var) var
55 #define ATOMIC(var) var
59 #define MEMORY_BARRIER ((void)0)
60 #define ATOMIC_COUNTER int
61 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #define ATOMIC_DECREMENT(counter) (--(counter))
63 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
67 #include <emmintrin.h>
69 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
71 static void *MM_CALLOC(size_t nmemb, size_t size)
73 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
74 if(ptr != NULL) memset(ptr, 0, nmemb*size);
78 #define MM_FREE _mm_free
// Fallback allocator helpers that map directly onto the C library.
// NOTE(review): unlike the _mm_malloc-based variant these make no explicit alignment request.
80 #define MM_MALLOC(size) malloc(size)
81 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays. DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays; it sizes attribs[] in the triangle state and
// post_array4f[] in the global state.
85 typedef enum DPSOFTRAST_ARRAY_e
87 DPSOFTRAST_ARRAY_POSITION,
88 DPSOFTRAST_ARRAY_COLOR,
89 DPSOFTRAST_ARRAY_TEXCOORD0,
90 DPSOFTRAST_ARRAY_TEXCOORD1,
91 DPSOFTRAST_ARRAY_TEXCOORD2,
92 DPSOFTRAST_ARRAY_TEXCOORD3,
93 DPSOFTRAST_ARRAY_TEXCOORD4,
94 DPSOFTRAST_ARRAY_TEXCOORD5,
95 DPSOFTRAST_ARRAY_TEXCOORD6,
96 DPSOFTRAST_ARRAY_TEXCOORD7,
97 DPSOFTRAST_ARRAY_TOTAL
// Per-texture record stored in the dpsoftrast.texture[] array.
// NOTE(review): several members of this struct are not visible in this excerpt.
101 typedef struct DPSOFTRAST_Texture_s
108 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding binds by queued draw commands — confirm
111 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks the slot as unused
112 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
113 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are aligned with the same ALIGN() helper as other data and
// their sizes are rounded up to COMMAND_SIZE bytes (see DPSOFTRAST_ALIGNCOMMAND).
117 #define COMMAND_SIZE ALIGN_SIZE
118 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command record: the opcode and the size of
// the record in bytes.
120 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
122 unsigned char opcode;
123 unsigned short commandsize;
// Opcode 0 is the record DPSOFTRAST_AllocateCommand writes when a new command
// does not fit in the space left before the end of the pool.
127 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and the record type DPSOFTRAST_Command_<name>,
// which begins with the same opcode/commandsize header as DPSOFTRAST_Command.
129 #define DEFCOMMAND(opcodeval, name, fields) \
130 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
131 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
133 unsigned char opcode; \
134 unsigned short commandsize; \
136 } DPSOFTRAST_Command_##name );
// Capacity in bytes of the command buffer.
138 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound on a single command record — confirm against its users.
139 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer shared between the thread that queues commands and the
// threads that interpret them; it is used as a ring (the allocator wraps the
// write offset back to the start when a record does not fit at the end).
141 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
147 DPSOFTRAST_State_Command_Pool);
// Per-triangle interpolation data.
149 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
151 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][row][component]: as consumed by DPSOFTRAST_CALCATTRIB, row 0
// is scaled by span->x, row 1 by span->y, and row 2 is the constant term.
153 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
155 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues. _mm_load_ps requires 16-byte aligned
// input, which is why attribs[] is declared with ALIGN().
// NOTE(review): `arrayindex`, `slope` and `data` are used unparenthesized.
157 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
158 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
159 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
160 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
161 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB. Copies the four components of
// attribs[arrayindex][0] into slope[0..3] and sets, for each component c,
//   data[c] = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// triangle and span are pointers; data and slope are arrays of four floats.
// Every argument is parenthesized so expressions such as `spans + i` can be
// passed. Arguments are evaluated several times and must be free of side effects.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
174 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a single triangle.
176 typedef ALIGN(struct DPSOFTRAST_State_Span_s
178 int triangle; // triangle this span was generated by
179 int x; // framebuffer x coord
180 int y; // framebuffer y coord
181 int length; // pixel count
182 int startx; // usable range (according to pixelmask)
183 int endx; // usable range (according to pixelmask)
184 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
186 DPSOFTRAST_State_Span);
// Sizes of the per-thread spans[] and triangles[] arrays.
188 #define DPSOFTRAST_DRAW_MAXSPANS 1024
189 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bit flags stored in a thread's `validate` field. A set bit means the
// corresponding derived state is stale and must be recomputed by
// DPSOFTRAST_Validate; DPSOFTRAST_VALIDATE_DRAW is the union of all three.
191 #define DPSOFTRAST_VALIDATE_FB 1
192 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
193 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
194 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc selects
// one from the blend factor pair; unrecognized pairs fall back to OPAQUE.
196 typedef enum DPSOFTRAST_BLENDMODE_e
198 DPSOFTRAST_BLENDMODE_OPAQUE,
199 DPSOFTRAST_BLENDMODE_ALPHA,
200 DPSOFTRAST_BLENDMODE_ADDALPHA,
201 DPSOFTRAST_BLENDMODE_ADD,
202 DPSOFTRAST_BLENDMODE_INVMOD,
203 DPSOFTRAST_BLENDMODE_MUL,
204 DPSOFTRAST_BLENDMODE_MUL2,
205 DPSOFTRAST_BLENDMODE_SUBALPHA,
206 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
// number of blend modes
207 DPSOFTRAST_BLENDMODE_TOTAL
209 DPSOFTRAST_BLENDMODE;
// State owned by one worker thread: its copy of the render state (written by
// the DPSOFTRAST_Interpret_* handlers), values derived from it, the thread's
// position in the command buffer, and its span/triangle scratch arrays.
// NOTE(review): many members are not visible in this excerpt.
211 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] alongnormal, [1] intoview (see DPSOFTRAST_Interpret_PolygonOffset)
232 float polygonoffset[2];
235 int shader_permutation;
237 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
239 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
240 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
242 // DPSOFTRAST_VALIDATE_ flags
245 // derived values (DPSOFTRAST_VALIDATE_FB)
// x, y, width, height of the scissor clipped to the framebuffer
247 int fb_clearscissor[4];
248 ALIGN(float fb_viewportcenter[4]);
249 ALIGN(float fb_viewportscale[4]);
251 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
254 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool that this thread has reached
257 ATOMIC(volatile int commandoffset);
// set by the queueing side while it blocks on waitcond for this thread
259 volatile bool waiting;
// NOTE(review): presumably set by the worker while it waits for new commands — confirm
260 volatile bool starving;
264 SDL_mutex *drawmutex;
269 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
270 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
272 DPSOFTRAST_State_Thread);
// Global rasterizer state as seen by the thread that issues API calls:
// render targets, vertex array pointers, the texture table, the worker
// threads and the command pool.
// NOTE(review): many members are not visible in this excerpt.
274 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets
278 unsigned int *fb_depthpixels;
279 unsigned int *fb_colorpixels[4];
// viewport transform, recomputed by DPSOFTRAST_Viewport
282 ALIGN(float fb_viewportcenter[4]);
283 ALIGN(float fb_viewportscale[4]);
286 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
287 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
289 const float *pointer_vertex3f;
290 const float *pointer_color4f;
291 const unsigned char *pointer_color4ub;
292 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
295 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
296 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into texture[]; rebased by DPSOFTRAST_Texture_Grow when the table moves
297 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
301 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
302 float *screencoord4f;
308 int shader_permutation;
// index at which DPSOFTRAST_Texture_New starts searching for an unused slot
312 int texture_firstfree;
// texture table, grown with realloc by DPSOFTRAST_Texture_Grow
313 DPSOFTRAST_Texture *texture;
// static message describing the most recent error reported by an API call
318 const char *errorstring;
321 DPSOFTRAST_State_Thread *threads;
// command pool offset published to the workers by DPSOFTRAST_Draw_SyncCommands
323 ATOMIC(volatile int drawcommand);
325 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
329 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to the integer depth
// buffer format (1024 * 1048576 == 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
// NOTE(review): not referenced in the visible part of the file.
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1, rounded to nearest) into one
// 32-bit pixel laid out as 0xAARRGGBB. Arguments are parenthesized so that
// expressions can be passed, and the shifts are done on unsigned values so
// that moving alpha into the top byte is not a signed overflow; the result is
// converted back to int as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the integer depth format: depth 1 maps to 0 and
// depth 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
337 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
339 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
340 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
341 fb_viewportcenter[3] = 0.5f;
342 fb_viewportcenter[0] = 0.0f;
343 fb_viewportscale[1] = 0.5f * viewport[2];
344 fb_viewportscale[2] = -0.5f * viewport[3];
345 fb_viewportscale[3] = 0.5f;
346 fb_viewportscale[0] = 1.0f;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB for one thread:
// fb_clearscissor (x, y, width, height) and the viewport center/scale.
// NOTE(review): some lines of this body (declarations and, between the visible
// upper-bound clamps, presumably lower-bound clamps) are not visible here.
349 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
351 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
352 // and viewport projection values
// scissor rectangle as x1..x2 / y1..y2, with y flipped against the framebuffer height
355 x1 = thread->scissor[0];
356 x2 = thread->scissor[0] + thread->scissor[2];
357 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
358 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
359 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer
361 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
363 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
364 thread->fb_clearscissor[0] = x1;
365 thread->fb_clearscissor[1] = y1;
366 thread->fb_clearscissor[2] = x2 - x1;
367 thread->fb_clearscissor[3] = y2 - y1;
369 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
372 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
374 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Derives thread->fb_blendmode from blendsubtract and the blend factor pair.
// The two factors are packed as (sfactor<<16)|dfactor and matched against the
// supported combinations; any other pair selects DPSOFTRAST_BLENDMODE_OPAQUE.
// NOTE(review): the BLENDFUNC helper macro is defined inside the function body
// and no matching #undef is visible in this excerpt.
// NOTE(review): SUBALPHA is keyed on (GL_SRC_COLOR, GL_ONE) rather than
// (GL_SRC_ALPHA, GL_ONE), and the same pair is listed in both the subtract and
// the non-subtract switch — confirm this is intended.
377 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
379 if (thread->blendsubtract)
381 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
383 #define BLENDFUNC(sfactor, dfactor, blendmode) \
384 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
385 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
386 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second switch: factor pairs recognized outside the blendsubtract case
391 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
393 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
394 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
395 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
396 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
397 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
398 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
399 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
400 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
401 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
402 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
403 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is still pending in thread->validate; the expression always evaluates to 0.
// `thread` is parenthesized so pointer expressions can be passed; it is
// evaluated twice and must be free of side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state selected by `mask` for one thread. Only flags
// that are both requested and still pending in thread->validate are handled;
// each handled flag is cleared before its Recalc function runs.
410 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// restrict the request to what is actually stale
412 mask &= thread->validate;
415 if (mask & DPSOFTRAST_VALIDATE_FB)
417 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
418 DPSOFTRAST_RecalcFB(thread);
420 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
422 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
423 DPSOFTRAST_RecalcDepthFunc(thread);
425 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
427 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
428 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by handle. The handle is accepted only when
// 1 <= index < dpsoftrast.texture_end and the slot has pixel storage.
// NOTE(review): the return for a rejected handle is not visible in this
// excerpt; every caller tests the result with `if (!texture)`.
432 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
434 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
435 return &dpsoftrast.texture[index];
// Enlarges the texture table and rebases every texbound[] pointer (global
// state and each worker thread) from the old table address to the new one.
// NOTE(review): the realloc result is assigned straight to
// dpsoftrast.texture and never checked; on failure the old table would be
// lost and the rebasing below would start from NULL.
// NOTE(review): some lines of this body are not visible in this excerpt.
439 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound-texture pointers can be translated after the move
441 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
442 DPSOFTRAST_State_Thread *thread;
446 // expand texture array as needed
447 if (dpsoftrast.texture_max < 1024)
448 dpsoftrast.texture_max = 1024;
450 dpsoftrast.texture_max *= 2;
451 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the bindings held in the global state
452 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
453 if(dpsoftrast.texbound[i])
454 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the bindings held by each worker thread
455 for (j = 0; j < dpsoftrast.numthreads; j++)
457 thread = &dpsoftrast.threads[j];
458 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
459 if(thread->texbound[i])
460 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture and allocates zero-filled storage for all its mip levels.
// flags carries the DPSOFTRAST_TEXTURE_FLAG_* bits and the texture format;
// width, height and depth are in texels and must each be a power of two no
// larger than DPSOFTRAST_TEXTURE_MAXSIZE. When validation fails,
// dpsoftrast.errorstring is set to a message describing the problem.
// NOTE(review): many short lines (declarations, braces, returns, the mip loop
// header and its size bookkeeping) are not visible in this excerpt, so the
// return values cannot be confirmed from here.
464 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cube maps store six faces per mip level
473 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
474 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
475 DPSOFTRAST_Texture *texture;
// NOTE(review): for non-positive inputs the product can be positive (for
// example two negative dimensions); such values are not rejected by this test.
476 if (width*height*depth < 1)
478 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
481 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
483 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
488 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
489 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
490 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
492 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
493 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
495 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second message is not visible here
500 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
503 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
505 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures and cube maps
510 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
512 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
515 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
517 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this check repeats the previous one verbatim and looks redundant
520 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
522 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
525 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
527 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is zero only for powers of two (and for zero)
530 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
532 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
535 // find first empty slot in texture array
536 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
537 if (!dpsoftrast.texture[texnum].bytes)
539 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity
540 if (dpsoftrast.texture_max <= texnum)
541 DPSOFTRAST_Texture_Grow();
542 if (dpsoftrast.texture_end <= texnum)
543 dpsoftrast.texture_end = texnum + 1;
544 texture = &dpsoftrast.texture[texnum];
545 memset(texture, 0, sizeof(*texture));
546 texture->flags = flags;
547 texture->width = width;
548 texture->height = height;
549 texture->depth = depth;
550 texture->sides = sides;
// per mip level: byte size (4 bytes per texel, all sides), then the level's
// offset, size and dimensions are recorded in mipmap[]
562 s = w * h * d * sides * 4;
563 texture->mipmap[mipmaps][0] = size;
564 texture->mipmap[mipmaps][1] = s;
565 texture->mipmap[mipmaps][2] = w;
566 texture->mipmap[mipmaps][3] = h;
567 texture->mipmap[mipmaps][4] = d;
// the chain ends at 1x1x1, or after the first level when mipmaps are not requested
570 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
576 texture->mipmaps = mipmaps;
577 texture->size = size;
579 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible in this excerpt
580 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index`, clears its slot, and updates
// the free-slot hint and the end-of-used-range marker. Does nothing when the
// handle does not name a live texture.
// NOTE(review): a few lines between the lookup and MM_FREE are not visible here.
584 void DPSOFTRAST_Texture_Free(int index)
586 DPSOFTRAST_Texture *texture;
587 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
591 MM_FREE(texture->bytes);
// redundant with the memset below, which zeroes the whole record
592 texture->bytes = NULL;
593 memset(texture, 0, sizeof(*texture));
594 // adjust the free range and used range
595 if (dpsoftrast.texture_firstfree > index)
596 dpsoftrast.texture_firstfree = index;
// trim trailing unused slots from the used range
597 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
598 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index` by averaging the
// corresponding texels of the next larger level (4 bytes per texel). Does
// nothing for an invalid handle.
// NOTE(review): the lines computing layer0/layer1/row0/row1 and the branch
// that chooses between the 3D and 2D paths are not visible in this excerpt.
600 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
602 int i, x, y, z, w, layer0, layer1, row0, row1;
603 unsigned char *o, *i0, *i1, *i2, *i3;
604 DPSOFTRAST_Texture *texture;
605 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
606 if (texture->mipmaps <= 1)
608 for (i = 1;i < texture->mipmaps;i++)
610 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's depth
614 if (layer1 >= texture->mipmap[i-1][4])
615 layer1 = texture->mipmap[i-1][4]-1;
616 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's height
620 if (row1 >= texture->mipmap[i-1][3])
621 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i; i0..i3: the four parent rows
// (layer0/layer1 x row0/row1) in level i-1
622 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
623 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
624 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
625 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
626 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
627 w = texture->mipmap[i][2];
630 if (texture->mipmap[i-1][2] > 1)
632 // average 3D texture
// eight parent texels per output texel, rounded
633 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
635 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
636 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
637 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
638 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
643 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop while i2 and i3 are read
// too; harmless if w is 1 here, as the comment above implies — confirm
644 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
646 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
647 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
648 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
649 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
655 if (texture->mipmap[i-1][2] > 1)
657 // average 2D texture (common case)
// four parent texels per output texel, rounded
658 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
660 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
661 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
662 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
663 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
668 // 2D texture with parent width == 1
// a single output texel: the average of the two parent rows
669 o[0] = (i0[0] + i1[0] + 1) >> 1;
670 o[1] = (i0[1] + i1[1] + 1) >> 1;
671 o[2] = (i0[2] + i1[2] + 1) >> 1;
672 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mip chain. Does nothing for an invalid handle.
// NOTE(review): the visible code always addresses mip level 0 (mipmap[0]);
// the `mip` parameter is not used in any visible line.
// NOTE(review): no bounds check of the block against the texture size is
// visible; some lines of this body are not visible in this excerpt.
679 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
681 DPSOFTRAST_Texture *texture;
683 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel; the row pitch is the level-0 width
686 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
687 while (blockheight > 0)
689 memcpy(dst, pixels, blockwidth * 4);
690 pixels += blockwidth * 4;
691 dst += texture->mipmap[0][2] * 4;
694 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with `pixels` (mipmap[0][1] bytes are
// copied) and rebuilds the mip chain. Does nothing for an invalid handle.
// NOTE(review): two lines between the lookup and the memcpy are not visible here.
696 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
698 DPSOFTRAST_Texture *texture;
699 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
702 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
703 DPSOFTRAST_Texture_CalculateMipmaps(index);
705 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
707 DPSOFTRAST_Texture *texture;
708 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
709 return texture->mipmap[mip][2];
711 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
713 DPSOFTRAST_Texture *texture;
714 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
715 return texture->mipmap[mip][3];
717 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
719 DPSOFTRAST_Texture *texture;
720 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
721 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 for an invalid handle.
// NOTE(review): `mip` is not range-checked in the visible lines, and two lines
// between the lookup and the return are not visible in this excerpt.
723 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
725 DPSOFTRAST_Texture *texture;
726 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
729 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Filters above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created without the MIPMAP
// flag set dpsoftrast.errorstring. Does nothing for an invalid handle.
// NOTE(review): the lines following the error message are not visible here, so
// it cannot be confirmed from this excerpt whether the filter is left unchanged.
731 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
733 DPSOFTRAST_Texture *texture;
734 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
735 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
737 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
742 texture->filter = filter;
// Installs the framebuffer dimensions, the depth buffer and up to four color
// buffers as the current render targets. The buffers are supplied by the caller.
// NOTE(review): the statement controlled by the change test below is not
// visible in this excerpt, so what happens when the targets differ from the
// current ones cannot be confirmed from here.
745 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when any dimension or buffer pointer differs from the current one
747 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
748 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
749 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
751 dpsoftrast.fb_width = width;
752 dpsoftrast.fb_height = height;
753 dpsoftrast.fb_depthpixels = depthpixels;
754 dpsoftrast.fb_colorpixels[0] = colorpixels0;
755 dpsoftrast.fb_colorpixels[1] = colorpixels1;
756 dpsoftrast.fb_colorpixels[2] = colorpixels2;
757 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// forward declaration; used by DPSOFTRAST_Draw_FreeCommandPool below
760 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current write offset of the command pool as
// dpsoftrast.drawcommand, the offset the worker threads compare their own
// commandoffset against.
// NOTE(review): one line before the assignment is not visible in this excerpt.
762 static void DPSOFTRAST_Draw_SyncCommands(void)
765 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes available in the command pool by
// recomputing how much of it the worker threads still have to consume and,
// where necessary, waiting for a thread to catch up.
// NOTE(review): several lines (declarations, loop/branch structure, early
// returns, the assignment of waitindex) are not visible in this excerpt; the
// comments below describe only the visible statements.
768 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
771 DPSOFTRAST_State_Thread *thread;
773 int freecommand = dpsoftrast.commandpool.freecommand;
774 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already?
775 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
777 DPSOFTRAST_Draw_SyncCommands();
783 for (i = 0; i < dpsoftrast.numthreads; i++)
785 thread = &dpsoftrast.threads[i];
// distance from this thread's read position to the write position,
// wrapped into the range of the pool
786 commandoffset = freecommand - thread->commandoffset;
787 if (commandoffset < 0)
788 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
789 if (commandoffset > usedcommands)
792 usedcommands = commandoffset;
795 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// block until the selected thread signals waitcond, unless it has already
// reached the published command offset
797 thread = &dpsoftrast.threads[waitindex];
798 SDL_LockMutex(thread->drawmutex);
799 if (thread->commandoffset != dpsoftrast.drawcommand)
801 thread->waiting = true;
// wake the worker first if it is flagged as starving
802 if (thread->starving) SDL_CondSignal(thread->drawcond);
803 SDL_CondWait(thread->waitcond, thread->drawmutex);
804 thread->waiting = false;
806 SDL_UnlockMutex(thread->drawmutex);
808 dpsoftrast.commandpool.usedcommands = usedcommands;
810 DPSOFTRAST_Draw_FlushThreads();
// Rounds `size` up to the next multiple of COMMAND_SIZE (the bit masking
// assumes COMMAND_SIZE is a power of two).
814 #define DPSOFTRAST_ALIGNCOMMAND(size) \
815 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a record of type DPSOFTRAST_Command_<name> in the command pool,
// with its opcode and (aligned) size filled in.
816 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
817 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and writes the command header
// (opcode, commandsize) at the start of the reservation. When the record does
// not fit before the end of the pool, a Reset record is written at the
// current position and the skipped tail is counted as used.
// NOTE(review): the lines that advance and wrap freecommand and the final
// return are not visible in this excerpt.
819 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
821 DPSOFTRAST_Command *command;
822 int freecommand = dpsoftrast.commandpool.freecommand;
823 int usedcommands = dpsoftrast.commandpool.usedcommands;
// space needed beyond the record itself: one header, plus the unusable tail
// of the pool when the record has to wrap
824 int extra = sizeof(DPSOFTRAST_Command);
825 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
826 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: ask for some, then re-read the pool counters
827 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
829 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
830 freecommand = dpsoftrast.commandpool.freecommand;
831 usedcommands = dpsoftrast.commandpool.usedcommands;
// record does not fit in the tail: mark the tail with a Reset record
833 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
835 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
836 command->opcode = DPSOFTRAST_OPCODE_Reset;
837 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
840 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
841 command->opcode = opcode;
842 command->commandsize = size;
844 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
846 dpsoftrast.commandpool.freecommand = freecommand;
847 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated command of `size` bytes by reducing
// the pool's used-byte count and storing the adjusted counters.
// NOTE(review): the line that adjusts freecommand is not visible in this excerpt.
851 static void DPSOFTRAST_UndoCommand(int size)
853 int freecommand = dpsoftrast.commandpool.freecommand;
854 int usedcommands = dpsoftrast.commandpool.usedcommands;
856 usedcommands -= size;
857 dpsoftrast.commandpool.freecommand = freecommand;
858 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
861 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker-side handler: stores the rectangle in the thread state and marks the
// framebuffer-derived values stale.
862 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
864 thread->viewport[0] = command->x;
865 thread->viewport[1] = command->y;
866 thread->viewport[2] = command->width;
867 thread->viewport[3] = command->height;
868 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command for the worker threads and also mirrors the
// rectangle into the global state, recomputing the global viewport transform
// immediately.
// NOTE(review): the assignments of command->x / command->y are not visible in this excerpt.
870 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
872 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
875 command->width = width;
876 command->height = height;
878 dpsoftrast.viewport[0] = x;
879 dpsoftrast.viewport[1] = y;
880 dpsoftrast.viewport[2] = width;
881 dpsoftrast.viewport[3] = height;
882 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to (r, g, b, a).
885 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker-side handler: fills the clear-scissor rectangle of every attached
// color buffer with the packed BGRA8 clear color.
// NOTE(review): some lines (declarations, how t1/t2 restrict y1/y2, the
// innermost store) are not visible in this excerpt.
886 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
888 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
// make sure fb_clearscissor is current
891 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
892 x1 = thread->fb_clearscissor[0];
893 y1 = thread->fb_clearscissor[1];
894 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
895 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// band of framebuffer rows assigned to this thread: rows are split evenly by thread index
896 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
897 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
904 // FIXME: honor fb_colormask?
905 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// visit each of the four possible color buffers; the test below detects unattached ones
906 for (i = 0;i < 4;i++)
908 if (!dpsoftrast.fb_colorpixels[i])
910 for (y = y1;y < y2;y++)
912 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
913 for (x = x1;x < x2;x++)
// Queues a ClearColor command for the worker threads.
// NOTE(review): the lines that fill in the command fields are not visible in this excerpt.
918 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
920 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to `depth`.
927 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker-side handler: fills the clear-scissor rectangle of the depth buffer
// with the integer encoding of command->depth.
// NOTE(review): some lines (declarations, how t1/t2 restrict y1/y2, the
// innermost store) are not visible in this excerpt.
928 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
930 int x1, y1, x2, y2, w, h, x, y, t1, t2;
// make sure fb_clearscissor is current
933 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
934 x1 = thread->fb_clearscissor[0];
935 y1 = thread->fb_clearscissor[1];
936 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
937 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// band of framebuffer rows assigned to this thread: rows are split evenly by thread index
938 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
939 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
946 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
947 for (y = y1;y < y2;y++)
949 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
950 for (x = x1;x < x2;x++)
// Queues a ClearDepth command for the worker threads.
// NOTE(review): the line that stores `d` in the command is not visible in this excerpt.
954 void DPSOFTRAST_ClearDepth(float d)
956 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable or disable writes per color channel.
960 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker-side handler: normalizes each flag to 0/1 and builds fb_colormask, a
// bit mask in 0xAARRGGBB layout with 0xFF in every enabled channel.
961 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
963 thread->colormask[0] = command->r != 0;
964 thread->colormask[1] = command->g != 0;
965 thread->colormask[2] = command->b != 0;
966 thread->colormask[3] = command->a != 0;
// negating 0/1 yields 0 or all-ones, which is then cut down to the channel's byte
967 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command for the worker threads.
// NOTE(review): the lines that fill in the command fields are not visible in this excerpt.
969 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
971 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable or disable depth testing.
978 DEFCOMMAND(5, DepthTest, int enable;)
// Worker-side handler: stores the flag and marks the derived depth function
// stale (fb_depthfunc depends on depthtest).
979 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
981 thread->depthtest = command->enable;
982 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying the enable flag.
984 void DPSOFTRAST_DepthTest(int enable)
986 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
987 command->enable = enable;
// Command 6: enable or disable the scissor test.
990 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker-side handler: stores the flag and marks the framebuffer-derived
// values stale (fb_clearscissor depends on scissortest).
991 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
993 thread->scissortest = command->enable;
994 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying the enable flag.
996 void DPSOFTRAST_ScissorTest(int enable)
998 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
999 command->enable = enable;
// Command 7: set the scissor rectangle (carried as floats).
1002 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Worker-side handler: stores the rectangle in the thread state and marks the
// framebuffer-derived values stale.
// NOTE(review): the declaration of thread->scissor is not visible here; if it
// is an int array these assignments truncate the float values — confirm.
1003 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1005 thread->scissor[0] = command->x;
1006 thread->scissor[1] = command->y;
1007 thread->scissor[2] = command->width;
1008 thread->scissor[3] = command->height;
1009 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command for the worker threads.
// NOTE(review): the assignments of command->x / command->y are not visible in this excerpt.
1011 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1013 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1016 command->width = width;
1017 command->height = height;
// Command 8: set the source and destination blend factors.
1020 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Worker-side handler: stores both factors and marks the derived blend mode stale.
1021 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1023 thread->blendfunc[0] = command->sfactor;
1024 thread->blendfunc[1] = command->dfactor;
1025 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command carrying both blend factors.
1027 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1029 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1030 command->sfactor = sfactor;
1031 command->dfactor = dfactor;
// Command 9: enable or disable subtractive blending.
1034 DEFCOMMAND(9, BlendSubtract, int enable;)
// Worker-side handler: stores the flag and marks the derived blend mode stale.
1035 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1037 thread->blendsubtract = command->enable;
1038 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying the enable flag.
1040 void DPSOFTRAST_BlendSubtract(int enable)
1042 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1043 command->enable = enable;
// Command 10: set the depth mask flag.
1046 DEFCOMMAND(10, DepthMask, int enable;)
// Worker-side handler: stores the flag in the thread state.
1047 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1049 thread->depthmask = command->enable;
// Queues a DepthMask command carrying the enable flag.
1051 void DPSOFTRAST_DepthMask(int enable)
1053 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1054 command->enable = enable;
1057 DEFCOMMAND(11, DepthFunc, int func;)
1058 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1060 thread->depthfunc = command->func;
// Queues a DepthFunc command carrying the comparison function.
1062 void DPSOFTRAST_DepthFunc(int func)
1064 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1065 command->func = func;
// Command 12: set the depth range.
1068 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Worker-side handler: stores near in depthrange[0] and far in depthrange[1].
1069 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1071 thread->depthrange[0] = command->nearval;
1072 thread->depthrange[1] = command->farval;
// Queues a DepthRange command carrying the near and far values.
1074 void DPSOFTRAST_DepthRange(float nearval, float farval)
1076 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1077 command->nearval = nearval;
1078 command->farval = farval;
// Command 13: set the polygon offset parameters.
1081 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Worker-side handler: stores alongnormal in polygonoffset[0] and intoview in polygonoffset[1].
1082 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1084 thread->polygonoffset[0] = command->alongnormal;
1085 thread->polygonoffset[1] = command->intoview;
1087 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1089 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1090 command->alongnormal = alongnormal;
1091 command->intoview = intoview;
// Command 14, CullFace: payload is an int culling mode.
1094 DEFCOMMAND(14, CullFace, int mode;)
// Per-thread handler: copies the mode into thread->cullface.
1095 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1097 thread->cullface = command->mode;
// API entry point: allocates a CullFace command and records `mode` in it.
1099 void DPSOFTRAST_CullFace(int mode)
1101 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1102 command->mode = mode;
// Command 15, AlphaTest: payload is a single int `enable` flag.
1105 DEFCOMMAND(15, AlphaTest, int enable;)
// Per-thread handler: copies the flag into thread->alphatest.
1106 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1108 thread->alphatest = command->enable;
// API entry point: allocates an AlphaTest command and records `enable` in it.
1110 void DPSOFTRAST_AlphaTest(int enable)
1112 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1113 command->enable = enable;
// Command 16, AlphaFunc: payload is an int comparison-function id and a float reference value.
1116 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Per-thread handler: copies func into thread->alphafunc and ref into thread->alphavalue.
1117 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1119 thread->alphafunc = command->func;
1120 thread->alphavalue = command->ref;
// API entry point: allocates an AlphaFunc command and records `func` in it.
// NOTE(review): the handler above reads command->ref, so `ref` must also be
// stored into the command here - confirm that assignment is present.
1122 void DPSOFTRAST_AlphaFunc(int func, float ref)
1124 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1125 command->func = func;
// Sets the current constant color dpsoftrast.color[0..3] to (r, g, b, a).
// The global state is written directly; no command is allocated here.
// DPSOFTRAST_Array_Load uses this color to fill the color array when neither
// a float nor a byte color pointer is set.
1129 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1131 dpsoftrast.color[0] = r;
1132 dpsoftrast.color[1] = g;
1133 dpsoftrast.color[2] = b;
1134 dpsoftrast.color[3] = a;
// Reads a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0])
// into outpixels.
//   blockx, blocky, blockwidth, blockheight: requested rectangle in pixels.
//   outpixels: destination buffer, addressed with a row stride of blockwidth*4 bytes.
// The rectangle is clamped to [0, fb_width) x [0, fb_height) before copying.
// NOTE(review): the per-pixel copy inside both loops is not visible here; the
// bigendian branch presumably reorders the bytes of each pixel - confirm.
1137 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides of one output row and one framebuffer row (4 bytes per pixel)
1139 int outstride = blockwidth * 4;
1140 int instride = dpsoftrast.fb_width * 4;
1143 int bx2 = blockx + blockwidth;
1144 int by2 = blocky + blockheight;
1149 unsigned char *inpixels;
// clamp the rectangle to the framebuffer
1153 if (bx1 < 0) bx1 = 0;
1154 if (by1 < 0) by1 = 0;
1155 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1156 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1159 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1160 if (dpsoftrast.bigendian)
1162 for (y = by1;y < by2;y++)
// source row is (fb_height - 1 - y): the framebuffer is read vertically
// flipped relative to the output rows
1164 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1165 o = (unsigned char *)outpixels + (y - by1) * outstride;
1166 for (x = bx1;x < bx2;x++)
1179 for (y = by1;y < by2;y++)
// same row addressing as the bigendian branch above
1181 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle of the color framebuffer
// (dpsoftrast.fb_colorpixels[0]) starting at (sx, sy) into mip level `mip` of
// texture `index`, starting at (tx, ty).
// Returns without copying when the texture lookup fails or `mip` is outside
// [0, texture->mipmaps).
// Both rectangles are clamped to their surface; rows are copied with memcpy at
// 4 bytes per pixel. When the texture has more than one mipmap,
// DPSOFTRAST_Texture_CalculateMipmaps(index) is called after the copy.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1 and the computation of
// tw/th/sw/sh are not visible here; presumably they are the clamped rectangle
// origins and sizes - confirm.
1188 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1192 int tx2 = tx + width;
1193 int ty2 = ty + height;
1196 int sx2 = sx + width;
1197 int sy2 = sy + height;
1207 unsigned int *spixels;
1208 unsigned int *tpixels;
1209 DPSOFTRAST_Texture *texture;
1210 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1211 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1214 spixels = dpsoftrast.fb_colorpixels[0];
1215 swidth = dpsoftrast.fb_width;
1216 sheight = dpsoftrast.fb_height;
// target surface: mipmap[mip][0] is the byte offset of this level inside
// texture->bytes, [2] and [3] are its width and height
1217 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1218 twidth = texture->mipmap[mip][2];
1219 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1220 if (tx1 < 0) tx1 = 0;
1221 if (ty1 < 0) ty1 = 0;
1222 if (tx2 > twidth) tx2 = twidth;
1223 if (ty2 > theight) ty2 = theight;
1224 if (sx1 < 0) sx1 = 0;
1225 if (sy1 < 0) sy1 = 0;
1226 if (sx2 > swidth) sx2 = swidth;
1227 if (sy2 > sheight) sy2 = sheight;
// the copied area is limited to the smaller of the two clamped rectangles
1232 if (tw > sw) tw = sw;
1233 if (th > sh) th = sh;
// nothing to copy when the area is empty
1234 if (tw < 1 || th < 1)
1236 for (y = 0;y < th;y++)
1237 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1238 if (texture->mipmaps > 1)
1239 DPSOFTRAST_Texture_CalculateMipmaps(index);
1242 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1243 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1245 if (thread->texbound[command->unitnum])
1246 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1247 thread->texbound[command->unitnum] = command->texture;
1249 void DPSOFTRAST_SetTexture(int unitnum, int index)
1251 DPSOFTRAST_Command_SetTexture *command;
1252 DPSOFTRAST_Texture *texture;
1253 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1255 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1258 texture = DPSOFTRAST_Texture_GetByIndex(index);
1259 if (index && !texture)
1261 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1265 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1266 command->unitnum = unitnum;
1267 command->texture = texture;
1269 dpsoftrast.texbound[unitnum] = texture;
1270 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array used by subsequent draws.
//   vertex3f: pointer to the first position.
//   stride:   distance between consecutive positions; DPSOFTRAST_Array_Load
//             applies it as a byte offset (firstvertex * stride on a byte pointer).
// Only the pointer is stored; the vertex data itself is not copied here.
1273 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1275 dpsoftrast.pointer_vertex3f = vertex3f;
1276 dpsoftrast.stride_vertex = stride;
// Records a float color array (4 floats per vertex) for subsequent draws and
// clears the byte color pointer, so only one of the two color sources is set.
//   stride: byte distance between consecutive colors (see DPSOFTRAST_Array_Load).
1278 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1280 dpsoftrast.pointer_color4f = color4f;
1281 dpsoftrast.pointer_color4ub = NULL;
1282 dpsoftrast.stride_color = stride;
// Records a byte color array (4 unsigned bytes per vertex) for subsequent
// draws and clears the float color pointer, so only one of the two is set.
//   stride: byte distance between consecutive colors (see DPSOFTRAST_Array_Load).
1284 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1286 dpsoftrast.pointer_color4f = NULL;
1287 dpsoftrast.pointer_color4ub = color4ub;
1288 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for texcoord unit `unitnum`.
//   numcomponents: floats per coordinate; DPSOFTRAST_Array_Load switches on
//                  this value to pick the 2f/3f/4f loader.
//   stride:        byte distance between consecutive coordinates.
//   texcoordf:     pointer to the first coordinate.
// NOTE(review): unitnum is used as an array index without a range check here;
// callers are presumably expected to pass a valid unit - confirm.
1290 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1292 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1293 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1294 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18, SetShader: payload is the shader mode and permutation.
1297 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Per-thread handler: copies mode/permutation into the thread state.
1298 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1300 thread->shader_mode = command->mode;
1301 thread->shader_permutation = command->permutation;
// API entry point: queues a SetShader command and also mirrors the values in
// the global dpsoftrast state.
1303 void DPSOFTRAST_SetShader(int mode, int permutation)
1305 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1306 command->mode = mode;
1307 command->permutation = permutation;
1309 dpsoftrast.shader_mode = mode;
1310 dpsoftrast.shader_permutation = permutation;
// Command 19, Uniform4f: payload is the uniform index and its four float values.
1313 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Per-thread handler: copies the four floats into thread->uniform4f at slot
// index*4 (each uniform occupies four consecutive floats).
1314 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1316 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// API entry point: queues a Uniform4f command with (v0, v1, v2, v3) and mirrors
// the same values in the global dpsoftrast.uniform4f table.
1318 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1320 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1321 command->index = index;
1322 command->val[0] = v0;
1323 command->val[1] = v1;
1324 command->val[2] = v2;
1325 command->val[3] = v3;
1327 dpsoftrast.uniform4f[index*4+0] = v0;
1328 dpsoftrast.uniform4f[index*4+1] = v1;
1329 dpsoftrast.uniform4f[index*4+2] = v2;
1330 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: queues a Uniform4f command with the four
// floats read from `v` and mirrors them in dpsoftrast.uniform4f at slot index*4.
//   v: must point to at least four readable floats.
1332 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 memcpy(command->val, v, sizeof(command->val));
1338 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20, UniformMatrix4f: payload is the first uniform index and 16
// floats (declared through ALIGN so that aligned SSE stores can target it).
1341 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Per-thread handler: copies all 16 floats into thread->uniform4f starting at
// slot index*4, i.e. the matrix occupies four consecutive uniform slots.
1342 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1344 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// API entry point: uploads `arraysize` 4x4 matrices read from `v` (16 floats
// each), starting at uniform `uniform`; one UniformMatrix4f command is queued
// per matrix and the global dpsoftrast.uniform4f table is updated as well.
// NOTE(review): the condition guarding the transpose sequence below is not
// visible here; presumably it is the `transpose` argument - confirm.
1346 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix advances the uniform index by 4 slots and the input by 16 floats
1350 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1352 __m128 m0, m1, m2, m3;
1353 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1354 command->index = index;
// pick unaligned or aligned loads depending on the alignment of v
1355 if (((size_t)v)&(ALIGN_SIZE-1))
1357 m0 = _mm_loadu_ps(v);
1358 m1 = _mm_loadu_ps(v+4);
1359 m2 = _mm_loadu_ps(v+8);
1360 m3 = _mm_loadu_ps(v+12);
1364 m0 = _mm_load_ps(v);
1365 m1 = _mm_load_ps(v+4);
1366 m2 = _mm_load_ps(v+8);
1367 m3 = _mm_load_ps(v+12);
// 4x4 transpose of (m0, m1, m2, m3) using unpack and move-half shuffles
1371 __m128 t0, t1, t2, t3;
1372 t0 = _mm_unpacklo_ps(m0, m1);
1373 t1 = _mm_unpacklo_ps(m2, m3);
1374 t2 = _mm_unpackhi_ps(m0, m1);
1375 t3 = _mm_unpackhi_ps(m2, m3);
1376 m0 = _mm_movelh_ps(t0, t1);
1377 m1 = _mm_movehl_ps(t1, t0);
1378 m2 = _mm_movelh_ps(t2, t3);
1379 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: once into the queued command, once into the global table
1381 _mm_store_ps(command->val, m0);
1382 _mm_store_ps(command->val+4, m1);
1383 _mm_store_ps(command->val+8, m2);
1384 _mm_store_ps(command->val+12, m3);
1385 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1386 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1387 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1388 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21, Uniform1i: payload is the uniform index and one int value.
1393 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Per-thread handler: stores the value in thread->uniform1i[index].
1394 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1396 thread->uniform1i[command->index] = command->val;
// API entry point: queues a Uniform1i command for `index` and mirrors i0 in
// the global dpsoftrast.uniform1i table.
// NOTE(review): the handler above reads command->val, so i0 must also be
// stored into the command here - confirm that assignment is present.
1398 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1400 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1401 command->index = index;
1404 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float items from a strided byte source into the packed
// float4 array `dst`.
//   dst:    written with aligned stores (_mm_store_ps), so it must be 16-byte aligned.
//   src:    first source item; stride: byte distance between source items.
// Unaligned loads are used when src or stride is not a multiple of ALIGN_SIZE,
// aligned loads otherwise.
// NOTE(review): the loop headers and pointer increments are not visible here;
// presumably each pass advances dst by 4 floats and src by stride - confirm.
1408 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1410 float *end = dst + size*4;
1411 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1415 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1424 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float items from a strided byte source into the packed
// float4 array `dst`, setting the fourth component of every item to 1.0f.
//   dst:    written with aligned stores (_mm_store_ps), so it must be 16-byte aligned.
//   src:    first source item; stride: byte distance between source items.
// Tightly packed input (stride == sizeof(float[3])) is converted four items
// (12 floats, three vector loads) at a time up to end4; the one-item-at-a-time
// code further down handles other strides.
// The recurring shuffle / move_ss / shuffle idiom rotates the lanes so the
// unwanted float sits in lane 0, overwrites lane 0 with 1.0f, and rotates back
// so that the 1.0f ends up in lane 3.
// NOTE(review): the one-item path loads 4 floats per 3-float item, so it reads
// one float past the item; for the last item that is past the end of the
// source data - confirm the callers' buffers tolerate this.
1431 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1433 float *end = dst + size*4;
1434 if (stride == sizeof(float[3]))
// end of the part that can be processed in groups of four items
1436 float *end4 = dst + (size&~3)*4;
1437 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1441 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1442 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1443 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1444 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1445 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1446 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1447 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1448 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1449 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1450 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1453 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1455 src += 4*sizeof(float[3]);
// aligned variant of the same four-item conversion
1462 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1463 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1467 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1468 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1470 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1471 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1474 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476 src += 4*sizeof(float[3]);
// one item at a time: unaligned loads if src or stride is not a multiple of
// ALIGN_SIZE, aligned loads otherwise
1480 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1484 __m128 v = _mm_loadu_ps((const float *)src);
1485 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1486 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1487 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1488 _mm_store_ps(dst, v);
1497 __m128 v = _mm_load_ps((const float *)src);
1498 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1499 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1500 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1501 _mm_store_ps(dst, v);
// Expands `size` 2-float items from a strided byte source into the packed
// float4 array `dst`; the third and fourth components of every output item
// are filled with 0.0f and 1.0f (taken from v2).
//   dst:    written with aligned stores (_mm_store_ps), so it must be 16-byte aligned.
//   src:    first source item; stride: byte distance between source items.
// Tightly packed input (stride == sizeof(float[2])) is converted two items per
// vector load up to end2; the _mm_loadl_pi line converts one item at a time.
1508 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1510 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector
1511 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1512 if (stride == sizeof(float[2]))
1514 float *end2 = dst + (size&~1)*4;
1515 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = s0 t0 s1 t1: first output is (s0, t0, 0, 1), second is (s1, t1, 0, 1)
1519 __m128 v = _mm_loadu_ps((const float *)src);
1520 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1521 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1523 src += 2*sizeof(float[2]);
// aligned variant of the same two-item conversion
1530 __m128 v = _mm_load_ps((const float *)src);
1531 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1532 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1534 src += 2*sizeof(float[2]);
// single item: the low two lanes come from src, the high two lanes from v2
1540 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte items from a strided byte source into the packed
// float4 array `dst`, scaling each byte by 1/255 (so 0..255 maps to 0..1).
//   dst:    written with aligned stores (_mm_store_ps), so it must be 16-byte aligned.
//   src:    first source item; stride: byte distance between source items.
// Tightly packed input (stride == 4) is converted four items (16 bytes) per
// vector load up to end4; the _mm_cvtsi32_si128 lines convert one item at a time.
// Bytes are widened to 32-bit integers by unpacking against zero twice
// (8 -> 16 -> 32 bits) before the int-to-float conversion.
1546 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1548 float *end = dst + size*4;
1549 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1550 if (stride == sizeof(unsigned char[4]))
1552 float *end4 = dst + (size&~3)*4;
1553 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1557 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1558 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1559 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1560 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1561 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1563 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same four-item conversion
1570 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1571 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1572 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576 src += 4*sizeof(unsigned char[4]);
// single item: read the four bytes as one 32-bit value
1582 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1583 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills the packed float4 array `dst` with copies of the four floats at `src`.
//   dst:  written with aligned stores (_mm_store_ps), so it must be 16-byte aligned;
//         `end` marks dst + 4*size floats.
//   src:  read once with an unaligned load, so it needs no particular alignment.
// NOTE(review): the loop header is not visible here; presumably the store
// repeats until dst reaches end - confirm.
1589 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1591 float *end = dst + 4*size;
1592 __m128 v = _mm_loadu_ps(src);
1595 _mm_store_ps(dst, v);
// Transforms `numitems` float4 vectors from in4f by the 16-float matrix
// inmatrix16f, writing float4 results to out4f.
// With m0..m3 being the four consecutive 4-float groups of the matrix, each
// result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// An identity matrix (bytewise equal to identitymatrix) takes a fast path that
// only copies the data, and not even that when out4f == in4f.
// The matrix is always read with unaligned loads; the input uses unaligned or
// aligned loads depending on the alignment of in4f.
// NOTE(review): the store call that wraps the sum expressions below is not
// visible here; presumably the result is stored to out4f - confirm.
1601 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1604 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1605 __m128 m0, m1, m2, m3;
1607 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1609 // fast case for identity matrix
1610 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1613 end = out4f + numitems*4;
1614 m0 = _mm_loadu_ps(inmatrix16f);
1615 m1 = _mm_loadu_ps(inmatrix16f + 4);
1616 m2 = _mm_loadu_ps(inmatrix16f + 8);
1617 m3 = _mm_loadu_ps(inmatrix16f + 12);
1618 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1622 __m128 v = _mm_loadu_ps(in4f);
// broadcast each component of v and accumulate component * matrix group
1624 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1625 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1626 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1627 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned-input variant of the same computation
1636 __m128 v = _mm_load_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vectors from in4f to out4f with memcpy.
// The two ranges must not overlap (memcpy requirement).
1649 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1651 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
// perspective-divides the float4 vector `in` and maps it to the viewport.
// The lanes of `in` are rotated to (1, x, y, z) - lane 0 is overwritten with
// 1.0f - then viewportcenter + viewportscale * lanes / w is computed and the
// lanes are rotated back, so `out` holds the mapped x, y, z in lanes 0..2 and
// center[0] + scale[0] / w in lane 3. viewportcenter/viewportscale are
// therefore applied in the rotated lane order (lane 0 pairs with the 1.0f).
1655 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1657 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1658 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1659 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1660 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale):
// the statements visible below are identical to DPSOFTRAST_PROJECTVERTEX:
// rotate the lanes of `in` to (1, x, y, z), compute
// viewportcenter + viewportscale * lanes / w, and rotate the lanes back.
1663 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1665 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1666 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1667 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1668 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, each component of p being broadcast
// across a vector before the multiply.
// NOTE(review): the definition of the local `p` is not visible here;
// presumably it is a copy of `in` - confirm.
1671 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1674 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1675 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1676 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1677 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by the axis-aligned box
// [minpos, maxpos] after transformation by the matrix groups m0..m3.
//   starty, endy: outputs; *starty is the truncated mapped maximum and *endy
//                 the truncated mapped minimum plus one.
//   viewportcenter, viewportscale: the vectors used by DPSOFTRAST_PROJECTVERTEX.
// clipmask starts at 0xFF, one bit per box corner. BBFRONT transforms corner
// k and clears its bit when z + w (clipdist) is >= 0; only such corners
// contribute y / w to minproj/maxproj directly. For a corner whose bit stays
// set, BBCLIP looks at the three box edges leaving it (corners k^1, k^2, k^4):
// for each neighbour whose bit is clear it interpolates along the edge to the
// point where clipdist reaches zero and feeds that point's y / w into
// minproj/maxproj instead.
// NOTE(review): the return statement is not visible here; callers store the
// result in dpsoftrast.drawclipped, presumably the final clipmask - confirm.
1680 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1682 int clipmask = 0xFF;
// minproj/maxproj start inverted (2, -2) so the first projected value replaces both
1683 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix group so the transformed y lands in
// lane 0, where the scalar (_ss) operations below work
1684 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1685 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1686 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1687 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1688 #define BBFRONT(k, pos) \
1690 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1691 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1692 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1695 clipmask &= ~(1<<k); \
1696 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1697 minproj = _mm_min_ss(minproj, proj); \
1698 maxproj = _mm_max_ss(maxproj, proj); \
1702 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1703 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1704 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1705 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1706 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1707 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1711 if (clipmask&(1<<k)) \
1713 if (!(clipmask&(1<<(k^1)))) \
1715 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1716 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1717 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1718 minproj = _mm_min_ss(minproj, proj); \
1719 maxproj = _mm_max_ss(maxproj, proj); \
1721 if (!(clipmask&(1<<(k^2)))) \
1723 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1724 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1725 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1726 minproj = _mm_min_ss(minproj, proj); \
1727 maxproj = _mm_max_ss(maxproj, proj); \
1729 if (!(clipmask&(1<<(k^4)))) \
1731 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1732 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1733 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1734 minproj = _mm_min_ss(minproj, proj); \
1735 maxproj = _mm_max_ss(maxproj, proj); \
1739 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of viewportcenter/viewportscale (the lane that
// DPSOFTRAST_PROJECTVERTEX pairs with y) into lane 0 for the scalar math below
1740 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1741 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// limit the projected range to [-2, 2] before mapping it to the viewport
1742 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1743 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1744 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1745 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// truncating conversions; start comes from maxproj and end from minproj
1746 *starty = _mm_cvttss_si32(maxproj);
1747 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` float4 positions without applying a matrix.
//   out4f:    receives an unmodified copy of each input position (aligned stores).
//   screen4f: receives each position after DPSOFTRAST_PROJECTVERTEX with the
//             framebuffer's viewport center/scale (aligned stores).
//   starty, endy: outputs filled by DPSOFTRAST_Vertex_BoundY.
//   in4f:     input positions; unaligned or aligned loads are chosen from its alignment.
// While iterating, the component-wise minimum and maximum of the inputs are
// tracked and handed to DPSOFTRAST_Vertex_BoundY together with identity matrix
// groups; that function's result is returned.
// NOTE(review): the loop headers and pointer increments are not visible here -
// presumably the loops run until out4f reaches end - confirm.
1752 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1755 float *end = out4f + numitems*4;
1756 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1757 __m128 minpos, maxpos;
1758 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1760 minpos = maxpos = _mm_loadu_ps(in4f);
1763 __m128 v = _mm_loadu_ps(in4f);
1764 minpos = _mm_min_ps(minpos, v);
1765 maxpos = _mm_max_ps(maxpos, v);
1766 _mm_store_ps(out4f, v);
1767 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1768 _mm_store_ps(screen4f, v);
// aligned-input variant of the same loop
1776 minpos = maxpos = _mm_load_ps(in4f);
1779 __m128 v = _mm_load_ps(in4f);
1780 minpos = _mm_min_ps(minpos, v);
1781 maxpos = _mm_max_ps(maxpos, v);
1782 _mm_store_ps(out4f, v);
1783 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1784 _mm_store_ps(screen4f, v);
// no transform was applied, so the bounds are evaluated with an identity matrix
1791 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1792 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1793 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1794 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1795 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` float4 positions by inmatrix16f and projects them.
//   out4f:    receives the transformed positions (aligned stores).
//   screen4f: receives the transformed positions after
//             DPSOFTRAST_PROJECTVERTEX with the framebuffer's viewport
//             center/scale (aligned stores).
//   starty, endy: outputs filled by DPSOFTRAST_Vertex_BoundY.
//   in4f:     input positions; unaligned or aligned loads are chosen from its alignment.
// An identity matrix is delegated to DPSOFTRAST_Vertex_Project. Otherwise the
// component-wise minimum and maximum of the untransformed inputs are tracked
// and passed to DPSOFTRAST_Vertex_BoundY together with the matrix groups; that
// function's result is returned.
1800 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1803 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// identity matrix: skip the multiply entirely
1806 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808 end = out4f + numitems*4;
1809 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1811 m0 = _mm_loadu_ps(inmatrix16f);
1812 m1 = _mm_loadu_ps(inmatrix16f + 4);
1813 m2 = _mm_loadu_ps(inmatrix16f + 8);
1814 m3 = _mm_loadu_ps(inmatrix16f + 12);
1815 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1817 minpos = maxpos = _mm_loadu_ps(in4f);
1820 __m128 v = _mm_loadu_ps(in4f);
1821 minpos = _mm_min_ps(minpos, v);
1822 maxpos = _mm_max_ps(maxpos, v);
1823 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824 _mm_store_ps(out4f, v);
1825 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826 _mm_store_ps(screen4f, v);
// aligned-input variant of the same loop
1834 minpos = maxpos = _mm_load_ps(in4f);
1837 __m128 v = _mm_load_ps(in4f);
1838 minpos = _mm_min_ps(minpos, v);
1839 maxpos = _mm_max_ps(maxpos, v);
1840 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
1850 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts one client vertex array into packed float4 data in
// dpsoftrast.post_array4f[outarray], for the vertex range
// [firstvertex, firstvertex + numvertices) of the current draw.
//   outarray: index of the destination post-transform array.
//   inarray:  which client array to read (DPSOFTRAST_ARRAY_POSITION, _COLOR,
//             or one of the texcoord arrays, indexed relative to _TEXCOORD0).
// Positions go through the 3f loader. Colors use the float pointer if set,
// else the byte pointer, else the constant dpsoftrast.color. Texcoords are
// dispatched on their component count to the 2f/3f/4f loaders.
// NOTE(review): the switch headers, case labels of the inner switch and the
// return statement are not visible here; presumably outf is returned - confirm.
1855 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1857 float *outf = dpsoftrast.post_array4f[outarray];
1858 const unsigned char *inb;
1859 int firstvertex = dpsoftrast.firstvertex;
1860 int numvertices = dpsoftrast.numvertices;
1864 case DPSOFTRAST_ARRAY_POSITION:
1865 stride = dpsoftrast.stride_vertex;
// stride is a byte stride: it is applied to a byte pointer
1866 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1867 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1869 case DPSOFTRAST_ARRAY_COLOR:
1870 stride = dpsoftrast.stride_color;
1871 if (dpsoftrast.pointer_color4f)
1873 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1874 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1876 else if (dpsoftrast.pointer_color4ub)
1878 stride = dpsoftrast.stride_color;
1879 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1880 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: replicate the constant color for every vertex
1884 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord arrays: the unit number is inarray - DPSOFTRAST_ARRAY_TEXCOORD0
1888 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1889 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1891 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1892 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1895 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1898 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1901 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a vertex array in place by inmatrix16f.
//   outarray: index of the post-transform array that holds the data.
//   inarray:  client array to load into it first; a negative value means the
//             data already in dpsoftrast.post_array4f[outarray] is used as is.
// NOTE(review): the return statement is not visible here; presumably `data`
// is returned - confirm.
1910 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1912 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1913 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a vertex array to screen space without a matrix multiply.
//   outarray: index of the post-transform array that holds the positions.
//   inarray:  client array to load into it first; a negative value means the
//             data already in dpsoftrast.post_array4f[outarray] is used as is.
// Screen coordinates go to dpsoftrast.screencoord4f, the covered row range to
// dpsoftrast.drawstarty/drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably `data`
// is returned - confirm.
1918 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1920 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1921 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a vertex array in place by inmatrix16f and projects it to screen space.
//   outarray: index of the post-transform array that holds the positions.
//   inarray:  client array to load into it first; a negative value means the
//             data already in dpsoftrast.post_array4f[outarray] is used as is.
// Screen coordinates go to dpsoftrast.screencoord4f, the covered row range to
// dpsoftrast.drawstarty/drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably `data`
// is returned - confirm.
1926 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1928 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1929 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the per-pixel value 1/w for the pixels [startx, endx) of a span.
// w is evaluated from the triangle's plane coefficients (w[0] = slope in x,
// w[1] = slope in y, w[2] = constant term) at the span origin. The exact
// reciprocal is only computed at sub-span boundaries spaced
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; pixels in between advance by a
// constant step dz, i.e. 1/w is linearly interpolated inside each sub-span.
// NOTE(review): the inner loop body is not visible here; presumably it stores
// z into zf[x] - confirm.
1933 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1936 int startx = span->startx;
1937 int endx = span->endx;
1938 float wslope = triangle->w[0];
// w at the span origin (span->x, span->y)
1939 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1940 float endz = 1.0f / (w + wslope * startx);
1941 for (x = startx;x < endx;)
1943 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened so it ends on the final pixel of the span
1945 if(nextsub >= endx) nextsub = endsub = endx-1;
1946 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; 0 for a sub-span of a single pixel (avoids dividing by zero)
1947 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1948 for (; x <= endsub; x++, z += dz)
// Writes the float colors of one span into the color framebuffer
// (dpsoftrast.fb_colorpixels[0]), applying the thread's current blend mode.
//   in4f: one float4 color per pixel, indexed by x in [startx, endx); read in
//         the order in4f[+0], [+1], [+2], [+3] and written to the framebuffer
//         bytes 2, 1, 0, 3, i.e. the first three channels are stored reversed.
// `pixel` is offset to the span origin (span->y, span->x), so x indexes pixels
// relative to span->x. Results are converted to int and clamped to at most
// 255; only the SUBALPHA mode also clamps at 0.
// With alphatest enabled, pixels whose alpha is below 0.5 are removed from
// span->pixelmask first.
// NOTE(review): the statements between each loop header and its first d[]
// line are not visible here; presumably pixels with pixelmask[x] == 0 are
// skipped there - confirm.
1953 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1956 int startx = span->startx;
1957 int endx = span->endx;
1960 unsigned char * RESTRICT pixelmask = span->pixelmask;
1961 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1964 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1965 // handle alphatest now (this affects depth writes too)
1966 if (thread->alphatest)
1967 for (x = startx;x < endx;x++)
1968 if (in4f[x*4+3] < 0.5f)
1969 pixelmask[x] = false;
1970 // FIXME: this does not handle bigendian
1971 switch(thread->fb_blendmode)
// OPAQUE: dst = src * 255
1973 case DPSOFTRAST_BLENDMODE_OPAQUE:
1974 for (x = startx;x < endx;x++)
1978 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1979 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1980 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1981 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1982 pixel[x*4+0] = d[0];
1983 pixel[x*4+1] = d[1];
1984 pixel[x*4+2] = d[2];
1985 pixel[x*4+3] = d[3];
// ALPHA: dst = src * alpha * 255 + dst * (1 - alpha)
1988 case DPSOFTRAST_BLENDMODE_ALPHA:
1989 for (x = startx;x < endx;x++)
1993 a = in4f[x*4+3] * 255.0f;
1994 b = 1.0f - in4f[x*4+3];
1995 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1996 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1997 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1998 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1999 pixel[x*4+0] = d[0];
2000 pixel[x*4+1] = d[1];
2001 pixel[x*4+2] = d[2];
2002 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src * alpha * 255 + dst
2005 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2006 for (x = startx;x < endx;x++)
2010 a = in4f[x*4+3] * 255.0f;
2011 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2012 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2013 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2014 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2015 pixel[x*4+0] = d[0];
2016 pixel[x*4+1] = d[1];
2017 pixel[x*4+2] = d[2];
2018 pixel[x*4+3] = d[3];
// ADD: dst = src * 255 + dst
2021 case DPSOFTRAST_BLENDMODE_ADD:
2022 for (x = startx;x < endx;x++)
2026 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2027 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2028 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2029 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2030 pixel[x*4+0] = d[0];
2031 pixel[x*4+1] = d[1];
2032 pixel[x*4+2] = d[2];
2033 pixel[x*4+3] = d[3];
// INVMOD: dst = (1 - src) * dst
2036 case DPSOFTRAST_BLENDMODE_INVMOD:
2037 for (x = startx;x < endx;x++)
2041 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2042 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2043 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2044 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2045 pixel[x*4+0] = d[0];
2046 pixel[x*4+1] = d[1];
2047 pixel[x*4+2] = d[2];
2048 pixel[x*4+3] = d[3];
// MUL: dst = src * dst
2051 case DPSOFTRAST_BLENDMODE_MUL:
2052 for (x = startx;x < endx;x++)
2056 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2057 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2058 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2059 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2060 pixel[x*4+0] = d[0];
2061 pixel[x*4+1] = d[1];
2062 pixel[x*4+2] = d[2];
2063 pixel[x*4+3] = d[3];
// MUL2: dst = src * dst * 2
2066 case DPSOFTRAST_BLENDMODE_MUL2:
2067 for (x = startx;x < endx;x++)
2071 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2072 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2073 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2074 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2075 pixel[x*4+0] = d[0];
2076 pixel[x*4+1] = d[1];
2077 pixel[x*4+2] = d[2];
2078 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * alpha * 255, clamped to [0, 255]
2081 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2082 for (x = startx;x < endx;x++)
2086 a = in4f[x*4+3] * -255.0f;
2087 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2088 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2089 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2090 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2091 pixel[x*4+0] = d[0];
2092 pixel[x*4+1] = d[1];
2093 pixel[x*4+2] = d[2];
2094 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - alpha)
// NOTE(review): the assignment of `a` for this mode is not visible here - confirm its value.
2097 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2098 for (x = startx;x < endx;x++)
2103 b = 1.0f - in4f[x*4+3];
2104 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2105 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2106 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2107 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2108 pixel[x*4+0] = d[0];
2109 pixel[x*4+1] = d[1];
2110 pixel[x*4+2] = d[2];
2111 pixel[x*4+3] = d[3];
// Writes one span of already-shaded BGRA8 pixels (in4ub, four bytes per pixel, indexed by x)
// into the color buffer dpsoftrast.fb_colorpixels[0], combining them with the existing
// framebuffer contents according to thread->fb_blendmode.
//   thread   - supplies the alphatest flag and fb_blendmode
//   triangle - not referenced by the statements below
//   span     - supplies startx/endx, the x/y origin of the span and the per-pixel pixelmask
//   in4ub    - source pixels; byte 3 of each pixel is treated as alpha
2117 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2121 int startx = span->startx;
2122 int endx = span->endx;
// ini/pixeli are 32-bit views of the same source/destination data (one word per pixel)
2123 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2124 unsigned char * RESTRICT pixelmask = span->pixelmask;
2125 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2126 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both framebuffer views to the span origin so they can be indexed directly by x
2129 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2130 pixeli += span->y * dpsoftrast.fb_width + span->x;
2131 // handle alphatest now (this affects depth writes too)
2132 if (thread->alphatest)
2133 for (x = startx;x < endx;x++)
// NOTE(review): in4ub holds unsigned bytes, so this comparison against 0.5f is only true
// for alpha == 0; a mid-range byte cutoff (e.g. 128) may have been intended - confirm
2134 if (in4ub[x*4+3] < 0.5f)
2135 pixelmask[x] = false;
2136 // FIXME: this does not handle bigendian
2137 switch(thread->fb_blendmode)
2139 case DPSOFTRAST_BLENDMODE_OPAQUE:
// four pixels at a time: when all four mask bytes equal 1 the whole group is copied
// with a single unaligned 128-bit load/store
2140 for (x = startx;x + 4 <= endx;)
2142 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2144 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND walks the span two pixels at a time, dispatching on the two mask bytes read
// as one 16-bit value, then handles any leftover pixel singly.  Source and destination
// bytes are widened to 16-bit lanes (src/dst) and the blended result is packed back to
// bytes with unsigned saturation.  blend2/blend1 are statement blocks supplied per blend
// mode (presumably the two-pixel and one-pixel forms of the same formula - confirm).
// ALPHA: dst += (src - dst) * srcalpha / 256, srcalpha being lane 3 of each pixel broadcast
// to all four lanes of that pixel.
2158 case DPSOFTRAST_BLENDMODE_ALPHA:
2159 #define FINISHBLEND(blend2, blend1) \
2160 for (x = startx;x + 2 <= endx;x += 2) \
2163 switch (*(const unsigned short*)&pixelmask[x]) \
2166 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2167 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2169 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2172 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2173 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2175 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2178 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2179 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2181 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2186 for(;x < endx; x++) \
2189 if (!pixelmask[x]) \
2191 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2192 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2194 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2198 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2199 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2201 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2202 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8
2205 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2207 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2208 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2210 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2211 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (clamped to 255 by the saturating pack in FINISHBLEND)
2214 case DPSOFTRAST_BLENDMODE_ADD:
2215 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2217 case DPSOFTRAST_BLENDMODE_INVMOD:
2219 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2221 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2224 case DPSOFTRAST_BLENDMODE_MUL:
2225 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2227 case DPSOFTRAST_BLENDMODE_MUL2:
2228 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8
2230 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2232 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2233 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2235 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2236 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2239 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2241 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2244 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2245 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// Samples the 2D texture bound to thread->texbound[texunitindex] for every x of the span and
// writes four floats per pixel to out4f (texel byte / 255).  Texels are stored as BGRA bytes
// and the first and third channels are swapped on output (texel byte 2 -> out4f[x*4+0]).
//   triangle     - supplies the mip level per texture unit
//   span         - supplies startx/endx
//   out4f        - destination, indexed as out4f[x*4+channel]
//   texunitindex - index into thread->texbound and triangle->mip
//   arrayindex   - vertex attribute to interpolate as texture coordinates
//   zf           - per-pixel factor the interpolated attribute is multiplied by
// The texture coordinate is evaluated exactly only at sub-span end points (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or at endx-1) and stepped linearly in 16.16 fixed
// point between them.
2252 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2255 int startx = span->startx;
2256 int endx = span->endx;
2261 float tc[2], endtc[2];
2263 unsigned int tci[2];
2264 unsigned int tci1[2];
2265 unsigned int tcimin[2];
2266 unsigned int tcimax[2];
2271 const unsigned char * RESTRICT pixelbase;
2272 const unsigned char * RESTRICT pixel[4];
2273 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2274 // if no texture is bound, just fill it with white
2277 for (x = startx;x < endx;x++)
2279 out4f[x*4+0] = 1.0f;
2280 out4f[x*4+1] = 1.0f;
2281 out4f[x*4+2] = 1.0f;
2282 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2286 mip = triangle->mip[texunitindex];
2287 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2288 // if this mipmap of the texture is 1 pixel, just fill it with that color
2289 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads the first texel of texture->bytes rather than pixelbase, so for
// a mip whose offset is non-zero it does not read that mip's texel - confirm intended
2291 c[0] = texture->bytes[2] * (1.0f/255.0f);
2292 c[1] = texture->bytes[1] * (1.0f/255.0f);
2293 c[2] = texture->bytes[0] * (1.0f/255.0f);
2294 c[3] = texture->bytes[3] * (1.0f/255.0f);
2295 for (x = startx;x < endx;x++)
2297 out4f[x*4+0] = c[0];
2298 out4f[x*4+1] = c[1];
2299 out4f[x*4+2] = c[2];
2300 out4f[x*4+3] = c[3];
2304 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2305 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2306 flags = texture->flags;
// mipmap[mip][2] / [3] give the mip dimensions: they scale the coordinates, [2] is the row
// pitch in texels, and size-1 is both the clamp maximum and the wrap mask (the mask only
// equals a modulo when the size is a power of two)
2307 tcscale[0] = texture->mipmap[mip][2];
2308 tcscale[1] = texture->mipmap[mip][3];
2309 tciwidth = texture->mipmap[mip][2];
2312 tcimax[0] = texture->mipmap[mip][2]-1;
2313 tcimax[1] = texture->mipmap[mip][3]-1;
2314 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2315 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// exact texel-space coordinate at the first pixel; the -0.5 shifts to texel centers
2316 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2317 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2318 for (x = startx;x < endx;)
2320 unsigned int subtc[2];
2321 unsigned int substep[2];
2322 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2323 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (possibly shorter) sub-span ends on the final pixel; rescale the step accordingly
2326 nextsub = endsub = endx-1;
2327 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
// exact coordinate at the end of this sub-span, then the per-pixel 16.16 step towards it
// (tc is presumably the previous endtc - its assignment is not among these statements)
2331 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2332 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2333 substep[0] = (endtc[0] - tc[0]) * subscale;
2334 substep[1] = (endtc[1] - tc[1]) * subscale;
2335 subtc[0] = tc[0] * (1<<16);
2336 subtc[1] = tc[1] * (1<<16);
// bilinear sampling, coordinates clamped to [tcimin, tcimax]
2339 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2341 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit weights: the four products sum to 0x1000000, so a byte of 255 yields 0xFF000000,
// which is the normalizer used below.
// NOTE(review): the mask keeps the low 12 bits of the 16-bit fraction rather than its top
// 12 bits (which would need a >>4 first) - confirm intended
2343 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2344 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2345 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2346 tci[0] = subtc[0]>>16;
2347 tci[1] = subtc[1]>>16;
2348 tci1[0] = tci[0] + 1;
2349 tci1[1] = tci[1] + 1;
2350 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2351 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2352 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2353 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the four neighbouring texels: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2354 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2355 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2356 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2357 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2358 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2359 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2360 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2361 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2362 out4f[x*4+0] = c[0];
2363 out4f[x*4+1] = c[1];
2364 out4f[x*4+2] = c[2];
2365 out4f[x*4+3] = c[3];
// bilinear sampling, coordinates wrapped with tciwrapmask
2370 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2372 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2373 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2374 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2375 tci[0] = subtc[0]>>16;
2376 tci[1] = subtc[1]>>16;
2377 tci1[0] = tci[0] + 1;
2378 tci1[1] = tci[1] + 1;
2379 tci[0] &= tciwrapmask[0];
2380 tci[1] &= tciwrapmask[1];
2381 tci1[0] &= tciwrapmask[0];
2382 tci1[1] &= tciwrapmask[1];
2383 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2384 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2385 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2386 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2387 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2388 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2389 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2390 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2391 out4f[x*4+0] = c[0];
2392 out4f[x*4+1] = c[1];
2393 out4f[x*4+2] = c[2];
2394 out4f[x*4+3] = c[3];
// single-texel (nearest) sampling, coordinates clamped
2398 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2400 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2402 tci[0] = subtc[0]>>16;
2403 tci[1] = subtc[1]>>16;
2404 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2405 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2406 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2407 c[0] = pixel[0][2] * (1.0f / 255.0f);
2408 c[1] = pixel[0][1] * (1.0f / 255.0f);
2409 c[2] = pixel[0][0] * (1.0f / 255.0f);
2410 c[3] = pixel[0][3] * (1.0f / 255.0f);
2411 out4f[x*4+0] = c[0];
2412 out4f[x*4+1] = c[1];
2413 out4f[x*4+2] = c[2];
2414 out4f[x*4+3] = c[3];
// single-texel (nearest) sampling, coordinates wrapped
2419 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2421 tci[0] = subtc[0]>>16;
2422 tci[1] = subtc[1]>>16;
2423 tci[0] &= tciwrapmask[0];
2424 tci[1] &= tciwrapmask[1];
2425 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2426 c[0] = pixel[0][2] * (1.0f / 255.0f);
2427 c[1] = pixel[0][1] * (1.0f / 255.0f);
2428 c[2] = pixel[0][0] * (1.0f / 255.0f);
2429 c[3] = pixel[0][3] * (1.0f / 255.0f);
2430 out4f[x*4+0] = c[0];
2431 out4f[x*4+1] = c[1];
2432 out4f[x*4+2] = c[2];
2433 out4f[x*4+3] = c[3];
// SSE2 texture sampler: for every x of the span, fetches (or bilinearly filters) a texel of
// the 2D texture bound to thread->texbound[texunitindex] and stores it as one packed
// 4-byte pixel in out4ub, copying texel bytes in their stored order.
//   triangle     - supplies the mip level per texture unit
//   span         - supplies startx/endx/length
//   out4ub       - destination, 4 bytes per pixel indexed by x (also viewed as outi)
//   texunitindex - index into thread->texbound and triangle->mip
//   arrayindex   - vertex attribute to interpolate as texture coordinates
//   zf           - per-pixel factor the interpolated attribute is multiplied by
// Coordinates are evaluated exactly at sub-span end points and stepped in 16.16 fixed point
// in between; the inner loops produce two pixels per iteration with a one-pixel tail.
2439 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2443 int startx = span->startx;
2444 int endx = span->endx;
2446 __m128 data, slope, tcscale;
2447 __m128i tcsize, tcmask, tcoffset, tcmax;
2449 __m128i subtc, substep, endsubtc;
2452 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2453 const unsigned char * RESTRICT pixelbase;
2454 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2455 // if no texture is bound, just fill it with white
2458 memset(out4ub + startx*4, 255, span->length*4);
2461 mip = triangle->mip[texunitindex];
2462 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2463 // if this mipmap of the texture is 1 pixel, just fill it with that color
2464 if (texture->mipmap[mip][1] == 4)
// k is the single texel of this mip level read as one 32-bit word
2466 unsigned int k = *((const unsigned int *)pixelbase);
2467 for (x = startx;x < endx;x++)
2471 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2473 flags = texture->flags;
// tcsize = (mipmap[2], mipmap[3], mipmap[2], mipmap[3]), i.e. the two mip dimensions twice
2474 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2475 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2476 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and scale to texel units
2477 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2478 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// exact coordinate at the first pixel (shifted by -0.5 texel) and its 16.16 fixed-point form
2479 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2480 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds the 16-bit pair (4, width*4); _mm_madd_epi16 of an
// (x, y) texel pair with it yields the byte offset x*4 + y*width*4
2481 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax = size-1 per axis as 16-bit lanes, used both as clamp maximum and as wrap mask
2482 tcmax = _mm_packs_epi32(tcmask, tcmask);
2483 for (x = startx;x < endx;)
2485 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2486 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span ends on the final pixel; rescale the step accordingly
2489 nextsub = endsub = endx-1;
2490 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2494 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2495 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2496 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low 64 bits: coordinate of pixel x, high 64 bits: coordinate of pixel x+1;
// the step is doubled because each loop iteration advances two pixels
2497 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2498 substep = _mm_slli_epi32(substep, 1);
// bilinear paths.  Fast path first: if the integer coordinates at both ends of the
// sub-span are >= 0 and < size-1, every 2x2 footprint is inside the texture and no
// clamping or wrapping is needed.
2501 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2502 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texture row (the high 16 bits of tcoffset)
2504 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2505 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2507 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer (high) words of each coordinate, then turn them into byte offsets
2508 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2509 tci = _mm_madd_epi16(tci, tcoffset);
2510 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2511 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 8-byte load fetches two horizontally adjacent texels; the +stride load is the row below
2512 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2513 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2514 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2515 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// halve the fractions so that mulhi_epi16((b-a)<<1, frac>>1) == (b-a)*frac/65536;
// lerp vertically for each pixel first, then horizontally for both pixels at once
2516 fracm = _mm_srli_epi16(subtc, 1);
2517 pix1 = _mm_add_epi16(pix1,
2518 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2519 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2520 pix3 = _mm_add_epi16(pix3,
2521 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2522 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2523 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2524 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2525 pix2 = _mm_add_epi16(pix2,
2526 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2527 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2528 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the loop body above
2532 const unsigned char * RESTRICT ptr1;
2533 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2534 tci = _mm_madd_epi16(tci, tcoffset);
2535 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2536 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2537 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2538 fracm = _mm_srli_epi16(subtc, 1);
2539 pix1 = _mm_add_epi16(pix1,
2540 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2541 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2542 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2543 pix1 = _mm_add_epi16(pix1,
2544 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2545 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2546 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, clamp-to-edge: the four corner coordinates (x,y), (x+1,y), (x,y+1), (x+1,y+1)
// are built by adding (0,0),(1,0),(0,1),(1,1) and clamped to [0, tcmax] individually
2550 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2552 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2554 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2555 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2556 tci = _mm_madd_epi16(tci, tcoffset);
2557 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2558 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2559 _mm_setzero_si128());
2560 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2561 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2562 _mm_setzero_si128());
// second pixel of the pair.
// NOTE(review): unlike the first pixel and the one-pixel tail below, these coordinates are
// masked with AND (wrap) rather than clamped with min/max, although this is the
// clamp-to-edge branch - confirm intended
2563 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2564 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2565 tci = _mm_madd_epi16(tci, tcoffset);
2566 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2567 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2568 _mm_setzero_si128());
2569 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2570 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2571 _mm_setzero_si128());
2572 fracm = _mm_srli_epi16(subtc, 1);
2573 pix1 = _mm_add_epi16(pix1,
2574 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576 pix3 = _mm_add_epi16(pix3,
2577 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2578 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2579 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2580 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2581 pix2 = _mm_add_epi16(pix2,
2582 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2583 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2584 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the clamped bilinear fetch
2588 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2589 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590 tci = _mm_madd_epi16(tci, tcoffset);
2591 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593 _mm_setzero_si128());
2594 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596 _mm_setzero_si128());
2597 fracm = _mm_srli_epi16(subtc, 1);
2598 pix1 = _mm_add_epi16(pix1,
2599 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2600 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2601 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2602 pix1 = _mm_add_epi16(pix1,
2603 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2604 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2605 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, wrapping: same as above but every corner coordinate is masked with tcmax
2611 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2613 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2614 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2615 tci = _mm_madd_epi16(tci, tcoffset);
2616 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2617 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2618 _mm_setzero_si128());
2619 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2620 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2621 _mm_setzero_si128());
2622 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2623 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2624 tci = _mm_madd_epi16(tci, tcoffset);
2625 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2626 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2627 _mm_setzero_si128());
2628 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2629 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2630 _mm_setzero_si128());
2631 fracm = _mm_srli_epi16(subtc, 1);
2632 pix1 = _mm_add_epi16(pix1,
2633 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635 pix3 = _mm_add_epi16(pix3,
2636 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2637 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2638 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2639 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2640 pix2 = _mm_add_epi16(pix2,
2641 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2642 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2643 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the wrapped bilinear fetch
2647 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2648 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649 tci = _mm_madd_epi16(tci, tcoffset);
2650 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652 _mm_setzero_si128());
2653 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655 _mm_setzero_si128());
2656 fracm = _mm_srli_epi16(subtc, 1);
2657 pix1 = _mm_add_epi16(pix1,
2658 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2659 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2660 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2661 pix1 = _mm_add_epi16(pix1,
2662 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2663 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2664 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel (nearest) paths: clamp-to-edge first, then wrapping; each copies the
// selected texel word straight to the output
2671 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2673 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2675 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2676 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2677 tci = _mm_madd_epi16(tci, tcoffset);
2678 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2679 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// one-pixel form of the clamped nearest fetch
2683 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2684 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2685 tci = _mm_madd_epi16(tci, tcoffset);
2686 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// nearest, wrapping
2692 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2694 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2695 tci = _mm_and_si128(tci, tcmax);
2696 tci = _mm_madd_epi16(tci, tcoffset);
2697 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2698 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// one-pixel form of the wrapped nearest fetch
2702 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2703 tci = _mm_and_si128(tci, tcmax);
2704 tci = _mm_madd_epi16(tci, tcoffset);
2705 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D BGRA8 sampler.  The statement below only fills
// span->length pixels of out4ub with 0xFF bytes (opaque white); no texture lookup is done.
// NOTE(review): the fill starts at out4ub itself, whereas the 2D sampler's untextured
// fill starts at out4ub + startx*4 - confirm which offset callers expect
2714 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2717 memset(out4ub, 255, span->length*4);
// Takes a read-only float array `vector` and returns a single float.
// NOTE(review): presumably a shadow-map lookup for the given vector, judging by the
// name only - confirm against the implementation
2720 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies each pixel of in4f by the interpolated vertex attribute `arrayindex` and stores
// the product in out4f (four floats per pixel, indexed by x over [startx, endx)).
// The attribute is evaluated per pixel as (data + slope*x) * z.
// NOTE(review): z is presumably zf[x]; its assignment is not among these statements - confirm
2726 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2729 int startx = span->startx;
2730 int endx = span->endx;
// fetch the attribute's value and per-pixel slope for this span
2735 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2736 for (x = startx;x < endx;x++)
2739 c[0] = (data[0] + slope[0]*x) * z;
2740 c[1] = (data[1] + slope[1]*x) * z;
2741 c[2] = (data[2] + slope[2]*x) * z;
2742 c[3] = (data[3] + slope[3]*x) * z;
2743 out4f[x*4+0] = in4f[x*4+0] * c[0];
2744 out4f[x*4+1] = in4f[x*4+1] * c[1];
2745 out4f[x*4+2] = in4f[x*4+2] * c[2];
2746 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated vertex attribute `arrayindex` to out4f for every pixel of the span
// (four floats per pixel, indexed by x over [startx, endx)), evaluated as
// (data + slope*x) * z.
// NOTE(review): z is presumably zf[x]; its assignment is not among these statements - confirm
2750 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2753 int startx = span->startx;
2754 int endx = span->endx;
// fetch the attribute's value and per-pixel slope for this span
2759 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2760 for (x = startx;x < endx;x++)
2763 c[0] = (data[0] + slope[0]*x) * z;
2764 c[1] = (data[1] + slope[1]*x) * z;
2765 c[2] = (data[2] + slope[2]*x) * z;
2766 c[3] = (data[3] + slope[3]*x) * z;
2767 out4f[x*4+0] = c[0];
2768 out4f[x*4+1] = c[1];
2769 out4f[x*4+2] = c[2];
2770 out4f[x*4+3] = c[3];
// Per pixel and per channel: out4f = ina4f + max(inb4f - subcolor, 0).
//   out4f, ina4f, inb4f - four floats per pixel, indexed by x over [startx, endx)
//   subcolor            - four floats subtracted from inb4f before it is added
// triangle is not referenced by the statements below.
2774 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2776 int x, startx = span->startx, endx = span->endx;
2777 float c[4], localcolor[4];
// copy subcolor into locals once, before the loop
2778 localcolor[0] = subcolor[0];
2779 localcolor[1] = subcolor[1];
2780 localcolor[2] = subcolor[2];
2781 localcolor[3] = subcolor[3];
2782 for (x = startx;x < endx;x++)
// subtract, then clamp negative results to zero
2784 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2785 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2786 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2787 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2788 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2789 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2790 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2791 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Per pixel and per channel: out4f = ina4f * inb4f, for x over [span->startx, span->endx).
// All three buffers hold four floats per pixel; triangle is not referenced below.
2795 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2797 int x, startx = span->startx, endx = span->endx;
2798 for (x = startx;x < endx;x++)
2800 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2801 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2802 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2803 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Per pixel and per channel: out4f = ina4f + inb4f, for x over [span->startx, span->endx).
// All three buffers hold four floats per pixel; the sum is not clamped here.
2807 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2809 int x, startx = span->startx, endx = span->endx;
2810 for (x = startx;x < endx;x++)
2812 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2813 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2814 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2815 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Per pixel and per channel: out4f = ina4f * a + inb4f * b, for x over [startx, endx),
// where a = 1 - (fourth channel of inb4f at that pixel).
// NOTE(review): b is presumably that same fourth channel (a standard alpha mix of inb over
// ina); its assignment is not among these statements - confirm
2819 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2821 int x, startx = span->startx, endx = span->endx;
2823 for (x = startx;x < endx;x++)
2825 a = 1.0f - inb4f[x*4+3];
2827 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2828 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2829 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2830 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends every pixel of in4f towards a constant color, weighted by that color's fourth
// component: out4f = in4f * (1 - color[3]) + color * color[3], for x over [startx, endx).
// The same formula is applied to all four channels, including the fourth one itself.
2834 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2836 int x, startx = span->startx, endx = span->endx;
2837 float localcolor[4], ilerp, lerp;
// copy the color and derive both blend weights once, before the loop
2838 localcolor[0] = color[0];
2839 localcolor[1] = color[1];
2840 localcolor[2] = color[2];
2841 localcolor[3] = color[3];
2842 ilerp = 1.0f - localcolor[3];
2843 lerp = localcolor[3];
2844 for (x = startx;x < endx;x++)
2846 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2847 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2848 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2849 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2 byte version of the attribute multiply: scales every pixel of in4ub (4 bytes per
// pixel) by the interpolated vertex attribute `arrayindex` and writes the result to out4ub.
// The attribute is evaluated exactly at sub-span end points as (data + slope*x) * zf[x],
// converted to fixed point with 8 fractional bits, and stepped linearly in between; pixels
// are processed two at a time with a one-pixel tail.
2855 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2859 int startx = span->startx;
2860 int endx = span->endx;
2863 __m128i submod, substep, endsubmod;
2864 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute's channel order matches the pixel byte order
2865 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2866 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact modulation value at the first pixel and its fixed-point (x256) form
2867 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2868 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2869 for (x = startx; x < endx;)
2871 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2872 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span ends on the final pixel; rescale the step accordingly
2875 nextsub = endsub = endx-1;
2876 if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2880 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2881 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2882 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for x+1
2883 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
// NOTE(review): the loop below advances two pixels per iteration but adds this per-pixel
// step only once; the texture sampler doubles its step for the same loop shape - confirm
2884 substep = _mm_packs_epi32(substep, substep);
2885 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high half of each 16-bit lane (value*256), so the unsigned
// high multiply yields value*submod/256; the pack saturates the result to 0..255
2887 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2888 pix = _mm_mulhi_epu16(pix, submod);
2889 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// one-pixel form of the loop body above
2893 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2894 pix = _mm_mulhi_epu16(pix, submod);
2895 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute (selected by arrayindex) to out4ub as
// 8-bit pixels, for x in [span->startx, span->endx).
// The attribute and slope come from DPSOFTRAST_CALCATTRIB, are swizzled so
// components 0 and 2 trade places (presumably RGBA -> BGRA - confirm), multiplied
// by zf[x] and converted to fixed point scaled by 4095; the final pixel value is
// that fixed-point value shifted right by 4 and saturated to 0..255 by the pack.
// The span is walked in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
// NOTE(review): this listing elides lines (declarations of x/data/slope/mod/endmod,
// the sub-span clamp condition, the mod/submod carry-over and the guard of the
// single-pixel tail are not visible).
2902 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2906 int startx = span->startx;
2907 int endx = span->endx;
2910 __m128i submod, substep, endsubmod;
2911 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2912 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2913 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel of the span
2914 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2915 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2916 for (x = startx; x < endx;)
2918 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2919 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end at the final pixel and rescale the step to its length
2922 nextsub = endsub = endx-1;
2923 if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2927 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2928 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2929 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low half of submod = value for pixel x, high half = pixel x+1
2930 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2931 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by 2 per iteration while submod advances by a
// single substep - check whether the step is meant to be doubled.
2932 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2934 __m128i pix = _mm_srai_epi16(submod, 4);
2935 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same conversion
2939 __m128i pix = _mm_srai_epi16(submod, 4);
2940 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Bloom combine on 8-bit pixels: out = ina + (inb - subcolor*255), with the final
// pack saturating each channel to 0..255.
// subcolor (4 floats) is scaled by 255, swizzled so components 0 and 2 trade
// places (presumably RGBA -> BGRA - confirm) and replicated for two pixels.
// The loop handles two pixels per iteration; the trailing statements handle a
// single pixel (their guard is not visible in this listing, presumably x < endx).
// NOTE(review): the subtraction is a plain 16-bit subtract, so inb - subcolor can
// go negative before the add; confirm that is the intended bloom behaviour.
2947 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2950 int x, startx = span->startx, endx = span->endx;
2951 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2952 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2953 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input to 16 bits per channel
2955 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2956 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2957 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2958 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail
2962 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2963 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2964 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2965 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Multiplies two 8-bit pixel buffers channel by channel: out = ina * inb / 256.
// ina is widened to 16 bits with the byte in the low half of each lane, inb with
// the byte in the high half (value * 256), so the high 16 bits of their product
// are ina * inb / 256.
// The loop handles two pixels per iteration; the trailing statements handle a
// single pixel (their guard is not visible in this listing, presumably x < endx).
2970 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2973 int x, startx = span->startx, endx = span->endx;
2974 for (x = startx;x+2 <= endx;x+=2)
2976 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2977 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2978 pix1 = _mm_mulhi_epu16(pix1, pix2);
2979 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail
2983 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2984 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2985 pix1 = _mm_mulhi_epu16(pix1, pix2);
2986 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds two 8-bit pixel buffers channel by channel: both inputs are widened to
// 16 bits, summed, and packed back with unsigned saturation (clamped to 255).
// The loop handles two pixels per iteration; the trailing statements handle a
// single pixel (their guard is not visible in this listing, presumably x < endx).
2991 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2994 int x, startx = span->startx, endx = span->endx;
2995 for (x = startx;x+2 <= endx;x+=2)
2997 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2998 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2999 pix1 = _mm_add_epi16(pix1, pix2);
3000 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail
3004 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3005 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3006 pix1 = _mm_add_epi16(pix1, pix2);
3007 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb to ina on 8-bit pixels: out = ina + inb * tint,
// saturated to 0..255 by the final pack.
// inbtintbgra (4 floats) is scaled by 256 and used without a swizzle, so it is
// applied to the pixel channels in the order given (the parameter name suggests
// it is already in BGRA order - confirm with callers).
// inb is widened with its byte in the high half of each 16-bit lane, so the high
// 16 bits of tint * inb give inb scaled by the tint.
// The loop handles two pixels per iteration; the trailing statements handle a
// single pixel (their guard is not visible in this listing, presumably x < endx).
3012 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3015 int x, startx = span->startx, endx = span->endx;
3016 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// replicate the 4 tint channels for two pixels
3017 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3018 for (x = startx;x+2 <= endx;x+=2)
3020 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3023 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail
3027 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3030 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha-blends inb over ina on 8-bit pixels using inb's own fourth channel:
// out = ina + (inb - ina) * alpha / 256, saturated to 0..255 by the final pack.
// Both the difference and the broadcast alpha are shifted left by 4 so that the
// signed high-half multiply returns (diff * alpha) >> 8.
// The loop handles two pixels per iteration; the trailing statements handle a
// single pixel (their guard is not visible in this listing, presumably x < endx).
3035 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3038 int x, startx = span->startx, endx = span->endx;
3039 for (x = startx;x+2 <= endx;x+=2)
3041 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast channel 3 of each of the two pixels across that pixel's lanes
3043 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3044 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3045 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail
3049 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3050 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3051 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3052 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3053 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends 8-bit pixels toward a constant color using that color's fourth
// component as the blend factor: out = in + (color - in) * alpha / 256,
// saturated to 0..255 by the final pack.
// color (4 floats) is scaled by 255, swizzled so components 0 and 2 trade places
// (presumably RGBA -> BGRA - confirm) and replicated for two pixels; blend is the
// alpha lane broadcast and pre-shifted left by 4 for the high-half multiply.
// The loop handles two pixels per iteration; the trailing statements handle a
// single pixel (their guard is not visible in this listing, presumably x < endx).
3058 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3061 int x, startx = span->startx, endx = span->endx;
3062 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3063 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3064 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3065 for (x = startx;x+2 <= endx;x+=2)
3067 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3068 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3069 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel tail
3073 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3074 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3075 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform, and loads the color and
// TEXCOORD0 arrays. TEXCOORD1 is loaded only when the SPECULAR permutation bit
// is set in dpsoftrast.shader_permutation.
3082 void DPSOFTRAST_VertexShader_Generic(void)
3084 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3085 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3086 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3087 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3088 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span.
// With the DIFFUSE permutation: samples the first texture (GL20TU_FIRST, attribute
// array 2), multiplies it by the interpolated attribute array 1, and - when
// SPECULAR is also set - samples the second texture (GL20TU_SECOND) and combines
// it by multiply (COLORMAPPING), add (GLOW) or alpha mix (VERTEXTEXTUREBLEND).
// The DPSOFTRAST_Draw_Span_VaryingBGRA8 call writes the interpolated attribute
// array 1 directly (its guarding else is not visible in this listing, presumably
// the non-DIFFUSE case - confirm). The span is then finished with
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the add branch previously re-tested SHADERPERMUTATION_COLORMAPPING,
// which can never be true in an else-if following the same test, so the
// AddBuffersBGRA8 call was unreachable. It now tests SHADERPERMUTATION_GLOW;
// presumably that is the bit set for additive second-texture blending - confirm
// against the code that builds shader_permutation for this shader mode.
3091 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3093 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3094 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3095 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3096 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3097 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3098 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3100 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3101 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3102 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3104 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3105 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3108 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3110 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3113 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3115 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3118 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3123 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3124 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform and loads the TEXCOORD0 and
// TEXCOORD1 arrays.
3129 void DPSOFTRAST_VertexShader_PostProcess(void)
3131 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3132 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3133 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the post-process shader for one span: samples the first
// texture (GL20TU_FIRST, attribute array 2) into the fragment buffer, optionally
// adds bloom from the second texture (GL20TU_SECOND, attribute array 3) minus the
// BloomColorSubtract uniform when the BLOOM permutation is set, blends toward the
// ViewTintColor uniform, and finishes the span.
// The SATURATION and GAMMARAMPS permutations are tested but have no visible
// implementation (see the TODO comments).
3136 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3138 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3139 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3140 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3142 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3143 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3144 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3146 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3147 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3149 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3150 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3152 // TODO: implement saturation
3154 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3156 // TODO: implement gammaramps
3158 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transforms and projects the
// position array with the ModelViewProjection matrix uniform.
3163 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3165 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills span->length*4 bytes at the
// start of the fragment color buffer and finishes the span.
3168 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3170 // this is never called (because colormask is off when this shader is used)
3171 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3172 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3173 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3174 memset(buffer_FragColorbgra8, 0, span->length*4);
3175 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform and transforms TEXCOORD0 by
// the TexMatrix uniform.
3180 void DPSOFTRAST_VertexShader_FlatColor(void)
3182 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3183 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: samples the color texture
// (GL20TU_COLOR, attribute array 2) and multiplies each channel by a constant
// taken from the Color_Ambient uniform (channels 0..2) and the Alpha uniform
// (channel 3), using 8.8 fixed point (scale by 256, shift right by 8).
// Uniform components 0 and 2 are stored swapped in Color_Ambienti so they line
// up with the pixel byte order (presumably RGB uniform vs BGR pixels - confirm).
3186 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3188 int x, startx = span->startx, endx = span->endx;
3189 int Color_Ambienti[4];
3190 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3191 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3192 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3193 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3194 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3195 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3196 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3197 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3198 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3199 for (x = startx;x < endx;x++)
3201 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3202 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3203 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3204 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3206 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform, loads the color array, and
// transforms TEXCOORD0 by the TexMatrix uniform.
3211 void DPSOFTRAST_VertexShader_VertexColor(void)
3213 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3214 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3215 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
//   pixel = texture * (Color_Diffuse * vertexcolor + Color_Ambient)
// computed in 16-bit fixed point with SSE2.
// Output goes straight to the framebuffer row (dpsoftrast.fb_colorpixels[0])
// unless alpha test is on or the blend mode is not opaque; in that case it goes
// to buffer_FragColorbgra8, which is then handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The loop has a 4-pixel path (taken when four pixels remain and the four
// pixelmask bytes at x are all 0x01) and a single-pixel path.
// NOTE(review): this listing elides lines (declarations of data/slope/mod2/pix2,
// the else between the two paths, and any extra x advance in the 4-pixel path
// are not visible).
3218 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3221 unsigned char * RESTRICT pixelmask = span->pixelmask;
3222 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3223 int x, startx = span->startx, endx = span->endx;
3224 __m128i Color_Ambientm, Color_Diffusem;
3226 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3230 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3232 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3233 pixel = buffer_FragColorbgra8;
// ambient: uniform * 256 with components 0 and 2 swapped; lane 3 is replaced by Alpha uniform * 255
3234 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3235 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3236 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3237 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: uniform * 4096 with components 0 and 2 swapped; lane 3 cleared
3238 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3239 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3240 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolated vertex color at startx and its per-pixel slope, both scaled by 4096
3241 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3242 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3243 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3244 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3245 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3246 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3247 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3249 __m128i color, mod, pix;
3250 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// 4-pixel path: data is stepped three times here; the loop increment supplies the fourth step
3253 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3254 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3255 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3256 data = _mm_add_ps(data, slope);
3257 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3258 data = _mm_add_ps(data, slope);
3259 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3260 data = _mm_add_ps(data, slope);
3261 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3262 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3263 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3264 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3265 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3266 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3272 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3273 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3274 mod = _mm_packs_epi32(mod, mod);
3275 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3276 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered path needs the generic finish step
3278 if(pixel == buffer_FragColorbgra8)
3279 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform, transforms TEXCOORD0 by the
// TexMatrix uniform, and loads TEXCOORD4 (used by the pixel stage to sample the
// lightmap texture).
3285 void DPSOFTRAST_VertexShader_Lightmap(void)
3287 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3288 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3289 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
//   pixel = color * (Color_Diffuse * lightmap + Color_Ambient)
// and, when the GLOW permutation is set, additionally + Color_Glow * glow.
// All math is 16-bit fixed point with SSE2; texture bytes are widened into the
// high byte of each 16-bit lane so _mm_mulhi_epu16 performs the /256 scaling.
// Output goes straight to the framebuffer row (dpsoftrast.fb_colorpixels[0])
// unless alpha test is on or the blend mode is not opaque; in that case it goes
// to buffer_FragColorbgra8, which is then handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Each loop has a 4-pixel path (taken when four pixels remain and the four
// pixelmask bytes at x are all 0x01) and a single-pixel path.
// NOTE(review): this listing elides lines (the pix2 declaration, the else between
// the glow and non-glow loops, the else between the 4-pixel and 1-pixel paths,
// and any extra x advance in the 4-pixel paths are not visible).
3292 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3295 unsigned char * RESTRICT pixelmask = span->pixelmask;
3296 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3297 int x, startx = span->startx, endx = span->endx;
3298 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3299 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3300 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3302 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3303 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3304 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3305 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3306 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3307 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3308 pixel = buffer_FragColorbgra8;
// ambient: uniform * 256 with components 0 and 2 swapped; lane 3 is replaced by Alpha uniform * 255
3309 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3310 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3311 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3312 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: uniform * 256 with components 0 and 2 swapped; lane 3 cleared
3313 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3314 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3315 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3316 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3318 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3319 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3320 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3321 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow color; used by the single-pixel glow path
3322 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3323 for (x = startx;x < endx;x++)
3325 __m128i color, lightmap, glow, pix;
3326 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3329 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3330 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3331 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3332 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3333 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3334 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3335 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3336 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3337 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3338 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel glow path: the lit color term sits in the low half and the glow
// term in the high half of one register; the two halves are then summed
3344 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3345 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3346 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3347 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3348 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3349 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// loop without the glow term
3354 for (x = startx;x < endx;x++)
3356 __m128i color, lightmap, pix;
3357 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3360 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3361 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3362 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3363 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3364 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3365 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3366 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3372 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3373 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3374 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3375 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered path needs the generic finish step
3378 if(pixel == buffer_FragColorbgra8)
3379 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only transforms and projects the
// position array with the ModelViewProjection matrix uniform.
3385 void DPSOFTRAST_VertexShader_FakeLight(void)
3387 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: no lighting is computed in the visible
// lines; it zero-fills span->length*4 bytes at the start of the fragment color
// buffer and finishes the span.
3390 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3393 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3394 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3395 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3396 memset(buffer_FragColorbgra8, 0, span->length*4);
3397 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light direction maps: delegates to the plain
// lightmap vertex shader.
3402 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3404 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for model-space light direction maps: delegates to the plain
// lightmap pixel shader with the same arguments.
3407 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3409 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light direction maps: delegates to the plain
// lightmap vertex shader.
3415 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3417 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for tangent-space light direction maps: delegates to the plain
// lightmap pixel shader with the same arguments.
3420 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3422 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the light-direction shader.
// Loads position and TEXCOORD1..3 unchanged and transforms TEXCOORD0 by the
// TexMatrix uniform. Then, for every vertex, it reads three vectors from
// TEXCOORD1/2/3 (named svector, tvector, normal) and uses them as a basis:
//   TEXCOORD1 <- LightDir uniform projected onto (svector, tvector, normal), w = 0
//   TEXCOORD2 <- (EyePosition uniform - position) projected onto the same basis, w = 0
// Finally the position array is transformed and projected with the
// ModelViewProjection matrix uniform (the call passes -1 as its second argument).
// Only components 0..2 of LightDir and EyePosition are used in the visible lines.
// NOTE(review): this listing elides lines (declarations of i, LightDir, position,
// svector, tvector, normal and EyeVector are not visible).
3428 void DPSOFTRAST_VertexShader_LightDirection(void)
3431 int numvertices = dpsoftrast.numvertices;
3433 float LightVector[4];
3434 float EyePosition[4];
3435 float EyeVectorModelSpace[4];
3441 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3442 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3443 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3444 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3445 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3446 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3447 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3448 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3449 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3450 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3451 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3452 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3453 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3454 for (i = 0;i < numvertices;i++)
// fetch the vertex position and its basis vectors before they are overwritten below
3456 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3457 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3458 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3459 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3460 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3461 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3462 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3463 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3464 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3465 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3466 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3467 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light direction expressed in the (svector, tvector, normal) basis
3468 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3469 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3470 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3471 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3472 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3473 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3474 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, expressed in the same basis
3475 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3476 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3477 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3478 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3479 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3480 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3481 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3482 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3483 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3484 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): second argument is -1 here where other shaders pass
// DPSOFTRAST_ARRAY_POSITION; presumably "use the already-loaded array" - confirm
// against DPSOFTRAST_Array_TransformProject.
3486 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar / 3-component vector helpers used by the pixel shaders in this file.
// NOTE: Min and Max expand each argument more than once, so an argument with
// side effects (e.g. i++) would be evaluated repeatedly.
3489 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3490 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of a and b (element [3] is ignored).
3491 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length of v, i.e. dot(v, v); v is expanded twice.
3492 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length of v; calls the double-precision sqrt().
3493 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro operating on v; it starts by computing the length of v.
// NOTE(review): only the length computation is visible in this listing; the
// rest of the macro body is not shown. Presumably it scales v to unit length
// in place - confirm against the full file.
3494 #define DPSOFTRAST_Vector3Normalize(v)\
3497 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the "light direction" mode: shades one span of a triangle
// and hands the resulting BGRA8 pixels to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread state; supplies uniform4f[] and shader_permutation
// triangle - triangle being drawn (passed through to the span/attribute helpers)
// span     - span to shade; pixels startx..endx-1 are processed
//
// Depending on thread->shader_permutation one of three per-pixel loops runs:
//   SPECULAR      : ambient + diffuse + specular terms (normal and gloss maps sampled)
//   DIFFUSE       : ambient + diffuse terms (normal map sampled)
//   remaining loop: ambient term only
// COLORMAPPING adds pants/shirt layers to the diffuse texel (specular loop only, as
// far as visible here) and GLOW adds a glow texture term to the final color.
//
// NOTE(review): this listing omits short lines (braces, "else", and the
// declarations/assignments of z, d, glosstex, eyenormal, diffuse, specular).
// Comments below describe only what the visible lines do.
3508 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: one float of depth data and four bytes (BGRA) per pixel for each sampled texture
3510 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3511 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3514 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3515 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3516 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3517 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3518 int x, startx = span->startx, endx = span->endx;
3519 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation base and per-x slope for the TEXCOORD1 (light vector) and TEXCOORD2 (eye vector) attributes
3520 float LightVectordata[4];
3521 float LightVectorslope[4];
3522 float EyeVectordata[4];
3523 float EyeVectorslope[4];
3525 float diffusetex[4];
3527 float surfacenormal[4];
3528 float lightnormal[4];
3530 float specularnormal[4];
3533 float SpecularPower;
// Color uniforms are stored with components 0 and 2 swapped (uniform +0 goes to
// index 2, uniform +2 goes to index 0) so that index n lines up with byte n of
// the *bgra8 texture buffers.
3535 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3536 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3537 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3538 Color_Glow[3] = 0.0f;
3539 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3540 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3541 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the alpha slot of the ambient color carries the Alpha uniform; it scales the output alpha in every path
3542 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3543 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3544 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3545 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3546 Color_Pants[3] = 0.0f;
3547 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3548 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3549 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3550 Color_Shirt[3] = 0.0f;
// start the span (presumably fills buffer_z for the span - confirm in DPSOFTRAST_Draw_Span_Begin),
// then sample the base color texture using TEXCOORD0
3551 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3552 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional layers: pants/shirt textures are only sampled when COLORMAPPING is enabled
3553 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3555 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3556 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// the glow texture is only sampled when GLOW is enabled
3558 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3560 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3562 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3564 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3565 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3566 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3567 Color_Diffuse[3] = 0.0f;
3568 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3569 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3570 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3571 LightColor[3] = 0.0f;
3572 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3573 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3574 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3575 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3576 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3577 Color_Specular[3] = 0.0f;
// pre-divide by 255 because the exponent below multiplies by the gloss alpha byte (0..255)
3578 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3579 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3580 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3581 for (x = startx;x < endx;x++)
// NOTE(review): z is used below but its assignment is not visible in this listing;
// presumably it is taken from buffer_z[x] - confirm.
3584 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3585 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3586 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3587 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3588 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the tinted pants and shirt texels to the base texel (alpha tints are 0, so alpha is unchanged)
3590 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3591 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3592 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3593 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3595 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3596 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3597 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3598 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: bytes are remapped by b/128 - 1 and the B/R bytes are swapped into x/z order
3599 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3600 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3601 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3602 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolate the light vector attribute at this x and scale by z
3604 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3605 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3606 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3607 DPSOFTRAST_Vector3Normalize(lightnormal);
// same interpolation for the eye vector attribute
3609 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3610 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3611 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3612 DPSOFTRAST_Vector3Normalize(eyenormal);
// specular direction is the sum of the light and eye vectors, passed through Normalize
3614 specularnormal[0] = lightnormal[0] + eyenormal[0];
3615 specularnormal[1] = lightnormal[1] + eyenormal[1];
3616 specularnormal[2] = lightnormal[2] + eyenormal[2];
3617 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped to be non-negative before use
3619 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3620 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform * (gloss alpha byte / 255)
3621 specular = pow(specular, SpecularPower * glosstex[3]);
// combine terms per channel; results are clamped to 255 on the upper side only
3622 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3624 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3625 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3626 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3627 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term (presumably the else branch of the GLOW test; the "else" line is not visible here)
3631 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3632 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3633 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3634 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the pixel in the output span buffer
3636 buffer_FragColorbgra8[x*4+0] = d[0];
3637 buffer_FragColorbgra8[x*4+1] = d[1];
3638 buffer_FragColorbgra8[x*4+2] = d[2];
3639 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (no specular, no gloss map, no eye vector) ----
3642 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3644 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3645 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3646 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3647 Color_Diffuse[3] = 0.0f;
3648 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3649 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3650 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3651 LightColor[3] = 0.0f;
3652 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3653 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3654 for (x = startx;x < endx;x++)
// note: no pants/shirt contribution is added to diffusetex in the visible lines of this path
3657 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3658 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3659 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3660 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode and normalize the normal map texel (same remapping as in the specular path)
3661 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3662 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3663 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3664 DPSOFTRAST_Vector3Normalize(surfacenormal);
3666 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3667 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3668 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3669 DPSOFTRAST_Vector3Normalize(lightnormal);
3671 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3672 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3674 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3675 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3676 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3677 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3681 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3682 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3683 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3684 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3686 buffer_FragColorbgra8[x*4+0] = d[0];
3687 buffer_FragColorbgra8[x*4+1] = d[1];
3688 buffer_FragColorbgra8[x*4+2] = d[2];
3689 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (presumably the final else branch; the "else" line is not visible in this listing) ----
3694 for (x = startx;x < endx;x++)
3697 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3698 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3699 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3700 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3702 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3704 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3705 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3706 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3707 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3711 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3712 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3713 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3714 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3716 buffer_FragColorbgra8[x*4+0] = d[0];
3717 buffer_FragColorbgra8[x*4+1] = d[1];
3718 buffer_FragColorbgra8[x*4+2] = d[2];
3719 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
3722 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the "light source" mode. Operates on the global dpsoftrast
// state (no parameters, no return value):
//   - TEXCOORD0 is transformed by the TexMatrix uniform
//   - TEXCOORD1 is overwritten with the vertex-to-light vector expressed in the
//     per-vertex (svector, tvector, normal) basis
//   - TEXCOORD2 is overwritten with the vertex-to-eye vector in the same basis
//   - POSITION is transformed/projected by the ModelViewProjection matrix
//   - TEXCOORD3 is produced from POSITION using the ModelToLight matrix
//
// NOTE(review): this listing omits short lines (braces and the declarations of
// i, position, svector, tvector, normal, EyeVector). Comments describe only the
// visible lines.
3727 void DPSOFTRAST_VertexShader_LightSource(void)
3730 int numvertices = dpsoftrast.numvertices;
3731 float LightPosition[4];
3732 float LightVector[4];
3733 float LightVectorModelSpace[4];
3734 float EyePosition[4];
3735 float EyeVectorModelSpace[4];
// fetch the light and eye positions from the uniforms; only elements 0..2 are used below
3741 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3742 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3743 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3744 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3745 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3746 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3747 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3748 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// bring the input arrays into post_array4f; TEXCOORD0 is additionally multiplied by the texture matrix
3749 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3750 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3751 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3752 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3753 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3754 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3755 for (i = 0;i < numvertices;i++)
// read this vertex's position and its basis vectors: TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal
3757 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3758 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3759 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3760 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3761 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3762 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3763 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3764 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3765 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3766 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3767 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3768 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light, then its dot products with svector/tvector/normal
3769 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3770 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3771 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3772 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3773 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3774 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 is overwritten here; svector was already copied into a local above, so later reads are unaffected
3775 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3776 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3777 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3778 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, expressed in the same basis
3779 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3780 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3781 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3782 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3783 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3784 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 is overwritten with the eye vector (tvector was copied into a local above)
3785 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3786 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3787 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3788 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// project positions with the ModelViewProjection matrix (input array index -1 is passed; its exact meaning is defined by DPSOFTRAST_Array_TransformProject)
3790 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (the normal, no longer needed) is replaced by POSITION transformed with the ModelToLight matrix
3791 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3794 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3797 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3798 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3799 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3800 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3801 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3802 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3803 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3804 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3805 int x, startx = span->startx, endx = span->endx;
3806 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3807 float CubeVectordata[4];
3808 float CubeVectorslope[4];
3809 float LightVectordata[4];
3810 float LightVectorslope[4];
3811 float EyeVectordata[4];
3812 float EyeVectorslope[4];
3814 float diffusetex[4];
3816 float surfacenormal[4];
3817 float lightnormal[4];
3819 float specularnormal[4];
3822 float SpecularPower;
3823 float CubeVector[4];
3826 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3827 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3828 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3829 Color_Glow[3] = 0.0f;
3830 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3831 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3832 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3833 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3834 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3835 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3836 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3837 Color_Diffuse[3] = 0.0f;
3838 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3839 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3840 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3841 Color_Specular[3] = 0.0f;
3842 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3843 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3844 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3845 Color_Pants[3] = 0.0f;
3846 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3847 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3848 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3849 Color_Shirt[3] = 0.0f;
3850 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3851 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3852 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3853 LightColor[3] = 0.0f;
3854 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3855 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3856 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3857 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3858 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3859 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3860 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3861 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3863 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3864 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3866 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3867 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3868 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3870 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3871 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3872 for (x = startx;x < endx;x++)
3875 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3876 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3877 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3878 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3879 if (attenuation < 0.01f)
3881 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3883 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3884 if (attenuation < 0.01f)
3888 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3889 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3890 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3891 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3892 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3894 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3895 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3896 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3897 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3899 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3900 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3901 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3902 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3903 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3904 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3905 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3906 DPSOFTRAST_Vector3Normalize(surfacenormal);
3908 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3909 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3910 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3911 DPSOFTRAST_Vector3Normalize(lightnormal);
3913 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3914 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3915 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3916 DPSOFTRAST_Vector3Normalize(eyenormal);
3918 specularnormal[0] = lightnormal[0] + eyenormal[0];
3919 specularnormal[1] = lightnormal[1] + eyenormal[1];
3920 specularnormal[2] = lightnormal[2] + eyenormal[2];
3921 DPSOFTRAST_Vector3Normalize(specularnormal);
3923 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3924 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3925 specular = pow(specular, SpecularPower * glosstex[3]);
3926 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3928 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3929 attenuation *= (1.0f / 255.0f);
3930 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3931 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3932 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3933 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3937 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3938 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3939 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3940 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3942 buffer_FragColorbgra8[x*4+0] = d[0];
3943 buffer_FragColorbgra8[x*4+1] = d[1];
3944 buffer_FragColorbgra8[x*4+2] = d[2];
3945 buffer_FragColorbgra8[x*4+3] = d[3];
3948 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3950 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3951 for (x = startx;x < endx;x++)
3954 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3955 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3956 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3957 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3958 if (attenuation < 0.01f)
3960 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3962 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3963 if (attenuation < 0.01f)
3967 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3968 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3969 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3970 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3971 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3973 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3974 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3975 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3976 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3978 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3979 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3980 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3981 DPSOFTRAST_Vector3Normalize(surfacenormal);
3983 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3984 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3985 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3986 DPSOFTRAST_Vector3Normalize(lightnormal);
3988 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3989 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3991 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3992 attenuation *= (1.0f / 255.0f);
3993 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3994 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3995 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3996 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4000 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4001 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4002 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4003 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4005 buffer_FragColorbgra8[x*4+0] = d[0];
4006 buffer_FragColorbgra8[x*4+1] = d[1];
4007 buffer_FragColorbgra8[x*4+2] = d[2];
4008 buffer_FragColorbgra8[x*4+3] = d[3];
4013 for (x = startx;x < endx;x++)
4016 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4017 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4018 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4019 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4020 if (attenuation < 0.01f)
4022 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4024 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4025 if (attenuation < 0.01f)
4029 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4030 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4031 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4032 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4033 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4035 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4036 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4037 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4038 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4040 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4042 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4043 attenuation *= (1.0f / 255.0f);
4044 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4045 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4046 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4047 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4051 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4052 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4053 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4054 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4056 buffer_FragColorbgra8[x*4+0] = d[0];
4057 buffer_FragColorbgra8[x*4+1] = d[1];
4058 buffer_FragColorbgra8[x*4+2] = d[2];
4059 buffer_FragColorbgra8[x*4+3] = d[3];
4062 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the refraction entry of DPSOFTRAST_ShaderModeTable.
// The only visible work is a single DPSOFTRAST_Array_TransformProject call that
// names DPSOFTRAST_ARRAY_POSITION for both array arguments and passes the uniform4f
// block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (presumably the MVP matrix).
4068 void DPSOFTRAST_VertexShader_Refraction(void)
4070 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the refraction shader mode.
// The visible code is a placeholder: it begins the span, fills the BGRA8 fragment
// buffer with zero bytes for span->length pixels, and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8. No refraction effect is computed here.
4073 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is filled by DPSOFTRAST_Draw_Span_Begin and not read again in this function
4076 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// four bytes (B, G, R, A) per pixel of the span
4077 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4078 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4079 memset(buffer_FragColorbgra8, 0, span->length*4);
4080 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the water entry of DPSOFTRAST_ShaderModeTable.
// It issues one DPSOFTRAST_Array_TransformProject call on DPSOFTRAST_ARRAY_POSITION
// (named for both array arguments) using the uniform4f block at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1; no other arrays are touched.
4085 void DPSOFTRAST_VertexShader_Water(void)
4087 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the water shader mode.
// Placeholder in the visible code: the fragment colour buffer is zero-filled for
// span->length pixels and passed straight to DPSOFTRAST_Draw_Span_FinishBGRA8.
4091 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// written by DPSOFTRAST_Draw_Span_Begin; unused afterwards in this function
4094 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// BGRA8 output, four bytes per pixel
4095 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4096 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4097 memset(buffer_FragColorbgra8, 0, span->length*4);
4098 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the show-depth entry of DPSOFTRAST_ShaderModeTable.
// It issues one DPSOFTRAST_Array_TransformProject call on DPSOFTRAST_ARRAY_POSITION
// (named for both array arguments) using the uniform4f block at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4103 void DPSOFTRAST_VertexShader_ShowDepth(void)
4105 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the show-depth shader mode.
// Placeholder in the visible code: despite the name, no depth value is turned into
// a colour here; the output buffer is zero-filled for span->length pixels and
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
4108 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// written by DPSOFTRAST_Draw_Span_Begin; unused afterwards in this function
4111 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// BGRA8 output, four bytes per pixel
4112 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4113 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4114 memset(buffer_FragColorbgra8, 0, span->length*4);
4115 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-geometry entry of DPSOFTRAST_ShaderModeTable.
// It issues one DPSOFTRAST_Array_TransformProject call on DPSOFTRAST_ARRAY_POSITION
// (named for both array arguments) using the uniform4f block at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4120 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4122 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the deferred-geometry shader mode.
// Placeholder in the visible code: the output buffer is zero-filled for
// span->length pixels and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
4125 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// written by DPSOFTRAST_Draw_Span_Begin; unused afterwards in this function
4128 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// BGRA8 output, four bytes per pixel
4129 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4130 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4131 memset(buffer_FragColorbgra8, 0, span->length*4);
4132 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-light-source entry of DPSOFTRAST_ShaderModeTable.
// It issues one DPSOFTRAST_Array_TransformProject call on DPSOFTRAST_ARRAY_POSITION
// (named for both array arguments) using the uniform4f block at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4137 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4139 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the deferred-light-source shader mode.
// Placeholder in the visible code: the output buffer is zero-filled for
// span->length pixels and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
4142 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// written by DPSOFTRAST_Draw_Span_Begin; unused afterwards in this function
4145 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// BGRA8 output, four bytes per pixel
4146 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4148 memset(buffer_FragColorbgra8, 0, span->length*4);
4149 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; DPSOFTRAST_ShaderModeTable holds one per shader mode.
// NOTE(review): DPSOFTRAST_Interpret_Draw reads a lodarrayindex member of this struct,
// and every table row starts with an integer before the two function pointers, but
// that member's declaration is not among the lines visible here.
4154 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage entry point; called by DPSOFTRAST_DrawTriangles for the current shader mode
4157 void (*Vertex)(void);
// per-span pixel-stage entry point; called by DPSOFTRAST_Draw_ProcessSpans
4158 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices this mode interpolates; readers compare each entry
// against DPSOFTRAST_ARRAY_TOTAL, and the table writes ~0 after the last real entry
4159 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices this mode samples; readers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table writes ~0 after the last real entry
4160 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4162 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (SHADERMODE_COUNT rows).
// Each row is: a leading integer (2 in every row; presumably the lodarrayindex that
// DPSOFTRAST_Interpret_Draw reads), the vertex-stage function, the span-stage function,
// the list of vertex arrays to interpolate, and the list of texture units in use.
// Both lists end with ~0, which stored in an unsigned char is 255; readers test for
// a value >= DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS to detect it.
// Elements after the last written initializer are zero-initialized by the language.
// The last five rows (placeholder shaders) previously left out the texture-unit list,
// so it was all zeros and texture unit 0 looked "in use"; they now carry an explicit
// {~0} like the other rows that use no textures.
4164 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4166 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4167 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4168 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4169 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4170 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4171 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4172 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4173 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4174 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4175 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4176 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4177 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}, {~0}},
4178 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}, {~0}},
4179 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4180 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4181 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}}
// Shades every span queued in thread->spans[0 .. thread->numspans) and then resets
// thread->numspans to 0.
// For each span there are two paths:
//  - depth testing enabled and a depth buffer present: build a per-pixel pass mask
//    from the depth comparison, trim the span to the first/last passing pixel, run
//    the current shader mode's Span function, then run the depth-write loop when
//    thread->depthmask is set;
//  - otherwise: mark every pixel as passing and run the Span function.
// In both paths the Span function is only called when colour buffer 0 exists and
// thread->fb_colormask is non-zero.
// NOTE(review): this listing omits short lines (braces, else, some assignments and
// loop bodies), so a few statements below refer to code that is not shown.
4184 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4191 // unsigned int *colorpixel;
4192 unsigned int *depthpixel;
4198 DPSOFTRAST_State_Triangle *triangle;
4199 DPSOFTRAST_State_Span *span;
// one byte per pixel of the current span; non-zero means the pixel passed the depth test
4200 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4201 for (i = 0; i < thread->numspans; i++)
4203 span = &thread->spans[i];
// span->triangle is an index into this thread's triangle buffer
4204 triangle = &thread->triangles[span->triangle];
4205 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w is a plane equation: w[0] = slope in x, w[1] = slope in y, w[2] = origin
4207 wslope = triangle->w[0];
4208 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth at the span start and its per-pixel step; the start value also
// includes the polygon offset terms scaled by DPSOFTRAST_DEPTHOFFSET
4209 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4210 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// first depth-buffer pixel of this span (rows are fb_width pixels apart)
4211 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// fill pixelmask by comparing the stored depth against the interpolated depth d
4212 switch(thread->fb_depthfunc)
4215 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4216 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4217 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4218 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4219 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4220 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4221 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4223 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4224 //for (x = 0;x < span->length;x++)
4225 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4226 // if there is no color buffer, skip pixel shader
// shrink [startx, endx) past failing pixels at both ends of the span
// (the initial assignment of startx and the loop bodies are not visible in this listing)
4228 endx = span->length;
4229 while (startx < endx && !pixelmask[startx])
4231 while (endx > startx && !pixelmask[endx-1])
4234 continue; // no pixels to fill
4235 span->pixelmask = pixelmask;
4236 span->startx = startx;
4238 // run pixel shader if appropriate
4239 // do this before running depthmask code, to allow the pixelshader
4240 // to clear pixelmask values for alpha testing
4241 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4242 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth-write pass over the trimmed range (loop body not visible in this listing)
4243 if (thread->depthmask)
4244 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4250 // no depth testing means we're just dealing with color...
4251 // if there is no color buffer, skip pixel shader
4252 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes
4254 memset(pixelmask, 1, span->length);
4255 span->pixelmask = pixelmask;
4257 span->endx = span->length;
4258 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue has been consumed
4262 thread->numspans = 0;
// Declares the Draw command (number 22) and its payload fields through the
// DEFCOMMAND macro, which is defined outside this listing. Code below refers to the
// result as DPSOFTRAST_Command_Draw / DPSOFTRAST_OPCODE_Draw.
// Fields, as used by the visible code:
//   starty/endy  - row range the draw may touch, clamped to the framebuffer height
//   refcount     - set to the thread count; each thread decrements it once when done
//   clipped      - copied from dpsoftrast.drawclipped
//   firstvertex, numvertices, numtriangles - geometry extents
//   arrays       - start of the per-vertex float data for this draw
//   element3i / element3s - 32-bit or 16-bit triangle indices (the unused one is NULL)
4265 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of one rasterizer thread.
// The thread owns the framebuffer rows [miny, maxy). For every triangle of the
// command the function clips against the near plane, computes a bounding box clamped
// to the thread's rows, sets up plane equations for w and for each interpolated
// array, picks mip levels, and emits spans into thread->spans, calling
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle buffer fills up.
// When finished (or when the command does not touch this thread's rows) the
// command's refcount is decremented; the thread that brings it to zero frees
// command->arrays if the command is header-sized (payload allocated separately).
// NOTE(review): this listing omits short lines (braces, else, return, switch
// headers, small declarations), so some control flow below is not shown.
4267 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4270 int cullface = thread->cullface;
4271 int width = dpsoftrast.fb_width;
// the framebuffer rows are divided evenly between threads by thread->index
4272 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4273 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4274 __m128i fbmin, fbmax;
4275 __m128 viewportcenter, viewportscale;
4276 int firstvertex = command->firstvertex;
4277 int numvertices = command->numvertices;
4278 int numtriangles = command->numtriangles;
4279 const int *element3i = command->element3i;
4280 const unsigned short *element3s = command->element3s;
4281 int clipped = command->clipped;
4292 __m128 triangleedge1, triangleedge2, trianglenormal;
4295 DPSOFTRAST_State_Triangle *triangle;
4296 DPSOFTRAST_Texture *texture;
// the command's row range does not intersect this thread's rows:
// only release this thread's reference to the command
4297 if (command->starty >= maxy || command->endy <= miny)
4299 if (!ATOMIC_DECREMENT(command->refcount))
// header-sized command means the vertex data was allocated separately
4301 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4302 MM_FREE(command->arrays);
4306 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp rectangle as packed 16-bit (x, y) pairs: min = (0, miny), max = (width-1, maxy-1)
4307 fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4308 fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4309 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4310 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4311 screen[3] = _mm_setzero_ps();
4312 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4313 for (i = 0;i < numtriangles;i++)
// command->arrays begins with one float[4] screen coordinate per vertex; the block
// that follows it is read below as four floats per vertex whose z and w components
// give the near-plane distance
4315 const float *screencoord4f = command->arrays;
4316 const float *arrays = screencoord4f + numvertices*4;
4318 // generate the 3 edges of this triangle
4319 // generate spans for the triangle - switch based on left split or right split classification of triangle
// 16-bit index path: indices are rebased by firstvertex
// (the condition choosing between the two index paths is not visible in this listing)
4322 e[0] = element3s[i*3+0] - firstvertex;
4323 e[1] = element3s[i*3+1] - firstvertex;
4324 e[2] = element3s[i*3+2] - firstvertex;
// 32-bit index path
4328 e[0] = element3i[i*3+0] - firstvertex;
4329 e[1] = element3i[i*3+1] - firstvertex;
4330 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//  SKIPBACKFACE      - builds the two edges that meet at screen[1], puts a cross-product
//                      component in the scalar lane of trianglenormal and tests its sign
//                      (the action taken on each sign is not visible in this listing)
//  CLIPPEDVERTEXLERP - computes the fraction along edge p1->p2 where the near-plane
//                      distance reaches zero, interpolates the vertex there and projects
//                      it into screen[k] with DPSOFTRAST_PROJECTVERTEX
//  CLIPPEDVERTEXCOPY - loads the stored screen coordinate of vertex p1 into screen[k]
4339 #define SKIPBACKFACE \
4340 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4341 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4342 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4343 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4344 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4348 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4352 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4357 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4358 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4360 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4361 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4363 #define CLIPPEDVERTEXCOPY(k,p1) \
4364 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// Attribute counterparts of the macros above: GENATTRIBCOPY loads a vertex attribute
// unchanged, GENATTRIBLERP interpolates it with the clipfrac stored for that edge, and
// GENATTRIBS picks the combination per clip case (its switch header is not visible here)
4366 #define GENATTRIBCOPY(attrib, p1) \
4367 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4368 #define GENATTRIBLERP(attrib, p1, p2) \
4370 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4371 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4373 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4377 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4378 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4379 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4380 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4381 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4382 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4383 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4389 // calculate distance from nearplane
// distance is z + w of each vertex; a negative value means behind the near plane
4390 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4391 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4392 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify by which vertices are in front; each case fills screen[] with the
// three or four points of the clipped polygon
4393 if (clipdist[0] >= 0.0f)
4395 if (clipdist[1] >= 0.0f)
4397 if (clipdist[2] >= 0.0f)
4400 // triangle is entirely in front of nearplane
4401 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4408 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4416 if (clipdist[2] >= 0.0f)
4418 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4425 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4432 else if (clipdist[1] >= 0.0f)
4434 if (clipdist[2] >= 0.0f)
4436 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4443 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4449 else if (clipdist[2] >= 0.0f)
4451 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4456 else continue; // triangle is entirely behind nearplane
4459 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of each point as 16-bit pairs; when there are only
// three points, point 2 is duplicated into the fourth slot. The min/max reductions give
// the bounding box, which is then clamped to this thread's rectangle.
4460 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4461 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4462 screenmin = _mm_min_epi16(screeni, screenir),
4463 screenmax = _mm_max_epi16(screeni, screenir);
4464 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4465 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4466 screenmin = _mm_max_epi16(screenmin, fbmin);
4467 screenmax = _mm_min_epi16(screenmax, fbmax);
4468 // skip offscreen triangles
4469 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// rows covered by the clamped bounding box: [starty, endy)
4471 starty = _mm_extract_epi16(screenmin, 1);
4472 endy = _mm_extract_epi16(screenmax, 1)+1;
// sign-extended integer row of each point, one per 32-bit lane
4473 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in this thread's triangle buffer
4476 triangle = &thread->triangles[thread->numtriangles];
4478 // calculate attribute plans for triangle data...
4479 // okay, this triangle is going to produce spans, we'd better project
4480 // the interpolants now (this is what gives perspective texturing),
4481 // this consists of simply multiplying all arrays by the W coord
4482 // (which is basically 1/Z), which will be undone per-pixel
4483 // (multiplying by Z again) to get the perspective-correct array
4486 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4487 __m128 mipedgescale, mipdensity;
// the two screen-space edge vectors divided by the scalar held in lane 0 of
// trianglenormal; the four quotients are broadcast into separate registers
4488 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4489 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4490 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4491 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4492 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast the fourth component (w) of each screen point
4493 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4494 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4495 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4496 attribedge1 = _mm_sub_ss(w0, w1);
4497 attribedge2 = _mm_sub_ss(w2, w1);
4498 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4499 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4500 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4501 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4502 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane equation for w: [0] = slope in x, [1] = slope in y, [2] = value at the origin
4503 _mm_store_ss(&triangle->w[0], attribxslope);
4504 _mm_store_ss(&triangle->w[1], attribyslope);
4505 _mm_store_ss(&triangle->w[2], attriborigin);
4506 mipedgescale = _mm_setzero_ps();
// build the same kind of plane equation for every array listed for the current
// shader mode, with each vertex value multiplied by that vertex's w first
4507 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4509 __m128 attrib0, attrib1, attrib2;
4510 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4511 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next per-vertex array in the command's data
4513 arrays += numvertices*4;
4514 GENATTRIBS(attrib0, attrib1, attrib2);
4515 attriborigin = _mm_mul_ps(attrib1, w1);
4516 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4517 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4518 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4519 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4520 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4521 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4522 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4523 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array the shader mode names as lodarrayindex, keep the attribute change
// along both edges scaled by the reciprocal screen-space edge length (used for mip selection)
4524 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4526 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4527 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4528 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4529 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit listed for the current shader mode;
// units left untouched keep mip level 0
4533 memset(triangle->mip, 0, sizeof(triangle->mip));
4534 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4536 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4537 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4539 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed level
4540 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2..3] (presumably the base level's
// width and height), square, and take the smaller of the two per-edge sums
4542 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4543 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4544 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4545 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4546 // this will be multiplied in the texturing routine by the texture resolution
4547 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), i.e. log2 of the square root, clamped to the last mip level
4550 y = (int)(log((float)y)*0.5f/M_LN2);
4551 if (y > texture->mipmaps - 1)
4552 y = texture->mipmaps - 1;
4553 triangle->mip[texunit] = y;
// walk the rows of the bounding box; for each group of rows pick the two polygon
// edges (edge0 and edge1, each from point "p" to point "n") that bound the row
4559 for (y = starty; y < endy;)
4561 __m128 xcoords, xslope;
// per point: all-ones when y is greater than that point's integer row
4562 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4563 int yccmask = _mm_movemask_epi8(ycc);
4564 int edge0p, edge0n, edge1p, edge1n;
// edge table with point index 3 in use (four-point polygon); the enclosing
// switch/condition lines are not visible in this listing
4571 case 0xFFFF: /*0000*/ y = endy; continue;
4572 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4573 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4574 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4575 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4576 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4577 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4578 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4579 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4580 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4581 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4582 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4583 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4584 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4585 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4586 case 0x0000: /*1111*/ y++; continue;
// edge table using only point indices 0..2 (three-point polygon)
4594 case 0xFFFF: /*000*/ y = endy; continue;
4595 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4596 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4597 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4598 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4599 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4600 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4601 case 0x0000: /*111*/ y++; continue;
// nexty: last row handled with this edge pair, derived from the point rows and
// capped at endy-1 (presumably the row of the next vertex below y - confirm)
4604 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4605 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4606 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4607 nexty = _mm_extract_epi16(ycc, 0);
4608 if(nexty >= endy) nexty = endy-1;
// compares the larger x of edge0 with the smaller x of edge1
// (the statements guarded by this test are not visible in this listing)
4609 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// xslope: dx/dy of edge0 in lane 0 and of edge1 in lane 2;
// xcoords: x of both edges at row y, plus 0.5 before conversion to int
4618 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4619 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4620 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4621 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4622 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4623 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4625 int startx, endx, offset;
// lane 0 is the edge0 x, lane 2 the edge1 x; clamp to [0, fb_width]
4626 startx = _mm_cvtss_si32(xcoords);
4627 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4628 if (startx < 0) startx = 0;
4629 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4630 if (startx >= endx) continue;
// split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4631 for (offset = startx; offset < endx;)
4633 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4634 span->triangle = thread->numtriangles;
4637 span->length = endx - offset;
4638 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4639 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4640 offset += span->length;
// span buffer full: shade everything queued so far
4641 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4642 DPSOFTRAST_Draw_ProcessSpans(thread);
// the triangle slot is now in use; flush and restart when the triangle buffer fills
4647 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4649 DPSOFTRAST_Draw_ProcessSpans(thread);
4650 thread->numtriangles = 0;
// all triangles handled: release this thread's reference to the command and free
// the separately allocated vertex data if this was the last reference
4654 if (!ATOMIC_DECREMENT(command->refcount))
4656 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4657 MM_FREE(command->arrays);
// shade whatever is still queued
4660 if (thread->numspans > 0 || thread->numtriangles > 0)
4662 DPSOFTRAST_Draw_ProcessSpans(thread);
4663 thread->numtriangles = 0;
// Allocates a Draw command together with the per-vertex storage it needs and
// publishes that storage through dpsoftrast.screencoord4f / dpsoftrast.post_array4f.
// Data layout, in order: one float[4] per vertex of screen coordinates, one float[4]
// per vertex for DPSOFTRAST_ARRAY_POSITION, one float[4] per vertex for every array
// listed for the current shader mode, then a copy of the triangle indices.
// If header plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, the data is
// allocated separately with MM_CALLOC and the command is header-sized; otherwise
// the data lives inline right after the command header.
// NOTE(review): this listing omits short lines (braces, else, return and the
// conditions choosing 16-bit vs 32-bit indices).
4668 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4672 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates + transformed positions
4673 int datasize = 2*numvertices*sizeof(float[4]);
4674 DPSOFTRAST_Command_Draw *command;
4675 unsigned char *data;
// one more float[4] array per vertex array used by the current shader mode
4676 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4678 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4679 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4681 datasize += numvertices*sizeof(float[4]);
// room for the index copy: 16-bit or 32-bit triples
4684 datasize += numtriangles*sizeof(unsigned short[3]);
4686 datasize += numtriangles*sizeof(int[3]);
4687 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: header-only command, data allocated separately
// NOTE(review): the MM_CALLOC result is not checked in the visible lines
4688 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4690 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4691 data = (unsigned char *)MM_CALLOC(datasize, 1);
// fits: data is placed directly after the command header
4695 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4696 data = (unsigned char *)command + commandsize;
4698 command->firstvertex = firstvertex;
4699 command->numvertices = numvertices;
4700 command->numtriangles = numtriangles;
4701 command->arrays = (float *)data;
// clear all output array pointers, then hand out consecutive slices of data
4702 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4703 dpsoftrast.firstvertex = firstvertex;
4704 dpsoftrast.numvertices = numvertices;
4705 dpsoftrast.screencoord4f = (float *)data;
4706 data += numvertices*sizeof(float[4]);
4707 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4708 data += numvertices*sizeof(float[4]);
4709 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4711 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4712 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4714 dpsoftrast.post_array4f[j] = (float *)data;
4715 data += numvertices*sizeof(float[4]);
// indices are copied so the command does not depend on the caller's buffers
4717 command->element3i = NULL;
4718 command->element3s = NULL;
4721 command->element3s = (unsigned short *)data;
4722 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4726 command->element3i = (int *)data;
4727 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point that queues one indexed triangle draw.
//   firstvertex/numvertices - vertex range referenced by the indices
//   numtriangles            - number of index triples
//   element3i/element3s     - 32-bit or 16-bit index arrays, forwarded unchanged to
//                             DPSOFTRAST_Draw_AllocateDrawCommand
// Steps visible here: allocate the command, run the current shader mode's vertex
// function, limit the row range to the framebuffer, drop the command if the range
// is empty, otherwise set its reference count to the number of threads and signal
// starving threads whose rows intersect the draw.
// NOTE(review): this listing omits short lines (braces, else, return and the
// condition selecting the threaded vs. non-threaded path).
4732 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4734 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// vertex stage runs on the calling thread, before the command is published
4735 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range from dpsoftrast.drawstarty/drawendy, limited to [0, fb_height] by bound()
4736 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4737 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing to rasterize: free separately allocated data (header-sized command) and
// take the command back out of the queue
4738 if (command->starty >= command->endy)
4740 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4741 MM_FREE(command->arrays);
4742 DPSOFTRAST_UndoCommand(command->commandsize);
4745 command->clipped = dpsoftrast.drawclipped;
// every thread decrements this once in DPSOFTRAST_Interpret_Draw
4746 command->refcount = dpsoftrast.numthreads;
4749 DPSOFTRAST_Draw_SyncCommands();
// signal each starving thread whose rows overlap the draw's row range
// (y, the first row of thread i, is assigned on a line not visible in this listing)
4753 for (i = 0; i < dpsoftrast.numthreads; i++)
4755 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4757 nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4758 if (command->starty < nexty && command->endy > y && thread->starving)
4759 SDL_CondSignal(thread->drawcond);
4763 DPSOFTRAST_Draw_FlushThreads();
// Runs the queued commands for one thread, starting at thread->commandoffset and
// stopping when the offset reaches endoffset. Commands are read from
// dpsoftrast.commandpool.commands and dispatched on their opcode to the matching
// DPSOFTRAST_Interpret_<name> function. The final offset is stored back into
// thread->commandoffset.
// NOTE(review): this listing omits short lines (braces, break, and the body of the
// Reset case).
4767 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4769 int commandoffset = thread->commandoffset;
4770 while (commandoffset != endoffset)
4772 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4773 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call its
// interpreter, advance by the aligned struct size, and wrap the offset to 0 once it
// reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL
4775 #define INTERPCOMMAND(name) \
4776 case DPSOFTRAST_OPCODE_##name : \
4777 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4778 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4779 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4780 commandoffset = 0; \
4782 INTERPCOMMAND(Viewport)
4783 INTERPCOMMAND(ClearColor)
4784 INTERPCOMMAND(ClearDepth)
4785 INTERPCOMMAND(ColorMask)
4786 INTERPCOMMAND(DepthTest)
4787 INTERPCOMMAND(ScissorTest)
4788 INTERPCOMMAND(Scissor)
4789 INTERPCOMMAND(BlendFunc)
4790 INTERPCOMMAND(BlendSubtract)
4791 INTERPCOMMAND(DepthMask)
4792 INTERPCOMMAND(DepthFunc)
4793 INTERPCOMMAND(DepthRange)
4794 INTERPCOMMAND(PolygonOffset)
4795 INTERPCOMMAND(CullFace)
4796 INTERPCOMMAND(AlphaTest)
4797 INTERPCOMMAND(AlphaFunc)
4798 INTERPCOMMAND(SetTexture)
4799 INTERPCOMMAND(SetShader)
4800 INTERPCOMMAND(Uniform4f)
4801 INTERPCOMMAND(UniformMatrix4f)
4802 INTERPCOMMAND(Uniform1i)
// Draw is variable-sized (it may carry inline vertex data), so it advances by the
// size recorded in the command and stores the thread's offset right after the draw
4804 case DPSOFTRAST_OPCODE_Draw:
4805 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4806 commandoffset += command->commandsize;
4807 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4809 thread->commandoffset = commandoffset;
// handling of Reset is not visible in this listing
4812 case DPSOFTRAST_OPCODE_Reset:
4817 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to SDL_CreateThread by DPSOFTRAST_Init).
//
// data: the DPSOFTRAST_State_Thread owned by this worker.
// Runs for as long as thread->index is non-negative, alternating between
// interpreting newly queued commands and sleeping on thread->drawcond.
4821 static int DPSOFTRAST_Draw_Thread(void *data)
4823 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4824 while(thread->index >= 0)
// Work is pending whenever this thread's read position differs from the
// global dpsoftrast.drawcommand offset.
4826 if (thread->commandoffset != dpsoftrast.drawcommand)
4828 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4832 SDL_LockMutex(thread->drawmutex);
// Re-check under the mutex before sleeping: sleep only if still caught up
// and index is still non-negative.
4833 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Wake a caller blocked on waitcond (DPSOFTRAST_Draw_FlushThreads sets
// thread->waiting before waiting on it).
4835 if (thread->waiting) SDL_CondSignal(thread->waitcond);
// starving marks this thread as asleep on drawcond; code that signals
// drawcond checks this flag first.
4836 thread->starving = true;
4837 SDL_CondWait(thread->drawcond, thread->drawmutex);
4838 thread->starving = false;
4840 SDL_UnlockMutex(thread->drawmutex);
// Publishes pending commands, wakes any sleeping workers that still have
// commands to process, waits for each of them to catch up, and then marks the
// command pool as empty.
4847 static void DPSOFTRAST_Draw_FlushThreads(void)
4849 DPSOFTRAST_State_Thread *thread;
4851 DPSOFTRAST_Draw_SyncCommands();
// Pass 1: wake every worker that is behind and currently asleep.
4853 for (i = 0; i < dpsoftrast.numthreads; i++)
4855 thread = &dpsoftrast.threads[i];
// Unlocked pre-check; the condition is repeated under the mutex below.
4856 if (thread->commandoffset != dpsoftrast.drawcommand)
4858 SDL_LockMutex(thread->drawmutex);
4859 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4860 SDL_CondSignal(thread->drawcond);
4861 SDL_UnlockMutex(thread->drawmutex);
// Pass 2: block until each worker has consumed everything queued so far.
4865 for (i = 0; i < dpsoftrast.numthreads; i++)
4867 thread = &dpsoftrast.threads[i];
4869 if (thread->commandoffset != dpsoftrast.drawcommand)
4871 SDL_LockMutex(thread->drawmutex);
// NOTE(review): the wait below is guarded by `if`, not a loop, so a wakeup
// that arrives before the worker has caught up would let this function
// continue early - confirm whether a re-check is needed.
4872 if (thread->commandoffset != dpsoftrast.drawcommand)
// waiting tells the worker to signal waitcond before it goes to sleep.
4874 thread->waiting = true;
4875 SDL_CondWait(thread->waitcond, thread->drawmutex);
4876 thread->waiting = false;
4878 SDL_UnlockMutex(thread->drawmutex);
// Interpret any remaining commands directly on the calling thread.
// NOTE(review): presumably this is the non-threaded build path - confirm.
4881 if (thread->commandoffset != dpsoftrast.drawcommand)
4882 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Everything has been consumed; reset the pool usage counter.
4885 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
4888 void DPSOFTRAST_Flush(void)
4890 DPSOFTRAST_Draw_FlushThreads();
// NOTE(review): the body of this function is not visible here; presumably it
// waits for outstanding work in the same way as DPSOFTRAST_Flush - confirm.
4893 void DPSOFTRAST_Finish(void)
// Initialises the global rasterizer state and creates the worker threads.
//
// width, height: framebuffer dimensions, stored as fb_width / fb_height.
// numthreads:    requested worker count (see the numthreads assignments below).
// colorpixels:   colour buffer, stored as fb_colorpixels[0].
// depthpixels:   depth buffer, stored as fb_depthpixels.
4898 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
// Start from an all-zero state; everything not assigned below stays 0/NULL.
4908 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): `u` is declared outside the visible lines; this looks like
// the usual byte-order probe (int written, byte 3 read back) - confirm.
4909 dpsoftrast.bigendian = u.b[3];
4910 dpsoftrast.fb_width = width;
4911 dpsoftrast.fb_height = height;
4912 dpsoftrast.fb_depthpixels = depthpixels;
4913 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index 1 is assigned three times; presumably indices 1, 2 and
// 3 were intended. Harmless at present because the memset above already
// cleared the whole struct - confirm and fix the indices.
4914 dpsoftrast.fb_colorpixels[1] = NULL;
4915 dpsoftrast.fb_colorpixels[1] = NULL;
4916 dpsoftrast.fb_colorpixels[1] = NULL;
// Default viewport covers the whole framebuffer.
4917 dpsoftrast.viewport[0] = 0;
4918 dpsoftrast.viewport[1] = 0;
4919 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4920 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4921 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture bookkeeping starts at index 1 (presumably index 0 is reserved to
// mean "no texture" - confirm); no texture storage is allocated here.
4922 dpsoftrast.texture_firstfree = 1;
4923 dpsoftrast.texture_end = 1;
4924 dpsoftrast.texture_max = 0;
// Default colour: all four components set to 1.
4925 dpsoftrast.color[0] = 1;
4926 dpsoftrast.color[1] = 1;
4927 dpsoftrast.color[2] = 1;
4928 dpsoftrast.color[3] = 1;
// NOTE(review): the two assignments below are presumably alternatives chosen
// by a preprocessor condition (threaded: limit the request to 1..64;
// otherwise: a single thread) - confirm.
4930 dpsoftrast.numthreads = bound(1, numthreads, 64);
4932 dpsoftrast.numthreads = 1;
// Per-thread state array, zero-filled by MM_CALLOC.
4934 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4935 for (i = 0; i < dpsoftrast.numthreads; i++)
4937 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// Default render state for this thread.
4939 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in the visible lines, so it
// would keep the 0 from MM_CALLOC while components 1..3 are enabled -
// confirm whether `thread->colormask[0] = 1;` is missing.
4940 thread->colormask[1] = 1;
4941 thread->colormask[2] = 1;
4942 thread->colormask[3] = 1;
4943 thread->blendfunc[0] = GL_ONE;
4944 thread->blendfunc[1] = GL_ZERO;
4945 thread->depthmask = true;
4946 thread->depthtest = true;
4947 thread->depthfunc = GL_LEQUAL;
4948 thread->scissortest = false;
4949 thread->alphatest = false;
4950 thread->alphafunc = GL_GREATER;
4951 thread->alphavalue = 0.5f;
// Viewport and scissor both default to the full framebuffer.
4952 thread->viewport[0] = 0;
4953 thread->viewport[1] = 0;
4954 thread->viewport[2] = dpsoftrast.fb_width;
4955 thread->viewport[3] = dpsoftrast.fb_height;
4956 thread->scissor[0] = 0;
4957 thread->scissor[1] = 0;
4958 thread->scissor[2] = dpsoftrast.fb_width;
4959 thread->scissor[3] = dpsoftrast.fb_height;
4960 thread->depthrange[0] = 0;
4961 thread->depthrange[1] = 1;
4962 thread->polygonoffset[0] = 0;
4963 thread->polygonoffset[1] = 0;
// Work counters and command-queue position start at zero.
4965 thread->numspans = 0;
4966 thread->numtriangles = 0;
4967 thread->commandoffset = 0;
4968 thread->waiting = false;
4969 thread->starving = false;
// Synchronisation objects used by DPSOFTRAST_Draw_Thread and
// DPSOFTRAST_Draw_FlushThreads.
// NOTE(review): the results of the SDL_Create* calls are not checked in the
// visible lines.
4971 thread->waitcond = SDL_CreateCond();
4972 thread->drawcond = SDL_CreateCond();
4973 thread->drawmutex = SDL_CreateMutex();
// validate = -1 presumably flags every piece of derived state as dirty
// before the first DPSOFTRAST_Validate call - confirm.
4976 thread->validate = -1;
4977 DPSOFTRAST_Validate(thread, -1);
// Start the worker; it receives this thread's state as its argument.
4979 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stops the worker threads, destroys their synchronisation objects, frees
// texture and thread storage, and zeroes the global rasterizer state.
4984 void DPSOFTRAST_Shutdown(void)
4988 if(dpsoftrast.numthreads > 0)
4990 DPSOFTRAST_State_Thread *thread;
4991 for (i = 0; i < dpsoftrast.numthreads; i++)
4993 thread = &dpsoftrast.threads[i];
// Wake the worker under its mutex and then join it.
// NOTE(review): DPSOFTRAST_Draw_Thread only exits once thread->index is
// negative; presumably index is set to a negative value between the lock and
// the signal - confirm.
4994 SDL_LockMutex(thread->drawmutex);
4996 SDL_CondSignal(thread->drawcond);
4997 SDL_UnlockMutex(thread->drawmutex);
4998 SDL_WaitThread(thread->thread, NULL);
// The worker has exited, so its condition variables and mutex can go.
4999 SDL_DestroyCond(thread->waitcond);
5000 SDL_DestroyCond(thread->drawcond);
5001 SDL_DestroyMutex(thread->drawmutex);
// Release per-texture pixel storage.
// NOTE(review): this loop reads dpsoftrast.texture[i] before the NULL check
// on dpsoftrast.texture below. DPSOFTRAST_Init sets texture_end to 1 without
// allocating dpsoftrast.texture, so a shutdown before any texture array
// exists would dereference NULL here - confirm and guard the loop.
5005 for (i = 0;i < dpsoftrast.texture_end;i++)
5006 if (dpsoftrast.texture[i].bytes)
5007 MM_FREE(dpsoftrast.texture[i].bytes);
// The texture table is released with free(), the thread array with MM_FREE,
// matching the MM_CALLOC in DPSOFTRAST_Init for the latter.
5008 if (dpsoftrast.texture)
5009 free(dpsoftrast.texture);
5010 if (dpsoftrast.threads)
5011 MM_FREE(dpsoftrast.threads);
// Leave the global state fully zeroed.
5012 memset(&dpsoftrast, 0, sizeof(dpsoftrast));