3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
14 #include <SDL_thread.h>
// Local alias so this file can spell the engine's qboolean type as `bool`.
18 typedef qboolean bool;
// Alignment in bytes that MM_MALLOC/MM_CALLOC pass to _mm_malloc.
22 #define ATOMIC_SIZE 32
// Three alternative sets of alignment/atomic macros follow. Of the preprocessor
// conditionals that choose between them, only the _MSC_VER #elif is visible here.
//
// Variant built on __attribute__ alignment and the __sync_* builtins:
// ALIGN() requests 16-byte and ATOMIC() 32-byte alignment for a declaration.
26 #define ALIGN(var) var __attribute__((__aligned__(16)))
27 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// The barrier is _mm_sfence(); __sync_synchronize() is kept below as a
// commented-out alternative.
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
// INCREMENT/DECREMENT evaluate to the result of __sync_add_and_fetch;
// ADD explicitly discards its result.
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// Variant for MSVC: __declspec alignment and the Interlocked* functions.
36 #elif defined(_MSC_VER)
37 #define ALIGN(var) __declspec(align(16)) var
38 #define ATOMIC(var) __declspec(align(32)) var
40 #define MEMORY_BARRIER (_mm_sfence())
42 #define ATOMIC_COUNTER volatile LONG
43 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
44 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// Unlike the __sync variant, this ADD is not cast to void.
45 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallback variant: no alignment requests, a no-op barrier, and plain
// (non-atomic) integer arithmetic on the counters.
54 #define ALIGN(var) var
55 #define ATOMIC(var) var
59 #define MEMORY_BARRIER ((void)0)
60 #define ATOMIC_COUNTER int
61 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #define ATOMIC_DECREMENT(counter) (--(counter))
63 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
67 #include <emmintrin.h>
// Aligned allocation: blocks come from _mm_malloc with ATOMIC_SIZE alignment.
69 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Zero-filled counterpart of MM_MALLOC, taking calloc-style (count, element size)
// arguments. The memory is only cleared when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow before allocating.
71 static void *MM_CALLOC(size_t nmemb, size_t size)
73 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
74 if(ptr != NULL) memset(ptr, 0, nmemb*size);
// Memory obtained through _mm_malloc is released with _mm_free.
78 #define MM_FREE _mm_free
// Alternative definitions that forward to the C allocator; the preprocessor
// condition selecting them is not visible in this listing.
80 #define MM_MALLOC(size) malloc(size)
81 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// sizes the per-array tables elsewhere in this file (triangle attribs[],
// post_array4f[]).
85 typedef enum DPSOFTRAST_ARRAY_e
87 DPSOFTRAST_ARRAY_POSITION,
88 DPSOFTRAST_ARRAY_COLOR,
89 DPSOFTRAST_ARRAY_TEXCOORD0,
90 DPSOFTRAST_ARRAY_TEXCOORD1,
91 DPSOFTRAST_ARRAY_TEXCOORD2,
92 DPSOFTRAST_ARRAY_TEXCOORD3,
93 DPSOFTRAST_ARRAY_TEXCOORD4,
94 DPSOFTRAST_ARRAY_TEXCOORD5,
95 DPSOFTRAST_ARRAY_TEXCOORD6,
96 DPSOFTRAST_ARRAY_TEXCOORD7,
97 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (only some of its fields are visible here).
101 typedef struct DPSOFTRAST_Texture_s
// Filter mode, set by DPSOFTRAST_Texture_Filter.
108 DPSOFTRAST_TEXTURE_FILTER filter;
111 ATOMIC_COUNTER binds;
// Pixel storage for all mip levels; a slot whose bytes pointer is NULL is
// treated as free by the lookup/allocation code.
112 unsigned char *bytes;
// Per mip level: [0] byte offset into bytes, [1] byte size of the level,
// [2] width, [3] height, [4] depth (as filled in by DPSOFTRAST_Texture_New).
113 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands stored in the command pool are sized/aligned in units of
// COMMAND_SIZE and declared with COMMAND_ALIGN.
117 #define COMMAND_SIZE ALIGN_SIZE
118 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header. Every concrete command struct generated by
// DEFCOMMAND starts with the same opcode/commandsize pair, so a command can
// be addressed through this type.
120 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
122 unsigned char opcode;
123 unsigned short commandsize;
// Opcode 0 is the Reset marker written by DPSOFTRAST_AllocateCommand when the
// tail of the pool is too small for the next command.
127 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the struct typedef
// DPSOFTRAST_Command_<name>, which begins with the common header.
// The macro line that uses `fields` is not visible in this listing;
// presumably it places them after the header as the command payload.
129 #define DEFCOMMAND(opcodeval, name, fields) \
130 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
131 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
133 unsigned char opcode; \
134 unsigned short commandsize; \
136 } DPSOFTRAST_Command_##name );
// Capacity in bytes of the command pool's storage array.
138 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
139 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands. Its bookkeeping fields (freecommand and
// usedcommands, referenced by the allocation code) are not visible in this
// listing.
141 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// Raw bytes into which command structs are written at byte offsets.
145 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
147 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data used while shading spans (partial view of the struct).
149 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
151 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// For each attribute array, three 4-float vectors forming a plane equation,
// as consumed by DPSOFTRAST_CALCATTRIB: [0] is the change per unit x,
// [1] the change per unit y, [2] the base value.
153 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
155 DPSOFTRAST_State_Triangle);
// Attribute evaluation at the start of a span, in two forms:
//   DPSOFTRAST_CALCATTRIB   - SSE version; slope and data are __m128 values.
//   DPSOFTRAST_CALCATTRIB4F - scalar version; slope and data are indexable
//                             with [0..3].
// Both load `slope` from attribs[arrayindex][0] and compute
//   data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// for all four components.
// NOTE(review): the 4F form writes (span->x)/(span->y) rather than
// (span)->x/(span)->y, so it only expands correctly when `span` is a simple
// expression.
157 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
158 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
159 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
160 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
161 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
163 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
164 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
165 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
166 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
167 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
168 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
169 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
170 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
171 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
174 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// A horizontal run of pixels produced by rasterizing one triangle.
176 typedef ALIGN(struct DPSOFTRAST_State_Span_s
178 int triangle; // triangle this span was generated by
179 int x; // framebuffer x coord
180 int y; // framebuffer y coord
181 int length; // pixel count
182 int startx; // usable range (according to pixelmask)
183 int endx; // usable range (according to pixelmask)
184 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
186 DPSOFTRAST_State_Span);
// Sizes of the per-thread span and triangle arrays in DPSOFTRAST_State_Thread.
188 #define DPSOFTRAST_DRAW_MAXSPANS 1024
189 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of DPSOFTRAST_State_Thread.validate. A set bit means the matching
// derived values are stale and must be recomputed by DPSOFTRAST_Validate.
191 #define DPSOFTRAST_VALIDATE_FB 1
192 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
193 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All of the above combined.
194 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Internal blend modes. DPSOFTRAST_RecalcBlendFunc maps the requested
// source/destination blend factor pair (and the subtract flag) onto one of
// these; pairs it does not recognize fall back to OPAQUE.
196 typedef enum DPSOFTRAST_BLENDMODE_e
198 DPSOFTRAST_BLENDMODE_OPAQUE,
199 DPSOFTRAST_BLENDMODE_ALPHA,
200 DPSOFTRAST_BLENDMODE_ADDALPHA,
201 DPSOFTRAST_BLENDMODE_ADD,
202 DPSOFTRAST_BLENDMODE_INVMOD,
203 DPSOFTRAST_BLENDMODE_MUL,
204 DPSOFTRAST_BLENDMODE_MUL2,
205 DPSOFTRAST_BLENDMODE_SUBALPHA,
206 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
// Number of blend modes (not itself a mode).
207 DPSOFTRAST_BLENDMODE_TOTAL
209 DPSOFTRAST_BLENDMODE;
// Per-worker-thread state (partial view). The DPSOFTRAST_Interpret_* handlers
// write the render state here as each thread replays the command stream.
211 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] = "alongnormal", [1] = "intoview" (see DPSOFTRAST_Interpret_PolygonOffset).
232 float polygonoffset[2];
235 int shader_permutation;
// Textures bound per unit. These point into dpsoftrast.texture and are
// rebased by DPSOFTRAST_Texture_Grow when that array is reallocated.
237 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// Four floats per uniform slot.
239 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
240 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
242 // DPSOFTRAST_VALIDATE_ flags
245 // derived values (DPSOFTRAST_VALIDATE_FB)
// Clear/scissor rectangle stored as x, y, width, height (see DPSOFTRAST_RecalcFB).
247 int fb_clearscissor[4];
248 ALIGN(float fb_viewportcenter[4]);
249 ALIGN(float fb_viewportscale[4]);
251 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
254 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// Byte offset of this thread's position in the command pool; the producer
// compares it with its own offsets in DPSOFTRAST_Draw_FreeCommandPool.
257 ATOMIC(volatile int commandoffset);
// waiting: set by the producer around its SDL_CondWait on this thread.
// starving: checked by the producer before it signals drawcond.
259 volatile bool waiting;
260 volatile bool starving;
// Held around the producer's inspection of commandoffset and its condition wait.
264 SDL_mutex *drawmutex;
269 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
270 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
272 DPSOFTRAST_State_Thread);
// Global rasterizer state as seen by the API-calling side (partial view).
274 typedef ATOMIC(struct DPSOFTRAST_State_s
// Render targets supplied through DPSOFTRAST_SetRenderTargets: one depth
// buffer and up to four color buffers (unused color slots may be NULL).
278 unsigned int *fb_depthpixels;
279 unsigned int *fb_colorpixels[4];
// Viewport transform, recomputed by DPSOFTRAST_Viewport.
282 ALIGN(float fb_viewportcenter[4]);
283 ALIGN(float fb_viewportscale[4]);
286 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
287 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// Vertex input arrays.
289 const float *pointer_vertex3f;
290 const float *pointer_color4f;
291 const unsigned char *pointer_color4ub;
292 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
295 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
296 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// Bound textures; pointers into the texture array below.
297 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
301 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
302 float *screencoord4f;
308 int shader_permutation;
// Texture table: texture_firstfree is where the search for a free slot
// starts; `texture` is the growable slot array.
312 int texture_firstfree;
313 DPSOFTRAST_Texture *texture;
// Set to a static message by API functions that reject their arguments.
318 const char *errorstring;
// Array of numthreads worker states.
321 DPSOFTRAST_State_Thread *threads;
// Command pool offset published by DPSOFTRAST_Draw_SyncCommands.
323 ATOMIC(volatile int drawcommand);
325 DPSOFTRAST_State_Command_Pool commandpool;
// The single global instance used throughout this file.
329 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth value to the integer
// depth buffer format (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit BGRA pixel:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7. Each channel
// is scaled by 255 and rounded to nearest by adding 0.5 before truncation.
// Every argument is parenthesized so that expression arguments such as
// `x + y` are scaled as a whole.
// NOTE: the alpha term shifts into the sign bit of an int; this is kept as-is
// to preserve the macro's int result type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth format as DEPTHSCALE * (1 - d),
// so larger d gives a smaller stored value. The argument is parenthesized so
// that expression arguments are subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derives the viewport transform (center and scale vectors) from a viewport
// rectangle given as x, y, width, height.
//   viewport          - input rectangle (4 ints)
//   fb_viewportcenter - output, 4 floats
//   fb_viewportscale  - output, 4 floats
// Component layout of both outputs: [1] is the x axis, [2] the y axis,
// [3] is 0.5/0.5, and [0] is the constant pair center 0 / scale 1.
337 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
// Horizontal center of the viewport, shifted by half a pixel.
339 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
// Vertical center measured from the top of the framebuffer: the viewport's y
// is subtracted from fb_height, and the y scale below is negative to match.
340 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
341 fb_viewportcenter[3] = 0.5f;
342 fb_viewportcenter[0] = 0.0f;
343 fb_viewportscale[1] = 0.5f * viewport[2];
344 fb_viewportscale[2] = -0.5f * viewport[3];
345 fb_viewportscale[3] = 0.5f;
346 fb_viewportscale[0] = 1.0f;
// Recomputes the thread's framebuffer-derived state: the clear/scissor
// rectangle (fb_clearscissor) and the viewport transform.
349 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
351 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
352 // and viewport projection values
// The scissor is given as x, y, width, height. Its y range is flipped against
// fb_height so that y1/y2 are measured from the top of the framebuffer.
355 x1 = thread->scissor[0];
356 x2 = thread->scissor[0] + thread->scissor[2];
357 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
358 y2 = dpsoftrast.fb_height - thread->scissor[1];
// With the scissor test disabled the whole framebuffer is used.
359 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// Clamp the far edges to the framebuffer size.
361 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
363 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// Stored as x, y, width, height (not as x1, y1, x2, y2).
364 thread->fb_clearscissor[0] = x1;
365 thread->fb_clearscissor[1] = y1;
366 thread->fb_clearscissor[2] = x2 - x1;
367 thread->fb_clearscissor[3] = y2 - y1;
369 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Derives the effective depth comparison: the requested depthfunc when depth
// testing is enabled, GL_ALWAYS otherwise.
372 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
374 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's blend factor pair (blendfunc[0] = source factor,
// blendfunc[1] = destination factor) and its blendsubtract flag onto an
// internal DPSOFTRAST_BLENDMODE stored in fb_blendmode. The two factors are
// packed into one switch key as (sfactor << 16) | dfactor. Unrecognized
// combinations select DPSOFTRAST_BLENDMODE_OPAQUE.
377 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// Subtractive blending: only one factor pair is recognized.
379 if (thread->blendsubtract)
381 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the blend mode and breaks.
383 #define BLENDFUNC(sfactor, dfactor, blendmode) \
384 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
385 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
386 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Second lookup table (presumably the non-subtract branch; the `else` line is
// not visible in this listing).
391 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
393 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
394 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
395 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
396 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
397 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// Two factor pairs map to MUL.
398 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
399 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
400 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
401 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
// NOTE(review): this table also maps (GL_SRC_COLOR, GL_ONE) to SUBALPHA, the
// same entry as the subtract table above - confirm that is intended.
402 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
403 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Cheap front end for DPSOFTRAST_Validate: only makes the call when at least
// one of the requested flags is actually pending. Always evaluates to 0.
// NOTE(review): `thread` is used unparenthesized, so pass a simple expression.
408 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state selected by `mask` (DPSOFTRAST_VALIDATE_* bits),
// restricted to the bits currently pending in thread->validate. Each handled
// bit is cleared before its Recalc function runs.
410 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// Ignore requested flags that are not pending.
412 mask &= thread->validate;
415 if (mask & DPSOFTRAST_VALIDATE_FB)
417 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
418 DPSOFTRAST_RecalcFB(thread);
420 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
422 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
423 DPSOFTRAST_RecalcDepthFunc(thread);
425 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
427 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
428 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. The slot is returned only when
// 1 <= index < texture_end and the slot has pixel storage (bytes != NULL).
// The return for the failing case is not visible in this listing; callers
// test the result against NULL.
432 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
434 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
435 return &dpsoftrast.texture[index];
// Enlarges the texture slot array and rebases every bound-texture pointer
// (global and per-thread) from the old array onto the new one.
// NOTE(review): the realloc result is assigned straight to dpsoftrast.texture
// and never checked. On failure the old array pointer would be lost and the
// rebasing below would compute pointers from NULL.
439 static void DPSOFTRAST_Texture_Grow(void)
// Remember the old base address so bound pointers can be converted to indices.
441 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
442 DPSOFTRAST_State_Thread *thread;
446 // expand texture array as needed
// Capacity is raised to 1024 when below that; the doubling further down is
// presumably the alternative branch (its `else` is not visible here).
447 if (dpsoftrast.texture_max < 1024)
448 dpsoftrast.texture_max = 1024;
450 dpsoftrast.texture_max *= 2;
451 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// Rebase the globally bound textures: new base + (old pointer - old base).
452 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
453 if(dpsoftrast.texbound[i])
454 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// Do the same for every worker thread's bindings.
455 for (j = 0; j < dpsoftrast.numthreads; j++)
457 thread = &dpsoftrast.threads[j];
458 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
459 if(thread->texbound[i])
460 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture and allocates zeroed storage for all of its mip levels.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a
//            DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth - dimensions of mip level 0; each must be a power of
//            two and at most DPSOFTRAST_TEXTURE_MAXSIZE
// On rejected arguments dpsoftrast.errorstring is set to a message; the
// return statements themselves are not visible in this listing.
464 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// Cubemaps store six faces, everything else one.
473 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
474 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
475 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is evaluated before the MAXSIZE check below, so
// very large arguments could overflow int here - confirm callers' ranges.
476 if (width*height*depth < 1)
478 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
481 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
483 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// Format-specific restrictions (the enclosing switch line is not visible here).
488 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
489 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
490 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
492 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
493 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
495 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
500 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
503 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
505 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// Flag combinations that are rejected regardless of format.
510 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
512 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
515 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
517 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous condition and message verbatim.
520 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
522 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
525 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
527 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set.
530 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
532 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
535 // find first empty slot in texture array
536 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
537 if (!dpsoftrast.texture[texnum].bytes)
// The next search starts after the slot taken now.
539 dpsoftrast.texture_firstfree = texnum + 1;
// Grow the slot array when the chosen index is beyond its capacity.
540 if (dpsoftrast.texture_max <= texnum)
541 DPSOFTRAST_Texture_Grow();
542 if (dpsoftrast.texture_end <= texnum)
543 dpsoftrast.texture_end = texnum + 1;
544 texture = &dpsoftrast.texture[texnum];
545 memset(texture, 0, sizeof(*texture));
546 texture->flags = flags;
547 texture->width = width;
548 texture->height = height;
549 texture->depth = depth;
550 texture->sides = sides;
// Mip chain layout: each level occupies w*h*d*sides pixels of 4 bytes, and
// the per-level table records offset, byte size and dimensions.
562 s = w * h * d * sides * 4;
563 texture->mipmap[mipmaps][0] = size;
564 texture->mipmap[mipmaps][1] = s;
565 texture->mipmap[mipmaps][2] = w;
566 texture->mipmap[mipmaps][3] = h;
567 texture->mipmap[mipmaps][4] = d;
// Condition on reaching a 1x1x1 level or on the MIPMAP flag being absent
// (presumably where the chain stops; the guarded statement is not visible).
570 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
576 texture->mipmaps = mipmaps;
577 texture->size = size;
579 // allocate the pixels now
580 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and returns its slot to the free pool.
// Does nothing when `index` does not name a live texture.
584 void DPSOFTRAST_Texture_Free(int index)
586 DPSOFTRAST_Texture *texture;
587 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
591 MM_FREE(texture->bytes);
592 texture->bytes = NULL;
// Clearing the whole slot (bytes included) is what marks it as free.
593 memset(texture, 0, sizeof(*texture));
594 // adjust the free range and used range
595 if (dpsoftrast.texture_firstfree > index)
596 dpsoftrast.texture_firstfree = index;
// Shrink the used range past any trailing free slots.
597 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
598 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture from level 0 by averaging
// neighbouring texels of the previous level, per byte, with rounding
// (4 bytes per texel). Does nothing for an invalid index.
600 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
602 int i, x, y, z, w, layer0, layer1, row0, row1;
// o walks the output row; i0..i3 walk the parent-level rows being averaged
// (row0/row1 within layer0/layer1).
603 unsigned char *o, *i0, *i1, *i2, *i3;
604 DPSOFTRAST_Texture *texture;
605 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// Check for a texture that has only the base level.
606 if (texture->mipmaps <= 1)
608 for (i = 1;i < texture->mipmaps;i++)
610 for (z = 0;z < texture->mipmap[i][4];z++)
// Clamp the second source layer to the parent's last layer.
614 if (layer1 >= texture->mipmap[i-1][4])
615 layer1 = texture->mipmap[i-1][4]-1;
616 for (y = 0;y < texture->mipmap[i][3];y++)
// Clamp the second source row to the parent's last row.
620 if (row1 >= texture->mipmap[i-1][3])
621 row1 = texture->mipmap[i-1][3]-1;
// Row start addresses: level offset + 4 bytes * ((height * layer + row) * width).
622 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
623 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
624 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
625 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
626 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
627 w = texture->mipmap[i][2];
630 if (texture->mipmap[i-1][2] > 1)
632 // average 3D texture
// Eight parent texels per output texel; +4 rounds before the >>3.
633 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
635 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
636 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
637 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
638 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
643 // average 3D mipmap with parent width == 1
// Four parent texels (one per row/layer combination); +2 rounds before the >>2.
644 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
646 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
647 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
648 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
649 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
655 if (texture->mipmap[i-1][2] > 1)
657 // average 2D texture (common case)
// 2x2 box filter over rows i0 and i1.
658 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
660 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
661 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
662 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
663 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
668 // 2D texture with parent width == 1
// Only two parent texels (one per row) contribute.
669 o[0] = (i0[0] + i1[0] + 1) >> 1;
670 o[1] = (i0[1] + i1[1] + 1) >> 1;
671 o[2] = (i0[2] + i1[2] + 1) >> 1;
672 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangular block of 4-byte pixels into a texture, then rebuilds
// the mip chain.
//   pixels - tightly packed source rows of blockwidth pixels
//   blockx, blocky, blockwidth, blockheight - destination rectangle
// NOTE(review): in the lines visible here `mip` is not used; the destination
// is always addressed with level 0's width (mipmap[0][2]) - confirm.
679 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
681 DPSOFTRAST_Texture *texture;
683 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// First destination pixel: row blocky, column blockx of level 0.
686 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
687 while (blockheight > 0)
// One row per iteration: the source advances by the block width, the
// destination by the full texture width.
689 memcpy(dst, pixels, blockwidth * 4);
690 pixels += blockwidth * 4;
691 dst += texture->mipmap[0][2] * 4;
694 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 with `pixels` (mipmap[0][1] bytes are copied)
// and rebuilds the mip chain. Does nothing for an invalid index.
696 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
698 DPSOFTRAST_Texture *texture;
699 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
702 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
703 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip`, or 0 when `index` is not a live texture.
// NOTE(review): `mip` is not range-checked in the visible lines.
705 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
707 DPSOFTRAST_Texture *texture;
708 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
709 return texture->mipmap[mip][2];
// Returns the height of mip level `mip`, or 0 when `index` is not a live texture.
// NOTE(review): `mip` is not range-checked in the visible lines.
711 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
713 DPSOFTRAST_Texture *texture;
714 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
715 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip`, or 0 when `index` is not a live texture.
// NOTE(review): `mip` is not range-checked in the visible lines.
717 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
719 DPSOFTRAST_Texture *texture;
720 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
721 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// storage, or a null pointer when `index` is not a live texture.
// NOTE(review): `mip` is not range-checked in the visible lines.
723 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
725 DPSOFTRAST_Texture *texture;
726 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset within bytes.
729 return texture->bytes + texture->mipmap[mip][0];
// Sets a texture's filter mode. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are flagged as an error (errorstring is
// set) on textures created without the MIPMAP flag.
731 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
733 DPSOFTRAST_Texture *texture;
734 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
735 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
737 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
742 texture->filter = filter;
// Installs the framebuffer dimensions, the depth buffer and up to four color
// buffers. The stores below sit under a condition that is true only when at
// least one of these values differs from the current state.
745 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
747 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
748 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
749 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
751 dpsoftrast.fb_width = width;
752 dpsoftrast.fb_height = height;
753 dpsoftrast.fb_depthpixels = depthpixels;
754 dpsoftrast.fb_colorpixels[0] = colorpixels0;
755 dpsoftrast.fb_colorpixels[1] = colorpixels1;
756 dpsoftrast.fb_colorpixels[2] = colorpixels2;
757 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// Forward declaration; called from DPSOFTRAST_Draw_FreeCommandPool below.
760 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the commands written so far by copying the pool's write offset
// (freecommand) into dpsoftrast.drawcommand.
762 static void DPSOFTRAST_Draw_SyncCommands(void)
765 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes available in the command pool by
// re-measuring how far the worker threads have progressed and, if necessary,
// waiting for a thread that lags behind.
768 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
771 DPSOFTRAST_State_Thread *thread;
773 int freecommand = dpsoftrast.commandpool.freecommand;
774 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Check whether the pool already has enough room.
775 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// Publish everything queued so far.
777 DPSOFTRAST_Draw_SyncCommands();
// For every thread, measure the distance from its read offset to the write
// offset (wrapping around the pool) and track the largest distance seen.
783 for (i = 0; i < dpsoftrast.numthreads; i++)
785 thread = &dpsoftrast.threads[i];
786 commandoffset = freecommand - thread->commandoffset;
787 if (commandoffset < 0)
788 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
789 if (commandoffset > usedcommands)
792 usedcommands = commandoffset;
// Condition: enough room after re-measuring, or no thread was chosen to wait on.
795 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// Wait on the chosen thread (presumably the one furthest behind; the line
// that assigns waitindex is not visible here).
797 thread = &dpsoftrast.threads[waitindex];
798 SDL_LockMutex(thread->drawmutex);
// Only wait when the thread has not yet reached the published offset.
799 if (thread->commandoffset != dpsoftrast.drawcommand)
801 thread->waiting = true;
// Signal drawcond first if the thread reports itself as starving.
802 if (thread->starving) SDL_CondSignal(thread->drawcond);
803 SDL_CondWait(thread->waitcond, thread->drawmutex);
804 thread->waiting = false;
806 SDL_UnlockMutex(thread->drawmutex);
808 dpsoftrast.commandpool.usedcommands = usedcommands;
810 DPSOFTRAST_Draw_FlushThreads();
// Rounds `size` up to the next multiple of COMMAND_SIZE (the mask arithmetic
// assumes COMMAND_SIZE is a power of two).
814 #define DPSOFTRAST_ALIGNCOMMAND(size) \
815 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>, sized to the aligned struct size, and casts the
// result to the concrete command type.
816 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
817 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and writes the command header
// (opcode and commandsize) at the start of the reserved region.
819 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
821 DPSOFTRAST_Command *command;
822 int freecommand = dpsoftrast.commandpool.freecommand;
823 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Space needed beyond `size`: one generic header, plus the whole remaining
// tail of the pool when the command does not fit into that tail.
824 int extra = sizeof(DPSOFTRAST_Command);
825 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
826 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// Not enough free room: ask for space, then reload the bookkeeping values.
827 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
829 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
830 freecommand = dpsoftrast.commandpool.freecommand;
831 usedcommands = dpsoftrast.commandpool.usedcommands;
// The command does not fit before the end of the pool: mark the tail with a
// Reset opcode and count it as used.
833 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
835 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
836 command->opcode = DPSOFTRAST_OPCODE_Reset;
837 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// Write the header of the new command at the current write offset.
840 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
841 command->opcode = opcode;
842 command->commandsize = size;
// Check for the write offset reaching the end of the pool.
844 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
846 dpsoftrast.commandpool.freecommand = freecommand;
847 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated `size` bytes of the command pool.
// The used-byte count is reduced by `size` and both bookkeeping values are
// stored back; the line that adjusts freecommand is not visible here.
851 static void DPSOFTRAST_UndoCommand(int size)
853 int freecommand = dpsoftrast.commandpool.freecommand;
854 int usedcommands = dpsoftrast.commandpool.usedcommands;
856 usedcommands -= size;
857 dpsoftrast.commandpool.freecommand = freecommand;
858 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
861 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker-side handler: stores the rectangle and marks the framebuffer-derived
// state as stale.
862 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
864 thread->viewport[0] = command->x;
865 thread->viewport[1] = command->y;
866 thread->viewport[2] = command->width;
867 thread->viewport[3] = command->height;
868 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Viewport command for the worker threads and
// also updates the caller-side copy of the viewport and its transform.
870 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
872 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
875 command->width = width;
876 command->height = height;
878 dpsoftrast.viewport[0] = x;
879 dpsoftrast.viewport[1] = y;
880 dpsoftrast.viewport[2] = width;
881 dpsoftrast.viewport[3] = height;
882 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to a constant RGBA color.
885 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker-side handler: fills the clear rectangle of every attached color
// buffer with the packed BGRA value of the requested color.
886 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
888 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
// Make sure fb_clearscissor is up to date.
891 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
892 x1 = thread->fb_clearscissor[0];
893 y1 = thread->fb_clearscissor[1];
// NOTE(review): DPSOFTRAST_RecalcFB stores fb_clearscissor[2] as a width
// (x2 - x1). y2 below adds the origin to the height, but x2 does not add
// fb_clearscissor[0] - looks wrong whenever x1 != 0; confirm.
894 x2 = thread->fb_clearscissor[2];
895 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2 is this thread's share of the framebuffer rows (fb_height split
// evenly by thread index); presumably used to restrict y1/y2 in lines not
// visible here.
896 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
897 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
904 // FIXME: honor fb_colormask?
905 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// Visit each of the four possible color targets, testing for unattached ones.
906 for (i = 0;i < 4;i++)
908 if (!dpsoftrast.fb_colorpixels[i])
910 for (y = y1;y < y2;y++)
// p addresses the first pixel of row y.
912 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
913 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearColor command.
918 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
920 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a constant depth.
927 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker-side handler: fills the clear rectangle of the depth buffer with the
// integer encoding of the requested depth.
928 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
930 int x1, y1, x2, y2, w, h, x, y, t1, t2;
// Make sure fb_clearscissor is up to date.
933 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
934 x1 = thread->fb_clearscissor[0];
935 y1 = thread->fb_clearscissor[1];
// NOTE(review): DPSOFTRAST_RecalcFB stores fb_clearscissor[2] as a width
// (x2 - x1). y2 below adds the origin to the height, but x2 does not add
// fb_clearscissor[0] - looks wrong whenever x1 != 0; confirm.
936 x2 = thread->fb_clearscissor[2];
937 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
// t1..t2 is this thread's share of the framebuffer rows; presumably used to
// restrict y1/y2 in lines not visible here.
938 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
939 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
946 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
947 for (y = y1;y < y2;y++)
// p addresses the first depth value of row y.
949 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
950 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearDepth command.
954 void DPSOFTRAST_ClearDepth(float d)
956 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable or disable writes to each color channel.
960 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker-side handler: normalizes each flag to 0/1 and builds a per-pixel
// write mask in BGRA bit layout.
961 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
963 thread->colormask[0] = command->r != 0;
964 thread->colormask[1] = command->g != 0;
965 thread->colormask[2] = command->b != 0;
966 thread->colormask[3] = command->a != 0;
// Negating a 0/1 flag gives 0 or all-ones (assuming colormask holds signed
// ints - its declaration is not visible here); ANDing with the channel's byte
// mask then keeps that byte only for enabled channels:
// red = bits 16-23, green = 8-15, blue = 0-7, alpha = 24-31.
967 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: queues a ColorMask command.
969 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
971 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable or disable depth testing.
978 DEFCOMMAND(5, DepthTest, int enable;)
// Worker-side handler: stores the flag and marks the derived depth function
// (fb_depthfunc) as stale.
979 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
981 thread->depthtest = command->enable;
982 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public entry point: queues a DepthTest command.
984 void DPSOFTRAST_DepthTest(int enable)
986 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
987 command->enable = enable;
// Command 6: enable or disable the scissor test.
990 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker-side handler: stores the flag and marks the framebuffer-derived
// state (which includes the clear/scissor rectangle) as stale.
991 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
993 thread->scissortest = command->enable;
994 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ScissorTest command.
996 void DPSOFTRAST_ScissorTest(int enable)
998 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
999 command->enable = enable;
// Command 7: set the scissor rectangle (x, y, width, height; passed as floats).
1002 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Worker-side handler: stores the rectangle and marks the framebuffer-derived
// state as stale.
1003 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1005 thread->scissor[0] = command->x;
1006 thread->scissor[1] = command->y;
1007 thread->scissor[2] = command->width;
1008 thread->scissor[3] = command->height;
1009 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Scissor command.
1011 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1013 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1016 command->width = width;
1017 command->height = height;
// Command 8: set the source and destination blend factors.
1020 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Worker-side handler: stores both factors and marks the derived blend mode
// as stale.
1021 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1023 thread->blendfunc[0] = command->sfactor;
1024 thread->blendfunc[1] = command->dfactor;
1025 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: queues a BlendFunc command.
1027 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1029 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1030 command->sfactor = sfactor;
1031 command->dfactor = dfactor;
// Command 9: enable or disable subtractive blending.
1034 DEFCOMMAND(9, BlendSubtract, int enable;)
// Worker-side handler: stores the flag and marks the derived blend mode as
// stale.
1035 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1037 thread->blendsubtract = command->enable;
1038 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: queues a BlendSubtract command.
1040 void DPSOFTRAST_BlendSubtract(int enable)
1042 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1043 command->enable = enable;
// Command 10: set the depth mask flag.
1046 DEFCOMMAND(10, DepthMask, int enable;)
// Worker-side handler: stores the flag; no derived state is invalidated.
1047 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1049 thread->depthmask = command->enable;
// Public entry point: queues a DepthMask command.
1051 void DPSOFTRAST_DepthMask(int enable)
1053 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1054 command->enable = enable;
1057 DEFCOMMAND(11, DepthFunc, int func;)
1058 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1060 thread->depthfunc = command->func;
1062 void DPSOFTRAST_DepthFunc(int func)
1064 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1065 command->func = func;
// Command 12: set the depth range (near and far values).
1068 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Worker-side handler: stores near in depthrange[0] and far in depthrange[1].
1069 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1071 thread->depthrange[0] = command->nearval;
1072 thread->depthrange[1] = command->farval;
// Public entry point: queues a DepthRange command.
1074 void DPSOFTRAST_DepthRange(float nearval, float farval)
1076 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1077 command->nearval = nearval;
1078 command->farval = farval;
// Command 13: set the two polygon offset parameters.
1081 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Worker-side handler: stores alongnormal in polygonoffset[0] and intoview in
// polygonoffset[1].
1082 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1084 thread->polygonoffset[0] = command->alongnormal;
1085 thread->polygonoffset[1] = command->intoview;
// Public entry point: queues a PolygonOffset command.
1087 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1089 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1090 command->alongnormal = alongnormal;
1091 command->intoview = intoview;
1094 DEFCOMMAND(14, CullFace, int mode;)
1095 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1097 thread->cullface = command->mode;
1099 void DPSOFTRAST_CullFace(int mode)
1101 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1102 command->mode = mode;
1105 DEFCOMMAND(15, AlphaTest, int enable;)
1106 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1108 thread->alphatest = command->enable;
1110 void DPSOFTRAST_AlphaTest(int enable)
1112 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1113 command->enable = enable;
1116 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1117 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1119 thread->alphafunc = command->func;
1120 thread->alphavalue = command->ref;
1122 void DPSOFTRAST_AlphaFunc(int func, float ref)
1124 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1125 command->func = func;
1129 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1131 dpsoftrast.color[0] = r;
1132 dpsoftrast.color[1] = g;
1133 dpsoftrast.color[2] = b;
1134 dpsoftrast.color[3] = a;
// Copies a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0]) into outpixels.
// blockx/blocky/blockwidth/blockheight select the rectangle, which is clamped to the
// framebuffer before copying; output rows are blockwidth*4 bytes apart.
// Source rows are addressed as (fb_height - 1 - y), i.e. rows are read in the opposite
// vertical order from the one they are stored in.
// NOTE(review): the per-pixel copy statements are not visible here; the BGRA byte order
// implied by the name should be confirmed against them.
1137 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// row pitches in bytes (4 bytes per pixel on both sides)
1139 int outstride = blockwidth * 4;
1140 int instride = dpsoftrast.fb_width * 4;
// exclusive right/bottom edges of the requested block
1143 int bx2 = blockx + blockwidth;
1144 int by2 = blocky + blockheight;
1149 unsigned char *inpixels;
// clamp the block to the framebuffer bounds
1153 if (bx1 < 0) bx1 = 0;
1154 if (by1 < 0) by1 = 0;
1155 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1156 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1159 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take the first row loop
// NOTE(review): presumably this variant reorders bytes per pixel - confirm
1160 if (dpsoftrast.bigendian)
1162 for (y = by1;y < by2;y++)
// b: first source pixel of this row, o: start of the matching output row
1164 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1165 o = (unsigned char *)outpixels + (y - by1) * outstride;
1166 for (x = bx1;x < bx2;x++)
// NOTE(review): second row loop, presumably the non-big-endian path - confirm
1179 for (y = by1;y < by2;y++)
1181 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle of the color framebuffer, starting at (sx, sy), into
// mip level `mip` of texture `index`, starting at (tx, ty).
// Nothing is copied when the texture handle does not resolve or mip is out of range.
// Both rectangles are clamped to their surfaces and the copy size is limited to the
// source size (tw/th are capped by sw/sh).
1188 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// exclusive end coordinates of the destination (t*) and source (s*) rectangles
1192 int tx2 = tx + width;
1193 int ty2 = ty + height;
1196 int sx2 = sx + width;
1197 int sy2 = sy + height;
1207 unsigned int *spixels;
1208 unsigned int *tpixels;
1209 DPSOFTRAST_Texture *texture;
1210 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1211 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1214 spixels = dpsoftrast.fb_colorpixels[0];
1215 swidth = dpsoftrast.fb_width;
1216 sheight = dpsoftrast.fb_height;
// destination surface: the selected mip level inside the texture's byte buffer
// (mipmap[mip][0] is used as a byte offset, [2] and [3] as width and height)
1217 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1218 twidth = texture->mipmap[mip][2];
1219 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1220 if (tx1 < 0) tx1 = 0;
1221 if (ty1 < 0) ty1 = 0;
1222 if (tx2 > twidth) tx2 = twidth;
1223 if (ty2 > theight) ty2 = theight;
1224 if (sx1 < 0) sx1 = 0;
1225 if (sy1 < 0) sy1 = 0;
1226 if (sx2 > swidth) sx2 = swidth;
1227 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1232 if (tw > sw) tw = sw;
1233 if (th > sh) th = sh;
// empty rectangle after clamping
1234 if (tw < 1 || th < 1)
// copy row by row, 4 bytes per pixel
1236 for (y = 0;y < th;y++)
1237 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// textures with more than one mip level get DPSOFTRAST_Texture_CalculateMipmaps run on them
1238 if (texture->mipmaps > 1)
1239 DPSOFTRAST_Texture_CalculateMipmaps(index);
1242 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1243 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1245 if (thread->texbound[command->unitnum])
1246 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1247 thread->texbound[command->unitnum] = command->texture;
1249 void DPSOFTRAST_SetTexture(int unitnum, int index)
1251 DPSOFTRAST_Command_SetTexture *command;
1252 DPSOFTRAST_Texture *texture;
1253 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1255 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1258 texture = DPSOFTRAST_Texture_GetByIndex(index);
1259 if (index && !texture)
1261 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1265 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1266 command->unitnum = unitnum;
1267 command->texture = texture;
1269 dpsoftrast.texbound[unitnum] = texture;
1270 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1273 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1275 dpsoftrast.pointer_vertex3f = vertex3f;
1276 dpsoftrast.stride_vertex = stride;
1278 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1280 dpsoftrast.pointer_color4f = color4f;
1281 dpsoftrast.pointer_color4ub = NULL;
1282 dpsoftrast.stride_color = stride;
1284 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1286 dpsoftrast.pointer_color4f = NULL;
1287 dpsoftrast.pointer_color4ub = color4ub;
1288 dpsoftrast.stride_color = stride;
1290 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1292 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1293 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1294 dpsoftrast.stride_texcoord[unitnum] = stride;
1297 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1298 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1300 thread->shader_mode = command->mode;
1301 thread->shader_permutation = command->permutation;
1303 void DPSOFTRAST_SetShader(int mode, int permutation)
1305 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1306 command->mode = mode;
1307 command->permutation = permutation;
1309 dpsoftrast.shader_mode = mode;
1310 dpsoftrast.shader_permutation = permutation;
1313 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1314 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1316 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1318 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1320 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1321 command->index = index;
1322 command->val[0] = v0;
1323 command->val[1] = v1;
1324 command->val[2] = v2;
1325 command->val[3] = v3;
1327 dpsoftrast.uniform4f[index*4+0] = v0;
1328 dpsoftrast.uniform4f[index*4+1] = v1;
1329 dpsoftrast.uniform4f[index*4+2] = v2;
1330 dpsoftrast.uniform4f[index*4+3] = v3;
1332 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 memcpy(command->val, v, sizeof(command->val));
1338 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1341 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1342 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1344 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices read from v, starting at uniform slot `uniform`.
// Every matrix gets its own UniformMatrix4f command and is also mirrored into
// dpsoftrast.uniform4f. One matrix advances the slot index by 4 and consumes 16 floats of v.
1346 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1350 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1352 __m128 m0, m1, m2, m3;
1353 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1354 command->index = index;
// v not aligned to ALIGN_SIZE: use unaligned loads (the aligned loads below serve the other case)
1355 if (((size_t)v)&(ALIGN_SIZE-1))
1357 m0 = _mm_loadu_ps(v);
1358 m1 = _mm_loadu_ps(v+4);
1359 m2 = _mm_loadu_ps(v+8);
1360 m3 = _mm_loadu_ps(v+12);
1364 m0 = _mm_load_ps(v);
1365 m1 = _mm_load_ps(v+4);
1366 m2 = _mm_load_ps(v+8);
1367 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 built from unpack and move shuffles
// NOTE(review): presumably executed only when `transpose` is non-zero; the guarding test is not visible here - confirm
1371 __m128 t0, t1, t2, t3;
1372 t0 = _mm_unpacklo_ps(m0, m1);
1373 t1 = _mm_unpacklo_ps(m2, m3);
1374 t2 = _mm_unpackhi_ps(m0, m1);
1375 t3 = _mm_unpackhi_ps(m2, m3);
1376 m0 = _mm_movelh_ps(t0, t1);
1377 m1 = _mm_movehl_ps(t1, t0);
1378 m2 = _mm_movelh_ps(t2, t3);
1379 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: command->val and these offsets of dpsoftrast.uniform4f must be 16-byte aligned
1381 _mm_store_ps(command->val, m0);
1382 _mm_store_ps(command->val+4, m1);
1383 _mm_store_ps(command->val+8, m2);
1384 _mm_store_ps(command->val+12, m3);
1385 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1386 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1387 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1388 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1393 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1394 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1396 thread->uniform1i[command->index] = command->val;
1398 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1400 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1401 command->index = index;
1404 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float vectors from a strided byte source into the packed array dst.
// dst is always written with aligned stores. src is read with unaligned loads unless both
// the source address and the stride are multiples of ALIGN_SIZE.
1408 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1410 float *end = dst + size*4;
1411 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source path
1415 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned source path
1424 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float vectors from a strided byte source into packed 4-float vectors
// in dst, setting the fourth component to 1.0.
// Each output is built with the same three steps: rotate so the lane to replace is lane 0,
// overwrite lane 0 with 1.0 (_mm_move_ss), rotate back so the 1.0 ends up in lane 3.
// NOTE(review): the single-vertex paths load 4 floats per 3-float vertex, so the last
// vertex reads 4 bytes past its own data - confirm the source buffers tolerate that.
1431 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1433 float *end = dst + size*4;
// tightly packed input: 4 vertices = 12 floats = exactly three 128-bit loads
1434 if (stride == sizeof(float[3]))
// end of the part of dst that can be filled four vertices at a time
1436 float *end4 = dst + (size&~3)*4;
1437 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned loads; v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1441 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vertex 0 comes entirely from v1
1442 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1443 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1444 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 1 straddles v1 and v2
1445 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1446 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1447 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 2 straddles v2 and v3
1448 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1449 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1450 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 3 is the upper three lanes of v3
1452 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1453 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1455 src += 4*sizeof(float[3]);
// same four-vertex unpacking, using aligned loads
1462 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1463 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1467 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1468 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1470 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1471 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1474 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476 src += 4*sizeof(float[3]);
// one vertex at a time: unaligned loads unless address and stride are both aligned
1480 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1484 __m128 v = _mm_loadu_ps((const float *)src);
1485 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1486 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1487 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1488 _mm_store_ps(dst, v);
1497 __m128 v = _mm_load_ps((const float *)src);
1498 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1499 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1500 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1501 _mm_store_ps(dst, v);
// Expands `size` 2-float vectors from a strided byte source into packed 4-float vectors
// in dst; the third and fourth components are filled with 0.0 and 1.0.
1508 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1510 float *end = dst + size*4;
// filler vector: its upper two lanes (0, 1) supply the missing components
1511 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: one 128-bit load carries two vertices
1512 if (stride == sizeof(float[2]))
// end of the part of dst that can be filled two vertices at a time
1514 float *end2 = dst + (size&~1)*4;
1515 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1519 __m128 v = _mm_loadu_ps((const float *)src);
// first vertex: low half of v + high half of v2; second vertex: high half of v + high half of v2
1520 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1521 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1523 src += 2*sizeof(float[2]);
1530 __m128 v = _mm_load_ps((const float *)src);
1531 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1532 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1534 src += 2*sizeof(float[2]);
// single vertex: replace the low two lanes of the filler with the two source floats
1540 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte vectors from a strided byte source into packed 4-float vectors
// in dst, scaling each byte by 1/255 so that 255 maps to 1.0.
1546 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1548 float *end = dst + size*4;
1549 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: one 128-bit load carries four vertices
1550 if (stride == sizeof(unsigned char[4]))
// end of the part of dst that can be filled four vertices at a time
1552 float *end4 = dst + (size&~3)*4;
1553 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16 bit (v1 = vertices 0-1, v2 = vertices 2-3), then to 32 bit, then convert and scale
1557 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1558 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1559 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1560 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1561 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1563 src += 4*sizeof(unsigned char[4]);
// same conversion with an aligned load
1570 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1571 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1572 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576 src += 4*sizeof(unsigned char[4]);
// single vertex: the four bytes are fetched through an int-typed read of src
// NOTE(review): this read assumes src is suitably aligned for int on the target - confirm
1582 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1583 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Replicates one 4-float vector into `size` consecutive 4-float slots of dst.
// src is read once with an unaligned load; dst is written with aligned stores and
// therefore has to be 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	float *const stop = dst + 4*size;
	const __m128 value = _mm_loadu_ps(src);
	float *cursor;

	for (cursor = dst; cursor < stop; cursor += 4)
		_mm_store_ps(cursor, value);
}
// Transforms `numitems` 4-float vectors from in4f by the 16-float matrix inmatrix16f and
// writes them to out4f (aligned stores are used elsewhere in this file for such arrays).
// Each result is in.x*m0 + in.y*m1 + in.z*m2 + in.w*m3, where m0..m3 are the four
// consecutive groups of 4 floats in inmatrix16f.
1601 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1604 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1605 __m128 m0, m1, m2, m3;
// the matrix is compared bytewise against identity
1607 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1609 // fast case for identity matrix
// in-place calls (out4f == in4f) need no copy at all
1610 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1613 end = out4f + numitems*4;
// the matrix itself is always read with unaligned loads
1614 m0 = _mm_loadu_ps(inmatrix16f);
1615 m1 = _mm_loadu_ps(inmatrix16f + 4);
1616 m2 = _mm_loadu_ps(inmatrix16f + 8);
1617 m3 = _mm_loadu_ps(inmatrix16f + 12);
1618 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1622 __m128 v = _mm_loadu_ps(in4f);
1624 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1625 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1626 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1627 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input path, same arithmetic
1636 __m128 v = _mm_load_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f.
// An in-place call (out4f == in4f) or a non-positive count copies nothing: memcpy is
// undefined for overlapping regions, and a negative count would wrap to a huge size.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (out4f == in4f || numitems <= 0)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
// perspective-divides the clip-space vector `in` and maps it through the viewport.
// The vector is rotated to (w, x, y, z), lane 0 is replaced by 1.0, every lane is scaled,
// divided by w and offset, then rotated back. The result is
// (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w), which means viewportcenter and
// viewportscale are expected in the rotated lane order (lane 0 belongs to the 1/w term).
1655 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1657 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1658 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1659 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1660 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_PROJECTY: the visible body is the same sequence of operations as
// DPSOFTRAST_PROJECTVERTEX above.
1663 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1665 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1666 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1667 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1668 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, i.e. a 4x4 matrix times vector product with
// m0..m3 as the four vectors that the components of p scale.
// NOTE(review): p is presumably a local copy of `in`; its declaration is not visible here - confirm.
1671 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1674 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1675 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1676 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1677 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y extent of the bounding box spanned by minpos/maxpos after
// transformation by m0..m3, and writes it to *starty / *endy.
// The eight box corners are numbered so that bit 0 selects max x, bit 1 max y, bit 2 max z.
// BBFRONT transforms one corner; if it lies on the visible side of the plane z + w >= 0 its
// bit is cleared in clipmask and its projected Y (y / w) widens the min/max range.
// BBCLIP handles a corner that stayed behind that plane: for every neighbour along an edge
// (k^1, k^2, k^4) that is in front, the edge is intersected with the plane
// (frac = d_k / (d_k - d_neighbour)) and the projected intersection widens the range.
// NOTE(review): the return statement is not visible here; callers store the result in
// dpsoftrast.drawclipped, presumably clipmask - confirm.
1680 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// every corner starts out flagged as clipped
1682 int clipmask = 0xFF;
// lane 0 of minproj/maxproj starts outside the clamp range [-2, 2] applied further down
1683 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of the matrix vectors so the transformed Y lands in lane 0,
// where the scalar (_ss) operations below work
1684 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1685 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1686 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1687 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, test it against z + w >= 0, accumulate projected Y
1688 #define BBFRONT(k, pos) \
1690 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1691 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1692 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1695 clipmask &= ~(1<<k); \
1696 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1697 minproj = _mm_min_ss(minproj, proj); \
1698 maxproj = _mm_max_ss(maxproj, proj); \
1702 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1703 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1704 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1705 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1706 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1707 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1711 if (clipmask&(1<<k)) \
1713 if (!(clipmask&(1<<(k^1)))) \
1715 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1716 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1717 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1718 minproj = _mm_min_ss(minproj, proj); \
1719 maxproj = _mm_max_ss(maxproj, proj); \
1721 if (!(clipmask&(1<<(k^2)))) \
1723 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1724 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1725 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1726 minproj = _mm_min_ss(minproj, proj); \
1727 maxproj = _mm_max_ss(maxproj, proj); \
1729 if (!(clipmask&(1<<(k^4)))) \
1731 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1732 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1733 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1734 minproj = _mm_min_ss(minproj, proj); \
1735 maxproj = _mm_max_ss(maxproj, proj); \
1739 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element 2 of the viewport vectors into lane 0 for the scalar math below
1740 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1741 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to screen space
1742 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1743 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1744 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1745 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from the maximum and endy from the minimum (truncated, endy one past)
// NOTE(review): this ordering presumably reflects a negative Y viewport scale - confirm
1746 *starty = _mm_cvttss_si32(maxproj);
1747 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` 4-float vectors without applying a matrix: each input vector is
// stored unchanged to out4f, its viewport projection is stored to screen4f, and the
// component-wise min/max of the inputs is tracked.
// Returns the result of DPSOFTRAST_Vertex_BoundY run on that bounding box with an identity
// matrix, which also fills *starty / *endy.
// NOTE(review): the first vector is loaded to seed minpos/maxpos before the loop is
// visible, so in4f appears to be read even when numitems is 0 - confirm.
1752 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1755 float *end = out4f + numitems*4;
1756 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1757 __m128 minpos, maxpos;
1758 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1760 minpos = maxpos = _mm_loadu_ps(in4f);
1763 __m128 v = _mm_loadu_ps(in4f);
1764 minpos = _mm_min_ps(minpos, v);
1765 maxpos = _mm_max_ps(maxpos, v);
1766 _mm_store_ps(out4f, v);
1767 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1768 _mm_store_ps(screen4f, v);
// aligned input path, same work
1776 minpos = maxpos = _mm_load_ps(in4f);
1779 __m128 v = _mm_load_ps(in4f);
1780 minpos = _mm_min_ps(minpos, v);
1781 maxpos = _mm_max_ps(maxpos, v);
1782 _mm_store_ps(out4f, v);
1783 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1784 _mm_store_ps(screen4f, v);
// the bounding box is evaluated with an identity matrix
1791 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1792 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1793 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1794 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1795 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` 4-float vectors by inmatrix16f and projects them: the transformed
// vector goes to out4f and its viewport projection to screen4f. The min/max box is
// tracked on the UNtransformed inputs and handed, with the matrix, to
// DPSOFTRAST_Vertex_BoundY, whose result is returned and which fills *starty / *endy.
// An identity matrix is detected bytewise and delegated to DPSOFTRAST_Vertex_Project.
1800 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1803 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1806 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808 end = out4f + numitems*4;
1809 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always read with unaligned loads
1811 m0 = _mm_loadu_ps(inmatrix16f);
1812 m1 = _mm_loadu_ps(inmatrix16f + 4);
1813 m2 = _mm_loadu_ps(inmatrix16f + 8);
1814 m3 = _mm_loadu_ps(inmatrix16f + 12);
1815 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1817 minpos = maxpos = _mm_loadu_ps(in4f);
1820 __m128 v = _mm_loadu_ps(in4f);
1821 minpos = _mm_min_ps(minpos, v);
1822 maxpos = _mm_max_ps(maxpos, v);
1823 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824 _mm_store_ps(out4f, v);
1825 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826 _mm_store_ps(screen4f, v);
// aligned input path, same work
1834 minpos = maxpos = _mm_load_ps(in4f);
1837 __m128 v = _mm_load_ps(in4f);
1838 minpos = _mm_min_ps(minpos, v);
1839 maxpos = _mm_max_ps(maxpos, v);
1840 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
1850 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads vertex attribute `inarray` for the current draw range (dpsoftrast.firstvertex,
// dpsoftrast.numvertices) into dpsoftrast.post_array4f[outarray], expanded to 4 floats
// per vertex by one of the DPSOFTRAST_Load*To4f / Fill4f helpers.
// NOTE(review): the switch/case labels and the return statement are not visible here;
// callers use the return value as the loaded array, presumably outf - confirm.
1855 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1857 float *outf = dpsoftrast.post_array4f[outarray];
1858 const unsigned char *inb;
1859 int firstvertex = dpsoftrast.firstvertex;
1860 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex
1864 case DPSOFTRAST_ARRAY_POSITION:
1865 stride = dpsoftrast.stride_vertex;
1866 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1867 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: a float array is preferred, then a byte array
1869 case DPSOFTRAST_ARRAY_COLOR:
1870 stride = dpsoftrast.stride_color;
1871 if (dpsoftrast.pointer_color4f)
1873 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1874 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1876 else if (dpsoftrast.pointer_color4ub)
// stride was already set to stride_color above; this repeats the same assignment
1878 stride = dpsoftrast.stride_color;
1879 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1880 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// NOTE(review): presumably the fallback when neither color array is set - confirm
1884 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinates: the slot is inarray - DPSOFTRAST_ARRAY_TEXCOORD0
1888 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1889 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1891 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
// the component count recorded for the slot selects the 2-, 3- or 4-float loader
1892 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1895 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1898 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1901 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a vertex array in place by inmatrix16f.
// With inarray >= 0 the attribute is first loaded into post_array4f[outarray] via
// DPSOFTRAST_Array_Load; otherwise the existing contents of post_array4f[outarray] are used.
// NOTE(review): the return statement is not visible here, presumably `data` - confirm.
1910 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1912 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1913 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a vertex array without a matrix: screen coordinates go to
// dpsoftrast.screencoord4f, the Y extent to dpsoftrast.drawstarty / drawendy, and the
// result of DPSOFTRAST_Vertex_Project is stored in dpsoftrast.drawclipped.
// With inarray >= 0 the attribute is loaded first via DPSOFTRAST_Array_Load.
// NOTE(review): the return statement is not visible here, presumably `data` - confirm.
1918 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1920 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1921 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a vertex array in place by inmatrix16f and projects it: screen coordinates go
// to dpsoftrast.screencoord4f, the Y extent to dpsoftrast.drawstarty / drawendy, and the
// result of DPSOFTRAST_Vertex_TransformProject is stored in dpsoftrast.drawclipped.
// With inarray >= 0 the attribute is loaded first via DPSOFTRAST_Array_Load.
// NOTE(review): the return statement is not visible here, presumably `data` - confirm.
1926 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1928 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1929 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Produces interpolated reciprocal-w values for the pixels startx..endx-1 of a span.
// w is evaluated from the triangle's w[] coefficients (w[0] per-x slope, w[1] per-y slope,
// w[2] constant) at the span origin. Exact reciprocals are computed only at sub-span
// boundaries DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; in between, z is stepped linearly by dz.
// NOTE(review): the statement writing into zf is not visible here; presumably zf[x]
// receives z inside the inner loop - confirm.
1933 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1936 int startx = span->startx;
1937 int endx = span->endx;
1938 float wslope = triangle->w[0];
1939 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel; later iterations reuse the previous endpoint
1940 float endz = 1.0f / (w + wslope * startx);
1941 for (x = startx;x < endx;)
1943 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span ends on the final pixel of the span
1945 if(nextsub >= endx) nextsub = endsub = endx-1;
1946 endz = 1.0f / (w + wslope * nextsub);
// linear step between the two exact endpoints; 0 for a single-pixel sub-span
1947 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1948 for (; x <= endsub; x++, z += dz)
// Writes the float colors of a span (in4f, 4 floats per pixel) into the 8-bit color
// framebuffer using the thread's current blend mode.
// in4f components 0 and 2 are swapped on the way out: component 2 goes to byte 0 of the
// pixel, component 1 to byte 1, component 0 to byte 2 and component 3 (alpha) to byte 3.
// All modes clamp results at 255; only SUBALPHA also clamps at 0.
// NOTE(review): the per-pixel pixelmask test inside the blend loops is not visible here - confirm.
1953 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1956 int startx = span->startx;
1957 int endx = span->endx;
1960 unsigned char * RESTRICT pixelmask = span->pixelmask;
1961 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// move to the span's first pixel (4 bytes per pixel); x below is relative to it
1964 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1965 // handle alphatest now (this affects depth writes too)
// pixels with alpha below the fixed threshold 0.5 are removed from the mask;
// thread->alphafunc and thread->alphavalue are not consulted here
1966 if (thread->alphatest)
1967 for (x = startx;x < endx;x++)
1968 if (in4f[x*4+3] < 0.5f)
1969 pixelmask[x] = false;
1970 // FIXME: this does not handle bigendian
1971 switch(thread->fb_blendmode)
// OPAQUE: dst = src * 255
1973 case DPSOFTRAST_BLENDMODE_OPAQUE:
1974 for (x = startx;x < endx;x++)
1978 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1979 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1980 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1981 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1982 pixel[x*4+0] = d[0];
1983 pixel[x*4+1] = d[1];
1984 pixel[x*4+2] = d[2];
1985 pixel[x*4+3] = d[3];
// ALPHA: dst = src * alpha * 255 + dst * (1 - alpha)
1988 case DPSOFTRAST_BLENDMODE_ALPHA:
1989 for (x = startx;x < endx;x++)
1993 a = in4f[x*4+3] * 255.0f;
1994 b = 1.0f - in4f[x*4+3];
1995 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1996 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1997 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1998 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1999 pixel[x*4+0] = d[0];
2000 pixel[x*4+1] = d[1];
2001 pixel[x*4+2] = d[2];
2002 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src * alpha * 255 + dst
2005 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2006 for (x = startx;x < endx;x++)
2010 a = in4f[x*4+3] * 255.0f;
2011 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2012 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2013 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2014 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2015 pixel[x*4+0] = d[0];
2016 pixel[x*4+1] = d[1];
2017 pixel[x*4+2] = d[2];
2018 pixel[x*4+3] = d[3];
// ADD: dst = src * 255 + dst
2021 case DPSOFTRAST_BLENDMODE_ADD:
2022 for (x = startx;x < endx;x++)
2026 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2027 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2028 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2029 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2030 pixel[x*4+0] = d[0];
2031 pixel[x*4+1] = d[1];
2032 pixel[x*4+2] = d[2];
2033 pixel[x*4+3] = d[3];
// INVMOD: dst = (1 - src) * dst
2036 case DPSOFTRAST_BLENDMODE_INVMOD:
2037 for (x = startx;x < endx;x++)
2041 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2042 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2043 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2044 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2045 pixel[x*4+0] = d[0];
2046 pixel[x*4+1] = d[1];
2047 pixel[x*4+2] = d[2];
2048 pixel[x*4+3] = d[3];
// MUL: dst = src * dst
2051 case DPSOFTRAST_BLENDMODE_MUL:
2052 for (x = startx;x < endx;x++)
2056 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2057 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2058 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2059 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2060 pixel[x*4+0] = d[0];
2061 pixel[x*4+1] = d[1];
2062 pixel[x*4+2] = d[2];
2063 pixel[x*4+3] = d[3];
// MUL2: dst = src * dst * 2
2066 case DPSOFTRAST_BLENDMODE_MUL2:
2067 for (x = startx;x < endx;x++)
2071 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2072 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2073 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2074 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2075 pixel[x*4+0] = d[0];
2076 pixel[x*4+1] = d[1];
2077 pixel[x*4+2] = d[2];
2078 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * alpha * 255, clamped to [0, 255]
2081 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2082 for (x = startx;x < endx;x++)
2086 a = in4f[x*4+3] * -255.0f;
2087 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2088 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2089 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2090 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2091 pixel[x*4+0] = d[0];
2092 pixel[x*4+1] = d[1];
2093 pixel[x*4+2] = d[2];
2094 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - alpha)
// NOTE(review): the assignment of `a` for this mode is not visible here - confirm its value
2097 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2098 for (x = startx;x < endx;x++)
2103 b = 1.0f - in4f[x*4+3];
2104 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2105 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2106 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2107 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2108 pixel[x*4+0] = d[0];
2109 pixel[x*4+1] = d[1];
2110 pixel[x*4+2] = d[2];
2111 pixel[x*4+3] = d[3];
// DPSOFTRAST_Draw_Span_FinishBGRA8: final stage for a span of packed 4-byte pixels.
//   thread   - supplies the alphatest flag and fb_blendmode used below
//   triangle - not referenced by the lines visible in this listing
//   span     - supplies x/y (framebuffer position), startx/endx (pixel range) and pixelmask
//   in4ub    - 4 bytes per pixel, indexed by x; byte 3 is used as alpha
// Pixels whose pixelmask entry is zero are left untouched; the rest are blended
// into dpsoftrast.fb_colorpixels[0] according to thread->fb_blendmode.
2117 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2121 int startx = span->startx;
2122 int endx = span->endx;
2123 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2124 unsigned char * RESTRICT pixelmask = span->pixelmask;
2125 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2126 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// offset both framebuffer views (byte and 32-bit) to pixel (span->x, span->y);
// the x index used below is relative to that position
2129 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2130 pixeli += span->y * dpsoftrast.fb_width + span->x;
2131 // handle alphatest now (this affects depth writes too)
// in4ub holds 8-bit channels, so the 0.5 alpha threshold is 128 here.
// Comparing the byte against 0.5f would only ever reject alpha == 0.
2132 if (thread->alphatest)
2133 for (x = startx;x < endx;x++)
2134 if (in4ub[x*4+3] < 128)
2135 pixelmask[x] = false;
2136 // FIXME: this does not handle bigendian
2137 switch(thread->fb_blendmode)
2139 case DPSOFTRAST_BLENDMODE_OPAQUE:
// four pixels at a time: when all four mask bytes are exactly 1 the whole
// 16-byte group is copied with one unaligned store
// NOTE(review): the mask is read through an unsigned int cast of an
// unsigned char pointer (type punning, possibly unaligned) - confirm acceptable
2140 for (x = startx;x + 4 <= endx;)
2142 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2144 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// ALPHA: dst += (src - dst) * alpha / 256
// FINISHBLEND(blend2, blend1) walks the span two pixels at a time, switching on
// the pair of mask bytes; src and dst are widened to 16 bits per channel, the
// blend2 (two pixels) or blend1 (one pixel) statement computes dst, and the
// result is packed back to bytes with unsigned saturation before being stored.
// A trailing loop handles the last odd pixel with blend1.
2158 case DPSOFTRAST_BLENDMODE_ALPHA:
2159 #define FINISHBLEND(blend2, blend1) \
2160 for (x = startx;x + 2 <= endx;x += 2) \
2163 switch (*(const unsigned short*)&pixelmask[x]) \
2166 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2167 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2169 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2172 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2173 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2175 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2178 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2179 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2181 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2186 for(;x < endx; x++) \
2189 if (!pixelmask[x]) \
2191 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2192 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2194 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2198 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both operands are shifted left by 4 so that mulhi's >>16 yields (src-dst)*alpha/256
2199 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2201 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2202 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2205 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2207 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2208 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2210 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2211 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (clamped to 255 by the saturating pack in FINISHBLEND)
2214 case DPSOFTRAST_BLENDMODE_ADD:
2215 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2217 case DPSOFTRAST_BLENDMODE_INVMOD:
2219 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2221 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2224 case DPSOFTRAST_BLENDMODE_MUL:
2225 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2227 case DPSOFTRAST_BLENDMODE_MUL2:
2228 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2230 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2232 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2233 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2235 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2236 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2239 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2241 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2244 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2245 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// DPSOFTRAST_Draw_Span_Texture2DVarying: samples the 2D texture bound to
// texunitindex across the span and writes four floats per pixel to out4f.
//   thread       - supplies texbound[texunitindex]
//   triangle     - supplies mip[texunitindex] and the attribute data read by DPSOFTRAST_CALCATTRIB4F
//   span         - supplies startx/endx
//   out4f        - output, 4 floats per pixel, indexed by x
//   arrayindex   - which interpolated attribute holds the texture coordinates
//   zf           - per-pixel factor the interpolated coordinates are multiplied by
// Texel bytes 2,1,0,3 are written to output components 0,1,2,3, scaled to 0..1.
// Coordinates are evaluated exactly at sub-span ends and stepped linearly in
// 16.16 fixed point in between.
2252 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2255 int startx = span->startx;
2256 int endx = span->endx;
2261 float tc[2], endtc[2];
2263 unsigned int tci[2];
2264 unsigned int tci1[2];
2265 unsigned int tcimin[2];
2266 unsigned int tcimax[2];
2271 const unsigned char * RESTRICT pixelbase;
2272 const unsigned char * RESTRICT pixel[4];
2273 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2274 // if no texture is bound, just fill it with white
2277 for (x = startx;x < endx;x++)
2279 out4f[x*4+0] = 1.0f;
2280 out4f[x*4+1] = 1.0f;
2281 out4f[x*4+2] = 1.0f;
2282 out4f[x*4+3] = 1.0f;
2286 mip = triangle->mip[texunitindex];
2287 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2288 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the colour is read from texture->bytes rather than pixelbase,
// i.e. without the mipmap[mip][0] offset applied above - confirm intended
2289 if (texture->mipmap[mip][1] == 4)
2291 c[0] = texture->bytes[2] * (1.0f/255.0f);
2292 c[1] = texture->bytes[1] * (1.0f/255.0f);
2293 c[2] = texture->bytes[0] * (1.0f/255.0f);
2294 c[3] = texture->bytes[3] * (1.0f/255.0f);
2295 for (x = startx;x < endx;x++)
2297 out4f[x*4+0] = c[0];
2298 out4f[x*4+1] = c[1];
2299 out4f[x*4+2] = c[2];
2300 out4f[x*4+3] = c[3];
2304 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2305 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2306 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the width and height of this mip level
2307 tcscale[0] = texture->mipmap[mip][2];
2308 tcscale[1] = texture->mipmap[mip][3];
2309 tciwidth = texture->mipmap[mip][2];
// (tcimin is assigned on lines not visible in this listing)
2312 tcimax[0] = texture->mipmap[mip][2]-1;
2313 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 is used as an AND mask for wrapping
// NOTE(review): that is only a valid wrap for power-of-two sizes - confirm
2314 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2315 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 moves the origin to texel centres
2316 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2317 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2318 for (x = startx;x < endx;)
2320 unsigned int subtc[2];
2321 unsigned int substep[2];
2322 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2323 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (short) sub-span: stop at the final pixel and rescale the step to its length
2326 nextsub = endsub = endx-1;
2327 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
// exact coordinate at the end of this sub-span; values in between are stepped linearly
2331 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2332 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): these convert possibly negative floats to unsigned int, which
// the C standard leaves undefined for out-of-range values - confirm acceptable
2333 substep[0] = (endtc[0] - tc[0]) * subscale;
2334 substep[1] = (endtc[1] - tc[1]) * subscale;
2335 subtc[0] = tc[0] * (1<<16);
2336 subtc[1] = tc[1] * (1<<16);
// linear filter, clamp-to-edge addressing
2339 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2341 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// subtc is 16.16 (the integer texel is subtc>>16) and the weights are 12-bit
// (0x1000 - frac), so take the TOP 12 bits of the 16-bit fraction. Masking the
// low 12 bits directly would make the weights wrap 16 times per texel.
2343 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2344 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four bilinear weights sum to 0x1000000
2345 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2346 tci[0] = subtc[0]>>16;
2347 tci[1] = subtc[1]>>16;
2348 tci1[0] = tci[0] + 1;
2349 tci1[1] = tci[1] + 1;
2350 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2351 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2352 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2353 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the 2x2 texel neighbourhood (4 bytes per texel)
2354 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2355 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2356 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2357 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * 0x1000000 normalises the weighted byte sum to 0..1
2358 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2359 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2360 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2361 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2362 out4f[x*4+0] = c[0];
2363 out4f[x*4+1] = c[1];
2364 out4f[x*4+2] = c[2];
2365 out4f[x*4+3] = c[3];
// linear filter, wrapping addressing
2370 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// top 12 bits of the 16-bit fraction, as in the clamp loop
2372 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2373 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2374 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2375 tci[0] = subtc[0]>>16;
2376 tci[1] = subtc[1]>>16;
2377 tci1[0] = tci[0] + 1;
2378 tci1[1] = tci[1] + 1;
2379 tci[0] &= tciwrapmask[0];
2380 tci[1] &= tciwrapmask[1];
2381 tci1[0] &= tciwrapmask[0];
2382 tci1[1] &= tciwrapmask[1];
2383 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2384 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2385 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2386 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2387 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2388 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2389 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2390 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2391 out4f[x*4+0] = c[0];
2392 out4f[x*4+1] = c[1];
2393 out4f[x*4+2] = c[2];
2394 out4f[x*4+3] = c[3];
// single-texel sampling, clamp-to-edge addressing
2398 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2400 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2402 tci[0] = subtc[0]>>16;
2403 tci[1] = subtc[1]>>16;
2404 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2405 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2406 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2407 c[0] = pixel[0][2] * (1.0f / 255.0f);
2408 c[1] = pixel[0][1] * (1.0f / 255.0f);
2409 c[2] = pixel[0][0] * (1.0f / 255.0f);
2410 c[3] = pixel[0][3] * (1.0f / 255.0f);
2411 out4f[x*4+0] = c[0];
2412 out4f[x*4+1] = c[1];
2413 out4f[x*4+2] = c[2];
2414 out4f[x*4+3] = c[3];
// single-texel sampling, wrapping addressing
2419 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2421 tci[0] = subtc[0]>>16;
2422 tci[1] = subtc[1]>>16;
2423 tci[0] &= tciwrapmask[0];
2424 tci[1] &= tciwrapmask[1];
2425 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2426 c[0] = pixel[0][2] * (1.0f / 255.0f);
2427 c[1] = pixel[0][1] * (1.0f / 255.0f);
2428 c[2] = pixel[0][0] * (1.0f / 255.0f);
2429 c[3] = pixel[0][3] * (1.0f / 255.0f);
2430 out4f[x*4+0] = c[0];
2431 out4f[x*4+1] = c[1];
2432 out4f[x*4+2] = c[2];
2433 out4f[x*4+3] = c[3];
2439 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2443 int startx = span->startx;
2444 int endx = span->endx;
2446 __m128 data, slope, tcscale;
2447 __m128i tcsize, tcmask, tcoffset, tcmax;
2449 __m128i subtc, substep, endsubtc;
2452 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2453 const unsigned char * RESTRICT pixelbase;
2454 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2455 // if no texture is bound, just fill it with white
2458 memset(out4ub + startx*4, 255, span->length*4);
2461 mip = triangle->mip[texunitindex];
2462 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2463 // if this mipmap of the texture is 1 pixel, just fill it with that color
2464 if (texture->mipmap[mip][1] == 4)
2466 unsigned int k = *((const unsigned int *)pixelbase);
2467 for (x = startx;x < endx;x++)
2471 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2473 flags = texture->flags;
2474 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2475 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2476 tcscale = _mm_cvtepi32_ps(tcsize);
2477 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2478 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2479 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2480 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2481 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2482 tcmax = _mm_packs_epi32(tcmask, tcmask);
2483 for (x = startx;x < endx;)
2485 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2486 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2489 nextsub = endsub = endx-1;
2490 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2494 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2495 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2496 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2497 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2498 substep = _mm_slli_epi32(substep, 1);
2501 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2502 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2504 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2505 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2507 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2508 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2509 tci = _mm_madd_epi16(tci, tcoffset);
2510 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2511 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2512 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2513 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2514 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2515 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2516 fracm = _mm_srli_epi16(subtc, 1);
2517 pix1 = _mm_add_epi16(pix1,
2518 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2519 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2520 pix3 = _mm_add_epi16(pix3,
2521 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2522 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2523 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2524 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2525 pix2 = _mm_add_epi16(pix2,
2526 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2527 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2528 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2532 const unsigned char * RESTRICT ptr1;
2533 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2534 tci = _mm_madd_epi16(tci, tcoffset);
2535 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2536 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2537 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2538 fracm = _mm_srli_epi16(subtc, 1);
2539 pix1 = _mm_add_epi16(pix1,
2540 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2541 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2542 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2543 pix1 = _mm_add_epi16(pix1,
2544 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2545 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2546 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2550 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2552 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2554 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2555 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2556 tci = _mm_madd_epi16(tci, tcoffset);
2557 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2558 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2559 _mm_setzero_si128());
2560 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2561 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2562 _mm_setzero_si128());
2563 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2564 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2565 tci = _mm_madd_epi16(tci, tcoffset);
2566 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2567 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2568 _mm_setzero_si128());
2569 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2570 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2571 _mm_setzero_si128());
2572 fracm = _mm_srli_epi16(subtc, 1);
2573 pix1 = _mm_add_epi16(pix1,
2574 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576 pix3 = _mm_add_epi16(pix3,
2577 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2578 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2579 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2580 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2581 pix2 = _mm_add_epi16(pix2,
2582 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2583 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2584 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2588 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2589 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590 tci = _mm_madd_epi16(tci, tcoffset);
2591 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593 _mm_setzero_si128());
2594 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596 _mm_setzero_si128());
2597 fracm = _mm_srli_epi16(subtc, 1);
2598 pix1 = _mm_add_epi16(pix1,
2599 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2600 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2601 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2602 pix1 = _mm_add_epi16(pix1,
2603 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2604 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2605 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2611 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2613 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2614 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2615 tci = _mm_madd_epi16(tci, tcoffset);
2616 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2617 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2618 _mm_setzero_si128());
2619 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2620 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2621 _mm_setzero_si128());
2622 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2623 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2624 tci = _mm_madd_epi16(tci, tcoffset);
2625 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2626 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2627 _mm_setzero_si128());
2628 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2629 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2630 _mm_setzero_si128());
2631 fracm = _mm_srli_epi16(subtc, 1);
2632 pix1 = _mm_add_epi16(pix1,
2633 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635 pix3 = _mm_add_epi16(pix3,
2636 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2637 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2638 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2639 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2640 pix2 = _mm_add_epi16(pix2,
2641 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2642 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2643 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2647 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2648 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649 tci = _mm_madd_epi16(tci, tcoffset);
2650 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652 _mm_setzero_si128());
2653 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655 _mm_setzero_si128());
2656 fracm = _mm_srli_epi16(subtc, 1);
2657 pix1 = _mm_add_epi16(pix1,
2658 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2659 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2660 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2661 pix1 = _mm_add_epi16(pix1,
2662 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2663 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2664 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2671 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2673 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2675 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2676 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2677 tci = _mm_madd_epi16(tci, tcoffset);
2678 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2679 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2683 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2684 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2685 tci = _mm_madd_epi16(tci, tcoffset);
2686 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2692 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2694 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2695 tci = _mm_and_si128(tci, tcmax);
2696 tci = _mm_madd_epi16(tci, tcoffset);
2697 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2698 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2702 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2703 tci = _mm_and_si128(tci, tcmax);
2704 tci = _mm_madd_epi16(tci, tcoffset);
2705 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8: cube-map counterpart of the 2D
// BGRA8 texture sampler. The only statement visible here fills span->length
// pixels (4 bytes each) of out4ub with 255, i.e. opaque white.
// NOTE(review): the fill starts at out4ub itself, whereas the 2D BGRA8 sampler's
// white fill starts at out4ub + startx*4 - confirm which offset is intended.
2714 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2717 memset(out4ub, 255, span->length*4);
// DPSOFTRAST_SampleShadowmap: takes a float vector and returns a float.
// NOTE(review): the body is not visible in this listing, so the meaning of the
// return value cannot be stated from here - confirm against the full source.
2720 float DPSOFTRAST_SampleShadowmap(const float *vector)
// DPSOFTRAST_Draw_Span_MultiplyVarying: multiplies each pixel of in4f by the
// interpolated 4-component attribute arrayindex and stores the result in out4f.
//   triangle, span - attribute source for DPSOFTRAST_CALCATTRIB4F; span gives startx/endx
//   out4f, in4f    - 4 floats per pixel, indexed by x (not RESTRICT-qualified)
//   zf             - per-pixel factor array
2726 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2729 int startx = span->startx;
2730 int endx = span->endx;
2735 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2736 for (x = startx;x < endx;x++)
// attribute value at x is the linear term data + slope*x scaled by z
// (z is assigned on a line not visible here; presumably zf[x] - confirm)
2739 c[0] = (data[0] + slope[0]*x) * z;
2740 c[1] = (data[1] + slope[1]*x) * z;
2741 c[2] = (data[2] + slope[2]*x) * z;
2742 c[3] = (data[3] + slope[3]*x) * z;
// component-wise modulation of the input colour
2743 out4f[x*4+0] = in4f[x*4+0] * c[0];
2744 out4f[x*4+1] = in4f[x*4+1] * c[1];
2745 out4f[x*4+2] = in4f[x*4+2] * c[2];
2746 out4f[x*4+3] = in4f[x*4+3] * c[3];
// DPSOFTRAST_Draw_Span_Varying: writes the interpolated 4-component attribute
// arrayindex for every pixel of the span to out4f.
//   triangle, span - attribute source for DPSOFTRAST_CALCATTRIB4F; span gives startx/endx
//   out4f          - 4 floats per pixel, indexed by x
//   zf             - per-pixel factor array
2750 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2753 int startx = span->startx;
2754 int endx = span->endx;
2759 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2760 for (x = startx;x < endx;x++)
// attribute value at x is the linear term data + slope*x scaled by z
// (z is assigned on a line not visible here; presumably zf[x] - confirm)
2763 c[0] = (data[0] + slope[0]*x) * z;
2764 c[1] = (data[1] + slope[1]*x) * z;
2765 c[2] = (data[2] + slope[2]*x) * z;
2766 c[3] = (data[3] + slope[3]*x) * z;
2767 out4f[x*4+0] = c[0];
2768 out4f[x*4+1] = c[1];
2769 out4f[x*4+2] = c[2];
2770 out4f[x*4+3] = c[3];
// DPSOFTRAST_Draw_Span_AddBloom: out = ina + max(inb - subcolor, 0), per component,
// for every pixel in [span->startx, span->endx).
//   out4f, ina4f, inb4f - 4 floats per pixel, indexed by x
//   subcolor            - 4 floats subtracted from inb before it is added
//   triangle            - not referenced by the visible lines
2774 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2776 int x, startx = span->startx, endx = span->endx;
2777 float c[4], localcolor[4];
// copy subcolor to locals before the loop
2778 localcolor[0] = subcolor[0];
2779 localcolor[1] = subcolor[1];
2780 localcolor[2] = subcolor[2];
2781 localcolor[3] = subcolor[3];
2782 for (x = startx;x < endx;x++)
// subtract the threshold colour and clamp negatives to zero
2784 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2785 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2786 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2787 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2788 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2789 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2790 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2791 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// DPSOFTRAST_Draw_Span_MultiplyBuffers: out = ina * inb, component-wise, for every
// pixel in [span->startx, span->endx). All buffers hold 4 floats per pixel,
// indexed by x. triangle is not referenced by the visible lines.
2795 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2797 int x, startx = span->startx, endx = span->endx;
2798 for (x = startx;x < endx;x++)
2800 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2801 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2802 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2803 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// DPSOFTRAST_Draw_Span_AddBuffers: out = ina + inb, component-wise, for every
// pixel in [span->startx, span->endx). All buffers hold 4 floats per pixel,
// indexed by x; no clamping is applied. triangle is not referenced by the
// visible lines.
2807 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2809 int x, startx = span->startx, endx = span->endx;
2810 for (x = startx;x < endx;x++)
2812 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2813 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2814 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2815 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// DPSOFTRAST_Draw_Span_MixBuffers: out = ina * a + inb * b for every pixel in
// [span->startx, span->endx), where a = 1 - (component 3 of inb).
// All buffers hold 4 floats per pixel, indexed by x. triangle is not referenced
// by the visible lines.
2819 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2821 int x, startx = span->startx, endx = span->endx;
2823 for (x = startx;x < endx;x++)
// weight of ina is the inverse of inb's fourth component
// (b is assigned on a line not visible here; presumably inb4f[x*4+3] - confirm)
2825 a = 1.0f - inb4f[x*4+3];
2827 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2828 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2829 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2830 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// DPSOFTRAST_Draw_Span_MixUniformColor: blends every pixel of in4f towards a
// constant colour: out = in * (1 - color[3]) + color * color[3].
//   out4f, in4f - 4 floats per pixel, indexed by x
//   color       - 4 floats; component 3 is the blend weight and is itself blended
//   triangle    - not referenced by the visible lines
2834 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2836 int x, startx = span->startx, endx = span->endx;
2837 float localcolor[4], ilerp, lerp;
// copy the colour to locals before the loop
2838 localcolor[0] = color[0];
2839 localcolor[1] = color[1];
2840 localcolor[2] = color[2];
2841 localcolor[3] = color[3];
// lerp = weight of the uniform colour, ilerp = weight of the input pixel
2842 ilerp = 1.0f - localcolor[3];
2843 lerp = localcolor[3];
2844 for (x = startx;x < endx;x++)
2846 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2847 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2848 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2849 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8: SSE2 routine that multiplies each
// packed 4-byte pixel of in4ub by the interpolated 4-component attribute
// arrayindex and stores the result in out4ub.
//   triangle, span - attribute source for DPSOFTRAST_CALCATTRIB; span gives startx/endx
//   out4ub, in4ub  - 4 bytes per pixel, indexed by x
//   zf             - per-pixel factor the interpolated attribute is multiplied by
// Pixels are handled two at a time with a single-pixel tail loop.
2855 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2859 int startx = span->startx;
2860 int endx = span->endx;
2862 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// reorder attribute components 0,1,2,3 -> 2,1,0,3 to match the pixel byte order
2863 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2864 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// start at startx and pre-scale by 256: the modulator becomes 8.8 fixed point
2865 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2866 data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2867 slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
// data is advanced once inside the body and once in the for-increment,
// i.e. two pixels per iteration
2868 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
// unpacking with zero as the LOW byte puts each channel in the high byte of a
// 16-bit lane (value*256), so mulhi with the 8.8 modulator gives value*modulator
2870 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2871 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2872 data = _mm_add_ps(data, slope);
2873 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
// NOTE(review): _mm_packs_epi32 saturates to signed 16 bits while the multiply
// below is unsigned, so a negative modulator would act as a large positive
// one - confirm inputs are non-negative
2874 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2875 pix = _mm_mulhi_epu16(pix, mod);
// pack back to bytes with unsigned saturation and store two pixels
2876 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// tail: at most one remaining pixel
2878 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2880 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2881 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2882 mod = _mm_packs_epi32(mod, mod);
2883 pix = _mm_mulhi_epu16(pix, mod);
2884 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute as BGRA8 pixels:
//   out4ub = attribute[arrayindex] * zf * 255   for pixels [span->startx, span->endx)
// NOTE(review): this listing omits some lines of the function (braces and the
// declarations of x, data and slope); only the visible statements are described.
2889 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2893 int startx = span->startx;
2894 int endx = span->endx;
// DPSOFTRAST_CALCATTRIB is defined elsewhere; presumably it fills data/slope with the
// attribute value and its per-pixel x gradient for this span -- TODO confirm
2896 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA attribute order -> BGRA byte order)
2897 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2898 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// step to the first pixel, then pre-scale by 255 so that 1.0 converts to byte value 255
2899 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2900 data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2901 slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
// main loop: two pixels per iteration (data is stepped once inside and once in the for)
2902 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2904 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2905 data = _mm_add_ps(data, slope);
2906 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
// 32-bit -> 16-bit (signed saturation), both pixels side by side, then -> 8-bit (unsigned saturation)
2907 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2908 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// tail loop: remaining pixel, written as one 32-bit BGRA value
2910 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2912 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2913 pix = _mm_packs_epi32(pix, pix);
2914 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Bloom combine on BGRA8 spans: out4ub = saturate(ina4ub + (inb4ub - subcolor*255))
// for pixels [span->startx, span->endx).  'triangle' is unused.
// NOTE(review): this listing omits some lines of the function (braces and whatever
// guards the single-pixel tail); only the visible statements are described.
2919 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2922 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 integers with lanes 0 and 2 swapped (presumably RGBA -> BGRA)
2923 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrowed to 16-bit lanes, the same four values in both 64-bit halves (one per pixel)
2924 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// main loop: two pixels per iteration
2925 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes (values 0..255)
2927 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2928 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// the subtraction may go negative; only the final packus clamps the sum to 0..255
2929 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2930 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail: same arithmetic on one 32-bit BGRA value
2934 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2935 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2936 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2937 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Multiplies two BGRA8 spans: out4ub = ina4ub * inb4ub / 256 per channel,
// for pixels [span->startx, span->endx).  'triangle' is unused.
// NOTE(review): this listing omits some lines of the function (braces and whatever
// guards the single-pixel tail); only the visible statements are described.
2942 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2945 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
2946 for (x = startx;x+2 <= endx;x+=2)
// ina widened to 0..255 in each 16-bit lane; inb placed in the high byte (value << 8)
2948 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2949 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// (ina * (inb << 8)) >> 16 == ina * inb / 256
2950 pix1 = _mm_mulhi_epu16(pix1, pix2);
2951 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail: same arithmetic on one 32-bit BGRA value
2955 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2956 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2957 pix1 = _mm_mulhi_epu16(pix1, pix2);
2958 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Saturating add of two BGRA8 spans: out4ub = min(ina4ub + inb4ub, 255) per channel,
// for pixels [span->startx, span->endx).  'triangle' is unused.
// NOTE(review): this listing omits some lines of the function (braces and whatever
// guards the single-pixel tail); only the visible statements are described.
2963 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2966 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
2967 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum cannot wrap before it is clamped
2969 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2970 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2971 pix1 = _mm_add_epi16(pix1, pix2);
// packus clamps each 16-bit sum to 0..255
2972 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail: same arithmetic on one 32-bit BGRA value
2976 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2977 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2978 pix1 = _mm_add_epi16(pix1, pix2);
2979 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second buffer to the first: out4ub = saturate(ina4ub + inb4ub * tint)
// per channel, for pixels [span->startx, span->endx).  'triangle' is unused.
// The tint is used in the lane order given (no lane swap is applied here).
// NOTE(review): this listing omits some lines of the function (braces and whatever
// guards the single-pixel tail); only the visible statements are described.
2984 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2987 int x, startx = span->startx, endx = span->endx;
// tint scaled so that 1.0 becomes integer 256
2988 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrowed to 16-bit lanes, the same four values in both 64-bit halves (one per pixel)
2989 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
// main loop: two pixels per iteration
2990 for (x = startx;x+2 <= endx;x+=2)
// ina widened to 0..255 in each 16-bit lane; inb placed in the high byte (value << 8)
2992 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2993 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// (tint * (inb << 8)) >> 16 == inb * tint / 256, added to ina and clamped by packus
2994 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2995 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail: same arithmetic on one 32-bit BGRA value
2999 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3000 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3001 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3002 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha-blends two BGRA8 spans using the alpha byte of inb4ub:
//   out4ub = ina4ub + (inb4ub - ina4ub) * inb.a / 256
// for pixels [span->startx, span->endx).  'triangle' is unused.
// NOTE(review): this listing omits some lines of the function (braces and whatever
// guards the single-pixel tail); only the visible statements are described.
3007 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3010 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3011 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes (values 0..255)
3013 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3014 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth lane (alpha of inb) across that pixel's four lanes
3015 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// signed multiply: ((diff << 4) * (alpha << 4)) >> 16 == diff * alpha / 256
3016 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3017 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail: only the low four lanes carry data, so one shufflelo is enough
3021 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3022 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3023 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3024 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3025 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span toward one color, weighted by that color's alpha:
//   out4ub = in4ub + (color*255 - in4ub) * (color.a*255) / 256
// for pixels [span->startx, span->endx).  'triangle' is unused.
// NOTE(review): this listing omits some lines of the function (braces and whatever
// guards the single-pixel tail); only the visible statements are described.
3030 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3033 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 integers with lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3034 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrowed to 16-bit lanes, the same four values in both 64-bit halves (one per pixel)
3035 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// alpha lane broadcast to every lane and pre-shifted by 4 for the fixed-point multiply below
3036 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// main loop: two pixels per iteration
3037 for (x = startx;x+2 <= endx;x+=2)
3039 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// signed multiply: ((diff << 4) * (alpha << 4)) >> 16 == diff * alpha / 256
3040 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3041 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel tail: same arithmetic on one 32-bit BGRA value
3045 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3046 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3047 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3054 void DPSOFTRAST_VertexShader_Generic(void)
3056 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3057 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3058 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3059 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3060 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3063 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3065 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3066 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3067 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3068 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3069 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3070 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3072 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3073 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3074 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3076 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3077 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3080 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3082 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3085 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3087 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3090 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3095 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3096 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3101 void DPSOFTRAST_VertexShader_PostProcess(void)
3103 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3104 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3105 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3108 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3110 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3111 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3112 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3113 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3114 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3115 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3116 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3118 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3119 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3121 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3122 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3124 // TODO: implement saturation
3126 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3128 // TODO: implement gammaramps
3130 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3135 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3137 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3140 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3142 // this is never called (because colormask is off when this shader is used)
3143 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3144 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3145 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3146 memset(buffer_FragColorbgra8, 0, span->length*4);
3147 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3152 void DPSOFTRAST_VertexShader_FlatColor(void)
3154 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3155 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3158 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3160 int x, startx = span->startx, endx = span->endx;
3161 int Color_Ambienti[4];
3162 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3163 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3164 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3165 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3166 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3167 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3168 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3169 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3170 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3171 for (x = startx;x < endx;x++)
3173 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3174 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3175 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3176 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3178 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3183 void DPSOFTRAST_VertexShader_VertexColor(void)
3185 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3186 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3187 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader.  Per pixel it computes
//   pixel = texture(GL20TU_COLOR) * (Color_Ambient + Color_Diffuse * vertexcolor)
// in 16-bit fixed point; the alpha lane uses the Alpha uniform in place of ambient
// alpha and has no diffuse term.
// NOTE(review): this listing omits some lines of the function (braces, some
// declarations, and the statements between the four-pixel store and the
// single-pixel path); only the visible statements are described.
3190 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3193 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default write straight into the first color buffer at this span's position
3194 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3195 int x, startx = span->startx, endx = span->endx;
3196 __m128i Color_Ambientm, Color_Diffusem;
3198 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3199 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3200 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3201 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3202 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3203 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or any non-opaque blend mode: render into the temporary buffer and let
// DPSOFTRAST_Draw_Span_FinishBGRA8 (called at the end) take it from there
3204 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3205 pixel = buffer_FragColorbgra8;
// ambient color * 256 with lanes 0 and 2 swapped; lane 3 is replaced by Alpha * 255;
// then narrowed to 16-bit lanes (the same four values in both halves)
3206 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3207 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3208 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3209 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color * 4096 with lanes 0 and 2 swapped and lane 3 cleared, narrowed to 16 bits
3210 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3211 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3212 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// DPSOFTRAST_CALCATTRIB is defined elsewhere; presumably it fills data/slope with the
// color attribute and its per-pixel x gradient for this span -- TODO confirm
3213 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// same lane swap as the uniforms, step to the first pixel, and scale by 4096 so that
// (diffuse*4096 * color*4096) >> 16 leaves the product scaled by 256
3214 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3215 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3216 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3217 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3218 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3219 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3221 __m128i color, mod, pix;
// fast path: four pixels at once when all four mask bytes are 1
3222 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3225 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3226 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// vertex color for each of the four pixels: data times that pixel's buffer_z entry,
// with data stepped by slope between pixels
3227 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3228 data = _mm_add_ps(data, slope);
3229 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3230 data = _mm_add_ps(data, slope);
3231 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3232 data = _mm_add_ps(data, slope);
3233 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * (texel << 8) >> 16, pixels 0-1 then pixels 2-3
3234 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3235 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3236 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3237 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
// packus clamps to 0..255 and stores all four pixels
3238 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one 32-bit BGRA texel
3244 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3245 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3246 mod = _mm_packs_epi32(mod, mod);
3247 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3248 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// when rendering went to the temporary buffer, pass it on for finishing
3250 if(pixel == buffer_FragColorbgra8)
3251 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3257 void DPSOFTRAST_VertexShader_Lightmap(void)
3259 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3260 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3261 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader.  Per pixel it computes
//   pixel = color * (Color_Ambient + Color_Diffuse * lightmap)
// and, with the GLOW permutation, adds  Color_Glow * glow.
// The alpha lane uses the Alpha uniform in place of ambient alpha and has no
// diffuse or glow term.
// NOTE(review): this listing omits some lines of the function (braces, some
// declarations, and the statements between each four-pixel store and the
// single-pixel path); only the visible statements are described.
3264 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3267 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default write straight into the first color buffer at this span's position
3268 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3269 int x, startx = span->startx, endx = span->endx;
3270 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3271 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3272 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3273 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3274 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3275 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture is sampled with texcoord0, the lightmap with texcoord4
3277 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3278 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or any non-opaque blend mode: render into the temporary buffer and let
// DPSOFTRAST_Draw_Span_FinishBGRA8 (called at the end) take it from there
3279 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3280 pixel = buffer_FragColorbgra8;
// ambient color * 256 with lanes 0 and 2 swapped; lane 3 is replaced by Alpha * 255;
// then narrowed to 16-bit lanes (the same four values in both halves)
3281 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3282 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3283 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3284 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color * 256 with lanes 0 and 2 swapped and lane 3 cleared, narrowed to 16 bits
3285 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3286 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3287 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// ---- GLOW permutation: glow texture and glow color are prepared, then the glow loop runs ----
3288 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3290 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color * 256 with lanes 0 and 2 swapped and lane 3 cleared, narrowed to 16 bits
3291 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3292 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3293 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow; used by the single-pixel path below
3294 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3295 for (x = startx;x < endx;x++)
3297 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when all four mask bytes are 1
3298 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3301 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3302 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3303 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, pixels 0-1 then pixels 2-3;
// texels are placed in the high byte of each 16-bit lane before each mulhi
3304 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3305 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3306 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3307 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3308 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3309 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3310 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color is computed in the low half and the glow term in
// the high half of one register (lightmap's high half is zero, so the high half
// reduces to glowcolor * glow); the two halves are then added together
3316 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3317 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3318 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3319 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3320 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3321 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// ---- loop without the glow term ----
3326 for (x = startx;x < endx;x++)
3328 __m128i color, lightmap, pix;
// fast path: four pixels at once when all four mask bytes are 1
3329 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3332 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3333 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * color, pixels 0-1 then pixels 2-3
3334 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3335 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3336 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3337 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3338 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one 32-bit BGRA texel
3344 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3345 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3346 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3347 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// when rendering went to the temporary buffer, pass it on for finishing
3350 if(pixel == buffer_FragColorbgra8)
3351 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3357 void DPSOFTRAST_VertexShader_FakeLight(void)
3359 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3362 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3365 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3366 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3368 memset(buffer_FragColorbgra8, 0, span->length*4);
3369 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light direction maps: delegates to the lightmap
// vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3379 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3381 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light direction maps: delegates to the lightmap
// vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3392 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3394 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3400 void DPSOFTRAST_VertexShader_LightDirection(void)
3403 int numvertices = dpsoftrast.numvertices;
3405 float LightVector[4];
3406 float EyePosition[4];
3407 float EyeVectorModelSpace[4];
3413 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3414 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3415 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3416 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3417 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3418 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3419 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3420 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3421 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3422 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3423 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3424 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3425 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3426 for (i = 0;i < numvertices;i++)
3428 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3429 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3430 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3431 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3432 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3433 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3434 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3435 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3436 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3437 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3438 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3439 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3440 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3441 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3442 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3443 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3444 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3445 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3446 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3447 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3448 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3449 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3450 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3451 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3452 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3453 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3454 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3455 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3456 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3458 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helper macros used by the pixel shaders.
// All of them are plain text substitutions: an argument may be evaluated more
// than once, so do not pass expressions with side effects.

// smaller of a and b (returns b when the two compare equal or unordered)
3461 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
// larger of a and b (returns b when the two compare equal or unordered)
3462 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
3463 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length of the first three elements of v (dot product of v with itself)
3464 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// length of the first three elements of v; goes through sqrt(), not sqrtf()
3465 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// NOTE(review): only the line that computes the vector length is visible in this
// listing; the remaining lines of this macro body are not shown, so how the
// length is applied to v (and any zero-length handling) must be confirmed
// against the full source.
3466 #define DPSOFTRAST_Vector3Normalize(v)\
3469 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection mode: shades the pixels startx..endx-1 of
// one span into buffer_FragColorbgra8 and passes that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread state; supplies uniform4f[] and shader_permutation
// triangle - triangle being drawn (forwarded to the span/texture helpers)
// span     - span being drawn; startx/endx bound every per-pixel loop
//
// Three lighting paths are selected by thread->shader_permutation:
//   SHADERPERMUTATION_SPECULAR - ambient + diffuse + specular (normal and gloss maps)
//   SHADERPERMUTATION_DIFFUSE  - ambient + diffuse (normal map)
//   neither                    - ambient only
// Each path has a variant that adds the glow texture when SHADERPERMUTATION_GLOW is set.
//
// NOTE(review): this listing omits short lines (braces, `else`, and the
// declarations/assignments of z, d, glosstex, eyenormal, diffuse and specular,
// all of which are used below), so the block nesting described in these
// comments is inferred from the visible statements - confirm against the full source.
3480 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, four bytes per pixel for the bgra8 ones
3482 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3483 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3484 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3485 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3486 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3487 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3488 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3489 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3490 int x, startx = span->startx, endx = span->endx;
3491 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// start value and per-x slope of the interpolated light and eye vectors
3492 float LightVectordata[4];
3493 float LightVectorslope[4];
3494 float EyeVectordata[4];
3495 float EyeVectorslope[4];
3497 float diffusetex[4];
3499 float surfacenormal[4];
3500 float lightnormal[4];
3502 float specularnormal[4];
3505 float SpecularPower;
// Color uniforms are copied with components 0 and 2 swapped relative to the
// uniform layout - presumably so they line up with the BGRA byte order of the
// span buffers they are multiplied with below.
3507 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3508 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3509 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3510 Color_Glow[3] = 0.0f;
3511 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3512 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3513 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component carries the Alpha uniform; every output path
// below computes the result alpha as diffusetex[3] * Color_Ambient[3]
3514 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3515 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3516 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3517 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3518 Color_Pants[3] = 0.0f;
3519 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3520 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3521 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3522 Color_Shirt[3] = 0.0f;
// begin the span (buffer_z is handed to every texture fetch afterwards) and
// sample the base color texture using TEXCOORD0
3523 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3524 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional pants/shirt layers, sampled only when colormapping is enabled
3525 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3527 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3528 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional glow layer
3530 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3532 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3534 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3536 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3537 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3538 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3539 Color_Diffuse[3] = 0.0f;
3540 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3541 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3542 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3543 LightColor[3] = 0.0f;
// light vector interpolants come from TEXCOORD1, eye vector interpolants from TEXCOORD2
3544 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3545 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3546 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3547 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3548 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3549 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss alpha byte (0..255)
3550 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3551 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3552 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3553 for (x = startx;x < endx;x++)
// diffuse texel as floats in byte range (0..255)
3556 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3557 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3558 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3559 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add tinted pants/shirt layers; their alpha weights are 0 so alpha is unchanged
3560 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3562 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3563 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3564 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3565 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3567 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3568 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3569 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3570 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// unpack the normal map texel: byte/128 - 1 maps 0..255 to about -1..+1;
// buffer channels 2,1,0 feed components 0,1,2 (same swap as the color uniforms)
3571 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3572 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3573 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3574 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light direction at this x, scaled by z
// NOTE(review): the assignment of z is not visible in this listing - presumably
// a per-pixel value taken from buffer_z; confirm against the full source
3576 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3577 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3578 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3579 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye direction at this x
3581 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3582 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3583 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3584 DPSOFTRAST_Vector3Normalize(eyenormal);
// half-way vector between the light and eye directions
3586 specularnormal[0] = lightnormal[0] + eyenormal[0];
3587 specularnormal[1] = lightnormal[1] + eyenormal[1];
3588 specularnormal[2] = lightnormal[2] + eyenormal[2];
3589 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped at zero before use
3591 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3592 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform scaled by the gloss map alpha (see SpecularPower above)
3593 specular = pow(specular, SpecularPower * glosstex[3]);
// with glow: glow texel * Color_Glow is added on top of the lit color.
// Results are only clamped at the upper bound (255) before being stored as bytes.
3594 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3596 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3597 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3598 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3599 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// without glow: same formula minus the glow term
// (the `else` separating the two variants is not shown in this listing)
3603 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3604 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3605 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3606 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the shaded pixel
3608 buffer_FragColorbgra8[x*4+0] = d[0];
3609 buffer_FragColorbgra8[x*4+1] = d[1];
3610 buffer_FragColorbgra8[x*4+2] = d[2];
3611 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (no specular, no gloss map, no eye vector) ----
// Note: the visible lines of this path do not apply the pants/shirt colormapping.
3614 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3616 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3617 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3618 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3619 Color_Diffuse[3] = 0.0f;
3620 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3621 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3622 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3623 LightColor[3] = 0.0f;
3624 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3625 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3626 for (x = startx;x < endx;x++)
3629 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3630 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3631 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3632 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// unpack and normalize the normal map texel (same mapping as in the specular path)
3633 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3634 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3635 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3636 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated, normalized light direction at this x
3638 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3639 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3640 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3641 DPSOFTRAST_Vector3Normalize(lightnormal);
3643 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// with glow: glow + texel * (ambient + diffuse * light color), clamped to 255
3644 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3646 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3647 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3648 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3649 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// without glow (the `else` line is not shown in this listing)
3653 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3654 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3655 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3656 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3658 buffer_FragColorbgra8[x*4+0] = d[0];
3659 buffer_FragColorbgra8[x*4+1] = d[1];
3660 buffer_FragColorbgra8[x*4+2] = d[2];
3661 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (the `else` introducing it is not shown in this listing) ----
// No normal map or light vector is used; the texel is simply scaled by Color_Ambient.
3666 for (x = startx;x < endx;x++)
3669 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3670 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3671 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3672 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// with glow
3674 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3676 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3677 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3678 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3679 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// without glow
3683 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3684 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3685 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3686 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3688 buffer_FragColorbgra8[x*4+0] = d[0];
3689 buffer_FragColorbgra8[x*4+1] = d[1];
3690 buffer_FragColorbgra8[x*4+2] = d[2];
3691 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finishing routine
3694 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
// For each of dpsoftrast.numvertices vertices it rewrites two post arrays:
//   TEXCOORD1 <- vector from the vertex to LightPosition, expressed in the
//                per-vertex (svector, tvector, normal) basis
//   TEXCOORD2 <- vector from the vertex to EyePosition, expressed in the same basis
// It then projects POSITION with the ModelViewProjection matrix and fills
// TEXCOORD3 by transforming POSITION with the ModelToLight matrix
// (DPSOFTRAST_PixelShader_LightSource reads TEXCOORD3 as its CubeVector).
// Takes no parameters; all inputs and outputs live in the global dpsoftrast state.
//
// NOTE(review): this listing omits short lines (braces and the declarations of
// i, position, svector, tvector, normal and EyeVector, all of which are used
// below); confirm details against the full source.
3699 void DPSOFTRAST_VertexShader_LightSource(void)
3702 int numvertices = dpsoftrast.numvertices;
3703 float LightPosition[4];
3704 float LightVector[4];
3705 float LightVectorModelSpace[4];
3706 float EyePosition[4];
3707 float EyeVectorModelSpace[4];
// fetch the light and eye positions from the uniforms
// (the fourth component of each is copied but not used on any visible line)
3713 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3714 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3715 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3716 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3717 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3718 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3719 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3720 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// populate the post arrays read by the loop below; TEXCOORD0 goes through the
// texture matrix, the others are loaded as-is.
// TEXCOORD4 is loaded here but not referenced again in this function.
3721 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3722 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3723 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3724 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3725 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3726 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3727 for (i = 0;i < numvertices;i++)
// read this vertex's position and its basis vectors:
// TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal
// (they must be read before TEXCOORD1/2 are overwritten further down)
3729 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3730 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3731 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3732 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3733 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3734 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3735 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3736 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3737 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3738 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3739 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3740 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex towards the light
3741 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3742 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3743 LightVectorModelSpace[2] = LightPosition[2] - position[2];
// project it onto svector / tvector / normal (three dot products)
3744 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3745 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3746 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector (w forced to 0) instead of svector
3747 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3748 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3749 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3750 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex towards the eye, projected onto the same basis
3751 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3752 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3753 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3754 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3755 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3756 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (w forced to 0) instead of tvector
3757 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3758 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3759 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3760 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// project the positions, then derive TEXCOORD3 from POSITION via the ModelToLight matrix
// NOTE(review): whether DPSOFTRAST_Array_Transform reads the source POSITION
// array or the just-projected post array is not visible here - confirm in its definition
3762 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3763 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3766 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3769 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3770 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3771 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3772 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3773 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3774 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3775 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3776 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3777 int x, startx = span->startx, endx = span->endx;
3778 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3779 float CubeVectordata[4];
3780 float CubeVectorslope[4];
3781 float LightVectordata[4];
3782 float LightVectorslope[4];
3783 float EyeVectordata[4];
3784 float EyeVectorslope[4];
3786 float diffusetex[4];
3788 float surfacenormal[4];
3789 float lightnormal[4];
3791 float specularnormal[4];
3794 float SpecularPower;
3795 float CubeVector[4];
3798 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3799 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3800 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3801 Color_Glow[3] = 0.0f;
3802 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3803 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3804 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3805 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3806 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3807 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3808 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3809 Color_Diffuse[3] = 0.0f;
3810 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3811 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3812 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3813 Color_Specular[3] = 0.0f;
3814 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3815 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3816 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3817 Color_Pants[3] = 0.0f;
3818 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3819 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3820 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3821 Color_Shirt[3] = 0.0f;
3822 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3823 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3824 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3825 LightColor[3] = 0.0f;
3826 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3827 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3828 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3829 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3830 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3831 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3832 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3833 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3835 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3836 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3838 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3839 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3840 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3842 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3843 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3844 for (x = startx;x < endx;x++)
3847 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3848 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3849 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3850 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3851 if (attenuation < 0.01f)
3853 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3855 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3856 if (attenuation < 0.01f)
3860 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3861 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3862 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3863 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3864 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3866 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3867 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3868 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3869 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3871 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3872 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3873 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3874 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3875 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3876 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3877 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3878 DPSOFTRAST_Vector3Normalize(surfacenormal);
3880 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3881 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3882 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3883 DPSOFTRAST_Vector3Normalize(lightnormal);
3885 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3886 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3887 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3888 DPSOFTRAST_Vector3Normalize(eyenormal);
3890 specularnormal[0] = lightnormal[0] + eyenormal[0];
3891 specularnormal[1] = lightnormal[1] + eyenormal[1];
3892 specularnormal[2] = lightnormal[2] + eyenormal[2];
3893 DPSOFTRAST_Vector3Normalize(specularnormal);
3895 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3896 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3897 specular = pow(specular, SpecularPower * glosstex[3]);
3898 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3900 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3901 attenuation *= (1.0f / 255.0f);
3902 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3903 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3904 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3905 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3909 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3910 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3911 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3912 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3914 buffer_FragColorbgra8[x*4+0] = d[0];
3915 buffer_FragColorbgra8[x*4+1] = d[1];
3916 buffer_FragColorbgra8[x*4+2] = d[2];
3917 buffer_FragColorbgra8[x*4+3] = d[3];
3920 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3922 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3923 for (x = startx;x < endx;x++)
3926 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3927 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3928 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3929 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3930 if (attenuation < 0.01f)
3932 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3934 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3935 if (attenuation < 0.01f)
3939 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3940 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3941 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3942 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3943 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3945 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3946 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3947 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3948 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3950 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3951 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3952 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3953 DPSOFTRAST_Vector3Normalize(surfacenormal);
3955 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3956 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3957 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3958 DPSOFTRAST_Vector3Normalize(lightnormal);
3960 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3961 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3963 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3964 attenuation *= (1.0f / 255.0f);
3965 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3966 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3967 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3968 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3972 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3973 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3974 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3975 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3977 buffer_FragColorbgra8[x*4+0] = d[0];
3978 buffer_FragColorbgra8[x*4+1] = d[1];
3979 buffer_FragColorbgra8[x*4+2] = d[2];
3980 buffer_FragColorbgra8[x*4+3] = d[3];
3985 for (x = startx;x < endx;x++)
3988 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3989 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3990 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3991 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3992 if (attenuation < 0.01f)
3994 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3996 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3997 if (attenuation < 0.01f)
4001 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4002 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4003 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4004 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4005 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4007 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4008 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4009 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4010 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4012 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4014 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4015 attenuation *= (1.0f / 255.0f);
4016 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4017 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4018 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4019 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4023 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4024 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4025 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4026 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4028 buffer_FragColorbgra8[x*4+0] = d[0];
4029 buffer_FragColorbgra8[x*4+1] = d[1];
4030 buffer_FragColorbgra8[x*4+2] = d[2];
4031 buffer_FragColorbgra8[x*4+3] = d[3];
4034 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction shader mode.
// Runs DPSOFTRAST_Array_TransformProject on the POSITION array (POSITION is
// passed as both array arguments) with the matrix stored at the
// ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f.
4040 void DPSOFTRAST_VertexShader_Refraction(void)
4042 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the refraction shader mode.
// Placeholder: no refraction is computed. It begins the span (filling
// buffer_z), zero-fills span->length BGRA8 pixels, and passes them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4045 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4048 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4049 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4050 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all channels zero
4051 memset(buffer_FragColorbgra8, 0, span->length*4);
4052 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water shader mode.
// Runs DPSOFTRAST_Array_TransformProject on the POSITION array (POSITION is
// passed as both array arguments) with the matrix stored at the
// ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f.
4057 void DPSOFTRAST_VertexShader_Water(void)
4059 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the water shader mode.
// Placeholder: no water effect is computed. It begins the span (filling
// buffer_z), zero-fills span->length BGRA8 pixels, and passes them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4063 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4066 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4067 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4068 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all channels zero
4069 memset(buffer_FragColorbgra8, 0, span->length*4);
4070 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the show-depth shader mode.
// Runs DPSOFTRAST_Array_TransformProject on the POSITION array (POSITION is
// passed as both array arguments) with the matrix stored at the
// ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f.
4075 void DPSOFTRAST_VertexShader_ShowDepth(void)
4077 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the show-depth shader mode.
// Placeholder: the depth values in buffer_z are not turned into colors.
// It begins the span, zero-fills span->length BGRA8 pixels, and passes them
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
4080 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4083 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4084 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4085 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all channels zero
4086 memset(buffer_FragColorbgra8, 0, span->length*4);
4087 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-geometry shader mode.
// Runs DPSOFTRAST_Array_TransformProject on the POSITION array (POSITION is
// passed as both array arguments) with the matrix stored at the
// ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f.
4092 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4094 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the deferred-geometry shader mode.
// Placeholder: no geometry data is written. It begins the span (filling
// buffer_z), zero-fills span->length BGRA8 pixels, and passes them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4097 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4100 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4101 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4102 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all channels zero
4103 memset(buffer_FragColorbgra8, 0, span->length*4);
4104 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-light-source shader mode.
// Runs DPSOFTRAST_Array_TransformProject on the POSITION array (POSITION is
// passed as both array arguments) with the matrix stored at the
// ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f.
4109 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4111 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the deferred-light-source shader mode.
// Placeholder: no lighting is computed. It begins the span (filling
// buffer_z), zero-fills span->length BGRA8 pixels, and passes them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4114 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4117 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4118 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4119 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all channels zero
4120 memset(buffer_FragColorbgra8, 0, span->length*4);
4121 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record: the vertex-stage and span (pixel) stage
// entry points, plus the lists of vertex arrays and texture units that the
// mode consumes.
4126 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage entry point, called once per draw before the command is queued
4129 void (*Vertex)(void);
// span-stage entry point, called per span from DPSOFTRAST_Draw_ProcessSpans
4130 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices used by the mode; the draw code tests
// each entry against DPSOFTRAST_ARRAY_TOTAL, so an out-of-range value such
// as ~0 presumably acts as the end-of-list marker
4131 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of texture unit indices used by the mode; entries are tested against
// DPSOFTRAST_MAXTEXTUREUNITS in the same way
4132 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4134 DPSOFTRAST_ShaderModeInfo;
// Table of shader mode descriptors, indexed by shader mode
// (thread->shader_mode on worker threads, dpsoftrast.shader_mode when a draw
// is queued). Each row gives the vertex function, the span function, the
// vertex arrays to interpolate and the texture units to consider for mipmap
// selection; both lists are terminated by ~0.
// NOTE(review): every row starts with the value 2 - presumably the
// lodarrayindex member that the draw code reads; confirm against the struct.
// NOTE(review): the last five rows (Refraction .. DeferredLightSource) give
// no initializer for texunits, so that array is zero-filled rather than
// ~0-terminated like the other rows - confirm this is intended.
4136 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4138 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4139 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4140 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4141 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4142 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4143 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4144 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4145 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4146 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4147 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4148 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4149 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4150 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4151 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4152 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4153 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Processes every span queued on this thread (thread->spans[0..numspans-1])
// and then resets thread->numspans to 0.
// With depth testing enabled and a depth buffer present, each span gets a
// per-pixel mask from the depth comparison, is trimmed to the first and last
// passing pixel, and is handed to the current shader mode's Span function;
// depth writes follow when thread->depthmask is set. Without depth testing
// the whole span is shaded with an all-ones mask.
4156 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4163 // unsigned int *colorpixel;
4164 unsigned int *depthpixel;
4170 DPSOFTRAST_State_Triangle *triangle;
4171 DPSOFTRAST_State_Span *span;
// one byte per pixel of the current span: nonzero means the pixel is drawn
4172 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4173 for (i = 0; i < thread->numspans; i++)
4175 span = &thread->spans[i];
4176 triangle = &thread->triangles[span->triangle];
4177 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope,
// w[2] = origin) at the span start, then convert w and its x slope to
// integer depth units; the polygon offset is folded into the start value
4179 wslope = triangle->w[0];
4180 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4181 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4182 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4183 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// build the pixel mask by comparing the stored depth against the
// interpolated depth d, stepping d by depthslope per pixel
4184 switch(thread->fb_depthfunc)
4187 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4188 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4189 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4190 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4191 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4192 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4193 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4195 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4196 //for (x = 0;x < span->length;x++)
4197 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4198 // if there is no color buffer, skip pixel shader
// trim masked-out pixels from both ends of the span
4200 endx = span->length;
4201 while (startx < endx && !pixelmask[startx])
4203 while (endx > startx && !pixelmask[endx-1])
4206 continue; // no pixels to fill
4207 span->pixelmask = pixelmask;
4208 span->startx = startx;
4210 // run pixel shader if appropriate
4211 // do this before running depthmask code, to allow the pixelshader
4212 // to clear pixelmask values for alpha testing
4213 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4214 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes are enabled: walk the trimmed range with the matching depth
4215 if (thread->depthmask)
4216 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4222 // no depth testing means we're just dealing with color...
4223 // if there is no color buffer, skip pixel shader
4224 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes
4226 memset(pixelmask, 1, span->length);
4227 span->pixelmask = pixelmask;
4229 span->endx = span->length;
4230 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue is now empty
4234 thread->numspans = 0;
// Declares the Draw command with id 22 and its payload fields.
// NOTE(review): presumably DEFCOMMAND defines DPSOFTRAST_Command_Draw and
// DPSOFTRAST_OPCODE_Draw, both of which are used by the draw code - confirm
// against the macro definition.
// Fields set by the draw code: starty/endy (row range of the draw), refcount
// (initialised to the thread count and decremented once per worker), clipped,
// firstvertex/numvertices/numtriangles, arrays (vertex data: screen
// coordinates followed by the post-transform arrays), and element3i /
// element3s (32-bit or 16-bit triangle indices).
4237 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on one worker thread.
// The thread owns the band of framebuffer rows [miny, maxy) derived from its
// index. For each triangle of the command it clips against the near plane
// (producing 3 or 4 screen points), computes the triangle's bounds clamped to
// the band, sets up the interpolation planes for w and for every vertex
// array of the current shader mode, picks a mip level per texture unit, and
// emits spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels into
// thread->spans. The span and triangle queues are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever they fill up and once more at the end.
// Each thread drops one reference on command->refcount; when the count
// reaches zero and the command is no larger than the bare command header,
// command->arrays is released with MM_FREE (presumably the case where the
// vertex data was allocated separately rather than stored inline).
4239 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4242 int cullface = thread->cullface;
4243 int width = dpsoftrast.fb_width;
4244 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4245 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4246 __m128i fbmin, fbmax;
4247 __m128 viewportcenter, viewportscale;
4248 int firstvertex = command->firstvertex;
4249 int numvertices = command->numvertices;
4250 int numtriangles = command->numtriangles;
4251 const int *element3i = command->element3i;
4252 const unsigned short *element3s = command->element3s;
4253 int clipped = command->clipped;
4264 __m128 triangleedge1, triangleedge2, trianglenormal;
4267 DPSOFTRAST_State_Triangle *triangle;
4268 DPSOFTRAST_Texture *texture;
// the draw's row range does not overlap this thread's band: only drop this
// thread's reference on the command
4269 if (command->starty >= maxy || command->endy <= miny)
4271 if (!ATOMIC_DECREMENT(command->refcount))
4273 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4274 MM_FREE(command->arrays);
4278 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp rectangle for packed 16-bit (x, y) pairs: x in [0, width-1],
// y in [miny, maxy-1]
4279 fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4280 fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4281 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4282 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4283 screen[3] = _mm_setzero_ps();
4284 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4285 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen coordinates (4 floats
// each); the post-transform arrays follow immediately after
4287 const float *screencoord4f = command->arrays;
4288 const float *arrays = screencoord4f + numvertices*4;
4290 // generate the 3 edges of this triangle
4291 // generate spans for the triangle - switch based on left split or right split classification of triangle
4294 e[0] = element3i[i*3+0] - firstvertex;
4295 e[1] = element3i[i*3+1] - firstvertex;
4296 e[2] = element3i[i*3+2] - firstvertex;
4300 e[0] = element3s[i*3+0] - firstvertex;
4301 e[1] = element3s[i*3+1] - firstvertex;
4302 e[2] = element3s[i*3+2] - firstvertex;
4311 #define SKIPBACKFACE \
4312 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4313 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4314 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4315 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4316 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4320 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4324 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4329 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4330 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4332 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4333 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4335 #define CLIPPEDVERTEXCOPY(k,p1) \
4336 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4338 #define GENATTRIBCOPY(attrib, p1) \
4339 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4340 #define GENATTRIBLERP(attrib, p1, p2) \
4342 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4343 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4345 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4349 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4350 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4351 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4352 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4353 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4354 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4355 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4361 // calculate distance from nearplane
4362 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4363 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4364 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4365 if (clipdist[0] >= 0.0f)
4367 if (clipdist[1] >= 0.0f)
4369 if (clipdist[2] >= 0.0f)
4372 // triangle is entirely in front of nearplane
4373 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4380 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4388 if (clipdist[2] >= 0.0f)
4390 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4397 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4404 else if (clipdist[1] >= 0.0f)
4406 if (clipdist[2] >= 0.0f)
4408 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4415 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4421 else if (clipdist[2] >= 0.0f)
4423 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4428 else continue; // triangle is entirely behind nearplane
4431 // calculate integer y coords for triangle points
4432 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4433 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4434 screenmin = _mm_min_epi16(screeni, screenir),
4435 screenmax = _mm_max_epi16(screeni, screenir);
4436 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4437 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4438 screenmin = _mm_max_epi16(screenmin, fbmin);
4439 screenmax = _mm_min_epi16(screenmax, fbmax);
4440 // skip offscreen triangles
4441 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4443 starty = _mm_extract_epi16(screenmin, 1);
4444 endy = _mm_extract_epi16(screenmax, 1)+1;
4445 screeny = _mm_srai_epi32(screeni, 16);
4448 triangle = &thread->triangles[thread->numtriangles];
4450 // calculate attribute plans for triangle data...
4451 // okay, this triangle is going to produce spans, we'd better project
4452 // the interpolants now (this is what gives perspective texturing),
4453 // this consists of simply multiplying all arrays by the W coord
4454 // (which is basically 1/Z), which will be undone per-pixel
4455 // (multiplying by Z again) to get the perspective-correct array
4458 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4459 __m128 mipedgescale, mipdensity;
4460 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4461 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4462 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4463 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4464 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4465 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4466 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4467 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4468 attribedge1 = _mm_sub_ss(w0, w1);
4469 attribedge2 = _mm_sub_ss(w2, w1);
4470 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4471 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4472 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4473 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4474 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// w plane of the triangle: [0] = x slope, [1] = y slope, [2] = origin
4475 _mm_store_ss(&triangle->w[0], attribxslope);
4476 _mm_store_ss(&triangle->w[1], attribyslope);
4477 _mm_store_ss(&triangle->w[2], attriborigin);
4478 mipedgescale = _mm_setzero_ps();
// build the same kind of plane (x slope, y slope, origin) for every vertex
// array listed for the current shader mode, with values pre-multiplied by w
4479 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4481 __m128 attrib0, attrib1, attrib2;
4482 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4483 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4485 arrays += numvertices*4;
4486 GENATTRIBS(attrib0, attrib1, attrib2);
4487 attriborigin = _mm_mul_ps(attrib1, w1);
4488 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4489 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4490 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4491 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4492 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4493 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4494 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4495 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// the array selected by lodarrayindex also supplies the per-edge attribute
// change per unit of screen length, used below for mip selection
4496 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4498 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4499 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4500 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4501 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit the shader mode lists; units
// left untouched keep level 0 from the memset
4505 memset(triangle->mip, 0, sizeof(triangle->mip));
4506 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4508 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4509 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4511 texture = thread->texbound[texunit];
4512 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4514 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4515 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4516 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4517 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4518 // this will be multiplied in the texturing routine by the texture resolution
4519 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), i.e. log2 of the square root of the squared density
4522 y = (int)(log((float)y)*0.5f/M_LN2);
4523 if (y > texture->mipmaps - 1)
4524 y = texture->mipmaps - 1;
4525 triangle->mip[texunit] = y;
// walk the rows from starty to endy; ycc marks which points lie above the
// current row and selects the left and right edges to interpolate
4531 for (y = starty; y < endy;)
4533 __m128 xcoords, xslope;
4534 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4535 int yccmask = _mm_movemask_epi8(ycc);
4536 int edge0p, edge0n, edge1p, edge1n;
4543 case 0xFFFF: /*0000*/ y = endy; continue;
4544 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4545 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4546 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4547 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4548 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4549 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4550 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4551 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4552 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4553 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4554 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4555 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4556 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4557 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4558 case 0x0000: /*1111*/ y++; continue;
4566 case 0xFFFF: /*000*/ y = endy; continue;
4567 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4568 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4569 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4570 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4571 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4572 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4573 case 0x0000: /*111*/ y++; continue;
4576 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4577 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4578 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4579 nexty = _mm_extract_epi16(ycc, 0);
4580 if(nexty >= endy) nexty = endy-1;
4581 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4590 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4591 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4592 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4593 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4594 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// emit spans for rows y..nexty, stepping both edge x coordinates per row
4595 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4597 int startx, endx, offset;
4598 startx = _mm_cvtss_si32(xcoords);
4599 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4600 if (startx < 0) startx = 0;
4601 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4602 if (startx >= endx) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4603 for (offset = startx; offset < endx;)
4605 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4606 span->triangle = thread->numtriangles;
4609 span->length = endx - offset;
4610 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4611 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4612 offset += span->length;
// span queue full: shade what is queued so far
4613 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4614 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that reference it, then restart at 0
4619 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4621 DPSOFTRAST_Draw_ProcessSpans(thread);
4622 thread->numtriangles = 0;
// this thread is finished with the command's vertex data: drop its reference
4626 if (!ATOMIC_DECREMENT(command->refcount))
4628 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4629 MM_FREE(command->arrays);
// shade whatever is still queued before returning to the command loop
4632 if (thread->numspans > 0 || thread->numtriangles > 0)
4634 DPSOFTRAST_Draw_ProcessSpans(thread);
4635 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data and a
// private copy of the element indices.
// The data area holds, in order: numvertices screen coordinates, the
// post-transform POSITION array, one array per entry listed for the current
// shader mode (4 floats per vertex each), then the index copy. It also points
// dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] into that storage
// so the vertex shader can write its results there.
// If command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, the data
// is allocated separately with MM_CALLOC and only the bare command goes into
// the command pool; otherwise the data is placed directly after the command.
// NOTE(review): no NULL check on the MM_CALLOC result is visible here -
// confirm how allocation failure is meant to be handled.
4640 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4644 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates + post-transform position, 4 floats per vertex each
4645 int datasize = 2*numvertices*sizeof(float[4]);
4646 DPSOFTRAST_Command_Draw *command;
4647 unsigned char *data;
// one more 4-float array per vertex array used by the current shader mode
4648 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4650 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4651 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4653 datasize += numvertices*sizeof(float[4]);
// room for the index copy (32-bit or 16-bit indices, presumably depending
// on which of element3i / element3s was supplied)
4656 datasize += numtriangles*sizeof(int[3]);
4658 datasize += numtriangles*sizeof(unsigned short[3]);
4659 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4660 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4662 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4663 data = (unsigned char *)MM_CALLOC(datasize, 1);
4667 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4668 data = (unsigned char *)command + commandsize;
4670 command->firstvertex = firstvertex;
4671 command->numvertices = numvertices;
4672 command->numtriangles = numtriangles;
4673 command->arrays = (float *)data;
// carve the data area into the output arrays of the vertex stage
4674 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4675 dpsoftrast.firstvertex = firstvertex;
4676 dpsoftrast.numvertices = numvertices;
4677 dpsoftrast.screencoord4f = (float *)data;
4678 data += numvertices*sizeof(float[4]);
4679 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4680 data += numvertices*sizeof(float[4]);
4681 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4683 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4684 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4686 dpsoftrast.post_array4f[j] = (float *)data;
4687 data += numvertices*sizeof(float[4]);
// copy the caller's indices into the command so it no longer depends on the
// caller's buffers
4689 command->element3i = NULL;
4690 command->element3s = NULL;
4693 command->element3i = (int *)data;
4694 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4698 command->element3s = (unsigned short *)data;
4699 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
// Queues a triangle draw for the worker threads.
// firstvertex/numvertices give the vertex range, numtriangles the triangle
// count, and element3i / element3s the 32-bit or 16-bit index list.
// Allocates the Draw command, runs the current shader mode's vertex function
// right away on the calling thread, and clamps the draw's row range to the
// framebuffer height. If the range is empty the command is undone (separately
// allocated vertex data is freed first). Otherwise refcount is set to the
// number of threads and starving threads whose band of rows overlaps the
// draw are signalled.
4704 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4706 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4707 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4708 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4709 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: nothing can be drawn, so cancel the command
4710 if (command->starty >= command->endy)
4712 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4713 MM_FREE(command->arrays);
4714 DPSOFTRAST_UndoCommand(command->commandsize);
4717 command->clipped = dpsoftrast.drawclipped;
// one reference per worker thread; each drops its own when done
4718 command->refcount = dpsoftrast.numthreads;
4721 DPSOFTRAST_Draw_SyncCommands();
// wake starving threads whose band of rows intersects [starty, endy)
4725 for (i = 0; i < dpsoftrast.numthreads; i++)
4727 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4729 nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4730 if (command->starty < nexty && command->endy > y && thread->starving)
4731 SDL_CondSignal(thread->drawcond);
4735 DPSOFTRAST_Draw_FlushThreads();
// Executes the queued commands for one thread, starting at
// thread->commandoffset and stopping when the offset reaches endoffset.
// Each command is dispatched on its opcode to DPSOFTRAST_Interpret_<name>,
// the offset is advanced past it (wrapping to 0 at
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL), and the final offset is stored back in
// thread->commandoffset.
4739 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4741 int commandoffset = thread->commandoffset;
4742 while (commandoffset != endoffset)
4744 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4745 switch (command->opcode)
4747 #define INTERPCOMMAND(name) \
4748 case DPSOFTRAST_OPCODE_##name : \
4749 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4750 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4751 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4752 commandoffset = 0; \
4754 INTERPCOMMAND(Viewport)
4755 INTERPCOMMAND(ClearColor)
4756 INTERPCOMMAND(ClearDepth)
4757 INTERPCOMMAND(ColorMask)
4758 INTERPCOMMAND(DepthTest)
4759 INTERPCOMMAND(ScissorTest)
4760 INTERPCOMMAND(Scissor)
4761 INTERPCOMMAND(BlendFunc)
4762 INTERPCOMMAND(BlendSubtract)
4763 INTERPCOMMAND(DepthMask)
4764 INTERPCOMMAND(DepthFunc)
4765 INTERPCOMMAND(DepthRange)
4766 INTERPCOMMAND(PolygonOffset)
4767 INTERPCOMMAND(CullFace)
4768 INTERPCOMMAND(AlphaTest)
4769 INTERPCOMMAND(AlphaFunc)
4770 INTERPCOMMAND(SetTexture)
4771 INTERPCOMMAND(SetShader)
4772 INTERPCOMMAND(Uniform4f)
4773 INTERPCOMMAND(UniformMatrix4f)
4774 INTERPCOMMAND(Uniform1i)
// Draw commands may carry inline vertex data, so they advance by the size
// recorded in the command rather than by sizeof the struct; the offset is
// also published to the thread right after each draw
4776 case DPSOFTRAST_OPCODE_Draw:
4777 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4778 commandoffset += command->commandsize;
4779 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4781 thread->commandoffset = commandoffset;
4784 case DPSOFTRAST_OPCODE_Reset:
4789 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to SDL_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread belonging to this worker
// Runs for as long as thread->index is non-negative. While the thread's command
// offset differs from dpsoftrast.drawcommand it interprets the pending commands;
// the locked section below then re-checks under drawmutex and sleeps on drawcond.
// NOTE(review): braces, the else keyword and the return statement are not
// visible in this view; the lock/wait section is presumably the else branch of
// the first test -- TODO confirm.
4793 static int DPSOFTRAST_Draw_Thread(void *data)
4795 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4796 while(thread->index >= 0)
4798 if (thread->commandoffset != dpsoftrast.drawcommand)
4800 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4804 SDL_LockMutex(thread->drawmutex);
// Re-check under the mutex before sleeping; index is tested again so a thread
// whose index has gone negative does not go to sleep.
4805 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Caught up: signal waitcond if the waiting flag is set, then mark this thread
// as starving for the duration of the wait on drawcond.
4807 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4808 thread->starving = true;
4809 SDL_CondWait(thread->drawcond, thread->drawmutex);
4810 thread->starving = false;
4812 SDL_UnlockMutex(thread->drawmutex);
// Brings every thread's command offset up to dpsoftrast.drawcommand and then
// marks the command pool as empty (usedcommands = 0).
// NOTE(review): braces, local declarations and preprocessor guards are not
// visible in this view; the two SDL-based loops and the direct
// DPSOFTRAST_Draw_InterpretCommands call near the end are presumably
// alternative build paths (threaded vs. non-threaded) -- TODO confirm.
4819 static void DPSOFTRAST_Draw_FlushThreads(void)
4821 DPSOFTRAST_State_Thread *thread;
4823 DPSOFTRAST_Draw_SyncCommands();
// Pass 1: signal drawcond for every thread that is behind and flagged as
// starving. The condition is tested once without the lock and again while
// holding the thread's drawmutex.
4825 for (i = 0; i < dpsoftrast.numthreads; i++)
4827 thread = &dpsoftrast.threads[i];
4828 if (thread->commandoffset != dpsoftrast.drawcommand)
4830 SDL_LockMutex(thread->drawmutex);
4831 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4832 SDL_CondSignal(thread->drawcond);
4833 SDL_UnlockMutex(thread->drawmutex);
// Pass 2: for every thread that is still behind, set its waiting flag and block
// on waitcond (signalled by DPSOFTRAST_Draw_Thread when it sees the flag).
4837 for (i = 0; i < dpsoftrast.numthreads; i++)
4839 thread = &dpsoftrast.threads[i];
4841 if (thread->commandoffset != dpsoftrast.drawcommand)
4843 SDL_LockMutex(thread->drawmutex);
4844 if (thread->commandoffset != dpsoftrast.drawcommand)
4846 thread->waiting = true;
4847 SDL_CondWait(thread->waitcond, thread->drawmutex);
4848 thread->waiting = false;
4850 SDL_UnlockMutex(thread->drawmutex);
// Interprets the pending commands directly on the calling thread.
4853 if (thread->commandoffset != dpsoftrast.drawcommand)
4854 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4857 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
4860 void DPSOFTRAST_Flush(void)
4862 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point.
// NOTE(review): the body of this function is not visible in this view;
// document its behavior after checking the full file.
4865 void DPSOFTRAST_Finish(void)
4870 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4880 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4881 dpsoftrast.bigendian = u.b[3];
4882 dpsoftrast.fb_width = width;
4883 dpsoftrast.fb_height = height;
4884 dpsoftrast.fb_depthpixels = depthpixels;
4885 dpsoftrast.fb_colorpixels[0] = colorpixels;
4886 dpsoftrast.fb_colorpixels[1] = NULL;
4887 dpsoftrast.fb_colorpixels[1] = NULL;
4888 dpsoftrast.fb_colorpixels[1] = NULL;
4889 dpsoftrast.viewport[0] = 0;
4890 dpsoftrast.viewport[1] = 0;
4891 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4892 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4893 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4894 dpsoftrast.texture_firstfree = 1;
4895 dpsoftrast.texture_end = 1;
4896 dpsoftrast.texture_max = 0;
4897 dpsoftrast.color[0] = 1;
4898 dpsoftrast.color[1] = 1;
4899 dpsoftrast.color[2] = 1;
4900 dpsoftrast.color[3] = 1;
4902 dpsoftrast.numthreads = bound(1, numthreads, 64);
4904 dpsoftrast.numthreads = 1;
4906 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4907 for (i = 0; i < dpsoftrast.numthreads; i++)
4909 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4911 thread->cullface = GL_BACK;
4912 thread->colormask[1] = 1;
4913 thread->colormask[2] = 1;
4914 thread->colormask[3] = 1;
4915 thread->blendfunc[0] = GL_ONE;
4916 thread->blendfunc[1] = GL_ZERO;
4917 thread->depthmask = true;
4918 thread->depthtest = true;
4919 thread->depthfunc = GL_LEQUAL;
4920 thread->scissortest = false;
4921 thread->alphatest = false;
4922 thread->alphafunc = GL_GREATER;
4923 thread->alphavalue = 0.5f;
4924 thread->viewport[0] = 0;
4925 thread->viewport[1] = 0;
4926 thread->viewport[2] = dpsoftrast.fb_width;
4927 thread->viewport[3] = dpsoftrast.fb_height;
4928 thread->scissor[0] = 0;
4929 thread->scissor[1] = 0;
4930 thread->scissor[2] = dpsoftrast.fb_width;
4931 thread->scissor[3] = dpsoftrast.fb_height;
4932 thread->depthrange[0] = 0;
4933 thread->depthrange[1] = 1;
4934 thread->polygonoffset[0] = 0;
4935 thread->polygonoffset[1] = 0;
4937 thread->numspans = 0;
4938 thread->numtriangles = 0;
4939 thread->commandoffset = 0;
4940 thread->waiting = false;
4941 thread->starving = false;
4943 thread->waitcond = SDL_CreateCond();
4944 thread->drawcond = SDL_CreateCond();
4945 thread->drawmutex = SDL_CreateMutex();
4948 thread->validate = -1;
4949 DPSOFTRAST_Validate(thread, -1);
4951 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4956 void DPSOFTRAST_Shutdown(void)
4960 if(dpsoftrast.numthreads > 0)
4962 DPSOFTRAST_State_Thread *thread;
4963 for (i = 0; i < dpsoftrast.numthreads; i++)
4965 thread = &dpsoftrast.threads[i];
4966 SDL_LockMutex(thread->drawmutex);
4968 SDL_CondSignal(thread->drawcond);
4969 SDL_UnlockMutex(thread->drawmutex);
4970 SDL_WaitThread(thread->thread, NULL);
4971 SDL_DestroyCond(thread->waitcond);
4972 SDL_DestroyCond(thread->drawcond);
4973 SDL_DestroyMutex(thread->drawmutex);
4977 for (i = 0;i < dpsoftrast.texture_end;i++)
4978 if (dpsoftrast.texture[i].bytes)
4979 MM_FREE(dpsoftrast.texture[i].bytes);
4980 if (dpsoftrast.texture)
4981 free(dpsoftrast.texture);
4982 if (dpsoftrast.threads)
4983 MM_FREE(dpsoftrast.threads);
4984 memset(&dpsoftrast, 0, sizeof(dpsoftrast));