3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
14 #include <SDL_thread.h>
// Lets the rest of this file spell the engine's qboolean type as `bool`.
18 typedef qboolean bool;
// Byte alignment handed to _mm_malloc() by MM_MALLOC / MM_CALLOC.
22 #define ATOMIC_SIZE 32
// Three alternative sets of ALIGN / ATOMIC / MEMORY_BARRIER / ATOMIC_COUNTER /
// ATOMIC_ADD follow. The preprocessor conditionals that open and close the
// sets are only partly visible in this excerpt.
// Set 1: GCC-style __attribute__ alignment (16 bytes for ALIGN, 32 for ATOMIC)
// and the __sync_add_and_fetch builtin.
26 #define ALIGN(var) var __attribute__((__aligned__(16)))
27 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// NOTE(review): _mm_sfence() is a store fence; the commented-out
// __sync_synchronize() alternative below is a full barrier - confirm which
// ordering the callers actually need.
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_ADD(counter, val) (__sync_add_and_fetch(&(counter), (val)))
// Set 2: MSVC - __declspec(align) and InterlockedAdd on a volatile LONG.
34 #elif defined(_MSC_VER)
35 #define ALIGN(var) __declspec(align(16)) var
36 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_ADD(counter, val) (InterlockedAdd(&(counter), (val)))
// Set 3: fallback - no alignment request, no barrier, and a plain (non-atomic)
// compound add on an ordinary int.
50 #define ALIGN(var) var
51 #define ATOMIC(var) var
55 #define MEMORY_BARRIER ((void)0)
56 #define ATOMIC_COUNTER int
57 #define ATOMIC_ADD(counter, val) ((counter) += (val))
61 #include <emmintrin.h>
// Allocates `size` bytes on an ATOMIC_SIZE-byte boundary through _mm_malloc().
63 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
65 static void *MM_CALLOC(size_t nmemb, size_t size)
67 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
68 if(ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained through the _mm_malloc()-based allocators.
72 #define MM_FREE _mm_free
// Alternate definitions that map straight onto the C library allocator
// (the conditional selecting them is not visible in this excerpt).
74 #define MM_MALLOC(size) malloc(size)
75 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the count and is used
// to size attribute tables elsewhere in this file (for example
// DPSOFTRAST_State_Triangle.attribs).
79 typedef enum DPSOFTRAST_ARRAY_e
81 DPSOFTRAST_ARRAY_POSITION,
82 DPSOFTRAST_ARRAY_COLOR,
83 DPSOFTRAST_ARRAY_TEXCOORD0,
84 DPSOFTRAST_ARRAY_TEXCOORD1,
85 DPSOFTRAST_ARRAY_TEXCOORD2,
86 DPSOFTRAST_ARRAY_TEXCOORD3,
87 DPSOFTRAST_ARRAY_TEXCOORD4,
88 DPSOFTRAST_ARRAY_TEXCOORD5,
89 DPSOFTRAST_ARRAY_TEXCOORD6,
90 DPSOFTRAST_ARRAY_TEXCOORD7,
91 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (dpsoftrast.texture). Only some of the
// fields are visible in this excerpt.
95 typedef struct DPSOFTRAST_Texture_s
// filter mode set by DPSOFTRAST_Texture_Filter
102 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably the number of outstanding binds of this texture - confirm.
105 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks the slot as unused
106 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
107 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are padded to multiples of COMMAND_SIZE bytes (see
// DPSOFTRAST_ALLOCATECOMMAND) and declared with COMMAND_ALIGN alignment.
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header type; its members are not visible in this excerpt.
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// Opcode 0 is the Reset marker that DPSOFTRAST_AllocateCommand writes when
// the remaining tail of the command buffer is too small for a request.
120 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the record type DPSOFTRAST_Command_<name>.
122 #define DEFCOMMAND(opcodeval, name, fields) \
123 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
124 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
128 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer.
130 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// Command pool: a byte buffer from which DPSOFTRAST_AllocateCommand carves
// command records. The freecommand / usedcommands bookkeeping members used
// elsewhere in this file are not visible in this excerpt.
132 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
136 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
138 DPSOFTRAST_State_Command_Pool);
// One entry of the triangle pool. Only some members are visible in this excerpt.
140 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
143 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// 16-byte aligned so the rows can be read with _mm_load_ps
// (see DPSOFTRAST_CALCATTRIB for attribs)
148 ALIGN(float coords[4][4]);
149 ALIGN(int ycoords[4]);
// per attribute array: [0] = slope along x, [1] = slope along y, [2] = base
// value, as consumed by DPSOFTRAST_CALCATTRIB / DPSOFTRAST_CALCATTRIB4F
150 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
152 DPSOFTRAST_State_Triangle);
// SSE evaluation of one interpolated attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + (span->x * slope + span->y * attribs[arrayindex][1])
// `slope` and `data` are __m128 lvalues assigned by the macro; the attribute
// rows must be 16-byte aligned because _mm_load_ps is used.
154 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
155 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
156 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
157 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
158 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar evaluation of one interpolated attribute at the start of a span.
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// for the four components c. `slope` and `data` are float[4] lvalues that the
// macro fills in. Every macro argument is parenthesized where it is
// dereferenced or indexed, so expression arguments such as `spans + i` expand
// correctly (the span argument was previously spliced in bare).
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
171 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle; stored per thread in
// DPSOFTRAST_State_Thread.spans.
173 typedef ALIGN(struct DPSOFTRAST_State_Span_s
175 int triangle; // triangle this span was generated by
176 int x; // framebuffer x coord
177 int y; // framebuffer y coord
178 int length; // pixel count
179 int startx; // usable range (according to pixelmask)
180 int endx; // usable range (according to pixelmask)
181 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
183 DPSOFTRAST_State_Span);
// Capacity of each thread's span array.
185 #define DPSOFTRAST_DRAW_MAXSPANS 1024
// Capacity of the shared triangle pool, which is indexed with wraparound
// (see DPSOFTRAST_Draw_SyncCommands).
187 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
188 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
// Triangle pool storage. The freetriangle / usedtriangles bookkeeping members
// used elsewhere in this file are not visible in this excerpt.
190 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
194 ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
196 DPSOFTRAST_State_Triangle_Pool);
// Dirty bits kept in DPSOFTRAST_State_Thread.validate. Each bit names a group
// of derived values that DPSOFTRAST_Validate recomputes on demand.
198 #define DPSOFTRAST_VALIDATE_FB 1
199 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
200 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
201 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the GL blend
// factor pair. DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_TOTAL       = 9
}
DPSOFTRAST_BLENDMODE;
// State owned by one entry of dpsoftrast.threads: the render state written by
// the DPSOFTRAST_Interpret_* handlers, the values derived from it, and the
// thread's span storage. Many members are not visible in this excerpt.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// written by DPSOFTRAST_Interpret_PolygonOffset: [0] alongnormal, [1] intoview
238 float polygonoffset[2];
241 int shader_permutation;
// textures bound per unit; these pointers are rebased by
// DPSOFTRAST_Texture_Grow when the texture table is reallocated
243 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248 // DPSOFTRAST_VALIDATE_ flags
251 // derived values (DPSOFTRAST_VALIDATE_FB)
// {x, y, width, height}, computed by DPSOFTRAST_RecalcFB
253 int fb_clearscissor[4];
255 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
258 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// compared against commandpool.freecommand by DPSOFTRAST_Draw_FreeCommandPool
// to measure how much of the command buffer this thread still has pending
261 ATOMIC(int commandoffset);
270 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
272 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance `dpsoftrast` is defined right
// after the type. Many members are not visible in this excerpt.
274 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets, set by DPSOFTRAST_SetRenderTargets (one depth buffer and
// up to four color buffers)
278 unsigned int *fb_depthpixels;
279 unsigned int *fb_colorpixels[4];
// viewport transform terms, computed by DPSOFTRAST_Viewport
282 ALIGN(float fb_viewportcenter[4]);
283 ALIGN(float fb_viewportscale[4]);
286 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
287 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex array pointers
291 const float *pointer_vertex3f;
292 const float *pointer_color4f;
293 const unsigned char *pointer_color4ub;
294 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
297 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
298 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// rebased by DPSOFTRAST_Texture_Grow when the texture table moves
299 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
303 float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
304 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305 float *screencoord4f;
308 int shader_permutation;
// texture table: texture_firstfree is where DPSOFTRAST_Texture_New starts
// searching for an unused slot
312 int texture_firstfree;
313 DPSOFTRAST_Texture *texture;
// most recent error message; points at string literals
318 const char *errorstring;
321 DPSOFTRAST_State_Thread *threads;
// mutex and condition variable used by the pool-reclaim functions
// (DPSOFTRAST_Draw_FreeTrianglePool / DPSOFTRAST_Draw_FreeCommandPool)
323 SDL_mutex *trianglemutex;
324 SDL_cond *trianglecond;
// set by DPSOFTRAST_Draw_SyncCommands to the pool's freetriangle index
327 ATOMIC(int drawtriangle);
329 DPSOFTRAST_State_Command_Pool commandpool;
330 DPSOFTRAST_State_Triangle_Pool trianglepool;
334 DPSOFTRAST_State dpsoftrast;
336 extern int dpsoftrast_test;
// Depth values are stored as integers: (1 - depth) scaled by DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit pixel laid out as
// A<<24 | R<<16 | G<<8 | B, rounding each channel to the nearest of 0..255.
// Arguments are parenthesized so expressions such as `a + b` convert as a
// whole, and the shifts run on unsigned values because shifting 255 left by 24
// in a signed int is undefined. The result is still an int with the same bit
// pattern as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the stored integer form; depth 1 maps to 0.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Recomputes thread->fb_clearscissor as {x, y, width, height}.
// The rectangle is the thread's scissor box with its y range flipped against
// dpsoftrast.fb_height, or the whole framebuffer when scissortest is off.
// NOTE(review): only the upper-bound clamps are visible in this excerpt -
// confirm that x1 / y1 are clamped to 0 as well.
344 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
346 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
347 // and viewport projection values
350 x1 = thread->scissor[0];
351 x2 = thread->scissor[0] + thread->scissor[2];
352 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
353 y2 = dpsoftrast.fb_height - thread->scissor[1];
354 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
356 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
358 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// elements 2 and 3 hold the extent (width / height), not the far edge
359 thread->fb_clearscissor[0] = x1;
360 thread->fb_clearscissor[1] = y1;
361 thread->fb_clearscissor[2] = x2 - x1;
362 thread->fb_clearscissor[3] = y2 - y1;
365 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
367 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
370 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
372 if (thread->blendsubtract)
374 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
376 #define BLENDFUNC(sfactor, dfactor, blendmode) \
377 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
378 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
379 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
384 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
386 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
387 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
388 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
389 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
390 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
391 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
392 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
393 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
394 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
395 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
396 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested dirty bits
// is set, skipping the function call otherwise. Always evaluates to 0.
// Both arguments are parenthesized so a pointer expression (for example
// `threads + i`) can be passed as `thread`.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
403 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
405 mask &= thread->validate;
408 if (mask & DPSOFTRAST_VALIDATE_FB)
410 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
411 DPSOFTRAST_RecalcFB(thread);
413 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
415 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
416 DPSOFTRAST_RecalcDepthFunc(thread);
418 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
420 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
421 DPSOFTRAST_RecalcBlendFunc(thread);
425 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
427 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
428 return &dpsoftrast.texture[index];
// Enlarges the texture table (to 1024 entries at first; the doubling line
// below is presumably the else branch - the `else` itself is not visible) and
// rebases every bound-texture pointer, in the global state and in every
// thread, onto the reallocated table.
// NOTE(review): the realloc result overwrites dpsoftrast.texture directly, so
// a failed realloc loses the old table and the rebasing below then works from
// a NULL base. Entries beyond the old size are not visibly zeroed here.
// Some statements of this function are not visible in this excerpt.
432 static void DPSOFTRAST_Texture_Grow(void)
434 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
435 DPSOFTRAST_State_Thread *thread;
439 // expand texture array as needed
440 if (dpsoftrast.texture_max < 1024)
441 dpsoftrast.texture_max = 1024;
443 dpsoftrast.texture_max *= 2;
444 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// keep each binding pointing at the same slot index in the new table
445 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
446 if(dpsoftrast.texbound[i])
447 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
448 for (j = 0; j < dpsoftrast.numthreads; j++)
450 thread = &dpsoftrast.threads[j];
451 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
452 if(thread->texbound[i])
453 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Validates the requested texture parameters, claims the first unused slot of
// the texture table (growing the table when needed), fills in the per-level
// mipmap table and allocates zeroed pixel storage for all levels.
// Failures are reported through dpsoftrast.errorstring; the return statements
// are not visible in this excerpt, and neither are several other lines.
// NOTE(review): width*height*depth < 1 does not reject two negative dimensions
// and the product can overflow int - confirm against the callers.
// NOTE(review): the MM_CALLOC result is not visibly checked.
457 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
466 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
467 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
468 DPSOFTRAST_Texture *texture;
469 if (width*height*depth < 1)
471 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
474 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
476 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions (the switch header is not visible)
481 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
482 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
483 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
485 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
486 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
488 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as above; the condition guarding this second
// assignment is not visible in this excerpt.
493 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
496 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
498 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
503 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
505 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
508 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
510 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this check repeats the previous one verbatim.
513 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
515 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
518 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when more than one bit of x is set
523 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
528 // find first empty slot in texture array
529 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
530 if (!dpsoftrast.texture[texnum].bytes)
532 dpsoftrast.texture_firstfree = texnum + 1;
533 if (dpsoftrast.texture_max <= texnum)
534 DPSOFTRAST_Texture_Grow();
535 if (dpsoftrast.texture_end <= texnum)
536 dpsoftrast.texture_end = texnum + 1;
537 texture = &dpsoftrast.texture[texnum];
538 memset(texture, 0, sizeof(*texture));
539 texture->flags = flags;
540 texture->width = width;
541 texture->height = height;
542 texture->depth = depth;
543 texture->sides = sides;
// per level: [0] running byte offset, [1] byte size (4 bytes per texel times
// the number of sides), [2..4] level dimensions
555 s = w * h * d * sides * 4;
556 texture->mipmap[mipmaps][0] = size;
557 texture->mipmap[mipmaps][1] = s;
558 texture->mipmap[mipmaps][2] = w;
559 texture->mipmap[mipmaps][3] = h;
560 texture->mipmap[mipmaps][4] = d;
// a 1x1x1 level, or a texture created without the mipmap flag, satisfies this
// test (the statement it guards is not visible)
563 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569 texture->mipmaps = mipmaps;
570 texture->size = size;
572 // allocate the pixels now
573 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index`, clears its slot and updates
// the table's free / used markers. Does nothing when index does not name a
// live texture. Some statements are not visible in this excerpt.
577 void DPSOFTRAST_Texture_Free(int index)
579 DPSOFTRAST_Texture *texture;
580 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
584 MM_FREE(texture->bytes);
// the memset below zeroes the whole slot, bytes included
585 texture->bytes = NULL;
586 memset(texture, 0, sizeof(*texture));
587 // adjust the free range and used range
588 if (dpsoftrast.texture_firstfree > index)
589 dpsoftrast.texture_firstfree = index;
// trim trailing unused slots off the used range
590 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
591 dpsoftrast.texture_end--;
// Rebuilds every mip level above level 0 by averaging texels of the level
// below it, 4 bytes per texel. Each output row reads two parent rows (row0 /
// row1) and, in the 3D paths, two parent layers (layer0 / layer1); row1 and
// layer1 are clamped to the parent's last row / layer.
// Several lines (including the computation of layer0, row0 and the branch
// choosing between the 3D and 2D paths) are not visible in this excerpt.
593 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
595 int i, x, y, z, w, layer0, layer1, row0, row1;
596 unsigned char *o, *i0, *i1, *i2, *i3;
597 DPSOFTRAST_Texture *texture;
598 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
599 if (texture->mipmaps <= 1)
601 for (i = 1;i < texture->mipmaps;i++)
603 for (z = 0;z < texture->mipmap[i][4];z++)
607 if (layer1 >= texture->mipmap[i-1][4])
608 layer1 = texture->mipmap[i-1][4]-1;
609 for (y = 0;y < texture->mipmap[i][3];y++)
613 if (row1 >= texture->mipmap[i-1][3])
614 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four parent rows in level i-1
// (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
615 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
616 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
617 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
618 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
619 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
620 w = texture->mipmap[i][2];
623 if (texture->mipmap[i-1][2] > 1)
625 // average 3D texture
// eight parent texels per output texel, rounded: (sum + 4) >> 3
626 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
628 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
629 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
630 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
631 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
636 // average 3D mipmap with parent width == 1
// NOTE(review): the loop header advances only i0 / i1 although i2 / i3 are
// read as well; harmless if this path only ever runs with w == 1 - confirm.
637 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
639 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
640 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
641 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
642 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648 if (texture->mipmap[i-1][2] > 1)
650 // average 2D texture (common case)
// four parent texels per output texel, rounded: (sum + 2) >> 2
651 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
653 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
654 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
655 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
656 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
661 // 2D texture with parent width == 1
662 o[0] = (i0[0] + i1[0] + 1) >> 1;
663 o[1] = (i0[1] + i1[1] + 1) >> 1;
664 o[2] = (i0[2] + i1[2] + 1) >> 1;
665 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mip chain. Does nothing when index does not name a live texture.
// NOTE(review): the visible lines always address level 0 (mipmap[0] width and
// the start of bytes); the `mip` parameter is not used in them - confirm.
// NOTE(review): the block rectangle is not visibly checked against the
// texture dimensions.
// Some statements (including the loop counter update) are not visible here.
672 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
674 DPSOFTRAST_Texture *texture;
676 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
679 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
680 while (blockheight > 0)
682 memcpy(dst, pixels, blockwidth * 4);
683 pixels += blockwidth * 4;
// advance one full texture row in the destination
684 dst += texture->mipmap[0][2] * 4;
687 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of level 0 with `pixels` (mipmap[0][1] bytes are copied) and
// rebuilds the mip chain. Does nothing when index does not name a live
// texture. Some statements are not visible in this excerpt.
689 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
691 DPSOFTRAST_Texture *texture;
692 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
695 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
696 DPSOFTRAST_Texture_CalculateMipmaps(index);
698 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
700 DPSOFTRAST_Texture *texture;
701 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
702 return texture->mipmap[mip][2];
704 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
706 DPSOFTRAST_Texture *texture;
707 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
708 return texture->mipmap[mip][3];
710 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
712 DPSOFTRAST_Texture *texture;
713 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
714 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 when index does not name a live texture.
// NOTE(review): mip is not visibly range-checked before indexing mipmap[].
// Some statements are not visible in this excerpt.
716 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
718 DPSOFTRAST_Texture *texture;
719 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. A filter beyond
// DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created without the mipmap
// flag is reported through dpsoftrast.errorstring (the statement following
// that assignment is not visible in this excerpt). Does nothing when index
// does not name a live texture.
724 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
726 DPSOFTRAST_Texture *texture;
727 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
728 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
730 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
735 texture->filter = filter;
// Records the framebuffer dimensions, the depth buffer and up to four color
// buffers in the global state. The condition below detects any difference
// from the current targets; the statement it guards is not visible in this
// excerpt.
738 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
740 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
741 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
742 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
744 dpsoftrast.fb_width = width;
745 dpsoftrast.fb_height = height;
746 dpsoftrast.fb_depthpixels = depthpixels;
747 dpsoftrast.fb_colorpixels[0] = colorpixels0;
748 dpsoftrast.fb_colorpixels[1] = colorpixels1;
749 dpsoftrast.fb_colorpixels[2] = colorpixels2;
750 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// forward declaration; the definition is not visible in this excerpt
753 void DPSOFTRAST_Draw_FlushThreads(void);
// Tries to bring the triangle pool's used count down so that at least `space`
// entries are free. Under trianglemutex it measures, for every thread, the
// wrapped distance from that thread's triangleoffset to the pool's
// freetriangle index, and waits on a thread's waitcond while too much of the
// pool is still in use. The resulting count is stored back into
// trianglepool.usedtriangles.
// The loop structure, the choice of waitindex and the early exits are not
// visible in this excerpt.
755 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
757 DPSOFTRAST_State_Thread *thread;
759 int freetriangle = dpsoftrast.trianglepool.freetriangle;
760 int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
// test for "enough room already" (the guarded statement is not visible)
761 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
764 SDL_LockMutex(dpsoftrast.trianglemutex);
771 for (i = 0; i < dpsoftrast.numthreads; i++)
773 thread = &dpsoftrast.threads[i];
// how many pool entries this thread has not consumed yet, with wraparound
774 triangleoffset = freetriangle - thread->triangleoffset;
775 if (triangleoffset < 0)
776 triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
777 if (triangleoffset > usedtriangles)
780 usedtriangles = triangleoffset;
783 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
// mark the chosen thread as waited-on, wake the threads blocked on
// trianglecond, then block on that thread's own condition variable
786 thread = &dpsoftrast.threads[waitindex];
787 thread->waiting = true;
788 SDL_CondBroadcast(dpsoftrast.trianglecond);
789 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
790 thread->waiting = false;
794 SDL_UnlockMutex(dpsoftrast.trianglemutex);
796 dpsoftrast.trianglepool.usedtriangles = usedtriangles;
// Appends a marker entry to the triangle pool that records the current end of
// the command buffer (commandoffset = commandpool.freecommand, starty = -1),
// advances freetriangle with wraparound and sets dpsoftrast.drawtriangle to
// the new index.
// When the pool is nearly full, DPSOFTRAST_Draw_FreeTrianglePool and
// DPSOFTRAST_Draw_FlushThreads appear first; the lines that decide between or
// around them are not visible in this excerpt.
799 void DPSOFTRAST_Draw_SyncCommands(void)
801 DPSOFTRAST_State_Triangle *triangle;
802 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
804 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
806 DPSOFTRAST_Draw_FlushThreads();
808 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
809 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
810 triangle->starty = -1;
812 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
813 dpsoftrast.trianglepool.usedtriangles++;
815 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
// Command-buffer counterpart of DPSOFTRAST_Draw_FreeTrianglePool: tries to
// bring commandpool.usedcommands down so that at least `space` bytes are
// free. It calls DPSOFTRAST_Draw_SyncCommands, then under trianglemutex
// measures each thread's wrapped distance from its commandoffset to
// freecommand and waits on a thread's waitcond while too much of the buffer
// is still in use.
// The loop structure, the choice of waitindex and the early exits are not
// visible in this excerpt.
818 void DPSOFTRAST_Draw_FreeCommandPool(int space)
820 DPSOFTRAST_State_Thread *thread;
822 int freecommand = dpsoftrast.commandpool.freecommand;
823 int usedcommands = dpsoftrast.commandpool.usedcommands;
// test for "enough room already" (the guarded statement is not visible)
824 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
826 DPSOFTRAST_Draw_SyncCommands();
828 SDL_LockMutex(dpsoftrast.trianglemutex);
835 for (i = 0; i < dpsoftrast.numthreads; i++)
837 thread = &dpsoftrast.threads[i];
// bytes of the command buffer this thread has not consumed yet, with wraparound
838 commandoffset = freecommand - thread->commandoffset;
839 if (commandoffset < 0)
840 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
841 if (commandoffset > usedcommands)
844 usedcommands = commandoffset;
847 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
850 thread = &dpsoftrast.threads[waitindex];
851 thread->waiting = true;
852 SDL_CondBroadcast(dpsoftrast.trianglecond);
853 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
854 thread->waiting = false;
858 SDL_UnlockMutex(dpsoftrast.trianglemutex);
860 dpsoftrast.commandpool.usedcommands = usedcommands;
// Allocates a DPSOFTRAST_Command_<name> record, with its size rounded up to
// the next multiple of COMMAND_SIZE (the mask arithmetic assumes COMMAND_SIZE
// is a power of two), and casts the result to the record type.
863 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
864 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))))
// Reserves `size` contiguous bytes in the command buffer and returns them as
// a command record (the return statement is not visible in this excerpt).
// If the tail of the buffer is too small, a Reset opcode is written at the
// current position and the skipped tail is counted as used.
// One command header of slack (`extra`) is always kept in reserve.
// When the buffer is too full, DPSOFTRAST_Draw_FreeCommandPool and
// DPSOFTRAST_Draw_FlushThreads appear; the lines that decide between or
// around them, and the wraparound of freecommand, are not visible here.
866 static void *DPSOFTRAST_AllocateCommand(int size)
868 DPSOFTRAST_Command *command;
869 int freecommand = dpsoftrast.commandpool.freecommand;
870 int usedcommands = dpsoftrast.commandpool.usedcommands;
871 int extra = sizeof(DPSOFTRAST_Command);
// the request would not fit before the end of the buffer: the tail is skipped
872 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
873 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
874 if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
877 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
879 DPSOFTRAST_Draw_FlushThreads();
// re-read the bookkeeping, which the calls above may have changed
881 freecommand = dpsoftrast.commandpool.freecommand;
882 usedcommands = dpsoftrast.commandpool.usedcommands;
884 if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
886 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887 command->opcode = DPSOFTRAST_OPCODE_Reset;
888 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
891 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
896 dpsoftrast.commandpool.freecommand = freecommand;
897 dpsoftrast.commandpool.usedcommands = usedcommands + size;
901 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
902 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
904 thread->viewport[0] = command->x;
905 thread->viewport[1] = command->y;
906 thread->viewport[2] = command->width;
907 thread->viewport[3] = command->height;
908 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and mirrors the rectangle into the global state,
// from which the viewport transform terms are derived:
//   center[1] / scale[1] : x (center of the rectangle minus half a pixel)
//   center[2] / scale[2] : y, flipped against fb_height (negative scale)
//   center[3] / scale[3] : 0.5 / 0.5
//   center[0] / scale[0] : 0 / 1
// The stores of the command's x and y fields are not visible in this excerpt.
910 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
912 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
913 command->opcode = DPSOFTRAST_OPCODE_Viewport;
916 command->width = width;
917 command->height = height;
919 dpsoftrast.viewport[0] = x;
920 dpsoftrast.viewport[1] = y;
921 dpsoftrast.viewport[2] = width;
922 dpsoftrast.viewport[3] = height;
923 dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
924 dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
925 dpsoftrast.fb_viewportcenter[3] = 0.5f;
926 dpsoftrast.fb_viewportcenter[0] = 0.0f;
927 dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
928 dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
929 dpsoftrast.fb_viewportscale[3] = 0.5f;
930 dpsoftrast.fb_viewportscale[0] = 1.0f;
933 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
934 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
936 int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
939 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
940 x1 = thread->fb_clearscissor[0];
941 y1 = thread->fb_clearscissor[1];
942 x2 = thread->fb_clearscissor[2];
943 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
944 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
945 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
952 // FIXME: honor fb_colormask?
953 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
954 for (i = 0;i < 4;i++)
956 if (!dpsoftrast.fb_colorpixels[i])
958 for (y = y1;y < y2;y++)
960 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
961 for (x = x1;x < x2;x++)
// Queues a ClearColor command. The stores of the color fields are not
// visible in this excerpt.
966 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
968 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
969 command->opcode = DPSOFTRAST_OPCODE_ClearColor;
976 DEFCOMMAND(3, ClearDepth, float depth;)
977 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
979 int x1, y1, x2, y2, w, h, x, y, t1, t2;
982 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983 x1 = thread->fb_clearscissor[0];
984 y1 = thread->fb_clearscissor[1];
985 x2 = thread->fb_clearscissor[2];
986 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
987 t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
988 t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
995 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
996 for (y = y1;y < y2;y++)
998 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
999 for (x = x1;x < x2;x++)
// Queues a ClearDepth command. The store of the depth field is not visible
// in this excerpt.
1003 void DPSOFTRAST_ClearDepth(float d)
1005 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1006 command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
1010 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1011 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1013 thread->colormask[0] = command->r != 0;
1014 thread->colormask[1] = command->g != 0;
1015 thread->colormask[2] = command->b != 0;
1016 thread->colormask[3] = command->a != 0;
1017 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command. The stores of the four channel fields are not
// visible in this excerpt.
1019 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1021 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1022 command->opcode = DPSOFTRAST_OPCODE_ColorMask;
1029 DEFCOMMAND(5, DepthTest, int enable;)
1030 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1032 thread->depthtest = command->enable;
1033 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1035 void DPSOFTRAST_DepthTest(int enable)
1037 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1038 command->opcode = DPSOFTRAST_OPCODE_DepthTest;
1039 command->enable = enable;
1042 DEFCOMMAND(6, ScissorTest, int enable;)
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1045 thread->scissortest = command->enable;
1046 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1048 void DPSOFTRAST_ScissorTest(int enable)
1050 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051 command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1052 command->enable = enable;
1055 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1056 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1058 thread->scissor[0] = command->x;
1059 thread->scissor[1] = command->y;
1060 thread->scissor[2] = command->width;
1061 thread->scissor[3] = command->height;
1062 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command. The stores of the command's x and y fields are
// not visible in this excerpt.
1064 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1066 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1067 command->opcode = DPSOFTRAST_OPCODE_Scissor;
1070 command->width = width;
1071 command->height = height;
1074 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1075 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1077 thread->blendfunc[0] = command->sfactor;
1078 thread->blendfunc[1] = command->dfactor;
1079 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1081 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1083 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1084 command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1085 command->sfactor = sfactor;
1086 command->dfactor = dfactor;
1089 DEFCOMMAND(9, BlendSubtract, int enable;)
1090 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1092 thread->blendsubtract = command->enable;
1093 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1095 void DPSOFTRAST_BlendSubtract(int enable)
1097 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1098 command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1099 command->enable = enable;
1102 DEFCOMMAND(10, DepthMask, int enable;)
1103 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1105 thread->depthmask = command->enable;
1107 void DPSOFTRAST_DepthMask(int enable)
1109 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1110 command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1111 command->enable = enable;
// Command 11 (DepthFunc): carries the depth comparison function code.
1114 DEFCOMMAND(11, DepthFunc, int func;)
// Command handler: stores the comparison function in the thread state.
1115 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1117 thread->depthfunc = command->func;
// Public API: records a DepthFunc command with the given function code.
1119 void DPSOFTRAST_DepthFunc(int func)
1121 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1122 command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1123 command->func = func;
// Command 12 (DepthRange): carries the near and far depth range values.
1126 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Command handler: stores near in depthrange[0] and far in depthrange[1].
1127 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1129 thread->depthrange[0] = command->nearval;
1130 thread->depthrange[1] = command->farval;
// Public API: records a DepthRange command with the given near/far values.
1132 void DPSOFTRAST_DepthRange(float nearval, float farval)
1134 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1135 command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1136 command->nearval = nearval;
1137 command->farval = farval;
// Command 13 (PolygonOffset): carries the two polygon offset terms.
1140 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Command handler: stores alongnormal in polygonoffset[0] and intoview in
// polygonoffset[1].
1141 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1143 thread->polygonoffset[0] = command->alongnormal;
1144 thread->polygonoffset[1] = command->intoview;
// Public API: records a PolygonOffset command with the given offset terms.
1146 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1148 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1149 command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1150 command->alongnormal = alongnormal;
1151 command->intoview = intoview;
// Public API: stores the cull mode directly in the global dpsoftrast state.
// Unlike the neighbouring setters, no command record is allocated here.
1154 void DPSOFTRAST_CullFace(int mode)
1156 dpsoftrast.cullface = mode;
// Command 15 (AlphaTest): carries a single enable flag.
1159 DEFCOMMAND(15, AlphaTest, int enable;)
// Command handler: stores the flag in the thread state.
1160 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1162 thread->alphatest = command->enable;
// Public API: records an AlphaTest command with the given enable flag.
1164 void DPSOFTRAST_AlphaTest(int enable)
1166 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1167 command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1168 command->enable = enable;
// Command 16 (AlphaFunc): carries the alpha comparison function and its
// reference value.
1171 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Command handler: stores the function code and the reference value in the
// thread state (the reference goes into the field named alphavalue).
1172 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1174 thread->alphafunc = command->func;
1175 thread->alphavalue = command->ref;
// Public API: records an AlphaFunc command.
// NOTE(review): only the func store is visible in this excerpt; ref is
// presumably copied on a line not shown here - confirm.
1177 void DPSOFTRAST_AlphaFunc(int func, float ref)
1179 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1180 command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1181 command->func = func;
// Public API: stores the four colour components in dpsoftrast.color in
// r, g, b, a order.  DPSOFTRAST_Draw_LoadVertices passes this array to
// DPSOFTRAST_Fill4f on one of its colour paths.
1185 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1187 dpsoftrast.color[0] = r;
1188 dpsoftrast.color[1] = g;
1189 dpsoftrast.color[2] = b;
1190 dpsoftrast.color[3] = a;
// Public API: reads a rectangle of colour buffer 0 into outpixels.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the requested rectangle
//   outpixels                - destination, 4 bytes per pixel
// NOTE(review): the declarations of bx1/by1/x/y/b/o and both inner loop
// bodies are not visible in this excerpt.
1193 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Row strides in bytes.  The output stride uses the requested (unclipped)
// width.
1195 int outstride = blockwidth * 4;
1196 int instride = dpsoftrast.fb_width * 4;
1199 int bx2 = blockx + blockwidth;
1200 int by2 = blocky + blockheight;
1205 unsigned char *inpixels;
// Clip the rectangle to the framebuffer bounds.
1209 if (bx1 < 0) bx1 = 0;
1210 if (by1 < 0) by1 = 0;
1211 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1212 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1215 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Two copy loops follow; the first is selected on big-endian hosts.
1216 if (dpsoftrast.bigendian)
1218 for (y = by1;y < by2;y++)
// Source rows are addressed from the bottom of the framebuffer
// (fb_height - 1 - y), so the image is flipped vertically on the way out.
1220 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1221 o = (unsigned char *)outpixels + (y - by1) * outstride;
1222 for (x = bx1;x < bx2;x++)
// Second copy loop, with the same row addressing as above.
1235 for (y = by1;y < by2;y++)
1237 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1238 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Public API: copies a rectangle of colour buffer 0 into one mip level of a
// texture.
//   index          - texture handle
//   mip            - destination mip level
//   tx, ty         - destination origin inside the mip level
//   sx, sy         - source origin inside the framebuffer
//   width, height  - requested rectangle size
// NOTE(review): the declarations of tx1/ty1/sx1/sy1 and the computation of
// tw/th/sw/sh are not visible in this excerpt.
1244 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1248 int tx2 = tx + width;
1249 int ty2 = ty + height;
1252 int sx2 = sx + width;
1253 int sy2 = sy + height;
1263 unsigned int *spixels;
1264 unsigned int *tpixels;
1265 DPSOFTRAST_Texture *texture;
// Silently do nothing for an unknown handle or an out-of-range mip level.
1266 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1267 if (mip < 0 || mip >= texture->mipmaps) return;
// Source is the framebuffer.  Destination is the mip level, whose byte offset
// is mipmap[mip][0] and whose width/height are mipmap[mip][2] and [3].
1270 spixels = dpsoftrast.fb_colorpixels[0];
1271 swidth = dpsoftrast.fb_width;
1272 sheight = dpsoftrast.fb_height;
1273 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1274 twidth = texture->mipmap[mip][2];
1275 theight = texture->mipmap[mip][3];
// Clip the destination rectangle to the mip level...
1276 if (tx1 < 0) tx1 = 0;
1277 if (ty1 < 0) ty1 = 0;
1278 if (tx2 > twidth) tx2 = twidth;
1279 if (ty2 > theight) ty2 = theight;
// ...and the source rectangle to the framebuffer.
1280 if (sx1 < 0) sx1 = 0;
1281 if (sy1 < 0) sy1 = 0;
1282 if (sx2 > swidth) sx2 = swidth;
1283 if (sy2 > sheight) sy2 = sheight;
// The copied area is the smaller of the two clipped rectangles.
1288 if (tw > sw) tw = sw;
1289 if (th > sh) th = sh;
1290 if (tw < 1 || th < 1)
// Copy row by row, tw pixels of 4 bytes each.
1292 for (y = 0;y < th;y++)
1293 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// Recalculate the mipmaps when the texture has more than one level.
1294 if (texture->mipmaps > 1)
1295 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): carries a texture unit number and the texture
// pointer to bind to it.
1298 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Command handler: replaces this thread's binding for the unit.  If a texture
// was bound before, its bind counter is decremented by one.
1299 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1301 if (thread->texbound[command->unitnum])
1302 ATOMIC_ADD(thread->texbound[command->unitnum]->binds, -1);
1303 thread->texbound[command->unitnum] = command->texture;
1305 void DPSOFTRAST_SetTexture(int unitnum, int index)
1307 DPSOFTRAST_Command_SetTexture *command;
1308 DPSOFTRAST_Texture *texture;
1309 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1311 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1314 texture = DPSOFTRAST_Texture_GetByIndex(index);
1315 if (index && !texture)
1317 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1321 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1322 command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1323 command->unitnum = unitnum;
1324 command->texture = texture;
1326 dpsoftrast.texbound[unitnum] = texture;
1327 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Public API: remembers the position array pointer and its stride.
// DPSOFTRAST_Draw_LoadVertices applies the stride as a byte offset
// (firstvertex * stride added to a byte pointer).
1330 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1332 dpsoftrast.pointer_vertex3f = vertex3f;
1333 dpsoftrast.stride_vertex = stride;
// Public API: selects a float colour array.  The byte colour pointer is
// cleared so that only one of the two colour sources is set at a time.
1335 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1337 dpsoftrast.pointer_color4f = color4f;
1338 dpsoftrast.pointer_color4ub = NULL;
1339 dpsoftrast.stride_color = stride;
// Public API: selects an unsigned-byte colour array.  The float colour pointer
// is cleared so that only one of the two colour sources is set at a time.
1341 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1343 dpsoftrast.pointer_color4f = NULL;
1344 dpsoftrast.pointer_color4ub = color4ub;
1345 dpsoftrast.stride_color = stride;
// Public API: remembers the texcoord array pointer, component count and stride
// for one texcoord unit.
// NOTE(review): unitnum is used as an array index without a range check,
// unlike DPSOFTRAST_SetTexture - callers must pass a valid unit.
1347 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1349 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1350 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1351 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): carries the shader mode and permutation.
1354 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Command handler: stores mode and permutation in the thread state.
1355 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1357 thread->shader_mode = command->mode;
1358 thread->shader_permutation = command->permutation;
// Public API: records a SetShader command and also mirrors the selection into
// the global dpsoftrast state.
1360 void DPSOFTRAST_SetShader(int mode, int permutation)
1362 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1363 command->opcode = DPSOFTRAST_OPCODE_SetShader;
1364 command->mode = mode;
1365 command->permutation = permutation;
// Global copy of the current selection.
1367 dpsoftrast.shader_mode = mode;
1368 dpsoftrast.shader_permutation = permutation;
// Command 19 (Uniform4f): carries a uniform index and four float values.
1371 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Command handler: copies the four floats into the thread's uniform table.
// Each uniform index owns four consecutive floats, hence index*4.
1372 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1374 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public API: sets a four-component uniform from individual values.  Records a
// Uniform4f command and also mirrors the values into the global uniform table.
1376 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1378 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1379 command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1380 command->index = index;
1381 command->val[0] = v0;
1382 command->val[1] = v1;
1383 command->val[2] = v2;
1384 command->val[3] = v3;
// Global copy; each uniform index owns four consecutive floats.
1386 dpsoftrast.uniform4f[index*4+0] = v0;
1387 dpsoftrast.uniform4f[index*4+1] = v1;
1388 dpsoftrast.uniform4f[index*4+2] = v2;
1389 dpsoftrast.uniform4f[index*4+3] = v3;
// Public API: array form of DPSOFTRAST_Uniform4f.  v must point to at least
// four floats, which are copied into the command and into the global table.
1391 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1393 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1394 command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1395 command->index = index;
1396 memcpy(command->val, v, sizeof(command->val));
// Global copy of the same four floats.
1398 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): carries a uniform index and 16 floats.  The
// value array is declared through ALIGN().
1401 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Command handler: copies all 16 floats into the thread's uniform table
// starting at index*4, so a matrix spans four consecutive uniform slots.
1402 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1404 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public API: sets an array of 4x4 matrix uniforms.
//   uniform    - index of the first matrix
//   arraysize  - number of matrices, each 16 floats in v
//   transpose  - presumably selects the transpose step below; the line that
//                tests it is not visible in this excerpt - confirm
//   v          - source floats; handled whether or not it is aligned
1406 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// Each matrix advances the uniform index by 4 and the source by 16 floats.
1410 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1412 __m128 m0, m1, m2, m3;
1413 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1414 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1415 command->index = index;
// Use unaligned loads when v is not a multiple of ALIGN_SIZE...
1416 if (((size_t)v)&(ALIGN_SIZE-1))
1418 m0 = _mm_loadu_ps(v);
1419 m1 = _mm_loadu_ps(v+4);
1420 m2 = _mm_loadu_ps(v+8);
1421 m3 = _mm_loadu_ps(v+12);
// ...and aligned loads otherwise.
1425 m0 = _mm_load_ps(v);
1426 m1 = _mm_load_ps(v+4);
1427 m2 = _mm_load_ps(v+8);
1428 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of vectors, then recombine the halves.
1432 __m128 t0, t1, t2, t3;
1433 t0 = _mm_unpacklo_ps(m0, m1);
1434 t1 = _mm_unpacklo_ps(m2, m3);
1435 t2 = _mm_unpackhi_ps(m0, m1);
1436 t3 = _mm_unpackhi_ps(m2, m3);
1437 m0 = _mm_movelh_ps(t0, t1);
1438 m1 = _mm_movehl_ps(t1, t0);
1439 m2 = _mm_movelh_ps(t2, t3);
1440 m3 = _mm_movehl_ps(t3, t2);
// Store into the command payload.
1442 _mm_store_ps(command->val, m0);
1443 _mm_store_ps(command->val+4, m1);
1444 _mm_store_ps(command->val+8, m2);
1445 _mm_store_ps(command->val+12, m3);
// Mirror into the global uniform table.  These are aligned stores, so each
// target address must be 16-byte aligned.
1446 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1447 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1448 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1449 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): carries a uniform index and one integer value.
1454 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Command handler: stores the integer in the thread's integer uniform table.
1455 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1457 thread->uniform1i[command->index] = command->val;
// Public API: sets an integer uniform.  Records a Uniform1i command and also
// mirrors the value into the global table.
// NOTE(review): the store of i0 into the command is not visible in this
// excerpt; presumably it sits on a line not shown here - confirm.
1459 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1461 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1462 command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1463 command->index = index;
// Global copy of the value.
1466 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` four-float items from a strided source into a packed
// destination.  dst is written with aligned stores and must be 16-byte
// aligned.
// NOTE(review): the loop headers and pointer increments are not visible in
// this excerpt.
1470 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1472 float *end = dst + size*4;
// If either the source address or the stride is not a multiple of ALIGN_SIZE,
// some item is misaligned, so use unaligned loads.
1473 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1477 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// Otherwise use aligned loads.
1486 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float items (x, y, z) from a strided source into packed
// four-float items (x, y, z, 1.0).  dst is written with aligned stores.
// NOTE(review): the loop headers, dst increments and else lines are not
// visible in this excerpt.
1493 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1495 float *end = dst + size*4;
// Tightly packed input: four items occupy exactly three 16-byte vectors, so
// four items are converted per step.
1496 if (stride == sizeof(float[3]))
// end4 marks the part of dst covered by whole groups of four items.
1498 float *end4 = dst + (size&~3)*4;
1499 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned loads: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3.
1503 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// Item 0: rotate to (?, x, y, z), put 1.0 in lane 0, rotate back to
// (x, y, z, 1).
1504 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1505 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// Item 1: gather (x1, x1, y1, z1) from v1/v2, then the same 1.0 insertion.
1507 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1508 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1509 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// Item 2: gather (x2, y2, z2, z2) from v2/v3, rotate, then insert 1.0.
1510 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1511 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1512 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// Item 3: v3 already holds x3 y3 z3 in lanes 1..3, so only lane 0 needs the
// 1.0.
1514 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517 src += 4*sizeof(float[3]);
// Same four-item conversion using aligned loads.
1524 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1525 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1526 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1529 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1530 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1532 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1533 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1538 src += 4*sizeof(float[3]);
// One item per step, presumably for leftover items and non-packed strides -
// confirm against the missing loop lines.  The load is unaligned when the
// source address or the stride is not a multiple of ALIGN_SIZE.
// NOTE(review): each load reads 16 bytes from a 12-byte item, so the last item
// reads 4 bytes past its own end - confirm that callers' buffers tolerate
// this.
1542 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1546 __m128 v = _mm_loadu_ps((const float *)src);
1547 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1548 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1550 _mm_store_ps(dst, v);
// Aligned variant of the one-item conversion.
1559 __m128 v = _mm_load_ps((const float *)src);
1560 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1561 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1562 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1563 _mm_store_ps(dst, v);
// Expands `size` two-float items (s, t) from a strided source into packed
// four-float items (s, t, 0, 1).  dst is written with aligned stores.
// NOTE(review): the loop headers and dst increments are not visible in this
// excerpt.
1570 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1572 float *end = dst + size*4;
// Supplies the constant upper half (0, 1) of every output item.
1573 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed input: one 16-byte load holds two items.
1574 if (stride == sizeof(float[2]))
// end2 marks the part of dst covered by whole pairs of items.
1576 float *end2 = dst + (size&~1)*4;
1577 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned load of (s0, t0, s1, t1); emits (s0, t0, 0, 1) and (s1, t1, 0, 1).
1581 __m128 v = _mm_loadu_ps((const float *)src);
1582 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1583 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1585 src += 2*sizeof(float[2]);
// Same two-item conversion using an aligned load.
1592 __m128 v = _mm_load_ps((const float *)src);
1593 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1594 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1596 src += 2*sizeof(float[2]);
// One item per step: load 8 bytes into the low half and keep (0, 1) from v2.
1602 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte items from a strided source into packed four-float
// items, scaling each byte by 1/255.  dst is written with aligned stores.
// NOTE(review): the loop headers and dst increments are not visible in this
// excerpt.
1608 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1610 float *end = dst + size*4;
1611 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed input: one 16-byte load holds four items.
1612 if (stride == sizeof(unsigned char[4]))
// end4 marks the part of dst covered by whole groups of four items.
1614 float *end4 = dst + (size&~3)*4;
1615 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned load; widen bytes to 16-bit and then 32-bit by interleaving with
// zero, convert to float and scale.
1619 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1620 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1621 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1622 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1625 src += 4*sizeof(unsigned char[4]);
// Same four-item conversion using an aligned load.
1632 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1633 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1634 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1635 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1638 src += 4*sizeof(unsigned char[4]);
// One item per step: the four bytes are read as a single int through a cast
// pointer, then widened and scaled as above.
1644 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1645 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes one four-float value to dst with an aligned store; src is read once
// with an unaligned load.
// NOTE(review): the loop around the store is not visible in this excerpt;
// presumably it repeats until dst reaches end, filling `size` items - confirm.
1651 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1653 float *end = dst + 4*size;
1654 __m128 v = _mm_loadu_ps(src);
1657 _mm_store_ps(dst, v);
// Converts the currently set client vertex arrays into the internal four-float
// arrays, starting at firstvertex and covering numvertices items.
// NOTE(review): local declarations, the test of needcolors, the else line and
// the switch case labels are not visible in this excerpt.
1663 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1672 const unsigned char *b;
1673 dpsoftrast.numvertices = numvertices;
// Grow the working storage if this draw needs more vertices than it holds.
1674 if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
// Capacity starts at 4096 and doubles until it fits.
1676 if (dpsoftrast.maxvertices < 4096)
1677 dpsoftrast.maxvertices = 4096;
1678 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1679 dpsoftrast.maxvertices *= 2;
// All arrays live in one allocation whose base is in_array4f[0].
1680 if (dpsoftrast.in_array4f[0])
1681 MM_FREE(dpsoftrast.in_array4f[0]);
// One zeroed slab: ARRAY_TOTAL input arrays, ARRAY_TOTAL post arrays and one
// screen coordinate array, each maxvertices four-float items.
// NOTE(review): the allocation result is used without a NULL check.
1682 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
1683 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1684 dpsoftrast.in_array4f[i] = data;
1685 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1686 dpsoftrast.post_array4f[i] = data;
1687 dpsoftrast.screencoord4f = data;
1688 data += dpsoftrast.maxvertices * 4;
// Positions: three floats per vertex, expanded into in_array4f[0].
1690 stride = dpsoftrast.stride_vertex;
1691 v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1692 p = dpsoftrast.in_array4f[0];
1693 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
// Colours go into in_array4f[1]: taken from the float array if set...
1696 if (dpsoftrast.pointer_color4f)
1698 stride = dpsoftrast.stride_color;
1699 v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1700 p = dpsoftrast.in_array4f[1];
1701 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
// ...else from the byte array if set...
1703 else if (dpsoftrast.pointer_color4ub)
1705 stride = dpsoftrast.stride_color;
1706 b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1707 p = dpsoftrast.in_array4f[1];
1708 DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
// ...or the constant dpsoftrast.color is passed to DPSOFTRAST_Fill4f.
1712 p = dpsoftrast.in_array4f[1];
1713 DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
// Texcoord unit j fills in_array4f[j+2]; units without a pointer are skipped.
1716 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1718 if (dpsoftrast.pointer_texcoordf[j])
1720 stride = dpsoftrast.stride_texcoord[j];
1721 v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1722 p = dpsoftrast.in_array4f[j+2];
// Dispatch on the component count to the matching 2/3/4-float loader.
1723 switch(dpsoftrast.components_texcoord[j])
1726 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1729 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1732 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
// Multiplies numitems four-float vectors by a 4x4 matrix given as 16 floats.
// For each input v the result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, where
// m0..m3 are the four consecutive groups of four floats in inmatrix16f.
// NOTE(review): the loop headers and the store into out4f are not visible in
// this excerpt.
1740 void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1743 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1744 __m128 m0, m1, m2, m3;
1745 float *end = out4f + numitems*4;
// The identity test is a bytewise compare against the table above.
1746 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1748 // fast case for identity matrix
1749 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// The matrix is read with unaligned loads, so it needs no special alignment.
1752 m0 = _mm_loadu_ps(inmatrix16f);
1753 m1 = _mm_loadu_ps(inmatrix16f + 4);
1754 m2 = _mm_loadu_ps(inmatrix16f + 8);
1755 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the same arithmetic follow, differing only in the load used.
1756 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Unaligned input: broadcast each component and accumulate the four products.
1760 __m128 v = _mm_loadu_ps(in4f);
1762 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1763 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1764 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1765 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Aligned input: same computation.
1774 __m128 v = _mm_load_ps(in4f);
1776 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1777 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1778 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1779 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems four-float items from in4f to out4f.  This uses memcpy, so
// the two ranges must not overlap.
1787 void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
1789 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Projects one vertex (x, y, z, w) using the framebuffer viewport constants.
// Lanes 0..2 of v become viewportcenter + viewportscale * (x, y, z) / w, and
// lane 3 becomes the same expression evaluated for the constant 1.0.
// The viewport constants are applied while v is rotated to (1, x, y, z), so
// their first element belongs to that constant slot.
// NOTE(review): the return statement is not visible in this excerpt.
1793 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1795 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// Broadcast w to all lanes for the divide.
1796 __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
// Rotate to (w, x, y, z), then replace lane 0 with 1.0.
1797 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1798 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
// Rotate back so x, y, z return to lanes 0..2.
1799 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
// For each of numitems vertices: copies the input vertex unchanged to out4f
// and writes its viewport-projected form to screen4f.  The arithmetic is the
// same as in DPSOFTRAST_Draw_ProjectVertex.
// All three arrays are accessed with aligned loads/stores and must be 16-byte
// aligned.
// NOTE(review): the loop header and pointer increments are not visible in this
// excerpt.
1804 void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
1807 float *end = out4f + numitems*4;
1808 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// w is broadcast to all lanes for the divide below.
1811 __m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1812 _mm_store_ps(out4f, v);
// Rotate to (1, x, y, z), scale, divide by w, offset, rotate back.
1813 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1814 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1815 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Transform and projection in one pass.  Each input vertex is multiplied by
// the 4x4 matrix as in DPSOFTRAST_Array_Transform and stored to out4f; its
// viewport-projected form is stored to screen4f.
// out4f and screen4f are written with aligned stores; in4f may be unaligned.
// NOTE(review): the loop headers and pointer increments are not visible in
// this excerpt.
1823 void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
1826 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1827 __m128 m0, m1, m2, m3, viewportcenter, viewportscale;
1828 float *end = out4f + numitems*4;
// An identity matrix (bytewise compare) goes to the projection-only routine.
1829 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1831 DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
1834 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1835 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// The matrix is read with unaligned loads.
1836 m0 = _mm_loadu_ps(inmatrix16f);
1837 m1 = _mm_loadu_ps(inmatrix16f + 4);
1838 m2 = _mm_loadu_ps(inmatrix16f + 8);
1839 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the same arithmetic follow, differing only in the load used.
1840 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Unaligned input.  Transform: v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3.
1844 __m128 v = _mm_loadu_ps(in4f), w;
1845 v = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1846 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1847 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1848 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
1849 _mm_store_ps(out4f, v);
// Project: rotate to (1, x, y, z), scale, divide by w, offset, rotate back.
1850 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1851 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1852 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1853 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Aligned input: same computation.
1863 __m128 v = _mm_load_ps(in4f), w;
1864 v = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1865 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1866 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1867 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
1868 _mm_store_ps(out4f, v);
1869 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1870 v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1871 v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1872 _mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
// Computes a per-pixel z value (1/w) along a span.  The exact reciprocal is
// evaluated only at sub-span boundaries and stepped linearly in between.
// NOTE(review): the declarations of x/z/dz, the assignment that seeds z from
// endz, and the inner loop body (presumably the store into zf) are not visible
// in this excerpt.
1881 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1884 int startx = span->startx;
1885 int endx = span->endx;
// w is linear in screen position: w[0] per x, w[1] per y, w[2] constant.
1886 float wslope = triangle->w[0];
1887 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// Exact 1/w at the first pixel of the span.
1888 float endz = 1.0f / (w + wslope * startx);
1889 for (x = startx;x < endx;)
// Each sub-span covers up to DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
1891 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// The last sub-span is clamped to end on the final pixel.
1893 if(nextsub >= endx) nextsub = endsub = endx-1;
1894 endz = 1.0f / (w + wslope * nextsub);
// Linear step across the sub-span; zero when it is a single pixel, which
// avoids dividing by zero.
1895 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1896 for (; x <= endsub; x++, z += dz)
// Writes a span of float colours (in4f, four floats per pixel) into colour
// buffer 0, applying the thread's blend mode.  Input components 0 and 2 are
// swapped on the way to the framebuffer bytes (presumably RGBA input, BGRA
// pixels - confirm).
// NOTE(review): local declarations (x, d[], a, b), the per-pixel pixelmask
// tests inside the loops, and the break/brace lines are not visible in this
// excerpt.
1901 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1904 int startx = span->startx;
1905 int endx = span->endx;
1908 unsigned char * RESTRICT pixelmask = span->pixelmask;
1909 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Offset to the pixel addressed by span->x, span->y; loops index it by x.
1912 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
// NOTE(review): the threshold below is a fixed 0.5; thread->alphafunc and
// thread->alphavalue are not consulted here.
1913 // handle alphatest now (this affects depth writes too)
1914 if (thread->alphatest)
1915 for (x = startx;x < endx;x++)
1916 if (in4f[x*4+3] < 0.5f)
1917 pixelmask[x] = false;
1918 // FIXME: this does not handle bigendian
1919 switch(thread->fb_blendmode)
// OPAQUE: result = src * 255.  Only the upper bound is clamped here.
1921 case DPSOFTRAST_BLENDMODE_OPAQUE:
1922 for (x = startx;x < endx;x++)
1926 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1927 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1928 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1929 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1930 pixel[x*4+0] = d[0];
1931 pixel[x*4+1] = d[1];
1932 pixel[x*4+2] = d[2];
1933 pixel[x*4+3] = d[3];
// ALPHA: result = src * alpha * 255 + dst * (1 - alpha).
1936 case DPSOFTRAST_BLENDMODE_ALPHA:
1937 for (x = startx;x < endx;x++)
1941 a = in4f[x*4+3] * 255.0f;
1942 b = 1.0f - in4f[x*4+3];
1943 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1944 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1945 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1946 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1947 pixel[x*4+0] = d[0];
1948 pixel[x*4+1] = d[1];
1949 pixel[x*4+2] = d[2];
1950 pixel[x*4+3] = d[3];
// ADDALPHA: result = src * alpha * 255 + dst.
1953 case DPSOFTRAST_BLENDMODE_ADDALPHA:
1954 for (x = startx;x < endx;x++)
1958 a = in4f[x*4+3] * 255.0f;
1959 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1960 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1961 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1962 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1963 pixel[x*4+0] = d[0];
1964 pixel[x*4+1] = d[1];
1965 pixel[x*4+2] = d[2];
1966 pixel[x*4+3] = d[3];
// ADD: result = src * 255 + dst.
1969 case DPSOFTRAST_BLENDMODE_ADD:
1970 for (x = startx;x < endx;x++)
1974 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1975 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1976 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1977 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1978 pixel[x*4+0] = d[0];
1979 pixel[x*4+1] = d[1];
1980 pixel[x*4+2] = d[2];
1981 pixel[x*4+3] = d[3];
// INVMOD: result = dst * (1 - src).
1984 case DPSOFTRAST_BLENDMODE_INVMOD:
1985 for (x = startx;x < endx;x++)
1989 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1990 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1991 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1992 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1993 pixel[x*4+0] = d[0];
1994 pixel[x*4+1] = d[1];
1995 pixel[x*4+2] = d[2];
1996 pixel[x*4+3] = d[3];
// MUL: result = src * dst.
1999 case DPSOFTRAST_BLENDMODE_MUL:
2000 for (x = startx;x < endx;x++)
2004 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2005 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2006 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2007 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2008 pixel[x*4+0] = d[0];
2009 pixel[x*4+1] = d[1];
2010 pixel[x*4+2] = d[2];
2011 pixel[x*4+3] = d[3];
// MUL2: result = src * dst * 2.
2014 case DPSOFTRAST_BLENDMODE_MUL2:
2015 for (x = startx;x < endx;x++)
2019 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2020 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2021 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2022 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2023 pixel[x*4+0] = d[0];
2024 pixel[x*4+1] = d[1];
2025 pixel[x*4+2] = d[2];
2026 pixel[x*4+3] = d[3];
// SUBALPHA: result = dst - src * alpha * 255.  This is the only mode that also
// clamps at zero.
2029 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2030 for (x = startx;x < endx;x++)
2034 a = in4f[x*4+3] * -255.0f;
2035 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2036 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2037 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2038 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2039 pixel[x*4+0] = d[0];
2040 pixel[x*4+1] = d[1];
2041 pixel[x*4+2] = d[2];
2042 pixel[x*4+3] = d[3];
// PSEUDOALPHA: result = src * a + dst * (1 - alpha).
// NOTE(review): the assignment of `a` for this mode is not visible in this
// excerpt - confirm its value.
2045 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2046 for (x = startx;x < endx;x++)
2051 b = 1.0f - in4f[x*4+3];
2052 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2053 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2054 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2055 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2056 pixel[x*4+0] = d[0];
2057 pixel[x*4+1] = d[1];
2058 pixel[x*4+2] = d[2];
2059 pixel[x*4+3] = d[3];
// Byte-colour counterpart of DPSOFTRAST_Draw_Span_Finish: writes a span of
// four-byte input pixels (in4ub) into colour buffer 0 with SSE2 integer
// blending.  The input is used in framebuffer byte order, with no channel
// swap.
// NOTE(review): local declarations (x, src, dst), the case labels inside the
// macro switch, the FINISHBLEND( invocation lines and the break/brace lines
// are not visible in this excerpt.
2065 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2069 int startx = span->startx;
2070 int endx = span->endx;
// 32-bit views of the input and the framebuffer for whole-pixel access.
2071 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2072 unsigned char * RESTRICT pixelmask = span->pixelmask;
2073 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2074 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// Offset both views to the pixel addressed by span->x, span->y.
2077 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2078 pixeli += span->y * dpsoftrast.fb_width + span->x;
// NOTE(review): in4ub holds bytes in 0..255, so the comparison with 0.5f below
// rejects only alpha == 0.  The float path uses 0.5 on a 0..1 scale, which
// would correspond to 128 here - confirm whether that was intended.
2079 // handle alphatest now (this affects depth writes too)
2080 if (thread->alphatest)
2081 for (x = startx;x < endx;x++)
2082 if (in4ub[x*4+3] < 0.5f)
2083 pixelmask[x] = false;
2084 // FIXME: this does not handle bigendian
2085 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1, copy four pixels with a
// single 16-byte load/store.  The mask bytes are read as one 32-bit word
// through a cast pointer.
2087 case DPSOFTRAST_BLENDMODE_OPAQUE:
2088 for (x = startx;x + 4 <= endx;)
2090 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2092 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2106 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1) is the shared blending loop.  It handles two
// pixels per step, switching on the two mask bytes read as one 16-bit value,
// then a trailing loop handles the rest one pixel at a time.  src and dst are
// widened to 16 bits per channel first; blend2 is the statement applied to a
// pair of pixels, blend1 the one applied to a single pixel.
// ALPHA itself computes dst += (src - dst) * alpha / 256 using shifts and a
// high multiply.
2107 #define FINISHBLEND(blend2, blend1) \
2108 for (x = startx;x + 2 <= endx;x += 2) \
2111 switch (*(const unsigned short*)&pixelmask[x]) \
2114 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2115 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2117 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2120 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2121 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2123 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2126 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2127 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2129 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2134 for(;x < endx; x++) \
2137 if (!pixelmask[x]) \
2139 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2140 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2142 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2146 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2147 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2149 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2150 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8.  `blend` is the alpha channel broadcast
// to every channel of its pixel.
2153 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2155 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2156 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2158 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2159 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst.  The pack step in the macro saturates to 0..255.
2162 case DPSOFTRAST_BLENDMODE_ADD:
2163 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8.
2165 case DPSOFTRAST_BLENDMODE_INVMOD:
2167 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2169 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8.
2172 case DPSOFTRAST_BLENDMODE_MUL:
2173 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, twice the MUL result.
2175 case DPSOFTRAST_BLENDMODE_MUL2:
2176 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8.
2178 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2180 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2181 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2183 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2184 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8).
2187 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2189 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2190 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2192 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2193 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// Samples the texture bound to unit texunitindex for every pixel x in
// [span->startx, span->endx) and stores the result as four floats at
// out4f[x*4 .. x*4+3]. Texel bytes 2,1,0,3 are written to output channels
// 0,1,2,3 and scaled to 0..1.
// Texture coordinates are interpolated from vertex array arrayindex
// (DPSOFTRAST_CALCATTRIB4F) and multiplied by zf[x].
// NOTE(review): zf presumably holds a per-pixel perspective factor -- confirm.
2200 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2203 int startx = span->startx;
2204 int endx = span->endx;
2209 float tc[2], endtc[2];
2211 unsigned int tci[2];
2212 unsigned int tci1[2];
2213 unsigned int tcimin[2];
2214 unsigned int tcimax[2];
2219 const unsigned char * RESTRICT pixelbase;
2220 const unsigned char * RESTRICT pixel[4];
2221 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2222 // if no texture is bound, just fill it with white
2225 for (x = startx;x < endx;x++)
2227 out4f[x*4+0] = 1.0f;
2228 out4f[x*4+1] = 1.0f;
2229 out4f[x*4+2] = 1.0f;
2230 out4f[x*4+3] = 1.0f;
2234 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2235 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2236 // if this mipmap of the texture is 1 pixel, just fill it with that color
2237 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads texture->bytes directly instead of pixelbase, so for
// a mip level at a nonzero offset it would use the first texel of the buffer;
// DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8 reads from pixelbase -- confirm
// which is intended
2239 c[0] = texture->bytes[2] * (1.0f/255.0f);
2240 c[1] = texture->bytes[1] * (1.0f/255.0f);
2241 c[2] = texture->bytes[0] * (1.0f/255.0f);
2242 c[3] = texture->bytes[3] * (1.0f/255.0f);
2243 for (x = startx;x < endx;x++)
2245 out4f[x*4+0] = c[0];
2246 out4f[x*4+1] = c[1];
2247 out4f[x*4+2] = c[2];
2248 out4f[x*4+3] = c[3];
// filter keeps only the LINEAR bit of the texture's filter mode
2252 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2253 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2254 flags = texture->flags;
// mipmap[mip][2] and [3] scale the two texture coordinates into texel units;
// [2] also serves as the row width (tciwidth) when addressing texels
2255 tcscale[0] = texture->mipmap[mip][2];
2256 tcscale[1] = texture->mipmap[mip][3];
2257 tciwidth = texture->mipmap[mip][2];
// tcimax is the largest texel index per axis (used by the clamp paths);
// tciwrapmask is the same value used as an AND mask (used by the wrap paths)
// NOTE(review): masking with size-1 only wraps correctly for power-of-two
// sizes -- confirm sizes are constrained elsewhere
2260 tcimax[0] = texture->mipmap[mip][2]-1;
2261 tcimax[1] = texture->mipmap[mip][3]-1;
2262 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2263 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel, offset by half a texel
2264 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2265 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// walk the span in sub-spans: the coordinate is evaluated at each sub-span end
// (nextsub) and stepped linearly in 16.16 fixed point (subtc/substep) for the
// pixels in between
2266 for (x = startx;x < endx;)
2268 unsigned int subtc[2];
2269 unsigned int substep[2];
2270 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2271 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// end the sub-span on the final pixel of the span and rescale the step to the
// shortened length
2274 nextsub = endsub = endx-1;
2275 if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2279 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2280 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// per-pixel step and starting coordinate in 16.16 fixed point
// NOTE(review): these convert float to unsigned int; the conversion is
// undefined in C when the float value is negative -- confirm behaviour on the
// target compilers
2281 substep[0] = (endtc[0] - tc[0]) * subscale;
2282 substep[1] = (endtc[1] - tc[1]) * subscale;
2283 subtc[0] = tc[0] * (1<<16);
2284 subtc[1] = tc[1] * (1<<16);
// bilinear sampling, texel indices clamped to [tcimin, tcimax]
2287 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2289 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): frac takes the low 12 bits of the 16-bit fraction
// (subtc & 0xFFF) rather than its top 12 bits -- confirm this is intended
// the four lerp weights always sum to 0x1000*0x1000 = 2^24
2291 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2292 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2293 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2294 tci[0] = subtc[0]>>16;
2295 tci[1] = subtc[1]>>16;
2296 tci1[0] = tci[0] + 1;
2297 tci1[1] = tci[1] + 1;
2298 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2299 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2300 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2301 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the four neighbouring texels: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2302 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2303 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2304 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2305 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * 2^24, so this normalizes the weighted byte sum to 0..1
2306 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2307 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2308 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2309 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2310 out4f[x*4+0] = c[0];
2311 out4f[x*4+1] = c[1];
2312 out4f[x*4+2] = c[2];
2313 out4f[x*4+3] = c[3];
// bilinear sampling, texel indices wrapped with tciwrapmask
2318 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2320 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2321 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2322 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2323 tci[0] = subtc[0]>>16;
2324 tci[1] = subtc[1]>>16;
2325 tci1[0] = tci[0] + 1;
2326 tci1[1] = tci[1] + 1;
2327 tci[0] &= tciwrapmask[0];
2328 tci[1] &= tciwrapmask[1];
2329 tci1[0] &= tciwrapmask[0];
2330 tci1[1] &= tciwrapmask[1];
2331 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2332 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2333 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2334 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2335 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2336 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2337 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2338 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2339 out4f[x*4+0] = c[0];
2340 out4f[x*4+1] = c[1];
2341 out4f[x*4+2] = c[2];
2342 out4f[x*4+3] = c[3];
// single-texel sampling, texel indices clamped to [tcimin, tcimax]
2346 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2348 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2350 tci[0] = subtc[0]>>16;
2351 tci[1] = subtc[1]>>16;
2352 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2353 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2354 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2355 c[0] = pixel[0][2] * (1.0f / 255.0f);
2356 c[1] = pixel[0][1] * (1.0f / 255.0f);
2357 c[2] = pixel[0][0] * (1.0f / 255.0f);
2358 c[3] = pixel[0][3] * (1.0f / 255.0f);
2359 out4f[x*4+0] = c[0];
2360 out4f[x*4+1] = c[1];
2361 out4f[x*4+2] = c[2];
2362 out4f[x*4+3] = c[3];
// single-texel sampling, texel indices wrapped with tciwrapmask
2367 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2369 tci[0] = subtc[0]>>16;
2370 tci[1] = subtc[1]>>16;
2371 tci[0] &= tciwrapmask[0];
2372 tci[1] &= tciwrapmask[1];
2373 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2374 c[0] = pixel[0][2] * (1.0f / 255.0f);
2375 c[1] = pixel[0][1] * (1.0f / 255.0f);
2376 c[2] = pixel[0][0] * (1.0f / 255.0f);
2377 c[3] = pixel[0][3] * (1.0f / 255.0f);
2378 out4f[x*4+0] = c[0];
2379 out4f[x*4+1] = c[1];
2380 out4f[x*4+2] = c[2];
2381 out4f[x*4+3] = c[3];
// SSE2 2D texture sampler for one span with 8-bit output: for every pixel x in
// [span->startx, span->endx) it writes one 32-bit texel-format value to
// out4ub (addressed as 32-bit words through outi). Texture coordinates come
// from vertex array arrayindex (DPSOFTRAST_CALCATTRIB) and are multiplied by
// zf[x]. Depending on the texture's filter bit and CLAMPTOEDGE flag the texel
// is either copied unchanged or blended from its four neighbours.
2387 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2391 int startx = span->startx;
2392 int endx = span->endx;
2394 __m128 data, slope, tcscale;
2395 __m128i tcsize, tcmask, tcoffset, tcmax;
2397 __m128i subtc, substep, endsubtc;
2400 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2401 const unsigned char * RESTRICT pixelbase;
2402 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2403 // if no texture is bound, just fill it with white
2406 memset(out4ub + startx*4, 255, span->length*4);
2409 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2410 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2411 // if this mipmap of the texture is 1 pixel, just fill it with that color
2412 if (texture->mipmap[mip][1] == 4)
// the single texel of this level, read as one 32-bit word
2414 unsigned int k = *((const unsigned int *)pixelbase);
2415 for (x = startx;x < endx;x++)
// filter keeps only the LINEAR bit of the texture's filter mode
2419 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2420 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2421 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3]) repeated twice; tcmask = tcsize - 1
2422 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2423 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2424 tcscale = _mm_cvtepi32_ps(tcsize);
// data/slope: the first two attribute components duplicated into both halves
// and scaled to texel units
2425 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2426 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// coordinate at the first pixel, multiplied by zf[startx] and offset by -0.5,
// then converted to 16.16 fixed point
2427 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2428 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds 4 in its low word and mipmap[mip][2]*4 in
// its high word, so _mm_madd_epi16 on an (x, y) word pair yields the byte
// offset x*4 + y*(row size in bytes)
2429 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmask packed to 16-bit words for the epi16 clamp/mask operations below
2430 tcmax = _mm_packs_epi32(tcmask, tcmask);
// walk the span in sub-spans: the coordinate is evaluated at each sub-span end
// (nextsub) and stepped linearly in 16.16 fixed point for the pixels in between
2431 for (x = startx;x < endx;)
2433 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2434 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// end the sub-span on the final pixel of the span and rescale the step to the
// shortened length
2437 nextsub = endsub = endx-1;
2438 if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2442 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2443 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2444 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// pack two consecutive pixels into subtc (low half = pixel x, high half =
// pixel x+1) and double the step so each loop iteration advances two pixels
2445 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2446 substep = _mm_slli_epi32(substep, 1);
// fast bilinear path: taken only when the integer coordinates at both ends of
// the sub-span are >= 0 and < tcmask, so no clamping or wrapping is applied
2449 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2450 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row size in bytes, taken from the high word of tcoffset
2452 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
// two pixels per iteration: each 64-bit load fetches a texel and its
// right-hand neighbour, from row y (ptr) and row y+1 (ptr + stride)
2453 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2455 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2456 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2457 tci = _mm_madd_epi16(tci, tcoffset);
2458 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2459 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2460 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2461 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2462 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2463 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm: every 16-bit word shifted right by one, which keeps the fraction
// words non-negative for _mm_mulhi_epi16
2464 fracm = _mm_srli_epi16(subtc, 1);
// first blend row y with row y+1 using the second coordinate's fraction (one
// blend per pixel), then blend left with right texels using the first
// coordinate's fraction
2465 pix1 = _mm_add_epi16(pix1,
2466 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2467 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2468 pix3 = _mm_add_epi16(pix3,
2469 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2470 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2471 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2472 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2473 pix2 = _mm_add_epi16(pix2,
2474 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2475 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2476 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel variant of the loop body above
2480 const unsigned char * RESTRICT ptr1;
2481 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2482 tci = _mm_madd_epi16(tci, tcoffset);
2483 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2484 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2485 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2486 fracm = _mm_srli_epi16(subtc, 1);
2487 pix1 = _mm_add_epi16(pix1,
2488 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2489 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2490 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2491 pix1 = _mm_add_epi16(pix1,
2492 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2493 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2494 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear with CLAMPTOEDGE: the four corner texels are fetched individually
2498 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2500 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2502 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
// add (0,0), (1,0), (0,1), (1,1) to get the four corner coordinates of the
// first pixel, then clamp each to [0, tcmax]
2503 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2504 tci = _mm_madd_epi16(tci, tcoffset);
2505 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2506 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2507 _mm_setzero_si128());
2508 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2509 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2510 _mm_setzero_si128());
// corner coordinates of the second pixel (high half of subtc)
// NOTE(review): the second pixel's coordinates are ANDed with tcmax (wrap)
// whereas the first pixel above is clamped with min/max; in this CLAMPTOEDGE
// branch that looks inconsistent -- confirm whether min/max was intended here
2511 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2512 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2513 tci = _mm_madd_epi16(tci, tcoffset);
2514 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2515 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2516 _mm_setzero_si128());
2517 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2518 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2519 _mm_setzero_si128());
2520 fracm = _mm_srli_epi16(subtc, 1);
2521 pix1 = _mm_add_epi16(pix1,
2522 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2523 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2524 pix3 = _mm_add_epi16(pix3,
2525 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2526 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2527 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2528 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2529 pix2 = _mm_add_epi16(pix2,
2530 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2531 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2532 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel variant of the clamped bilinear fetch
2536 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2537 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2538 tci = _mm_madd_epi16(tci, tcoffset);
2539 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2540 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2541 _mm_setzero_si128());
2542 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2543 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2544 _mm_setzero_si128());
2545 fracm = _mm_srli_epi16(subtc, 1);
2546 pix1 = _mm_add_epi16(pix1,
2547 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2548 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2549 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2550 pix1 = _mm_add_epi16(pix1,
2551 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2552 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2553 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear with wrapping: corner coordinates are ANDed with tcmax
2559 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2561 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2562 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2563 tci = _mm_madd_epi16(tci, tcoffset);
2564 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2565 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2566 _mm_setzero_si128());
2567 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2568 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2569 _mm_setzero_si128());
2570 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2571 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2572 tci = _mm_madd_epi16(tci, tcoffset);
2573 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2574 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2575 _mm_setzero_si128());
2576 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2577 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2578 _mm_setzero_si128());
2579 fracm = _mm_srli_epi16(subtc, 1);
2580 pix1 = _mm_add_epi16(pix1,
2581 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2582 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2583 pix3 = _mm_add_epi16(pix3,
2584 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2585 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2586 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2587 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2588 pix2 = _mm_add_epi16(pix2,
2589 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2590 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2591 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel variant of the wrapped bilinear fetch
2595 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2596 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2597 tci = _mm_madd_epi16(tci, tcoffset);
2598 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2599 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2600 _mm_setzero_si128());
2601 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2602 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2603 _mm_setzero_si128());
2604 fracm = _mm_srli_epi16(subtc, 1);
2605 pix1 = _mm_add_epi16(pix1,
2606 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2607 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2608 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2609 pix1 = _mm_add_epi16(pix1,
2610 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2612 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling with CLAMPTOEDGE: coordinates clamped to [0, tcmax],
// the texel's 32-bit word is copied unchanged
2619 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2621 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2623 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2624 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2625 tci = _mm_madd_epi16(tci, tcoffset);
2626 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2627 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// one-pixel variant
2631 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2632 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2633 tci = _mm_madd_epi16(tci, tcoffset);
2634 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling with wrapping: coordinates ANDed with tcmax
2640 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2642 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2643 tci = _mm_and_si128(tci, tcmax);
2644 tci = _mm_madd_epi16(tci, tcoffset);
2645 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2646 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// one-pixel variant
2650 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2651 tci = _mm_and_si128(tci, tcmax);
2652 tci = _mm_madd_epi16(tci, tcoffset);
2653 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D BGRA8 sampler. The visible implementation
// does no texture lookup: it sets span->length*4 bytes to 255.
// NOTE(review): the fill starts at out4ub itself, whereas the 2D sampler's
// "no texture" fill starts at out4ub + startx*4 -- confirm which offset
// callers expect.
2662 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2665 memset(out4ub, 255, span->length*4);
2668 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For every pixel x in [span->startx, span->endx): multiplies the four floats
// at in4f[x*4..] component-wise by the interpolated value of vertex array
// arrayindex and stores the product in out4f[x*4..].
2674 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2677 int startx = span->startx;
2678 int endx = span->endx;
2683 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2684 for (x = startx;x < endx;x++)
// c = attribute evaluated linearly at x (data + slope*x), scaled by z
2687 c[0] = (data[0] + slope[0]*x) * z;
2688 c[1] = (data[1] + slope[1]*x) * z;
2689 c[2] = (data[2] + slope[2]*x) * z;
2690 c[3] = (data[3] + slope[3]*x) * z;
2691 out4f[x*4+0] = in4f[x*4+0] * c[0];
2692 out4f[x*4+1] = in4f[x*4+1] * c[1];
2693 out4f[x*4+2] = in4f[x*4+2] * c[2];
2694 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For every pixel x in [span->startx, span->endx): stores the interpolated
// value of vertex array arrayindex as four floats in out4f[x*4..].
2698 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2701 int startx = span->startx;
2702 int endx = span->endx;
2707 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2708 for (x = startx;x < endx;x++)
// c = attribute evaluated linearly at x (data + slope*x), scaled by z
2711 c[0] = (data[0] + slope[0]*x) * z;
2712 c[1] = (data[1] + slope[1]*x) * z;
2713 c[2] = (data[2] + slope[2]*x) * z;
2714 c[3] = (data[3] + slope[3]*x) * z;
2715 out4f[x*4+0] = c[0];
2716 out4f[x*4+1] = c[1];
2717 out4f[x*4+2] = c[2];
2718 out4f[x*4+3] = c[3];
// For every pixel x in [span->startx, span->endx) and each of the four
// channels: out4f = ina4f + max(inb4f - subcolor, 0).
2722 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2724 int x, startx = span->startx, endx = span->endx;
2725 float c[4], localcolor[4];
// copy subcolor into locals before the loop
2726 localcolor[0] = subcolor[0];
2727 localcolor[1] = subcolor[1];
2728 localcolor[2] = subcolor[2];
2729 localcolor[3] = subcolor[3];
2730 for (x = startx;x < endx;x++)
// subtract the threshold colour and clamp negative results to zero
2732 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2733 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2734 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2735 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2736 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2737 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2738 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2739 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// For every pixel x in [span->startx, span->endx): out4f = ina4f * inb4f,
// component-wise over the four floats of the pixel.
2743 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2745 int x, startx = span->startx, endx = span->endx;
2746 for (x = startx;x < endx;x++)
2748 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2749 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2750 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2751 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// For every pixel x in [span->startx, span->endx): out4f = ina4f + inb4f,
// component-wise over the four floats of the pixel.
2755 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2757 int x, startx = span->startx, endx = span->endx;
2758 for (x = startx;x < endx;x++)
2760 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2761 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2762 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2763 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// For every pixel x in [span->startx, span->endx): out4f = ina4f*a + inb4f*b
// for each of the four channels, where a is one minus channel 3 of inb4f.
2767 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2769 int x, startx = span->startx, endx = span->endx;
2771 for (x = startx;x < endx;x++)
// weight applied to ina4f: one minus the fourth component of inb4f
2773 a = 1.0f - inb4f[x*4+3];
2775 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2776 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2777 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2778 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// For every pixel x in [span->startx, span->endx): blends in4f towards the
// constant color using color[3] as the blend factor:
// out4f = in4f * (1 - color[3]) + color * color[3], for all four channels.
2782 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2784 int x, startx = span->startx, endx = span->endx;
2785 float localcolor[4], ilerp, lerp;
// copy color into locals before the loop
2786 localcolor[0] = color[0];
2787 localcolor[1] = color[1];
2788 localcolor[2] = color[2];
2789 localcolor[3] = color[3];
// lerp is the weight of the constant color, ilerp the weight of the input
2790 ilerp = 1.0f - localcolor[3];
2791 lerp = localcolor[3];
2792 for (x = startx;x < endx;x++)
2794 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2795 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2796 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2797 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2, 8-bit version of MultiplyVarying: for every pixel x in
// [span->startx, span->endx) multiplies the four bytes at in4ub[x*4..] by the
// interpolated value of vertex array arrayindex (times zf[x]) and stores the
// four result bytes at out4ub[x*4..].
2803 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2807 int startx = span->startx;
2808 int endx = span->endx;
2810 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute so its order matches the byte order
// of the pixels
2811 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2812 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// advance to the first pixel and scale by 256: the pixel bytes are unpacked
// into the high byte of each 16-bit word below, so _mm_mulhi_epu16 (which
// keeps the upper 16 bits of the product) yields byte * attribute
2813 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2814 data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2815 slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
// two pixels per iteration; data is stepped once inside the body and once in
// the loop increment
2816 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2818 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2819 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2820 data = _mm_add_ps(data, slope);
2821 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2822 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2823 pix = _mm_mulhi_epu16(pix, mod);
// _mm_packus_epi16 saturates each result to 0..255
2824 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// remaining pixel(s), one at a time
2826 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2828 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2829 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2830 mod = _mm_packs_epi32(mod, mod);
2831 pix = _mm_mulhi_epu16(pix, mod);
2832 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2, 8-bit version of Varying: for every pixel x in
// [span->startx, span->endx) stores the interpolated value of vertex array
// arrayindex (times zf[x], scaled by 255) as four bytes at out4ub[x*4..].
2837 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2841 int startx = span->startx;
2842 int endx = span->endx;
2844 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute before it is written out as bytes
2845 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2846 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// advance to the first pixel and scale to the 0..255 byte range
2847 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2848 data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2849 slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
// two pixels per iteration; data is stepped once inside the body and once in
// the loop increment
2850 for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2852 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2853 data = _mm_add_ps(data, slope);
2854 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2855 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
// _mm_packus_epi16 saturates each component to 0..255
2856 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// remaining pixel(s), one at a time
2858 for (;x < endx;x++, data = _mm_add_ps(data, slope))
2860 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2861 pix = _mm_packs_epi32(pix, pix);
2862 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2, 8-bit version of AddBloom: for every pixel x in
// [span->startx, span->endx) computes ina4ub + (inb4ub - subcolor*255) per
// byte channel in 16-bit arithmetic and stores the result, saturated to
// 0..255, at out4ub[x*4..].
// NOTE(review): DPSOFTRAST_Draw_Span_AddBloom clamps (inb - subcolor) at zero
// before adding ina; here only the final sum is saturated, so the two versions
// can differ where inb < subcolor -- confirm this is acceptable.
2867 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2870 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 with components 0 and 2 swapped, then packed to
// 16-bit words and replicated so it covers two pixels at once
2871 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2872 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// two pixels per iteration
2873 for (x = startx;x+2 <= endx;x+=2)
2875 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2876 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2877 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2878 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// one-pixel variant of the loop body above
2882 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2883 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2884 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2885 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per channel: out = (ina * inb) >> 8 for x in [span->startx, span->endx).
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - source buffers, 4 bytes per pixel
2890 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2893 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
2894 for (x = startx;x+2 <= endx;x+=2)
// ina goes into the low byte of each 16-bit word, inb into the high byte (inb << 8),
// so the high half of the 16x16 product equals (ina * inb) >> 8
2896 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2897 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2898 pix1 = _mm_mulhi_epu16(pix1, pix2);
2899 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte load and store)
2903 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2904 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2905 pix1 = _mm_mulhi_epu16(pix1, pix2);
2906 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per channel: out = saturate_u8(ina + inb) for x in [span->startx, span->endx).
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - source buffers, 4 bytes per pixel
2911 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2914 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration: widen both inputs to 16 bit, add, saturate back to 8 bit
2915 for (x = startx;x+2 <= endx;x+=2)
2917 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2918 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2919 pix1 = _mm_add_epi16(pix1, pix2);
2920 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte load and store)
2924 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2925 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2926 pix1 = _mm_add_epi16(pix1, pix2);
2927 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per channel: out = saturate_u8(ina + ((tint * inb) >> 8)), with tint = inbtintbgra * 256,
// for x in [span->startx, span->endx).
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - source buffers, 4 bytes per pixel
//   inbtintbgra    - four float multipliers for inb; used in the order given (no lane swap)
2932 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2935 int x, startx = span->startx, endx = span->endx;
// tint as 8.8 fixed point, narrowed to 16 bit and duplicated so it covers two pixels
2936 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2937 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
// two pixels per iteration
2938 for (x = startx;x+2 <= endx;x+=2)
// inb is placed in the high byte of each word (inb << 8) so that mulhi yields (tint * inb) >> 8
2940 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2941 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2942 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2943 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte load and store)
2947 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2948 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2949 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2950 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends inb over ina using inb's own fourth channel as the weight:
// per channel out = saturate_u8(ina + (((inb - ina) * inb[3]) >> 8)), for x in [span->startx, span->endx).
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - source buffers, 4 bytes per pixel
2955 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2958 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
2959 for (x = startx;x+2 <= endx;x+=2)
2961 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2962 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast word 3 of each pixel of inb to all four words of that pixel
2963 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so that the signed high-half multiply gives (difference * weight) >> 8
2964 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2965 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant: only the low four words carry data, so only the low-half shuffle is applied
2969 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2970 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2971 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2972 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2973 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a constant color over in4ub, weighted by the color's own fourth component:
// per channel out = saturate_u8(in + (((color*255 - in) * (color[3]*255)) >> 8)),
// for x in [span->startx, span->endx).
//   out4ub - destination, 4 bytes per pixel
//   in4ub  - source buffer, 4 bytes per pixel
//   color  - four floats; lanes 0 and 2 are swapped after loading
//            (presumably given as RGBA and converted to BGRA - TODO confirm)
2978 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2981 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255, lanes 0/2 swapped, narrowed to 16 bit and duplicated so it covers two pixels
2982 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
2983 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// weight: word 3 of the color broadcast to every word, pre-shifted left by 4 for the multiply below
2984 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration; difference << 4 times weight << 4, high half = (difference * weight) >> 8
2985 for (x = startx;x+2 <= endx;x+=2)
2987 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
2988 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2989 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (4-byte load and store)
2993 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2994 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2995 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3002 void DPSOFTRAST_VertexShader_Generic(void)
3004 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3005 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3006 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3007 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3008 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3011 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3013 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3014 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3015 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3016 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3017 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3018 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3020 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3021 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3022 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3024 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3025 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3028 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3030 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3033 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3035 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3038 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3043 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3044 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3049 void DPSOFTRAST_VertexShader_PostProcess(void)
3051 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3052 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3053 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3056 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3058 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3059 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3060 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3061 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3062 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3063 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3064 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3066 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3067 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3069 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3070 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3072 // TODO: implement saturation
3074 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3076 // TODO: implement gammaramps
3078 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3083 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3085 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3088 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3090 // this is never called (because colormask is off when this shader is used)
3091 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3092 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3093 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3094 memset(buffer_FragColorbgra8, 0, span->length*4);
3095 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3100 void DPSOFTRAST_VertexShader_FlatColor(void)
3102 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3103 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3106 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3108 int x, startx = span->startx, endx = span->endx;
3109 int Color_Ambienti[4];
3110 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3111 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3112 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3113 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3114 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3115 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3116 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3117 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3118 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3119 for (x = startx;x < endx;x++)
3121 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3122 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3123 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3124 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3126 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3131 void DPSOFTRAST_VertexShader_VertexColor(void)
3133 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3134 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3135 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span.
//   thread   - per-thread rasterizer state (uniforms, alpha test, blend mode, bound textures)
//   triangle - triangle setup data used for attribute interpolation
//   span     - the horizontal run of pixels to shade
// Per channel: out = ((Color_Diffuse * vertexcolor + Color_Ambient) * texel) >> 8, saturated to 8 bit.
// The fourth channel uses the Alpha uniform in place of ambient and has its diffuse term forced to zero.
3138 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3141 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default write straight into color buffer 0 at the span's position
3142 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3143 int x, startx = span->startx, endx = span->endx;
3144 __m128i Color_Ambientm, Color_Diffusem;
3146 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3147 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3148 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3149 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3150 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3151 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// with alpha test or any non-opaque blend mode, shade into the temporary buffer and
// let DPSOFTRAST_Draw_Span_FinishBGRA8 (at the end) write it out
3152 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3153 pixel = buffer_FragColorbgra8;
// ambient: rgb * 256 with lanes 0/2 swapped, lane 3 replaced by Alpha * 255, narrowed to 16 bit
3154 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3155 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3156 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3157 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: rgb * 4096 with lanes 0/2 swapped, lane 3 cleared, narrowed to 16 bit
3158 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3159 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3160 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolated vertex color: lanes 0/2 swapped, advanced to startx, scaled by 4096
// (4096 * 4096 >> 16 = 256, matching the scale of the ambient term)
3161 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3162 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3163 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3164 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3165 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3166 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3167 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3169 __m128i color, mod, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes equal 1
// NOTE(review): the mask bytes are read through an unsigned int pointer cast (unaligned, type-punned)
3170 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3173 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3174 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// mod holds the vertex color of pixels x and x+1, mod2 that of pixels x+2 and x+3;
// data is stepped three times here in addition to the loop increment
3175 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3176 data = _mm_add_ps(data, slope);
3177 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3178 data = _mm_add_ps(data, slope);
3179 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3180 data = _mm_add_ps(data, slope);
3181 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// texels are placed in the high byte of each word (texel << 8) so mulhi gives (light * texel) >> 8
3182 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3183 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3184 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3185 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3186 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same formula with a 4-byte load and store
3192 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3193 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3194 mod = _mm_packs_epi32(mod, mod);
3195 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3196 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3198 if(pixel == buffer_FragColorbgra8)
3199 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3205 void DPSOFTRAST_VertexShader_Lightmap(void)
3207 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3208 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3209 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
// Pixel stage of the lightmap shader for one span.
//   thread   - per-thread rasterizer state (uniforms, permutation, alpha test, blend mode, bound textures)
//   triangle - triangle setup data used for attribute interpolation
//   span     - the horizontal run of pixels to shade
// Per channel: out = ((Color_Diffuse * lightmap + Color_Ambient) * color) >> 8, saturated to 8 bit;
// the GLOW permutation additionally adds Color_Glow * glowtexel before saturation.
// The fourth channel uses the Alpha uniform in place of ambient and has its diffuse and glow terms forced to zero.
3212 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3215 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default write straight into color buffer 0 at the span's position
3216 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3217 int x, startx = span->startx, endx = span->endx;
3218 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3219 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3220 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3221 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3222 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3223 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3224 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses texcoord array 0, lightmap texture uses texcoord array 4
3225 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3226 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// with alpha test or any non-opaque blend mode, shade into the temporary buffer and
// let DPSOFTRAST_Draw_Span_FinishBGRA8 (at the end) write it out
3227 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3228 pixel = buffer_FragColorbgra8;
// ambient: rgb * 256 with lanes 0/2 swapped, lane 3 replaced by Alpha * 255, narrowed to 16 bit
3229 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3230 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3231 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3232 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: rgb * 256 with lanes 0/2 swapped, lane 3 cleared, narrowed to 16 bit
3233 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3234 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3235 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3236 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3238 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow: rgb * 256 with lanes 0/2 swapped, lane 3 cleared, narrowed to 16 bit
3239 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3240 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3241 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// ambient in the low 64 bits, glow in the high 64 bits: used by the single-pixel path below
3242 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3243 for (x = startx;x < endx;x++)
3245 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes equal 1
3246 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3249 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3250 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3251 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pix covers pixels x and x+1, pix2 covers pixels x+2 and x+3
3252 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3253 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3254 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3255 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3256 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3257 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3258 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color term is computed in the low 64 bits and the glow term in the
// high 64 bits (where the lightmap words are zero) with one multiply, then the halves are added
3264 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3265 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3266 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3267 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3268 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3269 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// loop without the glow term
3274 for (x = startx;x < endx;x++)
3276 __m128i color, lightmap, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes equal 1
3277 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3280 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3281 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3282 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3283 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3284 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3285 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3286 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same formula with a 4-byte load and store
3292 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3293 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3294 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3295 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3298 if(pixel == buffer_FragColorbgra8)
3299 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3305 void DPSOFTRAST_VertexShader_FakeLight(void)
3307 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3310 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3313 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3314 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3315 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3316 memset(buffer_FragColorbgra8, 0, span->length*4);
3317 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the LightDirectionMap_ModelSpace mode: forwards to the
// Lightmap vertex shader without any additional work.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3327 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3329 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the LightDirectionMap_TangentSpace mode: forwards to the
// Lightmap vertex shader without any additional work.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3340 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3342 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3348 void DPSOFTRAST_VertexShader_LightDirection(void)
3351 int numvertices = dpsoftrast.numvertices;
3353 float LightVector[4];
3354 float EyePosition[4];
3355 float EyeVectorModelSpace[4];
3361 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3362 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3363 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3364 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3365 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3366 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3367 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3368 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3369 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3370 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3371 for (i = 0;i < numvertices;i++)
3373 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3374 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3375 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3376 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3377 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3378 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3379 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3380 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3381 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3382 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3383 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3384 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3385 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3386 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3387 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3388 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3389 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3390 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3391 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3392 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3393 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3394 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3395 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3396 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3397 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3398 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3399 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3400 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3401 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Scalar and 3-component vector helper macros.
// Min/Max yield the smaller/larger argument; the chosen argument is evaluated
// twice, so arguments must not have side effects.
3405 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3406 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of elements [0..2] of a and b
3407 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length of elements [0..2] of v
3408 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// length of elements [0..2] of v, computed with sqrt()
3409 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro (continued with backslashes); it starts by storing the length of v in a float named len.
3410 #define DPSOFTRAST_Vector3Normalize(v)\
3413 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the "LightDirection" mode: shades one span into a BGRA8 scratch buffer and
// passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; this function reads thread->uniform4f[] and thread->shader_permutation
//   triangle - triangle being drawn; forwarded to the attribute / texture span helpers
//   span     - span being drawn; span->startx and span->endx bound the per-pixel loops
// thread->shader_permutation selects one of three per-pixel loops:
//   SHADERPERMUTATION_SPECULAR - ambient + diffuse + specular (normal map and gloss map)
//   SHADERPERMUTATION_DIFFUSE  - ambient + diffuse (normal map)
//   neither                    - ambient only
// In every loop SHADERPERMUTATION_GLOW adds the glow texture scaled by Color_Glow.
3424 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers; the ...bgra8 buffers are addressed as [x*4 + channel]
3426 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3427 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3428 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3429 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3430 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3432 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3434 int x, startx = span->startx, endx = span->endx;
3435 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// start value and per-x slope of the interpolated TEXCOORD1 (light vector) and TEXCOORD2 (eye vector) attributes
3436 float LightVectordata[4];
3437 float LightVectorslope[4];
3438 float EyeVectordata[4];
3439 float EyeVectorslope[4];
// per-pixel working values
3441 float diffusetex[4];
3443 float surfacenormal[4];
3444 float lightnormal[4];
3446 float specularnormal[4];
3449 float SpecularPower;
// load the color uniforms with the first and third components swapped (uniform +0 -> [2], +2 -> [0]),
// so index 0..2 lines up with the channel order of the ...bgra8 buffers (presumably B,G,R)
3451 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3452 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3453 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3454 Color_Glow[3] = 0.0f;
3455 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3456 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3457 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the separate Alpha uniform; it scales the output alpha below
3458 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3459 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3460 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3461 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3462 Color_Pants[3] = 0.0f;
3463 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3464 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3465 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3466 Color_Shirt[3] = 0.0f;
// start the span (buffer_z is handed to every texture fetch below) and fetch the base color texture
3467 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3468 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// pants / shirt layers are fetched only when colormapping is enabled
3469 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3471 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3472 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow texture is fetched only when glow is enabled
3474 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3476 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3478 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3480 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3481 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3482 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3483 Color_Diffuse[3] = 0.0f;
3484 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3485 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3486 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3487 LightColor[3] = 0.0f;
3488 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3489 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3490 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3491 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3492 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3493 Color_Specular[3] = 0.0f;
// the exponent uniform is pre-scaled by 1/255 because it is multiplied by the 0..255 gloss alpha byte per pixel
3494 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3495 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3496 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3497 for (x = startx;x < endx;x++)
// base color as floats in byte units (0..255)
3500 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3501 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3502 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3503 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add the tinted pants / shirt layers; their fourth tint component is 0, so alpha is unchanged
// NOTE(review): only this specular loop folds in pants / shirt; the diffuse-only and ambient-only loops below
// read just the color texture - confirm that is intended
3504 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3506 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3507 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3508 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3509 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3511 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3512 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3513 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3514 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1 per component, reading bytes 2,1,0 into components 0,1,2, then normalize
3515 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3516 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3517 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3518 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector at x (start + slope*x, scaled by z), normalized
// (z is the per-pixel scale factor - presumably taken from buffer_z[x]; TODO confirm)
3520 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3521 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3522 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3523 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye vector at x, normalized
3525 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3526 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3527 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3528 DPSOFTRAST_Vector3Normalize(eyenormal);
// half vector: normalized sum of the light and eye directions
3530 specularnormal[0] = lightnormal[0] + eyenormal[0];
3531 specularnormal[1] = lightnormal[1] + eyenormal[1];
3532 specularnormal[2] = lightnormal[2] + eyenormal[2];
3533 DPSOFTRAST_Vector3Normalize(specularnormal);
// diffuse = max(N.L, 0); specular = max(N.H, 0) raised to SpecularPower * gloss alpha
3535 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3536 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3537 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: [glow +] ambient + (diffuse + specular) * light color; truncate to int and clamp the upper end to 255
// (no lower clamp is applied here); alpha is the texture alpha scaled by Color_Ambient[3]
3538 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3540 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3541 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3542 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3543 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3547 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3548 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3549 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3550 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the pixel into the output scratch buffer
3552 buffer_FragColorbgra8[x*4+0] = d[0];
3553 buffer_FragColorbgra8[x*4+1] = d[1];
3554 buffer_FragColorbgra8[x*4+2] = d[2];
3555 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (no eye vector, no gloss map) ----
3558 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3560 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3561 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3562 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3563 Color_Diffuse[3] = 0.0f;
3564 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3565 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3566 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3567 LightColor[3] = 0.0f;
3568 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3569 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3570 for (x = startx;x < endx;x++)
3573 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3574 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3575 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3576 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode and normalize the normal map sample (same decoding as in the specular path)
3577 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3578 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3579 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3580 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated, normalized light vector at x
3582 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3583 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3584 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3585 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse = max(N.L, 0)
3587 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// combine: [glow +] texture * (ambient + diffuse color * N.L * light color); upper clamp at 255 only
3588 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3590 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3591 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3592 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3593 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3597 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3598 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3599 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3600 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3602 buffer_FragColorbgra8[x*4+0] = d[0];
3603 buffer_FragColorbgra8[x*4+1] = d[1];
3604 buffer_FragColorbgra8[x*4+2] = d[2];
3605 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (texture * Color_Ambient, plus optional glow) ----
3610 for (x = startx;x < endx;x++)
3613 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3614 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3615 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3616 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3618 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3620 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3621 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3622 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3623 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3627 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3628 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3629 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3630 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3632 buffer_FragColorbgra8[x*4+0] = d[0];
3633 buffer_FragColorbgra8[x*4+1] = d[1];
3634 buffer_FragColorbgra8[x*4+2] = d[2];
3635 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
3638 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the "LightSource" mode. Operates on the global dpsoftrast state (no parameters, no return value).
// Outputs written to dpsoftrast.post_array4f:
//   POSITION  - input position through the ModelViewProjection matrix (screen coordinates go to dpsoftrast.screencoord4f)
//   TEXCOORD0 - input TEXCOORD0 through the TexMatrix uniform
//   TEXCOORD1 - (LightPosition - position) expressed in the per-vertex svector / tvector / normal basis, w = 0
//   TEXCOORD2 - (EyePosition - position) expressed in the same basis, w = 0
//   TEXCOORD3 - input position through the ModelToLight matrix
//   TEXCOORD4 - copy of input TEXCOORD4
3643 void DPSOFTRAST_VertexShader_LightSource(void)
3646 int numvertices = dpsoftrast.numvertices;
3647 float LightPosition[4];
3648 float LightVector[4];
3649 float LightVectorModelSpace[4];
3650 float EyePosition[4];
3651 float EyeVectorModelSpace[4];
// read the light and eye positions from the uniforms; only components 0..2 are used in the loop below
3657 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3658 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3659 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3660 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3661 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3662 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3663 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3664 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// whole-array transforms: projected position, texture coordinates, light-space position, and a straight copy of TEXCOORD4
3665 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3666 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3667 DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3668 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
// per-vertex light / eye vectors
3669 for (i = 0;i < numvertices;i++)
// gather the vertex position and its basis: svector from input TEXCOORD1, tvector from TEXCOORD2, normal from TEXCOORD3
// (the normal is read from in_array4f, so it is not affected by the post_array4f TEXCOORD3 write above)
3671 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3672 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3673 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3674 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3675 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3676 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3677 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3678 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3679 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3680 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3681 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3682 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light
3683 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3684 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3685 LightVectorModelSpace[2] = LightPosition[2] - position[2];
// project it onto svector, tvector and normal (three dot products); the result is not normalized here
3686 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3687 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3688 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3689 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3690 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3691 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3692 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, projected onto the same basis
3693 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3694 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3695 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3696 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3697 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3698 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3699 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3700 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3701 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3702 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Pixel shader for the "LightSource" mode: shades one span into a BGRA8 scratch buffer and
// passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; this function reads thread->uniform4f[] and thread->shader_permutation
//   triangle - triangle being drawn; forwarded to the attribute / texture span helpers
//   span     - span being drawn; span->startx and span->endx bound the per-pixel loops
// Each pixel is scaled by attenuation = 1 - |CubeVector|^2 (CubeVector is the interpolated TEXCOORD3 attribute),
// optionally multiplied by a shadowmap sample (SHADERPERMUTATION_SHADOWMAP2D) and by a cube texture
// (SHADERPERMUTATION_CUBEFILTER). SHADERPERMUTATION_SPECULAR / _DIFFUSE select the lighting terms, as in the
// three loops below. Output alpha is the (colormapped) texture alpha.
// NOTE(review): Color_Glow and Color_Ambient[3] are loaded but do not appear in the shading arithmetic of this function.
3706 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers; the ...bgra8 buffers are addressed as [x*4 + channel]
3709 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3710 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3711 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3712 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3713 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3714 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3715 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3716 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3717 int x, startx = span->startx, endx = span->endx;
3718 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// start value and per-x slope of the interpolated TEXCOORD3 (cube), TEXCOORD1 (light) and TEXCOORD2 (eye) attributes
3719 float CubeVectordata[4];
3720 float CubeVectorslope[4];
3721 float LightVectordata[4];
3722 float LightVectorslope[4];
3723 float EyeVectordata[4];
3724 float EyeVectorslope[4];
// per-pixel working values
3726 float diffusetex[4];
3728 float surfacenormal[4];
3729 float lightnormal[4];
3731 float specularnormal[4];
3734 float SpecularPower;
3735 float CubeVector[4];
// load the color uniforms with the first and third components swapped (uniform +0 -> [2], +2 -> [0]),
// so index 0..2 lines up with the channel order of the ...bgra8 buffers (presumably B,G,R)
3738 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3739 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3740 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3741 Color_Glow[3] = 0.0f;
3742 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3743 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3744 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3745 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3746 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3747 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3748 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3749 Color_Diffuse[3] = 0.0f;
3750 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3751 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3752 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3753 Color_Specular[3] = 0.0f;
3754 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3755 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3756 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3757 Color_Pants[3] = 0.0f;
3758 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3759 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3760 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3761 Color_Shirt[3] = 0.0f;
3762 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3763 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3764 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3765 LightColor[3] = 0.0f;
// the exponent uniform is pre-scaled by 1/255 because it is multiplied by the 0..255 gloss alpha byte per pixel
3766 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up interpolation of the light, eye and cube vectors across the span
3767 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3768 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3769 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3770 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3771 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// fetch the base color, then the optional pants / shirt and cube filter textures
3772 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3773 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3775 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3776 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3778 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3779 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3780 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3782 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3783 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3784 for (x = startx;x < endx;x++)
// interpolated cube vector at x (z is the per-pixel scale factor - presumably buffer_z[x]; TODO confirm)
3787 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3788 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3789 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// attenuation falls from 1 at |CubeVector| = 0 to 0 at |CubeVector| = 1
3790 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// (nearly) unlit pixel - presumably skipped so the zero written by the memset above stays; confirm
3791 if (attenuation < 0.01f)
// optional shadowing: scale by the shadowmap sample, then repeat the unlit check
3793 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3795 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3796 if (attenuation < 0.01f)
// base color as floats in byte units (0..255), plus tinted pants / shirt when colormapping is on
3800 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3801 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3802 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3803 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3804 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3806 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3807 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3808 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3809 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3811 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3812 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3813 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3814 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1 per component, reading bytes 2,1,0 into components 0,1,2, then normalize
3815 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3816 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3817 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3818 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated, normalized light vector at x
3820 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3821 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3822 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3823 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated, normalized eye vector at x
3825 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3826 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3827 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3828 DPSOFTRAST_Vector3Normalize(eyenormal);
// half vector: normalized sum of the light and eye directions
3830 specularnormal[0] = lightnormal[0] + eyenormal[0];
3831 specularnormal[1] = lightnormal[1] + eyenormal[1];
3832 specularnormal[2] = lightnormal[2] + eyenormal[2];
3833 DPSOFTRAST_Vector3Normalize(specularnormal);
// diffuse = max(N.L, 0); specular = max(N.H, 0) raised to SpecularPower * gloss alpha
3835 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3836 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3837 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: (texture * (ambient + diffuse) + gloss * specular) * light color [* cube texel] * attenuation;
// truncate to int and clamp the upper end to 255; alpha is the texture alpha
3838 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3840 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3841 attenuation *= (1.0f / 255.0f);
3842 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3843 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3844 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3845 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube filter factor
3849 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3850 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3851 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3852 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// store the pixel into the output scratch buffer
3854 buffer_FragColorbgra8[x*4+0] = d[0];
3855 buffer_FragColorbgra8[x*4+1] = d[1];
3856 buffer_FragColorbgra8[x*4+2] = d[2];
3857 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (no eye vector, no gloss map) ----
3860 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3862 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3863 for (x = startx;x < endx;x++)
// attenuation and optional shadowing, exactly as in path 1
3866 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3867 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3868 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3869 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3870 if (attenuation < 0.01f)
3872 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3874 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3875 if (attenuation < 0.01f)
// base color plus optional tinted pants / shirt
3879 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3880 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3881 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3882 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3883 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3885 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3886 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3887 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3888 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// decoded, normalized surface normal and interpolated, normalized light vector
3890 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3891 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3892 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3893 DPSOFTRAST_Vector3Normalize(surfacenormal);
3895 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3896 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3897 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3898 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse = max(N.L, 0)
3900 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// combine: texture * (ambient + diffuse) * light color [* cube texel] * attenuation; upper clamp at 255 only
3901 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3903 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3904 attenuation *= (1.0f / 255.0f);
3905 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3906 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3907 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3908 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube filter factor
3912 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3913 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3914 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3915 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3917 buffer_FragColorbgra8[x*4+0] = d[0];
3918 buffer_FragColorbgra8[x*4+1] = d[1];
3919 buffer_FragColorbgra8[x*4+2] = d[2];
3920 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (no normal map) ----
3925 for (x = startx;x < endx;x++)
// attenuation and optional shadowing, exactly as in path 1
3928 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3929 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3930 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3931 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3932 if (attenuation < 0.01f)
3934 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3936 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3937 if (attenuation < 0.01f)
// base color plus optional tinted pants / shirt
3941 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3947 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3948 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3949 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3950 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// combine: texture * ambient * light color [* cube texel] * attenuation; upper clamp at 255 only
3952 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3954 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3955 attenuation *= (1.0f / 255.0f);
3956 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3957 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3958 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3959 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube filter factor
3963 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3964 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3965 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3966 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3968 buffer_FragColorbgra8[x*4+0] = d[0];
3969 buffer_FragColorbgra8[x*4+1] = d[1];
3970 buffer_FragColorbgra8[x*4+2] = d[2];
3971 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
3974 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_Refraction in DPSOFTRAST_ShaderModeTable.
// Its only visible work is a single DPSOFTRAST_Array_TransformProject call over the
// position array: post_array4f[POSITION] and screencoord4f are passed as the first two
// arguments, followed by in_array4f[POSITION], the vertex count, and a pointer into
// uniform4f at the ModelViewProjectionMatrixM1 slot.  No other vertex array is touched.
3980 void DPSOFTRAST_VertexShader_Refraction(void)
3982 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the refraction shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible body does no texturing or lighting: it starts the span (filling buffer_z),
// zeroes span->length BGRA8 pixels in a local buffer, and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.  NOTE(review): looks like a placeholder - confirm.
3985 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3988 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3989 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3990 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so only the part of the buffer this span covers is cleared
3991 memset(buffer_FragColorbgra8, 0, span->length*4);
3992 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_Water in DPSOFTRAST_ShaderModeTable.
// Its only visible work is a single DPSOFTRAST_Array_TransformProject call over the
// position array: post_array4f[POSITION] and screencoord4f are passed as the first two
// arguments, followed by in_array4f[POSITION], the vertex count, and a pointer into
// uniform4f at the ModelViewProjectionMatrixM1 slot.  No other vertex array is touched.
3997 void DPSOFTRAST_VertexShader_Water(void)
3999 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the water shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible body does no texturing or lighting: it starts the span (filling buffer_z),
// zeroes span->length BGRA8 pixels in a local buffer, and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.  NOTE(review): looks like a placeholder - confirm.
4003 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4006 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4007 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4008 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so only the part of the buffer this span covers is cleared
4009 memset(buffer_FragColorbgra8, 0, span->length*4);
4010 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_ShowDepth in DPSOFTRAST_ShaderModeTable.
// Its only visible work is a single DPSOFTRAST_Array_TransformProject call over the
// position array: post_array4f[POSITION] and screencoord4f are passed as the first two
// arguments, followed by in_array4f[POSITION], the vertex count, and a pointer into
// uniform4f at the ModelViewProjectionMatrixM1 slot.  No other vertex array is touched.
4015 void DPSOFTRAST_VertexShader_ShowDepth(void)
4017 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the show-depth shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible body does not read the depth values it is named after: it starts the span
// (filling buffer_z), zeroes span->length BGRA8 pixels in a local buffer, and hands that
// buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.  NOTE(review): looks like a placeholder - confirm.
4020 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4023 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4024 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4025 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so only the part of the buffer this span covers is cleared
4026 memset(buffer_FragColorbgra8, 0, span->length*4);
4027 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_DeferredGeometry in DPSOFTRAST_ShaderModeTable.
// Its only visible work is a single DPSOFTRAST_Array_TransformProject call over the
// position array: post_array4f[POSITION] and screencoord4f are passed as the first two
// arguments, followed by in_array4f[POSITION], the vertex count, and a pointer into
// uniform4f at the ModelViewProjectionMatrixM1 slot.  No other vertex array is touched.
4032 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4034 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the deferred-geometry shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible body does no texturing or lighting: it starts the span (filling buffer_z),
// zeroes span->length BGRA8 pixels in a local buffer, and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.  NOTE(review): looks like a placeholder - confirm.
4037 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4040 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4041 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4042 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so only the part of the buffer this span covers is cleared
4043 memset(buffer_FragColorbgra8, 0, span->length*4);
4044 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_DeferredLightSource in DPSOFTRAST_ShaderModeTable.
// Its only visible work is a single DPSOFTRAST_Array_TransformProject call over the
// position array: post_array4f[POSITION] and screencoord4f are passed as the first two
// arguments, followed by in_array4f[POSITION], the vertex count, and a pointer into
// uniform4f at the ModelViewProjectionMatrixM1 slot.  No other vertex array is touched.
4049 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4051 DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the deferred-light-source shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible body does no texturing or lighting: it starts the span (filling buffer_z),
// zeroes span->length BGRA8 pixels in a local buffer, and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.  NOTE(review): looks like a placeholder - confirm.
4054 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4057 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4058 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4059 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so only the part of the buffer this span covers is cleared
4060 memset(buffer_FragColorbgra8, 0, span->length*4);
4061 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: the vertex-stage and span-stage entry points plus the
// vertex arrays and texture units that mode uses.  One instance per mode lives in
// DPSOFTRAST_ShaderModeTable, indexed by shader_mode.
4066 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; DPSOFTRAST_DrawTriangles calls it once per draw, after the vertices are loaded
4069 void (*Vertex)(void);
// span (pixel) stage; DPSOFTRAST_Draw_ProcessSpans calls it for each span that still has pixels to shade
4070 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices this mode reads; DPSOFTRAST_DrawTriangles walks the list to build
// its array mask and treats values >= DPSOFTRAST_ARRAY_TOTAL specially (the table uses ~0 as the last entry)
4071 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices this mode samples; DPSOFTRAST_Draw_ProcessTriangles walks the list when
// choosing per-unit mip levels and tests for values >= DPSOFTRAST_MAXTEXTUREUNITS (the table uses ~0 as the last entry)
4072 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4074 DPSOFTRAST_ShaderModeInfo;
4076 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4078 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4079 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4080 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4081 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4082 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4083 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4084 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4085 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4086 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4087 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4088 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4089 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4090 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4091 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4092 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4093 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Replays queued state-change commands for one rasterizer thread.
//   thread        - thread state passed to every DPSOFTRAST_Interpret_* handler
//   commandoffset - byte offset into dpsoftrast.commandpool.commands to start from
//   endoffset     - byte offset at which to stop (exclusive)
// Returns the offset reached, which callers store as their new command position.
// The pool is treated as a ring: an offset that reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL wraps to 0.
4097 int DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int commandoffset, int endoffset)
4099 while (commandoffset != endoffset)
4101 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4102 switch (command->opcode)
// INTERPCOMMAND(name) expands to the switch case for one opcode: it calls the matching
// DPSOFTRAST_Interpret_<name> handler, then advances commandoffset by the command struct's
// size rounded up to the next multiple of COMMAND_SIZE (the mask arithmetic assumes
// COMMAND_SIZE is a power of two), wrapping to 0 at the end of the pool.
4104 #define INTERPCOMMAND(name) \
4105 case DPSOFTRAST_OPCODE_##name : \
4106 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4107 commandoffset += sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)); \
4108 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4109 commandoffset = 0; \
4111 INTERPCOMMAND(Viewport)
4112 INTERPCOMMAND(ClearColor)
4113 INTERPCOMMAND(ClearDepth)
4114 INTERPCOMMAND(ColorMask)
4115 INTERPCOMMAND(DepthTest)
4116 INTERPCOMMAND(ScissorTest)
4117 INTERPCOMMAND(Scissor)
4118 INTERPCOMMAND(BlendFunc)
4119 INTERPCOMMAND(BlendSubtract)
4120 INTERPCOMMAND(DepthMask)
4121 INTERPCOMMAND(DepthFunc)
4122 INTERPCOMMAND(DepthRange)
4123 INTERPCOMMAND(PolygonOffset)
4124 INTERPCOMMAND(AlphaTest)
4125 INTERPCOMMAND(AlphaFunc)
4126 INTERPCOMMAND(SetTexture)
4127 INTERPCOMMAND(SetShader)
4128 INTERPCOMMAND(Uniform4f)
4129 INTERPCOMMAND(UniformMatrix4f)
4130 INTERPCOMMAND(Uniform1i)
// Reset (opcode 0) is handled by hand rather than through INTERPCOMMAND.
// NOTE(review): presumably it marks the wrap point of the ring - confirm against the case body.
4132 case DPSOFTRAST_OPCODE_Reset:
4137 return commandoffset;
// Shades and depth-tests every span queued in thread->spans, then empties the queue.
//   thread        - per-thread rasterizer state (owns the span queue and the render state)
//   commandoffset - command-pool offset this thread has interpreted up to
// Returns the updated command offset.
// Before a span is drawn, any commands recorded ahead of its triangle are replayed so the
// thread's render state matches what was current when the triangle was submitted.
4140 int DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread, int commandoffset)
4147 // unsigned int *colorpixel;
4148 unsigned int *depthpixel;
4154 DPSOFTRAST_State_Triangle *triangle;
4155 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the current span; span->pixelmask points here while the shader runs
4156 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4157 for (i = 0; i < thread->numspans; i++)
4159 span = &thread->spans[i];
4160 triangle = &dpsoftrast.trianglepool.triangles[span->triangle];
// catch up on state changes queued before this triangle
4161 if (commandoffset != triangle->commandoffset)
4163 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4164 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4166 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds the plane equation of w: [0] = d/dx, [1] = d/dy, [2] = value at the origin.
// Evaluate it at the span start and convert to fixed-point depth, applying polygon offset.
4168 wslope = triangle->w[0];
4169 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4170 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4171 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4172 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// build the pixel mask by stepping depth across the span and comparing with the stored depth
4173 switch(thread->fb_depthfunc)
4176 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4177 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4178 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4179 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4180 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4181 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4182 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4184 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4185 //for (x = 0;x < span->length;x++)
4186 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4187 // if there is no color buffer, skip pixel shader
// trim masked-out pixels from both ends so the shader only sees [startx, endx)
4189 endx = span->length;
4190 while (startx < endx && !pixelmask[startx])
4192 while (endx > startx && !pixelmask[endx-1])
4195 continue; // no pixels to fill
4196 span->pixelmask = pixelmask;
4197 span->startx = startx;
4199 // run pixel shader if appropriate
4200 // do this before running depthmask code, to allow the pixelshader
4201 // to clear pixelmask values for alpha testing
4202 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4203 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth write pass over the trimmed range, re-deriving d for each pixel
4204 if (thread->depthmask)
4205 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4211 // no depth testing means we're just dealing with color...
4212 // if there is no color buffer, skip pixel shader
4213 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// with no depth test every pixel of the span is live
4215 memset(pixelmask, 1, span->length);
4216 span->pixelmask = pixelmask;
4218 span->endx = span->length;
4219 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// queue is fully consumed
4223 thread->numspans = 0;
4224 return commandoffset;
// Scan-converts queued triangles into spans for one rasterizer thread.
//   thread       - thread state; supplies the starting triangle/command offsets and receives the spans
//   freetriangle - triangle-pool index to stop at (exclusive); the pool is walked as a ring
// Each thread owns a horizontal band of the framebuffer, [miny, maxy), derived from its index.
// Spans are appended to thread->spans and flushed through DPSOFTRAST_Draw_ProcessSpans whenever
// the queue fills, and once more at the end.  On return thread->commandoffset and
// thread->triangleoffset reflect the work consumed.
// NOTE(review): `triangle` is dereferenced after the loop, so the caller must ensure at least one
// triangle is pending; both visible callers test triangleoffset != drawtriangle first.
4227 void DPSOFTRAST_Draw_GenerateSpans(DPSOFTRAST_State_Thread *thread, int freetriangle)
4230 int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4231 int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4232 int commandoffset = thread->commandoffset;
4233 int triangleoffset = thread->triangleoffset;
4234 DPSOFTRAST_State_Triangle *triangle = NULL;
4241 while (triangleoffset != freetriangle)
4243 triangle = &dpsoftrast.trianglepool.triangles[triangleoffset];
// advance the ring index now; the wrap itself happens when it reaches the pool size
4244 if (++triangleoffset >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL)
4246 starty = triangle->starty + 1;
4247 endy = triangle->endy;
// triangle lies entirely outside this thread's band
4248 if (starty >= maxy || endy <= miny)
4250 numpoints = triangle->numpoints;
// coords[n] = screen-space point n (x, y, ...); ycoords = the points' integer y values
4251 coords[0] = _mm_load_ps(triangle->coords[0]);
4252 coords[1] = _mm_load_ps(triangle->coords[1]);
4253 coords[2] = _mm_load_ps(triangle->coords[2]);
4254 coords[3] = _mm_load_ps(triangle->coords[3]);
4255 ycoords = _mm_load_si128((const __m128i *)triangle->ycoords);
4260 for (y = starty; y < endy;)
4262 __m128 xcoords, xslope;
// One 32-bit lane per point: all ones where y is already past (greater than) that point.
// movemask_epi8 turns this into a 16-bit mask with 4 bits per point, used below to pick
// which polygon edges cross the current scanline.
4263 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), ycoords);
4264 int yccmask = _mm_movemask_epi8(ycc);
// each edge runs from point edgeNp to point edgeNn; edge0 and edge1 bound the span on either side
4265 int edge0p, edge0n, edge1p, edge1n;
// edge selection for 4-point (near-clipped) polygons; the trailing comment shows one bit per point
4272 case 0xFFFF: /*0000*/ y = endy; continue;
4273 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4274 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4275 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4276 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4277 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4278 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4279 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4280 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4281 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4282 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4283 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4284 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4285 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4286 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4287 case 0x0000: /*1111*/ y++; continue;
// edge selection for plain triangles; the stored 4th point duplicates point 2, so lanes 2 and 3 always agree
4295 case 0xFFFF: /*000*/ y = endy; continue;
4296 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4297 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4298 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4299 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4300 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4301 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4302 case 0x0000: /*111*/ y++; continue;
// Find the next point y at which the active edges can change: lanes already passed become
// 0x7FFF (all ones shifted right by one) so they lose the following min-reduction, which
// leaves the smallest remaining point y in lane 0.  The same edge pair is valid up to that row.
4305 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), ycoords);
4306 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4307 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4308 nexty = _mm_extract_epi16(ycc, 0);
4309 if(nexty >= endy) nexty = endy-1;
// true when edge0's largest x exceeds edge1's smallest x, i.e. the edges are not in left/right order
4310 if (_mm_ucomigt_ss(_mm_max_ss(coords[edge0n], coords[edge0p]), _mm_min_ss(coords[edge1n], coords[edge1p])))
// Both edges are processed at once: lanes 0-1 belong to edge0, lanes 2-3 to edge1.
// xslope = (end - start) / dy, so lanes 0 and 2 hold dx/dy; xcoords is each edge's x at
// scanline y, biased by 0.5 ahead of the float-to-int conversion below.
4319 xslope = _mm_sub_ps(_mm_movelh_ps(coords[edge0n], coords[edge1n]), _mm_movelh_ps(coords[edge0p], coords[edge1p]));
4320 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4321 xcoords = _mm_add_ps(_mm_movelh_ps(coords[edge0p], coords[edge1p]),
4322 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(coords[edge0p], coords[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4323 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// walk scanlines until the edge pair changes, stepping both x values by their slopes
4324 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4326 int startx, endx, offset;
// lane 0 = edge0 x, lane 2 = edge1 x (brought down to lane 0 by movehl)
4327 startx = _mm_cvtss_si32(xcoords);
4328 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clip the scanline to the framebuffer width and skip it if nothing remains
4329 if (startx < 0) startx = 0;
4330 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4331 if (startx >= endx) continue;
// split the scanline into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4332 for (offset = startx; offset < endx;)
4334 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
// spans refer to their triangle by pool index
4335 span->triangle = (int)(triangle - dpsoftrast.trianglepool.triangles);
4338 span->length = endx - offset;
4339 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4340 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4341 offset += span->length;
// flush as soon as the span queue is full
4342 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4343 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
// drain whatever is left in the queue
4349 if (thread->numspans > 0)
4350 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
// replay commands up to the last triangle visited, even if it produced no spans, so the
// thread's command position keeps pace with its triangle position
4351 if (commandoffset != triangle->commandoffset)
4353 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4354 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// publish progress back to the thread state
4359 thread->commandoffset = commandoffset;
4360 thread->triangleoffset = triangleoffset;
// Makes every queued triangle visible to the rasterizer threads, waits until each thread
// has consumed them all, then marks the triangle and command pools as empty.
// NOTE(review): the wait loop and the direct DPSOFTRAST_Draw_GenerateSpans call below look like
// the threaded and non-threaded variants of the same step, presumably selected by a
// preprocessor conditional that is not visible here - confirm.
4364 void DPSOFTRAST_Draw_FlushThreads(void)
4366 DPSOFTRAST_State_Thread *thread;
// publish any triangles added since the last hand-off
4368 if(dpsoftrast.drawtriangle != dpsoftrast.trianglepool.freetriangle)
4371 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4374 SDL_LockMutex(dpsoftrast.trianglemutex);
4376 for (i = 0; i < dpsoftrast.numthreads; i++)
4378 thread = &dpsoftrast.threads[i];
// Until this thread has caught up: flag that we are waiting on it, wake the workers via
// trianglecond, and sleep on the thread's own waitcond (SDL_CondWait releases the mutex while waiting).
4380 while (thread->triangleoffset != dpsoftrast.drawtriangle)
4382 thread->waiting = true;
4383 SDL_CondBroadcast(dpsoftrast.trianglecond);
4384 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
4385 thread->waiting = false;
// process this thread's share directly on the calling thread
4388 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4389 DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4393 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// everything has been drawn, so both pools can be reused from scratch
4395 dpsoftrast.trianglepool.usedtriangles = 0;
4396 dpsoftrast.commandpool.usedcommands = 0;
// Worker thread entry point (takes the thread's DPSOFTRAST_State_Thread as `data`).
// Loops while thread->index stays non-negative: rasterizes any pending triangles, then
// sleeps on trianglecond until more work is published.
4400 static int DPSOFTRAST_Draw_Thread(void *data)
4402 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
// NOTE(review): a negative index appears to be the shutdown signal - confirm where it is set.
4403 while(thread->index >= 0)
// fast path: work is already pending, no need to take the mutex
4405 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4407 DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
// re-check under the mutex so work published between the check above and the wait is not missed
4411 SDL_LockMutex(dpsoftrast.trianglemutex);
4412 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4414 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// idle: wake the flusher if it is blocked on this thread, then wait for the next batch
4417 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4418 SDL_CondWait(dpsoftrast.trianglecond, dpsoftrast.trianglemutex);
4419 SDL_UnlockMutex(dpsoftrast.trianglemutex);
// Triangle setup: turns indexed, already-transformed vertices into entries of the triangle
// pool that the rasterizer threads consume.
//   firstvertex - subtracted from every element index to address the loaded vertex arrays
//   numtriangles - number of triangles in the element list
//   element3i / element3s - 32-bit or 16-bit index list, three indices per triangle
//   arraymask - per-array enable flags built by DPSOFTRAST_DrawTriangles
//   numarrays - number of leading vertex arrays to set up interpolants for
// For each triangle this clips against the near plane (yielding 3 or 4 screen points),
// rejects off-screen triangles, computes a screen-space plane equation for w and for every
// w-multiplied attribute, and picks a mip level for each texture unit of the shader mode.
4426 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask, int numarrays)
4429 int cullface = dpsoftrast.cullface;
4430 int width = dpsoftrast.fb_width;
4431 int height = dpsoftrast.fb_height;
// largest valid pixel coordinate, (width-1, height-1) repeated across the 16-bit lanes
4432 __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
4433 DPSOFTRAST_State_Triangle *triangle;
4445 __m128 triangleedge1, triangleedge2, trianglenormal;
4448 DPSOFTRAST_Texture *texture;
4449 screen[3] = _mm_setzero_ps();
4450 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4451 for (i = 0;i < numtriangles;i++)
4453 // generate the 3 edges of this triangle
4454 // generate spans for the triangle - switch based on left split or right split classification of triangle
// element indices come from whichever index list is in use, rebased by firstvertex
4457 e[0] = element3i[i*3+0] - firstvertex;
4458 e[1] = element3i[i*3+1] - firstvertex;
4459 e[2] = element3i[i*3+2] - firstvertex;
4463 e[0] = element3s[i*3+0] - firstvertex;
4464 e[1] = element3s[i*3+1] - firstvertex;
4465 e[2] = element3s[i*3+2] - firstvertex;
// SKIPBACKFACE: builds the two screen-space edges that share screen[1] and the z component
// of their cross product (kept in lane 0 of trianglenormal), then tests its sign.
// NOTE(review): the two sign tests presumably correspond to the cullface setting - confirm.
4474 #define SKIPBACKFACE \
4475 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4476 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4477 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4478 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4479 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4483 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4487 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4491 //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
4492 // _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
4493 //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
4494 //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
4495 //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
4497 // macros for clipping vertices
// CLIPPEDVERTEXLERP(k,p1,p2): screen[k] = projection of the point where edge p1->p2 crosses
//   the near plane; the interpolation fraction is remembered in clipfrac[p1] for the attributes.
// CLIPPEDVERTEXCOPY(k,p1):    screen[k] = already-projected vertex p1.
// GENATTRIBCOPY / GENATTRIBLERP: the same two operations applied to vertex array j.
// GENATTRIBS: rebuilds the three attribute values for the clip case chosen below
//   (the case numbers match the vertex recipes used in the clip branches).
4499 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4500 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4502 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
4503 screen[k] = DPSOFTRAST_Draw_ProjectVertex(_mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1]))); \
4505 #define CLIPPEDVERTEXCOPY(k,p1) \
4506 screen[k] = _mm_load_ps(&dpsoftrast.screencoord4f[e[p1]*4]);
4508 #define GENATTRIBCOPY(j, attrib, p1) \
4509 attrib = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]);
4510 #define GENATTRIBLERP(j, attrib, p1, p2) \
4512 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p2]*4]); \
4513 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4515 #define GENATTRIBS(j, attrib0, attrib1, attrib2) \
4519 case 0: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4520 case 1: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4521 case 2: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4522 case 3: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4523 case 4: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4524 case 5: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4525 case 6: GENATTRIBLERP(j, attrib0, 1, 2); GENATTRIBCOPY(j, attrib1, 2); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4528 // calculate distance from nearplane
// clipdist = z + w of the post-transform position; negative means the vertex is behind the near plane
4529 clipdist[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
4530 clipdist[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
4531 clipdist[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
// eight cases by which vertices are in front; clipping one vertex away yields a 4-point polygon
4532 if (clipdist[0] >= 0.0f)
4534 if (clipdist[1] >= 0.0f)
4536 if (clipdist[2] >= 0.0f)
4538 // triangle is entirely in front of nearplane
4539 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4546 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4554 if (clipdist[2] >= 0.0f)
4556 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4563 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4570 else if (clipdist[1] >= 0.0f)
4572 if (clipdist[2] >= 0.0f)
4574 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4581 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4587 else if (clipdist[2] >= 0.0f)
4589 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4594 else continue; // triangle is entirely behind nearplane
4597 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of each point as saturated 16-bit values (a 3-point
// triangle repeats point 2); the min/max reductions then give the integer bounding box,
// clamped to the framebuffer
4598 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4599 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4600 screenmin = _mm_min_epi16(screeni, screenir),
4601 screenmax = _mm_max_epi16(screeni, screenir);
4602 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4603 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4604 screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
4605 screenmax = _mm_min_epi16(screenmax, fbmax);
4606 // skip offscreen triangles
4607 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit lane 1 is the y component: vertical extent of the triangle, end exclusive
4609 starty = _mm_extract_epi16(screenmin, 1);
4610 endy = _mm_extract_epi16(screenmax, 1)+1;
// arithmetic shift keeps only each point's y, sign-extended to 32 bits
4611 screeny = _mm_srai_epi32(screeni, 16);
// make room when the pool is about to fill up
// NOTE(review): freeing a slice and a full flush look like alternative strategies chosen by a
// conditional that is not visible here - confirm.
4614 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
4616 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
4618 DPSOFTRAST_Draw_FlushThreads();
// fill in the pool entry; commandoffset records which queued state changes precede this triangle
4621 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
4622 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
4623 triangle->starty = starty;
4624 triangle->endy = endy;
4625 triangle->numpoints = numpoints;
4626 _mm_store_ps(triangle->coords[0], screen[0]);
4627 _mm_store_ps(triangle->coords[1], screen[1]);
4628 _mm_store_ps(triangle->coords[2], screen[2]);
4629 _mm_store_ps(triangle->coords[3], numpoints > 3 ? screen[3] : screen[2]);
4630 _mm_store_si128((__m128i *)triangle->ycoords, screeny);
4632 // calculate attribute plans for triangle data...
4633 // okay, this triangle is going to produce spans, we'd better project
4634 // the interpolants now (this is what gives perspective texturing),
4635 // this consists of simply multiplying all arrays by the W coord
4636 // (which is basically 1/Z), which will be undone per-pixel
4637 // (multiplying by Z again) to get the perspective-correct array
4640 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
// edge components divided by the triangle's signed area term; these weights convert a
// difference along the two edges into a gradient in screen x and y
4641 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4642 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4643 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4644 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4645 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each screen point, broadcast to all lanes
4646 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4647 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4648 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation of w itself, anchored at screen[1]: triangle->w = {d/dx, d/dy, value at origin}
4649 attribedge1 = _mm_sub_ss(w0, w1);
4650 attribedge2 = _mm_sub_ss(w2, w1);
4651 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4652 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4653 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4654 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4655 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4656 _mm_store_ss(&triangle->w[0], attribxslope);
4657 _mm_store_ss(&triangle->w[1], attribyslope);
4658 _mm_store_ss(&triangle->w[2], attriborigin);
// same plane fit for each vertex array, on attribute*w so the spans can interpolate linearly
4659 for (j = 0;j < numarrays;j++)
4663 __m128 attrib0, attrib1, attrib2;
4664 GENATTRIBS(j, attrib0, attrib1, attrib2);
4665 attriborigin = _mm_mul_ps(attrib1, w1);
4666 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4667 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4668 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4669 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4670 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// non-temporal stores: attribs[j] = {d/dx, d/dy, value at origin}
4671 _mm_stream_ps(triangle->attribs[j][0], attribxslope);
4672 _mm_stream_ps(triangle->attribs[j][1], attribyslope);
4673 _mm_stream_ps(triangle->attribs[j][2], attriborigin);
4678 // adjust texture LOD by texture density, in the simplest way possible...
4680 __m128 mipedgescale, mipedgetc, mipdensity, attrib0, attrib1, attrib2;
4681 memset(triangle->mip, 0, sizeof(triangle->mip));
// approximate reciprocal screen-space length of each of the two edges
4682 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4683 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4684 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
// texcoord change along each edge per pixel of edge length, taken from the mode's LOD array
4685 k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
4686 GENATTRIBS(k, attrib0, attrib1, attrib2);
4687 mipedgetc = _mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1));
4688 mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
// walk the mode's texture unit list; values >= DPSOFTRAST_MAXTEXTUREUNITS are not unit indices
4689 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4691 int texunit = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].texunits[j];
4692 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4694 texture = dpsoftrast.texbound[texunit];
// only filter modes ordered after LINEAR get a mip level computed (presumably the mipmapped ones)
4695 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by mipmap[0][2] and mipmap[0][3] (presumably the base level's width and height -
// confirm), giving squared texels-per-pixel along each edge; the smaller edge value is kept
4697 mipdensity = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4698 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4699 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4700 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4701 // this will be multiplied in the texturing routine by the texture resolution
4702 y = _mm_cvtss_si32(mipdensity);
// 0.5*log2 of the squared density == log2 of the density; clamp to the smallest mip level
4705 y = (int)(log((float)y)*0.5f/M_LN2);
4706 if (y > texture->mipmaps - 1)
4707 y = texture->mipmaps - 1;
4708 triangle->mip[texunit] = y;
// commit the entry: advance the ring's free index (wrapping) and the usage count
4714 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
4715 dpsoftrast.trianglepool.usedtriangles++;
// once enough triangles are queued, publish them to the workers mid-draw
// NOTE(review): broadcast and direct flush presumably belong to the threaded and
// non-threaded builds respectively - confirm.
4718 if (numqueued >= DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES)
4721 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4724 //SDL_LockMutex(dpsoftrast.trianglemutex);
4725 SDL_CondBroadcast(dpsoftrast.trianglecond);
4726 //SDL_UnlockMutex(dpsoftrast.trianglemutex);
4728 DPSOFTRAST_Draw_FlushThreads();
// after the loop: publish whatever remains queued, same hand-off as above
4736 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4739 //SDL_LockMutex(dpsoftrast.trianglemutex);
4740 SDL_CondBroadcast(dpsoftrast.trianglecond);
4741 //SDL_UnlockMutex(dpsoftrast.trianglemutex);
4743 DPSOFTRAST_Draw_FlushThreads();
// Draws a batch of triangles with the current shader mode.
// Builds a mask of the vertex arrays the shader mode uses, loads the vertices,
// runs the shader mode's Vertex() callback and passes the triangles on to
// DPSOFTRAST_Draw_ProcessTriangles.
//   firstvertex, numvertices - forwarded to DPSOFTRAST_Draw_LoadVertices
//   numtriangles             - forwarded to DPSOFTRAST_Draw_ProcessTriangles
//   element3i, element3s     - forwarded unchanged; presumably 32-bit and 16-bit
//                              index lists (3 per triangle) - TODO confirm
4749 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
// highest array index enabled so far; position is always enabled
4752 int lastarray = DPSOFTRAST_ARRAY_POSITION;
// one flag per DPSOFTRAST_ARRAY_* slot, nonzero when that array is in use
4753 unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4754 memset(arraymask, false, sizeof(arraymask));
4755 arraymask[DPSOFTRAST_ARRAY_POSITION] = true;
// walk the list of arrays requested by the current shader mode
4756 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4758 int arrayindex = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// NOTE(review): the statement guarded by this test is not visible here;
// presumably out-of-range entries are skipped - confirm
4759 if (arrayindex >= DPSOFTRAST_ARRAY_TOTAL)
4763 case DPSOFTRAST_ARRAY_POSITION:
4764 case DPSOFTRAST_ARRAY_COLOR:
// texcoord arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0;
// NOTE(review): the action taken for a NULL pointer is not visible here
4767 if (dpsoftrast.pointer_texcoordf[arrayindex-DPSOFTRAST_ARRAY_TEXCOORD0] == NULL)
4771 arraymask[arrayindex] = true;
4772 if (arrayindex > lastarray)
4773 lastarray = arrayindex;
// the third argument is the color-array flag from the mask
4775 DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, arraymask[DPSOFTRAST_ARRAY_COLOR]);
4776 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4777 // DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.screencoord4f, dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
// lastarray+1 is one past the highest enabled array index
4778 DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask, lastarray+1);
// Calls DPSOFTRAST_Draw_SyncCommands() and then DPSOFTRAST_Draw_FlushThreads().
// NOTE(review): presumably this publishes queued commands and waits for the
// worker threads to finish them - confirm against those two helpers.
4781 void DPSOFTRAST_Flush(void)
4783 DPSOFTRAST_Draw_SyncCommands();
4784 DPSOFTRAST_Draw_FlushThreads();
// Takes no arguments and returns nothing.
// NOTE(review): the body is not visible here; presumably it blocks until all
// queued rendering work has completed - confirm.
4787 void DPSOFTRAST_Finish(void)
// Resets the global dpsoftrast state and prepares it for rendering.
//   width, height - stored as fb_width/fb_height; also used as the initial
//                   viewport and per-thread scissor rectangle
//   numthreads    - requested thread count, clamped to the range 1..64
//   colorpixels   - stored as color target 0
//   depthpixels   - stored as the depth buffer
// Allocates one zeroed DPSOFTRAST_State_Thread per thread, gives each its
// default render state and starts it on DPSOFTRAST_Draw_Thread.
// NOTE(review): the result of MM_CALLOC is used without a NULL check.
4792 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4802 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is set up in lines not visible here; presumably an
// endianness probe - confirm
4803 dpsoftrast.bigendian = u.b[3];
4804 dpsoftrast.fb_width = width;
4805 dpsoftrast.fb_height = height;
4806 dpsoftrast.fb_depthpixels = depthpixels;
// only color target 0 is supplied by the caller; the others start out NULL
// (slot 1 used to be assigned three times, which left 2 and 3 to the memset)
// NOTE(review): assumes fb_colorpixels has at least 4 entries - confirm
4807 dpsoftrast.fb_colorpixels[0] = colorpixels;
4808 dpsoftrast.fb_colorpixels[1] = NULL;
4809 dpsoftrast.fb_colorpixels[2] = NULL;
4810 dpsoftrast.fb_colorpixels[3] = NULL;
// texture bookkeeping starts at index 1 with no storage allocated yet
4811 dpsoftrast.texture_firstfree = 1;
4812 dpsoftrast.texture_end = 1;
4813 dpsoftrast.texture_max = 0;
// default viewport covers the whole framebuffer
4814 dpsoftrast.viewport[0] = 0;
4815 dpsoftrast.viewport[1] = 0;
4816 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4817 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4818 dpsoftrast.color[0] = 1;
4819 dpsoftrast.color[1] = 1;
4820 dpsoftrast.color[2] = 1;
4821 dpsoftrast.color[3] = 1;
4822 dpsoftrast.cullface = GL_BACK;
// NOTE(review): the two numthreads assignments below are presumably the two
// arms of a threading #ifdef whose directives are not visible here - confirm
4824 dpsoftrast.numthreads = bound(1, numthreads, 64);
4825 dpsoftrast.trianglemutex = SDL_CreateMutex();
4826 dpsoftrast.trianglecond = SDL_CreateCond();
4828 dpsoftrast.numthreads = 1;
// zero-filled block of per-thread state
4830 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4831 for (i = 0; i < dpsoftrast.numthreads; i++)
4833 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default per-thread render state
4835 thread->colormask[1] = 1;
4836 thread->colormask[2] = 1;
4837 thread->colormask[3] = 1;
4838 thread->blendfunc[0] = GL_ONE;
4839 thread->blendfunc[1] = GL_ZERO;
4840 thread->depthmask = true;
4841 thread->depthtest = true;
4842 thread->depthfunc = GL_LEQUAL;
4843 thread->scissortest = false;
4844 thread->alphatest = false;
4845 thread->alphafunc = GL_GREATER;
4846 thread->alphavalue = 0.5f;
// default scissor rectangle covers the whole framebuffer
4847 thread->scissor[0] = 0;
4848 thread->scissor[1] = 0;
4849 thread->scissor[2] = dpsoftrast.fb_width;
4850 thread->scissor[3] = dpsoftrast.fb_height;
4851 thread->depthrange[0] = 0;
4852 thread->depthrange[1] = 1;
4853 thread->polygonoffset[0] = 0;
4854 thread->polygonoffset[1] = 0;
// per-thread work counters and offsets start at zero
4856 thread->numspans = 0;
4857 thread->triangleoffset = 0;
4858 thread->commandoffset = 0;
4859 thread->waiting = false;
4861 thread->waitcond = SDL_CreateCond();
// validate is set to -1 and the same value is passed to DPSOFTRAST_Validate;
// presumably -1 selects every validation flag - confirm
4864 thread->validate = -1;
4865 DPSOFTRAST_Validate(thread, -1);
// start the thread on DPSOFTRAST_Draw_Thread with this state block as argument
4867 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4872 void DPSOFTRAST_Shutdown(void)
4876 if(dpsoftrast.numthreads > 0)
4878 DPSOFTRAST_State_Thread *thread;
4879 SDL_LockMutex(dpsoftrast.trianglemutex);
4880 for (i = 0; i < dpsoftrast.numthreads; i++)
4882 thread = &dpsoftrast.threads[i];
4885 SDL_CondBroadcast(dpsoftrast.trianglecond);
4886 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4887 for (i = 0; i < dpsoftrast.numthreads; i++)
4889 thread = &dpsoftrast.threads[i];
4890 SDL_WaitThread(thread->thread, NULL);
4891 SDL_DestroyCond(thread->waitcond);
4893 SDL_DestroyMutex(dpsoftrast.trianglemutex);
4894 SDL_DestroyCond(dpsoftrast.trianglecond);
4897 for (i = 0;i < dpsoftrast.texture_end;i++)
4898 if (dpsoftrast.texture[i].bytes)
4899 MM_FREE(dpsoftrast.texture[i].bytes);
4900 if (dpsoftrast.texture)
4901 free(dpsoftrast.texture);
4902 if (dpsoftrast.threads)
4903 MM_FREE(dpsoftrast.threads);
4904 memset(&dpsoftrast, 0, sizeof(dpsoftrast));