+// Fills pixels [span->startx, span->endx) of out4ub (4 bytes per pixel) with
+// samples of the 2D texture bound to texunitindex.
+//
+// thread       - supplies the bound texture via thread->texbound[texunitindex]
+// triangle     - supplies the mip level (triangle->mip[texunitindex]) and, through
+//                DPSOFTRAST_CALCATTRIB, the attribute base/slope
+// span         - supplies startx/endx (endx is exclusive)
+// out4ub       - output pixels, indexed by absolute x (not relative to startx)
+// texunitindex - texture unit to sample
+// arrayindex   - attribute array passed to DPSOFTRAST_CALCATTRIB
+//                (presumably the texcoord array - confirm against the macro)
+// zf           - per-pixel factor the interpolated coordinates are multiplied by
+//                (presumably the perspective divide term - confirm against the caller)
+//
+// The span is processed in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
+// coordinates are evaluated with zf at the subspan ends and stepped linearly in
+// 16.16 fixed point in between.  Without SSE_POSSIBLE this function does nothing.
+void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
+{
+#ifdef SSE_POSSIBLE
+ int x;
+ int startx = span->startx;
+ int endx = span->endx;
+ int flags;
+ __m128 data, slope, tcscale;
+ __m128i tcsize, tcmask, tcoffset, tcmax;
+ __m128 tc, endtc;
+ __m128i subtc, substep, endsubtc;
+ int filter;
+ int mip;
+ int affine; // LordHavoc: optimized affine texturing case
+ unsigned int * RESTRICT outi = (unsigned int *)out4ub;
+ const unsigned char * RESTRICT pixelbase;
+ DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
+ // if no texture is bound, just fill it with white
+ if (!texture)
+ {
+ memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
+ return;
+ }
+ mip = triangle->mip[texunitindex];
+ // mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
+ pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
+ // if this mipmap of the texture is 1 pixel, just fill it with that color
+ if (texture->mipmap[mip][1] == 4)
+ {
+ unsigned int k = *((const unsigned int *)pixelbase);
+ for (x = startx;x < endx;x++)
+ outi[x] = k;
+ return;
+ }
+ // when zf is identical at both ends of the span the whole span is handled as
+ // one linearly interpolated subspan (see the loop below)
+ affine = zf[startx] == zf[endx-1];
+ filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
+ DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
+ flags = texture->flags;
+ // tcsize = (m[2], m[3], m[2], m[3]) where m = texture->mipmap[mip]
+ // (presumably width and height of this mip level - confirm against the texture setup code)
+ tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
+ tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
+ tcscale = _mm_cvtepi32_ps(tcsize);
+ // duplicate the first two components into both halves and scale them by tcsize
+ data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
+ slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
+ // coordinates at the first pixel of the span
+ endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
+ // linear filtering samples the 2x2 block starting at (coord - 0.5)
+ if (filter)
+ endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
+ // convert to 16.16 fixed point
+ endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
+ // each 32bit lane holds the 16bit pair (4, tcsize.x*4), so _mm_madd_epi16 on
+ // an (x, y) pair of 16bit texel indices yields the byte offset x*4 + y*tcsize.x*4
+ tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
+ // tcmask packed (with signed saturation) to 16bit lanes, used both as wrap mask and as clamp maximum
+ tcmax = _mm_packs_epi32(tcmask, tcmask);
+ for (x = startx;x < endx;)
+ {
+ int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
+ __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
+ // last (or only, in the affine case) subspan: run up to and including the final pixel
+ if (nextsub >= endx || affine)
+ {
+ nextsub = endsub = endx-1;
+ if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
+ }
+ // the previous end point becomes the start of this subspan
+ tc = endtc;
+ subtc = endsubtc;
+ endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
+ if (filter)
+ endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
+ // per-pixel 16.16 step across this subspan
+ substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
+ endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
+ // low 64 bits: coordinates of pixel x, high 64 bits: coordinates of pixel x+1;
+ // the step is doubled because the loops below advance two pixels at a time
+ subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
+ substep = _mm_slli_epi32(substep, 1);
+ if (filter)
+ {
+ // integer texel coordinates at the start of the subspan and one (doubled) step past its end
+ __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
+ // fast path: all of them satisfy 0 <= coord < tcmask, so neither wrapping
+ // nor clamping is applied and horizontally adjacent texels are fetched with one 64bit load
+ if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
+ {
+ // high 16 bits of tcoffset: byte distance between texel rows
+ int stride = _mm_cvtsi128_si32(tcoffset)>>16;
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
+ {
+ const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
+ // gather the integer parts (odd 16bit lanes) of both pixels' coordinates
+ __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
+ tci = _mm_madd_epi16(tci, tcoffset);
+ ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
+ ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
+ // each load fetches two adjacent texels of one row, widened to 16 bits per channel
+ pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
+ pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
+ pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
+ // 15bit fractions so they stay positive for the signed _mm_mulhi_epi16
+ fracm = _mm_srli_epi16(subtc, 1);
+ // blend the two rows by the second coordinate's fraction (first pixel, then second pixel)
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix3 = _mm_add_epi16(pix3,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
+ _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
+ // blend the two columns of both pixels by the first coordinate's fraction
+ pix2 = _mm_unpacklo_epi64(pix1, pix3);
+ pix4 = _mm_unpackhi_epi64(pix1, pix3);
+ pix2 = _mm_add_epi16(pix2,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
+ _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
+ // pack both pixels back to 8 bits per channel and store them together
+ _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
+ }
+ // odd pixel left over at the end of the subspan
+ if (x <= endsub)
+ {
+ const unsigned char * RESTRICT ptr1;
+ __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
+ tci = _mm_madd_epi16(tci, tcoffset);
+ ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
+ pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
+ outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ x++;
+ }
+ }
+ else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+ {
+ // bilinear with clamping: the four corner texels (+0/+1 in each direction)
+ // are clamped to [0, tcmax] individually before being fetched
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
+ {
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
+ tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
+ // the second pixel must be clamped exactly like the first one; masking
+ // with tcmax here would wrap around instead of clamping to the edge
+ tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix3 = _mm_add_epi16(pix3,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
+ _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
+ pix2 = _mm_unpacklo_epi64(pix1, pix3);
+ pix4 = _mm_unpackhi_epi64(pix1, pix3);
+ pix2 = _mm_add_epi16(pix2,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
+ _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
+ _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
+ }
+ // odd pixel left over at the end of the subspan
+ if (x <= endsub)
+ {
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
+ tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
+ outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ x++;
+ }
+ }
+ else
+ {
+ // bilinear with wrapping: the four corner texels are masked with tcmax
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
+ {
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix3 = _mm_add_epi16(pix3,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
+ _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
+ pix2 = _mm_unpacklo_epi64(pix1, pix3);
+ pix4 = _mm_unpackhi_epi64(pix1, pix3);
+ pix2 = _mm_add_epi16(pix2,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
+ _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
+ _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
+ }
+ // odd pixel left over at the end of the subspan
+ if (x <= endsub)
+ {
+ __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
+ tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
+ _mm_setzero_si128());
+ pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
+ _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
+ _mm_setzero_si128());
+ fracm = _mm_srli_epi16(subtc, 1);
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
+ pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
+ pix1 = _mm_add_epi16(pix1,
+ _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
+ _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
+ outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
+ x++;
+ }
+ }
+ }
+ else
+ {
+ // no linear filtering: fetch a single texel per pixel
+ if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
+ {
+ // clamp the integer texel coordinates to [0, tcmax]
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
+ {
+ __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
+ }
+ if (x <= endsub)
+ {
+ __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
+ tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ x++;
+ }
+ }
+ else
+ {
+ // wrap the integer texel coordinates by masking with tcmax
+ for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
+ {
+ __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_and_si128(tci, tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
+ }
+ if (x <= endsub)
+ {
+ __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
+ tci = _mm_and_si128(tci, tcmax);
+ tci = _mm_madd_epi16(tci, tcoffset);
+ outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
+ x++;
+ }
+ }
+ }
+ }
+#endif
+}
+
+// Placeholder for cube map span sampling: no texture lookup is performed yet,
+// the pixels [span->startx, span->endx) of out4ub (4 bytes per pixel) are simply
+// filled with 255 in every byte.  triangle, texunitindex, arrayindex and zf are
+// currently unused.
+void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
+{
+ // TODO: IMPLEMENT
+ // the pixel count is endx - startx; the reversed subtraction gave a negative
+ // value which memset would have interpreted as an enormous size_t
+ int numpixels = span->endx - span->startx;
+ if (numpixels > 0)
+ memset(out4ub + span->startx*4, 255, (size_t)numpixels*4);
+}
+
+// Shadow map lookup stub: the input vector is ignored and every query
+// yields 1.0f until a real implementation exists.
+float DPSOFTRAST_SampleShadowmap(const float *vector)
+{
+ // TODO: IMPLEMENT
+ const float result = 1.0f;
+ (void)vector;
+ return result;
+}
+
+void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)