in imagecore/image/internal/sse.cpp [478:634]
static void adaptiveSeperable12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
// The seperable version writes transposed images.
SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);
__restrict int32_t* kernelTable = kernel->getTableFixedPoint4();
__m128i half = v128_set_int32(kHalf16);
for( unsigned int x = 0; x < outputWidth; x++ ) {
const int32_t* kernelTableSample = kernelTable + x * 48;
int startX = kernel->computeSampleStart(x);
uint8_t* outputSample = outputBuffer + (x * outputPitch);
const uint8_t* sample = inputBuffer + startX * 4;
for( unsigned int y = 0; y < outputHeight; y++ ) {
#if (SEPERABLE12_ASM_OPTIMIZATION)
__asm__ (
"pxor %%xmm0, %%xmm0 \n"
"lddqu %[sample], %%xmm1 \n"
"movdqa %%xmm1, %%xmm2 \n"
"punpcklbw %%xmm0, %%xmm1 \n"
"movdqa %%xmm1, %%xmm3 \n"
"punpcklwd %%xmm0, %%xmm1 \n"
"punpckhwd %%xmm0, %%xmm3 \n"
"pmulld 0%[kernelTable], %%xmm1 \n"
"pmulld 16%[kernelTable], %%xmm3 \n"
"paddd %%xmm3, %%xmm1 \n"
"punpckhbw %%xmm0, %%xmm2 \n"
"movdqa %%xmm2, %%xmm3 \n"
"punpcklwd %%xmm0, %%xmm2 \n"
"punpckhwd %%xmm0, %%xmm3 \n"
"pmulld 32%[kernelTable], %%xmm2 \n"
"pmulld 48%[kernelTable], %%xmm3 \n"
"paddd %%xmm2, %%xmm1 \n"
"paddd %%xmm3, %%xmm1 \n"
"movdqa %%xmm1, %%xmm4 \n"
"lddqu 16%[sample], %%xmm1 \n"
"movdqa %%xmm1, %%xmm2 \n"
"punpcklbw %%xmm0, %%xmm1 \n"
"movdqa %%xmm1, %%xmm3 \n"
"punpcklwd %%xmm0, %%xmm1 \n"
"punpckhwd %%xmm0, %%xmm3 \n"
"pmulld 64%[kernelTable], %%xmm1 \n"
"pmulld 80%[kernelTable], %%xmm3 \n"
"paddd %%xmm3, %%xmm1 \n"
"punpckhbw %%xmm0, %%xmm2 \n"
"movdqa %%xmm2, %%xmm3 \n"
"punpcklwd %%xmm0, %%xmm2 \n"
"punpckhwd %%xmm0, %%xmm3 \n"
"pmulld 96%[kernelTable], %%xmm2 \n"
"pmulld 112%[kernelTable], %%xmm3 \n"
"paddd %%xmm2, %%xmm1 \n"
"paddd %%xmm3, %%xmm1 \n"
"paddd %%xmm1, %%xmm4 \n"
"lddqu 32%[sample], %%xmm1 \n"
"movdqa %%xmm1, %%xmm2 \n"
"punpcklbw %%xmm0, %%xmm1 \n"
"movdqa %%xmm1, %%xmm3 \n"
"punpcklwd %%xmm0, %%xmm1 \n"
"punpckhwd %%xmm0, %%xmm3 \n"
"pmulld 128%[kernelTable], %%xmm1 \n"
"pmulld 144%[kernelTable], %%xmm3 \n"
"paddd %%xmm3, %%xmm1 \n"
"punpckhbw %%xmm0, %%xmm2 \n"
"movdqa %%xmm2, %%xmm3 \n"
"punpcklwd %%xmm0, %%xmm2 \n"
"punpckhwd %%xmm0, %%xmm3 \n"
"pmulld 160%[kernelTable], %%xmm2 \n"
"pmulld 176%[kernelTable], %%xmm3 \n"
"paddd %%xmm2, %%xmm1 \n"
"paddd %%xmm3, %%xmm1 \n"
"paddd %%xmm1, %%xmm4 \n"
"paddd %[half], %%xmm4 \n"
"psrad $16, %%xmm4 \n"
"packusdw %%xmm0, %%xmm4 \n"
"packuswb %%xmm0, %%xmm4 \n"
"movd %%xmm4, %[outputSample] \n"
: [outputSample] "=m" (*outputSample)
: [sample] "m" (*sample), [kernelTable] "m" (*kernelTableSample), [half] "x" (half)
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4"
);
#else
__m128i zero = v128_setzero();
vSInt32 result = zero;
vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTableSample + 0);
vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTableSample + 4);
vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTableSample + 8);
vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTableSample + 12);
vSInt32 coeffs_x_4 = *(vSInt32*)(kernelTableSample + 16);
vSInt32 coeffs_x_5 = *(vSInt32*)(kernelTableSample + 20);
vSInt32 coeffs_x_6 = *(vSInt32*)(kernelTableSample + 24);
vSInt32 coeffs_x_7 = *(vSInt32*)(kernelTableSample + 28);
vSInt32 coeffs_x_8 = *(vSInt32*)(kernelTableSample + 32);
vSInt32 coeffs_x_9 = *(vSInt32*)(kernelTableSample + 36);
vSInt32 coeffs_x_10 = *(vSInt32*)(kernelTableSample + 40);
vSInt32 coeffs_x_11 = *(vSInt32*)(kernelTableSample + 44);
vUInt8 row_8_a = v128_load_unaligned((__m128i*)sample);
vUInt16 row_16_a = v128_unpacklo_int8(row_8_a, zero);
vUInt16 row_16_b = v128_unpackhi_int8(row_8_a, zero);
vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
result = v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d)));
vUInt8 row_8_b = v128_load_unaligned((__m128i*)(sample + 16));
vUInt16 row_16_c = v128_unpacklo_int8(row_8_b, zero);
vUInt16 row_16_d = v128_unpackhi_int8(row_8_b, zero);
vSInt32 row_32_e = v128_unpacklo_int16(row_16_c, zero);
vSInt32 row_32_f = v128_unpackhi_int16(row_16_c, zero);
vSInt32 row_32_g = v128_unpacklo_int16(row_16_d, zero);
vSInt32 row_32_h = v128_unpackhi_int16(row_16_d, zero);
vSInt32 mul_e = v128_mul_int32(row_32_e, coeffs_x_4);
vSInt32 mul_f = v128_mul_int32(row_32_f, coeffs_x_5);
vSInt32 mul_g = v128_mul_int32(row_32_g, coeffs_x_6);
vSInt32 mul_h = v128_mul_int32(row_32_h, coeffs_x_7);
result = v128_add_int32(result, v128_add_int32(mul_e, v128_add_int32(mul_f, v128_add_int32(mul_g, mul_h))));
vUInt8 row_8_c = v128_load_unaligned((__m128i*)(sample + 32));
vUInt16 row_16_e = v128_unpacklo_int8(row_8_c, zero);
vUInt16 row_16_f = v128_unpackhi_int8(row_8_c, zero);
vSInt32 row_32_i = v128_unpacklo_int16(row_16_e, zero);
vSInt32 row_32_j = v128_unpackhi_int16(row_16_e, zero);
vSInt32 row_32_k = v128_unpacklo_int16(row_16_f, zero);
vSInt32 row_32_l = v128_unpackhi_int16(row_16_f, zero);
vSInt32 mul_i = v128_mul_int32(row_32_i, coeffs_x_8);
vSInt32 mul_j = v128_mul_int32(row_32_j, coeffs_x_9);
vSInt32 mul_k = v128_mul_int32(row_32_k, coeffs_x_10);
vSInt32 mul_l = v128_mul_int32(row_32_l, coeffs_x_11);
result = v128_add_int32(result, v128_add_int32(mul_i, v128_add_int32(mul_j, v128_add_int32(mul_k, mul_l))));
result = v128_add_int32(result, half);
result = v128_shift_right_signed_int32<16>(result);
vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero);
unsigned int oi = (x * outputPitch) + y * 4;
int a = v128_convert_to_int32(packed_8);
*(int*)(outputBuffer + oi) = a;
#endif
outputSample += 4;
sample += inputPitch;
}
}
}