static void adaptiveSeperable12()

in imagecore/image/internal/sse.cpp [478:634]


static void adaptiveSeperable12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The separable version writes its output transposed (output column x becomes row x), hence the pitch check below.
	SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);

	int32_t* __restrict kernelTable = kernel->getTableFixedPoint4();

	__m128i half = v128_set_int32(kHalf16);
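	// kHalf16 rounds the 16.16 fixed-point accumulator to nearest when it is shifted down by 16 below.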

	for( unsigned int x = 0; x < outputWidth; x++ ) {
		const int32_t* kernelTableSample = kernelTable + x * 48;
		int startX = kernel->computeSampleStart(x);
		uint8_t* outputSample = outputBuffer + (x * outputPitch);
		const uint8_t* sample = inputBuffer + startX * 4;
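		// sample points at the first of the 12 input RGBA pixels for this column; the inner loop walks down the rows.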
		for( unsigned int y = 0; y < outputHeight; y++ ) {
#if (SEPERABLE12_ASM_OPTIMIZATION)
			__asm__ (
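					 // xmm0 stays zero; it supplies the high lanes for every unpack below.
					 // Pixels 0-3 (input bytes 0-15) against taps 0-3; xmm4 keeps the running sum.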
					 "pxor		%%xmm0, %%xmm0 \n"
					 "lddqu		%[sample], %%xmm1 \n"
					 "movdqa	%%xmm1, %%xmm2 \n"
					 "punpcklbw	%%xmm0, %%xmm1 \n"
					 "movdqa 	%%xmm1, %%xmm3 \n"
					 "punpcklwd	%%xmm0, %%xmm1 \n"
					 "punpckhwd	%%xmm0, %%xmm3 \n"
					 "pmulld 	0%[kernelTable], %%xmm1 \n"
					 "pmulld 	16%[kernelTable], %%xmm3 \n"
					 "paddd 	%%xmm3, %%xmm1 \n"
					 "punpckhbw	%%xmm0, %%xmm2 \n"
					 "movdqa 	%%xmm2, %%xmm3 \n"
					 "punpcklwd	%%xmm0, %%xmm2 \n"
					 "punpckhwd	%%xmm0, %%xmm3 \n"
					 "pmulld 	32%[kernelTable], %%xmm2 \n"
					 "pmulld 	48%[kernelTable], %%xmm3 \n"
					 "paddd 	%%xmm2, %%xmm1 \n"
					 "paddd 	%%xmm3, %%xmm1 \n"
					 "movdqa	%%xmm1, %%xmm4 \n"

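					 // Pixels 4-7 (input bytes 16-31) against taps 4-7.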
					 "lddqu		16%[sample], %%xmm1 \n"
					 "movdqa	%%xmm1, %%xmm2 \n"
					 "punpcklbw	%%xmm0, %%xmm1 \n"
					 "movdqa 	%%xmm1, %%xmm3 \n"
					 "punpcklwd	%%xmm0, %%xmm1 \n"
					 "punpckhwd	%%xmm0, %%xmm3 \n"
					 "pmulld 	64%[kernelTable], %%xmm1 \n"
					 "pmulld 	80%[kernelTable], %%xmm3 \n"
					 "paddd 	%%xmm3, %%xmm1 \n"
					 "punpckhbw	%%xmm0, %%xmm2 \n"
					 "movdqa 	%%xmm2, %%xmm3 \n"
					 "punpcklwd	%%xmm0, %%xmm2 \n"
					 "punpckhwd	%%xmm0, %%xmm3 \n"
					 "pmulld 	96%[kernelTable], %%xmm2 \n"
					 "pmulld 	112%[kernelTable], %%xmm3 \n"
					 "paddd 	%%xmm2, %%xmm1 \n"
					 "paddd 	%%xmm3, %%xmm1 \n"
					 "paddd	 	%%xmm1, %%xmm4 \n"

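					 // Pixels 8-11 (input bytes 32-47) against taps 8-11.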
					 "lddqu		32%[sample], %%xmm1 \n"
					 "movdqa	%%xmm1, %%xmm2 \n"
					 "punpcklbw	%%xmm0, %%xmm1 \n"
					 "movdqa 	%%xmm1, %%xmm3 \n"
					 "punpcklwd	%%xmm0, %%xmm1 \n"
					 "punpckhwd	%%xmm0, %%xmm3 \n"
					 "pmulld 	128%[kernelTable], %%xmm1 \n"
					 "pmulld 	144%[kernelTable], %%xmm3 \n"
					 "paddd 	%%xmm3, %%xmm1 \n"
					 "punpckhbw	%%xmm0, %%xmm2 \n"
					 "movdqa 	%%xmm2, %%xmm3 \n"
					 "punpcklwd	%%xmm0, %%xmm2 \n"
					 "punpckhwd	%%xmm0, %%xmm3 \n"
					 "pmulld 	160%[kernelTable], %%xmm2 \n"
					 "pmulld 	176%[kernelTable], %%xmm3 \n"
					 "paddd 	%%xmm2, %%xmm1 \n"
					 "paddd 	%%xmm3, %%xmm1 \n"
					 "paddd		%%xmm1, %%xmm4 \n"

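					 // Round, shift the 16.16 sums down to integers, saturate-pack 32->16->8, and store the 4-byte RGBA result.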
					 "paddd 	%[half], %%xmm4 \n"
					 "psrad 	$16, %%xmm4 \n"
					 "packusdw 	%%xmm0, %%xmm4 \n"
					 "packuswb 	%%xmm0, %%xmm4 \n"
					 "movd 		%%xmm4, %[outputSample] \n"
					 : [outputSample] "=m" (*outputSample)
					 : [sample] "m" (*sample), [kernelTable] "m" (*kernelTableSample), [half] "x" (half)
					 : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4"
					 );
#else
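			// Intrinsics path: the same 12-tap, 16.16 fixed-point accumulation as the asm above.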
			__m128i zero = v128_setzero();

			vSInt32 result = zero;
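			// One coefficient vector per tap; each tap's coefficient is replicated across the four channel lanes (the "4" in getTableFixedPoint4).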
			vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTableSample + 0);
			vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTableSample + 4);
			vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTableSample + 8);
			vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTableSample + 12);
			vSInt32 coeffs_x_4 = *(vSInt32*)(kernelTableSample + 16);
			vSInt32 coeffs_x_5 = *(vSInt32*)(kernelTableSample + 20);
			vSInt32 coeffs_x_6 = *(vSInt32*)(kernelTableSample + 24);
			vSInt32 coeffs_x_7 = *(vSInt32*)(kernelTableSample + 28);
			vSInt32 coeffs_x_8 = *(vSInt32*)(kernelTableSample + 32);
			vSInt32 coeffs_x_9 = *(vSInt32*)(kernelTableSample + 36);
			vSInt32 coeffs_x_10 = *(vSInt32*)(kernelTableSample + 40);
			vSInt32 coeffs_x_11 = *(vSInt32*)(kernelTableSample + 44);

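			// Pixels 0-3: widen the first 16 bytes to 32-bit lanes and multiply by taps 0-3.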
			vUInt8 row_8_a = v128_load_unaligned((__m128i*)sample);
			vUInt16 row_16_a = v128_unpacklo_int8(row_8_a, zero);
			vUInt16 row_16_b = v128_unpackhi_int8(row_8_a, zero);
			vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
			vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
			vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
			vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
			vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
			vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
			vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
			vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);

			result = v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d)));

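			// Pixels 4-7.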
			vUInt8 row_8_b = v128_load_unaligned((__m128i*)(sample + 16));
			vUInt16 row_16_c = v128_unpacklo_int8(row_8_b, zero);
			vUInt16 row_16_d = v128_unpackhi_int8(row_8_b, zero);
			vSInt32 row_32_e = v128_unpacklo_int16(row_16_c, zero);
			vSInt32 row_32_f = v128_unpackhi_int16(row_16_c, zero);
			vSInt32 row_32_g = v128_unpacklo_int16(row_16_d, zero);
			vSInt32 row_32_h = v128_unpackhi_int16(row_16_d, zero);
			vSInt32 mul_e = v128_mul_int32(row_32_e, coeffs_x_4);
			vSInt32 mul_f = v128_mul_int32(row_32_f, coeffs_x_5);
			vSInt32 mul_g = v128_mul_int32(row_32_g, coeffs_x_6);
			vSInt32 mul_h = v128_mul_int32(row_32_h, coeffs_x_7);

			result = v128_add_int32(result, v128_add_int32(mul_e, v128_add_int32(mul_f, v128_add_int32(mul_g, mul_h))));

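			// Pixels 8-11.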
			vUInt8 row_8_c = v128_load_unaligned((__m128i*)(sample + 32));
			vUInt16 row_16_e = v128_unpacklo_int8(row_8_c, zero);
			vUInt16 row_16_f = v128_unpackhi_int8(row_8_c, zero);
			vSInt32 row_32_i = v128_unpacklo_int16(row_16_e, zero);
			vSInt32 row_32_j = v128_unpackhi_int16(row_16_e, zero);
			vSInt32 row_32_k = v128_unpacklo_int16(row_16_f, zero);
			vSInt32 row_32_l = v128_unpackhi_int16(row_16_f, zero);
			vSInt32 mul_i = v128_mul_int32(row_32_i, coeffs_x_8);
			vSInt32 mul_j = v128_mul_int32(row_32_j, coeffs_x_9);
			vSInt32 mul_k = v128_mul_int32(row_32_k, coeffs_x_10);
			vSInt32 mul_l = v128_mul_int32(row_32_l, coeffs_x_11);

			result = v128_add_int32(result, v128_add_int32(mul_i, v128_add_int32(mul_j, v128_add_int32(mul_k, mul_l))));
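			// Round to nearest and drop the 16.16 fixed-point fraction.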
			result = v128_add_int32(result, half);
			result = v128_shift_right_signed_int32<16>(result);

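			// Saturating pack 32->16->8; the RGBA result ends up in the low four bytes.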
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero);
			// outputSample already points at outputBuffer + (x * outputPitch) + y * 4.
			*(int*)outputSample = v128_convert_to_int32(packed_8);
#endif
			outputSample += 4;
			sample += inputPitch;
		}
	}
}
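
For reference, here is a scalar sketch of what both paths above compute: a 12-tap filter over RGBA pixels in 16.16 fixed point, rounded to nearest, saturated to [0, 255], and stored transposed. This is an illustrative rewrite, not library code; it assumes getTableFixedPoint4() stores each tap's coefficient once per channel lane (48 int32 values per output column, matching the loads above) and that kHalf16 equals 1 << 15, i.e. 0.5 in 16.16 fixed point, which is what the shift by 16 implies.

// Hypothetical scalar reference for adaptiveSeperable12 (not part of the library).
static void adaptiveSeperable12Scalar(const FilterKernelAdaptive* kernel, const uint8_t* input, unsigned int inputPitch, uint8_t* output, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch)
{
	const int32_t* kernelTable = kernel->getTableFixedPoint4();
	for( unsigned int x = 0; x < outputWidth; x++ ) {
		// 12 taps * 4 channel lanes = 48 coefficients per output column.
		const int32_t* coeffs = kernelTable + x * 48;
		const uint8_t* sample = input + kernel->computeSampleStart(x) * 4;
		for( unsigned int y = 0; y < outputHeight; y++ ) {
			for( unsigned int c = 0; c < 4; c++ ) {
				int32_t sum = 0;
				for( unsigned int tap = 0; tap < 12; tap++ ) {
					sum += (int32_t)sample[tap * 4 + c] * coeffs[tap * 4 + c];
				}
				// Round to nearest and drop the 16.16 fixed-point fraction.
				sum = (sum + (1 << 15)) >> 16;
				// Saturate to [0, 255], as packusdw/packuswb do in the SIMD paths.
				sum = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
				// Transposed store: output column x becomes row x.
				output[x * outputPitch + y * 4 + c] = (uint8_t)sum;
			}
			sample += inputPitch;
		}
	}
}

Each SIMD inner iteration above performs this entire channel/tap double loop at once: the four channels sit in the four 32-bit lanes, and the 12 taps are unrolled three 16-byte loads at a time.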