static void adaptiveSeperable12()

in imagecore/image/internal/neon.cpp [352:435]


static void adaptiveSeperable12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The separable version writes transposed images.
	SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);

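	// Coefficient table: 48 contiguous 16.16 fixed-point int32s (12 vectors of 4) per output pixel.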
	int32_t* __restrict kernelTable = kernel->getTableFixedPoint4();

	int32x4_t zero = vdupq_n_s32(0);
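	// kHalf16 is added before the final >> 16 so the fixed-point result rounds to nearest.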
	int32x4_t half = vdupq_n_s32(kHalf16);

	for( unsigned int y = 0; y < outputHeight; y++ ) {
		for( unsigned int x = 0; x < outputWidth; x++ ) {
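			// First of the 12 input pixels that contribute to output column x.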
			int startX = kernel->computeSampleStart(x);
			int sampleOffset = (y * (int)inputPitch) + (startX) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;

			int32x4_t result = zero;
			unsigned int filterIndexX = x * 48;

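			// Load the 12 coefficient vectors for this output column, one per sampled input pixel.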
			int32x4_t coeffs_x_0 = vld1q_s32(kernelTable + filterIndexX + 0);
			int32x4_t coeffs_x_1 = vld1q_s32(kernelTable + filterIndexX + 4);
			int32x4_t coeffs_x_2 = vld1q_s32(kernelTable + filterIndexX + 8);
			int32x4_t coeffs_x_3 = vld1q_s32(kernelTable + filterIndexX + 12);
			int32x4_t coeffs_x_4 = vld1q_s32(kernelTable + filterIndexX + 16);
			int32x4_t coeffs_x_5 = vld1q_s32(kernelTable + filterIndexX + 20);
			int32x4_t coeffs_x_6 = vld1q_s32(kernelTable + filterIndexX + 24);
			int32x4_t coeffs_x_7 = vld1q_s32(kernelTable + filterIndexX + 28);
			int32x4_t coeffs_x_8 = vld1q_s32(kernelTable + filterIndexX + 32);
			int32x4_t coeffs_x_9 = vld1q_s32(kernelTable + filterIndexX + 36);
			int32x4_t coeffs_x_10 = vld1q_s32(kernelTable + filterIndexX + 40);
			int32x4_t coeffs_x_11 = vld1q_s32(kernelTable + filterIndexX + 44);

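			// Input pixels 0-3: load 16 bytes (4 RGBA pixels), widen u8 -> s16 -> s32,
			// multiply each channel by its coefficient and accumulate.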
			uint8x16_t row_8_a = vld1q_u8(sample);
			int16x8_t row_16_a = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8_a)));
			int16x8_t row_16_b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8_a)));
			int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
			int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
			int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
			int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
			int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
			int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
			int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
			int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);

			result = vaddq_s32(result, vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))));

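			// Input pixels 4-7.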
			uint8x16_t row_8_b = vld1q_u8(sample + 16);
			int16x8_t row_16_c = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8_b)));
			int16x8_t row_16_d = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8_b)));
			int32x4_t row_32_e = vmovl_s16(vget_low_s16(row_16_c));
			int32x4_t row_32_f = vmovl_s16(vget_high_s16(row_16_c));
			int32x4_t row_32_g = vmovl_s16(vget_low_s16(row_16_d));
			int32x4_t row_32_h = vmovl_s16(vget_high_s16(row_16_d));
			int32x4_t mul_e = vmulq_s32(row_32_e, coeffs_x_4);
			int32x4_t mul_f = vmulq_s32(row_32_f, coeffs_x_5);
			int32x4_t mul_g = vmulq_s32(row_32_g, coeffs_x_6);
			int32x4_t mul_h = vmulq_s32(row_32_h, coeffs_x_7);

			result = vaddq_s32(result, vaddq_s32(mul_e, vaddq_s32(mul_f, vaddq_s32(mul_g, mul_h))));

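			// Input pixels 8-11.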
			uint8x16_t row_8_c = vld1q_u8(sample + 32);
			int16x8_t row_16_e = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8_c)));
			int16x8_t row_16_f = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8_c)));
			int32x4_t row_32_i = vmovl_s16(vget_low_s16(row_16_e));
			int32x4_t row_32_j = vmovl_s16(vget_high_s16(row_16_e));
			int32x4_t row_32_k = vmovl_s16(vget_low_s16(row_16_f));
			int32x4_t row_32_l = vmovl_s16(vget_high_s16(row_16_f));
			int32x4_t mul_i = vmulq_s32(row_32_i, coeffs_x_8);
			int32x4_t mul_j = vmulq_s32(row_32_j, coeffs_x_9);
			int32x4_t mul_k = vmulq_s32(row_32_k, coeffs_x_10);
			int32x4_t mul_l = vmulq_s32(row_32_l, coeffs_x_11);

			result = vaddq_s32(result, vaddq_s32(mul_i, vaddq_s32(mul_j, vaddq_s32(mul_k, mul_l))));

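			// Round to nearest and drop the 16 fractional bits.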
			result = vaddq_s32(result, half);
			result = vshrq_n_s32(result, 16);

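			// Saturating-pack the four channels to u8 and store one RGBA pixel,
			// transposed: x selects the output row, y the pixel within it.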
			uint8x8_t packed_8 = vqmovun_s16(vcombine_s16(vmovn_s32(result), vdup_n_s16(0)));
			unsigned int oi = (x * outputPitch) + y * 4;
			vst1_lane_s32((int32_t*)(outputBuffer + oi), vreinterpret_s32_u8(packed_8), 0);
		}
	}
}
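
For reference, here is a minimal scalar sketch of the per-pixel arithmetic the NEON code above performs. It assumes the coefficient table holds 48 contiguous 16.16 fixed-point int32s per output pixel (12 taps x 4 channels) and that kHalf16 equals 1 << 15; the standalone function and its name are illustrative, not part of imagecore's API:

#include <stdint.h>

static void scalarSeparable12Pixel(const int32_t* coeffs, const uint8_t* sample, uint8_t* out)
{
	// coeffs: 48 entries (12 taps x 4 channels), sample: 12 RGBA pixels, out: 1 RGBA pixel.
	for (unsigned int c = 0; c < 4; c++) {
		int32_t acc = 0;
		for (unsigned int tap = 0; tap < 12; tap++) {
			// Widen the u8 sample and multiply by the 16.16 coefficient,
			// mirroring the per-lane vmulq_s32 in the NEON path.
			acc += (int32_t)sample[tap * 4 + c] * coeffs[tap * 4 + c];
		}
		acc += 1 << 15;           // rounding constant (assumed value of kHalf16)
		int32_t v = acc >> 16;    // back to pixel range
		out[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); // saturate, like vqmovun_s16
	}
}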