void adaptive4x4_3()

in imagecore/image/internal/neon.cpp [36:117]


void adaptive4x4_3(const FilterKernelAdaptive* kernelX, const FilterKernelAdaptive* kernelY, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
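	// Adaptive separable downsampling kernel: for each output pixel, three source
	// rows are filtered horizontally with three 4-lane fixed-point X coefficient
	// vectors, the per-row results are weighted by the corresponding Y coefficients,
	// and the 22-bit fixed-point sum is rounded, narrowed and stored as one 4-byte
	// (presumably RGBA) output pixel.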
	SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);

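	// Rounding offset applied before the final >> 22 below.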
	int32x4_t half = vdupq_n_s32(kHalf22);

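	// Fixed-point coefficient tables; each tap appears to be replicated across four
	// lanes so that one coefficient vector covers all four channels of a pixel.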
	int32_t* __restrict kernelTableX = kernelX->getTableFixedPoint4();
	int32_t* __restrict kernelTableY = kernelY->getTableFixedPoint4();

	for( unsigned int y = 0; y < outputHeight; y++ ) {
		int startY = kernelY->computeSampleStart(y);
		for( unsigned int x = 0; x < outputWidth; x++ ) {
			int startX = kernelX->computeSampleStart(x);

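			// Byte offset of the window's top-left pixel (4 bytes per source pixel).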
			int sampleOffset = ((startY) * (int)inputPitch) + (startX) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;

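			// Per-channel accumulator for the vertically weighted rows.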
			int32x4_t final;

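			// The coefficient tables are strided 16 entries per output coordinate; the
			// three 4-lane vectors at offsets +0, +4 and +8 are consumed below.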
			unsigned int filterIndexX = x * 16;
			unsigned int filterIndexY = y * 16;

			int32x4_t coeffs_x_0 = vld1q_s32(kernelTableX + filterIndexX + 0);
			int32x4_t coeffs_x_1 = vld1q_s32(kernelTableX + filterIndexX + 4);
			int32x4_t coeffs_x_2 = vld1q_s32(kernelTableX + filterIndexX + 8);
			int32x4_t coeffs_y_0 = vld1q_s32(kernelTableY + filterIndexY + 0);
			{
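				// Row 0: widen 16 source bytes to 32-bit, multiply the first three 4-lane
				// groups by the X coefficient vectors, add the products, drop 10 fraction
				// bits, then weight the filtered row by this row's Y coefficients.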
				uint8x16_t row_8 = vld1q_u8(sample);
				int16x8_t row_16_a = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8)));
				int16x8_t row_16_b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8)));
				int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
				int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
				int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
				int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
				int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
				int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
				int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
				final = vmulq_s32(row, coeffs_y_0);
				sample += inputPitch;
			}

			int32x4_t coeffs_y_1 = vld1q_s32(kernelTableY + filterIndexY + 4);
			{
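				// Row 1: same horizontal filter, accumulated with this row's Y coefficients.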
				uint8x16_t row_8 = vld1q_u8(sample);
				int16x8_t row_16_a = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8)));
				int16x8_t row_16_b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8)));
				int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
				int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
				int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
				int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
				int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
				int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
				int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
				final = vaddq_s32(final, vmulq_s32(row, coeffs_y_1));
				sample += inputPitch;
			}

			int32x4_t coeffs_y_2 = vld1q_s32(kernelTableY + filterIndexY + 8);
			{
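				// Row 2: same horizontal filter, accumulated with this row's Y coefficients.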
				uint8x16_t row_8 = vld1q_u8(sample);
				int16x8_t row_16_a = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8)));
				int16x8_t row_16_b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8)));
				int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
				int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
				int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
				int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
				int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
				int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
				int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
				final = vaddq_s32(final, vmulq_s32(row, coeffs_y_2));
				sample += inputPitch;
			}

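			// Round, drop the 22 fixed-point fraction bits, narrow to bytes and store a
			// single 4-byte output pixel.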
			final = vaddq_s32(final, half);
			final = vshrq_n_s32(final, 22);
			int8x8_t packed_8 = vmovn_s16(vcombine_s16(vmovn_s32(final), vdup_n_s16(0)));
			unsigned int oi = (y * outputPitch) + x * 4;
			vst1_lane_s32((int32_t*)(outputBuffer + oi), vreinterpret_s32_s8(packed_8), 0);
		}
	}
}