in imagecore/image/internal/neon.cpp [36:117]
// Horizontally filter one row of input: widen 12 of the 16 loaded bytes
// (3 groups of 4 channels) to 32-bit, weight each group by its fixed-point
// x-coefficient vector, sum, and drop 10 fraction bits.
// NOTE(review): the 16-byte load reads 4 bytes past the 12 that are used;
// assumes the caller guarantees that much input padding — same as before.
static inline int32x4_t adaptiveFilterRow3(const uint8_t* sample, int32x4_t coeffsX0, int32x4_t coeffsX1, int32x4_t coeffsX2)
{
	uint8x16_t row_8 = vld1q_u8(sample);
	// vmovl_u8 yields uint16x8_t; reinterpret to signed (values are 0..255,
	// so the bit pattern is identical and no information is lost).
	int16x8_t row_16_a = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_8)));
	int16x8_t row_16_b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_8)));
	int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
	int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
	int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
	int32x4_t mul_a = vmulq_s32(row_32_a, coeffsX0);
	int32x4_t mul_b = vmulq_s32(row_32_b, coeffsX1);
	int32x4_t mul_c = vmulq_s32(row_32_c, coeffsX2);
	// Shift right by 10 so the horizontal result fits before the vertical
	// multiply; the remaining 22 fraction bits are removed at the end.
	return vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
}

// Adaptive 3-tap separable resize for 4-channel 8-bit images (NEON path).
// For each output pixel, filters 3 input pixels horizontally and 3 rows
// vertically with fixed-point coefficient tables (4 int32 lanes per tap,
// table stride 16 per output coordinate), then rounds and packs back to
// 4 bytes. Output position/stride is validated against outputCapacity.
void adaptive4x4_3(const FilterKernelAdaptive* kernelX, const FilterKernelAdaptive* kernelY, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
	int32x4_t half = vdupq_n_s32(kHalf22);
	// __restrict must follow the '*' to qualify the pointer itself; the
	// original "__restrict int32_t*" placement is ill-formed.
	const int32_t* __restrict kernelTableX = kernelX->getTableFixedPoint4();
	const int32_t* __restrict kernelTableY = kernelY->getTableFixedPoint4();
	for( unsigned int y = 0; y < outputHeight; y++ ) {
		int startY = kernelY->computeSampleStart(y);
		unsigned int filterIndexY = y * 16;
		// Load the three vertical coefficient vectors through vld1q_s32
		// instead of a *(int32x4_t*) cast (strict-aliasing/alignment safe).
		int32x4_t coeffs_y_0 = vld1q_s32(kernelTableY + filterIndexY + 0);
		int32x4_t coeffs_y_1 = vld1q_s32(kernelTableY + filterIndexY + 4);
		int32x4_t coeffs_y_2 = vld1q_s32(kernelTableY + filterIndexY + 8);
		for( unsigned int x = 0; x < outputWidth; x++ ) {
			int startX = kernelX->computeSampleStart(x);
			int sampleOffset = ((startY) * (int)inputPitch) + (startX) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;
			unsigned int filterIndexX = x * 16;
			int32x4_t coeffs_x_0 = vld1q_s32(kernelTableX + filterIndexX + 0);
			int32x4_t coeffs_x_1 = vld1q_s32(kernelTableX + filterIndexX + 4);
			int32x4_t coeffs_x_2 = vld1q_s32(kernelTableX + filterIndexX + 8);
			// Accumulate three horizontally-filtered rows, each weighted by
			// its vertical coefficient.
			int32x4_t result = vmulq_s32(adaptiveFilterRow3(sample, coeffs_x_0, coeffs_x_1, coeffs_x_2), coeffs_y_0);
			sample += inputPitch;
			result = vaddq_s32(result, vmulq_s32(adaptiveFilterRow3(sample, coeffs_x_0, coeffs_x_1, coeffs_x_2), coeffs_y_1));
			sample += inputPitch;
			result = vaddq_s32(result, vmulq_s32(adaptiveFilterRow3(sample, coeffs_x_0, coeffs_x_1, coeffs_x_2), coeffs_y_2));
			// Round (add half of 1<<22) and strip the 22 fraction bits.
			result = vshrq_n_s32(vaddq_s32(result, half), 22);
			// Narrow 4x int32 -> 4x int8 (truncating, as before — coefficients
			// are assumed normalized so values fit in a byte) and store the
			// low 4 bytes. vreinterpret is required: vst1_lane_s32 takes
			// int32x2_t, not int8x8_t.
			int8x8_t packed_8 = vmovn_s16(vcombine_s16(vmovn_s32(result), vdup_n_s16(0)));
			unsigned int outputOffset = (y * outputPitch) + x * 4;
			vst1_lane_s32((int32_t*)(outputBuffer + outputOffset), vreinterpret_s32_s8(packed_8), 0);
		}
	}
}