imagecore/image/internal/neon.cpp

/*
 * MIT License
 *
 * Copyright (c) 2017 Twitter
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "filters.h"
#include "imagecore/imagecore.h"
#include "imagecore/utils/securemath.h"
#include "imagecore/utils/mathutils.h"

#if __ARM_NEON__

#include "intrinsics.h"

namespace imagecore {

void adaptive4x4_3(const FilterKernelAdaptive* kernelX, const FilterKernelAdaptive* kernelY, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
    SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
    SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
    int32x4_t half = vdupq_n_s32(kHalf22);
    int32_t* __restrict kernelTableX = kernelX->getTableFixedPoint4();
    int32_t* __restrict kernelTableY = kernelY->getTableFixedPoint4();
    for( unsigned int y = 0; y < outputHeight; y++ ) {
        int startY = kernelY->computeSampleStart(y);
        for( unsigned int x = 0; x < outputWidth; x++ ) {
            int startX = kernelX->computeSampleStart(x);
            int sampleOffset = ((startY) * (int)inputPitch) + (startX) * 4;
            const uint8_t* sample = inputBuffer + sampleOffset;
            int32x4_t final;
            unsigned int filterIndexX = x * 16;
            unsigned int filter_index_y = y * 16;
            int32x4_t coeffs_x_0 = *(int32x4_t*)(kernelTableX + filterIndexX + 0);
            int32x4_t coeffs_x_1 = *(int32x4_t*)(kernelTableX + filterIndexX + 4);
            int32x4_t coeffs_x_2 = *(int32x4_t*)(kernelTableX + filterIndexX + 8);
            int32x4_t coeffs_y_0 = *(int32x4_t*)(kernelTableY + filter_index_y + 0);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
                final = vmulq_s32(row, coeffs_y_0);
                sample += inputPitch;
            }
            int32x4_t coeffs_y_1 = *(int32x4_t*)(kernelTableY + filter_index_y + 4);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_1));
                sample += inputPitch;
            }
            int32x4_t coeffs_y_2 = *(int32x4_t*)(kernelTableY + filter_index_y + 8);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, mul_c)), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_2));
                sample += inputPitch;
            }
            final = vaddq_s32(final, half);
            final = vshrq_n_s32(final, 22);
            int8x8_t packed_8 = vmovn_s16(vcombine_s16(vmovn_s32(final), vdup_n_s16(0)));
            unsigned int oi = (y * outputPitch) + x * 4;
            vst1_lane_s32((int32_t*)(outputBuffer + oi), packed_8, 0);
        }
    }
}
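
// The tap blocks above all follow one pattern: widen 16 RGBA bytes into int32x4
// vectors (one pixel per vector), multiply by the per-tap coefficients from
// getTableFixedPoint4() (which appears to store each coefficient replicated four
// times, once per channel), shift the horizontal sum right by 10 bits, accumulate
// the three vertical taps, then round with kHalf22 and shift right by 22. A rough
// scalar sketch of one output channel under those assumptions (input(), coeffX and
// coeffY are illustrative names, not part of this file):
//
//    int32_t acc = 0;
//    for( int ty = 0; ty < 3; ty++ ) {
//        int32_t rowSum = 0;
//        for( int tx = 0; tx < 3; tx++ ) {
//            rowSum += input(startX + tx, startY + ty) * coeffX[tx];
//        }
//        acc += (rowSum >> 10) * coeffY[ty];
//    }
//    out = (acc + kHalf22) >> 22;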

template<>
void Filters<ComponentSIMD<4>>::adaptive4x4(const FilterKernelAdaptive* kernelX, const FilterKernelAdaptive* kernelY, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
    if (kernelX->getMaxSamples() == 3 && kernelY->getMaxSamples() == 3) {
        adaptive4x4_3(kernelX, kernelY, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
        return;
    }
    SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
    SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
    int32x4_t half = vdupq_n_s32(kHalf22);
    int32_t* __restrict kernelTableX = kernelX->getTableFixedPoint4();
    int32_t* __restrict kernelTableY = kernelY->getTableFixedPoint4();
    for( unsigned int y = 0; y < outputHeight; y++ ) {
        int startY = kernelY->computeSampleStart(y);
        for( unsigned int x = 0; x < outputWidth; x++ ) {
            int startX = kernelX->computeSampleStart(x);
            int sampleOffset = ((startY) * (int)inputPitch) + (startX) * 4;
            const uint8_t* sample = inputBuffer + sampleOffset;
            int32x4_t final;
            unsigned int filterIndexX = x * 16;
            unsigned int filter_index_y = y * 16;
            int32x4_t coeffs_x_0 = *(int32x4_t*)(kernelTableX + filterIndexX + 0);
            int32x4_t coeffs_x_1 = *(int32x4_t*)(kernelTableX + filterIndexX + 4);
            int32x4_t coeffs_x_2 = *(int32x4_t*)(kernelTableX + filterIndexX + 8);
            int32x4_t coeffs_x_3 = *(int32x4_t*)(kernelTableX + filterIndexX + 12);
            int32x4_t coeffs_y_0 = *(int32x4_t*)(kernelTableY + filter_index_y + 0);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vmulq_s32(row, coeffs_y_0);
                sample += inputPitch;
            }
            int32x4_t coeffs_y_1 = *(int32x4_t*)(kernelTableY + filter_index_y + 4);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_1));
                sample += inputPitch;
            }
            int32x4_t coeffs_y_2 = *(int32x4_t*)(kernelTableY + filter_index_y + 8);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_2));
                sample += inputPitch;
            }
            int32x4_t coeffs_y_3 = *(int32x4_t*)(kernelTableY + filter_index_y + 12);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_3));
                sample += inputPitch;
            }
            final = vaddq_s32(final, half);
            final = vshrq_n_s32(final, 22);
            int8x8_t packed_8 = vmovn_s16(vcombine_s16(vmovn_s32(final), vdup_n_s16(0)));
            unsigned int oi = (y * outputPitch) + x * 4;
            vst1_lane_s32((int32_t*)(outputBuffer + oi), packed_8, 0);
        }
    }
}
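
// adaptive4x4 is the general 4x4-tap path; when both kernels report getMaxSamples() == 3
// it dispatches to the 3-tap specialization above. Note that every row load is a full
// 16-byte vld1q_u8 (four RGBA pixels) even when only three taps are used, so the sampled
// window presumably has to stay within readable, padded input - the callers of these
// filters are assumed to guarantee that.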

static void adaptiveSeperableAny(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
    SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
    SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
    int32_t* __restrict kernelTable = kernel->getTableFixedPoint4();
    unsigned int kernel_width = kernel->getKernelSize();
    int32x4_t zero = vdupq_n_s32(0);
    int32x4_t half = vdupq_n_s32(kHalf16);
    for( unsigned int y = 0; y < outputHeight; y++ ) {
        for( unsigned int x = 0; x < outputWidth; x++ ) {
            int startX = kernel->computeSampleStart(x);
            int sampleOffset = (y * (int)inputPitch) + (startX) * 4;
            const uint8_t* sample = inputBuffer + sampleOffset;
            int32x4_t result = zero;
            for( unsigned int section = 0; section < kernel_width; section += 4 ) {
                unsigned int filterIndexX = x * kernel_width * 4 + section * 4;
                int32x4_t coeffs_x_0 = *(int32x4_t*)(kernelTable + filterIndexX + 0);
                int32x4_t coeffs_x_1 = *(int32x4_t*)(kernelTable + filterIndexX + 4);
                int32x4_t coeffs_x_2 = *(int32x4_t*)(kernelTable + filterIndexX + 8);
                int32x4_t coeffs_x_3 = *(int32x4_t*)(kernelTable + filterIndexX + 12);
                uint8x16_t row_8 = vld1q_u8((uint8_t*)(sample + section * 4));
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                result = vaddq_s32(result, vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))));
            }
            result = vaddq_s32(result, half);
            result = vshrq_n_s32(result, 16);
            int8x8_t packed_8 = vqmovun_s16(vcombine_s16(vmovn_s32(result), vdup_n_s16(0)));
            unsigned int oi = (x * outputPitch) + y * 4;
            vst1_lane_s32((int32_t*)(outputBuffer + oi), packed_8, 0);
        }
    }
}
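
// adaptiveSeperableAny filters a single axis with any kernel width. The inner loop
// consumes the kernel four taps (16 bytes) at a time, so the kernel width is assumed
// to be a multiple of four, and the coefficients here look like 16.16 fixed point
// (accumulate, round with kHalf16, shift right by 16). The store index
// (x * outputPitch + y * 4) writes the result transposed; a second pass over that
// transposed image can then filter the other axis and restore the original orientation.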

static void adaptiveSeperable8(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
    // The separable version writes transposed images.
    SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
    SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);
    int32_t* __restrict kernelTable = kernel->getTableFixedPoint4();
    int32x4_t zero = vdupq_n_s32(0);
    int32x4_t half = vdupq_n_s32(kHalf16);
    for( unsigned int y = 0; y < outputHeight; y++ ) {
        for( unsigned int x = 0; x < outputWidth; x++ ) {
            int startX = kernel->computeSampleStart(x);
            int sampleOffset = (y * (int)inputPitch) + (startX) * 4;
            const uint8_t* sample = inputBuffer + sampleOffset;
            int32x4_t result = zero;
            unsigned int filterIndexX = x * 32;
            int32x4_t coeffs_x_0 = *(int32x4_t*)(kernelTable + filterIndexX + 0);
            int32x4_t coeffs_x_1 = *(int32x4_t*)(kernelTable + filterIndexX + 4);
            int32x4_t coeffs_x_2 = *(int32x4_t*)(kernelTable + filterIndexX + 8);
            int32x4_t coeffs_x_3 = *(int32x4_t*)(kernelTable + filterIndexX + 12);
            int32x4_t coeffs_x_4 = *(int32x4_t*)(kernelTable + filterIndexX + 16);
            int32x4_t coeffs_x_5 = *(int32x4_t*)(kernelTable + filterIndexX + 20);
            int32x4_t coeffs_x_6 = *(int32x4_t*)(kernelTable + filterIndexX + 24);
            int32x4_t coeffs_x_7 = *(int32x4_t*)(kernelTable + filterIndexX + 28);
            uint8x16_t row_8_a = vld1q_u8((uint8_t*)sample);
            int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8_a));
            int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8_a));
            int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
            int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
            int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
            int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
            int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
            int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
            int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
            int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
            result = vaddq_s32(result, vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))));
            uint8x16_t row_8_b = vld1q_u8((uint8_t*)(sample + 16));
            int16x8_t row_16_c = vmovl_u8(vget_low_u8(row_8_b));
            int16x8_t row_16_d = vmovl_u8(vget_high_u8(row_8_b));
            int32x4_t row_32_e = vmovl_s16(vget_low_s16(row_16_c));
            int32x4_t row_32_f = vmovl_s16(vget_high_s16(row_16_c));
            int32x4_t row_32_g = vmovl_s16(vget_low_s16(row_16_d));
            int32x4_t row_32_h = vmovl_s16(vget_high_s16(row_16_d));
            int32x4_t mul_e = vmulq_s32(row_32_e, coeffs_x_4);
            int32x4_t mul_f = vmulq_s32(row_32_f, coeffs_x_5);
            int32x4_t mul_g = vmulq_s32(row_32_g, coeffs_x_6);
            int32x4_t mul_h = vmulq_s32(row_32_h, coeffs_x_7);
            result = vaddq_s32(result, vaddq_s32(mul_e, vaddq_s32(mul_f, vaddq_s32(mul_g, mul_h))));
            result = vaddq_s32(result, half);
            result = vshrq_n_s32(result, 16);
            int8x8_t packed_8 = vqmovun_s16(vcombine_s16(vmovn_s32(result), vdup_n_s16(0)));
            unsigned int oi = (x * outputPitch) + y * 4;
            vst1_lane_s32((int32_t*)(outputBuffer + oi), packed_8, 0);
        }
    }
}
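
// adaptiveSeperable8 is the same single-axis filter as adaptiveSeperableAny, fully
// unrolled for 8-tap kernels: two 16-byte loads and eight coefficient vectors per
// output pixel, with no inner section loop. adaptiveSeperable12 below does the same
// for 12 taps with a third load.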

// Adaptive-width filter, single axis, 12 samples.
// 16.16 fixed point NEON version.
static void adaptiveSeperable12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
    // The separable version writes transposed images.
    SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
    SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);
    int32_t* __restrict kernelTable = kernel->getTableFixedPoint4();
    int32x4_t zero = vdupq_n_s32(0);
    int32x4_t half = vdupq_n_s32(kHalf16);
    for( unsigned int y = 0; y < outputHeight; y++ ) {
        for( unsigned int x = 0; x < outputWidth; x++ ) {
            int startX = kernel->computeSampleStart(x);
            int sampleOffset = (y * (int)inputPitch) + (startX) * 4;
            const uint8_t* sample = inputBuffer + sampleOffset;
            int32x4_t result = zero;
            unsigned int filterIndexX = x * 48;
            int32x4_t coeffs_x_0 = *(int32x4_t*)(kernelTable + filterIndexX + 0);
            int32x4_t coeffs_x_1 = *(int32x4_t*)(kernelTable + filterIndexX + 4);
            int32x4_t coeffs_x_2 = *(int32x4_t*)(kernelTable + filterIndexX + 8);
            int32x4_t coeffs_x_3 = *(int32x4_t*)(kernelTable + filterIndexX + 12);
            int32x4_t coeffs_x_4 = *(int32x4_t*)(kernelTable + filterIndexX + 16);
            int32x4_t coeffs_x_5 = *(int32x4_t*)(kernelTable + filterIndexX + 20);
            int32x4_t coeffs_x_6 = *(int32x4_t*)(kernelTable + filterIndexX + 24);
            int32x4_t coeffs_x_7 = *(int32x4_t*)(kernelTable + filterIndexX + 28);
            int32x4_t coeffs_x_8 = *(int32x4_t*)(kernelTable + filterIndexX + 32);
            int32x4_t coeffs_x_9 = *(int32x4_t*)(kernelTable + filterIndexX + 36);
            int32x4_t coeffs_x_10 = *(int32x4_t*)(kernelTable + filterIndexX + 40);
            int32x4_t coeffs_x_11 = *(int32x4_t*)(kernelTable + filterIndexX + 44);
            uint8x16_t row_8_a = vld1q_u8((uint8_t*)sample);
            int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8_a));
            int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8_a));
            int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
            int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
            int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
            int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
            int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
            int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
            int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
            int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
            result = vaddq_s32(result, vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))));
            uint8x16_t row_8_b = vld1q_u8((uint8_t*)(sample + 16));
            int16x8_t row_16_c = vmovl_u8(vget_low_u8(row_8_b));
            int16x8_t row_16_d = vmovl_u8(vget_high_u8(row_8_b));
            int32x4_t row_32_e = vmovl_s16(vget_low_s16(row_16_c));
            int32x4_t row_32_f = vmovl_s16(vget_high_s16(row_16_c));
            int32x4_t row_32_g = vmovl_s16(vget_low_s16(row_16_d));
            int32x4_t row_32_h = vmovl_s16(vget_high_s16(row_16_d));
            int32x4_t mul_e = vmulq_s32(row_32_e, coeffs_x_4);
            int32x4_t mul_f = vmulq_s32(row_32_f, coeffs_x_5);
            int32x4_t mul_g = vmulq_s32(row_32_g, coeffs_x_6);
            int32x4_t mul_h = vmulq_s32(row_32_h, coeffs_x_7);
            result = vaddq_s32(result, vaddq_s32(mul_e, vaddq_s32(mul_f, vaddq_s32(mul_g, mul_h))));
            uint8x16_t row_8_c = vld1q_u8((uint8_t*)(sample + 32));
            int16x8_t row_16_e = vmovl_u8(vget_low_u8(row_8_c));
            int16x8_t row_16_f = vmovl_u8(vget_high_u8(row_8_c));
            int32x4_t row_32_i = vmovl_s16(vget_low_s16(row_16_e));
            int32x4_t row_32_j = vmovl_s16(vget_high_s16(row_16_e));
            int32x4_t row_32_k = vmovl_s16(vget_low_s16(row_16_f));
            int32x4_t row_32_l = vmovl_s16(vget_high_s16(row_16_f));
            int32x4_t mul_i = vmulq_s32(row_32_i, coeffs_x_8);
            int32x4_t mul_j = vmulq_s32(row_32_j, coeffs_x_9);
            int32x4_t mul_k = vmulq_s32(row_32_k, coeffs_x_10);
            int32x4_t mul_l = vmulq_s32(row_32_l, coeffs_x_11);
            result = vaddq_s32(result, vaddq_s32(mul_i, vaddq_s32(mul_j, vaddq_s32(mul_k, mul_l))));
            result = vaddq_s32(result, half);
            result = vshrq_n_s32(result, 16);
            int8x8_t packed_8 = vqmovun_s16(vcombine_s16(vmovn_s32(result), vdup_n_s16(0)));
            unsigned int oi = (x * outputPitch) + y * 4;
            vst1_lane_s32((int32_t*)(outputBuffer + oi), packed_8, 0);
        }
    }
}

// Adaptive-width filter, variable number of samples.
template<>
void Filters<ComponentSIMD<4>>::adaptiveSeperable(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity, bool)
{
    unsigned int kernelSize = kernel->getKernelSize();
    if( kernelSize == 8U ) {
        adaptiveSeperable8(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
    } else if( kernelSize == 12U ) {
        adaptiveSeperable12(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
    } else {
        adaptiveSeperableAny(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
    }
}
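
// The dispatcher above picks the unrolled 8- or 12-tap variant when the kernel size
// matches and falls back to the generic loop otherwise. Because each separable pass
// writes its output transposed, a full resize is presumably two passes, along these
// lines (an illustrative sketch only - the temp buffer, its pitch/capacity, the kernel
// objects and the trailing bool flag are assumptions, not code from this file):
//
//    // Pass 1: resize the X axis; the result lands transposed in temp.
//    Filters<ComponentSIMD<4>>::adaptiveSeperable(kernelX, input, inWidth, inHeight, inPitch,
//        temp, outWidth, inHeight, tempPitch, tempCapacity, false);
//    // Pass 2: resize the other axis of the transposed image; the second transpose
//    // restores the original orientation.
//    Filters<ComponentSIMD<4>>::adaptiveSeperable(kernelY, temp, inHeight, outWidth, tempPitch,
//        output, outHeight, outWidth, outPitch, outCapacity, false);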

template<>
void Filters<ComponentSIMD<4>>::fixed4x4(const FilterKernelFixed* kernelX, const FilterKernelFixed* kernelY, const uint8_t* inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
    SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
    SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
    int32_t* __restrict kernelTableX = kernelX->getTableFixedPoint4();
    int32_t* __restrict kernelTableY = kernelY->getTableFixedPoint4();
    int32x4_t half = vdupq_n_s32(kHalf22);
    for( unsigned int y = 0; y < outputHeight; y++ ) {
        int sampleY = kernelY->computeSampleStart(y);
        for( unsigned int x = 0; x < outputWidth; x++ ) {
            int sampleX = kernelX->computeSampleStart(x);
            int sampleOffset = ((sampleY - 1) * (int)inputPitch) + (sampleX - 1) * 4;
            const uint8_t* sample = inputBuffer + sampleOffset;
            int32x4_t final;
            unsigned int filterIndexX = x;
            filterIndexX *= 16;
            int32x4_t coeffs_x_0 = *(int32x4_t*)(kernelTableX + filterIndexX + 0);
            int32x4_t coeffs_x_1 = *(int32x4_t*)(kernelTableX + filterIndexX + 4);
            int32x4_t coeffs_x_2 = *(int32x4_t*)(kernelTableX + filterIndexX + 8);
            int32x4_t coeffs_x_3 = *(int32x4_t*)(kernelTableX + filterIndexX + 12);
            unsigned int filter_index_y = y;
            filter_index_y *= 16;
            int32x4_t coeffs_y_0 = *(int32x4_t*)(kernelTableY + filter_index_y + 0);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vmulq_s32(row, coeffs_y_0);
                sample += inputPitch;
            }
            int32x4_t coeffs_y_1 = *(int32x4_t*)(kernelTableY + filter_index_y + 4);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_1));
                sample += inputPitch;
            }
            int32x4_t coeffs_y_2 = *(int32x4_t*)(kernelTableY + filter_index_y + 8);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_2));
                sample += inputPitch;
            }
            int32x4_t coeffs_y_3 = *(int32x4_t*)(kernelTableY + filter_index_y + 12);
            {
                uint8x16_t row_8 = vld1q_u8((uint8_t*)sample);
                int16x8_t row_16_a = vmovl_u8(vget_low_u8(row_8));
                int16x8_t row_16_b = vmovl_u8(vget_high_u8(row_8));
                int32x4_t row_32_a = vmovl_s16(vget_low_s16(row_16_a));
                int32x4_t row_32_b = vmovl_s16(vget_high_s16(row_16_a));
                int32x4_t row_32_c = vmovl_s16(vget_low_s16(row_16_b));
                int32x4_t row_32_d = vmovl_s16(vget_high_s16(row_16_b));
                int32x4_t mul_a = vmulq_s32(row_32_a, coeffs_x_0);
                int32x4_t mul_b = vmulq_s32(row_32_b, coeffs_x_1);
                int32x4_t mul_c = vmulq_s32(row_32_c, coeffs_x_2);
                int32x4_t mul_d = vmulq_s32(row_32_d, coeffs_x_3);
                int32x4_t row = vshrq_n_s32(vaddq_s32(mul_a, vaddq_s32(mul_b, vaddq_s32(mul_c, mul_d))), 10);
                final = vaddq_s32(final, vmulq_s32(row, coeffs_y_3));
                sample += inputPitch;
            }
            final = vaddq_s32(final, half);
            final = vshrq_n_s32(final, 22);
            int8x8_t packed_8 = vqmovun_s16(vcombine_s16(vmovn_s32(final), vdup_n_s16(0)));
            unsigned int oi = (y * outputPitch) + x * 4;
            vst1_lane_s32((int32_t*)(outputBuffer + oi), packed_8, 0);
        }
    }
}

template<>
bool Filters<ComponentSIMD<4>>::fasterUnpadded(uint32_t kernelSize)
{
    return false;
}

template<>
bool Filters<ComponentSIMD<4>>::supportsUnpadded(uint32_t kernelSize)
{
    return false;
}

// explicit template instantiations
template class Filters<ComponentScalar<1>>;
template class Filters<ComponentScalar<2>>;
template class Filters<ComponentScalar<4>>;
template class Filters<ComponentSIMD<1>>;
template class Filters<ComponentSIMD<2>>;
template class Filters<ComponentSIMD<4>>;

}

#endif