// imagecore/image/internal/sse.cpp

/*
 * MIT License
 *
 * Copyright (c) 2017 Twitter
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "filters.h"
#include "imagecore/imagecore.h"
#include "imagecore/utils/securemath.h"
#include "imagecore/utils/mathutils.h"
#include "platform_support.h"

#if __SSE4_1__

#include "intrinsics.h"

#define vec_transpose_epi32(r0, r1, r2, r3) \
{ \
	vSInt32 t0 = v128_unpacklo_int32(r0, r1); \
	vSInt32 t1 = v128_unpacklo_int32(r2, r3); \
	vSInt32 t2 = v128_unpackhi_int32(r0, r1); \
	vSInt32 t3 = v128_unpackhi_int32(r2, r3); \
	r0 = v128_unpacklo_int64(t0, t1); \
	r1 = v128_unpackhi_int64(t0, t1); \
	r2 = v128_unpacklo_int64(t2, t3); \
	r3 = v128_unpackhi_int64(t2, t3); \
}
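
// vec_transpose_epi32 is a 4x4 transpose of the 32-bit lanes across four
// registers. Assuming the v128_unpacklo/hi wrappers have the usual SSE unpack
// semantics, the effect is (lanes listed low to high):
//
//   before:  r0 = (a0 a1 a2 a3)      after:  r0 = (a0 b0 c0 d0)
//            r1 = (b0 b1 b2 b3)              r1 = (a1 b1 c1 d1)
//            r2 = (c0 c1 c2 c3)              r2 = (a2 b2 c2 d2)
//            r3 = (d0 d1 d2 d3)              r3 = (a3 b3 c3 d3)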
namespace imagecore {

// Adaptive-width filter, both axes, 4x4 samples.
// 16.16 Fixed point SSE version.
template<>
void Filters<ComponentSIMD<4>>::adaptive4x4(const FilterKernelAdaptive* kernelX, const FilterKernelAdaptive* kernelY, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
#if IMAGECORE_DETECT_SSE
	if( !checkForCPUSupport(kCPUFeature_SSE4_1)) {
		return Filters<ComponentScalar<4>>::adaptive4x4(kernelX, kernelY, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
	}
#endif
	SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
	__m128i zero = v128_setzero();
	__m128i half = v128_set_int32(kHalf22);
	__restrict int32_t* kernelTableX = kernelX->getTableFixedPoint4();
	__restrict int32_t* kernelTableY = kernelY->getTableFixedPoint4();
	for( unsigned int y = 0; y < outputHeight; y++ ) {
		int startY = kernelY->computeSampleStart(y);
		for( unsigned int x = 0; x < outputWidth; x++ ) {
			int startX = kernelX->computeSampleStart(x);
			int sampleOffset = ((startY) * (int)inputPitch) + (startX) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;
			vSInt32 final = zero;
			unsigned int filterIndexX = x * 16;
			unsigned int filter_index_y = y * 16;
			vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTableX + filterIndexX + 0);
			vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTableX + filterIndexX + 4);
			vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTableX + filterIndexX + 8);
			vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTableX + filterIndexX + 12);
			vSInt32 coeffs_y_0 = *(vSInt32*)(kernelTableY + filter_index_y + 0);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_0));
				sample += inputPitch;
			}
			vSInt32 coeffs_y_1 = *(vSInt32*)(kernelTableY + filter_index_y + 4);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_1));
				sample += inputPitch;
			}
			vSInt32 coeffs_y_2 = *(vSInt32*)(kernelTableY + filter_index_y + 8);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_2));
				sample += inputPitch;
			}
			vSInt32 coeffs_y_3 = *(vSInt32*)(kernelTableY + filter_index_y + 12);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_3));
			}
			final = v128_add_int32(final, half);
			final = v128_shift_right_signed_int32<22>(final);
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(final, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			unsigned int oi = (y * outputPitch) + x * 4;
			int a = v128_convert_to_int32(packed_8);
			*(int*)(outputBuffer + oi) = a;
		}
	}
}
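
// A minimal scalar model of the arithmetic above for one channel of one output
// pixel (a sketch only; adaptive4x4ScalarModel is a hypothetical helper, and it
// assumes the kernel tables hold 16.16 fixed-point weights, which is what the
// >>10 / >>22 shift pair implies: 16 + 16 - 10 = 22 fractional bits remain
// before the final rounding shift).
static inline uint8_t adaptive4x4ScalarModel(const uint8_t samples[4][4], const int32_t coeffsX[4], const int32_t coeffsY[4])
{
	int32_t total = 0;
	for( unsigned int row = 0; row < 4; row++ ) {
		int32_t rowSum = 0;
		for( unsigned int col = 0; col < 4; col++ ) {
			rowSum += (int32_t)samples[row][col] * coeffsX[col];
		}
		total += (rowSum >> 10) * coeffsY[row];
	}
	total = (total + kHalf22) >> 22;
	// The SIMD version gets this clamp for free from the saturating packs.
	return (uint8_t)(total < 0 ? 0 : (total > 255 ? 255 : total));
}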
// Adaptive-width filter, single axis, any number of samples.
// 16.16 Fixed point SSE version.
static void adaptiveSeperableAny(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
	__restrict int32_t* kernelTable = kernel->getTableFixedPoint4();
	unsigned int kernelWidth = kernel->getKernelSize();
	__m128i zero = v128_setzero();
	__m128i half = v128_set_int32(kHalf16);
	for( unsigned int y = 0; y < outputHeight; y++ ) {
		for( unsigned int x = 0; x < outputWidth; x++ ) {
			int startX = kernel->computeSampleStart(x);
			int sampleOffset = (y * (int)inputPitch) + (startX) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;
			vSInt32 result = zero;
			for( unsigned int section = 0; section < kernelWidth; section += 4 ) {
				unsigned int filterIndexX = x * kernelWidth * 4 + section * 4;
				vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTable + filterIndexX + 0);
				vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTable + filterIndexX + 4);
				vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTable + filterIndexX + 8);
				vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTable + filterIndexX + 12);
				vUInt8 row_8_a = v128_load_unaligned((const vSInt32*)(sample + section * 4));
				vUInt16 row_16_a = v128_unpacklo_int8(row_8_a, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8_a, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				result = v128_add_int32(result, v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
			}
			result = v128_add_int32(result, half);
			result = v128_shift_right_signed_int32<16>(result);
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			unsigned int oi = (x * outputPitch) + y * 4;
			int a = v128_convert_to_int32(packed_8);
			*(int*)(outputBuffer + oi) = a;
		}
	}
}
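
// Note: the section loop above consumes the kernel four taps at a time and each
// iteration loads a full 16 bytes, so this path appears to assume the kernel
// size is a multiple of 4 and that at least kernelWidth * 4 bytes are readable
// past each sample start.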
// Adaptive-width filter, single axis, 8 samples.
// 16.16 Fixed point SSE version.
static void adaptiveSeperable8(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);
	__restrict int32_t* kernelTable = kernel->getTableFixedPoint4();
	__m128i zero = v128_setzero();
	__m128i half = v128_set_int32(kHalf16);
	for( unsigned int y = 0; y < outputHeight; y++ ) {
		for( unsigned int x = 0; x < outputWidth; x++ ) {
			int startX = kernel->computeSampleStart(x);
			int sampleOffset = (y * (int)inputPitch) + (startX) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;
			unsigned int filterIndexX = x * 32;
			vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTable + filterIndexX + 0);
			vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTable + filterIndexX + 4);
			vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTable + filterIndexX + 8);
			vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTable + filterIndexX + 12);
			vSInt32 coeffs_x_4 = *(vSInt32*)(kernelTable + filterIndexX + 16);
			vSInt32 coeffs_x_5 = *(vSInt32*)(kernelTable + filterIndexX + 20);
			vSInt32 coeffs_x_6 = *(vSInt32*)(kernelTable + filterIndexX + 24);
			vSInt32 coeffs_x_7 = *(vSInt32*)(kernelTable + filterIndexX + 28);
			vSInt32 result = zero;
			vUInt8 row_8_a = v128_load_unaligned((const vSInt32*)sample);
			vUInt16 row_16_a = v128_unpacklo_int8(row_8_a, zero);
			vUInt16 row_16_b = v128_unpackhi_int8(row_8_a, zero);
			vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
			vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
			vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
			vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
			vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
			vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
			vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
			vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
			result = v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d)));
			vUInt8 row_8_b = v128_load_unaligned((const vSInt32*)(sample + 16));
			vUInt16 row_16_c = v128_unpacklo_int8(row_8_b, zero);
			vUInt16 row_16_d = v128_unpackhi_int8(row_8_b, zero);
			vSInt32 row_32_e = v128_unpacklo_int16(row_16_c, zero);
			vSInt32 row_32_f = v128_unpackhi_int16(row_16_c, zero);
			vSInt32 row_32_g = v128_unpacklo_int16(row_16_d, zero);
			vSInt32 row_32_h = v128_unpackhi_int16(row_16_d, zero);
			vSInt32 mul_e = v128_mul_int32(row_32_e, coeffs_x_4);
			vSInt32 mul_f = v128_mul_int32(row_32_f, coeffs_x_5);
			vSInt32 mul_g = v128_mul_int32(row_32_g, coeffs_x_6);
			vSInt32 mul_h = v128_mul_int32(row_32_h, coeffs_x_7);
			result = v128_add_int32(result, v128_add_int32(mul_e, v128_add_int32(mul_f, v128_add_int32(mul_g, mul_h))));
			result = v128_add_int32(result, half);
			result = v128_shift_right_signed_int32<16>(result);
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			unsigned int oi = (x * outputPitch) + y * 4;
			int a = v128_convert_to_int32(packed_8);
			*(int*)(outputBuffer + oi) = a;
		}
	}
}

#define SEPERABLE12_ASM_OPTIMIZATION 1

// Samples loading, unpacking and factoring in.
#define ADD_SAMPLE(x) inSample = v128_set_int32(*(samples32[x])); \
	inSampleUnpacked = v128_shuffle_int8(inSample, unpackMask); \
	sum = v128_add_int32(sum, v128_mul_int32(inSampleUnpacked, coeffs_x_##x));

#define SAMPLE_0 ADD_SAMPLE(0)
#define SAMPLE_1 SAMPLE_0 \
	ADD_SAMPLE(1)
#define SAMPLE_2 SAMPLE_1 \
	ADD_SAMPLE(2)
#define SAMPLE_3 SAMPLE_2 \
	ADD_SAMPLE(3)
#define SAMPLE_4 SAMPLE_3 \
	ADD_SAMPLE(4)
#define SAMPLE_5 SAMPLE_4 \
	ADD_SAMPLE(5)
#define SAMPLE_6 SAMPLE_5 \
	ADD_SAMPLE(6)
#define SAMPLE_7 SAMPLE_6 \
	ADD_SAMPLE(7)
#define SAMPLE_8 SAMPLE_7 \
	ADD_SAMPLE(8)
#define SAMPLE_9 SAMPLE_8 \
	ADD_SAMPLE(9)
#define SAMPLE_10 SAMPLE_9 \
	ADD_SAMPLE(10)
#define SAMPLE_11 SAMPLE_10 \
	ADD_SAMPLE(11)
#define SAMPLE(x) SAMPLE_##x

// Filter coefficient loading
#define LOAD_COEFF(x) vSInt32 coeffs_x_##x = *(vSInt32*)(kernelTableSample + (x * 4));

#define COEFF_0 LOAD_COEFF(0)
#define COEFF_1 COEFF_0 \
	LOAD_COEFF(1)
#define COEFF_2 COEFF_1 \
	LOAD_COEFF(2)
#define COEFF_3 COEFF_2 \
	LOAD_COEFF(3)
#define COEFF_4 COEFF_3 \
	LOAD_COEFF(4)
#define COEFF_5 COEFF_4 \
	LOAD_COEFF(5)
#define COEFF_6 COEFF_5 \
	LOAD_COEFF(6)
#define COEFF_7 COEFF_6 \
	LOAD_COEFF(7)
#define COEFF_8 COEFF_7 \
	LOAD_COEFF(8)
#define COEFF_9 COEFF_8 \
	LOAD_COEFF(9)
#define COEFF_10 COEFF_9 \
	LOAD_COEFF(10)
#define COEFF_11 COEFF_10 \
	LOAD_COEFF(11)
#define COEFF(x) COEFF_##x
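
// Expansion note: COEFF(n) pulls in LOAD_COEFF(0)..LOAD_COEFF(n) and SAMPLE(n)
// pulls in ADD_SAMPLE(0)..ADD_SAMPLE(n), so a body built from COEFF(n)/SAMPLE(n)
// is a fully unrolled filter over exactly n + 1 taps. For example, SAMPLE(2)
// expands to ADD_SAMPLE(0) ADD_SAMPLE(1) ADD_SAMPLE(2).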
#define ADAPTIVE_SEPERABLE_INIT SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch); \
	SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity); \
	vSInt32 unpackMask = v128_set_int8_packed(0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00); \
	vSInt32 zero = v128_setzero(); \
	vSInt32 half = v128_set_int32(kHalf16); \
	__restrict int32_t* kernelTable = kernel->getTableFixedPoint4();

#define ADAPTIVE_SEPERABLE_XLOOP_START for( unsigned int x = 0; x < outputWidth; x++ ) { \
	int startX = kernel->computeSampleStart(x); \
	uint8_t* outputSample = outputBuffer + (x * outputPitch); \
	const int32_t* samples32[12]; \
	int32_t startIndex = startX < 0 ? 0 : startX; \
	const uint8_t* sample = inputBuffer + startIndex * 4; \
	for( int kernelIndex = 0; kernelIndex < 12; kernelIndex++ ) { \
		int sampleIndex; \
		if(startX < 0) { \
			sampleIndex = kernelIndex + startX < 0 ? 0 : kernelIndex + startX; \
		} else { \
			sampleIndex = kernelIndex + startIndex < inputWidth ? kernelIndex : kernelIndex - (kernelIndex + startIndex - inputWidth + 1); \
		} \
		sampleIndex = min(sampleIndex, (int32_t)inputWidth - 1); \
		samples32[kernelIndex] = (const int32_t*)&sample[sampleIndex * 4]; \
	} \
	const int32_t* kernelTableSample = kernelTable + x * 48;

#define ADAPTIVE_SEPERABLE_XLOOP_END }

#define ADAPTIVE_SEPERABLE_YLOOP_START for( unsigned int y = 0; y < outputHeight; y++ ) { \
	vSInt32 inSample; \
	vSInt32 inSampleUnpacked; \
	vSInt32 sum = zero;

#define ADAPTIVE_SEPERABLE_YLOOP_END sum = v128_add_int32(sum, half); \
	sum = v128_shift_right_signed_int32<16>(sum); \
	vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(sum, zero); \
	vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero); \
	int32_t a = v128_convert_to_int32(packed_8); \
	*((int32_t*)outputSample) = a; \
	outputSample += 4; \
	for( int kernelIndex = 0; kernelIndex < 12; kernelIndex++ ) { \
		samples32[kernelIndex] += (inputPitch / 4); \
	} \
}

#define ADAPTIVE_SEPERABLE_FUNC_CODE(x) ADAPTIVE_SEPERABLE_INIT \
	ADAPTIVE_SEPERABLE_XLOOP_START \
	COEFF(x) \
	ADAPTIVE_SEPERABLE_YLOOP_START \
	SAMPLE(x) \
	ADAPTIVE_SEPERABLE_YLOOP_END \
	ADAPTIVE_SEPERABLE_XLOOP_END

static void adaptiveSeperable12_maxSamples12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(11)
}

static void adaptiveSeperable12_maxSamples11(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(10)
}

static void adaptiveSeperable12_maxSamples10(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(9)
}

static void adaptiveSeperable12_maxSamples9(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(8)
}

static void adaptiveSeperable12_maxSamples8(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(7)
}

static void adaptiveSeperable12_maxSamples7(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(6)
}

static void adaptiveSeperable12_maxSamples6(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(5)
}

static void adaptiveSeperable12_maxSamples5(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(4)
}

static void adaptiveSeperable12_maxSamples4(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(3)
}

static void adaptiveSeperable12_maxSamples3(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(2)
}

static void adaptiveSeperable12_maxSamples2(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(1)
}

static void adaptiveSeperable12_maxSamples1(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	ADAPTIVE_SEPERABLE_FUNC_CODE(0)
}
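
// The twelve nearly identical functions above appear to exist so that the tap
// count baked into ADAPTIVE_SEPERABLE_FUNC_CODE is a compile-time constant and
// the per-pixel work is fully unrolled; the adaptiveSeperable dispatcher below
// selects the variant matching kernel->getMaxSamples() at runtime.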
// Adaptive-width filter, single axis, 12 samples.
// 16.16 Fixed point SSE version
static void adaptiveSeperable12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	SECURE_ASSERT(SafeUMul(outputHeight, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);
	__restrict int32_t* kernelTable = kernel->getTableFixedPoint4();
	__m128i half = v128_set_int32(kHalf16);
	for( unsigned int x = 0; x < outputWidth; x++ ) {
		const int32_t* kernelTableSample = kernelTable + x * 48;
		int startX = kernel->computeSampleStart(x);
		uint8_t* outputSample = outputBuffer + (x * outputPitch);
		const uint8_t* sample = inputBuffer + startX * 4;
		for( unsigned int y = 0; y < outputHeight; y++ ) {
#if (SEPERABLE12_ASM_OPTIMIZATION)
			__asm__ (
				"pxor %%xmm0, %%xmm0 \n"
				"lddqu %[sample], %%xmm1 \n"
				"movdqa %%xmm1, %%xmm2 \n"
				"punpcklbw %%xmm0, %%xmm1 \n"
				"movdqa %%xmm1, %%xmm3 \n"
				"punpcklwd %%xmm0, %%xmm1 \n"
				"punpckhwd %%xmm0, %%xmm3 \n"
				"pmulld 0%[kernelTable], %%xmm1 \n"
				"pmulld 16%[kernelTable], %%xmm3 \n"
				"paddd %%xmm3, %%xmm1 \n"
				"punpckhbw %%xmm0, %%xmm2 \n"
				"movdqa %%xmm2, %%xmm3 \n"
				"punpcklwd %%xmm0, %%xmm2 \n"
				"punpckhwd %%xmm0, %%xmm3 \n"
				"pmulld 32%[kernelTable], %%xmm2 \n"
				"pmulld 48%[kernelTable], %%xmm3 \n"
				"paddd %%xmm2, %%xmm1 \n"
				"paddd %%xmm3, %%xmm1 \n"
				"movdqa %%xmm1, %%xmm4 \n"
				"lddqu 16%[sample], %%xmm1 \n"
				"movdqa %%xmm1, %%xmm2 \n"
				"punpcklbw %%xmm0, %%xmm1 \n"
				"movdqa %%xmm1, %%xmm3 \n"
				"punpcklwd %%xmm0, %%xmm1 \n"
				"punpckhwd %%xmm0, %%xmm3 \n"
				"pmulld 64%[kernelTable], %%xmm1 \n"
				"pmulld 80%[kernelTable], %%xmm3 \n"
				"paddd %%xmm3, %%xmm1 \n"
				"punpckhbw %%xmm0, %%xmm2 \n"
				"movdqa %%xmm2, %%xmm3 \n"
				"punpcklwd %%xmm0, %%xmm2 \n"
				"punpckhwd %%xmm0, %%xmm3 \n"
				"pmulld 96%[kernelTable], %%xmm2 \n"
				"pmulld 112%[kernelTable], %%xmm3 \n"
				"paddd %%xmm2, %%xmm1 \n"
				"paddd %%xmm3, %%xmm1 \n"
				"paddd %%xmm1, %%xmm4 \n"
				"lddqu 32%[sample], %%xmm1 \n"
				"movdqa %%xmm1, %%xmm2 \n"
				"punpcklbw %%xmm0, %%xmm1 \n"
				"movdqa %%xmm1, %%xmm3 \n"
				"punpcklwd %%xmm0, %%xmm1 \n"
				"punpckhwd %%xmm0, %%xmm3 \n"
				"pmulld 128%[kernelTable], %%xmm1 \n"
				"pmulld 144%[kernelTable], %%xmm3 \n"
				"paddd %%xmm3, %%xmm1 \n"
				"punpckhbw %%xmm0, %%xmm2 \n"
				"movdqa %%xmm2, %%xmm3 \n"
				"punpcklwd %%xmm0, %%xmm2 \n"
				"punpckhwd %%xmm0, %%xmm3 \n"
				"pmulld 160%[kernelTable], %%xmm2 \n"
				"pmulld 176%[kernelTable], %%xmm3 \n"
				"paddd %%xmm2, %%xmm1 \n"
				"paddd %%xmm3, %%xmm1 \n"
				"paddd %%xmm1, %%xmm4 \n"
				"paddd %[half], %%xmm4 \n"
				"psrad $16, %%xmm4 \n"
				"packusdw %%xmm0, %%xmm4 \n"
				"packuswb %%xmm0, %%xmm4 \n"
				"movd %%xmm4, %[outputSample] \n"
				: [outputSample] "=m" (*outputSample)
				: [sample] "m" (*sample), [kernelTable] "m" (*kernelTableSample), [half] "x" (half)
				: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4"
			);
#else
			__m128i zero = v128_setzero();
			vSInt32 result = zero;
			vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTableSample + 0);
			vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTableSample + 4);
			vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTableSample + 8);
			vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTableSample + 12);
			vSInt32 coeffs_x_4 = *(vSInt32*)(kernelTableSample + 16);
			vSInt32 coeffs_x_5 = *(vSInt32*)(kernelTableSample + 20);
			vSInt32 coeffs_x_6 = *(vSInt32*)(kernelTableSample + 24);
			vSInt32 coeffs_x_7 = *(vSInt32*)(kernelTableSample + 28);
			vSInt32 coeffs_x_8 = *(vSInt32*)(kernelTableSample + 32);
			vSInt32 coeffs_x_9 = *(vSInt32*)(kernelTableSample + 36);
			vSInt32 coeffs_x_10 = *(vSInt32*)(kernelTableSample + 40);
			vSInt32 coeffs_x_11 = *(vSInt32*)(kernelTableSample + 44);
			vUInt8 row_8_a = v128_load_unaligned((__m128i*)sample);
			vUInt16 row_16_a = v128_unpacklo_int8(row_8_a, zero);
			vUInt16 row_16_b = v128_unpackhi_int8(row_8_a, zero);
			vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
			vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
			vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
			vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
			vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
			vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
			vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
			vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
			result = v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d)));
			vUInt8 row_8_b = v128_load_unaligned((__m128i*)(sample + 16));
			vUInt16 row_16_c = v128_unpacklo_int8(row_8_b, zero);
			vUInt16 row_16_d = v128_unpackhi_int8(row_8_b, zero);
			vSInt32 row_32_e = v128_unpacklo_int16(row_16_c, zero);
			vSInt32 row_32_f = v128_unpackhi_int16(row_16_c, zero);
			vSInt32 row_32_g = v128_unpacklo_int16(row_16_d, zero);
			vSInt32 row_32_h = v128_unpackhi_int16(row_16_d, zero);
			vSInt32 mul_e = v128_mul_int32(row_32_e, coeffs_x_4);
			vSInt32 mul_f = v128_mul_int32(row_32_f, coeffs_x_5);
			vSInt32 mul_g = v128_mul_int32(row_32_g, coeffs_x_6);
			vSInt32 mul_h = v128_mul_int32(row_32_h, coeffs_x_7);
			result = v128_add_int32(result, v128_add_int32(mul_e, v128_add_int32(mul_f, v128_add_int32(mul_g, mul_h))));
			vUInt8 row_8_c = v128_load_unaligned((__m128i*)(sample + 32));
			vUInt16 row_16_e = v128_unpacklo_int8(row_8_c, zero);
			vUInt16 row_16_f = v128_unpackhi_int8(row_8_c, zero);
			vSInt32 row_32_i = v128_unpacklo_int16(row_16_e, zero);
			vSInt32 row_32_j = v128_unpackhi_int16(row_16_e, zero);
			vSInt32 row_32_k = v128_unpacklo_int16(row_16_f, zero);
			vSInt32 row_32_l = v128_unpackhi_int16(row_16_f, zero);
			vSInt32 mul_i = v128_mul_int32(row_32_i, coeffs_x_8);
			vSInt32 mul_j = v128_mul_int32(row_32_j, coeffs_x_9);
			vSInt32 mul_k = v128_mul_int32(row_32_k, coeffs_x_10);
			vSInt32 mul_l = v128_mul_int32(row_32_l, coeffs_x_11);
			result = v128_add_int32(result, v128_add_int32(mul_i, v128_add_int32(mul_j, v128_add_int32(mul_k, mul_l))));
			result = v128_add_int32(result, half);
			result = v128_shift_right_signed_int32<16>(result);
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero);
			unsigned int oi = (x * outputPitch) + y * 4;
			int a = v128_convert_to_int32(packed_8);
			*(int*)(outputBuffer + oi) = a;
#endif
			outputSample += 4;
			sample += inputPitch;
		}
	}
}
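
// The SEPERABLE12_ASM_OPTIMIZATION branch above is GCC-style inline assembly
// that computes the same 12-tap, 4-channel dot product as the intrinsics in the
// #else branch. The "16%[kernelTable]" style operands appear to rely on the "m"
// operand expanding to a plain base-register form, so the literal prefix acts
// as a byte displacement selecting successive 16-byte coefficient vectors.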
// Adaptive-width filter, variable number of samples.
template<>
void Filters<ComponentSIMD<4>>::adaptiveSeperable(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity, bool unpadded)
{
#if IMAGECORE_DETECT_SSE
	if( !checkForCPUSupport(kCPUFeature_SSE4_1)) {
		return Filters<ComponentScalar<4>>::adaptiveSeperable(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity, unpadded);
	}
#endif
	unsigned int kernelSize = kernel->getKernelSize();
	if( kernelSize == 8U ) {
		adaptiveSeperable8(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
	} else if( kernelSize == 12U ) {
		if( unpadded ) {
			switch((uint32_t)kernel->getMaxSamples()) {
				case 12: {
					adaptiveSeperable12_maxSamples12(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 11: {
					adaptiveSeperable12_maxSamples11(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 10: {
					adaptiveSeperable12_maxSamples10(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 9: {
					adaptiveSeperable12_maxSamples9(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 8: {
					adaptiveSeperable12_maxSamples8(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 7: {
					adaptiveSeperable12_maxSamples7(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 6: {
					adaptiveSeperable12_maxSamples6(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 5: {
					adaptiveSeperable12_maxSamples5(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 4: {
					adaptiveSeperable12_maxSamples4(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 3: {
					adaptiveSeperable12_maxSamples3(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 2: {
					adaptiveSeperable12_maxSamples2(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
				case 1: {
					adaptiveSeperable12_maxSamples1(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
					break;
				}
			}
		} else {
			adaptiveSeperable12(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
		}
	} else {
		adaptiveSeperableAny(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
	}
}
// 4 rows at a time
void adaptiveSeperable8_12x4(const int32_t* sampleStarts, const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int input_height, unsigned int inputPitch, uint8_t* __restrict outputBuffer, int32_t sx, int32_t ex, int32_t sy, int32_t ey, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	SECURE_ASSERT(SafeUMul(ey - sy, 1U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(ex - sx, outputPitch) <= outputCapacity);
	__restrict const vSInt32* kernelTable = (vSInt32*)kernel->getTableFixedPoint4();
	kernelTable += (sx * 12);
	__m128i half = v128_set_int32(kHalf16);
	__m128i zero = v128_setzero();
	vSInt32 unpackMask0 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
	vSInt32 unpackMask1 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x07, 0x80, 0x80, 0x80, 0x06, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x04);
	vSInt32 unpackMask2 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x0B, 0x80, 0x80, 0x80, 0x0A, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x08);
	for( unsigned int x = sx; x < ex; x++ ) {
		int startX = sampleStarts[x];
		uint8_t* outputSample = outputBuffer + (x * outputPitch) + sy;
		const uint8_t* sample = inputBuffer + (sy * inputPitch) + startX;
		uint8_t* outputSampleEnd = outputSample + (ey - sy);
		do {
			// Load and expand (8bit -> 32bit)
			// 4 distinct rows of 16 values each (only 12 used, since this is a 12-wide filter).
			// 0 1 2 3
			// G G G G | G G G G | G G G G | X X X X (row 0123) dot (coeffs 0-12) = 1st output sample
			// 4 5 6 7
			// G G G G | G G G G | G G G G | X X X X (row 4567) dot (coeffs 0-12) = 2nd output sample
			// 8 9 A B
			// G G G G | G G G G | G G G G | X X X X (row 89AB) dot (coeffs 0-12) = 3rd output sample
			// C D E F
			// G G G G | G G G G | G G G G | X X X X (row CDEF) dot (coeffs 0-12) = 4th output sample
			vUInt8 row_0123 = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 0));
			vSInt32 row_0 = v128_shuffle_int8(row_0123, unpackMask0);
			vSInt32 row_1 = v128_shuffle_int8(row_0123, unpackMask1);
			vSInt32 row_2 = v128_shuffle_int8(row_0123, unpackMask2);
			vUInt8 row_4567 = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 1));
			vSInt32 row_4 = v128_shuffle_int8(row_4567, unpackMask0);
			vSInt32 row_5 = v128_shuffle_int8(row_4567, unpackMask1);
			vSInt32 row_6 = v128_shuffle_int8(row_4567, unpackMask2);
			vUInt8 row_89AB = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 2));
			vSInt32 row_8 = v128_shuffle_int8(row_89AB, unpackMask0);
			vSInt32 row_9 = v128_shuffle_int8(row_89AB, unpackMask1);
			vSInt32 row_A = v128_shuffle_int8(row_89AB, unpackMask2);
			vUInt8 row_CDEF = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 3));
			vSInt32 row_C = v128_shuffle_int8(row_CDEF, unpackMask0);
			vSInt32 row_D = v128_shuffle_int8(row_CDEF, unpackMask1);
			vSInt32 row_E = v128_shuffle_int8(row_CDEF, unpackMask2);
			vSInt32 result;
			// Transpose the rows so each multiply/add is operating on 4 of the same element, 1 from each row
			// This is more efficient than treating each row separately and needing to do horizontal addition within the vectors.
			// The coefficients are replicated 4 times in each vector, so we have 12 vectors for 12 coefficients.
			// i.e.
			// (A0 A1 A2 A3) dot (X0 X1 X2 X3) = (Ar 0 0 0)
			// (B0 B1 B2 B3) dot (X0 X1 X2 X3) = (Br 0 0 0)
			// (C0 C1 C2 C3) dot (X0 X1 X2 X3) = (Cr 0 0 0)
			// (D0 D1 D2 D3) dot (X0 X1 X2 X3) = (Dr 0 0 0)
			// becomes:
			// (A0 B0 C0 D0) * (X0 X0 X0 X0) +
			// (A1 B1 C1 D1) * (X1 X1 X1 X1) +
			// (A2 B2 C2 D2) * (X2 X2 X2 X2) +
			// (A3 B3 C3 D3) * (X3 X3 X3 X3) = (Ar Br Cr Dr)
			// Fixed point fun. 8.0 * 16.16. Since the coefficients are normalized, worst case is 255*(Coeffs0+Coeffs1+...CoeffN=65536), so we won't overflow. Avoid shifting back until later.
			vec_transpose_epi32(row_0, row_4, row_8, row_C);
			result = v128_mul_int32(row_0, kernelTable[0]);
			result = v128_add_int32(result, v128_mul_int32(row_4, kernelTable[1]));
			result = v128_add_int32(result, v128_mul_int32(row_8, kernelTable[2]));
			result = v128_add_int32(result, v128_mul_int32(row_C, kernelTable[3]));
			vec_transpose_epi32(row_1, row_5, row_9, row_D);
			result = v128_add_int32(result, v128_mul_int32(row_1, kernelTable[4]));
			result = v128_add_int32(result, v128_mul_int32(row_5, kernelTable[5]));
			result = v128_add_int32(result, v128_mul_int32(row_9, kernelTable[6]));
			result = v128_add_int32(result, v128_mul_int32(row_D, kernelTable[7]));
			vec_transpose_epi32(row_2, row_6, row_A, row_E);
			result = v128_add_int32(result, v128_mul_int32(row_2, kernelTable[8]));
			result = v128_add_int32(result, v128_mul_int32(row_6, kernelTable[9]));
			result = v128_add_int32(result, v128_mul_int32(row_A, kernelTable[10]));
			result = v128_add_int32(result, v128_mul_int32(row_E, kernelTable[11]));
			result = v128_add_int32(result, half);
			// Shift back because of the multiplication we did.
			result = v128_shift_right_signed_int32<16>(result);
			// Pack back down (saturating), and store.
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			*(int*)(outputSample) = v128_extract_int32<0>(packed_8);
			outputSample += 4;
			sample += inputPitch * 4;
		} while (outputSample < outputSampleEnd);
		kernelTable += 12;
	}
}
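
// Making the "won't overflow" note above concrete: with 8-bit samples and
// normalized 16.16 coefficients that sum to 65536, the accumulated value is at
// most 255 * 65536 = 16,711,680, far below INT32_MAX (2,147,483,647), so the
// 32-bit lanes cannot overflow before the final >>16.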
// single row at a time
void adaptiveSeperable8_12x1(const int32_t* sampleStarts, const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int input_height, unsigned int inputPitch, uint8_t* __restrict outputBuffer, int32_t sx, int32_t ex, int32_t sy, int32_t ey, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	SECURE_ASSERT(SafeUMul(ey - sy, 1U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(ex - sx, outputPitch) <= outputCapacity);
	__restrict const vSInt32* kernelTable = (vSInt32*)kernel->getTableFixedPoint();
	kernelTable += (sx * 3);
	__m128i half = v128_set_int32(kHalf16);
	__m128i zero = v128_setzero();
	vSInt32 unpackMask0 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
	vSInt32 unpackMask1 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x07, 0x80, 0x80, 0x80, 0x06, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x04);
	vSInt32 unpackMask2 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x0B, 0x80, 0x80, 0x80, 0x0A, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x08);
	for( unsigned int x = sx; x < ex; x++ ) {
		int startX = sampleStarts[x];
		uint8_t* outputSample = outputBuffer + (x * outputPitch) + sy;
		const uint8_t* sample = inputBuffer + (sy * inputPitch) + startX;
		uint8_t* outputSampleEnd = outputSample + (ey - sy);
		do {
			vUInt8 samples0_15 = v128_load_unaligned((const vSInt32*)(sample));
			vSInt32 samples0_3 = v128_shuffle_int8(samples0_15, unpackMask0);
			vSInt32 samples4_7 = v128_shuffle_int8(samples0_15, unpackMask1);
			vSInt32 samples8_11 = v128_shuffle_int8(samples0_15, unpackMask2);
			vSInt32 sum = v128_mul_int32(samples0_3, kernelTable[0]);
			sum = v128_add_int32(sum, v128_mul_int32(samples4_7, kernelTable[1]));
			sum = v128_add_int32(sum, v128_mul_int32(samples8_11, kernelTable[2]));
			sum = v128_add_int32(sum, v128_shift_right_unsigned_vec128<8>(sum)); // m128[0] + m128[1], m128[2] + m128[3], ....
			sum = v128_add_int32(sum, v128_shift_right_unsigned_vec128<4>(sum)); // m128[0] + m128[1] + m128[2] + m128[3], ....
			sum = v128_add_int32(sum, half);
			sum = v128_shift_right_signed_int32<16>(sum);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(sum, zero, zero); // for saturation
			*outputSample = v128_extract_int8<0>(packed_8);
			outputSample += 1;
			sample += inputPitch;
		} while (outputSample < outputSampleEnd);
		kernelTable += 3;
	}
}

#define LOAD_ROW(row, index0, index1, index2, index3) row_##row = v128_set_int8_packed(0, 0, 0, *samples8[index3], \
	0, 0, 0, *samples8[index2], \
	0, 0, 0, *samples8[index1], \
	0, 0, 0, *samples8[index0]); \
	samples8[index3] += inputPitch; \
	samples8[index2] += inputPitch; \
	samples8[index1] += inputPitch; \
	samples8[index0] += inputPitch;

#define CONVOLVE(index0, index1, index2, index3) LOAD_ROW(0, index0, index1, index2, index3) \
	LOAD_ROW(1, index0, index1, index2, index3) \
	LOAD_ROW(2, index0, index1, index2, index3) \
	LOAD_ROW(3, index0, index1, index2, index3) \
	vec_transpose_epi32(row_0, row_1, row_2, row_3); \
	result = v128_add_int32(result, v128_mul_int32(row_0, kernelTable[index0])); \
	result = v128_add_int32(result, v128_mul_int32(row_1, kernelTable[index1])); \
	result = v128_add_int32(result, v128_mul_int32(row_2, kernelTable[index2])); \
	result = v128_add_int32(result, v128_mul_int32(row_3, kernelTable[index3]));
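
// CONVOLVE(i0, i1, i2, i3) gathers one byte from each of the tap pointers
// samples8[i0..i3] for four consecutive input rows (advancing those pointers a
// row at a time), transposes the 4x4 group with vec_transpose_epi32 so each
// vector holds one tap across the four rows, and multiply-accumulates the
// matching coefficient vectors into `result`. Three CONVOLVE invocations cover
// all 12 taps and leave every tap pointer advanced by four rows.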
void adaptiveSeperableUnpadded8_12x4(const int32_t* sampleStarts, const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int input_height, unsigned int inputPitch, uint8_t* __restrict outputBuffer, int32_t sx, int32_t ex, int32_t sy, int32_t ey, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	SECURE_ASSERT(SafeUMul(ey - sy, 1U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(ex - sx, outputPitch) <= outputCapacity);
	__restrict const vSInt32* kernelTable = (vSInt32*)kernel->getTableFixedPoint4();
	kernelTable += (sx * 12);
	vSInt32 half = v128_set_int32(kHalf16);
	vSInt32 zero = v128_setzero();
	for( unsigned int x = sx; x < ex; x++ ) {
		int startX = sampleStarts[x];
		uint8_t* outputSample = outputBuffer + (x * outputPitch) + sy;
		const uint8_t* samples8[12];
		int32_t startIndex = startX < 0 ? 0 : startX;
		const uint8_t* sample = inputBuffer + (sy * inputPitch) + startIndex;
		IMAGECORE_UNUSED(samples8);
		IMAGECORE_UNUSED(kernelTable);
		for( int kernelIndex = 0; kernelIndex < 12; kernelIndex++ ) {
			int sampleIndex;
			if(startX < 0) {
				sampleIndex = kernelIndex + startX < 0 ? 0 : kernelIndex + startX;
			} else {
				sampleIndex = kernelIndex + startIndex < inputWidth ? kernelIndex : kernelIndex - (kernelIndex + startIndex - inputWidth + 1);
			}
			sampleIndex = min(sampleIndex, (int32_t)inputWidth - 1);
			samples8[kernelIndex] = &sample[sampleIndex];
		}
		uint8_t* outputSampleEnd = outputSample + (ey - sy);
		do {
			vSInt32 row_0;
			vSInt32 row_1;
			vSInt32 row_2;
			vSInt32 row_3;
			vSInt32 result = half;
			CONVOLVE(0, 1, 2, 3)
			CONVOLVE(4, 5, 6, 7)
			CONVOLVE(8, 9, 10, 11)
			// Shift back because of the multiplication we did.
			result = v128_shift_right_signed_int32<16>(result);
			// Pack back down (saturating), and store.
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			*(int*)(outputSample) = v128_extract_int32<0>(packed_8);
			outputSample += 4;
		} while( outputSample < outputSampleEnd );
		kernelTable += 12;
	}
}

void adaptiveSeperableUnpadded8_12x1(const int32_t* sampleStarts, const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int input_height, unsigned int inputPitch, uint8_t* __restrict outputBuffer, int32_t sx, int32_t ex, int32_t sy, int32_t ey, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	SECURE_ASSERT(SafeUMul(ey - sy, 1U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(ex - sx, outputPitch) <= outputCapacity);
	__restrict const vSInt32* kernelTable = (vSInt32*)kernel->getTableFixedPoint();
	kernelTable += (sx * 3);
	vSInt32 half = v128_set_int32(kHalf16);
	vSInt32 zero = v128_setzero();
	for( unsigned int x = sx; x < ex; x++ ) {
		int startX = sampleStarts[x];
		uint8_t* outputSample = outputBuffer + (x * outputPitch) + sy;
		const uint8_t* samples8[12];
		int32_t startIndex = startX < 0 ? 0 : startX;
		const uint8_t* sample = inputBuffer + (sy * inputPitch) + startIndex;
		IMAGECORE_UNUSED(samples8);
		IMAGECORE_UNUSED(kernelTable);
		for( int kernelIndex = 0; kernelIndex < 12; kernelIndex++ ) {
			int sampleIndex;
			if(startX < 0) {
				sampleIndex = kernelIndex + startX < 0 ? 0 : kernelIndex + startX;
			} else {
				sampleIndex = kernelIndex + startIndex < inputWidth ? kernelIndex : kernelIndex - (kernelIndex + startIndex - inputWidth + 1);
			}
			sampleIndex = min(sampleIndex, (int32_t)inputWidth - 1);
			samples8[kernelIndex] = &sample[sampleIndex];
		}
		uint8_t* outputSampleEnd = outputSample + (ey - sy);
		do {
			vSInt32 samples0_3;
			vSInt32 samples4_7;
			vSInt32 samples8_11;
			samples0_3 = v128_set_int8_packed(0, 0, 0, *samples8[3], 0, 0, 0, *samples8[2], 0, 0, 0, *samples8[1], 0, 0, 0, *samples8[0]);
			samples4_7 = v128_set_int8_packed(0, 0, 0, *samples8[7], 0, 0, 0, *samples8[6], 0, 0, 0, *samples8[5], 0, 0, 0, *samples8[4]);
			samples8_11 = v128_set_int8_packed(0, 0, 0, *samples8[11], 0, 0, 0, *samples8[10], 0, 0, 0, *samples8[9], 0, 0, 0, *samples8[8]);
			vSInt32 sum = v128_mul_int32(samples0_3, kernelTable[0]);
			sum = v128_add_int32(sum, v128_mul_int32(samples4_7, kernelTable[1]));
			sum = v128_add_int32(sum, v128_mul_int32(samples8_11, kernelTable[2]));
			sum = v128_add_int32(sum, v128_shift_right_unsigned_vec128<8>(sum));
			sum = v128_add_int32(sum, v128_shift_right_unsigned_vec128<4>(sum));
			sum = v128_add_int32(sum, half);
			sum = v128_shift_right_signed_int32<16>(sum);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(sum, zero, zero);
			*outputSample = v128_extract_int8<0>(packed_8);
			outputSample += 1;
			for( int kernelIndex = 0; kernelIndex < 12; kernelIndex++ ) {
				samples8[kernelIndex] += inputPitch;
			}
		} while( outputSample < outputSampleEnd );
		kernelTable += 3;
	}
}
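
// Unlike the padded 12x4/12x1 variants, the two Unpadded functions above build
// a per-tap pointer table and clamp every tap's column into [0, inputWidth - 1]
// (left-edge taps collapse onto column 0, right-edge taps are folded back
// inside the image), which is what lets the hybrid driver below run without a
// padded input buffer.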
// uses unpadded code version around the edges of the image and the faster padded version for the internal part
void adaptiveSeperableHybrid8_12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	int32_t unpaddedStartX = 0;
	int32_t unpaddedEndX = 0;
	int32_t unpaddedEndY = (outputHeight & (~3)); // optimized 4 lines at a time version
	int32_t* startX = new int32_t[outputWidth];
	// pre-calculate the start sampling points per kernel, keep track of the unpadded regions limits
	for(int32_t x = 0; x < outputWidth; x++) {
		startX[x] = kernel->computeSampleStart(x);
		if((startX[x] >= 0) && (unpaddedStartX == 0)) {
			unpaddedStartX = x;
		}
		int32_t endX = startX[x] + 12;
		if(endX < inputWidth) {
			unpaddedEndX = x;
		}
	}
	// 6 passes:
	// 1 - left edge 4 rows at a time (unpadded)
	// 2 - left edge remainder of the rows (unpadded)
	// 3 - middle section 4 rows at a time (padded)
	// 4 - middle section remainder of the rows (padded)
	// 5 - right edge 4 rows at a time (unpadded)
	// 6 - right edge remainder of the rows (unpadded)
	// left edge
	if(unpaddedStartX > 0) {
		adaptiveSeperableUnpadded8_12x4(startX, kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, 0, unpaddedStartX, 0, unpaddedEndY, outputPitch, outputCapacity);
		if( unpaddedEndY != outputHeight) {
			// leftover lines, one at a time
			adaptiveSeperableUnpadded8_12x1(startX, kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, 0, unpaddedStartX, unpaddedEndY, outputHeight, outputPitch, outputCapacity);
		}
	}
	// middle section, doesn't need unpadded code
	if(unpaddedEndX - unpaddedStartX > 0) {
		adaptiveSeperable8_12x4(startX, kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, unpaddedStartX, unpaddedEndX, 0, unpaddedEndY, outputPitch, outputCapacity);
		if( unpaddedEndY != outputHeight) {
			// leftover lines, one at a time
			adaptiveSeperable8_12x1(startX, kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, unpaddedStartX, unpaddedEndX, unpaddedEndY, outputHeight, outputPitch, outputCapacity);
		}
	}
	// right edge
	if(outputWidth - unpaddedEndX > 0) {
		adaptiveSeperableUnpadded8_12x4(startX, kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, unpaddedEndX, outputWidth, 0, unpaddedEndY, outputPitch, outputCapacity);
		if( unpaddedEndY != outputHeight) {
			// leftover lines, one at a time
			adaptiveSeperableUnpadded8_12x1(startX, kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, unpaddedEndX, outputWidth, unpaddedEndY, outputHeight, outputPitch, outputCapacity);
		}
	}
	delete[] startX;
}
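
// Region map for the hybrid pass structure above (summarizing the 6 passes):
//
//   x in [0, unpaddedStartX)            -> clamped (unpadded) loads, left edge
//   x in [unpaddedStartX, unpaddedEndX) -> fast path, the 12-tap window fits
//   x in [unpaddedEndX, outputWidth)    -> clamped (unpadded) loads, right edge
//
// Within each column range, rows [0, outputHeight & ~3) are processed four at a
// time and the remaining 0-3 rows one at a time.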
void adaptiveSeperable8_12(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int input_height, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
	// The seperable version writes transposed images.
	SECURE_ASSERT(SafeUMul(outputHeight, 1U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputWidth, outputPitch) <= outputCapacity);
	__restrict const vSInt32* kernelTable = (vSInt32*)kernel->getTableFixedPoint4();
	__m128i half = v128_set_int32(kHalf16);
	__m128i zero = v128_setzero();
	vSInt32 unpackMask0 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x00);
	vSInt32 unpackMask1 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x07, 0x80, 0x80, 0x80, 0x06, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x04);
	vSInt32 unpackMask2 = v128_set_int8_packed(0x80, 0x80, 0x80, 0x0B, 0x80, 0x80, 0x80, 0x0A, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x08);
	for( unsigned int x = 0; x < outputWidth; x++ ) {
		int startX = kernel->computeSampleStart(x);
		uint8_t* outputSample = outputBuffer + (x * outputPitch);
		const uint8_t* sample = inputBuffer + startX;
		uint8_t* outputSampleEnd = outputSample + outputHeight;
		do {
			// Load and expand (8bit -> 32bit)
			// 4 distinct rows of 16 values each (only 12 used, since this is a 12-wide filter).
			// 0 1 2 3
			// G G G G | G G G G | G G G G | X X X X (row 0123) dot (coeffs 0-12) = 1st output sample
			// 4 5 6 7
			// G G G G | G G G G | G G G G | X X X X (row 4567) dot (coeffs 0-12) = 2nd output sample
			// 8 9 A B
			// G G G G | G G G G | G G G G | X X X X (row 89AB) dot (coeffs 0-12) = 3rd output sample
			// C D E F
			// G G G G | G G G G | G G G G | X X X X (row CDEF) dot (coeffs 0-12) = 4th output sample
			vUInt8 row_0123 = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 0));
			vSInt32 row_0 = v128_shuffle_int8(row_0123, unpackMask0);
			vSInt32 row_1 = v128_shuffle_int8(row_0123, unpackMask1);
			vSInt32 row_2 = v128_shuffle_int8(row_0123, unpackMask2);
			vUInt8 row_4567 = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 1));
			vSInt32 row_4 = v128_shuffle_int8(row_4567, unpackMask0);
			vSInt32 row_5 = v128_shuffle_int8(row_4567, unpackMask1);
			vSInt32 row_6 = v128_shuffle_int8(row_4567, unpackMask2);
			vUInt8 row_89AB = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 2));
			vSInt32 row_8 = v128_shuffle_int8(row_89AB, unpackMask0);
			vSInt32 row_9 = v128_shuffle_int8(row_89AB, unpackMask1);
			vSInt32 row_A = v128_shuffle_int8(row_89AB, unpackMask2);
			vUInt8 row_CDEF = v128_load_unaligned((const vSInt32*)(sample + inputPitch * 3));
			vSInt32 row_C = v128_shuffle_int8(row_CDEF, unpackMask0);
			vSInt32 row_D = v128_shuffle_int8(row_CDEF, unpackMask1);
			vSInt32 row_E = v128_shuffle_int8(row_CDEF, unpackMask2);
			vSInt32 result;
			// Transpose the rows so each multiply/add is operating on 4 of the same element, 1 from each row
			// This is more efficient than treating each row separately and needing to do horizontal addition within the vectors.
			// The coefficients are replicated 4 times in each vector, so we have 12 vectors for 12 coefficients.
			// i.e.
			// (A0 A1 A2 A3) dot (X0 X1 X2 X3) = (Ar 0 0 0)
			// (B0 B1 B2 B3) dot (X0 X1 X2 X3) = (Br 0 0 0)
			// (C0 C1 C2 C3) dot (X0 X1 X2 X3) = (Cr 0 0 0)
			// (D0 D1 D2 D3) dot (X0 X1 X2 X3) = (Dr 0 0 0)
			// becomes:
			// (A0 B0 C0 D0) * (X0 X0 X0 X0) +
			// (A1 B1 C1 D1) * (X1 X1 X1 X1) +
			// (A2 B2 C2 D2) * (X2 X2 X2 X2) +
			// (A3 B3 C3 D3) * (X3 X3 X3 X3) = (Ar Br Cr Dr)
			// Fixed point fun. 8.0 * 16.16. Since the coefficients are normalized, worst case is 255*(Coeffs0+Coeffs1+...CoeffN=65536), so we won't overflow. Avoid shifting back until later.
			vec_transpose_epi32(row_0, row_4, row_8, row_C);
			result = v128_mul_int32(row_0, kernelTable[0]);
			result = v128_add_int32(result, v128_mul_int32(row_4, kernelTable[1]));
			result = v128_add_int32(result, v128_mul_int32(row_8, kernelTable[2]));
			result = v128_add_int32(result, v128_mul_int32(row_C, kernelTable[3]));
			vec_transpose_epi32(row_1, row_5, row_9, row_D);
			result = v128_add_int32(result, v128_mul_int32(row_1, kernelTable[4]));
			result = v128_add_int32(result, v128_mul_int32(row_5, kernelTable[5]));
			result = v128_add_int32(result, v128_mul_int32(row_9, kernelTable[6]));
			result = v128_add_int32(result, v128_mul_int32(row_D, kernelTable[7]));
			vec_transpose_epi32(row_2, row_6, row_A, row_E);
			result = v128_add_int32(result, v128_mul_int32(row_2, kernelTable[8]));
			result = v128_add_int32(result, v128_mul_int32(row_6, kernelTable[9]));
			result = v128_add_int32(result, v128_mul_int32(row_A, kernelTable[10]));
			result = v128_add_int32(result, v128_mul_int32(row_E, kernelTable[11]));
			result = v128_add_int32(result, half);
			// Shift back because of the multiplication we did.
			result = v128_shift_right_signed_int32<16>(result);
			// Pack back down (saturating), and store.
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(result, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			*(int*)(outputSample) = v128_extract_int32<0>(packed_8);
			outputSample += 4;
			sample += inputPitch * 4;
		} while (outputSample < outputSampleEnd);
		kernelTable += 12;
	}
}

template<>
void Filters<ComponentSIMD<1>>::adaptiveSeperable(const FilterKernelAdaptive* kernel, const uint8_t* __restrict inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t* __restrict outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity, bool unpadded)
{
#if IMAGECORE_DETECT_SSE
	if( !checkForCPUSupport(kCPUFeature_SSE4_1)) {
		return Filters<ComponentScalar<1>>::adaptiveSeperable(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity, unpadded);
	}
#endif
	unsigned int kernelSize = kernel->getKernelSize();
	if( kernelSize == 8U ) {
		// TODO
		SECURE_ASSERT(0);
	} else if( kernelSize == 12U ) {
		if(unpadded) {
			adaptiveSeperableHybrid8_12(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
		} else {
			adaptiveSeperable8_12(kernel, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
		}
	} else {
		SECURE_ASSERT(0);
	}
}
// 4 sample fixed filter
// 16.16 Fixed point SSE version
template<>
void Filters<ComponentSIMD<4>>::fixed4x4(const FilterKernelFixed *kernelX, const FilterKernelFixed *kernelY, const uint8_t *inputBuffer, unsigned int inputWidth, unsigned int inputHeight, unsigned int inputPitch, uint8_t *outputBuffer, unsigned int outputWidth, unsigned int outputHeight, unsigned int outputPitch, unsigned int outputCapacity)
{
#if IMAGECORE_DETECT_SSE
	if( !checkForCPUSupport(kCPUFeature_SSE4_1)) {
		return Filters<ComponentScalar<4>>::fixed4x4(kernelX, kernelY, inputBuffer, inputWidth, inputHeight, inputPitch, outputBuffer, outputWidth, outputHeight, outputPitch, outputCapacity);
	}
#endif
	SECURE_ASSERT(SafeUMul(outputWidth, 4U) <= outputPitch);
	SECURE_ASSERT(SafeUMul(outputHeight, outputPitch) <= outputCapacity);
	__restrict int32_t* kernelTableX = kernelX->getTableFixedPoint4();
	__restrict int32_t* kernelTableY = kernelY->getTableFixedPoint4();
	__m128i zero = v128_setzero();
	__m128i half = v128_set_int32(kHalf22);
	for( int y = 0; y < outputHeight; y++ ) {
		int sampleY = kernelY->computeSampleStart(y);
		for( int x = 0; x < outputWidth; x++ ) {
			int sampleX = kernelX->computeSampleStart(x);
			int sampleOffset = ((sampleY - 1) * (int)inputPitch) + (sampleX - 1) * 4;
			const uint8_t* sample = inputBuffer + sampleOffset;
			vSInt32 final = zero;
			unsigned int filterIndexX = x;
			filterIndexX *= 16;
			vSInt32 coeffs_x_0 = *(vSInt32*)(kernelTableX + filterIndexX + 0);
			vSInt32 coeffs_x_1 = *(vSInt32*)(kernelTableX + filterIndexX + 4);
			vSInt32 coeffs_x_2 = *(vSInt32*)(kernelTableX + filterIndexX + 8);
			vSInt32 coeffs_x_3 = *(vSInt32*)(kernelTableX + filterIndexX + 12);
			unsigned int filterIndexY = y;
			filterIndexY *= 16;
			vSInt32 coeffs_y_0 = *(vSInt32*)(kernelTableY + filterIndexY + 0);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_0));
				sample += inputPitch;
			}
			vSInt32 coeffs_y_1 = *(vSInt32*)(kernelTableY + filterIndexY + 4);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_1));
				sample += inputPitch;
			}
			vSInt32 coeffs_y_2 = *(vSInt32*)(kernelTableY + filterIndexY + 8);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_2));
				sample += inputPitch;
			}
			vSInt32 coeffs_y_3 = *(vSInt32*)(kernelTableY + filterIndexY + 12);
			{
				vUInt8 row_8 = v128_load_unaligned((const vSInt32*)sample);
				vUInt16 row_16_a = v128_unpacklo_int8(row_8, zero);
				vUInt16 row_16_b = v128_unpackhi_int8(row_8, zero);
				vSInt32 row_32_a = v128_unpacklo_int16(row_16_a, zero);
				vSInt32 row_32_b = v128_unpackhi_int16(row_16_a, zero);
				vSInt32 row_32_c = v128_unpacklo_int16(row_16_b, zero);
				vSInt32 row_32_d = v128_unpackhi_int16(row_16_b, zero);
				vSInt32 mul_a = v128_mul_int32(row_32_a, coeffs_x_0);
				vSInt32 mul_b = v128_mul_int32(row_32_b, coeffs_x_1);
				vSInt32 mul_c = v128_mul_int32(row_32_c, coeffs_x_2);
				vSInt32 mul_d = v128_mul_int32(row_32_d, coeffs_x_3);
				vSInt32 row = v128_shift_right_signed_int32<10>(v128_add_int32(mul_a, v128_add_int32(mul_b, v128_add_int32(mul_c, mul_d))));
				final = v128_add_int32(final, v128_mul_int32(row, coeffs_y_3));
			}
			final = v128_add_int32(final, half);
			final = v128_shift_right_signed_int32<22>(final);
			vSInt16 packed_16 = v128_pack_unsigned_saturate_int32(final, zero);
			vSInt8 packed_8 = v128_pack_unsigned_saturate_int16(packed_16, zero, zero);
			unsigned int oi = (y * outputPitch) + x * 4;
			int a = v128_convert_to_int32(packed_8);
			*(int*)(outputBuffer + oi) = a;
		}
	}
}
template<>
bool Filters<ComponentSIMD<4>>::fasterUnpadded(uint32_t kernelSize)
{
	return (kernelSize == 12);
}

template<>
bool Filters<ComponentSIMD<1>>::fasterUnpadded(uint32_t kernelSize)
{
	return false;
}

template<>
bool Filters<ComponentSIMD<4>>::supportsUnpadded(uint32_t kernelSize)
{
	return true;
}

template<>
bool Filters<ComponentSIMD<1>>::supportsUnpadded(uint32_t kernelSize)
{
	return true;
}

// explicit template instantiations
template class Filters<ComponentScalar<1>>;
template class Filters<ComponentScalar<2>>;
template class Filters<ComponentScalar<4>>;
template class Filters<ComponentSIMD<1>>;
template class Filters<ComponentSIMD<2>>;
template class Filters<ComponentSIMD<4>>;

}

#endif