in imagecore/image/internal/filters_intrinsics.cpp [345:432]
// Transposes an image with 2 bytes per pixel: input pixel (x, y) is written to output pixel (y, x).
//
// The bulk of the image is processed with SIMD in blocks of 8 pixels (16 bytes) wide by 4 rows tall.
// Whatever does not fit into whole blocks (the right-hand columns, the bottom rows and the bottom-right
// corner) is delegated to Filters<ComponentScalar<2>>::transpose, as is the whole image when it is
// smaller than a single block in either direction.
//
//   input_buffer    - first byte of the source image
//   output_buffer   - first byte of the destination image (receives `width` rows of `height` pixels)
//   width, height   - source dimensions in pixels
//   input_pitch     - distance in bytes between consecutive source rows
//   output_pitch    - distance in bytes between consecutive destination rows
//   output_capacity - forwarded unchanged to the scalar transpose; it is not consulted by the SIMD path
static void transpose2x16(const uint8_t* __restrict input_buffer, uint8_t* __restrict output_buffer, unsigned int width, unsigned int height, unsigned int input_pitch, unsigned int output_pitch, unsigned int output_capacity)
{
	const uint32_t h_blocks = width / 8;  // number of whole 8-pixel-wide block columns
	const uint32_t v_blocks = height / 4; // number of whole 4-row block bands
	if((h_blocks == 0) || (v_blocks == 0)) {
		// Not even one full 8x4 block: let the scalar implementation handle everything.
		Filters<ComponentScalar<2>>::transpose(input_buffer, output_buffer, width, height, input_pitch, output_pitch, output_capacity);
		return;
	}

	for(uint32_t v_index = 0; v_index < v_blocks; v_index++) {
		for(uint32_t h_index = 0; h_index < h_blocks; h_index++) {
			// Source block: 4 rows of 16 bytes. Destination block: 8 rows of 8 bytes.
			const uint8_t* srcBlock = input_buffer + v_index * 4 * input_pitch + h_index * 16;
			uint8_t* dstBlock = output_buffer + h_index * 8 * output_pitch + v_index * 8;

			vUInt16 srcRow0 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 0)); // a0 b0 c0 d0 e0 f0 g0 h0
			vUInt16 srcRow1 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 1)); // a1 b1 c1 d1 e1 f1 g1 h1
			vUInt16 srcRow2 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 2)); // a2 b2 c2 d2 e2 f2 g2 h2
			vUInt16 srcRow3 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 3)); // a3 b3 c3 d3 e3 f3 g3 h3

			vSInt64 dstRow0; // a0 a1 a2 a3 e0 e1 e2 e3
			vSInt64 dstRow1; // b0 b1 b2 b3 f0 f1 f2 f3
			vSInt64 dstRow2; // c0 c1 c2 c3 g0 g1 g2 g3
			vSInt64 dstRow3; // d0 d1 d2 d3 h0 h1 h2 h3
			vec_transpose_int16(srcRow0, srcRow1, srcRow2, srcRow3, dstRow0, dstRow1, dstRow2, dstRow3);

			// Each destination row of the block takes two 32-bit words: pixels from source rows 0-1, then 2-3.
			// Rows are stepped in BYTES by output_pitch. Stepping a uint32_t* by output_pitch / 4 would
			// truncate and land on the wrong row whenever output_pitch is not a multiple of 4.
			// NOTE(review): as before, these are plain 32-bit stores through a cast pointer; they assume
			// the target tolerates 4-byte stores at these addresses.
			uint8_t* dstLine = dstBlock;
			*(uint32_t*)(dstLine) = v128_convert_to_int32(dstRow0);             // a0 a1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<1>(dstRow0); // a2 a3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_to_int32(dstRow1);             // b0 b1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<1>(dstRow1); // b2 b3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_to_int32(dstRow2);             // c0 c1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<1>(dstRow2); // c2 c3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_to_int32(dstRow3);             // d0 d1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<1>(dstRow3); // d2 d3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_lane_to_int32<2>(dstRow0);     // e0 e1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<3>(dstRow0); // e2 e3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_lane_to_int32<2>(dstRow1);     // f0 f1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<3>(dstRow1); // f2 f3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_lane_to_int32<2>(dstRow2);     // g0 g1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<3>(dstRow2); // g2 g3
			dstLine += output_pitch;
			*(uint32_t*)(dstLine) = v128_convert_lane_to_int32<2>(dstRow3);     // h0 h1
			*(uint32_t*)(dstLine + 4) = v128_convert_lane_to_int32<3>(dstRow3); // h2 h3
		}
	}

	// The top-left fullCols x fullRows region is done. The three remaining regions go through the
	// scalar transpose; each source offset (x, y) maps to destination offset (y, x).
	const uint32_t fullCols = h_blocks * 8;     // columns covered by the SIMD blocks
	const uint32_t fullRows = v_blocks * 4;     // rows covered by the SIMD blocks
	const uint32_t colsLeft = width - fullCols; // 0..7 trailing columns
	const uint32_t rowsLeft = height - fullRows; // 0..3 trailing rows
	// NOTE(review): output_capacity is passed through unadjusted even though the destination
	// pointers below are offset into the buffer - confirm against the scalar implementation's check.

	// Top right: trailing columns of the rows that were covered by blocks.
	const uint8_t* topRightInput = input_buffer + 2 * fullCols;
	uint8_t* topRightOutput = output_buffer + fullCols * output_pitch;
	Filters<ComponentScalar<2>>::transpose(topRightInput, topRightOutput, colsLeft, fullRows, input_pitch, output_pitch, output_capacity);

	// Bottom left: trailing rows of the columns that were covered by blocks.
	const uint8_t* botLeftInput = input_buffer + fullRows * input_pitch;
	uint8_t* botLeftOutput = output_buffer + 2 * fullRows;
	Filters<ComponentScalar<2>>::transpose(botLeftInput, botLeftOutput, fullCols, rowsLeft, input_pitch, output_pitch, output_capacity);

	// Bottom right: trailing rows of the trailing columns.
	const uint8_t* botRightInput = input_buffer + fullRows * input_pitch + 2 * fullCols;
	uint8_t* botRightOutput = output_buffer + fullCols * output_pitch + 2 * fullRows;
	Filters<ComponentScalar<2>>::transpose(botRightInput, botRightOutput, colsLeft, rowsLeft, input_pitch, output_pitch, output_capacity);
}