static void transpose1x16()

in imagecore/image/internal/filters_intrinsics.cpp [240:326]


// Transposes a 1-byte-per-pixel image: the pixel at input (row r, column c) is
// written to output (row c, column r).
//
// The bulk of the image is processed in tiles of 16 columns x 4 rows using
// 128-bit vector loads and a byte transpose; whatever does not fit into whole
// tiles (right edge, bottom edge, bottom-right corner) is delegated to
// Filters<ComponentScalar<1>>::transpose. If the image is too small to hold a
// single tile, the whole image is delegated to the scalar path.
//
//   input_buffer    - first byte of the source image
//   output_buffer   - first byte of the destination image
//   width, height   - source dimensions in pixels
//   input_pitch     - distance in bytes between consecutive source rows
//   output_pitch    - distance in bytes between consecutive destination rows
//   output_capacity - only forwarded to the scalar transpose; not examined here
static void transpose1x16(const uint8_t* __restrict input_buffer, uint8_t* __restrict output_buffer, unsigned int width, unsigned int height, unsigned int input_pitch, unsigned int output_pitch, unsigned int output_capacity)
{
	const uint32_t h_blocks = width / 16;
	const uint32_t v_blocks = height / 4;
	if((h_blocks > 0) && (v_blocks > 0)) {
		for(uint32_t v_index = 0; v_index < v_blocks; v_index++ ) {
			for(uint32_t h_index = 0; h_index < h_blocks; h_index++) {
				// A 16-wide, 4-tall source tile becomes a 4-wide, 16-tall destination tile.
				const uint8_t* srcBlock = input_buffer + v_index * 4 * input_pitch + h_index * 16;
				uint8_t* dstBlock = output_buffer + h_index * 16 * output_pitch + v_index * 4;
				vSInt8 srcRow0 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 0)); // a0 b0 c0 d0  e0 f0 g0 h0  i0 j0 k0 l0  m0 n0 o0 p0
				vSInt8 srcRow1 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 1)); // a1 b1 c1 d1  e1 f1 g1 h1  i1 j1 k1 l1  m1 n1 o1 p1
				vSInt8 srcRow2 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 2)); // a2 b2 c2 d2  e2 f2 g2 h2  i2 j2 k2 l2  m2 n2 o2 p2
				vSInt8 srcRow3 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 3)); // a3 b3 c3 d3  e3 f3 g3 h3  i3 j3 k3 l3  m3 n3 o3 p3

				vSInt64 dstRow0;
				vSInt64 dstRow1;
				vSInt64 dstRow2;
				vSInt64 dstRow3;
				vec_transpose_int8(srcRow0, srcRow1, srcRow2, srcRow3, dstRow0, dstRow1, dstRow2, dstRow3); // a0 a1 a2 a3  e0 e1 e2 e3  i0 i1 i2 i3  m0 m1 m2 m3
																											// b0 b1 b2 b3  f0 f1 f2 f3  j0 j1 j2 j3  n0 n1 n2 n3
																											// c0 c1 c2 c3  g0 g1 g2 g3  k0 k1 k2 k3  o0 o1 o2 o3
																											// d0 d1 d2 d3  h0 h1 h2 h3  l0 l1 l2 l3  p0 p1 p2 p3
				// Store the result as sixteen 4-byte rows. The destination is walked
				// with a byte pointer so that output_pitch does not have to be a
				// multiple of 4 (dividing the pitch by 4 would truncate it).
				// NOTE(review): lane N of dstRowK is assumed to hold the 32-bit group
				// shown in column N of the layout above (lane 1 = e..h, lane 2 = i..l,
				// lane 3 = m..p) - confirm against vec_transpose_int8.
				uint8_t* dstRowPtr = dstBlock;
				*(uint32_t*)dstRowPtr = v128_convert_to_int32(dstRow0);         // a0 a1 a2 a3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_to_int32(dstRow1);         // b0 b1 b2 b3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_to_int32(dstRow2);         // c0 c1 c2 c3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_to_int32(dstRow3);         // d0 d1 d2 d3
				dstRowPtr += output_pitch;

				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<1>(dstRow0); // e0 e1 e2 e3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<1>(dstRow1); // f0 f1 f2 f3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<1>(dstRow2); // g0 g1 g2 g3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<1>(dstRow3); // h0 h1 h2 h3
				dstRowPtr += output_pitch;

				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<2>(dstRow0); // i0 i1 i2 i3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<2>(dstRow1); // j0 j1 j2 j3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<2>(dstRow2); // k0 k1 k2 k3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<2>(dstRow3); // l0 l1 l2 l3
				dstRowPtr += output_pitch;

				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<3>(dstRow0); // m0 m1 m2 m3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<3>(dstRow1); // n0 n1 n2 n3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<3>(dstRow2); // o0 o1 o2 o3
				dstRowPtr += output_pitch;
				*(uint32_t*)dstRowPtr = v128_convert_lane_to_int32<3>(dstRow3); // p0 p1 p2 p3
			}
		}
		// finished the top left region, now need to do top right, bot left and bot right
		const uint32_t colsLeft = width - h_blocks * 16;  // columns not covered by whole 16-wide tiles
		const uint32_t rowsLeft = height - v_blocks * 4;  // rows not covered by whole 4-tall tiles

		// Right strip: leftover columns across the rows that were tiled.
		const uint32_t topRightWidth = colsLeft;
		const uint32_t topRightHeight = height - rowsLeft;
		const uint8_t* topRightInput = input_buffer + width - topRightWidth;
		uint8_t* topRightOutput = output_buffer + (width - topRightWidth) * output_pitch;
		Filters<ComponentScalar<1>>::transpose(topRightInput, topRightOutput, topRightWidth, topRightHeight, input_pitch, output_pitch, output_capacity); // top right

		// Bottom strip: leftover rows across the columns that were tiled.
		const uint32_t botLeftWidth = width - colsLeft;
		const uint32_t botLeftHeight = rowsLeft;
		const uint8_t* botLeftInput = input_buffer + (height - rowsLeft) * input_pitch;
		uint8_t* botLeftOutput = output_buffer + (height - rowsLeft);
		Filters<ComponentScalar<1>>::transpose(botLeftInput, botLeftOutput, botLeftWidth, botLeftHeight, input_pitch, output_pitch, output_capacity); // bot left

		// Corner: leftover columns x leftover rows.
		const uint32_t botRightWidth = colsLeft;
		const uint32_t botRightHeight = rowsLeft;
		const uint8_t* botRightInput = input_buffer + (height - rowsLeft) * input_pitch + (width - colsLeft);
		uint8_t* botRightOutput = output_buffer + (width - colsLeft) * output_pitch + (height - rowsLeft);
		Filters<ComponentScalar<1>>::transpose(botRightInput, botRightOutput, botRightWidth, botRightHeight, input_pitch, output_pitch, output_capacity); // cols/rows left
	} else {
		// Not even one full 16x4 tile fits: use the scalar implementation for everything.
		Filters<ComponentScalar<1>>::transpose(input_buffer, output_buffer, width, height, input_pitch, output_pitch, output_capacity);
	}
}