static void transpose2x16()

in imagecore/image/internal/filters_intrinsics.cpp [345:432]


// Transposes an image whose pixels are 2 bytes wide (16 bits per pixel).
//
// The part of the image that divides evenly into tiles of 8 pixels (16 bytes) wide by 4 rows tall is
// transposed with SIMD. Whatever is left over (right edge, bottom edge, and the corner where they meet)
// is forwarded to Filters<ComponentScalar<2>>::transpose. If the image is smaller than one tile in
// either direction, the whole image is handed to the scalar routine instead.
//
// input_buffer    - source pixels; declared __restrict, so it must not alias output_buffer.
// output_buffer   - destination pixels; source column x becomes destination row x.
// width, height   - source dimensions in pixels.
// input_pitch     - distance in bytes between the starts of consecutive source rows.
// output_pitch    - distance in bytes between the starts of consecutive destination rows.
// output_capacity - forwarded unchanged to the scalar routine; the SIMD path itself never consults it.
static void transpose2x16(const uint8_t* __restrict input_buffer, uint8_t* __restrict output_buffer, unsigned int width, unsigned int height, unsigned int input_pitch, unsigned int output_pitch, unsigned int output_capacity)
{
	const uint32_t h_blocks = width / 8;
	const uint32_t v_blocks = height / 4;
	if((h_blocks > 0) && (v_blocks > 0)) {
		for(uint32_t v_index = 0; v_index < v_blocks; v_index++ ) {
			for(uint32_t h_index = 0; h_index < h_blocks; h_index++) {
				// Source tile: 4 rows of 8 pixels (16 bytes each).
				// Destination tile: 8 rows of 4 pixels (8 bytes each).
				const uint8_t* srcBlock = input_buffer + v_index * 4 * input_pitch + h_index * 16;
				uint8_t* dstBlock = output_buffer + h_index * 8 * output_pitch + v_index * 8;
				vUInt16 srcRow0 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 0)); // a0 b0 c0 d0  e0 f0 g0 h0
				vUInt16 srcRow1 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 1)); // a1 b1 c1 d1  e1 f1 g1 h1
				vUInt16 srcRow2 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 2)); // a2 b2 c2 d2  e2 f2 g2 h2
				vUInt16 srcRow3 = v128_load_unaligned((const vSInt32 *)(srcBlock + input_pitch * 3)); // a3 b3 c3 d3  e3 f3 g3 h3

				vSInt64 dstRow0;
				vSInt64 dstRow1;
				vSInt64 dstRow2;
				vSInt64 dstRow3;
				vec_transpose_int16(srcRow0, srcRow1, srcRow2, srcRow3, dstRow0, dstRow1, dstRow2, dstRow3); // a0 a1 a2 a3  e0 e1 e2 e3
																											 // b0 b1 b2 b3  f0 f1 f2 f3
																											 // c0 c1 c2 c3  g0 g1 g2 g3
																											 // d0 d1 d2 d3  h0 h1 h2 h3

				// Each 32-bit lane carries two pixels. The tile is written as two 4-byte wide columns of
				// eight rows each: first the column at byte offset 0, then the column at byte offset 4.
				//
				// Rows are stepped with a byte pointer advanced by the full output_pitch. Stepping a
				// uint32_t* by output_pitch / 4 would truncate the stride whenever output_pitch is not a
				// multiple of 4 and drift away from the rows the scalar remainder passes write to.
				//
				// NOTE(review): the 32-bit stores are not guaranteed to be 4-byte aligned; this relies on
				// the target tolerating unaligned stores.
				uint8_t* outRow = dstBlock;
				*(uint32_t*)outRow = v128_convert_to_int32(dstRow0);         // a0 a1
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_to_int32(dstRow1);         // b0 b1
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_to_int32(dstRow2);         // c0 c1
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_to_int32(dstRow3);         // d0 d1
				outRow += output_pitch;

				*(uint32_t*)outRow = v128_convert_lane_to_int32<2>(dstRow0); // e0 e1
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<2>(dstRow1); // f0 f1
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<2>(dstRow2); // g0 g1
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<2>(dstRow3); // h0 h1

				// Second column: same eight rows, 4 bytes (two pixels) further right.
				outRow = dstBlock + 4;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<1>(dstRow0); // a2 a3
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<1>(dstRow1); // b2 b3
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<1>(dstRow2); // c2 c3
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<1>(dstRow3); // d2 d3
				outRow += output_pitch;

				*(uint32_t*)outRow = v128_convert_lane_to_int32<3>(dstRow0); // e2 e3
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<3>(dstRow1); // f2 f3
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<3>(dstRow2); // g2 g3
				outRow += output_pitch;
				*(uint32_t*)outRow = v128_convert_lane_to_int32<3>(dstRow3); // h2 h3
			}
		}

		// The SIMD loops covered the top-left tiledCols x tiledRows pixels. The three remaining
		// rectangles (any of which may be empty) go through the scalar routine.
		const uint32_t colsLeft = width - h_blocks * 8;
		const uint32_t rowsLeft = height - v_blocks * 4;
		const uint32_t tiledCols = width - colsLeft;  // pixels per row already handled
		const uint32_t tiledRows = height - rowsLeft; // rows already handled

		// Top right: leftover columns beside the tiled rows.
		const uint8_t* topRightInput = input_buffer + 2 * tiledCols;
		uint8_t* topRightOutput = output_buffer + tiledCols * output_pitch;
		Filters<ComponentScalar<2>>::transpose(topRightInput, topRightOutput, colsLeft, tiledRows, input_pitch, output_pitch, output_capacity);

		// Bottom left: leftover rows beneath the tiled columns.
		const uint8_t* botLeftInput = input_buffer + tiledRows * input_pitch;
		uint8_t* botLeftOutput = output_buffer + 2 * tiledRows;
		Filters<ComponentScalar<2>>::transpose(botLeftInput, botLeftOutput, tiledCols, rowsLeft, input_pitch, output_pitch, output_capacity);

		// Bottom right: the corner where leftover rows and leftover columns meet.
		const uint8_t* botRightInput = input_buffer + tiledRows * input_pitch + 2 * tiledCols;
		uint8_t* botRightOutput = output_buffer + tiledCols * output_pitch + 2 * tiledRows;
		Filters<ComponentScalar<2>>::transpose(botRightInput, botRightOutput, colsLeft, rowsLeft, input_pitch, output_pitch, output_capacity);
	} else {
		// Too small for even one 8x4 tile: let the scalar routine handle everything.
		Filters<ComponentScalar<2>>::transpose(input_buffer, output_buffer, width, height, input_pitch, output_pitch, output_capacity);
	}
}