in imagecore/image/internal/conversions.cpp [124:215]
// Converts an RGBA image to YUV 4:2:0, writing a full-resolution Y plane and an
// interleaved U/V plane that holds one U byte and one V byte per 2x2 block of
// source pixels. The vector path handles a 4-column by 2-row tile per
// iteration; columns beyond the last multiple of four are handed to the scalar
// Conversions<false>::rgba_to_yuv420 implementation.
//
//   dstY          - destination Y plane, one byte per pixel.
//   dstUV         - destination interleaved U/V plane (U first, then V).
//   srcRGBA       - source pixels, 4 bytes per pixel.
//   inputWidth    - image width in pixels.
//   inputHeight   - image height in pixels; rows are consumed two at a time.
//   inputPitch    - byte distance between consecutive source rows.
//   outputPitchY  - byte distance between consecutive Y rows.
//   outputPitchUV - byte distance between consecutive U/V rows (one U/V row
//                   per pair of source rows).
//
// NOTE(review): because rows are processed in pairs, an odd inputHeight makes
// the last iteration touch one row past the image in both source and Y
// destination - presumably callers always pass an even height; confirm.
static void rgba_to_yuv420x4(uint8_t* dstY, uint8_t* dstUV, const uint8_t* srcRGBA, uint32_t inputWidth, uint32_t inputHeight, uint32_t inputPitch, uint32_t outputPitchY, uint32_t outputPitchUV)
{
	// Largest multiple of four that fits in the width; only these columns go
	// through the vector path.
	uint32_t columns_processed = inputWidth & (~3);
	if(columns_processed) { // 4 pixels wide version
		vSInt32 zero = v128_setzero();
		// Fixed-point weights scaled by 256 (76/256 ~ 0.299, 150/256 ~ 0.587,
		// 29/256 ~ 0.114); every weighted sum is shifted right by 8 afterwards.
		vSInt16 coeff_ry = v128_set_int16(76);
		vSInt16 coeff_gy = v128_set_int16(150);
		vSInt16 coeff_by = v128_set_int16(29);
		vSInt16 coeff_ru = v128_set_int16(-43);
		vSInt16 coeff_gu = v128_set_int16(-84);
		vSInt16 coeff_bu = v128_set_int16(127);
		vSInt16 coeff_rv = v128_set_int16(127);
		vSInt16 coeff_gv = v128_set_int16(-106);
		vSInt16 coeff_bv = v128_set_int16(-21);
		// Offset added to the chroma results after the shift.
		vSInt16 uv_bias = v128_set_int16(128);
		// Byte-selection mask used to narrow the eight 16-bit results to bytes;
		// presumably it also restores pixel order (0..7) from the transposed
		// 0,2,4,6,1,3,5,7 lane order - TODO confirm against v128_merge.
		vUInt8 mergeMask = v128_set_int8_packed(ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, 14, 6, 12, 4, 10, 2, 8, 0);

		// Walk the image with local cursors so that the caller-supplied base
		// pointers are still intact for the scalar tail below.
		uint8_t* rowY = dstY;
		uint8_t* rowUV = dstUV;
		const uint8_t* rowRGBA = srcRGBA;

		for(uint32_t row = 0; row < inputHeight; row += 2) {
			uint8_t* outputY0 = rowY;
			uint8_t* outputY1 = rowY + outputPitchY;
			uint8_t* outputUV = rowUV;
			const uint8_t* inputRGBA0 = rowRGBA;
			const uint8_t* inputRGBA1 = rowRGBA + inputPitch;

			// Stop at columns_processed, not inputWidth: each iteration loads
			// and stores a full group of four pixels, so running up to a width
			// that is not a multiple of four would overrun the row.
			for(uint32_t column = 0; column < columns_processed; column += 4) {
				// Four RGBA pixels (16 bytes) from each of the two rows.
				vSInt8 row0 = v128_load_unaligned((const vSInt32*)inputRGBA0);
				vSInt8 row1 = v128_load_unaligned((const vSInt32*)inputRGBA1);
				inputRGBA0 += 16;
				inputRGBA1 += 16;

				// Widen the 8-bit channels to 16 bits by interleaving with zero.
				vSInt8 row0_pixels_01;
				vSInt8 row0_pixels_23;
				v128_unpack_int8(row0_pixels_01, row0_pixels_23, row0, zero);
				vSInt8 row1_pixels_45;
				vSInt8 row1_pixels_67;
				v128_unpack_int8(row1_pixels_45, row1_pixels_67, row1, zero);

				// Regroup into one vector per channel. Pixels 0-3 come from the
				// upper row and pixels 4-7 from the lower row.
				vSInt64 r0r2r4r6r1r3r5r7;
				vSInt64 g0g2g4g6g1g3g5g7;
				vSInt64 b0b2b4b6b1b3b5b7;
				vSInt64 a0a2a4a6a1a3a5a7;
				IMAGECORE_UNUSED(a0a2a4a6a1a3a5a7); // alpha is produced by the transpose but never used
				vec_transpose_int16(row0_pixels_01, row0_pixels_23, row1_pixels_45, row1_pixels_67, r0r2r4r6r1r3r5r7, g0g2g4g6g1g3g5g7, b0b2b4b6b1b3b5b7, a0a2a4a6a1a3a5a7);

				// Y = (76 * R + 150 * G + 29 * B) >> 8
				vSInt16 y0y2y4y6y1y3y5y7 = v128_mul_int16(r0r2r4r6r1r3r5r7, coeff_ry);
				y0y2y4y6y1y3y5y7 = v128_add_int16(y0y2y4y6y1y3y5y7, v128_mul_int16(g0g2g4g6g1g3g5g7, coeff_gy));
				y0y2y4y6y1y3y5y7 = v128_add_int16(y0y2y4y6y1y3y5y7, v128_mul_int16(b0b2b4b6b1b3b5b7, coeff_by));
				y0y2y4y6y1y3y5y7 = v128_shift_right_unsigned_int16<8>(y0y2y4y6y1y3y5y7);

				// U = ((-43 * R - 84 * G + 127 * B) >> 8) + 128
				vSInt16 u0u2u4u6u1u3u5u7 = v128_mul_int16(r0r2r4r6r1r3r5r7, coeff_ru);
				u0u2u4u6u1u3u5u7 = v128_add_int16(u0u2u4u6u1u3u5u7, v128_mul_int16(g0g2g4g6g1g3g5g7, coeff_gu));
				u0u2u4u6u1u3u5u7 = v128_add_int16(u0u2u4u6u1u3u5u7, v128_mul_int16(b0b2b4b6b1b3b5b7, coeff_bu));
				u0u2u4u6u1u3u5u7 = v128_shift_right_unsigned_int16<8>(u0u2u4u6u1u3u5u7);
				u0u2u4u6u1u3u5u7 = v128_add_int16(u0u2u4u6u1u3u5u7, uv_bias);

				// V = ((127 * R - 106 * G - 21 * B) >> 8) + 128
				vSInt16 v0v2v4v6v1v3v5v7 = v128_mul_int16(r0r2r4r6r1r3r5r7, coeff_rv);
				v0v2v4v6v1v3v5v7 = v128_add_int16(v0v2v4v6v1v3v5v7, v128_mul_int16(g0g2g4g6g1g3g5g7, coeff_gv));
				v0v2v4v6v1v3v5v7 = v128_add_int16(v0v2v4v6v1v3v5v7, v128_mul_int16(b0b2b4b6b1b3b5b7, coeff_bv));
				v0v2v4v6v1v3v5v7 = v128_shift_right_unsigned_int16<8>(v0v2v4v6v1v3v5v7);
				v0v2v4v6v1v3v5v7 = v128_add_int16(v0v2v4v6v1v3v5v7, uv_bias);

				// Narrow Y to bytes: the low 32 bits go to the upper row, the
				// high 32 bits to the lower row.
				// NOTE(review): these 32-bit stores assume unaligned uint32_t
				// writes are acceptable on the target when dstY / outputPitchY
				// are not 4-byte aligned - confirm.
				vUInt8x8 y0y1y2y3y4y5y6y7 = v128_merge(y0y2y4y6y1y3y5y7, mergeMask);
				type64 y;
				y.m_64 = v128_convert_to_int64(y0y1y2y3y4y5y6y7);
				*(uint32_t*)outputY0 = y.m_32[0];
				*(uint32_t*)outputY1 = y.m_32[1];
				outputY0 += 4;
				outputY1 += 4;

				// Chroma is subsampled 2x2: average two horizontally adjacent
				// pixels from the upper row (bytes 0-3) with the two pixels
				// directly below them (bytes 4-7), then store as U,V pairs.
				vUInt8x8 u0u1u2u3u4u5u6u7 = v128_merge(u0u2u4u6u1u3u5u7, mergeMask);
				vUInt8x8 v0v1v2v3v4v5v6v7 = v128_merge(v0v2v4v6v1v3v5v7, mergeMask);
				type64 u;
				type64 v;
				u.m_64 = v128_convert_to_int64(u0u1u2u3u4u5u6u7);
				v.m_64 = v128_convert_to_int64(v0v1v2v3v4v5v6v7);
				*outputUV++ = ((u.m_8[0] + u.m_8[1] + u.m_8[4] + u.m_8[5]) >> 2);
				*outputUV++ = ((v.m_8[0] + v.m_8[1] + v.m_8[4] + v.m_8[5]) >> 2);
				*outputUV++ = ((u.m_8[2] + u.m_8[3] + u.m_8[6] + u.m_8[7]) >> 2);
				*outputUV++ = ((v.m_8[2] + v.m_8[3] + v.m_8[6] + v.m_8[7]) >> 2);
			}

			// Two source rows yield two Y rows and a single U/V row.
			rowY += outputPitchY * 2;
			rowUV += outputPitchUV;
			rowRGBA += inputPitch * 2;
		}
	}

	// Scalar tail for the 1-3 columns the vector path did not cover. It starts
	// from the untouched base pointers, offset to the first unprocessed column:
	// 1 byte per column in Y, 4 bytes per column in the source, and 2 bytes per
	// 2 columns (i.e. 1 byte per column) in the interleaved U/V plane, matching
	// the 4 U/V bytes the loop above writes per 4 columns.
	uint32_t columns_remaining = inputWidth - columns_processed;
	if(columns_remaining) {
		Conversions<false>::rgba_to_yuv420(&dstY[columns_processed], &dstUV[columns_processed], &srcRGBA[columns_processed * 4], columns_remaining, inputHeight, inputPitch, outputPitchY, outputPitchUV);
	}
}