in cpp/src/Spaces/Euclidean.h [68:95]
static float L2SqrSIMD16Ext(const float *pVect1, const float *pVect2,
const size_t qty) {
float PORTABLE_ALIGN64 TmpRes[16];
size_t qty16 = qty >> 4;
const float *pEnd1 = pVect1 + (qty16 << 4);
__m512 diff, v1, v2;
__m512 sum = _mm512_set1_ps(0);
while (pVect1 < pEnd1) {
v1 = _mm512_loadu_ps(pVect1);
pVect1 += 16;
v2 = _mm512_loadu_ps(pVect2);
pVect2 += 16;
diff = _mm512_sub_ps(v1, v2);
// sum = _mm512_fmadd_ps(diff, diff, sum);
sum = _mm512_add_ps(sum, _mm512_mul_ps(diff, diff));
}
_mm512_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] +
TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] +
TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] +
TmpRes[15];
return (res);
}