static float InnerProductSIMD16Ext()

in cpp/src/Spaces/InnerProduct.h [221:252]


static float InnerProductSIMD16Ext(const float *pVect1, const float *pVect2,
                                   const size_t qty) {
  float PORTABLE_ALIGN32 TmpRes[8];

  size_t qty16 = qty / 16;

  const float *pEnd1 = pVect1 + 16 * qty16;

  __m256 sum256 = _mm256_set1_ps(0);

  while (pVect1 < pEnd1) {
    //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

    __m256 v1 = _mm256_loadu_ps(pVect1);
    pVect1 += 8;
    __m256 v2 = _mm256_loadu_ps(pVect2);
    pVect2 += 8;
    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));

    v1 = _mm256_loadu_ps(pVect1);
    pVect1 += 8;
    v2 = _mm256_loadu_ps(pVect2);
    pVect2 += 8;
    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2));
  }

  _mm256_store_ps(TmpRes, sum256);
  float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] +
              TmpRes[5] + TmpRes[6] + TmpRes[7];

  return 1.0f - sum;
}