static float InnerProductSIMD4Ext()

in cpp/src/Spaces/InnerProduct.h [133:184]


static float InnerProductSIMD4Ext(const float *pVect1, const float *pVect2,
                                  const size_t qty) {
  float PORTABLE_ALIGN32 TmpRes[8];

  size_t qty16 = qty / 16;
  size_t qty4 = qty / 4;

  const float *pEnd1 = pVect1 + 16 * qty16;
  const float *pEnd2 = pVect1 + 4 * qty4;

  __m128 v1, v2;
  __m128 sum_prod = _mm_set1_ps(0);

  while (pVect1 < pEnd1) {
    v1 = _mm_loadu_ps(pVect1);
    pVect1 += 4;
    v2 = _mm_loadu_ps(pVect2);
    pVect2 += 4;
    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

    v1 = _mm_loadu_ps(pVect1);
    pVect1 += 4;
    v2 = _mm_loadu_ps(pVect2);
    pVect2 += 4;
    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

    v1 = _mm_loadu_ps(pVect1);
    pVect1 += 4;
    v2 = _mm_loadu_ps(pVect2);
    pVect2 += 4;
    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));

    v1 = _mm_loadu_ps(pVect1);
    pVect1 += 4;
    v2 = _mm_loadu_ps(pVect2);
    pVect2 += 4;
    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
  }

  while (pVect1 < pEnd2) {
    v1 = _mm_loadu_ps(pVect1);
    pVect1 += 4;
    v2 = _mm_loadu_ps(pVect2);
    pVect2 += 4;
    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
  }

  _mm_store_ps(TmpRes, sum_prod);
  float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];

  return 1.0f - sum;
}