XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2()

in include/hash/xxhash.h [3230:3263]


XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
    (void)(&XXH_writeLE64);
    XXH_PREFETCH(customSecret);
    {   __m256i const seed = _mm256_set_epi64x(-(xxh_i64)seed64, (xxh_i64)seed64, -(xxh_i64)seed64, (xxh_i64)seed64);

        XXH_ALIGN(64) const __m256i* const src  = (const __m256i*) XXH3_kSecret;
        XXH_ALIGN(64)       __m256i*       dest = (      __m256i*) customSecret;

#       if defined(__GNUC__) || defined(__clang__)
        /*
         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
         *   - do not extract the secret from sse registers in the internal loop
         *   - use less common registers, and avoid pushing these reg into stack
         * The asm hack causes Clang to assume that XXH3_kSecretPtr aliases with
         * customSecret, and on aarch64, this prevented LDP from merging two
         * loads together for free. Putting the loads together before the stores
         * properly generates LDP.
         */
        __asm__("" : "+r" (dest));
#       endif

        /* GCC -O2 need unroll loop manually */
        dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
        dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
        dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
        dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
        dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
        dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
    }
}