in include/hash/xxhash.h [1234:1288]
static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
{
acc += input * XXH_PRIME32_2;
acc = XXH_rotl32(acc, 13);
acc *= XXH_PRIME32_1;
#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
/*
* UGLY HACK:
* This inline assembly hack forces acc into a normal register. This is the
* only thing that prevents GCC and Clang from autovectorizing the XXH32
* loop (pragmas and attributes don't work for some resason) without globally
* disabling SSE4.1.
*
* The reason we want to avoid vectorization is because despite working on
* 4 integers at a time, there are multiple factors slowing XXH32 down on
* SSE4:
* - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
* newer chips!) making it slightly slower to multiply four integers at
* once compared to four integers independently. Even when pmulld was
* fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
* just to multiply unless doing a long operation.
*
* - Four instructions are required to rotate,
* movqda tmp, v // not required with VEX encoding
* pslld tmp, 13 // tmp <<= 13
* psrld v, 19 // x >>= 19
* por v, tmp // x |= tmp
* compared to one for scalar:
* roll v, 13 // reliably fast across the board
* shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
*
* - Instruction level parallelism is actually more beneficial here because
* the SIMD actually serializes this operation: While v1 is rotating, v2
* can load data, while v3 can multiply. SSE forces them to operate
* together.
*
* How this hack works:
* __asm__("" // Declare an assembly block but don't declare any instructions
* : // However, as an Input/Output Operand,
* "+r" // constrain a read/write operand (+) as a general purpose register (r).
* (acc) // and set acc as the operand
* );
*
* Because of the 'r', the compiler has promised that seed will be in a
* general purpose register and the '+' says that it will be 'read/write',
* so it has to assume it has changed. It is like volatile without all the
* loads and stores.
*
* Since the argument has to be in a normal register (not an SSE register),
* each time XXH32_round is called, it is impossible to vectorize.
*/
__asm__("" : "+r" (acc));
#endif
return acc;
}