// imagecore/image/internal/intrinsics_sse.h
/*
* MIT License
*
* Copyright (c) 2017 Twitter
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#else
#define vUInt8 __m128i
#define vUInt16 __m128i
#define vUInt32 __m128i
#define vUInt64 __m128i
#define vSInt8 __m128i
#define vSInt16 __m128i
#define vSInt32 __m128i
#define vSInt64 __m128i
#define vFloat __m128
#endif
#define v128 __m128
#define v128i __m128i
// for neon compatibility
#define vUInt8x8 __m128i
#define vMask128 __m128i
#define v64_load v64_load_unaligned
#define v64_shuffle_int8 v128_shuffle_int8
#define v64_add_int16 v128_add_int16
#define v64_shift_right_unsigned_int16 v128_shift_right_unsigned_int16
#define v64_pack_unsigned_saturate_int16 v128_pack_unsigned_saturate_int16
#define v64_convert_to_int32 v128_convert_to_int32
#define V64_MASK_LO(e7, e6, e5, e4, e3, e2, e1, e0) ((uint64_t)(e0) \
| ((uint64_t)(e1) << 8) \
| ((uint64_t)(e2) << 16) \
| ((uint64_t)(e3) << 24) \
| ((uint64_t)(e4) << 32) \
| ((uint64_t)(e5) << 40) \
| ((uint64_t)(e6) << 48) \
| ((uint64_t)(e7) << 56))
// for neon 64 bit compatibility; assumes the high mask only indexes the high 64 bits of the source 128 bit register
#define V64_MASK_HI(e7, e6, e5, e4, e3, e2, e1, e0) ((uint64_t)((e0) == 0x80 ? 0x80 : (e0) + 8) \
| ((uint64_t)((e1) == 0x80 ? 0x80 : (e1) + 8) << 8) \
| ((uint64_t)((e2) == 0x80 ? 0x80 : (e2) + 8) << 16) \
| ((uint64_t)((e3) == 0x80 ? 0x80 : (e3) + 8) << 24) \
| ((uint64_t)((e4) == 0x80 ? 0x80 : (e4) + 8) << 32) \
| ((uint64_t)((e5) == 0x80 ? 0x80 : (e5) + 8) << 40) \
| ((uint64_t)((e6) == 0x80 ? 0x80 : (e6) + 8) << 48) \
| ((uint64_t)((e7) == 0x80 ? 0x80 : (e7) + 8) << 56))
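// Illustrative use of the mask helpers above (a sketch, not part of the API): building a
// pshufb mask that reverses the bytes within each 64-bit half and applying it with
// v128_shuffle_int8 (declared below); "pixels" is any 16-byte vector. A lane value of 0x80
// zeroes that output byte, matching pshufb semantics.
//
//   vMask128 rev64 = v128_set_mask(V64_MASK_HI(0, 1, 2, 3, 4, 5, 6, 7),
//                                  V64_MASK_LO(0, 1, 2, 3, 4, 5, 6, 7));
//   v128i reversed = v128_shuffle_int8(pixels, rev64);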
#include <immintrin.h>
#include <emmintrin.h>
#include <cpuid.h>
// note: this is not a full transpose; the two middle 32 bit elements of each output register still need to be swapped (see the example after the macro)
#define vec_transpose_int8(r0, r1, r2, r3, c0, c1, c2, c3) \
{ \
vSInt8 u0 = v128_unpacklo_int8(r0, r1); \
vSInt8 u1 = v128_unpacklo_int8(r2, r3); \
vSInt8 u2 = v128_unpackhi_int8(r0, r1); \
vSInt8 u3 = v128_unpackhi_int8(r2, r3); \
vSInt8 t0 = v128_unpacklo_int16(u0, u1); \
vSInt8 t1 = v128_unpacklo_int16(u2, u3); \
vSInt8 t2 = v128_unpackhi_int16(u0, u1); \
vSInt8 t3 = v128_unpackhi_int16(u2, u3); \
vSInt8 s0 = v128_unpacklo_int32(t0, t1); \
vSInt8 s1 = v128_unpacklo_int32(t2, t3); \
vSInt8 s2 = v128_unpackhi_int32(t0, t1); \
vSInt8 s3 = v128_unpackhi_int32(t2, t3); \
c0 = v128_unpacklo_int64(s0, s1); \
c1 = v128_unpackhi_int64(s0, s1); \
c2 = v128_unpacklo_int64(s2, s3); \
c3 = v128_unpackhi_int64(s2, s3); \
}
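// Example of completing the transpose (a sketch, not part of the API): after vec_transpose_int8,
// swap the two middle 32-bit elements of each output, assuming V128_SHUFFLE follows
// _MM_SHUFFLE argument order as the shared intrinsics header is expected to define it.
//
//   c0 = v128_shuffle_int32<V128_SHUFFLE(3, 1, 2, 0)>(c0);
//   c1 = v128_shuffle_int32<V128_SHUFFLE(3, 1, 2, 0)>(c1);
//   c2 = v128_shuffle_int32<V128_SHUFFLE(3, 1, 2, 0)>(c2);
//   c3 = v128_shuffle_int32<V128_SHUFFLE(3, 1, 2, 0)>(c3);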
#define vec_transpose_int16(r0, r1, r2, r3, c0, c1, c2, c3) \
{ \
vSInt16 t0 = v128_unpacklo_int16(r0, r1); \
vSInt16 t1 = v128_unpacklo_int16(r2, r3); \
vSInt16 t2 = v128_unpackhi_int16(r0, r1); \
vSInt16 t3 = v128_unpackhi_int16(r2, r3); \
vSInt16 s0 = v128_unpacklo_int32(t0, t1); \
vSInt16 s1 = v128_unpacklo_int32(t2, t3); \
vSInt16 s2 = v128_unpackhi_int32(t0, t1); \
vSInt16 s3 = v128_unpackhi_int32(t2, t3); \
c0 = v128_unpacklo_int64(s0, s1); \
c1 = v128_unpackhi_int64(s0, s1); \
c2 = v128_unpacklo_int64(s2, s3); \
c3 = v128_unpackhi_int64(s2, s3); \
}
// set
inline v128i v128_setzero()
{
return _mm_setzero_si128();
}
inline v128i v128_set_int32(int32_t a)
{
return _mm_set1_epi32(a);
}
inline v128i v128_set_int16(int16_t a)
{
return _mm_set1_epi16(a);
}
inline v128i v128_set_int8_packed(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}
inline vMask128 v128_set_mask(uint64_t high, uint64_t low)
{
vMask128 res;
res = _mm_set_epi64x((long long)high, (long long)low);
return res;
}
// ZMASK is expected to be defined by the including intrinsics header (the pshufb "zero this lane"
// sentinel, typically 0x80); the high 64 bits are don't-care for the v64 operations
inline v128i v64_set_int8_packed(int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
return _mm_set_epi8(ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, ZMASK, e7, e6, e5, e4, e3, e2, e1, e0);
}
// load
// loads 8 bytes into the low half of the register, keeping the high half of a
inline v128i v64_load_unaligned(vSInt32 a, const vSInt32* mem_addr)
{
return _mm_castps_si128(_mm_loadl_pi(_mm_castsi128_ps(a), (__m64 const*)mem_addr));
}
inline v128i v128_load_unaligned(const vSInt32* mem_addr)
{
return _mm_lddqu_si128((const __m128i*)mem_addr);
}
// store
inline void v64_store(vSInt32* mem_addr, v128i a)
{
_mm_storel_epi64((__m128i*)mem_addr, a);
}
inline void v128_store_unaligned(vSInt32* mem_addr, v128i a)
{
_mm_storeu_si128 ((__m128i*) mem_addr, a);
}
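// Illustrative round trip through the v64 compatibility layer (src, dst and mask are
// hypothetical; the mask could be built as in the V64_MASK_LO example above):
//
//   v128i pixels = v64_load(v128_setzero(), (const vSInt32*)src); // fills the low 8 bytes
//   v128i swizzled = v64_shuffle_int8(pixels, mask);
//   v64_store((vSInt32*)dst, swizzled);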
// conversions
inline int32_t v128_convert_to_int32(v128i a)
{
return _mm_cvtsi128_si32(a);
}
template<int lane>
inline int32_t v128_convert_lane_to_int32(v128i a)
{
v128i b;
b = _mm_srli_si128(a, lane * 4);
return _mm_cvtsi128_si32(b);
}
inline int64_t v128_convert_to_int64(v128i a)
{
#ifdef __x86_64__
return _mm_cvtsi128_si64(a);
#else
// use an unaligned store: a local int64_t array is not guaranteed to be 16-byte aligned,
// which _mm_store_si128 would require
int64_t val[2];
_mm_storeu_si128((v128i*)val, a);
return val[0];
#endif
}
template<int imm8>
int v128_extract_int32(v128i a)
{
return _mm_extract_epi32(a, imm8);
}
template<int imm8>
int v128_extract_int8(v128i a)
{
return _mm_extract_epi8(a, imm8);
}
// math
inline v128i v128_add_int16(v128i a, v128i b)
{
return _mm_add_epi16(a, b);
}
inline v128i v128_add_int32(v128i a, v128i b)
{
return _mm_add_epi32(a, b);
}
inline v128i v128_mul_int16(v128i a, v128i b)
{
return _mm_mullo_epi16(a, b);
}
inline v128i v128_mul_int32(v128i a, v128i b)
{
return _mm_mullo_epi32(a, b);
}
// unpack
inline v128i v128_unpacklo_int8(v128i a, v128i b)
{
return _mm_unpacklo_epi8(a, b);
}
inline v128i v128_unpackhi_int8(v128i a, v128i b)
{
return _mm_unpackhi_epi8(a, b);
}
inline v128i v128_unpacklo_int16(v128i a, v128i b)
{
return _mm_unpacklo_epi16(a, b);
}
inline v128i v128_unpackhi_int16(v128i a, v128i b)
{
return _mm_unpackhi_epi16(a, b);
}
inline v128i v128_unpacklo_int32(v128i a, v128i b)
{
return _mm_unpacklo_epi32(a, b);
}
inline v128i v128_unpackhi_int32(v128i a, v128i b)
{
return _mm_unpackhi_epi32(a, b);
}
inline v128i v128_unpacklo_int64(v128i a, v128i b)
{
return _mm_unpacklo_epi64(a, b);
}
inline v128i v128_unpackhi_int64(v128i a, v128i b)
{
return _mm_unpackhi_epi64(a, b);
}
// pack
// the third argument is unused on SSE; it is presumably kept so the signature matches the NEON counterpart
inline v128i v128_pack_unsigned_saturate_int16(v128i a, v128i b, v128i)
{
return _mm_packus_epi16(a, b);
}
inline v128i v128_pack_unsigned_saturate_int32(v128i a, v128i b)
{
return _mm_packus_epi32(a, b);
}
// shift
template<int imm>
inline v128i v128_shift_right_unsigned_int16(v128i a)
{
return _mm_srli_epi16(a, imm);
}
template<int imm8>
inline v128i v128_shift_right_unsigned_vec128(v128i a)
{
return _mm_srli_si128(a, imm8);
}
template<int imm>
inline v128i v128_shift_right_signed_int32(v128i a)
{
return _mm_srai_epi32(a, imm);
}
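// Illustrative byte average built from the helpers above (a sketch, not part of the API):
// widen to 16-bit, add, halve, then re-pack with unsigned saturation; "zero" is v128_setzero().
//
//   v128i lo = v128_add_int16(v128_unpacklo_int8(a, zero), v128_unpacklo_int8(b, zero));
//   v128i hi = v128_add_int16(v128_unpackhi_int8(a, zero), v128_unpackhi_int8(b, zero));
//   lo = v128_shift_right_unsigned_int16<1>(lo);
//   hi = v128_shift_right_unsigned_int16<1>(hi);
//   v128i avg = v128_pack_unsigned_saturate_int16(lo, hi, zero);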
// shuffles
template<int imm>
inline v128i v128_shuffle_int32(v128i a)
{
return _mm_shuffle_epi32(a, imm);
}
inline v128i v128_shuffle_int8(v128i a, v128i b)
{
return _mm_shuffle_epi8(a, b);
}
inline v128i v128_merge(v128i a, v128i b)
{
return _mm_shuffle_epi8(a, b);
}
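// Illustrative channel swap with v128_shuffle_int8: swapping the R and B bytes of four
// interleaved RGBA pixels ("rgba" is hypothetical; each mask lane holds the source byte
// index for that output byte):
//
//   v128i swap_rb = v128_set_int8_packed(15, 12, 13, 14, 11, 8, 9, 10,
//                                        7, 4, 5, 6, 3, 0, 1, 2);
//   v128i bgra = v128_shuffle_int8(rgba, swap_rb);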
// special cases for cross platform compatibility
// widens 16 unsigned bytes in c to two vectors of eight 16-bit values; the (3, 1, 2, 0) swizzle
// reorders the 32-bit groups so that, assuming V128_SHUFFLE follows _MM_SHUFFLE argument order
// (it is expected to come from the shared intrinsics header), groups 0 and 2 of c land in a and
// groups 1 and 3 in b; zero must be a zeroed register
inline void v128_swizzleAndUnpack(vUInt16& a, vUInt16& b, vUInt8 c, vSInt32 zero)
{
__m128i swizzled = v128_shuffle_int32<V128_SHUFFLE(3, 1, 2, 0)>(c);
a = v128_unpacklo_int8(swizzled, zero);
b = v128_unpackhi_int8(swizzled, zero);
}
inline void v128_unpack_int8(vSInt8& a, vSInt8& b, vUInt8 c, vUInt8 d)
{
a = v128_unpacklo_int8(c, d);
b = v128_unpackhi_int8(c, d);
}