Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing 8-bit vector type aliases and associated min/max calls #240

Open
wants to merge 12 commits into
base: 1.4
Choose a base branch
from
Open
112 changes: 94 additions & 18 deletions Vc/avx/detail.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,49 +244,112 @@ Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const short *mem, when_unaligned,
     enable_if<(std::is_same<DstT, short>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // Unaligned 32-byte load of 16 short elements.
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const short *mem, when_aligned,
     enable_if<(std::is_same<DstT, short>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // 32-byte-aligned load of 16 short elements.
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const short *mem, when_streaming,
     enable_if<(std::is_same<DstT, short>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // Non-temporal (streaming) load of 16 short elements.
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const ushort *mem, when_unaligned,
     enable_if<(std::is_same<DstT, ushort>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // Unaligned 32-byte load of 16 ushort elements.
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const ushort *mem, when_aligned,
     enable_if<(std::is_same<DstT, ushort>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // 32-byte-aligned load of 16 ushort elements.
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const ushort *mem, when_streaming,
     enable_if<(std::is_same<DstT, ushort>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // Non-temporal (streaming) load of 16 ushort elements.
    return AvxIntrinsics::stream_load<__m256i>(mem);
}


//chars

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const schar *mem, when_unaligned,
     enable_if<(std::is_same<DstT, schar>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // Unaligned 32-byte load of 32 signed 8-bit elements. Uses schar rather
    // than char so this overload matches the EntryType used elsewhere in the
    // PR (cmpeq tag dispatch, math.h schar_v) — with DstT == char the
    // enable_if would never fire for an schar vector. TODO(review): confirm
    // the 8-bit alias is Vector<schar>, not Vector<char>.
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const schar *mem, when_aligned,
     enable_if<(std::is_same<DstT, schar>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // 32-byte-aligned load of 32 signed 8-bit elements. schar (not char) to
    // stay consistent with the schar tag used by cmpeq and math.h's schar_v.
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const schar *mem, when_streaming,
     enable_if<(std::is_same<DstT, schar>::value && std::is_same<V, __m256i>::value)> =
         nullarg)
{
    // Streaming load of 32 signed 8-bit elements. schar (not char) to stay
    // consistent with the schar tag used by cmpeq and math.h's schar_v.
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

// Unaligned 32-byte load of 32 unsigned 8-bit elements.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const uchar *mem, when_unaligned,
enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m256i>::value)> =
nullarg)
{
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

// 32-byte-aligned load of 32 unsigned 8-bit elements.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const uchar *mem, when_aligned,
enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m256i>::value)> =
nullarg)
{
return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

// Non-temporal (streaming) load of 32 unsigned 8-bit elements.
template <typename V, typename DstT>
Vc_INTRINSIC __m256i
load(const uchar *mem, when_streaming,
enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m256i>::value)> =
nullarg)
{
return AvxIntrinsics::stream_load<__m256i>(mem);
}
Expand Down Expand Up @@ -699,6 +762,8 @@ Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
// 8-bit element adds. The tag is schar (not char) to match the schar tag used
// by cmpeq and the schar_v alias in math.h: an schar tag argument against a
// `char` overload would prefer integral *promotion* to the int overload
// (add_epi32, wrong element width) over the char *conversion*.
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, schar) { return AVX::add_epi8(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uchar) { return AVX::add_epi8(a, b); }

// sub{{{1
Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); }
Expand All @@ -707,6 +772,8 @@ Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
// 8-bit element subs; schar tag for the same overload-resolution reason as in
// add() — a char tag would lose to promotion into the int overload.
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, schar) { return AVX::sub_epi8(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uchar) { return AVX::sub_epi8(a, b); }

// mul{{{1
Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); }
Expand Down Expand Up @@ -787,6 +854,8 @@ Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics:
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
// Equality is sign-agnostic, so a single epi8 compare serves both 8-bit types.
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, schar) { return AvxIntrinsics::cmpeq_epi8(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uchar) { return AvxIntrinsics::cmpeq_epi8(a, b); }

// cmpneq{{{1
Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
Expand Down Expand Up @@ -1319,6 +1388,13 @@ template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
{
return _pext_u32(movemask(k), 0x55555555u);
}
#else
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
{
    // Non-BMI2 fallback for a 16x16-bit mask. It must produce the same bit
    // order as the _pext_u32(movemask(k), 0x55555555) variant above: bit i ==
    // mask entry i. Narrowing the 16 16-bit entries to 16 bytes (low lane
    // first, saturating pack keeps 0/-1 as 0/0xFF) and collecting the byte
    // sign bits gives exactly that. Collecting two movemask_ps results into
    // separate bytes instead would interleave even/odd entries.
    return _mm_movemask_epi8(
        _mm_packs_epi16(_mm256_castsi256_si128(k), _mm256_extractf128_si256(k, 1)));
}
#endif
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
{
Expand Down
89 changes: 58 additions & 31 deletions Vc/avx/intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,15 +150,32 @@ namespace AvxIntrinsics
#endif
}

template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
return _mm256_extracti128_si256(a, offset);
#else
return _mm256_extractf128_si256(a, offset);
#endif
}
// Dispatch helper for extract128: selects one 128-bit lane of a 256-bit
// register. Lane 0 is handled by the specialization below via no-op casts;
// this primary template handles lane 1 with the extract instructions.
template<int offset>
struct extract128_impl{
    // Only lane 1 may reach the primary template (lane 0 is specialized).
    // Keep the message argument: the message-less static_assert form is
    // C++17-only, while Vc 1.4 still builds as C++11/14.
    static_assert(offset == 1, "extract128: 128-bit lane index must be 0 or 1");
    static Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_extractf128_ps(a, offset); }
    static Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
    static Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
        // AVX2 has an integer-domain extract; prefer it over the float-domain one.
        return _mm256_extracti128_si256(a, offset);
#else
        return _mm256_extractf128_si256(a, offset);
#endif
    }
};

// offset == 0: the low lane is the register itself, so a cast — which
// generates no instruction — replaces the vextractf128/vextracti128.
template<>
struct extract128_impl<0>{
static Vc_INTRINSIC __m128 extract128(__m256 a) { return _mm256_castps256_ps128(a); }
static Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_castpd256_pd128(a); }
static Vc_INTRINSIC __m128i extract128(__m256i a) { return _mm256_castsi256_si128(a); }
};

// Public entry points: return the 128-bit lane selected by `offset` (0 or 1),
// dispatching through extract128_impl so lane 0 compiles to a no-op cast.
template <int offset> Vc_INTRINSIC __m128 extract128(__m256 a) { return extract128_impl<offset>::extract128(a); }
template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return extract128_impl<offset>::extract128(a); }
template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) { return extract128_impl<offset>::extract128(a); }

/////////////////////// COMPARE OPS ///////////////////////
#ifdef Vc_GCC
Expand Down Expand Up @@ -226,8 +243,8 @@ namespace AvxIntrinsics
template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
{
    // Emulated 256-bit alignr: apply the SSE byte-wise alignr to each
    // 128-bit lane independently and reassemble.
    return insert128<1>(
        _mm256_castsi128_si256(
            _mm_alignr_epi8(extract128<0>(s1), extract128<0>(s2), shift)),
        _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
}
#endif
Expand Down Expand Up @@ -264,7 +281,7 @@ namespace AvxIntrinsics
Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) \
{ \
__m128i a1 = extract128<1>(a0); \
__m128i r0 = _mm_##name(_mm256_castsi256_si128(a0)); \
__m128i r0 = _mm_##name(extract128<0>(a0)); \
__m128i r1 = _mm_##name(a1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
Expand All @@ -280,23 +297,23 @@ namespace AvxIntrinsics
{ \
m128i a1 = extract128<1>(a0); \
m128i b1 = extract128<1>(b0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
m128i r0 = _mm_##name(extract128<0>(a0), extract128<0>(b0)); \
m128i r1 = _mm_##name(a1, b1); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_256_128(name) \
Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0); \
m128i r0 = _mm_##name(extract128<0>(a0), b0); \
m128i r1 = _mm_##name(a1, b0); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
#define Vc_AVX_TO_SSE_1i(name) \
template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0) \
{ \
m128i a1 = extract128<1>(a0); \
m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i); \
m128i r0 = _mm_##name(extract128<0>(a0), i); \
m128i r1 = _mm_##name(a1, i); \
return insert128<1>(_mm256_castsi128_si256(r0), r1); \
}
Expand Down Expand Up @@ -338,11 +355,14 @@ namespace AvxIntrinsics
// Lane-wise SSE emulations of these AVX2 integer ops (two _mm_ calls each);
// add_epi8/sub_epi8 are instantiated here for the new 8-bit vector types.
Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi8)
Vc_AVX_TO_SSE_2_NEW(add_epi16)
Vc_AVX_TO_SSE_2_NEW(add_epi32)
Vc_AVX_TO_SSE_2_NEW(add_epi64)
Vc_AVX_TO_SSE_2_NEW(sub_epi8)
Vc_AVX_TO_SSE_2_NEW(sub_epi16)
Vc_AVX_TO_SSE_2_NEW(sub_epi32)
Vc_AVX_TO_SSE_2_NEW(sub_epi64)
Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi16)
Vc_AVX_TO_SSE_2_NEW(sign_epi32)
Expand Down Expand Up @@ -392,23 +412,17 @@ namespace AvxIntrinsics

Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
    // Byte sign bits: low lane fills bits 0-15, high lane fills bits 16-31.
    return (_mm_movemask_epi8(extract128<1>(a0)) << 16) |
           _mm_movemask_epi8(extract128<0>(a0));
}
template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
{
    // The low byte of the immediate selects within the low lane, the high
    // byte within the high lane.
    m128i r0 = _mm_blend_epi16(extract128<0>(a0), extract128<0>(b0), m & 0xff);
    m128i r1 = _mm_blend_epi16(extract128<1>(a0), extract128<1>(b0), m >> 8);
    return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
    // Byte-wise variable blend, emulated per 128-bit lane.
    m128i r0 = _mm_blendv_epi8(extract128<0>(a0), extract128<0>(b0), extract128<0>(m0));
    m128i r1 = _mm_blendv_epi8(extract128<1>(a0), extract128<1>(b0), extract128<1>(m0));
    return insert128<1>(_mm256_castsi128_si256(r0), r1);
}
// mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
Expand Down Expand Up @@ -454,6 +468,9 @@ static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
// Unsigned 8-bit greater-than: no epu8 compare exists, so both operands are
// biased by xor with setmin_epi8() (flipping the sign bit), which maps the
// unsigned ordering onto the signed ordering of cmpgt_epi8.
static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
}
// a < b on unsigned bytes is simply b > a with the operands swapped.
static Vc_INTRINSIC m256i cmplt_epu8(__m256i a, __m256i b) {
return cmpgt_epu8(b, a);
}
#if defined(Vc_IMPL_XOP)
Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
Expand Down Expand Up @@ -504,13 +521,23 @@ static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask,
}
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
    using namespace AVX;
    // Emulated masked store via read-modify-write with a byte-wise blend.
    // NOTE(review): unlike the maskmoveu it replaces, this touches ALL 32
    // bytes at mem — it reads and rewrites lanes the mask deselects. Confirm
    // no caller relies on masked-out lanes being left untouched (concurrent
    // writers, ends of allocations). Also _mm256_blendv_epi8 requires AVX2 —
    // verify this path is not reachable from AVX1-only builds.
    __m256i tmp = _mm256_loadu_si256(reinterpret_cast<__m256i *>(mem));
    tmp = _mm256_blendv_epi8(tmp, v, mask);
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), tmp);
}
// unsigned short delegates to the short overload; the underlying store
// operates on bytes, so element signedness is irrelevant.
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}

// 8-bit elements delegate to the short overload: the underlying store is
// byte-granular, so the element width does not matter — but this assumes the
// mask is valid per byte (each 8-bit lane fully set or fully clear).
static Vc_INTRINSIC void _mm256_maskstore(char *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}

// Same byte-granular delegation as the char overload.
static Vc_INTRINSIC void _mm256_maskstore(unsigned char *mem, const __m256i mask, const __m256i v) {
_mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}


#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
Expand Down Expand Up @@ -556,7 +583,7 @@ Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
    // Masked streaming store of 8 floats, done as two 128-bit halves.
    stream_store(mem    , extract128<0>(value), extract128<0>(mask));
    stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
Expand All @@ -565,7 +592,7 @@ Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
    // Masked streaming store of 4 doubles, done as two 128-bit halves.
    stream_store(mem    , extract128<0>(value), extract128<0>(mask));
    stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
Expand All @@ -574,7 +601,7 @@ Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
    // Masked streaming store of 32 bytes, done as two 128-bit halves.
    stream_store(mem, extract128<0>(value), extract128<0>(mask));
    stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
}

Expand Down
4 changes: 4 additions & 0 deletions Vc/avx/math.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,14 @@ Vc_ALWAYS_INLINE AVX2::int_v min(const AVX2::int_v &x, const AVX2::int_v
Vc_ALWAYS_INLINE AVX2::uint_v min(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_min_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v min(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_min_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
// 8-bit min/max for the new schar_v/uchar_v aliases. _mm256_{min,max}_ep[iu]8
// are AVX2 instructions — presumably this group is inside the AVX2 guard that
// the #endif below closes; confirm.
Vc_ALWAYS_INLINE AVX2::schar_v min(const AVX2::schar_v &x, const AVX2::schar_v &y) { return _mm256_min_epi8(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uchar_v min(const AVX2::uchar_v &x, const AVX2::uchar_v &y) { return _mm256_min_epu8(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::int_v max(const AVX2::int_v &x, const AVX2::int_v &y) { return _mm256_max_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v max(const AVX2::uint_v &x, const AVX2::uint_v &y) { return _mm256_max_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v max(const AVX2::short_v &x, const AVX2::short_v &y) { return _mm256_max_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::schar_v max(const AVX2::schar_v &x, const AVX2::schar_v &y) { return _mm256_max_epi8(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uchar_v max(const AVX2::uchar_v &x, const AVX2::uchar_v &y) { return _mm256_max_epu8(x.data(), y.data()); }
#endif
Vc_ALWAYS_INLINE AVX2::float_v min(const AVX2::float_v &x, const AVX2::float_v &y) { return _mm256_min_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
Expand Down
2 changes: 1 addition & 1 deletion Vc/avx/simd_cast_caller.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ template <typename T>
template <typename U>
Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
Common::enable_if_mask_converts_explicitly<T, U>)
: Mask(simd_cast<Mask>(std::forward<U>(rhs)))
: d(simd_cast<Mask>(std::forward<U>(rhs)))
{
}
#endif // Vc_IS_VERSION_1
Expand Down
Loading