diff --git a/Vc/avx/detail.h b/Vc/avx/detail.h
index d2c72592b..6130d110e 100644
--- a/Vc/avx/detail.h
+++ b/Vc/avx/detail.h
@@ -244,49 +244,112 @@ Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
 }
 template <typename V, typename DstT>
-Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
-                          enable_if<(std::is_same<DstT, short>::value &&
-                                     std::is_same<V, __m256i>::value)> = nullarg)
+Vc_INTRINSIC __m256i
+load(const short *mem, when_unaligned,
+     enable_if<(std::is_same<DstT, short>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
 {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
 }
 template <typename V, typename DstT>
-Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
-                          enable_if<(std::is_same<DstT, short>::value &&
-                                     std::is_same<V, __m256i>::value)> = nullarg)
+Vc_INTRINSIC __m256i
+load(const short *mem, when_aligned,
+     enable_if<(std::is_same<DstT, short>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
 {
     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
 }
 template <typename V, typename DstT>
-Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
-                          enable_if<(std::is_same<DstT, short>::value &&
-                                     std::is_same<V, __m256i>::value)> = nullarg)
+Vc_INTRINSIC __m256i
+load(const short *mem, when_streaming,
+     enable_if<(std::is_same<DstT, short>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
 {
     return AvxIntrinsics::stream_load<__m256i>(mem);
 }
 template <typename V, typename DstT>
-Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
-                          enable_if<(std::is_same<DstT, ushort>::value &&
-                                     std::is_same<V, __m256i>::value)> = nullarg)
+Vc_INTRINSIC __m256i
+load(const ushort *mem, when_unaligned,
+     enable_if<(std::is_same<DstT, ushort>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
 {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
 }
 template <typename V, typename DstT>
-Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
-                          enable_if<(std::is_same<DstT, ushort>::value &&
-                                     std::is_same<V, __m256i>::value)> = nullarg)
+Vc_INTRINSIC __m256i
+load(const ushort *mem, when_aligned,
+     enable_if<(std::is_same<DstT, ushort>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
 {
     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
 }
 template <typename V, typename DstT>
-Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
-                          enable_if<(std::is_same<DstT, ushort>::value &&
-                                     std::is_same<V, __m256i>::value)> = nullarg)
+Vc_INTRINSIC __m256i
+load(const ushort *mem, when_streaming,
+     enable_if<(std::is_same<DstT, ushort>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
+{
+    return AvxIntrinsics::stream_load<__m256i>(mem);
+}
+
+// chars
+
+template <typename V, typename DstT>
+Vc_INTRINSIC __m256i
+load(const char *mem, when_unaligned,
+     enable_if<(std::is_same<DstT, char>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
+{
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
+}
+
+template <typename V, typename DstT>
+Vc_INTRINSIC __m256i
+load(const char *mem, when_aligned,
+     enable_if<(std::is_same<DstT, char>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
+{
+    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
+}
+
+template <typename V, typename DstT>
+Vc_INTRINSIC __m256i
+load(const char *mem, when_streaming,
+     enable_if<(std::is_same<DstT, char>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
+{
+    return AvxIntrinsics::stream_load<__m256i>(mem);
+}
+
+template <typename V, typename DstT>
+Vc_INTRINSIC __m256i
+load(const uchar *mem, when_unaligned,
+     enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
+{
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
+}
+
+template <typename V, typename DstT>
+Vc_INTRINSIC __m256i
+load(const uchar *mem, when_aligned,
+     enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
+{
+    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
+}
+
+template <typename V, typename DstT>
+Vc_INTRINSIC __m256i
+load(const uchar *mem, when_streaming,
+     enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m256i>::value)> =
+         nullarg)
 {
     return AvxIntrinsics::stream_load<__m256i>(mem);
 }
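Not part of the patch: a sketch of how the new byte loads are reached from user code. It assumes the new uchar_v is wired into the generic Vector(mem, flags) constructor the same way short_v already is; Vc::Aligned, Vc::Unaligned and Vc::Streaming are the existing load flags that select the when_aligned, when_unaligned and when_streaming overloads above.

#include <Vc/Vc>

void load_examples(const unsigned char *mem)  // assumed 32-byte aligned, >= 33 bytes
{
    Vc::AVX2::uchar_v a(mem, Vc::Aligned);        // -> load(..., when_aligned)
    Vc::AVX2::uchar_v u(mem + 1, Vc::Unaligned);  // -> load(..., when_unaligned)
    Vc::AVX2::uchar_v s(mem, Vc::Streaming);      // -> stream_load<__m256i>
    (void)a; (void)u; (void)s;
}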
@@ -699,6 +762,8 @@ Vc_INTRINSIC __m256i add(__m256i a, __m256i b,    int) { return AVX::add_epi32(a
 Vc_INTRINSIC __m256i add(__m256i a, __m256i b,   uint) { return AVX::add_epi32(a, b); }
 Vc_INTRINSIC __m256i add(__m256i a, __m256i b,  short) { return AVX::add_epi16(a, b); }
 Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
+Vc_INTRINSIC __m256i add(__m256i a, __m256i b,   char) { return AVX::add_epi8 (a, b); }
+Vc_INTRINSIC __m256i add(__m256i a, __m256i b,  uchar) { return AVX::add_epi8 (a, b); }
 
 // sub{{{1
 Vc_INTRINSIC __m256  sub(__m256  a, __m256  b,  float) { return _mm256_sub_ps(a, b); }
@@ -707,6 +772,8 @@ Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,    int) { return AVX::sub_epi32(a
 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,   uint) { return AVX::sub_epi32(a, b); }
 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,  short) { return AVX::sub_epi16(a, b); }
 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
+Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,   char) { return AVX::sub_epi8 (a, b); }
+Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,  uchar) { return AVX::sub_epi8 (a, b); }
 
 // mul{{{1
 Vc_INTRINSIC __m256  mul(__m256  a, __m256  b,  float) { return _mm256_mul_ps(a, b); }
@@ -787,6 +854,8 @@ Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,    int) { return AvxIntrinsics:
 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,   uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
+Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  schar) { return AvxIntrinsics::cmpeq_epi8 (a, b); }
+Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  uchar) { return AvxIntrinsics::cmpeq_epi8 (a, b); }
 
 // cmpneq{{{1
 Vc_INTRINSIC __m256  cmpneq(__m256  a, __m256  b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
@@ -1319,6 +1388,13 @@ template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
 {
     return _pext_u32(movemask(k), 0x55555555u);
 }
+#else
+template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
+{
+    // no BMI2: pack the 16 16-bit mask fields down to 8 bit (signed saturation
+    // maps 0xffff -> 0xff and 0 -> 0, preserving element order) and take one
+    // bit per byte
+    return _mm_movemask_epi8(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
+}
 #endif
 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
 {
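Not part of the patch: a scalar model of the BMI2 path in mask_to_int<16>. movemask on the 32 bytes yields two identical bits per 16-bit mask field; _pext_u32 with the pattern 0x55555555 keeps every even bit, leaving one bit per field in element order — which is also the order the packs_epi16-based fallback has to reproduce.

#include <cassert>
#include <cstdint>

uint32_t pext_0x55555555(uint32_t m)  // scalar model of _pext_u32(m, 0x55555555u)
{
    uint32_t r = 0;
    for (int i = 0; i < 16; ++i) {
        r |= ((m >> (2 * i)) & 1u) << i;  // bit 2*i of m becomes bit i of r
    }
    return r;
}

int main()
{
    // mask fields 0, 3 and 15 set: each set field contributes two movemask bits
    const uint32_t movemask_bits = 0x3u | (0x3u << 6) | (0x3u << 30);
    assert(pext_0x55555555(movemask_bits) == ((1u << 0) | (1u << 3) | (1u << 15)));
}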
diff --git a/Vc/avx/intrinsics.h b/Vc/avx/intrinsics.h
index 492e6b0ea..eeb9b5444 100644
--- a/Vc/avx/intrinsics.h
+++ b/Vc/avx/intrinsics.h
@@ -150,15 +150,32 @@ namespace AvxIntrinsics
 #endif
     }
 
-    template <int offset> Vc_INTRINSIC __m128  extract128(__m256  a) { return _mm256_extractf128_ps(a, offset); }
-    template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
-    template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
-#ifdef Vc_IMPL_AVX2
-        return _mm256_extracti128_si256(a, offset);
-#else
-        return _mm256_extractf128_si256(a, offset);
-#endif
-    }
+    // Dispatch helper: only offset == 1 may use the extract instructions;
+    // offset == 0 is handled by the zero-cost cast specialization below.
+    template <int offset>
+    struct extract128_impl {
+        static_assert(offset == 1, "only use the extract intrinsics with offset == 1");
+        static Vc_INTRINSIC __m128  extract128(__m256  a) { return _mm256_extractf128_ps(a, offset); }
+        static Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
+        static Vc_INTRINSIC __m128i extract128(__m256i a) {
+#ifdef Vc_IMPL_AVX2
+            return _mm256_extracti128_si256(a, offset);
+#else
+            return _mm256_extractf128_si256(a, offset);
+#endif
+        }
+    };
+
+    template <>
+    struct extract128_impl<0> {
+        static Vc_INTRINSIC __m128  extract128(__m256  a) { return _mm256_castps256_ps128(a); }
+        static Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_castpd256_pd128(a); }
+        static Vc_INTRINSIC __m128i extract128(__m256i a) { return _mm256_castsi256_si128(a); }
+    };
+
+    template <int offset> Vc_INTRINSIC __m128  extract128(__m256  a) { return extract128_impl<offset>::extract128(a); }
+    template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return extract128_impl<offset>::extract128(a); }
+    template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) { return extract128_impl<offset>::extract128(a); }
 
     /////////////////////// COMPARE OPS ///////////////////////
 #ifdef Vc_GCC
@@ -226,8 +243,8 @@ namespace AvxIntrinsics
     template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
     {
         return insert128<1>(
-            _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
-                                                   _mm256_castsi256_si128(s2), shift)),
+            _mm256_castsi128_si256(
+                _mm_alignr_epi8(extract128<0>(s1), extract128<0>(s2), shift)),
             _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
     }
 #endif
@@ -264,7 +281,7 @@ namespace AvxIntrinsics
     Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0)                           \
     {                                                                        \
         __m128i a1 = extract128<1>(a0);                                      \
-        __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0));                 \
+        __m128i r0 = _mm_##name(extract128<0>(a0));                          \
         __m128i r1 = _mm_##name(a1);                                         \
         return insert128<1>(_mm256_castsi128_si256(r0), r1);                 \
     }
@@ -280,7 +297,7 @@ namespace AvxIntrinsics
     {                                                                        \
         m128i a1 = extract128<1>(a0);                                        \
         m128i b1 = extract128<1>(b0);                                        \
-        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)); \
+        m128i r0 = _mm_##name(extract128<0>(a0), extract128<0>(b0));         \
         m128i r1 = _mm_##name(a1, b1);                                       \
         return insert128<1>(_mm256_castsi128_si256(r0), r1);                 \
     }
@@ -288,7 +305,7 @@ namespace AvxIntrinsics
     Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0)                 \
     {                                                                        \
         m128i a1 = extract128<1>(a0);                                        \
-        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0);               \
+        m128i r0 = _mm_##name(extract128<0>(a0), b0);                        \
         m128i r1 = _mm_##name(a1, b0);                                       \
         return insert128<1>(_mm256_castsi128_si256(r0), r1);                 \
     }
@@ -296,7 +313,7 @@ namespace AvxIntrinsics
     template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0)            \
     {                                                                        \
         m128i a1 = extract128<1>(a0);                                        \
-        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i);                \
+        m128i r0 = _mm_##name(extract128<0>(a0), i);                         \
         m128i r1 = _mm_##name(a1, i);                                        \
         return insert128<1>(_mm256_castsi128_si256(r0), r1);                 \
     }
@@ -338,11 +355,14 @@ namespace AvxIntrinsics
     Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
     Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
     Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
+    Vc_AVX_TO_SSE_2_NEW(add_epi8)
     Vc_AVX_TO_SSE_2_NEW(add_epi16)
     Vc_AVX_TO_SSE_2_NEW(add_epi32)
     Vc_AVX_TO_SSE_2_NEW(add_epi64)
+    Vc_AVX_TO_SSE_2_NEW(sub_epi8)
     Vc_AVX_TO_SSE_2_NEW(sub_epi16)
     Vc_AVX_TO_SSE_2_NEW(sub_epi32)
+    Vc_AVX_TO_SSE_2_NEW(sub_epi64)
     Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
     Vc_AVX_TO_SSE_2_NEW(sign_epi16)
     Vc_AVX_TO_SSE_2_NEW(sign_epi32)
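Not part of the patch: why extract128_impl gets an offset == 0 specialization. The cast intrinsics only reinterpret the register, so they compile to no instruction, whereas vextractf128/vextracti128 with an immediate of 0 may still cost a shuffle uop; the static_assert keeps the instruction form reserved for the upper half. A minimal illustration:

#include <immintrin.h>

__m128i lo_via_cast(__m256i v)    { return _mm256_castsi256_si128(v); }       // no code emitted
__m128i lo_via_extract(__m256i v) { return _mm256_extractf128_si256(v, 0); }  // vextractf128 $0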
@@ -392,23 +412,17 @@ namespace AvxIntrinsics
     Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
     {
-        m128i a1 = extract128<1>(a0);
-        return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
+        return (_mm_movemask_epi8(extract128<1>(a0)) << 16) | _mm_movemask_epi8(extract128<0>(a0));
     }
     template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
     {
-        m128i a1 = extract128<1>(a0);
-        m128i b1 = extract128<1>(b0);
-        m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
-        m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
+        m128i r0 = _mm_blend_epi16(extract128<0>(a0), extract128<0>(b0), m & 0xff);
+        m128i r1 = _mm_blend_epi16(extract128<1>(a0), extract128<1>(b0), m >> 8);
         return insert128<1>(_mm256_castsi128_si256(r0), r1);
     }
     Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
     {
-        m128i a1 = extract128<1>(a0);
-        m128i b1 = extract128<1>(b0);
-        m128i m1 = extract128<1>(m0);
-        m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
-        m128i r1 = _mm_blendv_epi8(a1, b1, m1);
+        m128i r0 = _mm_blendv_epi8(extract128<0>(a0), extract128<0>(b0), extract128<0>(m0));
+        m128i r1 = _mm_blendv_epi8(extract128<1>(a0), extract128<1>(b0), extract128<1>(m0));
         return insert128<1>(_mm256_castsi128_si256(r0), r1);
     }
     // mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
@@ -454,6 +468,9 @@ static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
 static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
     return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
 }
+static Vc_INTRINSIC m256i cmplt_epu8(__m256i a, __m256i b) {
+    return cmpgt_epu8(b, a);
+}
 #if defined(Vc_IMPL_XOP)
     Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
     Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
@@ -504,13 +521,23 @@ static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask,
 }
 static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
     using namespace AVX;
-    _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
-    _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
+    __m256i tmp = _mm256_loadu_si256(reinterpret_cast<__m256i *>(mem));
+    tmp = _mm256_blendv_epi8(tmp, v, mask);
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), tmp);
 }
 static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
     _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
 }
+static Vc_INTRINSIC void _mm256_maskstore(char *mem, const __m256i mask, const __m256i v) {
+    _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
+}
+
+static Vc_INTRINSIC void _mm256_maskstore(unsigned char *mem, const __m256i mask, const __m256i v) {
+    _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
+}
+
 #undef Vc_AVX_TO_SSE_1
 #undef Vc_AVX_TO_SSE_1_128
 #undef Vc_AVX_TO_SSE_2_NEW
@@ -556,7 +583,7 @@ Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
 }
 Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
 {
-    stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
+    stream_store(mem,     extract128<0>(value), extract128<0>(mask));
     stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
 }
 Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
@@ -565,7 +592,7 @@ Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
 }
 Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
 {
-    stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
+    stream_store(mem,     extract128<0>(value), extract128<0>(mask));
     stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
 }
 Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
@@ -574,7 +601,7 @@ Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
 }
 Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
 {
-    stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
+    stream_store(mem, extract128<0>(value), extract128<0>(mask));
     stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
 }
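Not part of the patch: a scalar model of the setmin_epi8 XOR trick behind cmpgt_epu8/cmplt_epu8. AVX2 only provides signed byte compares; flipping the sign bit (XOR with 0x80) maps unsigned order onto signed order, so a signed compare of the flipped values is an unsigned compare of the originals. The check below is exhaustive.

#include <cassert>
#include <cstdint>

bool cmpgt_epu8_scalar(uint8_t a, uint8_t b)
{
    // two's complement narrowing, as on all SSE/AVX targets
    return int8_t(a ^ 0x80) > int8_t(b ^ 0x80);
}

int main()
{
    for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b)
            assert(cmpgt_epu8_scalar(uint8_t(a), uint8_t(b)) == (a > b));
}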
diff --git a/Vc/avx/math.h b/Vc/avx/math.h
index d5f849117..a2303c414 100644
--- a/Vc/avx/math.h
+++ b/Vc/avx/math.h
@@ -40,10 +40,14 @@ Vc_ALWAYS_INLINE AVX2::int_v    min(const AVX2::int_v    &x, const AVX2::int_v
 Vc_ALWAYS_INLINE AVX2::uint_v   min(const AVX2::uint_v   &x, const AVX2::uint_v   &y) { return _mm256_min_epu32(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::short_v  min(const AVX2::short_v  &x, const AVX2::short_v  &y) { return _mm256_min_epi16(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
+Vc_ALWAYS_INLINE AVX2::schar_v  min(const AVX2::schar_v  &x, const AVX2::schar_v  &y) { return _mm256_min_epi8(x.data(), y.data()); }
+Vc_ALWAYS_INLINE AVX2::uchar_v  min(const AVX2::uchar_v  &x, const AVX2::uchar_v  &y) { return _mm256_min_epu8(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::int_v    max(const AVX2::int_v    &x, const AVX2::int_v    &y) { return _mm256_max_epi32(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::uint_v   max(const AVX2::uint_v   &x, const AVX2::uint_v   &y) { return _mm256_max_epu32(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::short_v  max(const AVX2::short_v  &x, const AVX2::short_v  &y) { return _mm256_max_epi16(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
+Vc_ALWAYS_INLINE AVX2::schar_v  max(const AVX2::schar_v  &x, const AVX2::schar_v  &y) { return _mm256_max_epi8(x.data(), y.data()); }
+Vc_ALWAYS_INLINE AVX2::uchar_v  max(const AVX2::uchar_v  &x, const AVX2::uchar_v  &y) { return _mm256_max_epu8(x.data(), y.data()); }
 #endif
 Vc_ALWAYS_INLINE AVX2::float_v  min(const AVX2::float_v  &x, const AVX2::float_v  &y) { return _mm256_min_ps(x.data(), y.data()); }
 Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
diff --git a/Vc/avx/simd_cast_caller.tcc b/Vc/avx/simd_cast_caller.tcc
index 405abce18..19a9d4218 100644
--- a/Vc/avx/simd_cast_caller.tcc
+++ b/Vc/avx/simd_cast_caller.tcc
@@ -44,7 +44,7 @@ template <typename T>
 template <typename U>
 Vc_INTRINSIC Mask<T, VectorAbi::Avx>::Mask(U &&rhs,
                                            Common::enable_if_mask_converts_explicitly<T, U>)
-    : Mask(simd_cast<Mask>(std::forward<U>(rhs)))
+    : d(simd_cast<Mask>(std::forward<U>(rhs)))
 {
 }
 #endif  // Vc_IS_VERSION_1
diff --git a/Vc/avx/types.h b/Vc/avx/types.h
index 107573041..1b093e72d 100644
--- a/Vc/avx/types.h
+++ b/Vc/avx/types.h
@@ -59,6 +59,8 @@ typedef Vector<double>         double_v;
 typedef Vector<int>            int_v;
 typedef Vector<unsigned int>   uint_v;
 typedef Vector<short>          short_v;
 typedef Vector<unsigned short> ushort_v;
+typedef Vector<char>           schar_v;
+typedef Vector<unsigned char>  uchar_v;
 
 template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
 typedef Mask<double>         double_m;
@@ -67,6 +69,8 @@ typedef Mask<int>            int_m;
 typedef Mask<unsigned int>   uint_m;
 typedef Mask<short>          short_m;
 typedef Mask<unsigned short> ushort_m;
+typedef Mask<char>           schar_m;
+typedef Mask<unsigned char>  uchar_m;
 
 template <typename T> struct Const;
 
@@ -85,6 +89,8 @@ using int_v    = Vector<   int>;
 using uint_v   = Vector<  uint>;
 using short_v  = Vector< short>;
 using ushort_v = Vector<ushort>;
+using schar_v  = Vector<  char>;
+using uchar_v  = Vector< uchar>;
 
 template <typename T> using Mask = Vc::Mask<T, VectorAbi::Avx>;
 using double_m = Mask<double>;
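Not part of the patch: with the overloads above, std-style min/max compose on byte vectors. A sketch of a saturating clamp, assuming an AVX2 build where uchar_v has 32 lanes:

#include <Vc/Vc>

Vc::AVX2::uchar_v clamp_bytes(Vc::AVX2::uchar_v v,
                              Vc::AVX2::uchar_v lo, Vc::AVX2::uchar_v hi)
{
    return Vc::min(Vc::max(v, lo), hi);  // -> vpmaxub / vpminub
}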
diff --git a/Vc/avx/vector.tcc b/Vc/avx/vector.tcc
index 63d867dbe..dbcfb3d11 100644
--- a/Vc/avx/vector.tcc
+++ b/Vc/avx/vector.tcc
@@ -56,26 +56,38 @@ Vc_INTRINSIC AVX2::   int_m operator==(AVX2::   int_v a, AVX2::   int_v b) { ret
 Vc_INTRINSIC AVX2::  uint_m operator==(AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
 Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
+Vc_INTRINSIC AVX2:: schar_m operator==(AVX2:: schar_v a, AVX2:: schar_v b) { return AVX::cmpeq_epi8(a.data(), b.data()); }
+Vc_INTRINSIC AVX2:: uchar_m operator==(AVX2:: uchar_v a, AVX2:: uchar_v b) { return AVX::cmpeq_epi8(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::   int_m operator!=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::  uint_m operator!=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
 Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
+Vc_INTRINSIC AVX2:: schar_m operator!=(AVX2:: schar_v a, AVX2:: schar_v b) { return not_(AVX::cmpeq_epi8(a.data(), b.data())); }
+Vc_INTRINSIC AVX2:: uchar_m operator!=(AVX2:: uchar_v a, AVX2:: uchar_v b) { return not_(AVX::cmpeq_epi8(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::   int_m operator>=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::  uint_m operator>=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
 Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
+Vc_INTRINSIC AVX2:: schar_m operator>=(AVX2:: schar_v a, AVX2:: schar_v b) { return not_(AVX::cmplt_epi8(a.data(), b.data())); }
+Vc_INTRINSIC AVX2:: uchar_m operator>=(AVX2:: uchar_v a, AVX2:: uchar_v b) { return not_(AVX::cmplt_epu8(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::   int_m operator<=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::  uint_m operator<=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
 Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
+Vc_INTRINSIC AVX2:: schar_m operator<=(AVX2:: schar_v a, AVX2:: schar_v b) { return not_(AVX::cmpgt_epi8(a.data(), b.data())); }
+Vc_INTRINSIC AVX2:: uchar_m operator<=(AVX2:: uchar_v a, AVX2:: uchar_v b) { return not_(AVX::cmpgt_epu8(a.data(), b.data())); }
 Vc_INTRINSIC AVX2::   int_m operator> (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::  uint_m operator> (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
 Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
+Vc_INTRINSIC AVX2:: schar_m operator> (AVX2:: schar_v a, AVX2:: schar_v b) { return AVX::cmpgt_epi8(a.data(), b.data()); }
+Vc_INTRINSIC AVX2:: uchar_m operator> (AVX2:: uchar_v a, AVX2:: uchar_v b) { return AVX::cmpgt_epu8(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::   int_m operator< (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::  uint_m operator< (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
 Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
 Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
+Vc_INTRINSIC AVX2:: schar_m operator< (AVX2:: schar_v a, AVX2:: schar_v b) { return AVX::cmplt_epi8(a.data(), b.data()); }
+Vc_INTRINSIC AVX2:: uchar_m operator< (AVX2:: uchar_v a, AVX2:: uchar_v b) { return AVX::cmplt_epu8(a.data(), b.data()); }
 #endif  // Vc_IMPL_AVX2
 
 // bitwise operators {{{1
diff --git a/Vc/sse/const_data.h b/Vc/sse/const_data.h
index 54a2c351c..5fb443c2f 100644
--- a/Vc/sse/const_data.h
+++ b/Vc/sse/const_data.h
@@ -47,6 +47,7 @@ struct c_general
     alignas(16) static const unsigned int highMaskFloat[4];
     alignas(16) static const short minShort[8];
 
+    alignas(16) static const unsigned char one8[16];
     alignas(16) static const unsigned short one16[8];
     alignas(16) static const unsigned int one32[4];
     alignas(16) static const float oneFloat[4];
diff --git a/Vc/sse/detail.h b/Vc/sse/detail.h
index 086a00c62..d2ccecca4 100644
--- a/Vc/sse/detail.h
+++ b/Vc/sse/detail.h
@@ -163,6 +163,24 @@ Vc_INTRINSIC __m128i load(const ushort *mem, F f,
 {
     return load16(mem, f);
 }
+
+template <typename V, typename DstT, typename F>
+Vc_INTRINSIC __m128i
+load(const char *mem, F f,
+     enable_if<(std::is_same<DstT, char>::value && std::is_same<V, __m128i>::value)> =
+         nullarg)
+{
+    return load16(mem, f);
+}
+
+template <typename V, typename DstT, typename F>
+Vc_INTRINSIC __m128i
+load(const uchar *mem, F f,
+     enable_if<(std::is_same<DstT, uchar>::value && std::is_same<V, __m128i>::value)> =
+         nullarg)
+{
+    return load16(mem, f);
+}
 #endif  // Vc_MSVC
 
 // generic load{{{2
diff --git a/Vc/sse/intrinsics.h b/Vc/sse/intrinsics.h
index 2d5495419..17ae8eadb 100644
--- a/Vc/sse/intrinsics.h
+++ b/Vc/sse/intrinsics.h
@@ -77,9 +77,11 @@ namespace SseIntrinsics
     static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
     static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
     static Vc_INTRINSIC Vc_CONST __m128  _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
-
-    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
-    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
+
+    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi8 () { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one8)); }
+    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu8 () { return _mm_setone_epi8(); }
+    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
+    static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
 
     static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
     static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }
@@ -542,6 +544,9 @@ namespace SseIntrinsics
     Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
         return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
     }
+    Vc_INTRINSIC Vc_CONST __m128i min_epu8 (__m128i a, __m128i b) {
+        return blendv_epi8(a, b, cmpgt_epu8 (a, b));
+    }
     Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
         return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
     }
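Not part of the patch: a scalar model of the blendv-based min_epu8 helper. blendv_epi8 selects per byte from b wherever the mask byte's MSB is set, and cmpgt_epu8 produces 0xff/0x00 bytes, so blendv(a, b, a > b) yields the unsigned minimum. (SSE2 also has _mm_min_epu8 as a direct instruction; the blendv form mirrors the min_epi8 helper right above it.)

#include <cassert>
#include <cstdint>

uint8_t min_epu8_scalar(uint8_t a, uint8_t b)
{
    const bool mask = a > b;  // cmpgt_epu8 sets the whole byte; blendv reads its MSB
    return mask ? b : a;      // blendv_epi8(a, b, mask)
}

int main()
{
    for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b)
            assert(min_epu8_scalar(a, b) == (a < b ? a : b));
}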
diff --git a/Vc/sse/math.h b/Vc/sse/math.h
index ed399ce93..3417d3bd7 100644
--- a/Vc/sse/math.h
+++ b/Vc/sse/math.h
@@ -33,6 +33,43 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace Vc_VERSIONED_NAMESPACE
 {
+
+static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v    min(const SSE::int_v    &x, const SSE::int_v    &y) { return SSE::min_epi32(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v   min(const SSE::uint_v   &x, const SSE::uint_v   &y) { return SSE::min_epu32(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v  min(const SSE::short_v  &x, const SSE::short_v  &y) { return _mm_min_epi16(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::min_epu16(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::schar_v  min(const SSE::schar_v  &x, const SSE::schar_v  &y) { return SSE::min_epi8(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::uchar_v  min(const SSE::uchar_v  &x, const SSE::uchar_v  &y) { return _mm_min_epu8(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v  min(const SSE::float_v  &x, const SSE::float_v  &y) { return _mm_min_ps(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y) { return _mm_min_pd(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v    max(const SSE::int_v    &x, const SSE::int_v    &y) { return SSE::max_epi32(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v   max(const SSE::uint_v   &x, const SSE::uint_v   &y) { return SSE::max_epu32(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v  max(const SSE::short_v  &x, const SSE::short_v  &y) { return _mm_max_epi16(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::max_epu16(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::schar_v  max(const SSE::schar_v  &x, const SSE::schar_v  &y) { return SSE::max_epi8(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::uchar_v  max(const SSE::uchar_v  &x, const SSE::uchar_v  &y) { return _mm_max_epu8(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v  max(const SSE::float_v  &x, const SSE::float_v  &y) { return _mm_max_ps(x.data(), y.data()); }
+static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y) { return _mm_max_pd(x.data(), y.data()); }
+
+template <typename T,
+          typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
+                               std::is_same<T, short>::value ||
+                               std::is_same<T, int>::value>>
+Vc_ALWAYS_INLINE Vc_PURE Vector<T> abs(Vector<T> x)
+{
+    return SSE::VectorHelper<T>::abs(x.data());
+}
+
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> sqrt (const Vector<T> &x) { return SSE::VectorHelper<T>::sqrt(x.data()); }
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> rsqrt(const Vector<T> &x) { return SSE::VectorHelper<T>::rsqrt(x.data()); }
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> reciprocal(const Vector<T> &x) { return SSE::VectorHelper<T>::reciprocal(x.data()); }
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> round(const Vector<T> &x) { return SSE::VectorHelper<T>::round(x.data()); }
+
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::Mask isfinite(const Vector<T> &x) { return SSE::VectorHelper<T>::isFinite(x.data()); }
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::Mask isinf(const Vector<T> &x) { return SSE::VectorHelper<T>::isInfinite(x.data()); }
+template <typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::Mask isnan(const Vector<T> &x) { return SSE::VectorHelper<T>::isNaN(x.data()); }
+
 // copysign {{{1
 Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
 {
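Not part of the patch: these free functions move here from sse/vector.h (see the matching removal below), gaining the schar_v/uchar_v overloads on the way. A small use of the byte overloads, with a hypothetical function name:

#include <Vc/Vc>

// |a - b| for unsigned bytes, without any signed overflow
Vc::SSE::uchar_v absolute_diff(Vc::SSE::uchar_v a, Vc::SSE::uchar_v b)
{
    return Vc::max(a, b) - Vc::min(a, b);
}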
diff --git a/Vc/sse/types.h b/Vc/sse/types.h
index df938baa7..37d277ca1 100644
--- a/Vc/sse/types.h
+++ b/Vc/sse/types.h
@@ -50,6 +50,8 @@ typedef Vector<double>         double_v;
 typedef Vector<int>            int_v;
 typedef Vector<unsigned int>   uint_v;
 typedef Vector<short>          short_v;
 typedef Vector<unsigned short> ushort_v;
+typedef Vector<char>           schar_v;
+typedef Vector<unsigned char>  uchar_v;
 
 template <typename T> using Mask = Vc::Mask<T, VectorAbi::Sse>;
 typedef Mask<double>         double_m;
@@ -58,6 +60,9 @@ typedef Mask<int>            int_m;
 typedef Mask<unsigned int>   uint_m;
 typedef Mask<short>          short_m;
 typedef Mask<unsigned short> ushort_m;
+typedef Mask<char>           schar_m;
+typedef Mask<unsigned char>  uchar_m;
+
 template <typename T> struct Const;
 
diff --git a/Vc/sse/vector.h b/Vc/sse/vector.h
index 42905f0e0..3e999d21e 100644
--- a/Vc/sse/vector.h
+++ b/Vc/sse/vector.h
@@ -425,37 +425,6 @@ template <typename T> class Vector
 template <typename T> constexpr size_t Vector<T>::Size;
 template <typename T> constexpr size_t Vector<T>::MemoryAlignment;
 
-static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v    min(const SSE::int_v    &x, const SSE::int_v    &y) { return SSE::min_epi32(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v   min(const SSE::uint_v   &x, const SSE::uint_v   &y) { return SSE::min_epu32(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v  min(const SSE::short_v  &x, const SSE::short_v  &y) { return _mm_min_epi16(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v min(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::min_epu16(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v  min(const SSE::float_v  &x, const SSE::float_v  &y) { return _mm_min_ps(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v min(const SSE::double_v &x, const SSE::double_v &y) { return _mm_min_pd(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::int_v    max(const SSE::int_v    &x, const SSE::int_v    &y) { return SSE::max_epi32(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::uint_v   max(const SSE::uint_v   &x, const SSE::uint_v   &y) { return SSE::max_epu32(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::short_v  max(const SSE::short_v  &x, const SSE::short_v  &y) { return _mm_max_epi16(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::ushort_v max(const SSE::ushort_v &x, const SSE::ushort_v &y) { return SSE::max_epu16(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::float_v  max(const SSE::float_v  &x, const SSE::float_v  &y) { return _mm_max_ps(x.data(), y.data()); }
-static Vc_ALWAYS_INLINE Vc_PURE SSE::double_v max(const SSE::double_v &x, const SSE::double_v &y) { return _mm_max_pd(x.data(), y.data()); }
-
-template <typename T,
-          typename = enable_if<std::is_same<T, double>::value || std::is_same<T, float>::value ||
-                               std::is_same<T, short>::value ||
-                               std::is_same<T, int>::value>>
-Vc_ALWAYS_INLINE Vc_PURE Vector<T> abs(Vector<T> x)
-{
-    return SSE::VectorHelper<T>::abs(x.data());
-}
-
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> sqrt (const Vector<T> &x) { return SSE::VectorHelper<T>::sqrt(x.data()); }
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> rsqrt(const Vector<T> &x) { return SSE::VectorHelper<T>::rsqrt(x.data()); }
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> reciprocal(const Vector<T> &x) { return SSE::VectorHelper<T>::reciprocal(x.data()); }
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T> round(const Vector<T> &x) { return SSE::VectorHelper<T>::round(x.data()); }
-
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::Mask isfinite(const Vector<T> &x) { return SSE::VectorHelper<T>::isFinite(x.data()); }
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::Mask isinf(const Vector<T> &x) { return SSE::VectorHelper<T>::isInfinite(x.data()); }
-template <typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T>::Mask isnan(const Vector<T> &x) { return SSE::VectorHelper<T>::isNaN(x.data()); }
-
 #define Vc_CONDITIONAL_ASSIGN(name_, op_)                                   \
     template <Operator O, typename T, typename M, typename U>               \
     Vc_INTRINSIC enable_if<O == Operator::name_, void> conditional_assign(  \
diff --git a/Vc/sse/vector.tcc b/Vc/sse/vector.tcc
index 2e2fe5d58..4d7874bdf 100644
--- a/Vc/sse/vector.tcc
+++ b/Vc/sse/vector.tcc
@@ -45,6 +45,8 @@ Vc_INTRINSIC SSE::   int_m operator==(SSE::   int_v a, SSE::   int_v b) { return
 Vc_INTRINSIC SSE::  uint_m operator==(SSE::  uint_v a, SSE::  uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
 Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
 Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
+Vc_INTRINSIC SSE:: schar_m operator==(SSE:: schar_v a, SSE:: schar_v b) { return _mm_cmpeq_epi8(a.data(), b.data()); }
+Vc_INTRINSIC SSE:: uchar_m operator==(SSE:: uchar_v a, SSE:: uchar_v b) { return _mm_cmpeq_epi8(a.data(), b.data()); }
 
 Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
 Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
@@ -52,6 +54,8 @@ Vc_INTRINSIC SSE::   int_m operator!=(SSE::   int_v a, SSE::   int_v b) { return
 Vc_INTRINSIC SSE::  uint_m operator!=(SSE::  uint_v a, SSE::  uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
 Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
 Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
+Vc_INTRINSIC SSE:: schar_m operator!=(SSE:: schar_v a, SSE:: schar_v b) { return not_(_mm_cmpeq_epi8(a.data(), b.data())); }
+Vc_INTRINSIC SSE:: uchar_m operator!=(SSE:: uchar_v a, SSE:: uchar_v b) { return not_(_mm_cmpeq_epi8(a.data(), b.data())); }
 
 Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
 Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
@@ -71,6 +75,14 @@ Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
     return _mm_cmpgt_epi16(a.data(), b.data());
 #endif
 }
+Vc_INTRINSIC SSE:: schar_m operator> (SSE:: schar_v a, SSE:: schar_v b) { return _mm_cmpgt_epi8(a.data(), b.data()); }
+Vc_INTRINSIC SSE:: uchar_m operator> (SSE:: uchar_v a, SSE:: uchar_v b) {
+#ifndef USE_INCORRECT_UNSIGNED_COMPARE
+    return SSE::cmpgt_epu8(a.data(), b.data());
+#else
+    return _mm_cmpgt_epi8(a.data(), b.data());
+#endif
+}
 
 Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
 Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
@@ -90,6 +102,14 @@ Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
     return _mm_cmplt_epi16(a.data(), b.data());
 #endif
 }
+Vc_INTRINSIC SSE:: schar_m operator< (SSE:: schar_v a, SSE:: schar_v b) { return _mm_cmplt_epi8(a.data(), b.data()); }
+Vc_INTRINSIC SSE:: uchar_m operator< (SSE:: uchar_v a, SSE:: uchar_v b) {
+#ifndef USE_INCORRECT_UNSIGNED_COMPARE
+    return SSE::cmpgt_epu8(b.data(), a.data());
+#else
+    return _mm_cmplt_epi8(a.data(), b.data());
+#endif
+}
 
 Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
 Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
@@ -97,6 +117,8 @@ Vc_INTRINSIC SSE::   int_m operator>=(SSE::   int_v a, SSE::   int_v b) { return
 Vc_INTRINSIC SSE::  uint_m operator>=(SSE::  uint_v a, SSE::  uint_v b) { return !(a < b); }
 Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
 Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }
+Vc_INTRINSIC SSE:: schar_m operator>=(SSE:: schar_v a, SSE:: schar_v b) { return !(a < b); }
+Vc_INTRINSIC SSE:: uchar_m operator>=(SSE:: uchar_v a, SSE:: uchar_v b) { return !(a < b); }
 
 Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
 Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
@@ -104,6 +126,8 @@ Vc_INTRINSIC SSE::   int_m operator<=(SSE::   int_v a, SSE::   int_v b) { return
 Vc_INTRINSIC SSE::  uint_m operator<=(SSE::  uint_v a, SSE::  uint_v b) { return !(a > b); }
 Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
 Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }
+Vc_INTRINSIC SSE:: schar_m operator<=(SSE:: schar_v a, SSE:: schar_v b) { return !(a > b); }
+Vc_INTRINSIC SSE:: uchar_m operator<=(SSE:: uchar_v a, SSE:: uchar_v b) { return !(a > b); }
 
 // bitwise operators {{{1
 template <typename T>
@@ -640,6 +664,10 @@ template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveLow ( SSE::short_
 template <> Vc_INTRINSIC SSE:: short_v SSE:: short_v::interleaveHigh( SSE:: short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
 template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
 template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
+template <> Vc_INTRINSIC SSE:: schar_v SSE:: schar_v::interleaveLow ( SSE:: schar_v x) const { return _mm_unpacklo_epi8(data(), x.data()); }
+template <> Vc_INTRINSIC SSE:: schar_v SSE:: schar_v::interleaveHigh( SSE:: schar_v x) const { return _mm_unpackhi_epi8(data(), x.data()); }
+template <> Vc_INTRINSIC SSE:: uchar_v SSE:: uchar_v::interleaveLow ( SSE:: uchar_v x) const { return _mm_unpacklo_epi8(data(), x.data()); }
+template <> Vc_INTRINSIC SSE:: uchar_v SSE:: uchar_v::interleaveHigh( SSE:: uchar_v x) const { return _mm_unpackhi_epi8(data(), x.data()); }
 // }}}1
 // generate {{{1
 template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
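Not part of the patch: what the byte interleaves compute. With a = {a0..a15} and b = {b0..b15}, interleaveLow is {a0,b0,a1,b1,...,a7,b7} (punpcklbw) and interleaveHigh is {a8,b8,...,a15,b15} (punpckhbw). A sketch, assuming uchar_v picks up the usual IndexesFromZero() generator:

#include <Vc/Vc>
#include <cassert>

void interleave_check()
{
    const Vc::SSE::uchar_v a = Vc::SSE::uchar_v::IndexesFromZero();  // {0,1,...,15}
    const Vc::SSE::uchar_v b = a + 16;                               // {16,17,...,31}
    const Vc::SSE::uchar_v lo = a.interleaveLow(b);                  // {0,16,1,17,...}
    assert(lo[0] == 0 && lo[1] == 16 && lo[2] == 1 && lo[3] == 17);
}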
diff --git a/Vc/sse/vectorhelper.h b/Vc/sse/vectorhelper.h
index 1b516468d..32d94513f 100644
--- a/Vc/sse/vectorhelper.h
+++ b/Vc/sse/vectorhelper.h
@@ -611,6 +611,323 @@ namespace SSE
 #undef Vc_SUFFIX
         static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
     };
+
+    template <> struct VectorHelper<signed char> {
+        typedef __m128i VectorType;
+        typedef char EntryType;
+#define Vc_SUFFIX si128
+        Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType zero()
+        {
+            return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)();
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask)
+        {
+            return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a);
+        }
+#ifdef Vc_IMPL_SSE4_1
+        static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b)
+        {
+            return _mm_packus_epi16(a, b);
+        }
+#else
+        // FIXME: too bad, but this is broken without SSE 4.1.
+        // Copy-pasted from unsigned short:
+        // static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b)
+        //{
+        //    auto tmp0 = _mm_unpacklo_epi16(a, b);        // 0 4 X X 1 5 X X
+        //    auto tmp1 = _mm_unpackhi_epi16(a, b);        // 2 6 X X 3 7 X X
+        //    auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);  // 0 2 4 6 X X X X
+        //    auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);  // 1 3 5 7 X X X X
+        //    return _mm_unpacklo_epi16(tmp2, tmp3);       // 0 1 2 3 4 5 6 7
+        //}
+#endif
+        static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x)
+        {
+            return _mm_unpacklo_epi8(x, _mm_setzero_si128());
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x)
+        {
+            return _mm_unpackhi_epi8(x, _mm_setzero_si128());
+        }
+
+#undef Vc_SUFFIX
+#define Vc_SUFFIX epi8
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType one()
+        {
+            return Vc_CAT2(_mm_setone_, Vc_SUFFIX)();
+        }
+
+//X     template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
+//X         switch (b) {
+//X             case    0: return zero();
+//X             case    1: return a;
+//X             case    2: return _mm_slli_epi16(a,  1);
+//X             case    4: return _mm_slli_epi16(a,  2);
+//X             case    8: return _mm_slli_epi16(a,  3);
+//X             case   16: return _mm_slli_epi16(a,  4);
+//X             case   32: return _mm_slli_epi16(a,  5);
+//X             case   64: return _mm_slli_epi16(a,  6);
+//X             case  128: return _mm_slli_epi16(a,  7);
+//X             case  256: return _mm_slli_epi16(a,  8);
+//X             case  512: return _mm_slli_epi16(a,  9);
+//X             case 1024: return _mm_slli_epi16(a, 10);
+//X             case 2048: return _mm_slli_epi16(a, 11);
+//X         }
+//X         return mul(a, set(b));
+//X     }
+#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || defined(Vc_IMPL_SSE4_1)
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b)
+        {
+            return min_epi8(a, b);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b)
+        {
+            return max_epi8(a, b);
+        }
+#endif
+#undef Vc_SUFFIX
+#define Vc_SUFFIX epi8
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift)
+        {
+            return _mm_and_si128(_mm_slli_epi16(a, shift), _mm_set1_epi8(0xff << shift));
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift)
+        {
+            return _mm_and_si128(_mm_srli_epi16(a, shift), _mm_set1_epi8(0xff >> shift));
+        }
+
+        static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3)
+        {
+            v1 = add(mul(v1, v2), v3);
+        }
+
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b)
+        {
+            // no mullo_epi8: multiply even and odd bytes inside 16-bit lanes
+            // and stitch the low byte of each product back together
+            __m128i even = _mm_mullo_epi16(a, b);
+            __m128i odd  = _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srli_epi16(b, 8));
+            return _mm_or_si128(_mm_slli_epi16(odd, 8),
+                                _mm_srli_epi16(_mm_slli_epi16(even, 8), 8));
+        }
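Not part of the patch: a scalar model of the mul above. SSE has no mullo_epi8, so even and odd bytes are multiplied in 16-bit lanes and the low byte of each product is recombined; exhaustively checked below for a lane holding the same byte in both positions.

#include <cassert>
#include <cstdint>

uint16_t mul_bytes_in_lane(uint16_t a, uint16_t b)  // one 16-bit lane = two bytes
{
    const uint16_t even = uint16_t(a * b);                    // low byte is lo(a)*lo(b) mod 256
    const uint16_t odd  = uint16_t((a >> 8) * (b >> 8));      // hi(a)*hi(b)
    return uint16_t(odd << 8) | uint16_t(uint16_t(even << 8) >> 8);
}

int main()
{
    for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b) {
            const uint16_t lane_a = uint16_t(a | (a << 8)), lane_b = uint16_t(b | (b << 8));
            const uint16_t r = mul_bytes_in_lane(lane_a, lane_b);
            assert(uint8_t(r) == uint8_t(a * b) && uint8_t(r >> 8) == uint8_t(a * b));
        }
}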
+
+#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
+        Vc_OP(min) Vc_OP(max)  // XXX breaks for values with MSB set
+#endif
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);  // two remaining elements
+            return std::min(static_cast<EntryType>(two_elems),
+                            static_cast<EntryType>(two_elems >> 8));
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);
+            return std::max(static_cast<EntryType>(two_elems),
+                            static_cast<EntryType>(two_elems >> 8));
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);  // & 0xffff is implicit
+            return static_cast<EntryType>(two_elems) *
+                   static_cast<EntryType>(two_elems >> 8);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);  // & 0xffff is implicit
+            return static_cast<EntryType>(two_elems) +
+                   static_cast<EntryType>(two_elems >> 8);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a)
+        {
+            return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType
+        set(const EntryType a, const EntryType b, const EntryType c,
+            const EntryType d, const EntryType e, const EntryType f,
+            const EntryType g, const EntryType h, const EntryType i,
+            const EntryType j, const EntryType k, const EntryType l,
+            const EntryType m, const EntryType n, const EntryType o,
+            const EntryType p)
+        {
+            return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+        }
+
+        Vc_OP(add) Vc_OP(sub)
+#undef Vc_SUFFIX
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a)
+        {
+            return a;
+        }
+    };
+
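Not part of the patch: the shape of the horizontal min/max/mul/add reductions above, in scalar form — each shuffle+op step folds the upper half of the live bytes onto the lower half until two bytes remain, and the last pair is finished in scalar code.

#include <algorithm>
#include <cstdint>

uint8_t hmin16(const uint8_t v[16])
{
    uint8_t t[16];
    std::copy(v, v + 16, t);
    for (int width = 8; width >= 2; width /= 2) {  // 16 -> 8 -> 4 -> 2 live bytes
        for (int i = 0; i < width; ++i) {
            t[i] = std::min(t[i], t[i + width]);   // one vector min per step
        }
    }
    return std::min(t[0], t[1]);                   // the final scalar std::min
}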
+    template <> struct VectorHelper<unsigned char> {
+        typedef __m128i VectorType;
+        typedef unsigned char EntryType;
+#define Vc_SUFFIX si128
+        Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType zero()
+        {
+            return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)();
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask)
+        {
+            return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a);
+        }
+
+        static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x)
+        {
+            return _mm_unpacklo_epi8(x, _mm_setzero_si128());
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x)
+        {
+            return _mm_unpackhi_epi8(x, _mm_setzero_si128());
+        }
+
+#undef Vc_SUFFIX
+#define Vc_SUFFIX epu8
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType one()
+        {
+            return Vc_CAT2(_mm_setone_, Vc_SUFFIX)();
+        }
+
+//X     template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
+//X         switch (b) {
+//X             case    0: return zero();
+//X             case    1: return a;
+//X             case    2: return _mm_slli_epi16(a,  1);
+//X             case    4: return _mm_slli_epi16(a,  2);
+//X             case    8: return _mm_slli_epi16(a,  3);
+//X             case   16: return _mm_slli_epi16(a,  4);
+//X             case   32: return _mm_slli_epi16(a,  5);
+//X             case   64: return _mm_slli_epi16(a,  6);
+//X             case  128: return _mm_slli_epi16(a,  7);
+//X             case  256: return _mm_slli_epi16(a,  8);
+//X             case  512: return _mm_slli_epi16(a,  9);
+//X             case 1024: return _mm_slli_epi16(a, 10);
+//X             case 2048: return _mm_slli_epi16(a, 11);
+//X         }
+//X         return mul(a, set(b));
+//X     }
+#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || defined(Vc_IMPL_SSE4_1)
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b)
+        {
+            return _mm_min_epu8(a, b);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b)
+        {
+            return _mm_max_epu8(a, b);
+        }
+#endif
+#undef Vc_SUFFIX
+#define Vc_SUFFIX epi8
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift)
+        {
+            return _mm_and_si128(_mm_slli_epi16(a, shift), _mm_set1_epi8(0xff << shift));
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift)
+        {
+            return _mm_and_si128(_mm_srli_epi16(a, shift), _mm_set1_epi8(0xff >> shift));
+        }
+
+        static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3)
+        {
+            v1 = add(mul(v1, v2), v3);
+        }
+
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b)
+        {
+            __m128i even = _mm_mullo_epi16(a, b);
+            __m128i odd  = _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srli_epi16(b, 8));
+            return _mm_or_si128(_mm_slli_epi16(odd, 8),
+                                _mm_srli_epi16(_mm_slli_epi16(even, 8), 8));
+        }
+
+#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
+        Vc_OP(min) Vc_OP(max)  // XXX breaks for values with MSB set
+#endif
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);  // two remaining elements
+            return std::min(static_cast<EntryType>(two_elems),
+                            static_cast<EntryType>(two_elems >> 8));
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);
+            return std::max(static_cast<EntryType>(two_elems),
+                            static_cast<EntryType>(two_elems >> 8));
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);  // & 0xffff is implicit
+            return static_cast<EntryType>(two_elems) *
+                   static_cast<EntryType>(two_elems >> 8);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a)
+        {
+            // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
+            a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
+            a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
+            auto two_elems = _mm_cvtsi128_si32(a);  // & 0xffff is implicit
+            return static_cast<EntryType>(two_elems) +
+                   static_cast<EntryType>(two_elems >> 8);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a)
+        {
+            return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a);
+        }
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType
+        set(const EntryType a, const EntryType b, const EntryType c,
+            const EntryType d, const EntryType e, const EntryType f,
+            const EntryType g, const EntryType h, const EntryType i,
+            const EntryType j, const EntryType k, const EntryType l,
+            const EntryType m, const EntryType n, const EntryType o,
+            const EntryType p)
+        {
+            return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p);
+        }
+
+        Vc_OP(add) Vc_OP(sub)
+#undef Vc_SUFFIX
+        static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a)
+        {
+            return a;
+        }
+    };
 #undef Vc_OP1
 #undef Vc_OP
 #undef Vc_OP_
diff --git a/src/const.cpp b/src/const.cpp
index 94dd65a44..3ba624503 100644
--- a/src/const.cpp
+++ b/src/const.cpp
@@ -278,6 +278,7 @@ namespace SSE
     // cacheline 2
     alignas(16) extern const unsigned int _IndexesFromZero4[4] = { 0, 1, 2, 3 };
 
+    alignas(16) const unsigned char  c_general::one8[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
     alignas(16) const unsigned short c_general::one16[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
    alignas(16) const unsigned int   c_general::one32[4] = { 1, 1, 1, 1 };
     alignas(16) const float c_general::oneFloat[4] = { 1.f, 1.f, 1.f, 1.f };
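Not part of the patch: an end-to-end smoke test of the new byte vectors (SSE build shown; the AVX2 types behave the same with 32 lanes). The first assert is exactly what USE_INCORRECT_UNSIGNED_COMPARE would break: 200 > 100 only holds under a genuine unsigned compare, since 200 is negative as a signed byte.

#include <Vc/Vc>
#include <cassert>

int main()
{
    Vc::SSE::uchar_v a(200), b(100);
    assert((a > b).isFull());                            // unsigned compare, not signed
    assert(((a + b) == Vc::SSE::uchar_v(44)).isFull());  // 300 wraps to 44 mod 256
    assert(Vc::min(a, b)[0] == 100);
    assert(Vc::max(a, b)[0] == 200);
    return 0;
}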