From f47fabfac7a4a61bc9864358a6705d078140b623 Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Mon, 2 Sep 2019 09:38:30 +0200
Subject: [PATCH] AES-NI: Implement _my_cvtepu64_pd in terms of SSE instructions

---
Reviewer notes (this section is not part of the commit message):

* _my_cvtepu64_pd converts the two unsigned 64-bit lanes of a __m128i to
  doubles. The AVX512VL branch is unchanged. The fallback branch used to
  store the vector into a uint64[2] array and cast each element to double;
  it now stays in vector registers:
    - xH is x shifted right by 32 per 64-bit lane, OR-ed with the bit
      pattern of the double 2^84.
    - xL is x blended (mask 0xcc) with the bit pattern of the double 2^52.
    - The result is (xH - (2^84 + 2^52)) + xL, both terms read as doubles.
* _mm_set1_pd(0x0010000000000000) passes an integer literal whose value is
  2^52; it is converted to double by value, it is not a raw bit pattern.
* The fallback uses _mm_blend_epi16, so the build guard now also requires
  __AVX512VL__ or __SSE4_1__, and <smmintrin.h> is included when
  __AVX512VL__ is not defined.
* For non-AVX512VL builds with __FAST_MATH__ defined and GCC >= 5, the
  function is compiled with optimize("no-fast-math"). Presumably this keeps
  the subtract/add sequence from being reassociated; confirm with the
  author.
* NOTE(review): the AVX512 branch is guarded by __AVX512VL__ only; confirm
  that this is sufficient for _mm_cvtepu64_pd on all supported compilers.
* The last hunk only adds the missing newline at the end of the file.

 pystencils/include/aesni_rand.h | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index 09327f27b..3a708314d 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -1,11 +1,13 @@
-#if !defined(__AES__) || !defined(__SSE2__)
-#error AES-NI and SSE2 need to be enabled
+#if !defined(__AES__) || !defined(__SSE2__) || (!defined(__AVX512VL__) && !defined(__SSE4_1__))
+#error AES-NI and SSE2, as well as AVX512 or SSE4.1 need to be enabled
 #endif
 
 #include <emmintrin.h> // SSE2
 #include <wmmintrin.h> // AES
 #ifdef __AVX512VL__
 #include <immintrin.h> // AVX*
+#else
+#include <smmintrin.h> // SSE4
 #endif
 #include <cstdint>
 
@@ -44,14 +46,19 @@ QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 #endif
 }
 
+#if !defined(__AVX512VL__) && defined(__FAST_MATH__) && defined(__GNUC__) && __GNUC__ >= 5
+__attribute__((optimize("no-fast-math")))
+#endif
 QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
 {
 #ifdef __AVX512VL__
     return _mm_cvtepu64_pd(x);
 #else
-    uint64 r[2];
-    _mm_storeu_si128((__m128i*)r, x);
-    return _mm_set_pd((double)r[1], (double)r[0]);
+    __m128i xH = _mm_srli_epi64(x, 32);
+    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
+    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
+    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
+    return _mm_add_pd(f, _mm_castsi128_pd(xL));
 #endif
 }
 
@@ -110,4 +117,4 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
     rnd2 = r[1];
     rnd3 = r[2];
     rnd4 = r[3];
-}
\ No newline at end of file
+}
-- 
GitLab