From fa0a09a58e5dee42d93c1cbf2021dfe58bb1f004 Mon Sep 17 00:00:00 2001 From: Michael Kuron <mkuron@icp.uni-stuttgart.de> Date: Wed, 14 Aug 2019 10:01:25 +0200 Subject: [PATCH] AES-NI RNG fully vectorized --- pystencils/include/aesni_rand.h | 60 +++++++++++++++++++++++++++------ pystencils_tests/test_random.py | 1 - 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h index eb7217e0b..a480883e5 100644 --- a/pystencils/include/aesni_rand.h +++ b/pystencils/include/aesni_rand.h @@ -27,10 +27,27 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) { return x; } -QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y) +QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) { - uint64 z = (uint64)x ^ ((uint64)y << (53 - 32)); - return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0); +#ifdef __AVX512F__ + return _mm_cvtepu32_ps(v); +#else + __m128i v2 = _mm_srli_epi32(v, 1); + __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1)); + __m128 v2f = _mm_cvtepi32_ps(v2); + __m128 v1f = _mm_cvtepi32_ps(v1); + return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f); +#endif +} + +QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i v) +{ +#ifdef __AVX512F__ + return _mm_cvtepu64_pd(v); +#else + #warning need to implement _my_cvtepu64_pd + return (__m128d) v; +#endif } @@ -41,11 +58,26 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0); __m128i k128 = _mm_set_epi32(key3, key2, key1, key0); c128 = aesni1xm128i(c128, k128); + uint32 r[4]; _mm_storeu_si128((__m128i*)&r[0], c128); + __m128i x = _mm_set_epi64x((uint64) r[2], (uint64) r[0]); + __m128i y = _mm_set_epi64x((uint64) r[3], (uint64) r[1]); - rnd1 = _uniform_double_hq(r[0], r[1]); - rnd2 = _uniform_double_hq(r[2], r[3]); + __m128i cnt = _mm_set_epi64x(53 - 32, 53 - 32); + y = _mm_sll_epi64(y, cnt); + __m128i z = _mm_xor_si128(x, y); + + __m128d rs = _my_cvtepu64_pd(z); + const __m128d tp53 = _mm_set_pd1(TWOPOW53_INV_DOUBLE); + const __m128d tp54 = _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0); + rs = _mm_mul_pd(rs, tp53); + rs = _mm_add_pd(rs, tp54); + + double rr[2]; + _mm_storeu_pd(rr, rs); + rnd1 = rr[0]; + rnd2 = rr[1]; } @@ -56,11 +88,17 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0); __m128i k128 = _mm_set_epi32(key3, key2, key1, key0); c128 = aesni1xm128i(c128, k128); - uint32 r[4]; - _mm_storeu_si128((__m128i*)&r[0], c128); - rnd1 = r[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); - rnd2 = r[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); - rnd3 = r[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); - rnd4 = r[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f); + __m128 rs = _my_cvtepu32_ps(c128); + const __m128 tp32 = _mm_set_ps1(TWOPOW32_INV_FLOAT); + const __m128 tp33 = _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f); + rs = _mm_mul_ps(rs, tp32); + rs = _mm_add_ps(rs, tp33); + + float r[4]; + _mm_storeu_ps(r, rs); + rnd1 = r[0]; + rnd2 = r[1]; + rnd3 = r[2]; + rnd4 = r[3]; } \ No newline at end of file diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py index 16ade6656..f55d028d0 100644 --- a/pystencils_tests/test_random.py +++ b/pystencils_tests/test_random.py @@ -98,7 +98,6 @@ def test_aesni_float(): dh.all_to_cpu() arr = dh.gather_array('f') assert np.logical_and(arr <= 1.0, arr >= 0).all() - print(arr) #float_reference = aesni_reference * 2.**-32 + 2.**-33 #assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps)) \ No newline at end of file -- GitLab