From fa0a09a58e5dee42d93c1cbf2021dfe58bb1f004 Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Wed, 14 Aug 2019 10:01:25 +0200
Subject: [PATCH] AES-NI RNG fully vectorized

---
 pystencils/include/aesni_rand.h | 60 +++++++++++++++++++++++++++------
 pystencils_tests/test_random.py |  1 -
 2 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index eb7217e0b..a480883e5 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -27,10 +27,27 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
     return x;
 }
 
-QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
+QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
-    uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
-    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
+#ifdef __AVX512F__
+    return _mm_cvtepu32_ps(v);
+#else
+    __m128i v2 = _mm_srli_epi32(v, 1);
+    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
+    __m128 v2f = _mm_cvtepi32_ps(v2);
+    __m128 v1f = _mm_cvtepi32_ps(v1);
+    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
+#endif
+}
+
+QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i v)
+{
+#ifdef __AVX512F__
+    return _mm_cvtepu64_pd(v);
+#else
+    #warning need to implement _my_cvtepu64_pd
+    return (__m128d) v;
+#endif
 }
 
 
@@ -41,11 +58,26 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
     __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
     __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
     c128 = aesni1xm128i(c128, k128);
+
     uint32 r[4];
     _mm_storeu_si128((__m128i*)&r[0], c128);
+    __m128i x = _mm_set_epi64x((uint64) r[2], (uint64) r[0]);
+    __m128i y = _mm_set_epi64x((uint64) r[3], (uint64) r[1]);
 
-    rnd1 = _uniform_double_hq(r[0], r[1]);
-    rnd2 = _uniform_double_hq(r[2], r[3]);
+    __m128i cnt = _mm_set_epi64x(53 - 32, 53 - 32);
+    y = _mm_sll_epi64(y, cnt);
+    __m128i z = _mm_xor_si128(x, y);
+
+    __m128d rs = _my_cvtepu64_pd(z);
+    const __m128d tp53 = _mm_set_pd1(TWOPOW53_INV_DOUBLE);
+    const __m128d tp54 = _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0);
+    rs = _mm_mul_pd(rs, tp53);
+    rs = _mm_add_pd(rs, tp54);
+
+    double rr[2];
+    _mm_storeu_pd(rr, rs);
+    rnd1 = rr[0];
+    rnd2 = rr[1];
 }
 
 
@@ -56,11 +88,17 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
     __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
     __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
     c128 = aesni1xm128i(c128, k128);
-    uint32 r[4];
-    _mm_storeu_si128((__m128i*)&r[0], c128);
 
-    rnd1 = r[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd2 = r[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd3 = r[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd4 = r[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    __m128 rs = _my_cvtepu32_ps(c128);
+    const __m128 tp32 = _mm_set_ps1(TWOPOW32_INV_FLOAT);
+    const __m128 tp33 = _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f);
+    rs = _mm_mul_ps(rs, tp32);
+    rs = _mm_add_ps(rs, tp33);
+
+    float r[4];
+    _mm_storeu_ps(r, rs);
+    rnd1 = r[0];
+    rnd2 = r[1];
+    rnd3 = r[2];
+    rnd4 = r[3];
 }
\ No newline at end of file
diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py
index 16ade6656..f55d028d0 100644
--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -98,7 +98,6 @@ def test_aesni_float():
     dh.all_to_cpu()
     arr = dh.gather_array('f')
     assert np.logical_and(arr <= 1.0, arr >= 0).all()
-    print(arr)
 
     #float_reference = aesni_reference * 2.**-32 + 2.**-33
     #assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
\ No newline at end of file
-- 
GitLab