From d38aed46834056bdfd41ec495bcd1defb45329f0 Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Tue, 13 Aug 2019 17:18:09 +0200
Subject: [PATCH] AES-NI RNG

---
 pystencils/include/aesni_rand.h | 66 +++++++++++++++++++++++++++++++++
 pystencils/rng.py               | 56 ++++++++++++++++++----------
 pystencils_tests/test_random.py | 48 +++++++++++++++++++++++-
 3 files changed, 150 insertions(+), 20 deletions(-)
 create mode 100644 pystencils/include/aesni_rand.h

diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
new file mode 100644
index 000000000..eb7217e0b
--- /dev/null
+++ b/pystencils/include/aesni_rand.h
@@ -0,0 +1,66 @@
+#if !defined(__AES__) || !defined(__SSE2__)
+#error AES-NI and SSE2 need to be enabled
+#endif
+
+#include <x86intrin.h>
+#include <cstdint>
+
+#define QUALIFIERS inline
+#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
+#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
+
+typedef std::uint32_t uint32;
+typedef std::uint64_t uint64;
+
+QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
+    __m128i x = _mm_xor_si128(k, in);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenc_si128(x, k);
+    x = _mm_aesenclast_si128(x, k);
+    return x;
+}
+
+QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
+{
+    uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
+    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
+}
+
+
+QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                              double & rnd1, double & rnd2)
+{
+    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
+    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
+    c128 = aesni1xm128i(c128, k128);
+    uint32 r[4];
+    _mm_storeu_si128((__m128i*)&r[0], c128);
+
+    rnd1 = _uniform_double_hq(r[0], r[1]);
+    rnd2 = _uniform_double_hq(r[2], r[3]);
+}
+
+
+QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
+                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
+                             float & rnd1, float & rnd2, float & rnd3, float & rnd4)
+{
+    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
+    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
+    c128 = aesni1xm128i(c128, k128);
+    uint32 r[4];
+    _mm_storeu_si128((__m128i*)&r[0], c128);
+
+    rnd1 = r[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    rnd2 = r[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    rnd3 = r[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+    rnd4 = r[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
+}
\ No newline at end of file
diff --git a/pystencils/rng.py b/pystencils/rng.py
index 48f3fb9b6..81e33419f 100644
--- a/pystencils/rng.py
+++ b/pystencils/rng.py
@@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import CustomCodeNode
 
 
-def _get_philox_template(data_type, num_vars):
+def _get_rng_template(name, data_type, num_vars):
     if data_type is np.float32:
         c_type = "float"
     elif data_type is np.float64:
@@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars):
     template = "\n"
     for i in range(num_vars):
         template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
-    template += ("philox_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
-        .format(c_type, num_vars, *tuple(range(num_vars)))
+    template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
+        .format(name, c_type, num_vars, *tuple(range(num_vars)))
     return template
 
 
-def _get_philox_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
+def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
     parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
-                                for i in range(dim)] + list(keys)
-
-    while len(parameters) < 6:
-        parameters.append(0)
-    parameters = parameters[:6]
-
-    assert len(parameters) == 6
+                                for i in range(dim)] + [0] * (3-dim) + list(keys)
 
     if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
         return template.format(parameters=', '.join(str(p) for p in parameters),
@@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse
         raise NotImplementedError("Not yet implemented for this backend")
 
 
-class PhiloxBase(CustomCodeNode):
+class RNGBase(CustomCodeNode):
 
-    def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=(0, 0)):
+    def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
+        if keys is None:
+            keys = (0,) * self._num_keys
+        if len(keys) != self._num_keys:
+            raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
+        if len(offsets) != 3:
+            raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
         self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
         symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
         super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
         self._time_step = time_step
         self._offsets = offsets
-        self.headers = ['"philox_rand.h"']
+        self.headers = ['"{}_rand.h"'.format(self._name)]
         self.keys = tuple(keys)
         self._args = sp.sympify((dim, time_step, keys))
         self._dim = dim
@@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode):
         return self  # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
 
     def get_code(self, dialect, vector_instruction_set):
-        template = _get_philox_template(self._data_type, self._num_vars)
-        return _get_philox_code(template, dialect, vector_instruction_set,
+        template = _get_rng_template(self._name, self._data_type, self._num_vars)
+        return _get_rng_code(template, dialect, vector_instruction_set,
                                 self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
 
     def __repr__(self):
-        return (", ".join(['{}'] * self._num_vars) + " <- PhiloxRNG").format(*self.result_symbols)
+        return (", ".join(['{}'] * self._num_vars) + " <- {}RNG").format(*self.result_symbols, self._name.capitalize())
+
+
+class PhiloxTwoDoubles(RNGBase):
+    _name = "philox"
+    _data_type = np.float64
+    _num_vars = 2
+    _num_keys = 2
+
+
+class PhiloxFourFloats(RNGBase):
+    _name = "philox"
+    _data_type = np.float32
+    _num_vars = 4
+    _num_keys = 2
 
 
-class PhiloxTwoDoubles(PhiloxBase):
+class AESNITwoDoubles(RNGBase):
+    _name = "aesni"
     _data_type = np.float64
     _num_vars = 2
+    _num_keys = 4
 
 
-class PhiloxFourFloats(PhiloxBase):
+class AESNIFourFloats(RNGBase):
+    _name = "aesni"
     _data_type = np.float32
     _num_vars = 4
+    _num_keys = 4
 
 
 def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py
index 1c5ee9cbe..16ade6656 100644
--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 import pystencils as ps
-from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles
+from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles
 
 
 # curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
@@ -56,3 +56,49 @@ def test_philox_float():
 
         float_reference = philox_reference * 2.**-32 + 2.**-33
         assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
+
+def test_aesni_double():
+    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
+    f = dh.add_array("f", values_per_cell=2)
+
+    dh.fill('f', 42.0)
+
+    aesni_node = AESNITwoDoubles(dh.dim)
+    assignments = [aesni_node,
+                   ps.Assignment(f(0), aesni_node.result_symbols[0]),
+                   ps.Assignment(f(1), aesni_node.result_symbols[1])]
+    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
+
+    dh.all_to_gpu()
+    dh.run_kernel(kernel, time_step=124)
+    dh.all_to_cpu()
+
+    arr = dh.gather_array('f')
+    assert np.logical_and(arr <= 1.0, arr >= 0).all()
+
+    #x = aesni_reference[:,:,0::2]
+    #y = aesni_reference[:,:,1::2]
+    #z = x ^ y << (53 - 32)
+    #double_reference = z * 2.**-53 + 2.**-54
+    #assert(np.allclose(arr, double_reference, rtol=0, atol=np.finfo(np.float64).eps))
+
+
+def test_aesni_float():
+    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
+    f = dh.add_array("f", values_per_cell=4)
+
+    dh.fill('f', 42.0)
+
+    aesni_node = AESNIFourFloats(dh.dim)
+    assignments = [aesni_node] + [ps.Assignment(f(i), aesni_node.result_symbols[i]) for i in range(4)]
+    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
+
+    dh.all_to_gpu()
+    dh.run_kernel(kernel, time_step=124)
+    dh.all_to_cpu()
+    arr = dh.gather_array('f')
+    assert np.logical_and(arr <= 1.0, arr >= 0).all()
+    print(arr)
+
+    #float_reference = aesni_reference * 2.**-32 + 2.**-33
+    #assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
\ No newline at end of file
-- 
GitLab