diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h new file mode 100644 index 0000000000000000000000000000000000000000..09327f27b8bc0b3cdd0b16cb8a64e3237b555797 --- /dev/null +++ b/pystencils/include/aesni_rand.h @@ -0,0 +1,113 @@ +#if !defined(__AES__) || !defined(__SSE2__) +#error AES-NI and SSE2 need to be enabled +#endif + +#include <emmintrin.h> // SSE2 +#include <wmmintrin.h> // AES +#ifdef __AVX512VL__ +#include <immintrin.h> // AVX* +#endif +#include <cstdint> + +#define QUALIFIERS inline +#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16) +#define TWOPOW32_INV_FLOAT (2.3283064e-10f) + +typedef std::uint32_t uint32; +typedef std::uint64_t uint64; + +QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) { + __m128i x = _mm_xor_si128(k, in); + x = _mm_aesenc_si128(x, k); // 1 + x = _mm_aesenc_si128(x, k); // 2 + x = _mm_aesenc_si128(x, k); // 3 + x = _mm_aesenc_si128(x, k); // 4 + x = _mm_aesenc_si128(x, k); // 5 + x = _mm_aesenc_si128(x, k); // 6 + x = _mm_aesenc_si128(x, k); // 7 + x = _mm_aesenc_si128(x, k); // 8 + x = _mm_aesenc_si128(x, k); // 9 + x = _mm_aesenclast_si128(x, k); // 10 + return x; +} + +QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) +{ +#ifdef __AVX512VL__ + return _mm_cvtepu32_ps(v); +#else + __m128i v2 = _mm_srli_epi32(v, 1); + __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1)); + __m128 v2f = _mm_cvtepi32_ps(v2); + __m128 v1f = _mm_cvtepi32_ps(v1); + return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f); +#endif +} + +QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x) +{ +#ifdef __AVX512VL__ + return _mm_cvtepu64_pd(x); +#else + uint64 r[2]; + _mm_storeu_si128((__m128i*)r, x); + return _mm_set_pd((double)r[1], (double)r[0]); +#endif +} + + +QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, uint32 key2, uint32 key3, + double & rnd1, double & rnd2) +{ + // pack input and call AES + __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0); + __m128i k128 = _mm_set_epi32(key3, key2, key1, key0); + c128 = aesni1xm128i(c128, k128); + + // convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y + __m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff)); + __m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0)); + y = _mm_srli_si128(y, 4); + + // calculate z = x ^ y << (53 - 32)) + __m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32)); + z = _mm_xor_si128(x, z); + + // convert uint64 to double + __m128d rs = _my_cvtepu64_pd(z); + // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) + rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE)); + rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0)); + + // store result + double rr[2]; + _mm_storeu_pd(rr, rs); + rnd1 = rr[0]; + rnd2 = rr[1]; +} + + +QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, uint32 key2, uint32 key3, + float & rnd1, float & rnd2, float & rnd3, float & rnd4) +{ + // pack input and call AES + __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0); + __m128i k128 = _mm_set_epi32(key3, key2, key1, key0); + c128 = aesni1xm128i(c128, k128); + + // convert uint32 to float + __m128 rs = _my_cvtepu32_ps(c128); + // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f) + rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT)); + rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f)); + + // store result + float r[4]; + _mm_storeu_ps(r, rs); + rnd1 = r[0]; + rnd2 = r[1]; + rnd3 = r[2]; + rnd4 = r[3]; +} \ No newline at end of file diff --git a/pystencils/rng.py b/pystencils/rng.py index 48f3fb9b6d5dfb0521d1cc1c2f233600397966f4..4341a0b3906cb9505b1ffbb4c6070e9ea09aee60 100644 --- a/pystencils/rng.py +++ b/pystencils/rng.py @@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate from pystencils.backends.cbackend import CustomCodeNode -def _get_philox_template(data_type, num_vars): +def _get_rng_template(name, data_type, num_vars): if data_type is np.float32: c_type = "float" elif data_type is np.float64: @@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars): template = "\n" for i in range(num_vars): template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i) - template += ("philox_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \ - .format(c_type, num_vars, *tuple(range(num_vars))) + template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \ + .format(name, c_type, num_vars, *tuple(range(num_vars))) return template -def _get_philox_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols): +def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols): parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i] - for i in range(dim)] + list(keys) - - while len(parameters) < 6: - parameters.append(0) - parameters = parameters[:6] - - assert len(parameters) == 6 + for i in range(dim)] + [0] * (3 - dim) + list(keys) if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None): return template.format(parameters=', '.join(str(p) for p in parameters), @@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse raise NotImplementedError("Not yet implemented for this backend") -class PhiloxBase(CustomCodeNode): +class RNGBase(CustomCodeNode): - def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=(0, 0)): + def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None): + if keys is None: + keys = (0,) * self._num_keys + if len(keys) != self._num_keys: + raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys)) + if len(offsets) != 3: + raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3)) self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars)) symbols_read = [s for s in keys if isinstance(s, sp.Symbol)] super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols) self._time_step = time_step self._offsets = offsets - self.headers = ['"philox_rand.h"'] + self.headers = ['"{}_rand.h"'.format(self._name)] self.keys = tuple(keys) self._args = sp.sympify((dim, time_step, keys)) self._dim = dim @@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode): return self # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them def get_code(self, dialect, vector_instruction_set): - template = _get_philox_template(self._data_type, self._num_vars) - return _get_philox_code(template, dialect, vector_instruction_set, - self._time_step, self._offsets, self.keys, self._dim, self.result_symbols) + template = _get_rng_template(self._name, self._data_type, self._num_vars) + return _get_rng_code(template, dialect, vector_instruction_set, + self._time_step, self._offsets, self.keys, self._dim, self.result_symbols) def __repr__(self): - return (", ".join(['{}'] * self._num_vars) + " <- PhiloxRNG").format(*self.result_symbols) + return (", ".join(['{}'] * self._num_vars) + " <- {}RNG").format(*self.result_symbols, self._name.capitalize()) + + +class PhiloxTwoDoubles(RNGBase): + _name = "philox" + _data_type = np.float64 + _num_vars = 2 + _num_keys = 2 + + +class PhiloxFourFloats(RNGBase): + _name = "philox" + _data_type = np.float32 + _num_vars = 4 + _num_keys = 2 -class PhiloxTwoDoubles(PhiloxBase): +class AESNITwoDoubles(RNGBase): + _name = "aesni" _data_type = np.float64 _num_vars = 2 + _num_keys = 4 -class PhiloxFourFloats(PhiloxBase): +class AESNIFourFloats(RNGBase): + _name = "aesni" _data_type = np.float32 _num_vars = 4 + _num_keys = 4 def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs): diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py index 1c5ee9cbe143f81a253de57cf23d7b75914a9d82..473aa3d1215449f51e70aa3b09f2c605ed13c313 100644 --- a/pystencils_tests/test_random.py +++ b/pystencils_tests/test_random.py @@ -1,7 +1,7 @@ import numpy as np import pystencils as ps -from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles +from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles # curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0)) @@ -56,3 +56,39 @@ def test_philox_float(): float_reference = philox_reference * 2.**-32 + 2.**-33 assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps)) + +def test_aesni_double(): + dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu") + f = dh.add_array("f", values_per_cell=2) + + dh.fill('f', 42.0) + + aesni_node = AESNITwoDoubles(dh.dim) + assignments = [aesni_node, + ps.Assignment(f(0), aesni_node.result_symbols[0]), + ps.Assignment(f(1), aesni_node.result_symbols[1])] + kernel = ps.create_kernel(assignments, target=dh.default_target).compile() + + dh.all_to_gpu() + dh.run_kernel(kernel, time_step=124) + dh.all_to_cpu() + + arr = dh.gather_array('f') + assert np.logical_and(arr <= 1.0, arr >= 0).all() + + +def test_aesni_float(): + dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu") + f = dh.add_array("f", values_per_cell=4) + + dh.fill('f', 42.0) + + aesni_node = AESNIFourFloats(dh.dim) + assignments = [aesni_node] + [ps.Assignment(f(i), aesni_node.result_symbols[i]) for i in range(4)] + kernel = ps.create_kernel(assignments, target=dh.default_target).compile() + + dh.all_to_gpu() + dh.run_kernel(kernel, time_step=124) + dh.all_to_cpu() + arr = dh.gather_array('f') + assert np.logical_and(arr <= 1.0, arr >= 0).all() \ No newline at end of file