Commit 69f779ea authored by Martin Bauer's avatar Martin Bauer
Browse files

Merge branch 'philox' into 'master'

AES-NI Random Number Generator

See merge request pycodegen/pystencils!30
parents 83597528 170a7717
#if !defined(__AES__) || !defined(__SSE2__)
#error AES-NI and SSE2 need to be enabled
#endif
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX512VL__
#include <immintrin.h> // AVX*
#endif
#include <cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
__m128i x = _mm_xor_si128(k, in);
x = _mm_aesenc_si128(x, k); // 1
x = _mm_aesenc_si128(x, k); // 2
x = _mm_aesenc_si128(x, k); // 3
x = _mm_aesenc_si128(x, k); // 4
x = _mm_aesenc_si128(x, k); // 5
x = _mm_aesenc_si128(x, k); // 6
x = _mm_aesenc_si128(x, k); // 7
x = _mm_aesenc_si128(x, k); // 8
x = _mm_aesenc_si128(x, k); // 9
x = _mm_aesenclast_si128(x, k); // 10
return x;
}
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
uint64 r[2];
_mm_storeu_si128((__m128i*)r, x);
return _mm_set_pd((double)r[1], (double)r[0]);
#endif
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
// store result
double rr[2];
_mm_storeu_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
// store result
float r[4];
_mm_storeu_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
\ No newline at end of file
......@@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
def _get_philox_template(data_type, num_vars):
def _get_rng_template(name, data_type, num_vars):
if data_type is np.float32:
c_type = "float"
elif data_type is np.float64:
......@@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars):
template = "\n"
for i in range(num_vars):
template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
template += ("philox_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(c_type, num_vars, *tuple(range(num_vars)))
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars)))
return template
def _get_philox_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
for i in range(dim)] + list(keys)
while len(parameters) < 6:
parameters.append(0)
parameters = parameters[:6]
assert len(parameters) == 6
for i in range(dim)] + [0] * (3 - dim) + list(keys)
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return template.format(parameters=', '.join(str(p) for p in parameters),
......@@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse
raise NotImplementedError("Not yet implemented for this backend")
class PhiloxBase(CustomCodeNode):
class RNGBase(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=(0, 0)):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
if keys is None:
keys = (0,) * self._num_keys
if len(keys) != self._num_keys:
raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
if len(offsets) != 3:
raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self._offsets = offsets
self.headers = ['"philox_rand.h"']
self.headers = ['"{}_rand.h"'.format(self._name)]
self.keys = tuple(keys)
self._args = sp.sympify((dim, time_step, keys))
self._dim = dim
......@@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode):
return self # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def get_code(self, dialect, vector_instruction_set):
template = _get_philox_template(self._data_type, self._num_vars)
return _get_philox_code(template, dialect, vector_instruction_set,
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set,
self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
def __repr__(self):
return (", ".join(['{}'] * self._num_vars) + " <- PhiloxRNG").format(*self.result_symbols)
return (", ".join(['{}'] * self._num_vars) + " <- {}RNG").format(*self.result_symbols, self._name.capitalize())
class PhiloxTwoDoubles(RNGBase):
_name = "philox"
_data_type = np.float64
_num_vars = 2
_num_keys = 2
class PhiloxFourFloats(RNGBase):
_name = "philox"
_data_type = np.float32
_num_vars = 4
_num_keys = 2
class PhiloxTwoDoubles(PhiloxBase):
class AESNITwoDoubles(RNGBase):
_name = "aesni"
_data_type = np.float64
_num_vars = 2
_num_keys = 4
class PhiloxFourFloats(PhiloxBase):
class AESNIFourFloats(RNGBase):
_name = "aesni"
_data_type = np.float32
_num_vars = 4
_num_keys = 4
def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
......
import numpy as np
import pystencils as ps
from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles
from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles
# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
......@@ -56,3 +56,39 @@ def test_philox_float():
float_reference = philox_reference * 2.**-32 + 2.**-33
assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
def test_aesni_double():
dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
f = dh.add_array("f", values_per_cell=2)
dh.fill('f', 42.0)
aesni_node = AESNITwoDoubles(dh.dim)
assignments = [aesni_node,
ps.Assignment(f(0), aesni_node.result_symbols[0]),
ps.Assignment(f(1), aesni_node.result_symbols[1])]
kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
dh.all_to_gpu()
dh.run_kernel(kernel, time_step=124)
dh.all_to_cpu()
arr = dh.gather_array('f')
assert np.logical_and(arr <= 1.0, arr >= 0).all()
def test_aesni_float():
dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
f = dh.add_array("f", values_per_cell=4)
dh.fill('f', 42.0)
aesni_node = AESNIFourFloats(dh.dim)
assignments = [aesni_node] + [ps.Assignment(f(i), aesni_node.result_symbols[i]) for i in range(4)]
kernel = ps.create_kernel(assignments, target=dh.default_target).compile()
dh.all_to_gpu()
dh.run_kernel(kernel, time_step=124)
dh.all_to_cpu()
arr = dh.gather_array('f')
assert np.logical_and(arr <= 1.0, arr >= 0).all()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment