Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sebastian Bindgen
pystencils
Commits
d38aed46
Commit
d38aed46
authored
Aug 13, 2019
by
Michael Kuron
Browse files
AES-NI RNG
parent
7750926f
Changes
3
Hide whitespace changes
Inline
Side-by-side
pystencils/include/aesni_rand.h
0 → 100644
View file @
d38aed46
#if !defined(__AES__) || !defined(__SSE2__)
#error AES-NI and SSE2 need to be enabled
#endif
#include
<x86intrin.h>
#include
<cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef
std
::
uint32_t
uint32
;
typedef
std
::
uint64_t
uint64
;
QUALIFIERS
__m128i
aesni1xm128i
(
const
__m128i
&
in
,
const
__m128i
&
k
)
{
__m128i
x
=
_mm_xor_si128
(
k
,
in
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenc_si128
(
x
,
k
);
x
=
_mm_aesenclast_si128
(
x
,
k
);
return
x
;
}
QUALIFIERS
double
_uniform_double_hq
(
uint32
x
,
uint32
y
)
{
uint64
z
=
(
uint64
)
x
^
((
uint64
)
y
<<
(
53
-
32
));
return
z
*
TWOPOW53_INV_DOUBLE
+
(
TWOPOW53_INV_DOUBLE
/
2
.
0
);
}
QUALIFIERS
void
aesni_double2
(
uint32
ctr0
,
uint32
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
uint32
key2
,
uint32
key3
,
double
&
rnd1
,
double
&
rnd2
)
{
__m128i
c128
=
_mm_set_epi32
(
ctr3
,
ctr2
,
ctr1
,
ctr0
);
__m128i
k128
=
_mm_set_epi32
(
key3
,
key2
,
key1
,
key0
);
c128
=
aesni1xm128i
(
c128
,
k128
);
uint32
r
[
4
];
_mm_storeu_si128
((
__m128i
*
)
&
r
[
0
],
c128
);
rnd1
=
_uniform_double_hq
(
r
[
0
],
r
[
1
]);
rnd2
=
_uniform_double_hq
(
r
[
2
],
r
[
3
]);
}
QUALIFIERS
void
aesni_float4
(
uint32
ctr0
,
uint32
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
uint32
key2
,
uint32
key3
,
float
&
rnd1
,
float
&
rnd2
,
float
&
rnd3
,
float
&
rnd4
)
{
__m128i
c128
=
_mm_set_epi32
(
ctr3
,
ctr2
,
ctr1
,
ctr0
);
__m128i
k128
=
_mm_set_epi32
(
key3
,
key2
,
key1
,
key0
);
c128
=
aesni1xm128i
(
c128
,
k128
);
uint32
r
[
4
];
_mm_storeu_si128
((
__m128i
*
)
&
r
[
0
],
c128
);
rnd1
=
r
[
0
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rnd2
=
r
[
1
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rnd3
=
r
[
2
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rnd4
=
r
[
3
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
}
\ No newline at end of file
pystencils/rng.py
View file @
d38aed46
...
...
@@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate
from
pystencils.backends.cbackend
import
CustomCodeNode
def
_get_
philox
_template
(
data_type
,
num_vars
):
def
_get_
rng
_template
(
name
,
data_type
,
num_vars
):
if
data_type
is
np
.
float32
:
c_type
=
"float"
elif
data_type
is
np
.
float64
:
...
...
@@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars):
template
=
"
\n
"
for
i
in
range
(
num_vars
):
template
+=
"{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};
\n
"
.
format
(
i
,
i
)
template
+=
(
"
philox
_{}{}({{parameters}}, "
+
", "
.
join
([
"{{result_symbols[{}].name}}"
]
*
num_vars
)
+
");
\n
"
)
\
.
format
(
c_type
,
num_vars
,
*
tuple
(
range
(
num_vars
)))
template
+=
(
"
{}
_{}{}({{parameters}}, "
+
", "
.
join
([
"{{result_symbols[{}].name}}"
]
*
num_vars
)
+
");
\n
"
)
\
.
format
(
name
,
c_type
,
num_vars
,
*
tuple
(
range
(
num_vars
)))
return
template
def
_get_
philox
_code
(
template
,
dialect
,
vector_instruction_set
,
time_step
,
offsets
,
keys
,
dim
,
result_symbols
):
def
_get_
rng
_code
(
template
,
dialect
,
vector_instruction_set
,
time_step
,
offsets
,
keys
,
dim
,
result_symbols
):
parameters
=
[
time_step
]
+
[
LoopOverCoordinate
.
get_loop_counter_symbol
(
i
)
+
offsets
[
i
]
for
i
in
range
(
dim
)]
+
list
(
keys
)
while
len
(
parameters
)
<
6
:
parameters
.
append
(
0
)
parameters
=
parameters
[:
6
]
assert
len
(
parameters
)
==
6
for
i
in
range
(
dim
)]
+
[
0
]
*
(
3
-
dim
)
+
list
(
keys
)
if
dialect
==
'cuda'
or
(
dialect
==
'c'
and
vector_instruction_set
is
None
):
return
template
.
format
(
parameters
=
', '
.
join
(
str
(
p
)
for
p
in
parameters
),
...
...
@@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse
raise
NotImplementedError
(
"Not yet implemented for this backend"
)
class
Philox
Base
(
CustomCodeNode
):
class
RNG
Base
(
CustomCodeNode
):
def
__init__
(
self
,
dim
,
time_step
=
TypedSymbol
(
"time_step"
,
np
.
uint32
),
offsets
=
(
0
,
0
,
0
),
keys
=
(
0
,
0
)):
def
__init__
(
self
,
dim
,
time_step
=
TypedSymbol
(
"time_step"
,
np
.
uint32
),
offsets
=
(
0
,
0
,
0
),
keys
=
None
):
if
keys
is
None
:
keys
=
(
0
,)
*
self
.
_num_keys
if
len
(
keys
)
!=
self
.
_num_keys
:
raise
ValueError
(
"Provided {} keys but need {}"
.
format
(
len
(
keys
),
self
.
_num_keys
))
if
len
(
offsets
)
!=
3
:
raise
ValueError
(
"Provided {} offsets but need {}"
.
format
(
len
(
offsets
),
3
))
self
.
result_symbols
=
tuple
(
TypedSymbol
(
sp
.
Dummy
().
name
,
self
.
_data_type
)
for
_
in
range
(
self
.
_num_vars
))
symbols_read
=
[
s
for
s
in
keys
if
isinstance
(
s
,
sp
.
Symbol
)]
super
().
__init__
(
""
,
symbols_read
=
symbols_read
,
symbols_defined
=
self
.
result_symbols
)
self
.
_time_step
=
time_step
self
.
_offsets
=
offsets
self
.
headers
=
[
'"
philox
_rand.h"'
]
self
.
headers
=
[
'"
{}
_rand.h"'
.
format
(
self
.
_name
)
]
self
.
keys
=
tuple
(
keys
)
self
.
_args
=
sp
.
sympify
((
dim
,
time_step
,
keys
))
self
.
_dim
=
dim
...
...
@@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode):
return
self
# nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def
get_code
(
self
,
dialect
,
vector_instruction_set
):
template
=
_get_
philox
_template
(
self
.
_data_type
,
self
.
_num_vars
)
return
_get_
philox
_code
(
template
,
dialect
,
vector_instruction_set
,
template
=
_get_
rng
_template
(
self
.
_name
,
self
.
_data_type
,
self
.
_num_vars
)
return
_get_
rng
_code
(
template
,
dialect
,
vector_instruction_set
,
self
.
_time_step
,
self
.
_offsets
,
self
.
keys
,
self
.
_dim
,
self
.
result_symbols
)
def
__repr__
(
self
):
return
(
", "
.
join
([
'{}'
]
*
self
.
_num_vars
)
+
" <- PhiloxRNG"
).
format
(
*
self
.
result_symbols
)
return
(
", "
.
join
([
'{}'
]
*
self
.
_num_vars
)
+
" <- {}RNG"
).
format
(
*
self
.
result_symbols
,
self
.
_name
.
capitalize
())
class
PhiloxTwoDoubles
(
RNGBase
):
_name
=
"philox"
_data_type
=
np
.
float64
_num_vars
=
2
_num_keys
=
2
class
PhiloxFourFloats
(
RNGBase
):
_name
=
"philox"
_data_type
=
np
.
float32
_num_vars
=
4
_num_keys
=
2
class
PhiloxTwoDoubles
(
PhiloxBase
):
class
AESNITwoDoubles
(
RNGBase
):
_name
=
"aesni"
_data_type
=
np
.
float64
_num_vars
=
2
_num_keys
=
4
class
PhiloxFourFloats
(
PhiloxBase
):
class
AESNIFourFloats
(
RNGBase
):
_name
=
"aesni"
_data_type
=
np
.
float32
_num_vars
=
4
_num_keys
=
4
def
random_symbol
(
assignment_list
,
seed
=
TypedSymbol
(
"seed"
,
np
.
uint32
),
rng_node
=
PhiloxTwoDoubles
,
*
args
,
**
kwargs
):
...
...
pystencils_tests/test_random.py
View file @
d38aed46
import
numpy
as
np
import
pystencils
as
ps
from
pystencils.rng
import
PhiloxFourFloats
,
PhiloxTwoDoubles
from
pystencils.rng
import
PhiloxFourFloats
,
PhiloxTwoDoubles
,
AESNIFourFloats
,
AESNITwoDoubles
# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
...
...
@@ -56,3 +56,49 @@ def test_philox_float():
float_reference
=
philox_reference
*
2.
**-
32
+
2.
**-
33
assert
(
np
.
allclose
(
arr
,
float_reference
,
rtol
=
0
,
atol
=
np
.
finfo
(
np
.
float32
).
eps
))
def
test_aesni_double
():
dh
=
ps
.
create_data_handling
((
2
,
2
),
default_ghost_layers
=
0
,
default_target
=
"cpu"
)
f
=
dh
.
add_array
(
"f"
,
values_per_cell
=
2
)
dh
.
fill
(
'f'
,
42.0
)
aesni_node
=
AESNITwoDoubles
(
dh
.
dim
)
assignments
=
[
aesni_node
,
ps
.
Assignment
(
f
(
0
),
aesni_node
.
result_symbols
[
0
]),
ps
.
Assignment
(
f
(
1
),
aesni_node
.
result_symbols
[
1
])]
kernel
=
ps
.
create_kernel
(
assignments
,
target
=
dh
.
default_target
).
compile
()
dh
.
all_to_gpu
()
dh
.
run_kernel
(
kernel
,
time_step
=
124
)
dh
.
all_to_cpu
()
arr
=
dh
.
gather_array
(
'f'
)
assert
np
.
logical_and
(
arr
<=
1.0
,
arr
>=
0
).
all
()
#x = aesni_reference[:,:,0::2]
#y = aesni_reference[:,:,1::2]
#z = x ^ y << (53 - 32)
#double_reference = z * 2.**-53 + 2.**-54
#assert(np.allclose(arr, double_reference, rtol=0, atol=np.finfo(np.float64).eps))
def
test_aesni_float
():
dh
=
ps
.
create_data_handling
((
2
,
2
),
default_ghost_layers
=
0
,
default_target
=
"cpu"
)
f
=
dh
.
add_array
(
"f"
,
values_per_cell
=
4
)
dh
.
fill
(
'f'
,
42.0
)
aesni_node
=
AESNIFourFloats
(
dh
.
dim
)
assignments
=
[
aesni_node
]
+
[
ps
.
Assignment
(
f
(
i
),
aesni_node
.
result_symbols
[
i
])
for
i
in
range
(
4
)]
kernel
=
ps
.
create_kernel
(
assignments
,
target
=
dh
.
default_target
).
compile
()
dh
.
all_to_gpu
()
dh
.
run_kernel
(
kernel
,
time_step
=
124
)
dh
.
all_to_cpu
()
arr
=
dh
.
gather_array
(
'f'
)
assert
np
.
logical_and
(
arr
<=
1.0
,
arr
>=
0
).
all
()
print
(
arr
)
#float_reference = aesni_reference * 2.**-32 + 2.**-33
#assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment