Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Jonas Plewinski
pystencils
Commits
69f779ea
Commit
69f779ea
authored
Aug 15, 2019
by
Martin Bauer
Browse files
Merge branch 'philox' into 'master'
AES-NI Random Number Generator See merge request
pycodegen/pystencils!30
parents
83597528
170a7717
Changes
3
Hide whitespace changes
Inline
Side-by-side
pystencils/include/aesni_rand.h
0 → 100644
View file @
69f779ea
#if !defined(__AES__) || !defined(__SSE2__)
#error AES-NI and SSE2 need to be enabled
#endif
#include
<emmintrin.h>
// SSE2
#include
<wmmintrin.h>
// AES
#ifdef __AVX512VL__
#include
<immintrin.h>
// AVX*
#endif
#include
<cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef
std
::
uint32_t
uint32
;
typedef
std
::
uint64_t
uint64
;
QUALIFIERS
__m128i
aesni1xm128i
(
const
__m128i
&
in
,
const
__m128i
&
k
)
{
__m128i
x
=
_mm_xor_si128
(
k
,
in
);
x
=
_mm_aesenc_si128
(
x
,
k
);
// 1
x
=
_mm_aesenc_si128
(
x
,
k
);
// 2
x
=
_mm_aesenc_si128
(
x
,
k
);
// 3
x
=
_mm_aesenc_si128
(
x
,
k
);
// 4
x
=
_mm_aesenc_si128
(
x
,
k
);
// 5
x
=
_mm_aesenc_si128
(
x
,
k
);
// 6
x
=
_mm_aesenc_si128
(
x
,
k
);
// 7
x
=
_mm_aesenc_si128
(
x
,
k
);
// 8
x
=
_mm_aesenc_si128
(
x
,
k
);
// 9
x
=
_mm_aesenclast_si128
(
x
,
k
);
// 10
return
x
;
}
QUALIFIERS
__m128
_my_cvtepu32_ps
(
const
__m128i
v
)
{
#ifdef __AVX512VL__
return
_mm_cvtepu32_ps
(
v
);
#else
__m128i
v2
=
_mm_srli_epi32
(
v
,
1
);
__m128i
v1
=
_mm_and_si128
(
v
,
_mm_set1_epi32
(
1
));
__m128
v2f
=
_mm_cvtepi32_ps
(
v2
);
__m128
v1f
=
_mm_cvtepi32_ps
(
v1
);
return
_mm_add_ps
(
_mm_add_ps
(
v2f
,
v2f
),
v1f
);
#endif
}
QUALIFIERS
__m128d
_my_cvtepu64_pd
(
const
__m128i
x
)
{
#ifdef __AVX512VL__
return
_mm_cvtepu64_pd
(
x
);
#else
uint64
r
[
2
];
_mm_storeu_si128
((
__m128i
*
)
r
,
x
);
return
_mm_set_pd
((
double
)
r
[
1
],
(
double
)
r
[
0
]);
#endif
}
QUALIFIERS
void
aesni_double2
(
uint32
ctr0
,
uint32
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
uint32
key2
,
uint32
key3
,
double
&
rnd1
,
double
&
rnd2
)
{
// pack input and call AES
__m128i
c128
=
_mm_set_epi32
(
ctr3
,
ctr2
,
ctr1
,
ctr0
);
__m128i
k128
=
_mm_set_epi32
(
key3
,
key2
,
key1
,
key0
);
c128
=
aesni1xm128i
(
c128
,
k128
);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i
x
=
_mm_and_si128
(
c128
,
_mm_set_epi32
(
0
,
0xffffffff
,
0
,
0xffffffff
));
__m128i
y
=
_mm_and_si128
(
c128
,
_mm_set_epi32
(
0xffffffff
,
0
,
0xffffffff
,
0
));
y
=
_mm_srli_si128
(
y
,
4
);
// calculate z = x ^ y << (53 - 32))
__m128i
z
=
_mm_sll_epi64
(
y
,
_mm_set_epi64x
(
53
-
32
,
53
-
32
));
z
=
_mm_xor_si128
(
x
,
z
);
// convert uint64 to double
__m128d
rs
=
_my_cvtepu64_pd
(
z
);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs
=
_mm_mul_pd
(
rs
,
_mm_set_pd1
(
TWOPOW53_INV_DOUBLE
));
rs
=
_mm_add_pd
(
rs
,
_mm_set_pd1
(
TWOPOW53_INV_DOUBLE
/
2
.
0
));
// store result
double
rr
[
2
];
_mm_storeu_pd
(
rr
,
rs
);
rnd1
=
rr
[
0
];
rnd2
=
rr
[
1
];
}
QUALIFIERS
void
aesni_float4
(
uint32
ctr0
,
uint32
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
uint32
key2
,
uint32
key3
,
float
&
rnd1
,
float
&
rnd2
,
float
&
rnd3
,
float
&
rnd4
)
{
// pack input and call AES
__m128i
c128
=
_mm_set_epi32
(
ctr3
,
ctr2
,
ctr1
,
ctr0
);
__m128i
k128
=
_mm_set_epi32
(
key3
,
key2
,
key1
,
key0
);
c128
=
aesni1xm128i
(
c128
,
k128
);
// convert uint32 to float
__m128
rs
=
_my_cvtepu32_ps
(
c128
);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rs
=
_mm_mul_ps
(
rs
,
_mm_set_ps1
(
TWOPOW32_INV_FLOAT
));
rs
=
_mm_add_ps
(
rs
,
_mm_set_ps1
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
));
// store result
float
r
[
4
];
_mm_storeu_ps
(
r
,
rs
);
rnd1
=
r
[
0
];
rnd2
=
r
[
1
];
rnd3
=
r
[
2
];
rnd4
=
r
[
3
];
}
\ No newline at end of file
pystencils/rng.py
View file @
69f779ea
...
...
@@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate
from
pystencils.backends.cbackend
import
CustomCodeNode
def
_get_
philox
_template
(
data_type
,
num_vars
):
def
_get_
rng
_template
(
name
,
data_type
,
num_vars
):
if
data_type
is
np
.
float32
:
c_type
=
"float"
elif
data_type
is
np
.
float64
:
...
...
@@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars):
template
=
"
\n
"
for
i
in
range
(
num_vars
):
template
+=
"{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};
\n
"
.
format
(
i
,
i
)
template
+=
(
"
philox
_{}{}({{parameters}}, "
+
", "
.
join
([
"{{result_symbols[{}].name}}"
]
*
num_vars
)
+
");
\n
"
)
\
.
format
(
c_type
,
num_vars
,
*
tuple
(
range
(
num_vars
)))
template
+=
(
"
{}
_{}{}({{parameters}}, "
+
", "
.
join
([
"{{result_symbols[{}].name}}"
]
*
num_vars
)
+
");
\n
"
)
\
.
format
(
name
,
c_type
,
num_vars
,
*
tuple
(
range
(
num_vars
)))
return
template
def
_get_
philox
_code
(
template
,
dialect
,
vector_instruction_set
,
time_step
,
offsets
,
keys
,
dim
,
result_symbols
):
def
_get_
rng
_code
(
template
,
dialect
,
vector_instruction_set
,
time_step
,
offsets
,
keys
,
dim
,
result_symbols
):
parameters
=
[
time_step
]
+
[
LoopOverCoordinate
.
get_loop_counter_symbol
(
i
)
+
offsets
[
i
]
for
i
in
range
(
dim
)]
+
list
(
keys
)
while
len
(
parameters
)
<
6
:
parameters
.
append
(
0
)
parameters
=
parameters
[:
6
]
assert
len
(
parameters
)
==
6
for
i
in
range
(
dim
)]
+
[
0
]
*
(
3
-
dim
)
+
list
(
keys
)
if
dialect
==
'cuda'
or
(
dialect
==
'c'
and
vector_instruction_set
is
None
):
return
template
.
format
(
parameters
=
', '
.
join
(
str
(
p
)
for
p
in
parameters
),
...
...
@@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse
raise
NotImplementedError
(
"Not yet implemented for this backend"
)
class
Philox
Base
(
CustomCodeNode
):
class
RNG
Base
(
CustomCodeNode
):
def
__init__
(
self
,
dim
,
time_step
=
TypedSymbol
(
"time_step"
,
np
.
uint32
),
offsets
=
(
0
,
0
,
0
),
keys
=
(
0
,
0
)):
def
__init__
(
self
,
dim
,
time_step
=
TypedSymbol
(
"time_step"
,
np
.
uint32
),
offsets
=
(
0
,
0
,
0
),
keys
=
None
):
if
keys
is
None
:
keys
=
(
0
,)
*
self
.
_num_keys
if
len
(
keys
)
!=
self
.
_num_keys
:
raise
ValueError
(
"Provided {} keys but need {}"
.
format
(
len
(
keys
),
self
.
_num_keys
))
if
len
(
offsets
)
!=
3
:
raise
ValueError
(
"Provided {} offsets but need {}"
.
format
(
len
(
offsets
),
3
))
self
.
result_symbols
=
tuple
(
TypedSymbol
(
sp
.
Dummy
().
name
,
self
.
_data_type
)
for
_
in
range
(
self
.
_num_vars
))
symbols_read
=
[
s
for
s
in
keys
if
isinstance
(
s
,
sp
.
Symbol
)]
super
().
__init__
(
""
,
symbols_read
=
symbols_read
,
symbols_defined
=
self
.
result_symbols
)
self
.
_time_step
=
time_step
self
.
_offsets
=
offsets
self
.
headers
=
[
'"
philox
_rand.h"'
]
self
.
headers
=
[
'"
{}
_rand.h"'
.
format
(
self
.
_name
)
]
self
.
keys
=
tuple
(
keys
)
self
.
_args
=
sp
.
sympify
((
dim
,
time_step
,
keys
))
self
.
_dim
=
dim
...
...
@@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode):
return
self
# nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def
get_code
(
self
,
dialect
,
vector_instruction_set
):
template
=
_get_
philox
_template
(
self
.
_data_type
,
self
.
_num_vars
)
return
_get_
philox
_code
(
template
,
dialect
,
vector_instruction_set
,
self
.
_time_step
,
self
.
_offsets
,
self
.
keys
,
self
.
_dim
,
self
.
result_symbols
)
template
=
_get_
rng
_template
(
self
.
_name
,
self
.
_data_type
,
self
.
_num_vars
)
return
_get_
rng
_code
(
template
,
dialect
,
vector_instruction_set
,
self
.
_time_step
,
self
.
_offsets
,
self
.
keys
,
self
.
_dim
,
self
.
result_symbols
)
def
__repr__
(
self
):
return
(
", "
.
join
([
'{}'
]
*
self
.
_num_vars
)
+
" <- PhiloxRNG"
).
format
(
*
self
.
result_symbols
)
return
(
", "
.
join
([
'{}'
]
*
self
.
_num_vars
)
+
" <- {}RNG"
).
format
(
*
self
.
result_symbols
,
self
.
_name
.
capitalize
())
class
PhiloxTwoDoubles
(
RNGBase
):
_name
=
"philox"
_data_type
=
np
.
float64
_num_vars
=
2
_num_keys
=
2
class
PhiloxFourFloats
(
RNGBase
):
_name
=
"philox"
_data_type
=
np
.
float32
_num_vars
=
4
_num_keys
=
2
class
PhiloxTwoDoubles
(
PhiloxBase
):
class
AESNITwoDoubles
(
RNGBase
):
_name
=
"aesni"
_data_type
=
np
.
float64
_num_vars
=
2
_num_keys
=
4
class
PhiloxFourFloats
(
PhiloxBase
):
class
AESNIFourFloats
(
RNGBase
):
_name
=
"aesni"
_data_type
=
np
.
float32
_num_vars
=
4
_num_keys
=
4
def
random_symbol
(
assignment_list
,
seed
=
TypedSymbol
(
"seed"
,
np
.
uint32
),
rng_node
=
PhiloxTwoDoubles
,
*
args
,
**
kwargs
):
...
...
pystencils_tests/test_random.py
View file @
69f779ea
import
numpy
as
np
import
pystencils
as
ps
from
pystencils.rng
import
PhiloxFourFloats
,
PhiloxTwoDoubles
from
pystencils.rng
import
PhiloxFourFloats
,
PhiloxTwoDoubles
,
AESNIFourFloats
,
AESNITwoDoubles
# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
...
...
@@ -56,3 +56,39 @@ def test_philox_float():
float_reference
=
philox_reference
*
2.
**-
32
+
2.
**-
33
assert
(
np
.
allclose
(
arr
,
float_reference
,
rtol
=
0
,
atol
=
np
.
finfo
(
np
.
float32
).
eps
))
def
test_aesni_double
():
dh
=
ps
.
create_data_handling
((
2
,
2
),
default_ghost_layers
=
0
,
default_target
=
"cpu"
)
f
=
dh
.
add_array
(
"f"
,
values_per_cell
=
2
)
dh
.
fill
(
'f'
,
42.0
)
aesni_node
=
AESNITwoDoubles
(
dh
.
dim
)
assignments
=
[
aesni_node
,
ps
.
Assignment
(
f
(
0
),
aesni_node
.
result_symbols
[
0
]),
ps
.
Assignment
(
f
(
1
),
aesni_node
.
result_symbols
[
1
])]
kernel
=
ps
.
create_kernel
(
assignments
,
target
=
dh
.
default_target
).
compile
()
dh
.
all_to_gpu
()
dh
.
run_kernel
(
kernel
,
time_step
=
124
)
dh
.
all_to_cpu
()
arr
=
dh
.
gather_array
(
'f'
)
assert
np
.
logical_and
(
arr
<=
1.0
,
arr
>=
0
).
all
()
def
test_aesni_float
():
dh
=
ps
.
create_data_handling
((
2
,
2
),
default_ghost_layers
=
0
,
default_target
=
"cpu"
)
f
=
dh
.
add_array
(
"f"
,
values_per_cell
=
4
)
dh
.
fill
(
'f'
,
42.0
)
aesni_node
=
AESNIFourFloats
(
dh
.
dim
)
assignments
=
[
aesni_node
]
+
[
ps
.
Assignment
(
f
(
i
),
aesni_node
.
result_symbols
[
i
])
for
i
in
range
(
4
)]
kernel
=
ps
.
create_kernel
(
assignments
,
target
=
dh
.
default_target
).
compile
()
dh
.
all_to_gpu
()
dh
.
run_kernel
(
kernel
,
time_step
=
124
)
dh
.
all_to_cpu
()
arr
=
dh
.
gather_array
(
'f'
)
assert
np
.
logical_and
(
arr
<=
1.0
,
arr
>=
0
).
all
()
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment