Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Christoph Alt
pystencils
Commits
584b4255
Commit
584b4255
authored
May 26, 2021
by
Markus Holzer
Browse files
Merge branch 'opencl' into 'master'
OpenCL RNG See merge request
pycodegen/pystencils!247
parents
fce1a417
6209c0da
Changes
3
Hide whitespace changes
Inline
Side-by-side
pystencils/include/philox_rand.h
View file @
584b4255
#include <cstdint>
#ifndef __OPENCL_VERSION__
#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h> // SSE2
#endif
...
...
@@ -33,12 +32,15 @@
#ifdef __riscv_v
#include <riscv_vector.h>
#endif
#endif
#ifndef __CUDA_ARCH__
#ifdef __CUDA_ARCH__
#define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline
#else
#define QUALIFIERS inline
#include "myintrin.h"
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
...
...
@@ -48,8 +50,15 @@
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#ifdef __OPENCL_VERSION__
#include "opencl_stdint.h"
typedef
uint32_t
uint32
;
typedef
uint64_t
uint64
;
#else
#include <cstdint>
typedef
std
::
uint32_t
uint32
;
typedef
std
::
uint64_t
uint64
;
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef
svfloat32_t
svfloat32_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
...
...
@@ -67,6 +76,9 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*
hip
=
__mulhwu
(
a
,
b
);
return
a
*
b
;
#elif defined(__OPENCL_VERSION__)
*
hip
=
mul_hi
(
a
,
b
);
return
a
*
b
;
#else
uint64
product
=
((
uint64
)
a
)
*
((
uint64
)
b
);
*
hip
=
product
>>
32
;
...
...
@@ -106,7 +118,12 @@ QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
uint32
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
double
&
rnd1
,
double
&
rnd2
)
uint32
key0
,
uint32
key1
,
#ifdef __OPENCL_VERSION__
double
*
rnd1
,
double
*
rnd2
)
#else
double
&
rnd1
,
double
&
rnd2
)
#endif
{
uint32
key
[
2
]
=
{
key0
,
key1
};
uint32
ctr
[
4
]
=
{
ctr0
,
ctr1
,
ctr2
,
ctr3
};
...
...
@@ -121,14 +138,23 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 9
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 10
#ifdef __OPENCL_VERSION__
*
rnd1
=
_uniform_double_hq
(
ctr
[
0
],
ctr
[
1
]);
*
rnd2
=
_uniform_double_hq
(
ctr
[
2
],
ctr
[
3
]);
#else
rnd1
=
_uniform_double_hq
(
ctr
[
0
],
ctr
[
1
]);
rnd2
=
_uniform_double_hq
(
ctr
[
2
],
ctr
[
3
]);
#endif
}
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
uint32
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
#ifdef __OPENCL_VERSION__
float
*
rnd1
,
float
*
rnd2
,
float
*
rnd3
,
float
*
rnd4
)
#else
float
&
rnd1
,
float
&
rnd2
,
float
&
rnd3
,
float
&
rnd4
)
#endif
{
uint32
key
[
2
]
=
{
key0
,
key1
};
uint32
ctr
[
4
]
=
{
ctr0
,
ctr1
,
ctr2
,
ctr3
};
...
...
@@ -143,13 +169,20 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 9
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 10
#ifdef __OPENCL_VERSION__
*
rnd1
=
ctr
[
0
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
*
rnd2
=
ctr
[
1
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
*
rnd3
=
ctr
[
2
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
*
rnd4
=
ctr
[
3
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
#else
rnd1
=
ctr
[
0
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
rnd2
=
ctr
[
1
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
rnd3
=
ctr
[
2
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
rnd4
=
ctr
[
3
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2.0
f
);
#endif
}
#if
n
def
__CUDA_ARCH__
#if
!
def
ined(
__CUDA_ARCH__
) && !defined(__OPENCL_VERSION__)
#if defined(__SSE4_1__) || defined(_MSC_VER)
QUALIFIERS
void
_philox4x32round
(
__m128i
*
ctr
,
__m128i
*
key
)
{
...
...
@@ -318,13 +351,13 @@ QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned i
{
#ifndef _ARCH_PWR8
__vector
unsigned
int
lo0
=
vec_mul
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lo1
=
vec_mul
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
hi0
=
vec_mulhuw
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lo1
=
vec_mul
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
hi1
=
vec_mulhuw
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
#elif defined(_ARCH_PWR10)
__vector
unsigned
int
lo0
=
vec_mul
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lo1
=
vec_mul
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
hi0
=
vec_mulh
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lo1
=
vec_mul
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
hi1
=
vec_mulh
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
#else
__vector
unsigned
int
lohi0a
=
(
__vector
unsigned
int
)
vec_mule
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
...
...
@@ -675,8 +708,8 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
QUALIFIERS
void
_philox4x32round
(
svuint32x4_t
&
ctr
,
svuint32x2_t
&
key
)
{
svuint32_t
lo0
=
svmul_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
0
),
svdup_u32
(
PHILOX_M4x32_0
));
svuint32_t
lo1
=
svmul_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
2
),
svdup_u32
(
PHILOX_M4x32_1
));
svuint32_t
hi0
=
svmulh_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
0
),
svdup_u32
(
PHILOX_M4x32_0
));
svuint32_t
lo1
=
svmul_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
2
),
svdup_u32
(
PHILOX_M4x32_1
));
svuint32_t
hi1
=
svmulh_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
2
),
svdup_u32
(
PHILOX_M4x32_1
));
ctr
=
svset4_u32
(
ctr
,
0
,
sveor_u32_x
(
svptrue_b32
(),
sveor_u32_x
(
svptrue_b32
(),
hi1
,
svget4_u32
(
ctr
,
1
)),
svget2_u32
(
key
,
0
)));
...
...
@@ -827,8 +860,8 @@ QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32
vuint32m1_t
key0
,
vuint32m1_t
key1
)
{
vuint32m1_t
lo0
=
vmul_vv_u32m1
(
ctr0
,
vmv_v_x_u32m1
(
PHILOX_M4x32_0
,
vsetvlmax_e32m1
()),
vsetvlmax_e32m1
());
vuint32m1_t
lo1
=
vmul_vv_u32m1
(
ctr2
,
vmv_v_x_u32m1
(
PHILOX_M4x32_1
,
vsetvlmax_e32m1
()),
vsetvlmax_e32m1
());
vuint32m1_t
hi0
=
vmulhu_vv_u32m1
(
ctr0
,
vmv_v_x_u32m1
(
PHILOX_M4x32_0
,
vsetvlmax_e32m1
()),
vsetvlmax_e32m1
());
vuint32m1_t
lo1
=
vmul_vv_u32m1
(
ctr2
,
vmv_v_x_u32m1
(
PHILOX_M4x32_1
,
vsetvlmax_e32m1
()),
vsetvlmax_e32m1
());
vuint32m1_t
hi1
=
vmulhu_vv_u32m1
(
ctr2
,
vmv_v_x_u32m1
(
PHILOX_M4x32_1
,
vsetvlmax_e32m1
()),
vsetvlmax_e32m1
());
ctr0
=
vxor_vv_u32m1
(
vxor_vv_u32m1
(
hi1
,
ctr1
,
vsetvlmax_e32m1
()),
key0
,
vsetvlmax_e32m1
());
...
...
pystencils/rng.py
View file @
584b4255
...
...
@@ -53,8 +53,9 @@ class RNGBase(CustomCodeNode):
else
:
code
+=
f
"
{
vector_instruction_set
[
r
.
dtype
.
base_name
]
if
vector_instruction_set
else
r
.
dtype
}
"
+
\
f
"
{
r
.
name
}
;
\n
"
code
+=
(
self
.
_name
+
"("
+
", "
.
join
([
print_arg
(
a
)
for
a
in
self
.
args
]
+
[
r
.
name
for
r
in
self
.
result_symbols
])
+
");
\n
"
)
args
=
[
print_arg
(
a
)
for
a
in
self
.
args
]
+
\
[(
'&'
if
dialect
==
'opencl'
else
''
)
+
r
.
name
for
r
in
self
.
result_symbols
]
code
+=
(
self
.
_name
+
"("
+
", "
.
join
(
args
)
+
");
\n
"
)
return
code
def
__repr__
(
self
):
...
...
pystencils_tests/test_random.py
View file @
584b4255
...
...
@@ -21,12 +21,16 @@ if get_compiler_config()['os'] == 'windows':
instruction_sets
.
remove
(
'avx512'
)
@
pytest
.
mark
.
parametrize
(
'target,rng'
,
((
'cpu'
,
'philox'
),
(
'cpu'
,
'aesni'
),
(
'gpu'
,
'philox'
)))
@
pytest
.
mark
.
parametrize
(
'target,rng'
,
((
'cpu'
,
'philox'
),
(
'cpu'
,
'aesni'
),
(
'gpu'
,
'philox'
),
(
'opencl'
,
'philox'
)))
@
pytest
.
mark
.
parametrize
(
'precision'
,
(
'float'
,
'double'
))
@
pytest
.
mark
.
parametrize
(
'dtype'
,
(
'float'
,
'double'
))
def
test_rng
(
target
,
rng
,
precision
,
dtype
,
t
=
124
,
offsets
=
(
0
,
0
),
keys
=
(
0
,
0
),
offset_values
=
None
):
if
target
==
'gpu'
:
pytest
.
importorskip
(
'pycuda'
)
if
target
==
'opencl'
:
pytest
.
importorskip
(
'pyopencl'
)
from
pystencils.opencl.opencljit
import
init_globally
init_globally
()
if
instruction_sets
and
set
([
'neon'
,
'sve'
,
'vsx'
,
'rvv'
]).
intersection
(
instruction_sets
)
and
rng
==
'aesni'
:
pytest
.
xfail
(
'AES not yet implemented for this architecture'
)
if
rng
==
'aesni'
and
len
(
keys
)
==
2
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment