Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Jean-Noël Grad
pystencils
Commits
b075b723
Commit
b075b723
authored
Mar 29, 2021
by
Markus Holzer
Browse files
Merge branch 'ppc' into 'master'
Vectorization improvements See merge request
pycodegen/pystencils!228
parents
24dde405
8440bc6e
Changes
15
Hide whitespace changes
Inline
Side-by-side
pystencils/backends/arm_instruction_sets.py
View file @
b075b723
...
...
@@ -13,7 +13,10 @@ def get_argument_string(function_shortcut):
return
arg_string
def
get_vector_instruction_set_arm
(
data_type
=
'double'
,
instruction_set
=
'neon'
,
q_registers
=
True
):
def
get_vector_instruction_set_arm
(
data_type
=
'double'
,
instruction_set
=
'neon'
):
if
instruction_set
!=
'neon'
:
raise
NotImplementedError
(
instruction_set
)
base_names
=
{
'+'
:
'add[0, 1]'
,
'-'
:
'sub[0, 1]'
,
...
...
@@ -39,16 +42,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
'float'
:
32
,
'int'
:
32
}
if
q_registers
is
True
:
q_reg
=
'q'
width
=
128
//
bits
[
data_type
]
intwidth
=
128
//
bits
[
'int'
]
suffix
=
f
'q_f
{
bits
[
data_type
]
}
'
else
:
q_reg
=
''
width
=
64
//
bits
[
data_type
]
intwidth
=
64
//
bits
[
'int'
]
suffix
=
f
'_f
{
bits
[
data_type
]
}
'
width
=
128
//
bits
[
data_type
]
intwidth
=
128
//
bits
[
'int'
]
suffix
=
f
'q_f
{
bits
[
data_type
]
}
'
result
=
dict
()
...
...
@@ -60,10 +56,10 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
result
[
intrinsic_id
]
=
'v'
+
name
+
suffix
+
arg_string
result
[
'makeVecConst'
]
=
f
'vdup
{
q_reg
}
_n_f
{
bits
[
data_type
]
}
'
+
'({0})'
result
[
'makeVecConst'
]
=
f
'vdup
q
_n_f
{
bits
[
data_type
]
}
'
+
'({0})'
result
[
'makeVec'
]
=
f
'makeVec_f
{
bits
[
data_type
]
}
'
+
'('
+
", "
.
join
([
'{'
+
str
(
i
)
+
'}'
for
i
in
range
(
width
)])
+
\
')'
result
[
'makeVecConstInt'
]
=
f
'vdup
{
q_reg
}
_n_s
{
bits
[
"int"
]
}
'
+
'({0})'
result
[
'makeVecConstInt'
]
=
f
'vdup
q
_n_s
{
bits
[
"int"
]
}
'
+
'({0})'
result
[
'makeVecInt'
]
=
f
'makeVec_s
{
bits
[
"int"
]
}
'
+
'({0}, {1}, {2}, {3})'
result
[
'+int'
]
=
f
"vaddq_s
{
bits
[
'int'
]
}
"
+
"({0}, {1})"
...
...
@@ -77,10 +73,12 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
result
[
'bool'
]
=
f
'uint
{
bits
[
data_type
]
}
x
{
width
}
_t'
result
[
'headers'
]
=
[
'<arm_neon.h>'
,
'"arm_neon_helpers.h"'
]
result
[
'!='
]
=
f
'vmvn
{
q_reg
}
_u
{
bits
[
data_type
]
}
(
{
result
[
"=="
]
}
)'
result
[
'!='
]
=
f
'vmvn
q
_u
{
bits
[
data_type
]
}
(
{
result
[
"=="
]
}
)'
result
[
'&'
]
=
f
'vand
{
q_reg
}
_u
{
bits
[
data_type
]
}
'
+
'({0}, {1})'
result
[
'|'
]
=
f
'vorr
{
q_reg
}
_u
{
bits
[
data_type
]
}
'
+
'({0}, {1})'
result
[
'blendv'
]
=
f
'vbsl
{
q_reg
}
_f
{
bits
[
data_type
]
}
'
+
'({2}, {1}, {0})'
result
[
'&'
]
=
f
'vandq_u
{
bits
[
data_type
]
}
'
+
'({0}, {1})'
result
[
'|'
]
=
f
'vorrq_u
{
bits
[
data_type
]
}
'
+
'({0}, {1})'
result
[
'blendv'
]
=
f
'vbslq_f
{
bits
[
data_type
]
}
'
+
'({2}, {1}, {0})'
result
[
'any'
]
=
f
'vaddlvq_u8(vreinterpretq_u8_u
{
bits
[
data_type
]
}
({{0}})) > 0'
result
[
'all'
]
=
f
'vaddlvq_u8(vreinterpretq_u8_u
{
bits
[
data_type
]
}
({{0}})) == 16*0xff'
return
result
pystencils/backends/cbackend.py
View file @
b075b723
...
...
@@ -588,18 +588,17 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
return
self
.
instruction_set
[
'rsqrt'
].
format
(
self
.
_print
(
expr
.
args
[
0
]))
else
:
return
f
"(
{
self
.
_print
(
1
/
sp
.
sqrt
(
expr
.
args
[
0
]))
}
)"
elif
isinstance
(
expr
,
vec_any
):
expr_type
=
get_type_of_expression
(
expr
.
args
[
0
])
if
type
(
expr_type
)
is
not
VectorType
:
return
self
.
_print
(
expr
.
args
[
0
])
else
:
return
self
.
instruction_set
[
'any'
].
format
(
self
.
_print
(
expr
.
args
[
0
]))
elif
isinstance
(
expr
,
vec_all
):
elif
isinstance
(
expr
,
vec_any
)
or
isinstance
(
expr
,
vec_all
):
instr
=
'any'
if
isinstance
(
expr
,
vec_any
)
else
'all'
expr_type
=
get_type_of_expression
(
expr
.
args
[
0
])
if
type
(
expr_type
)
is
not
VectorType
:
return
self
.
_print
(
expr
.
args
[
0
])
else
:
return
self
.
instruction_set
[
'all'
].
format
(
self
.
_print
(
expr
.
args
[
0
]))
if
isinstance
(
expr
.
args
[
0
],
sp
.
Rel
):
op
=
expr
.
args
[
0
].
rel_op
if
(
instr
,
op
)
in
self
.
instruction_set
:
return
self
.
instruction_set
[(
instr
,
op
)].
format
(
*
[
self
.
_print
(
a
)
for
a
in
expr
.
args
[
0
].
args
])
return
self
.
instruction_set
[
instr
].
format
(
self
.
_print
(
expr
.
args
[
0
]))
return
super
(
VectorizedCustomSympyPrinter
,
self
).
_print_Function
(
expr
)
...
...
pystencils/backends/ppc_instruction_sets.py
0 → 100644
View file @
b075b723
def
get_argument_string
(
function_shortcut
):
args
=
function_shortcut
[
function_shortcut
.
index
(
'['
)
+
1
:
-
1
]
arg_string
=
"("
for
arg
in
args
.
split
(
","
):
arg
=
arg
.
strip
()
if
not
arg
:
continue
if
arg
in
(
'0'
,
'1'
,
'2'
,
'3'
,
'4'
,
'5'
):
arg_string
+=
"{"
+
arg
+
"},"
else
:
arg_string
+=
arg
+
","
arg_string
=
arg_string
[:
-
1
]
+
")"
return
arg_string
def
get_vector_instruction_set_ppc
(
data_type
=
'double'
,
instruction_set
=
'vsx'
):
if
instruction_set
!=
'vsx'
:
raise
NotImplementedError
(
instruction_set
)
base_names
=
{
'+'
:
'add[0, 1]'
,
'-'
:
'sub[0, 1]'
,
'*'
:
'mul[0, 1]'
,
'/'
:
'div[0, 1]'
,
'sqrt'
:
'sqrt[0]'
,
'rsqrt'
:
'rsqrte[0]'
,
# rsqrt is available too, but not on Clang
'loadU'
:
'xl[0x0, 0]'
,
'loadA'
:
'ld[0x0, 0]'
,
'storeU'
:
'xst[1, 0x0, 0]'
,
'storeA'
:
'st[1, 0x0, 0]'
,
'stream'
:
'stl[1, 0x0, 0]'
,
'abs'
:
'abs[0]'
,
'=='
:
'cmpeq[0, 1]'
,
'!='
:
'cmpne[0, 1]'
,
'<='
:
'cmple[0, 1]'
,
'<'
:
'cmplt[0, 1]'
,
'>='
:
'cmpge[0, 1]'
,
'>'
:
'cmpgt[0, 1]'
,
'&'
:
'and[0, 1]'
,
'|'
:
'or[0, 1]'
,
'blendv'
:
'sel[0, 1, 2]'
,
(
'any'
,
'=='
):
'any_eq[0, 1]'
,
(
'any'
,
'!='
):
'any_ne[0, 1]'
,
(
'any'
,
'<='
):
'any_le[0, 1]'
,
(
'any'
,
'<'
):
'any_lt[0, 1]'
,
(
'any'
,
'>='
):
'any_ge[0, 1]'
,
(
'any'
,
'>'
):
'any_gt[0, 1]'
,
(
'all'
,
'=='
):
'all_eq[0, 1]'
,
(
'all'
,
'!='
):
'all_ne[0, 1]'
,
(
'all'
,
'<='
):
'all_le[0, 1]'
,
(
'all'
,
'<'
):
'all_lt[0, 1]'
,
(
'all'
,
'>='
):
'all_ge[0, 1]'
,
(
'all'
,
'>'
):
'all_gt[0, 1]'
,
}
bits
=
{
'double'
:
64
,
'float'
:
32
,
'int'
:
32
}
width
=
128
//
bits
[
data_type
]
intwidth
=
128
//
bits
[
'int'
]
result
=
dict
()
for
intrinsic_id
,
function_shortcut
in
base_names
.
items
():
function_shortcut
=
function_shortcut
.
strip
()
name
=
function_shortcut
[:
function_shortcut
.
index
(
'['
)]
arg_string
=
get_argument_string
(
function_shortcut
)
result
[
intrinsic_id
]
=
'vec_'
+
name
+
arg_string
if
data_type
==
'double'
:
# Clang and XL C++ are missing these for doubles
result
[
'loadA'
]
=
'(__vector double)'
+
result
[
'loadA'
].
format
(
'(float*) {0}'
)
result
[
'storeA'
]
=
result
[
'storeA'
].
format
(
'(float*) {0}'
,
'(__vector float) {1}'
)
result
[
'stream'
]
=
result
[
'stream'
].
format
(
'(float*) {0}'
,
'(__vector float) {1}'
)
result
[
'+int'
]
=
"vec_add({0}, {1})"
result
[
'width'
]
=
width
result
[
'intwidth'
]
=
intwidth
result
[
data_type
]
=
f
'__vector
{
data_type
}
'
result
[
'int'
]
=
'__vector int'
result
[
'bool'
]
=
f
'__vector __bool
{
"long long"
if
data_type
==
"double"
else
"int"
}
'
result
[
'headers'
]
=
[
'<altivec.h>'
,
'"ppc_altivec_helpers.h"'
]
result
[
'makeVecConst'
]
=
'(('
+
result
[
data_type
]
+
'){{'
+
\
", "
.
join
([
'('
+
data_type
+
') {0}'
for
_
in
range
(
width
)])
+
'}})'
result
[
'makeVec'
]
=
'(('
+
result
[
data_type
]
+
'){{'
+
\
", "
.
join
([
'{'
+
data_type
+
'} {'
+
str
(
i
)
+
'}'
for
i
in
range
(
width
)])
+
'}})'
result
[
'makeVecConstInt'
]
=
'(('
+
result
[
'int'
]
+
'){{'
+
", "
.
join
([
'(int) {0}'
for
_
in
range
(
intwidth
)])
+
'}})'
result
[
'makeVecInt'
]
=
'(('
+
result
[
'int'
]
+
'){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
result
[
'any'
]
=
'vec_any_ne({0}, (('
+
result
[
'bool'
]
+
') {{'
+
", "
.
join
([
'0'
]
*
width
)
+
'}}))'
result
[
'all'
]
=
'vec_all_ne({0}, (('
+
result
[
'bool'
]
+
') {{'
+
", "
.
join
([
'0'
]
*
width
)
+
'}}))'
return
result
pystencils/backends/simd_instruction_sets.py
View file @
b075b723
...
...
@@ -2,19 +2,40 @@ import platform
from
pystencils.backends.x86_instruction_sets
import
get_vector_instruction_set_x86
from
pystencils.backends.arm_instruction_sets
import
get_vector_instruction_set_arm
from
pystencils.backends.ppc_instruction_sets
import
get_vector_instruction_set_ppc
def
get_vector_instruction_set
(
data_type
=
'double'
,
instruction_set
=
'avx'
,
q_registers
=
True
):
def
get_vector_instruction_set
(
data_type
=
'double'
,
instruction_set
=
'avx'
):
if
instruction_set
in
[
'neon'
,
'sve'
]:
return
get_vector_instruction_set_arm
(
data_type
,
instruction_set
,
q_registers
)
return
get_vector_instruction_set_arm
(
data_type
,
instruction_set
)
elif
instruction_set
in
[
'vsx'
]:
return
get_vector_instruction_set_ppc
(
data_type
,
instruction_set
)
else
:
return
get_vector_instruction_set_x86
(
data_type
,
instruction_set
)
_cache
=
None
def
get_supported_instruction_sets
():
"""List of supported instruction sets on current hardware, or None if query failed."""
global
_cache
if
_cache
is
not
None
:
return
_cache
.
copy
()
if
platform
.
system
()
==
'Darwin'
and
platform
.
machine
()
==
'arm64'
:
# not supported by cpuinfo
return
[
'neon'
]
elif
platform
.
machine
().
startswith
(
'ppc64'
):
# no flags reported by cpuinfo
import
subprocess
import
tempfile
from
pystencils.cpu.cpujit
import
get_compiler_config
f
=
tempfile
.
NamedTemporaryFile
(
suffix
=
'.cpp'
)
command
=
[
get_compiler_config
()[
'command'
],
'-mcpu=native'
,
'-dM'
,
'-E'
,
f
.
name
]
macros
=
subprocess
.
check_output
(
command
,
input
=
''
,
text
=
True
)
if
'#define __VSX__'
in
macros
and
'#define __ALTIVEC__'
in
macros
:
_cache
=
[
'vsx'
]
else
:
_cache
=
[]
return
_cache
.
copy
()
try
:
from
cpuinfo
import
get_cpu_info
except
ImportError
:
...
...
pystencils/backends/x86_instruction_sets.py
View file @
b075b723
...
...
@@ -137,11 +137,11 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result
[
'double'
]
=
f
"__m
{
bit_width
}
d"
result
[
'float'
]
=
f
"__m
{
bit_width
}
"
result
[
'int'
]
=
f
"__m
{
bit_width
}
i"
result
[
'bool'
]
=
f
"__m
{
bit_width
}
d"
result
[
'bool'
]
=
result
[
data_type
]
result
[
'headers'
]
=
headers
[
instruction_set
]
result
[
'any'
]
=
f
"
{
pre
}
_movemask_
{
suf
}
({{0}}) > 0"
result
[
'all'
]
=
f
"
{
pre
}
_movemask_
{
suf
}
({{0}}) ==
0xF
"
result
[
'all'
]
=
f
"
{
pre
}
_movemask_
{
suf
}
({{0}}) ==
{
hex
(
2
**
result
[
'width'
]
-
1
)
}
"
if
instruction_set
==
'avx512'
:
size
=
8
if
data_type
==
'double'
else
16
...
...
pystencils/cpu/cpujit.py
View file @
b075b723
...
...
@@ -155,6 +155,9 @@ def read_config():
(
'flags'
,
'-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'
),
(
'restrict_qualifier'
,
'__restrict__'
)
])
if
platform
.
machine
().
startswith
(
'ppc64'
):
default_compiler_config
[
'flags'
]
=
default_compiler_config
[
'flags'
].
replace
(
'-march=native'
,
'-mcpu=native'
)
elif
platform
.
system
().
lower
()
==
'windows'
:
default_compiler_config
=
OrderedDict
([
(
'os'
,
'windows'
),
...
...
pystencils/include/aesni_rand.h
View file @
b075b723
...
...
@@ -21,6 +21,36 @@
typedef
std
::
uint32_t
uint32
;
typedef
std
::
uint64_t
uint64
;
template
<
typename
T
,
std
::
size_t
Alignment
>
class
AlignedAllocator
{
public:
typedef
T
value_type
;
template
<
typename
U
>
struct
rebind
{
typedef
AlignedAllocator
<
U
,
Alignment
>
other
;
};
T
*
allocate
(
const
std
::
size_t
n
)
const
{
if
(
n
==
0
)
{
return
nullptr
;
}
void
*
const
p
=
_mm_malloc
(
n
*
sizeof
(
T
),
Alignment
);
if
(
p
==
nullptr
)
{
throw
std
::
bad_alloc
();
}
return
static_cast
<
T
*>
(
p
);
}
void
deallocate
(
T
*
const
p
,
const
std
::
size_t
n
)
const
{
_mm_free
(
p
);
}
};
template
<
typename
Key
,
typename
T
>
using
AlignedMap
=
std
::
map
<
Key
,
T
,
std
::
less
<
Key
>
,
AlignedAllocator
<
std
::
pair
<
const
Key
,
T
>
,
sizeof
(
Key
)
>>
;
#if defined(__AES__) || defined(_MSC_VER)
QUALIFIERS
__m128i
aesni_keygen_assist
(
__m128i
temp1
,
__m128i
temp2
)
{
__m128i
temp3
;
...
...
@@ -85,10 +115,10 @@ QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
}
QUALIFIERS
const
std
::
array
<
__m128i
,
11
>
&
aesni_roundkeys
(
const
__m128i
&
k128
)
{
std
::
array
<
uint32
,
4
>
a
;
_mm_store
u
_si128
((
__m128i
*
)
a
.
data
(),
k128
);
alignas
(
16
)
std
::
array
<
uint32
,
4
>
a
;
_mm_store_si128
((
__m128i
*
)
a
.
data
(),
k128
);
static
std
::
m
ap
<
std
::
array
<
uint32
,
4
>
,
std
::
array
<
__m128i
,
11
>>
roundkeys
;
static
AlignedM
ap
<
std
::
array
<
uint32
,
4
>
,
std
::
array
<
__m128i
,
11
>>
roundkeys
;
if
(
roundkeys
.
find
(
a
)
==
roundkeys
.
end
())
{
auto
rk
=
aesni_keygen
(
k128
);
...
...
@@ -308,10 +338,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr
#ifdef __AVX2__
QUALIFIERS
const
std
::
array
<
__m256i
,
11
>
&
aesni_roundkeys
(
const
__m256i
&
k256
)
{
std
::
array
<
uint32
,
8
>
a
;
_mm256_store
u
_si256
((
__m256i
*
)
a
.
data
(),
k256
);
alignas
(
32
)
std
::
array
<
uint32
,
8
>
a
;
_mm256_store_si256
((
__m256i
*
)
a
.
data
(),
k256
);
static
std
::
m
ap
<
std
::
array
<
uint32
,
8
>
,
std
::
array
<
__m256i
,
11
>>
roundkeys
;
static
AlignedM
ap
<
std
::
array
<
uint32
,
8
>
,
std
::
array
<
__m256i
,
11
>>
roundkeys
;
if
(
roundkeys
.
find
(
a
)
==
roundkeys
.
end
())
{
auto
rk1
=
aesni_keygen
(
_mm256_extractf128_si256
(
k256
,
0
));
...
...
@@ -523,10 +553,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr
#ifdef __AVX512F__
QUALIFIERS
const
std
::
array
<
__m512i
,
11
>
&
aesni_roundkeys
(
const
__m512i
&
k512
)
{
std
::
array
<
uint32
,
16
>
a
;
_mm512_store
u
_si512
((
__m512i
*
)
a
.
data
(),
k512
);
alignas
(
64
)
std
::
array
<
uint32
,
16
>
a
;
_mm512_store_si512
((
__m512i
*
)
a
.
data
(),
k512
);
static
std
::
m
ap
<
std
::
array
<
uint32
,
16
>
,
std
::
array
<
__m512i
,
11
>>
roundkeys
;
static
AlignedM
ap
<
std
::
array
<
uint32
,
16
>
,
std
::
array
<
__m512i
,
11
>>
roundkeys
;
if
(
roundkeys
.
find
(
a
)
==
roundkeys
.
end
())
{
auto
rk1
=
aesni_keygen
(
_mm512_extracti32x4_epi32
(
k512
,
0
));
...
...
@@ -553,7 +583,7 @@ QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
x
=
_mm512_aesenc_epi128
(
x
,
k
[
7
]);
x
=
_mm512_aesenc_epi128
(
x
,
k
[
8
]);
x
=
_mm512_aesenc_epi128
(
x
,
k
[
9
]);
x
=
_mm512_aesenclast_epi128
(
x
[
10
]
,
k
);
x
=
_mm512_aesenclast_epi128
(
x
,
k
[
10
]);
#else
__m128i
a
=
aesni1xm128i
(
_mm512_extracti32x4_epi32
(
in
,
0
),
_mm512_extracti32x4_epi32
(
k0
,
0
));
__m128i
b
=
aesni1xm128i
(
_mm512_extracti32x4_epi32
(
in
,
1
),
_mm512_extracti32x4_epi32
(
k0
,
1
));
...
...
pystencils/include/philox_rand.h
View file @
b075b723
...
...
@@ -16,6 +16,17 @@
#include
<arm_neon.h>
#endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
#include
<ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__
#include
<altivec.h>
#undef bool
#ifndef _ARCH_PWR8
#include
<pveclib/vec_int64_ppc.h>
#endif
#endif
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#include
"myintrin.h"
...
...
@@ -38,9 +49,14 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*
hip
=
__mulhwu
(
a
,
b
);
return
a
*
b
;
#else
uint64
product
=
((
uint64
)
a
)
*
((
uint64
)
b
);
*
hip
=
product
>>
32
;
return
(
uint32
)
product
;
#endif
#else
// device code
*
hip
=
__umulhi
(
a
,
b
);
...
...
@@ -281,6 +297,209 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
}
#endif
#ifdef __ALTIVEC__
QUALIFIERS
void
_philox4x32round
(
__vector
unsigned
int
*
ctr
,
__vector
unsigned
int
*
key
)
{
#ifndef _ARCH_PWR8
__vector
unsigned
int
lo0
=
vec_mul
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lo1
=
vec_mul
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
hi0
=
vec_mulhuw
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
hi1
=
vec_mulhuw
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
#elif defined(_ARCH_PWR10)
__vector
unsigned
int
lo0
=
vec_mul
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lo1
=
vec_mul
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
hi0
=
vec_mulh
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
hi1
=
vec_mulh
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
#else
__vector
unsigned
int
lohi0a
=
(
__vector
unsigned
int
)
vec_mule
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lohi0b
=
(
__vector
unsigned
int
)
vec_mulo
(
ctr
[
0
],
vec_splats
(
PHILOX_M4x32_0
));
__vector
unsigned
int
lohi1a
=
(
__vector
unsigned
int
)
vec_mule
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
__vector
unsigned
int
lohi1b
=
(
__vector
unsigned
int
)
vec_mulo
(
ctr
[
2
],
vec_splats
(
PHILOX_M4x32_1
));
#ifdef __LITTLE_ENDIAN__
__vector
unsigned
int
lo0
=
vec_mergee
(
lohi0a
,
lohi0b
);
__vector
unsigned
int
lo1
=
vec_mergee
(
lohi1a
,
lohi1b
);
__vector
unsigned
int
hi0
=
vec_mergeo
(
lohi0a
,
lohi0b
);
__vector
unsigned
int
hi1
=
vec_mergeo
(
lohi1a
,
lohi1b
);
#else
__vector
unsigned
int
lo0
=
vec_mergeo
(
lohi0a
,
lohi0b
);
__vector
unsigned
int
lo1
=
vec_mergeo
(
lohi1a
,
lohi1b
);
__vector
unsigned
int
hi0
=
vec_mergee
(
lohi0a
,
lohi0b
);
__vector
unsigned
int
hi1
=
vec_mergee
(
lohi1a
,
lohi1b
);
#endif
#endif
ctr
[
0
]
=
vec_xor
(
vec_xor
(
hi1
,
ctr
[
1
]),
key
[
0
]);
ctr
[
1
]
=
lo1
;
ctr
[
2
]
=
vec_xor
(
vec_xor
(
hi0
,
ctr
[
3
]),
key
[
1
]);
ctr
[
3
]
=
lo0
;
}
QUALIFIERS
void
_philox4x32bumpkey
(
__vector
unsigned
int
*
key
)
{
key
[
0
]
=
vec_add
(
key
[
0
],
vec_splats
(
PHILOX_W32_0
));
key
[
1
]
=
vec_add
(
key
[
1
],
vec_splats
(
PHILOX_W32_1
));
}
#ifdef __VSX__
template
<
bool
high
>
QUALIFIERS
__vector
double
_uniform_double_hq
(
__vector
unsigned
int
x
,
__vector
unsigned
int
y
)
{
// convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
if
(
high
)
{
x
=
vec_mergel
(
x
,
vec_splats
(
0U
));
y
=
vec_mergel
(
y
,
vec_splats
(
0U
));
}
else
{
x
=
vec_mergeh
(
x
,
vec_splats
(
0U
));
y
=
vec_mergeh
(
y
,
vec_splats
(
0U
));
}
#else
if
(
high
)
{
x
=
vec_mergel
(
vec_splats
(
0U
),
x
);
y
=
vec_mergel
(
vec_splats
(
0U
),
y
);
}
else
{
x
=
vec_mergeh
(
vec_splats
(
0U
),
x
);
y
=
vec_mergeh
(
vec_splats
(
0U
),
y
);
}
#endif
// calculate z = x ^ y << (53 - 32))
#ifdef _ARCH_PWR8
__vector
unsigned
long
long
z
=
vec_sl
((
__vector
unsigned
long
long
)
y
,
vec_splats
(
53ULL
-
32ULL
));
#else
__vector
unsigned
long
long
z
=
vec_vsld
((
__vector
unsigned
long
long
)
y
,
vec_splats
(
53ULL
-
32ULL
));
#endif
z
=
vec_xor
((
__vector
unsigned
long
long
)
x
,
z
);
// convert uint64 to double
#ifdef __xlC__
__vector
double
rs
=
vec_ctd
(
z
,
0
);
#else
__vector
double
rs
=
vec_ctf
(
z
,
0
);
#endif
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs
=
vec_madd
(
rs
,
vec_splats
(
TWOPOW53_INV_DOUBLE
),
vec_splats
(
TWOPOW53_INV_DOUBLE
/
2.0
));
return
rs
;
}
#endif
QUALIFIERS
void
philox_float4
(
__vector
unsigned
int
ctr0
,
__vector
unsigned
int
ctr1
,
__vector
unsigned
int
ctr2
,
__vector
unsigned
int
ctr3
,
uint32
key0
,
uint32
key1
,
__vector
float
&
rnd1
,
__vector
float
&
rnd2
,
__vector
float
&
rnd3
,
__vector
float
&
rnd4
)
{
__vector
unsigned
int
key
[
2
]
=
{
vec_splats
(
key0
),
vec_splats
(
key1
)};
__vector
unsigned
int
ctr
[
4
]
=
{
ctr0
,
ctr1
,
ctr2
,
ctr3
};
_philox4x32round
(
ctr
,
key
);
// 1
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 2
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 3
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 4
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 5
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 6
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 7
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 8
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 9
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 10
// convert uint32 to float
rnd1
=
vec_ctf
(
ctr
[
0
],
0
);
rnd2
=
vec_ctf
(
ctr
[
1
],
0
);
rnd3
=
vec_ctf
(
ctr
[
2
],
0
);
rnd4
=
vec_ctf
(
ctr
[
3
],
0
);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1
=
vec_madd
(
rnd1
,
vec_splats
(
TWOPOW32_INV_FLOAT
),
vec_splats
(
TWOPOW32_INV_FLOAT
/
2.0
f
));
rnd2
=
vec_madd
(
rnd2
,
vec_splats
(
TWOPOW32_INV_FLOAT
),
vec_splats
(
TWOPOW32_INV_FLOAT
/
2.0
f
));
rnd3
=
vec_madd
(
rnd3
,
vec_splats
(
TWOPOW32_INV_FLOAT
),
vec_splats
(
TWOPOW32_INV_FLOAT
/
2.0
f
));
rnd4
=
vec_madd
(
rnd4
,
vec_splats
(
TWOPOW32_INV_FLOAT
),
vec_splats
(
TWOPOW32_INV_FLOAT
/
2.0
f
));
}
#ifdef __VSX__
QUALIFIERS
void
philox_double2
(
__vector
unsigned
int
ctr0
,
__vector
unsigned
int
ctr1
,
__vector
unsigned
int
ctr2
,
__vector
unsigned
int
ctr3
,
uint32
key0
,
uint32
key1
,
__vector
double
&
rnd1lo
,
__vector
double
&
rnd1hi
,
__vector
double
&
rnd2lo
,
__vector
double
&
rnd2hi
)
{
__vector
unsigned
int
key
[
2
]
=
{
vec_splats
(
key0
),
vec_splats
(
key1
)};
__vector
unsigned
int
ctr
[
4
]
=
{
ctr0
,
ctr1
,
ctr2
,
ctr3
};
_philox4x32round
(
ctr
,
key
);
// 1
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 2
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 3
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 4
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 5
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 6
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 7
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 8
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 9
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 10
rnd1lo
=
_uniform_double_hq
<
false
>
(
ctr
[
0
],
ctr
[
1
]);
rnd1hi
=
_uniform_double_hq
<
true
>
(
ctr
[
0
],
ctr
[
1
]);
rnd2lo
=
_uniform_double_hq
<
false
>
(
ctr
[
2
],
ctr
[
3
]);
rnd2hi
=
_uniform_double_hq
<
true
>
(
ctr
[
2
],
ctr
[
3
]);
}
#endif
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
__vector
unsigned
int
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
__vector
float
&
rnd1
,
__vector
float
&
rnd2
,
__vector
float
&
rnd3
,
__vector
float
&
rnd4
)
{
__vector
unsigned
int
ctr0v
=
vec_splats
(
ctr0
);
__vector
unsigned
int
ctr2v
=
vec_splats
(
ctr2
);
__vector
unsigned
int
ctr3v
=
vec_splats
(
ctr3
);
philox_float4
(
ctr0v
,
ctr1
,
ctr2v
,
ctr3v
,
key0
,
key1
,
rnd1
,
rnd2
,
rnd3
,
rnd4
);
}
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
__vector
int
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
__vector
float
&
rnd1
,
__vector
float
&
rnd2
,
__vector
float
&
rnd3
,
__vector
float
&
rnd4
)
{
philox_float4
(
ctr0
,
(
__vector
unsigned
int
)
ctr1
,
ctr2
,
ctr3
,
key0
,
key1
,
rnd1
,
rnd2
,
rnd3
,
rnd4
);
}
#ifdef __VSX__
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
__vector
unsigned
int
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
__vector
double
&
rnd1lo
,
__vector
double
&
rnd1hi
,
__vector
double
&
rnd2lo
,
__vector
double
&
rnd2hi
)
{
__vector
unsigned
int
ctr0v
=
vec_splats
(
ctr0
);
__vector
unsigned
int
ctr2v
=
vec_splats
(
ctr2
);
__vector
unsigned
int
ctr3v
=
vec_splats
(
ctr3
);
philox_double2
(
ctr0v
,
ctr1
,
ctr2v
,
ctr3v
,
key0
,
key1
,
rnd1lo
,
rnd1hi
,
rnd2lo
,
rnd2hi
);
}
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
__vector
unsigned
int
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
__vector
double
&
rnd1
,
__vector
double
&
rnd2
)
{
__vector
unsigned
int
ctr0v
=
vec_splats
(
ctr0
);
__vector
unsigned
int
ctr2v
=
vec_splats
(
ctr2
);
__vector
unsigned
int
ctr3v
=
vec_splats
(
ctr3
);
__vector
double
ignore
;
philox_double2
(
ctr0v
,
ctr1
,
ctr2v
,
ctr3v
,
key0
,
key1
,
rnd1
,
ignore
,
rnd2
,
ignore
);
}