Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
pycodegen
pystencils
Commits
896b4192
Commit
896b4192
authored
May 07, 2021
by
Michael Kuron
Browse files
Cherry-pick updated SVE Philox
parent
7e6be86e
Pipeline
#31981
passed with stage
in 71 minutes and 6 seconds
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
pystencils/include/philox_rand.h
View file @
896b4192
...
...
@@ -15,13 +15,8 @@
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if
def
ined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#ifdef
__ARM_FEATURE_SVE
#include <arm_sve.h>
typedef
svfloat32_t
svfloat32_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
typedef
svfloat64_t
svfloat64_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
typedef
svint32_t
svint32_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
typedef
svuint32_t
svuint32_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
typedef
svuint64_t
svuint64_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
#endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
...
...
@@ -52,6 +47,14 @@ typedef svuint64_t svuint64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_
typedef
std
::
uint32_t
uint32
;
typedef
std
::
uint64_t
uint64
;
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef
svfloat32_t
svfloat32_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
typedef
svfloat64_t
svfloat64_st
__attribute__
((
arm_sve_vector_bits
(
__ARM_FEATURE_SVE_BITS
)));
#elif defined(__ARM_FEATURE_SVE)
typedef
svfloat32_t
svfloat32_st
;
typedef
svfloat64_t
svfloat64_st
;
#endif
QUALIFIERS
uint32
mulhilo32
(
uint32
a
,
uint32
b
,
uint32
*
hip
)
{
...
...
@@ -664,28 +667,28 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
#endif
#if defined(__ARM_FEATURE_SVE)
&& defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
QUALIFIERS
void
_philox4x32round
(
svuint32
_st
*
ctr
,
svuint32
_st
*
key
)
#if defined(__ARM_FEATURE_SVE)
QUALIFIERS
void
_philox4x32round
(
svuint32
x4_t
&
ctr
,
svuint32
x2_t
&
key
)
{
svuint32_
s
t
lo0
=
svmul_u32_x
(
svptrue_b32
(),
ctr
[
0
]
,
svdup_u32
(
PHILOX_M4x32_0
));
svuint32_
s
t
lo1
=
svmul_u32_x
(
svptrue_b32
(),
ctr
[
2
]
,
svdup_u32
(
PHILOX_M4x32_1
));
svuint32_
s
t
hi0
=
svmulh_u32_x
(
svptrue_b32
(),
ctr
[
0
]
,
svdup_u32
(
PHILOX_M4x32_0
));
svuint32_
s
t
hi1
=
svmulh_u32_x
(
svptrue_b32
(),
ctr
[
2
]
,
svdup_u32
(
PHILOX_M4x32_1
));
ctr
[
0
]
=
sveor_u32_x
(
svptrue_b32
(),
sveor_u32_x
(
svptrue_b32
(),
hi1
,
ctr
[
1
]),
key
[
0
]
);
ctr
[
1
]
=
lo1
;
ctr
[
2
]
=
sveor_u32_x
(
svptrue_b32
(),
sveor_u32_x
(
svptrue_b32
(),
hi0
,
ctr
[
3
]),
key
[
1
]
);
ctr
[
3
]
=
lo0
;
svuint32_t
lo0
=
svmul_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
0
)
,
svdup_u32
(
PHILOX_M4x32_0
));
svuint32_t
lo1
=
svmul_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
2
)
,
svdup_u32
(
PHILOX_M4x32_1
));
svuint32_t
hi0
=
svmulh_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
0
)
,
svdup_u32
(
PHILOX_M4x32_0
));
svuint32_t
hi1
=
svmulh_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
2
)
,
svdup_u32
(
PHILOX_M4x32_1
));
ctr
=
svset4_u32
(
ctr
,
0
,
sveor_u32_x
(
svptrue_b32
(),
sveor_u32_x
(
svptrue_b32
(),
hi1
,
svget4_u32
(
ctr
,
1
)),
svget2_u32
(
key
,
0
))
);
ctr
=
svset4_u32
(
ctr
,
1
,
lo1
)
;
ctr
=
svset4_u32
(
ctr
,
2
,
sveor_u32_x
(
svptrue_b32
(),
sveor_u32_x
(
svptrue_b32
(),
hi0
,
svget4_u32
(
ctr
,
3
)),
svget2_u32
(
key
,
1
))
);
ctr
=
svset4_u32
(
ctr
,
3
,
lo0
)
;
}
QUALIFIERS
void
_philox4x32bumpkey
(
svuint32
_st
*
key
)
QUALIFIERS
void
_philox4x32bumpkey
(
svuint32
x2_t
&
key
)
{
key
[
0
]
=
svadd_u32_x
(
svptrue_b32
(),
key
[
0
]
,
svdup_u32
(
PHILOX_W32_0
));
key
[
1
]
=
svadd_u32_x
(
svptrue_b32
(),
key
[
1
]
,
svdup_u32
(
PHILOX_W32_1
));
key
=
svset2_u32
(
key
,
0
,
svadd_u32_x
(
svptrue_b32
(),
svget2_u32
(
key
,
0
)
,
svdup_u32
(
PHILOX_W32_0
))
)
;
key
=
svset2_u32
(
key
,
1
,
svadd_u32_x
(
svptrue_b32
(),
svget2_u32
(
key
,
1
)
,
svdup_u32
(
PHILOX_W32_1
))
)
;
}
template
<
bool
high
>
QUALIFIERS
svfloat64_
s
t
_uniform_double_hq
(
svuint32_
s
t
x
,
svuint32_
s
t
y
)
QUALIFIERS
svfloat64_t
_uniform_double_hq
(
svuint32_t
x
,
svuint32_t
y
)
{
// convert 32 to 64 bit
if
(
high
)
...
...
@@ -700,11 +703,11 @@ QUALIFIERS svfloat64_st _uniform_double_hq(svuint32_st x, svuint32_st y)
}
// calculate z = x ^ y << (53 - 32))
svuint64_
s
t
z
=
svlsl_n_u64_x
(
svptrue_b64
(),
svreinterpret_u64_u32
(
y
),
53
-
32
);
svuint64_t
z
=
svlsl_n_u64_x
(
svptrue_b64
(),
svreinterpret_u64_u32
(
y
),
53
-
32
);
z
=
sveor_u64_x
(
svptrue_b64
(),
svreinterpret_u64_u32
(
x
),
z
);
// convert uint64 to double
svfloat64_
s
t
rs
=
svcvt_f64_u64_x
(
svptrue_b64
(),
z
);
svfloat64_t
rs
=
svcvt_f64_u64_x
(
svptrue_b64
(),
z
);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs
=
svmad_f64_x
(
svptrue_b64
(),
rs
,
svdup_f64
(
TWOPOW53_INV_DOUBLE
),
svdup_f64
(
TWOPOW53_INV_DOUBLE
/
2.0
));
...
...
@@ -712,12 +715,12 @@ QUALIFIERS svfloat64_st _uniform_double_hq(svuint32_st x, svuint32_st y)
}
QUALIFIERS
void
philox_float4
(
svuint32_
s
t
ctr0
,
svuint32_
s
t
ctr1
,
svuint32_
s
t
ctr2
,
svuint32_
s
t
ctr3
,
QUALIFIERS
void
philox_float4
(
svuint32_t
ctr0
,
svuint32_t
ctr1
,
svuint32_t
ctr2
,
svuint32_t
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat32_st
&
rnd1
,
svfloat32_st
&
rnd2
,
svfloat32_st
&
rnd3
,
svfloat32_st
&
rnd4
)
{
svuint32_
s
t
key
[
2
]
=
{
svdup_u32
(
key0
),
svdup_u32
(
key1
)
}
;
svuint32_
s
t
ctr
[
4
]
=
{
ctr0
,
ctr1
,
ctr2
,
ctr3
}
;
svuint32
x2
_t
key
=
svcreate2_u32
(
svdup_u32
(
key0
),
svdup_u32
(
key1
)
)
;
svuint32
x4
_t
ctr
=
svcreate4_u32
(
ctr0
,
ctr1
,
ctr2
,
ctr3
)
;
_philox4x32round
(
ctr
,
key
);
// 1
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 2
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 3
...
...
@@ -730,10 +733,10 @@ QUALIFIERS void philox_float4(svuint32_st ctr0, svuint32_st ctr1, svuint32_st ct
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 10
// convert uint32 to float
rnd1
=
svcvt_f32_u32_x
(
svptrue_b32
(),
ctr
[
0
]
);
rnd2
=
svcvt_f32_u32_x
(
svptrue_b32
(),
ctr
[
1
]
);
rnd3
=
svcvt_f32_u32_x
(
svptrue_b32
(),
ctr
[
2
]
);
rnd4
=
svcvt_f32_u32_x
(
svptrue_b32
(),
ctr
[
3
]
);
rnd1
=
svcvt_f32_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
0
)
);
rnd2
=
svcvt_f32_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
1
)
);
rnd3
=
svcvt_f32_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
2
)
);
rnd4
=
svcvt_f32_u32_x
(
svptrue_b32
(),
svget4_u32
(
ctr
,
3
)
);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1
=
svmad_f32_x
(
svptrue_b32
(),
rnd1
,
svdup_f32
(
TWOPOW32_INV_FLOAT
),
svdup_f32
(
TWOPOW32_INV_FLOAT
/
2.0
));
rnd2
=
svmad_f32_x
(
svptrue_b32
(),
rnd2
,
svdup_f32
(
TWOPOW32_INV_FLOAT
),
svdup_f32
(
TWOPOW32_INV_FLOAT
/
2.0
));
...
...
@@ -742,12 +745,12 @@ QUALIFIERS void philox_float4(svuint32_st ctr0, svuint32_st ctr1, svuint32_st ct
}
QUALIFIERS
void
philox_double2
(
svuint32_
s
t
ctr0
,
svuint32_
s
t
ctr1
,
svuint32_
s
t
ctr2
,
svuint32_
s
t
ctr3
,
QUALIFIERS
void
philox_double2
(
svuint32_t
ctr0
,
svuint32_t
ctr1
,
svuint32_t
ctr2
,
svuint32_t
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat64_st
&
rnd1lo
,
svfloat64_st
&
rnd1hi
,
svfloat64_st
&
rnd2lo
,
svfloat64_st
&
rnd2hi
)
{
svuint32_
s
t
key
[
2
]
=
{
svdup_u32
(
key0
),
svdup_u32
(
key1
)
}
;
svuint32_
s
t
ctr
[
4
]
=
{
ctr0
,
ctr1
,
ctr2
,
ctr3
}
;
svuint32
x2
_t
key
=
svcreate2_u32
(
svdup_u32
(
key0
),
svdup_u32
(
key1
)
)
;
svuint32
x4
_t
ctr
=
svcreate4_u32
(
ctr0
,
ctr1
,
ctr2
,
ctr3
)
;
_philox4x32round
(
ctr
,
key
);
// 1
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 2
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 3
...
...
@@ -759,54 +762,54 @@ QUALIFIERS void philox_double2(svuint32_st ctr0, svuint32_st ctr1, svuint32_st c
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 9
_philox4x32bumpkey
(
key
);
_philox4x32round
(
ctr
,
key
);
// 10
rnd1lo
=
_uniform_double_hq
<
false
>
(
ctr
[
0
],
ctr
[
1
]
);
rnd1hi
=
_uniform_double_hq
<
true
>
(
ctr
[
0
],
ctr
[
1
]
);
rnd2lo
=
_uniform_double_hq
<
false
>
(
ctr
[
2
],
ctr
[
3
]
);
rnd2hi
=
_uniform_double_hq
<
true
>
(
ctr
[
2
],
ctr
[
3
]
);
rnd1lo
=
_uniform_double_hq
<
false
>
(
svget4_u32
(
ctr
,
0
),
svget4_u32
(
ctr
,
1
)
);
rnd1hi
=
_uniform_double_hq
<
true
>
(
svget4_u32
(
ctr
,
0
),
svget4_u32
(
ctr
,
1
)
);
rnd2lo
=
_uniform_double_hq
<
false
>
(
svget4_u32
(
ctr
,
2
),
svget4_u32
(
ctr
,
3
)
);
rnd2hi
=
_uniform_double_hq
<
true
>
(
svget4_u32
(
ctr
,
2
),
svget4_u32
(
ctr
,
3
)
);
}
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
svuint32_
s
t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
svuint32_t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat32_st
&
rnd1
,
svfloat32_st
&
rnd2
,
svfloat32_st
&
rnd3
,
svfloat32_st
&
rnd4
)
{
svuint32_
s
t
ctr0v
=
svdup_u32
(
ctr0
);
svuint32_
s
t
ctr2v
=
svdup_u32
(
ctr2
);
svuint32_
s
t
ctr3v
=
svdup_u32
(
ctr3
);
svuint32_t
ctr0v
=
svdup_u32
(
ctr0
);
svuint32_t
ctr2v
=
svdup_u32
(
ctr2
);
svuint32_t
ctr3v
=
svdup_u32
(
ctr3
);
philox_float4
(
ctr0v
,
ctr1
,
ctr2v
,
ctr3v
,
key0
,
key1
,
rnd1
,
rnd2
,
rnd3
,
rnd4
);
}
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
svint32_
s
t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
QUALIFIERS
void
philox_float4
(
uint32
ctr0
,
svint32_t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat32_st
&
rnd1
,
svfloat32_st
&
rnd2
,
svfloat32_st
&
rnd3
,
svfloat32_st
&
rnd4
)
{
philox_float4
(
ctr0
,
svreinterpret_u32_s32
(
ctr1
),
ctr2
,
ctr3
,
key0
,
key1
,
rnd1
,
rnd2
,
rnd3
,
rnd4
);
}
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
svuint32_
s
t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
svuint32_t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat64_st
&
rnd1lo
,
svfloat64_st
&
rnd1hi
,
svfloat64_st
&
rnd2lo
,
svfloat64_st
&
rnd2hi
)
{
svuint32_
s
t
ctr0v
=
svdup_u32
(
ctr0
);
svuint32_
s
t
ctr2v
=
svdup_u32
(
ctr2
);
svuint32_
s
t
ctr3v
=
svdup_u32
(
ctr3
);
svuint32_t
ctr0v
=
svdup_u32
(
ctr0
);
svuint32_t
ctr2v
=
svdup_u32
(
ctr2
);
svuint32_t
ctr3v
=
svdup_u32
(
ctr3
);
philox_double2
(
ctr0v
,
ctr1
,
ctr2v
,
ctr3v
,
key0
,
key1
,
rnd1lo
,
rnd1hi
,
rnd2lo
,
rnd2hi
);
}
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
svuint32_
s
t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
svuint32_t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat64_st
&
rnd1
,
svfloat64_st
&
rnd2
)
{
svuint32_
s
t
ctr0v
=
svdup_u32
(
ctr0
);
svuint32_
s
t
ctr2v
=
svdup_u32
(
ctr2
);
svuint32_
s
t
ctr3v
=
svdup_u32
(
ctr3
);
svuint32_t
ctr0v
=
svdup_u32
(
ctr0
);
svuint32_t
ctr2v
=
svdup_u32
(
ctr2
);
svuint32_t
ctr3v
=
svdup_u32
(
ctr3
);
svfloat64_st
ignore
;
philox_double2
(
ctr0v
,
ctr1
,
ctr2v
,
ctr3v
,
key0
,
key1
,
rnd1
,
ignore
,
rnd2
,
ignore
);
}
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
svint32_
s
t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
QUALIFIERS
void
philox_double2
(
uint32
ctr0
,
svint32_t
ctr1
,
uint32
ctr2
,
uint32
ctr3
,
uint32
key0
,
uint32
key1
,
svfloat64_st
&
rnd1
,
svfloat64_st
&
rnd2
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment