Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sebastian Bindgen
pystencils
Commits
fa0a09a5
Commit
fa0a09a5
authored
Aug 14, 2019
by
Michael Kuron
Browse files
AES-NI RNG fully vectorized
parent
d38aed46
Changes
2
Hide whitespace changes
Inline
Side-by-side
pystencils/include/aesni_rand.h
View file @
fa0a09a5
...
...
@@ -27,10 +27,27 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
return
x
;
}
QUALIFIERS
double
_uniform_double_hq
(
uint32
x
,
uint32
y
)
QUALIFIERS
__m128
_my_cvtepu32_ps
(
const
__m128i
v
)
{
uint64
z
=
(
uint64
)
x
^
((
uint64
)
y
<<
(
53
-
32
));
return
z
*
TWOPOW53_INV_DOUBLE
+
(
TWOPOW53_INV_DOUBLE
/
2
.
0
);
#ifdef __AVX512F__
return
_mm_cvtepu32_ps
(
v
);
#else
__m128i
v2
=
_mm_srli_epi32
(
v
,
1
);
__m128i
v1
=
_mm_and_si128
(
v
,
_mm_set1_epi32
(
1
));
__m128
v2f
=
_mm_cvtepi32_ps
(
v2
);
__m128
v1f
=
_mm_cvtepi32_ps
(
v1
);
return
_mm_add_ps
(
_mm_add_ps
(
v2f
,
v2f
),
v1f
);
#endif
}
QUALIFIERS
__m128d
_my_cvtepu64_pd
(
const
__m128i
v
)
{
#ifdef __AVX512F__
return
_mm_cvtepu64_pd
(
v
);
#else
#warning need to implement _my_cvtepu64_pd
return
(
__m128d
)
v
;
#endif
}
...
...
@@ -41,11 +58,26 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
__m128i
c128
=
_mm_set_epi32
(
ctr3
,
ctr2
,
ctr1
,
ctr0
);
__m128i
k128
=
_mm_set_epi32
(
key3
,
key2
,
key1
,
key0
);
c128
=
aesni1xm128i
(
c128
,
k128
);
uint32
r
[
4
];
_mm_storeu_si128
((
__m128i
*
)
&
r
[
0
],
c128
);
__m128i
x
=
_mm_set_epi64x
((
uint64
)
r
[
2
],
(
uint64
)
r
[
0
]);
__m128i
y
=
_mm_set_epi64x
((
uint64
)
r
[
3
],
(
uint64
)
r
[
1
]);
rnd1
=
_uniform_double_hq
(
r
[
0
],
r
[
1
]);
rnd2
=
_uniform_double_hq
(
r
[
2
],
r
[
3
]);
__m128i
cnt
=
_mm_set_epi64x
(
53
-
32
,
53
-
32
);
y
=
_mm_sll_epi64
(
y
,
cnt
);
__m128i
z
=
_mm_xor_si128
(
x
,
y
);
__m128d
rs
=
_my_cvtepu64_pd
(
z
);
const
__m128d
tp53
=
_mm_set_pd1
(
TWOPOW53_INV_DOUBLE
);
const
__m128d
tp54
=
_mm_set_pd1
(
TWOPOW53_INV_DOUBLE
/
2
.
0
);
rs
=
_mm_mul_pd
(
rs
,
tp53
);
rs
=
_mm_add_pd
(
rs
,
tp54
);
double
rr
[
2
];
_mm_storeu_pd
(
rr
,
rs
);
rnd1
=
rr
[
0
];
rnd2
=
rr
[
1
];
}
...
...
@@ -56,11 +88,17 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
__m128i
c128
=
_mm_set_epi32
(
ctr3
,
ctr2
,
ctr1
,
ctr0
);
__m128i
k128
=
_mm_set_epi32
(
key3
,
key2
,
key1
,
key0
);
c128
=
aesni1xm128i
(
c128
,
k128
);
uint32
r
[
4
];
_mm_storeu_si128
((
__m128i
*
)
&
r
[
0
],
c128
);
rnd1
=
r
[
0
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rnd2
=
r
[
1
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rnd3
=
r
[
2
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rnd4
=
r
[
3
]
*
TWOPOW32_INV_FLOAT
+
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
__m128
rs
=
_my_cvtepu32_ps
(
c128
);
const
__m128
tp32
=
_mm_set_ps1
(
TWOPOW32_INV_FLOAT
);
const
__m128
tp33
=
_mm_set_ps1
(
TWOPOW32_INV_FLOAT
/
2
.
0
f
);
rs
=
_mm_mul_ps
(
rs
,
tp32
);
rs
=
_mm_add_ps
(
rs
,
tp33
);
float
r
[
4
];
_mm_storeu_ps
(
r
,
rs
);
rnd1
=
r
[
0
];
rnd2
=
r
[
1
];
rnd3
=
r
[
2
];
rnd4
=
r
[
3
];
}
\ No newline at end of file
pystencils_tests/test_random.py
View file @
fa0a09a5
...
...
@@ -98,7 +98,6 @@ def test_aesni_float():
dh
.
all_to_cpu
()
arr
=
dh
.
gather_array
(
'f'
)
assert
np
.
logical_and
(
arr
<=
1.0
,
arr
>=
0
).
all
()
print
(
arr
)
#float_reference = aesni_reference * 2.**-32 + 2.**-33
#assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment