Commit fa0a09a5 authored by Michael Kuron's avatar Michael Kuron
Browse files

AES-NI RNG fully vectorized

parent d38aed46
......@@ -27,10 +27,27 @@ QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
return x;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
#ifdef __AVX512F__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i v)
{
#ifdef __AVX512F__
return _mm_cvtepu64_pd(v);
#else
#warning need to implement _my_cvtepu64_pd
return (__m128d) v;
#endif
}
......@@ -41,11 +58,26 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
uint32 r[4];
_mm_storeu_si128((__m128i*)&r[0], c128);
__m128i x = _mm_set_epi64x((uint64) r[2], (uint64) r[0]);
__m128i y = _mm_set_epi64x((uint64) r[3], (uint64) r[1]);
rnd1 = _uniform_double_hq(r[0], r[1]);
rnd2 = _uniform_double_hq(r[2], r[3]);
__m128i cnt = _mm_set_epi64x(53 - 32, 53 - 32);
y = _mm_sll_epi64(y, cnt);
__m128i z = _mm_xor_si128(x, y);
__m128d rs = _my_cvtepu64_pd(z);
const __m128d tp53 = _mm_set_pd1(TWOPOW53_INV_DOUBLE);
const __m128d tp54 = _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0);
rs = _mm_mul_pd(rs, tp53);
rs = _mm_add_pd(rs, tp54);
double rr[2];
_mm_storeu_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
......@@ -56,11 +88,17 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
uint32 r[4];
_mm_storeu_si128((__m128i*)&r[0], c128);
rnd1 = r[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = r[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = r[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = r[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
__m128 rs = _my_cvtepu32_ps(c128);
const __m128 tp32 = _mm_set_ps1(TWOPOW32_INV_FLOAT);
const __m128 tp33 = _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f);
rs = _mm_mul_ps(rs, tp32);
rs = _mm_add_ps(rs, tp33);
float r[4];
_mm_storeu_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
\ No newline at end of file
......@@ -98,7 +98,6 @@ def test_aesni_float():
dh.all_to_cpu()
arr = dh.gather_array('f')
assert np.logical_and(arr <= 1.0, arr >= 0).all()
print(arr)
#float_reference = aesni_reference * 2.**-32 + 2.**-33
#assert(np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment