Skip to content
Snippets Groups Projects
Commit c6bb38c3 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

AES-NI: Use FMA if available

parent f47fabfa
1 merge request!46AES-NI vectorization improvements
Pipeline #17688 passed with stage
in 2 minutes and 40 seconds
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
#include <immintrin.h> // AVX* #include <immintrin.h> // AVX*
#else #else
#include <smmintrin.h> // SSE4 #include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif #endif
#include <cstdint> #include <cstdint>
...@@ -84,8 +87,12 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 ...@@ -84,8 +87,12 @@ QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
// convert uint64 to double // convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z); __m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE), _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE)); rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0)); rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result // store result
double rr[2]; double rr[2];
...@@ -107,8 +114,12 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3, ...@@ -107,8 +114,12 @@ QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
// convert uint32 to float // convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128); __m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f) // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT), _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT)); rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f)); rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result // store result
float r[4]; float r[4];
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment