From 8440bc6e50264a3f69df0347a0cbbf213603f77d Mon Sep 17 00:00:00 2001 From: Michael Kuron <mkuron@icp.uni-stuttgart.de> Date: Mon, 29 Mar 2021 20:31:21 +0000 Subject: [PATCH] Vectorization improvements --- pystencils/backends/arm_instruction_sets.py | 32 ++- pystencils/backends/cbackend.py | 15 +- pystencils/backends/ppc_instruction_sets.py | 101 +++++++++ pystencils/backends/simd_instruction_sets.py | 25 ++- pystencils/backends/x86_instruction_sets.py | 4 +- pystencils/cpu/cpujit.py | 3 + pystencils/include/aesni_rand.h | 50 ++++- pystencils/include/philox_rand.h | 219 +++++++++++++++++++ pystencils/include/ppc_altivec_helpers.h | 3 + pystencils/llvm/llvmjit.py | 9 +- pystencils_tests/test_conditional_vec.py | 48 ++-- pystencils_tests/test_llvm.py | 1 + pystencils_tests/test_random.py | 8 +- pystencils_tests/test_vectorization.py | 2 +- pytest.ini | 2 +- 15 files changed, 451 insertions(+), 71 deletions(-) create mode 100644 pystencils/backends/ppc_instruction_sets.py create mode 100644 pystencils/include/ppc_altivec_helpers.h diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py index 26f61e909..201451fb8 100644 --- a/pystencils/backends/arm_instruction_sets.py +++ b/pystencils/backends/arm_instruction_sets.py @@ -13,7 +13,10 @@ def get_argument_string(function_shortcut): return arg_string -def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True): +def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): + if instruction_set != 'neon': + raise NotImplementedError(instruction_set) + base_names = { '+': 'add[0, 1]', '-': 'sub[0, 1]', @@ -39,16 +42,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q 'float': 32, 'int': 32} - if q_registers is True: - q_reg = 'q' - width = 128 // bits[data_type] - intwidth = 128 // bits['int'] - suffix = f'q_f{bits[data_type]}' - else: - q_reg = '' - width = 64 // bits[data_type] - intwidth = 64 // bits['int'] - suffix = f'_f{bits[data_type]}' + width = 128 // bits[data_type] + intwidth = 128 // bits['int'] + suffix = f'q_f{bits[data_type]}' result = dict() @@ -60,10 +56,10 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q result[intrinsic_id] = 'v' + name + suffix + arg_string - result['makeVecConst'] = f'vdup{q_reg}_n_f{bits[data_type]}' + '({0})' + result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})' result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in range(width)]) + \ ')' - result['makeVecConstInt'] = f'vdup{q_reg}_n_s{bits["int"]}' + '({0})' + result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})' result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})' result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})" @@ -77,10 +73,12 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q result['bool'] = f'uint{bits[data_type]}x{width}_t' result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"'] - result['!='] = f'vmvn{q_reg}_u{bits[data_type]}({result["=="]})' + result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})' - result['&'] = f'vand{q_reg}_u{bits[data_type]}' + '({0}, {1})' - result['|'] = f'vorr{q_reg}_u{bits[data_type]}' + '({0}, {1})' - result['blendv'] = f'vbsl{q_reg}_f{bits[data_type]}' + '({2}, {1}, {0})' + result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})' + result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})' + result['blendv'] = 
f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})' + result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' + result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' return result diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 9603d6d23..2a15ef74f 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -588,18 +588,17 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): return self.instruction_set['rsqrt'].format(self._print(expr.args[0])) else: return f"({self._print(1 / sp.sqrt(expr.args[0]))})" - elif isinstance(expr, vec_any): - expr_type = get_type_of_expression(expr.args[0]) - if type(expr_type) is not VectorType: - return self._print(expr.args[0]) - else: - return self.instruction_set['any'].format(self._print(expr.args[0])) - elif isinstance(expr, vec_all): + elif isinstance(expr, vec_any) or isinstance(expr, vec_all): + instr = 'any' if isinstance(expr, vec_any) else 'all' expr_type = get_type_of_expression(expr.args[0]) if type(expr_type) is not VectorType: return self._print(expr.args[0]) else: - return self.instruction_set['all'].format(self._print(expr.args[0])) + if isinstance(expr.args[0], sp.Rel): + op = expr.args[0].rel_op + if (instr, op) in self.instruction_set: + return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args]) + return self.instruction_set[instr].format(self._print(expr.args[0])) return super(VectorizedCustomSympyPrinter, self)._print_Function(expr) diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py new file mode 100644 index 000000000..bb9e3e851 --- /dev/null +++ b/pystencils/backends/ppc_instruction_sets.py @@ -0,0 +1,101 @@ +def get_argument_string(function_shortcut): + args = function_shortcut[function_shortcut.index('[') + 1: -1] + arg_string = "(" + for arg in args.split(","): + arg = arg.strip() + if not arg: + continue + if arg in ('0', '1', '2', '3', '4', '5'): + arg_string += "{" + arg + "}," + else: + arg_string += arg + "," + arg_string = arg_string[:-1] + ")" + return arg_string + + +def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): + if instruction_set != 'vsx': + raise NotImplementedError(instruction_set) + + base_names = { + '+': 'add[0, 1]', + '-': 'sub[0, 1]', + '*': 'mul[0, 1]', + '/': 'div[0, 1]', + 'sqrt': 'sqrt[0]', + 'rsqrt': 'rsqrte[0]', # rsqrt is available too, but not on Clang + + 'loadU': 'xl[0x0, 0]', + 'loadA': 'ld[0x0, 0]', + 'storeU': 'xst[1, 0x0, 0]', + 'storeA': 'st[1, 0x0, 0]', + 'stream': 'stl[1, 0x0, 0]', + + 'abs': 'abs[0]', + '==': 'cmpeq[0, 1]', + '!=': 'cmpne[0, 1]', + '<=': 'cmple[0, 1]', + '<': 'cmplt[0, 1]', + '>=': 'cmpge[0, 1]', + '>': 'cmpgt[0, 1]', + '&': 'and[0, 1]', + '|': 'or[0, 1]', + 'blendv': 'sel[0, 1, 2]', + + ('any', '=='): 'any_eq[0, 1]', + ('any', '!='): 'any_ne[0, 1]', + ('any', '<='): 'any_le[0, 1]', + ('any', '<'): 'any_lt[0, 1]', + ('any', '>='): 'any_ge[0, 1]', + ('any', '>'): 'any_gt[0, 1]', + ('all', '=='): 'all_eq[0, 1]', + ('all', '!='): 'all_ne[0, 1]', + ('all', '<='): 'all_le[0, 1]', + ('all', '<'): 'all_lt[0, 1]', + ('all', '>='): 'all_ge[0, 1]', + ('all', '>'): 'all_gt[0, 1]', + } + + bits = {'double': 64, + 'float': 32, + 'int': 32} + + width = 128 // bits[data_type] + intwidth = 128 // bits['int'] + + result = dict() + + for intrinsic_id, function_shortcut in base_names.items(): + function_shortcut = function_shortcut.strip() + name = 
function_shortcut[:function_shortcut.index('[')]
+
+        arg_string = get_argument_string(function_shortcut)
+
+        result[intrinsic_id] = 'vec_' + name + arg_string
+
+    if data_type == 'double':
+        # Clang and XL C++ are missing these for doubles
+        result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
+        result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
+        result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
+
+    result['+int'] = "vec_add({0}, {1})"
+
+    result['width'] = width
+    result['intwidth'] = intwidth
+    result[data_type] = f'__vector {data_type}'
+    result['int'] = '__vector int'
+    result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
+    result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
+
+    result['makeVecConst'] = '((' + result[data_type] + '){{' + \
+        ", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
+    result['makeVec'] = '((' + result[data_type] + '){{' + \
+        ", ".join(['(' + data_type + ') {' + str(i) + '}' for i in range(width)]) + '}})'
+    result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
+    result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
+
+    result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
+    result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
+
+    return result
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 3bcbaee1b..850f8ff6d 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -2,19 +2,40 @@ import platform
 
 from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
+from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 
 
-def get_vector_instruction_set(data_type='double', instruction_set='avx', q_registers=True):
+def get_vector_instruction_set(data_type='double', instruction_set='avx'):
     if instruction_set in ['neon', 'sve']:
-        return get_vector_instruction_set_arm(data_type, instruction_set, q_registers)
+        return get_vector_instruction_set_arm(data_type, instruction_set)
+    elif instruction_set in ['vsx']:
+        return get_vector_instruction_set_ppc(data_type, instruction_set)
     else:
         return get_vector_instruction_set_x86(data_type, instruction_set)
 
 
+_cache = None
+
+
 def get_supported_instruction_sets():
     """List of supported instruction sets on current hardware, or None if query failed."""
+    global _cache
+    if _cache is not None:
+        return _cache.copy()
     if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
         return ['neon']
+    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
+        import subprocess
+        import tempfile
+        from pystencils.cpu.cpujit import get_compiler_config
+        f = tempfile.NamedTemporaryFile(suffix='.cpp')
+        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
+        macros = subprocess.check_output(command, input='', text=True)
+        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
+            _cache = ['vsx']
+        else:
+            _cache = []
+        return _cache.copy()
     try:
         from cpuinfo import get_cpu_info
     except ImportError:
diff --git a/pystencils/backends/x86_instruction_sets.py 
b/pystencils/backends/x86_instruction_sets.py index 57164d678..836ffc579 100644 --- a/pystencils/backends/x86_instruction_sets.py +++ b/pystencils/backends/x86_instruction_sets.py @@ -137,11 +137,11 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): result['double'] = f"__m{bit_width}d" result['float'] = f"__m{bit_width}" result['int'] = f"__m{bit_width}i" - result['bool'] = f"__m{bit_width}d" + result['bool'] = result[data_type] result['headers'] = headers[instruction_set] result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0" - result['all'] = f"{pre}_movemask_{suf}({{0}}) == 0xF" + result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}" if instruction_set == 'avx512': size = 8 if data_type == 'double' else 16 diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index 4ac13a065..6f2c9a5b0 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -155,6 +155,9 @@ def read_config(): ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'), ('restrict_qualifier', '__restrict__') ]) + if platform.machine().startswith('ppc64'): + default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', + '-mcpu=native') elif platform.system().lower() == 'windows': default_compiler_config = OrderedDict([ ('os', 'windows'), diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h index 4206e37f6..00e1dffee 100644 --- a/pystencils/include/aesni_rand.h +++ b/pystencils/include/aesni_rand.h @@ -21,6 +21,36 @@ typedef std::uint32_t uint32; typedef std::uint64_t uint64; +template <typename T, std::size_t Alignment> +class AlignedAllocator +{ +public: + typedef T value_type; + + template <typename U> + struct rebind { + typedef AlignedAllocator<U, Alignment> other; + }; + + T * allocate(const std::size_t n) const { + if (n == 0) { + return nullptr; + } + void * const p = _mm_malloc(n*sizeof(T), Alignment); + if (p == nullptr) { + throw std::bad_alloc(); + } + return static_cast<T *>(p); + } + + void deallocate(T * const p, const std::size_t n) const { + _mm_free(p); + } +}; + +template <typename Key, typename T> +using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>; + #if defined(__AES__) || defined(_MSC_VER) QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) { __m128i temp3; @@ -85,10 +115,10 @@ QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) { } QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) { - std::array<uint32,4> a; - _mm_storeu_si128((__m128i*) a.data(), k128); + alignas(16) std::array<uint32,4> a; + _mm_store_si128((__m128i*) a.data(), k128); - static std::map<std::array<uint32,4>, std::array<__m128i,11>> roundkeys; + static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys; if(roundkeys.find(a) == roundkeys.end()) { auto rk = aesni_keygen(k128); @@ -308,10 +338,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr #ifdef __AVX2__ QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) { - std::array<uint32,8> a; - _mm256_storeu_si256((__m256i*) a.data(), k256); + alignas(32) std::array<uint32,8> a; + _mm256_store_si256((__m256i*) a.data(), k256); - static std::map<std::array<uint32,8>, std::array<__m256i,11>> roundkeys; + static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys; if(roundkeys.find(a) == roundkeys.end()) { auto rk1 = 
aesni_keygen(_mm256_extractf128_si256(k256, 0)); @@ -523,10 +553,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr #ifdef __AVX512F__ QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) { - std::array<uint32,16> a; - _mm512_storeu_si512((__m512i*) a.data(), k512); + alignas(64) std::array<uint32,16> a; + _mm512_store_si512((__m512i*) a.data(), k512); - static std::map<std::array<uint32,16>, std::array<__m512i,11>> roundkeys; + static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys; if(roundkeys.find(a) == roundkeys.end()) { auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0)); @@ -553,7 +583,7 @@ QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) { x = _mm512_aesenc_epi128(x, k[7]); x = _mm512_aesenc_epi128(x, k[8]); x = _mm512_aesenc_epi128(x, k[9]); - x = _mm512_aesenclast_epi128(x[10], k); + x = _mm512_aesenclast_epi128(x, k[10]); #else __m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0)); __m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1)); diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index 71da35715..4d81d43e4 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -16,6 +16,17 @@ #include <arm_neon.h> #endif +#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__) +#include <ppu_intrinsics.h> +#endif +#ifdef __ALTIVEC__ +#include <altivec.h> +#undef bool +#ifndef _ARCH_PWR8 +#include <pveclib/vec_int64_ppc.h> +#endif +#endif + #ifndef __CUDA_ARCH__ #define QUALIFIERS inline #include "myintrin.h" @@ -38,9 +49,14 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) { #ifndef __CUDA_ARCH__ // host code +#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__)) + *hip = __mulhwu(a,b); + return a*b; +#else uint64 product = ((uint64)a) * ((uint64)b); *hip = product >> 32; return (uint32)product; +#endif #else // device code *hip = __umulhi(a,b); @@ -281,6 +297,209 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct } #endif + +#ifdef __ALTIVEC__ +QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key) +{ +#ifndef _ARCH_PWR8 + __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1)); +#elif defined(_ARCH_PWR10) + __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1)); +#else + __vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0)); + __vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1)); + __vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1)); + +#ifdef __LITTLE_ENDIAN__ + __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b); + __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b); + __vector 
unsigned int hi0 = vec_mergeo(lohi0a, lohi0b); + __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b); +#else + __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b); + __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b); + __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b); + __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b); +#endif +#endif + + ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]); + ctr[1] = lo1; + ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]); + ctr[3] = lo0; +} + +QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key) +{ + key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0)); + key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1)); +} + +#ifdef __VSX__ +template<bool high> +QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y) +{ + // convert 32 to 64 bit +#ifdef __LITTLE_ENDIAN__ + if (high) + { + x = vec_mergel(x, vec_splats(0U)); + y = vec_mergel(y, vec_splats(0U)); + } + else + { + x = vec_mergeh(x, vec_splats(0U)); + y = vec_mergeh(y, vec_splats(0U)); + } +#else + if (high) + { + x = vec_mergel(vec_splats(0U), x); + y = vec_mergel(vec_splats(0U), y); + } + else + { + x = vec_mergeh(vec_splats(0U), x); + y = vec_mergeh(vec_splats(0U), y); + } +#endif + + // calculate z = x ^ y << (53 - 32)) +#ifdef _ARCH_PWR8 + __vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL)); +#else + __vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL)); +#endif + z = vec_xor((__vector unsigned long long) x, z); + + // convert uint64 to double +#ifdef __xlC__ + __vector double rs = vec_ctd(z, 0); +#else + __vector double rs = vec_ctf(z, 0); +#endif + // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0) + rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0)); + + return rs; +} +#endif + + +QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3, + uint32 key0, uint32 key1, + __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4) +{ + __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)}; + __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + _philox4x32round(ctr, key); // 1 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10 + + // convert uint32 to float + rnd1 = vec_ctf(ctr[0], 0); + rnd2 = vec_ctf(ctr[1], 0); + rnd3 = vec_ctf(ctr[2], 0); + rnd4 = vec_ctf(ctr[3], 0); + // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f) + rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f)); + rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f)); + rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f)); + rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f)); +} + + +#ifdef __VSX__ +QUALIFIERS void philox_double2(__vector unsigned int 
ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3, + uint32 key0, uint32 key1, + __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi) +{ + __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)}; + __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3}; + _philox4x32round(ctr, key); // 1 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9 + _philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10 + + rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]); + rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]); + rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]); + rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]); +} +#endif + +QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, + __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4) +{ + __vector unsigned int ctr0v = vec_splats(ctr0); + __vector unsigned int ctr2v = vec_splats(ctr2); + __vector unsigned int ctr3v = vec_splats(ctr3); + + philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); +} + +QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, + __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4) +{ + philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); +} + +#ifdef __VSX__ +QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, + __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi) +{ + __vector unsigned int ctr0v = vec_splats(ctr0); + __vector unsigned int ctr2v = vec_splats(ctr2); + __vector unsigned int ctr3v = vec_splats(ctr3); + + philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi); +} + +QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, + __vector double & rnd1, __vector double & rnd2) +{ + __vector unsigned int ctr0v = vec_splats(ctr0); + __vector unsigned int ctr2v = vec_splats(ctr2); + __vector unsigned int ctr3v = vec_splats(ctr3); + + __vector double ignore; + philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); +} + +QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3, + uint32 key0, uint32 key1, + __vector double & rnd1, __vector double & rnd2) +{ + philox_double2(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2); +} +#endif +#endif + + #if defined(__ARM_NEON) QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key) { diff --git a/pystencils/include/ppc_altivec_helpers.h b/pystencils/include/ppc_altivec_helpers.h new file mode 100644 index 000000000..d2edb437a --- /dev/null +++ b/pystencils/include/ppc_altivec_helpers.h @@ -0,0 +1,3 @@ +#include <altivec.h> +#undef vector +#undef bool diff --git 
a/pystencils/llvm/llvmjit.py b/pystencils/llvm/llvmjit.py index 0baec5452..2a9f698aa 100644 --- a/pystencils/llvm/llvmjit.py +++ b/pystencils/llvm/llvmjit.py @@ -141,9 +141,12 @@ class Jit(object): self._llvmmod = llvm.parse_assembly("") self.target = llvm.Target.from_default_triple() self.cpu = llvm.get_host_cpu_name() - self.cpu_features = llvm.get_host_cpu_features() - self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(), - opt=2) + try: + self.cpu_features = llvm.get_host_cpu_features() + self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(), + opt=2) + except RuntimeError: + self.target_machine = self.target.create_target_machine(cpu=self.cpu, opt=2) llvm.check_jit_execution() self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine) self.ee.finalize_object() diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py index 0ef55bdf4..59f6367cd 100644 --- a/pystencils_tests/test_conditional_vec.py +++ b/pystencils_tests/test_conditional_vec.py @@ -4,17 +4,19 @@ import pytest import pystencils as ps from pystencils.astnodes import Block, Conditional -from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets +from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.cpu.vectorization import vec_all, vec_any +supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else [] -@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') -@pytest.mark.skipif('neon' in get_supported_instruction_sets(), reason='ARM does not have collective instructions') -def test_vec_any(): - data_arr = np.zeros((15, 15)) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float', 'double')) +def test_vec_any(instruction_set, dtype): + width = get_vector_instruction_set(dtype, instruction_set)['width'] + data_arr = np.zeros((4*width, 4*width), dtype=np.float64 if dtype == 'double' else np.float32) - data_arr[3:9, 1] = 1.0 - data = ps.fields("data: double[2D]", data=data_arr) + data_arr[3:9, 1:3*width-1] = 1.0 + data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [ ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)), @@ -22,24 +24,21 @@ def test_vec_any(): ps.Assignment(data.center(), 2.0) ])) ] - instruction_set = get_supported_instruction_sets()[-1] ast = ps.create_kernel(c, target='cpu', cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) + np.testing.assert_equal(data_arr[3:9, :3*width], 2.0) - width = ast.instruction_set['width'] - np.testing.assert_equal(data_arr[3:9, 0:width], 2.0) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('dtype', ('float', 'double')) +def test_vec_all(instruction_set, dtype): + width = get_vector_instruction_set(dtype, instruction_set)['width'] + data_arr = np.zeros((4*width, 4*width), dtype=np.float64 if dtype == 'double' else np.float32) - -@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') -@pytest.mark.skipif('neon' in get_supported_instruction_sets(), reason='ARM does not have collective instructions') -def test_vec_all(): - data_arr = np.zeros((15, 15)) - - data_arr[3:9, 2:7] = 1.0 - data = ps.fields("data: double[2D]", 
data=data_arr) + data_arr[3:9, 1:3*width-1] = 1.0 + data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [ Conditional(vec_all(data.center() > 0.0), Block([ @@ -47,14 +46,17 @@ def test_vec_all(): ])) ] ast = ps.create_kernel(c, target='cpu', - cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]}) + cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() - before = data_arr.copy() kernel(data=data_arr) - np.testing.assert_equal(data_arr, before) + np.testing.assert_equal(data_arr[3:9, :1], 0.0) + np.testing.assert_equal(data_arr[3:9, 1:width], 1.0) + np.testing.assert_equal(data_arr[3:9, width:2*width], 2.0) + np.testing.assert_equal(data_arr[3:9, 2*width:3*width-1], 1.0) + np.testing.assert_equal(data_arr[3:9, 3*width-1:], 0.0) -@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') +@pytest.mark.skipif(not supported_instruction_sets, reason='cannot detect CPU instruction set') def test_boolean_before_loop(): t1, t2 = sp.symbols('t1, t2') f_arr = np.ones((10, 10)) @@ -66,7 +68,7 @@ def test_boolean_before_loop(): ps.Assignment(g[0, 0], sp.Piecewise((f[0, 0], t1), (42, True))) ] - ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]}) + ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': supported_instruction_sets[-1]}) kernel = ast.compile() kernel(f=f_arr, g=g_arr, t2=1.0) print(g) diff --git a/pystencils_tests/test_llvm.py b/pystencils_tests/test_llvm.py index 1337d3041..28b38fe98 100644 --- a/pystencils_tests/test_llvm.py +++ b/pystencils_tests/test_llvm.py @@ -38,6 +38,7 @@ def test_jacobi_fixed_field_size(): @pytest.mark.skipif(not get_llc_command(), reason="Tests requires llc in $PATH") def test_jacobi_fixed_field_size_gpu(): + pytest.importorskip("pycuda") size = (30, 20) import pycuda.autoinit # noqa diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py index f22aaeb7f..b5396c9fb 100644 --- a/pystencils_tests/test_random.py +++ b/pystencils_tests/test_random.py @@ -27,8 +27,8 @@ if get_compiler_config()['os'] == 'windows': def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None): if target == 'gpu': pytest.importorskip('pycuda') - if instruction_sets and 'neon' in instruction_sets and rng == 'aesni': - pytest.xfail('AES not yet implemented for ARM Neon') + if instruction_sets and set(['neon', 'vsx']).intersection(instruction_sets) and rng == 'aesni': + pytest.xfail('AES not yet implemented for this architecture') if rng == 'aesni' and len(keys) == 2: keys *= 2 if offset_values is None: @@ -118,8 +118,8 @@ def test_rng_offsets(kind, vectorized): @pytest.mark.parametrize('rng', ('philox', 'aesni')) @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double'))) def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None): - if target == 'neon' and rng == 'aesni': - pytest.xfail('AES not yet implemented for ARM Neon') + if target in ['neon', 'vsx'] and rng == 'aesni': + pytest.xfail('AES not yet implemented for this architecture') cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target} dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu') diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 40432f5b6..0889ab468 100644 --- 
a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -203,7 +203,7 @@ def test_logical_operators(): def test_hardware_query(): instruction_sets = get_supported_instruction_sets() - assert 'sse' in instruction_sets or 'neon' in instruction_sets + assert set(['sse', 'neon', 'vsx']).intersection(instruction_sets) def test_vectorised_pow(): diff --git a/pytest.ini b/pytest.ini index e7b0eeb98..db4823c05 100644 --- a/pytest.ini +++ b/pytest.ini @@ -41,7 +41,7 @@ exclude_lines = if __name__ == .__main__.: skip_covered = True -fail_under = 89 +fail_under = 88 [html] directory = coverage_report -- GitLab
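
Reviewer's usage sketch (illustration only, not part of the patch): the cbackend.py
hunk above makes the vectorized printer look up fused comparison/reduction
intrinsics under the key (instr, op) before falling back to the generic
'any'/'all' templates, which is what lets the new PPC backend emit vec_any_gt(a, b)
directly instead of reducing a separate comparison result. The helper below mirrors
that lookup; it assumes a pystencils checkout containing this patch, and the AVX
output in the last comment is indicative rather than verbatim.

    from pystencils.backends.simd_instruction_sets import get_vector_instruction_set

    def print_any(iset, op, lhs, rhs):
        # Prefer a fused template such as VSX's vec_any_gt, as the printer now does.
        if ('any', op) in iset:
            return iset[('any', op)].format(lhs, rhs)
        # Otherwise compose the comparison with the generic 'any' reduction.
        return iset['any'].format(iset[op].format(lhs, rhs))

    vsx = get_vector_instruction_set('double', 'vsx')
    print(print_any(vsx, '>', 'a', 'b'))   # -> vec_any_gt(a, b)

    avx = get_vector_instruction_set('double', 'avx')
    print(print_any(avx, '>', 'a', 'b'))   # movemask-based, e.g. _mm256_movemask_pd(...) > 0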