From 8440bc6e50264a3f69df0347a0cbbf213603f77d Mon Sep 17 00:00:00 2001
From: Michael Kuron <mkuron@icp.uni-stuttgart.de>
Date: Mon, 29 Mar 2021 20:31:21 +0000
Subject: [PATCH] Vectorization improvements

---
 pystencils/backends/arm_instruction_sets.py  |  32 ++-
 pystencils/backends/cbackend.py              |  15 +-
 pystencils/backends/ppc_instruction_sets.py  | 101 +++++++++
 pystencils/backends/simd_instruction_sets.py |  25 ++-
 pystencils/backends/x86_instruction_sets.py  |   4 +-
 pystencils/cpu/cpujit.py                     |   3 +
 pystencils/include/aesni_rand.h              |  50 ++++-
 pystencils/include/philox_rand.h             | 219 +++++++++++++++++++
 pystencils/include/ppc_altivec_helpers.h     |   3 +
 pystencils/llvm/llvmjit.py                   |   9 +-
 pystencils_tests/test_conditional_vec.py     |  48 ++--
 pystencils_tests/test_llvm.py                |   1 +
 pystencils_tests/test_random.py              |   8 +-
 pystencils_tests/test_vectorization.py       |   2 +-
 pytest.ini                                   |   2 +-
 15 files changed, 451 insertions(+), 71 deletions(-)
 create mode 100644 pystencils/backends/ppc_instruction_sets.py
 create mode 100644 pystencils/include/ppc_altivec_helpers.h

diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py
index 26f61e909..201451fb8 100644
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
@@ -13,7 +13,10 @@ def get_argument_string(function_shortcut):
     return arg_string
 
 
-def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True):
+def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
+    if instruction_set != 'neon':
+        raise NotImplementedError(instruction_set)
+
     base_names = {
         '+': 'add[0, 1]',
         '-': 'sub[0, 1]',
@@ -39,16 +42,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
             'float': 32,
             'int': 32}
 
-    if q_registers is True:
-        q_reg = 'q'
-        width = 128 // bits[data_type]
-        intwidth = 128 // bits['int']
-        suffix = f'q_f{bits[data_type]}'
-    else:
-        q_reg = ''
-        width = 64 // bits[data_type]
-        intwidth = 64 // bits['int']
-        suffix = f'_f{bits[data_type]}'
+    width = 128 // bits[data_type]
+    intwidth = 128 // bits['int']
+    suffix = f'q_f{bits[data_type]}'
 
     result = dict()
 
@@ -60,10 +56,10 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
 
         result[intrinsic_id] = 'v' + name + suffix + arg_string
 
-    result['makeVecConst'] = f'vdup{q_reg}_n_f{bits[data_type]}' + '({0})'
+    result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
     result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in range(width)]) + \
         ')'
-    result['makeVecConstInt'] = f'vdup{q_reg}_n_s{bits["int"]}' + '({0})'
+    result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
     result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
 
     result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
@@ -77,10 +73,12 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
     result['bool'] = f'uint{bits[data_type]}x{width}_t'
     result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
 
-    result['!='] = f'vmvn{q_reg}_u{bits[data_type]}({result["=="]})'
+    result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
 
-    result['&'] = f'vand{q_reg}_u{bits[data_type]}' + '({0}, {1})'
-    result['|'] = f'vorr{q_reg}_u{bits[data_type]}' + '({0}, {1})'
-    result['blendv'] = f'vbsl{q_reg}_f{bits[data_type]}' + '({2}, {1}, {0})'
+    result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
+    result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
+    result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
+    result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
+    result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
 
     return result
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 9603d6d23..2a15ef74f 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -588,18 +588,17 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
                     return self.instruction_set['rsqrt'].format(self._print(expr.args[0]))
                 else:
                     return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
-        elif isinstance(expr, vec_any):
-            expr_type = get_type_of_expression(expr.args[0])
-            if type(expr_type) is not VectorType:
-                return self._print(expr.args[0])
-            else:
-                return self.instruction_set['any'].format(self._print(expr.args[0]))
-        elif isinstance(expr, vec_all):
+        elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
+            instr = 'any' if isinstance(expr, vec_any) else 'all'
             expr_type = get_type_of_expression(expr.args[0])
             if type(expr_type) is not VectorType:
                 return self._print(expr.args[0])
             else:
-                return self.instruction_set['all'].format(self._print(expr.args[0]))
+                if isinstance(expr.args[0], sp.Rel):
+                    op = expr.args[0].rel_op
+                    if (instr, op) in self.instruction_set:
+                        return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args])
+                return self.instruction_set[instr].format(self._print(expr.args[0]))
 
         return super(VectorizedCustomSympyPrinter, self)._print_Function(expr)
 
diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py
new file mode 100644
index 000000000..bb9e3e851
--- /dev/null
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -0,0 +1,101 @@
+def get_argument_string(function_shortcut):
+    args = function_shortcut[function_shortcut.index('[') + 1: -1]
+    arg_string = "("
+    for arg in args.split(","):
+        arg = arg.strip()
+        if not arg:
+            continue
+        if arg in ('0', '1', '2', '3', '4', '5'):
+            arg_string += "{" + arg + "},"
+        else:
+            arg_string += arg + ","
+    arg_string = arg_string[:-1] + ")"
+    return arg_string
+
+
+def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
+    if instruction_set != 'vsx':
+        raise NotImplementedError(instruction_set)
+
+    base_names = {
+        '+': 'add[0, 1]',
+        '-': 'sub[0, 1]',
+        '*': 'mul[0, 1]',
+        '/': 'div[0, 1]',
+        'sqrt': 'sqrt[0]',
+        'rsqrt': 'rsqrte[0]',  # rsqrt is available too, but not on Clang
+
+        'loadU': 'xl[0x0, 0]',
+        'loadA': 'ld[0x0, 0]',
+        'storeU': 'xst[1, 0x0, 0]',
+        'storeA': 'st[1, 0x0, 0]',
+        'stream': 'stl[1, 0x0, 0]',
+
+        'abs': 'abs[0]',
+        '==': 'cmpeq[0, 1]',
+        '!=': 'cmpne[0, 1]',
+        '<=': 'cmple[0, 1]',
+        '<': 'cmplt[0, 1]',
+        '>=': 'cmpge[0, 1]',
+        '>': 'cmpgt[0, 1]',
+        '&': 'and[0, 1]',
+        '|': 'or[0, 1]',
+        'blendv': 'sel[0, 1, 2]',
+
+        ('any', '=='): 'any_eq[0, 1]',
+        ('any', '!='): 'any_ne[0, 1]',
+        ('any', '<='): 'any_le[0, 1]',
+        ('any', '<'): 'any_lt[0, 1]',
+        ('any', '>='): 'any_ge[0, 1]',
+        ('any', '>'): 'any_gt[0, 1]',
+        ('all', '=='): 'all_eq[0, 1]',
+        ('all', '!='): 'all_ne[0, 1]',
+        ('all', '<='): 'all_le[0, 1]',
+        ('all', '<'): 'all_lt[0, 1]',
+        ('all', '>='): 'all_ge[0, 1]',
+        ('all', '>'): 'all_gt[0, 1]',
+    }
+
+    bits = {'double': 64,
+            'float': 32,
+            'int': 32}
+
+    width = 128 // bits[data_type]
+    intwidth = 128 // bits['int']
+
+    result = dict()
+
+    for intrinsic_id, function_shortcut in base_names.items():
+        function_shortcut = function_shortcut.strip()
+        name = function_shortcut[:function_shortcut.index('[')]
+
+        arg_string = get_argument_string(function_shortcut)
+
+        result[intrinsic_id] = 'vec_' + name + arg_string
+
+    if data_type == 'double':
+        # Clang and XL C++ are missing these for doubles
+        result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
+        result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
+        result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
+
+    result['+int'] = "vec_add({0}, {1})"
+
+    result['width'] = width
+    result['intwidth'] = intwidth
+    result[data_type] = f'__vector {data_type}'
+    result['int'] = '__vector int'
+    result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
+    result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
+
+    result['makeVecConst'] = '((' + result[data_type] + '){{' + \
+        ", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
+    result['makeVec'] = '((' + result[data_type] + '){{' + \
+        ", ".join(['(' + data_type + ') {' + str(i) + '}' for i in range(width)]) + '}})'
+    result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
+    result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
+
+    result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
+    result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
+
+    return result
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 3bcbaee1b..850f8ff6d 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -2,19 +2,40 @@ import platform
 
 from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
+from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 
 
-def get_vector_instruction_set(data_type='double', instruction_set='avx', q_registers=True):
+def get_vector_instruction_set(data_type='double', instruction_set='avx'):
     if instruction_set in ['neon', 'sve']:
-        return get_vector_instruction_set_arm(data_type, instruction_set, q_registers)
+        return get_vector_instruction_set_arm(data_type, instruction_set)
+    elif instruction_set in ['vsx']:
+        return get_vector_instruction_set_ppc(data_type, instruction_set)
     else:
         return get_vector_instruction_set_x86(data_type, instruction_set)
 
 
+_cache = None
+
+
 def get_supported_instruction_sets():
     """List of supported instruction sets on current hardware, or None if query failed."""
+    global _cache
+    if _cache is not None:
+        return _cache.copy()
     if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
         return ['neon']
+    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
+        import subprocess
+        import tempfile
+        from pystencils.cpu.cpujit import get_compiler_config
+        f = tempfile.NamedTemporaryFile(suffix='.cpp')
+        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
+        macros = subprocess.check_output(command, input='', text=True)
+        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
+            _cache = ['vsx']
+        else:
+            _cache = []
+        return _cache.copy()
     try:
         from cpuinfo import get_cpu_info
     except ImportError:
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 57164d678..836ffc579 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -137,11 +137,11 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
     result['double'] = f"__m{bit_width}d"
     result['float'] = f"__m{bit_width}"
     result['int'] = f"__m{bit_width}i"
-    result['bool'] = f"__m{bit_width}d"
+    result['bool'] = result[data_type]
 
     result['headers'] = headers[instruction_set]
     result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
-    result['all'] = f"{pre}_movemask_{suf}({{0}}) == 0xF"
+    result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
 
     if instruction_set == 'avx512':
         size = 8 if data_type == 'double' else 16
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 4ac13a065..6f2c9a5b0 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -155,6 +155,9 @@ def read_config():
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
+        if platform.machine().startswith('ppc64'):
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
+                                                                                        '-mcpu=native')
     elif platform.system().lower() == 'windows':
         default_compiler_config = OrderedDict([
             ('os', 'windows'),
diff --git a/pystencils/include/aesni_rand.h b/pystencils/include/aesni_rand.h
index 4206e37f6..00e1dffee 100644
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
@@ -21,6 +21,36 @@
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 
+template <typename T, std::size_t Alignment>
+class AlignedAllocator
+{
+public:
+    typedef T value_type;
+
+    template <typename U>
+    struct rebind {
+        typedef AlignedAllocator<U, Alignment> other;
+    };
+
+    T * allocate(const std::size_t n) const {
+        if (n == 0) {
+            return nullptr;
+        }
+        void * const p = _mm_malloc(n*sizeof(T), Alignment);
+        if (p == nullptr) {
+            throw std::bad_alloc();
+        }
+        return static_cast<T *>(p);
+    }
+
+    void deallocate(T * const p, const std::size_t n) const {
+        _mm_free(p);
+    }
+};
+
+template <typename Key, typename T>
+using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>;
+
 #if defined(__AES__) || defined(_MSC_VER)
 QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
     __m128i temp3; 
@@ -85,10 +115,10 @@ QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
 }
 
 QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) {
-    std::array<uint32,4> a;
-    _mm_storeu_si128((__m128i*) a.data(), k128);
+    alignas(16) std::array<uint32,4> a;
+    _mm_store_si128((__m128i*) a.data(), k128);
     
-    static std::map<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
+    static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
     
     if(roundkeys.find(a) == roundkeys.end()) {
         auto rk = aesni_keygen(k128);
@@ -308,10 +338,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr
 
 #ifdef __AVX2__
 QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) {
-    std::array<uint32,8> a;
-    _mm256_storeu_si256((__m256i*) a.data(), k256);
+    alignas(32) std::array<uint32,8> a;
+    _mm256_store_si256((__m256i*) a.data(), k256);
     
-    static std::map<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
+    static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
     
     if(roundkeys.find(a) == roundkeys.end()) {
         auto rk1 = aesni_keygen(_mm256_extractf128_si256(k256, 0));
@@ -523,10 +553,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr
 
 #ifdef __AVX512F__
 QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
-    std::array<uint32,16> a;
-    _mm512_storeu_si512((__m512i*) a.data(), k512);
+    alignas(64) std::array<uint32,16> a;
+    _mm512_store_si512((__m512i*) a.data(), k512);
     
-    static std::map<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
+    static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
     
     if(roundkeys.find(a) == roundkeys.end()) {
         auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0));
@@ -553,7 +583,7 @@ QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
     x = _mm512_aesenc_epi128(x, k[7]);
     x = _mm512_aesenc_epi128(x, k[8]);
     x = _mm512_aesenc_epi128(x, k[9]);
-    x = _mm512_aesenclast_epi128(x[10], k);
+    x = _mm512_aesenclast_epi128(x, k[10]);
 #else
     __m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0));
     __m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1));
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 71da35715..4d81d43e4 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -16,6 +16,17 @@
 #include <arm_neon.h>
 #endif
 
+#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
+#include <ppu_intrinsics.h>
+#endif
+#ifdef __ALTIVEC__
+#include <altivec.h>
+#undef bool
+#ifndef _ARCH_PWR8
+#include <pveclib/vec_int64_ppc.h>
+#endif
+#endif
+
 #ifndef __CUDA_ARCH__
 #define QUALIFIERS inline
 #include "myintrin.h"
@@ -38,9 +49,14 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
 #ifndef __CUDA_ARCH__
     // host code
+#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
+    *hip = __mulhwu(a,b);
+    return a*b;
+#else
     uint64 product = ((uint64)a) * ((uint64)b);
     *hip = product >> 32;
     return (uint32)product;
+#endif
 #else
     // device code
     *hip = __umulhi(a,b);
@@ -281,6 +297,209 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
 }
 #endif
 
+
+#ifdef __ALTIVEC__
+QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
+{
+#ifndef _ARCH_PWR8
+    __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
+#elif defined(_ARCH_PWR10)
+    __vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
+#else
+    __vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
+    __vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
+    __vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
+
+#ifdef __LITTLE_ENDIAN__
+    __vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
+    __vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
+    __vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
+    __vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
+#else
+    __vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
+    __vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
+    __vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
+    __vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
+#endif
+#endif
+
+    ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
+    ctr[1] = lo1;
+    ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
+    ctr[3] = lo0;
+}
+
+QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key)
+{
+    key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
+    key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
+}
+
+#ifdef __VSX__
+template<bool high>
+QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y)
+{
+    // convert 32 to 64 bit
+#ifdef __LITTLE_ENDIAN__
+    if (high)
+    {
+        x = vec_mergel(x, vec_splats(0U));
+        y = vec_mergel(y, vec_splats(0U));
+    }
+    else
+    {
+        x = vec_mergeh(x, vec_splats(0U));
+        y = vec_mergeh(y, vec_splats(0U));
+    }
+#else
+    if (high)
+    {
+        x = vec_mergel(vec_splats(0U), x);
+        y = vec_mergel(vec_splats(0U), y);
+    }
+    else
+    {
+        x = vec_mergeh(vec_splats(0U), x);
+        y = vec_mergeh(vec_splats(0U), y);
+    }
+#endif
+
+    // calculate z = x ^ y << (53 - 32))
+#ifdef _ARCH_PWR8
+    __vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
+#else
+    __vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
+#endif
+    z = vec_xor((__vector unsigned long long) x, z);
+
+    // convert uint64 to double
+#ifdef __xlC__
+    __vector double rs = vec_ctd(z, 0);
+#else
+    __vector double rs = vec_ctf(z, 0);
+#endif
+    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
+    rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
+
+    return rs;
+}
+#endif
+
+
+QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
+                              uint32 key0, uint32 key1,
+                              __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
+{
+    __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
+    __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    _philox4x32round(ctr, key);                           // 1
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
+
+    // convert uint32 to float
+    rnd1 = vec_ctf(ctr[0], 0);
+    rnd2 = vec_ctf(ctr[1], 0);
+    rnd3 = vec_ctf(ctr[2], 0);
+    rnd4 = vec_ctf(ctr[3], 0);
+    // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
+    rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+    rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+    rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+    rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
+}
+
+
+#ifdef __VSX__
+QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
+{
+    __vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
+    __vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    _philox4x32round(ctr, key);                           // 1
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
+    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
+
+    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
+    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
+    rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
+    rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
+}
+#endif
+
+QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1,
+                              __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
+{
+    __vector unsigned int ctr0v = vec_splats(ctr0);
+    __vector unsigned int ctr2v = vec_splats(ctr2);
+    __vector unsigned int ctr3v = vec_splats(ctr3);
+
+    philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
+}
+
+QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
+                              uint32 key0, uint32 key1,
+                              __vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
+{
+    philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
+}
+
+#ifdef __VSX__
+QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
+{
+    __vector unsigned int ctr0v = vec_splats(ctr0);
+    __vector unsigned int ctr2v = vec_splats(ctr2);
+    __vector unsigned int ctr3v = vec_splats(ctr3);
+
+    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
+}
+
+QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1, __vector double & rnd2)
+{
+    __vector unsigned int ctr0v = vec_splats(ctr0);
+    __vector unsigned int ctr2v = vec_splats(ctr2);
+    __vector unsigned int ctr3v = vec_splats(ctr3);
+
+    __vector double ignore;
+    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
+}
+
+QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
+                               uint32 key0, uint32 key1,
+                               __vector double & rnd1, __vector double & rnd2)
+{
+    philox_double2(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
+}
+#endif
+#endif
+
+
 #if defined(__ARM_NEON)
 QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key)
 {
diff --git a/pystencils/include/ppc_altivec_helpers.h b/pystencils/include/ppc_altivec_helpers.h
new file mode 100644
index 000000000..d2edb437a
--- /dev/null
+++ b/pystencils/include/ppc_altivec_helpers.h
@@ -0,0 +1,3 @@
+#include <altivec.h>
+#undef vector
+#undef bool
diff --git a/pystencils/llvm/llvmjit.py b/pystencils/llvm/llvmjit.py
index 0baec5452..2a9f698aa 100644
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
@@ -141,9 +141,12 @@ class Jit(object):
         self._llvmmod = llvm.parse_assembly("")
         self.target = llvm.Target.from_default_triple()
         self.cpu = llvm.get_host_cpu_name()
-        self.cpu_features = llvm.get_host_cpu_features()
-        self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
-                                                                opt=2)
+        try:
+            self.cpu_features = llvm.get_host_cpu_features()
+            self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
+                                                                    opt=2)
+        except RuntimeError:
+            self.target_machine = self.target.create_target_machine(cpu=self.cpu, opt=2)
         llvm.check_jit_execution()
         self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
         self.ee.finalize_object()
diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py
index 0ef55bdf4..59f6367cd 100644
--- a/pystencils_tests/test_conditional_vec.py
+++ b/pystencils_tests/test_conditional_vec.py
@@ -4,17 +4,19 @@ import pytest
 
 import pystencils as ps
 from pystencils.astnodes import Block, Conditional
-from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.cpu.vectorization import vec_all, vec_any
 
+supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else []
 
-@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
-@pytest.mark.skipif('neon' in get_supported_instruction_sets(), reason='ARM does not have collective instructions')
-def test_vec_any():
-    data_arr = np.zeros((15, 15))
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float', 'double'))
+def test_vec_any(instruction_set, dtype):
+    width = get_vector_instruction_set(dtype, instruction_set)['width']
+    data_arr = np.zeros((4*width, 4*width), dtype=np.float64 if dtype == 'double' else np.float32)
 
-    data_arr[3:9, 1] = 1.0
-    data = ps.fields("data: double[2D]", data=data_arr)
+    data_arr[3:9, 1:3*width-1] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
 
     c = [
         ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)),
@@ -22,24 +24,21 @@ def test_vec_any():
             ps.Assignment(data.center(), 2.0)
         ]))
     ]
-    instruction_set = get_supported_instruction_sets()[-1]
     ast = ps.create_kernel(c, target='cpu',
                            cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[3:9, :3*width], 2.0)
 
-    width = ast.instruction_set['width']
 
-    np.testing.assert_equal(data_arr[3:9, 0:width], 2.0)
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float', 'double'))
+def test_vec_all(instruction_set, dtype):
+    width = get_vector_instruction_set(dtype, instruction_set)['width']
+    data_arr = np.zeros((4*width, 4*width), dtype=np.float64 if dtype == 'double' else np.float32)
 
-
-@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
-@pytest.mark.skipif('neon' in get_supported_instruction_sets(), reason='ARM does not have collective instructions')
-def test_vec_all():
-    data_arr = np.zeros((15, 15))
-
-    data_arr[3:9, 2:7] = 1.0
-    data = ps.fields("data: double[2D]", data=data_arr)
+    data_arr[3:9, 1:3*width-1] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)
 
     c = [
         Conditional(vec_all(data.center() > 0.0), Block([
@@ -47,14 +46,17 @@ def test_vec_all():
         ]))
     ]
     ast = ps.create_kernel(c, target='cpu',
-                           cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]})
+                           cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
-    before = data_arr.copy()
     kernel(data=data_arr)
-    np.testing.assert_equal(data_arr, before)
+    np.testing.assert_equal(data_arr[3:9, :1], 0.0)
+    np.testing.assert_equal(data_arr[3:9, 1:width], 1.0)
+    np.testing.assert_equal(data_arr[3:9, width:2*width], 2.0)
+    np.testing.assert_equal(data_arr[3:9, 2*width:3*width-1], 1.0)
+    np.testing.assert_equal(data_arr[3:9, 3*width-1:], 0.0)
 
 
-@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
+@pytest.mark.skipif(not supported_instruction_sets, reason='cannot detect CPU instruction set')
 def test_boolean_before_loop():
     t1, t2 = sp.symbols('t1, t2')
     f_arr = np.ones((10, 10))
@@ -66,7 +68,7 @@ def test_boolean_before_loop():
         ps.Assignment(g[0, 0],
                       sp.Piecewise((f[0, 0], t1), (42, True)))
     ]
-    ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]})
+    ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': supported_instruction_sets[-1]})
     kernel = ast.compile()
     kernel(f=f_arr, g=g_arr, t2=1.0)
     print(g)
diff --git a/pystencils_tests/test_llvm.py b/pystencils_tests/test_llvm.py
index 1337d3041..28b38fe98 100644
--- a/pystencils_tests/test_llvm.py
+++ b/pystencils_tests/test_llvm.py
@@ -38,6 +38,7 @@ def test_jacobi_fixed_field_size():
 
 @pytest.mark.skipif(not get_llc_command(), reason="Tests requires llc in $PATH")
 def test_jacobi_fixed_field_size_gpu():
+    pytest.importorskip("pycuda")
     size = (30, 20)
 
     import pycuda.autoinit  # noqa
diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py
index f22aaeb7f..b5396c9fb 100644
--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -27,8 +27,8 @@ if get_compiler_config()['os'] == 'windows':
 def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
     if target == 'gpu':
         pytest.importorskip('pycuda')
-    if instruction_sets and 'neon' in instruction_sets and rng == 'aesni':
-        pytest.xfail('AES not yet implemented for ARM Neon')
+    if instruction_sets and set(['neon', 'vsx']).intersection(instruction_sets) and rng == 'aesni':
+        pytest.xfail('AES not yet implemented for this architecture')
     if rng == 'aesni' and len(keys) == 2:
         keys *= 2
     if offset_values is None:
@@ -118,8 +118,8 @@ def test_rng_offsets(kind, vectorized):
 @pytest.mark.parametrize('rng', ('philox', 'aesni'))
 @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
 def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
-    if target == 'neon' and rng == 'aesni':
-        pytest.xfail('AES not yet implemented for ARM Neon')
+    if target in ['neon', 'vsx'] and rng == 'aesni':
+        pytest.xfail('AES not yet implemented for this architecture')
     cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
 
     dh = ps.create_data_handling((17, 17), default_ghost_layers=0, default_target='cpu')
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 40432f5b6..0889ab468 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -203,7 +203,7 @@ def test_logical_operators():
 
 def test_hardware_query():
     instruction_sets = get_supported_instruction_sets()
-    assert 'sse' in instruction_sets or 'neon' in instruction_sets
+    assert set(['sse', 'neon', 'vsx']).intersection(instruction_sets)
 
 
 def test_vectorised_pow():
diff --git a/pytest.ini b/pytest.ini
index e7b0eeb98..db4823c05 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -41,7 +41,7 @@ exclude_lines =
        if __name__ == .__main__.:
 
 skip_covered = True
-fail_under = 89
+fail_under = 88
 
 [html]
 directory = coverage_report
-- 
GitLab