diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9ae978824d56fcf19fb2c2a216cc170a77e1d0fb..2ed64515b35972ad865a5060b58346ab9ea30139 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -151,12 +151,11 @@ ubuntu:
       cobertura: coverage.xml
       junit: report.xml
 
-arm64:
+arm64v8:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
   variables:
     PYSTENCILS_SIMD: "neon"
-    QEMU_CPU: "cortex-a72"
   before_script:
     - *multiarch_before_script
     - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
@@ -170,6 +169,23 @@ ppc64le:
     - *multiarch_before_script
     - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
 
+arm64v9:
+  # Compiler support for SVE is still pretty rough: GCC 10+11 produce incorrect code for fixed-width vectors,
+  # while Clang 12 produces memory-corrupting heisenbugs unless we enable the address sanitizer.
+  # In the RNG tests, GCC 10+11 produce an internal compiler error.
+  # The memory corruption seems to only happen with qemu-user, not with qemu-system.
+  # Once the compilers and QEMU have improved, this job should be cleaned up to match the others.
+  extends: .multiarch_template
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
+  variables:
+    PYSTENCILS_SIMD: "sve256,sve512"
+    ASAN_OPTIONS: detect_leaks=0
+    LD_PRELOAD: /usr/lib/aarch64-linux-gnu/libasan.so.6
+  before_script:
+    - *multiarch_before_script
+    - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
+    - sed -i s/g\+\+/clang++/g ~/.config/pystencils/config.json
+
 minimal-conda:
   stage: test
   except:
diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index f9044d3cb994ed81fef149073d2a48ef3e66da9f..8690546216b8da10c3acabc3fad485bba6ada967 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -842,47 +842,3 @@ class ConditionalFieldAccess(sp.Function):
 
     def __getnewargs_ex__(self):
         return (self.access, self.outofbounds_condition, self.outofbounds_value), {}
-
-
-class NontemporalFence(Node):
-    def __init__(self):
-        super(NontemporalFence, self).__init__(parent=None)
-
-    @property
-    def symbols_defined(self):
-        return set()
-
-    @property
-    def undefined_symbols(self):
-        return set()
-
-    @property
-    def args(self):
-        return []
-
-    def __eq__(self, other):
-        return isinstance(other, NontemporalFence)
-
-
-class CachelineSize(Node):
-    symbol = sp.Symbol("_clsize")
-    mask_symbol = sp.Symbol("_clsize_mask")
-    last_symbol = sp.Symbol("_cl_lastvec")
-    
-    def __init__(self):
-        super(CachelineSize, self).__init__(parent=None)
-
-    @property
-    def symbols_defined(self):
-        return set([self.symbol, self.mask_symbol, self.last_symbol])
-
-    @property
-    def undefined_symbols(self):
-        return set()
-
-    @property
-    def args(self):
-        return []
-
-    def __eq__(self, other):
-        return isinstance(other, CachelineSize)
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index d11723c1d0259439db2d6f2878558b91cbbceade..8b0b13aa7a863f7dedddda7270aaebebb8cac1e2 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -8,8 +8,8 @@ import sympy as sp
 from sympy.core import S
 from sympy.logic.boolalg import BooleanFalse, BooleanTrue
 
-from pystencils.astnodes import KernelFunction, Node, CachelineSize
-from pystencils.cpu.vectorization import vec_all, vec_any
+from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
+from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
 from pystencils.data_types import (
     PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
     reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol)
@@ -293,7 +293,14 @@ class CBackend:
 
                 pre_code = ''
                 if nontemporal and 'cachelineZero' in self._vector_instruction_set:
-                    pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "{\n\t" + \
+                    first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
+                    offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
+                                      * node.lhs.args[0].field.spatial_strides[i] for i in
+                                      range(len(node.lhs.args[0].field.spatial_strides))])
+                    size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
+                    element_size = 8 if data_type.base_type.base_name == 'double' else 4
+                    size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
+                    pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
                         self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n}\n'
 
                 code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 0b982814ad2e9b9379b71fface4f361d49696f65..4fe147821dabc8bc62bc2661afe2445ed49fbd77 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -92,12 +92,13 @@ def get_cacheline_size(instruction_set):
     
     import pystencils as ps
     import numpy as np
+    from pystencils.cpu.vectorization import CachelineSize
     
     arr = np.zeros((1, 1), dtype=np.float32)
     f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
-    ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)]
+    ass = [CachelineSize(), ps.Assignment(f.center, CachelineSize.symbol)]
     ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
-    kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0})
+    kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
     _cachelinesize = int(arr[0, 0])
     return _cachelinesize
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index 16f0a15633af0299bf76d0c379bda728d1600e2f..6ab821f4eb7735ed37fd5910cc33e6a676f3a9e9 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -26,6 +26,53 @@ class vec_all(sp.Function):
     nargs = (1,)
 
 
+class NontemporalFence(ast.Node):
+    def __init__(self):
+        super(NontemporalFence, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set()
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, NontemporalFence)
+
+
+class CachelineSize(ast.Node):
+    symbol = sp.Symbol("_clsize")
+    mask_symbol = sp.Symbol("_clsize_mask")
+    last_symbol = sp.Symbol("_cl_lastvec")
+    
+    def __init__(self):
+        super(CachelineSize, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set([self.symbol, self.mask_symbol, self.last_symbol])
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, CachelineSize)
+
+    def __hash__(self):
+        return hash(self.symbol)
+
+
 def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
               assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
               assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
@@ -156,9 +203,9 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
                     parent = loop_node.parent
                     while type(parent.parent.parent) is not ast.KernelFunction:
                         parent = parent.parent
-                    parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
+                    parent.parent.insert_after(NontemporalFence(), parent, if_not_exists=True)
                     # insert CachelineSize at the beginning of the kernel
-                    parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True)
+                    parent.parent.insert_front(CachelineSize(), if_not_exists=True)
         if not successful:
             warnings.warn("Could not vectorize loop because of non-consecutive memory access")
             continue
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 7684a4507f3fc0a532beb15632fb48f871640f21..84f0ba91edab6722847bf333d97e787ee07b6ce0 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -15,13 +15,8 @@
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
+#ifdef __ARM_FEATURE_SVE
 #include <arm_sve.h>
-typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-typedef svint32_t svint32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-typedef svuint32_t svuint32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-typedef svuint64_t svuint64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
 #endif
 
 #if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
@@ -52,6 +47,14 @@ typedef svuint64_t svuint64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
+typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
+#elif defined(__ARM_FEATURE_SVE)
+typedef svfloat32_t svfloat32_st;
+typedef svfloat64_t svfloat64_st;
+#endif
+
 
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
@@ -664,28 +667,28 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
 #endif
 
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
-QUALIFIERS void _philox4x32round(svuint32_st* ctr, svuint32_st* key)
+#if defined(__ARM_FEATURE_SVE)
+QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
 {
-    svuint32_st lo0 = svmul_u32_x(svptrue_b32(), ctr[0], svdup_u32(PHILOX_M4x32_0));
-    svuint32_st lo1 = svmul_u32_x(svptrue_b32(), ctr[2], svdup_u32(PHILOX_M4x32_1));
-    svuint32_st hi0 = svmulh_u32_x(svptrue_b32(), ctr[0], svdup_u32(PHILOX_M4x32_0));
-    svuint32_st hi1 = svmulh_u32_x(svptrue_b32(), ctr[2], svdup_u32(PHILOX_M4x32_1));
-
-    ctr[0] = sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, ctr[1]), key[0]);
-    ctr[1] = lo1;
-    ctr[2] = sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi0, ctr[3]), key[1]);
-    ctr[3] = lo0;
+    svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
+    svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
+    svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
+    svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
+
+    ctr = svset4_u32(ctr, 0, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)), svget2_u32(key, 0)));
+    ctr = svset4_u32(ctr, 1, lo1);
+    ctr = svset4_u32(ctr, 2, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)), svget2_u32(key, 1)));
+    ctr = svset4_u32(ctr, 3, lo0);
 }
 
-QUALIFIERS void _philox4x32bumpkey(svuint32_st* key)
+QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
 {
-    key[0] = svadd_u32_x(svptrue_b32(), key[0], svdup_u32(PHILOX_W32_0));
-    key[1] = svadd_u32_x(svptrue_b32(), key[1], svdup_u32(PHILOX_W32_1));
+    key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
+    key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
 }
 
 template<bool high>
-QUALIFIERS svfloat64_st _uniform_double_hq(svuint32_st x, svuint32_st y)
+QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
 {
     // convert 32 to 64 bit
     if (high)
@@ -700,11 +703,11 @@ QUALIFIERS svfloat64_st _uniform_double_hq(svuint32_st x, svuint32_st y)
     }
 
     // calculate z = x ^ y << (53 - 32))
-    svuint64_st z = svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
+    svuint64_t z = svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
     z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);
 
     // convert uint64 to double
-    svfloat64_st rs = svcvt_f64_u64_x(svptrue_b64(), z);
+    svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
     // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
     rs = svmad_f64_x(svptrue_b64(), rs, svdup_f64(TWOPOW53_INV_DOUBLE), svdup_f64(TWOPOW53_INV_DOUBLE/2.0));
 
@@ -712,12 +715,12 @@ QUALIFIERS svfloat64_st _uniform_double_hq(svuint32_st x, svuint32_st y)
 }
 
 
-QUALIFIERS void philox_float4(svuint32_st ctr0, svuint32_st ctr1, svuint32_st ctr2, svuint32_st ctr3,
+QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
                               uint32 key0, uint32 key1,
                               svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
-    svuint32_st key[2] = {svdup_u32(key0), svdup_u32(key1)};
-    svuint32_st ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
+    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
     _philox4x32round(ctr, key);                           // 1
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
@@ -730,10 +733,10 @@ QUALIFIERS void philox_float4(svuint32_st ctr0, svuint32_st ctr1, svuint32_st ct
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
 
     // convert uint32 to float
-    rnd1 = svcvt_f32_u32_x(svptrue_b32(), ctr[0]);
-    rnd2 = svcvt_f32_u32_x(svptrue_b32(), ctr[1]);
-    rnd3 = svcvt_f32_u32_x(svptrue_b32(), ctr[2]);
-    rnd4 = svcvt_f32_u32_x(svptrue_b32(), ctr[3]);
+    rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
+    rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
+    rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
+    rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
     // calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
     rnd1 = svmad_f32_x(svptrue_b32(), rnd1, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
     rnd2 = svmad_f32_x(svptrue_b32(), rnd2, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
@@ -742,12 +745,12 @@ QUALIFIERS void philox_float4(svuint32_st ctr0, svuint32_st ctr1, svuint32_st ct
 }
 
 
-QUALIFIERS void philox_double2(svuint32_st ctr0, svuint32_st ctr1, svuint32_st ctr2, svuint32_st ctr3,
+QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
                                uint32 key0, uint32 key1,
                                svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
 {
-    svuint32_st key[2] = {svdup_u32(key0), svdup_u32(key1)};
-    svuint32_st ctr[4] = {ctr0, ctr1, ctr2, ctr3};
+    svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
+    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
     _philox4x32round(ctr, key);                           // 1
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
@@ -759,54 +762,54 @@ QUALIFIERS void philox_double2(svuint32_st ctr0, svuint32_st ctr1, svuint32_st c
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
     _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
 
-    rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
-    rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
-    rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
-    rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
+    rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
+    rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
+    rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
+    rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
 }
 
-QUALIFIERS void philox_float4(uint32 ctr0, svuint32_st ctr1, uint32 ctr2, uint32 ctr3,
+QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
-    svuint32_st ctr0v = svdup_u32(ctr0);
-    svuint32_st ctr2v = svdup_u32(ctr2);
-    svuint32_st ctr3v = svdup_u32(ctr3);
+    svuint32_t ctr0v = svdup_u32(ctr0);
+    svuint32_t ctr2v = svdup_u32(ctr2);
+    svuint32_t ctr3v = svdup_u32(ctr3);
 
     philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
 
-QUALIFIERS void philox_float4(uint32 ctr0, svint32_st ctr1, uint32 ctr2, uint32 ctr3,
+QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
     philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
 
-QUALIFIERS void philox_double2(uint32 ctr0, svuint32_st ctr1, uint32 ctr2, uint32 ctr3,
+QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
                                svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
 {
-    svuint32_st ctr0v = svdup_u32(ctr0);
-    svuint32_st ctr2v = svdup_u32(ctr2);
-    svuint32_st ctr3v = svdup_u32(ctr3);
+    svuint32_t ctr0v = svdup_u32(ctr0);
+    svuint32_t ctr2v = svdup_u32(ctr2);
+    svuint32_t ctr3v = svdup_u32(ctr3);
 
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
 }
 
-QUALIFIERS void philox_double2(uint32 ctr0, svuint32_st ctr1, uint32 ctr2, uint32 ctr3,
+QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
                                svfloat64_st & rnd1, svfloat64_st & rnd2)
 {
-    svuint32_st ctr0v = svdup_u32(ctr0);
-    svuint32_st ctr2v = svdup_u32(ctr2);
-    svuint32_st ctr3v = svdup_u32(ctr3);
+    svuint32_t ctr0v = svdup_u32(ctr0);
+    svuint32_t ctr2v = svdup_u32(ctr2);
+    svuint32_t ctr3v = svdup_u32(ctr3);
 
     svfloat64_st ignore;
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }
 
-QUALIFIERS void philox_double2(uint32 ctr0, svint32_st ctr1, uint32 ctr2, uint32 ctr3,
+QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
                                svfloat64_st & rnd1, svfloat64_st & rnd2)
 {
diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py
index f579b4e4615914f77de646e89d563281cd4f18c2..16780f1470992764316361b982e52125def7f756 100644
--- a/pystencils_tests/test_vectorization_specific.py
+++ b/pystencils_tests/test_vectorization_specific.py
@@ -117,7 +117,7 @@ def test_cacheline_size(instruction_set):
 
 # test_vectorization is not parametrized because it is supposed to run without pytest, so we parametrize it here
 from pystencils_tests import test_vectorization
-@pytest.mark.parametrize('instruction_set', set(supported_instruction_sets) - set([test_vectorization.instruction_set]))
+@pytest.mark.parametrize('instruction_set', sorted(set(supported_instruction_sets) - set([test_vectorization.instruction_set])))
 @pytest.mark.parametrize('function', [f for f in test_vectorization.__dict__ if f.startswith('test_') and f != 'test_hardware_query'])
 def test_vectorization_other(instruction_set, function):
     test_vectorization.__dict__[function](instruction_set)
diff --git a/pytest.ini b/pytest.ini
index 500485359e9b50696d429cdd7e879e2661d5c29e..039d41b593e3ccf0a57deecdf44f7aeaf590d46a 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -43,7 +43,7 @@ exclude_lines =
        if __name__ == .__main__.:
 
 skip_covered = True
-fail_under = 88
+fail_under = 87
 
 [html]
 directory = coverage_report