diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f333e761d5568ba256c3d5c48a2c6dfd6f4660b5..f338f2740a72d98e046a0b14c300e3d74688f839 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ tests-and-coverage:
     - mkdir -p ~/.config/matplotlib
     - echo "backend:template" > ~/.config/matplotlib/matplotlibrc
     - mkdir public
-    - py.test -v -n $NUM_CORES --cov-report html --cov-report term --cov=. -m "not longrun" --html test-report/index.html --junitxml=report.xml
+    - py.test -v -n $NUM_CORES --cov-report html --cov-report xml --cov-report term --cov=. -m "not longrun" --html test-report/index.html --junitxml=report.xml
     - python3 -m coverage xml
   tags:
     - docker
@@ -156,7 +156,7 @@ arm64v8:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
   variables:
-    PYSTENCILS_SIMD: "neon"
+    QEMU_CPU: "cortex-a76"
   before_script:
     - *multiarch_before_script
     - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json
@@ -164,8 +164,6 @@ arm64v8:
 ppc64le:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le
-  variables:
-    PYSTENCILS_SIMD: "vsx"
   before_script:
     - *multiarch_before_script
     - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json
@@ -174,8 +172,6 @@ arm64v9:
   # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors).
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64
-  variables:
-    PYSTENCILS_SIMD: "sve128,sve256,sve512,sve"
   before_script:
     - *multiarch_before_script
     - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json
@@ -187,6 +183,7 @@ riscv64:
   extends: .multiarch_template
   image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64
   variables:
+    # explicitly set SIMD as detection does not appear to work on QEMU
     PYSTENCILS_SIMD: "rvv"
     QEMU_CPU: "rv64,v=true"
   before_script:
diff --git a/doc/conf.py b/doc/conf.py
old mode 100644
new mode 100755
index c230cc945b3e58403af84146081aea79e83ca6c7..c493f806640eba010d1afee128b9bcd3ee5add3e
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -33,7 +33,7 @@ version = re.sub(r'(\d+\.\d+)\.\d+(.*)', r'\1\2', pystencils.__version__)
 version = re.sub(r'(\.dev\d+).*?$', r'\1', version)
 # The full version, including alpha/beta/rc tags.
 release = pystencils.__version__
-language = None
+language = 'en'
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
 default_role = 'any'
 pygments_style = 'sphinx'
diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py
index 067a26d58370460beb1852add9ae46b9709b0595..63bdb3a5f1324a099bbd82fd666bfaec11eeb5af 100644
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
@@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
             byte_alignment = 64
         elif byte_alignment == 'cacheline':
             cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
-            if all([s is None for s in cacheline_sizes]):
+            if all([s is None for s in cacheline_sizes]) or \
+                    max([s for s in cacheline_sizes if s is not None]) > 0x100000:
                 widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                           for is_name in instruction_sets
                           if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 7d0d028c0691e48252a287dd81b46fd0d0a420cc..8024d58c3960235611020cd05f3ea3755375cf5b 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
 from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
 from pystencils.typing import numpy_name_to_c
 
 
@@ -31,80 +32,68 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
         return get_vector_instruction_set_x86(type_name, instruction_set)
 
 
-_cache = None
-_cachelinesize = None
-
-
+@memorycache
 def get_supported_instruction_sets():
     """List of supported instruction sets on current hardware, or None if query failed."""
-    global _cache
-    if _cache is not None:
-        return _cache.copy()
     if 'PYSTENCILS_SIMD' in os.environ:
         return os.environ['PYSTENCILS_SIMD'].split(',')
-    if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
-        # not supported by cpuinfo
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
         return ['neon']
-    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
+    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
+        return ['neon']
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
         libc = CDLL('libc.so.6')
         hwcap = libc.getauxval(16)  # AT_HWCAP
-        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
-        return ['rvv'] if hwcap & hwcap_isa_v else []
-    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
-        import subprocess
-        import tempfile
-        from pystencils.cpu.cpujit import get_compiler_config
-        f = tempfile.NamedTemporaryFile(suffix='.cpp')
-        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
-        macros = subprocess.check_output(command, input='', text=True)
-        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
-            _cache = ['vsx']
-        else:
-            _cache = []
-        return _cache.copy()
-    try:
-        from cpuinfo import get_cpu_info
-    except ImportError:
-        return None
-
-    result = []
-    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
-    required_avx_flags = {'avx', 'avx2'}
-    required_avx512_flags = {'avx512f'}
-    required_neon_flags = {'neon'}
-    required_sve_flags = {'sve'}
-    flags = set(get_cpu_info()['flags'])
-    if flags.issuperset(required_sse_flags):
-        result.append("sse")
-    if flags.issuperset(required_avx_flags):
-        result.append("avx")
-    if flags.issuperset(required_avx512_flags):
-        result.append("avx512")
-    if flags.issuperset(required_neon_flags):
-        result.append("neon")
-    if flags.issuperset(required_sve_flags):
-        if platform.system() == 'Linux':
-            libc = CDLL('libc.so.6')
+        if hwcap & (1 << 22):  # HWCAP_SVE
             length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
             if length < 0:
                 raise OSError("SVE length query failed")
-            while length > 128:
+            while length >= 128:
                 result.append(f"sve{length}")
                 length //= 2
-        result.append("sve")
-    return result
+            result.append("sve")
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        return ['rvv'] if hwcap & hwcap_isa_v else []
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
+        try:
+            from cpuinfo import get_cpu_info
+        except ImportError:
+            return None
+
+        result = []
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_avx_flags = {'avx', 'avx2'}
+        required_avx512_flags = {'avx512f'}
+        flags = set(get_cpu_info()['flags'])
+        if flags.issuperset(required_sse_flags):
+            result.append("sse")
+        if flags.issuperset(required_avx_flags):
+            result.append("avx")
+        if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
+        return result
+    else:
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
+                                  (platform.system(), platform.machine()))
 
 
+@memorycache
 def get_cacheline_size(instruction_set):
     """Get the size (in bytes) of a cache block that can be zeroed without memory access.
        Usually, this is identical to the cache line size."""
-    global _cachelinesize
     
     instruction_sets = get_vector_instruction_set('double', instruction_set)
     if 'cachelineSize' not in instruction_sets:
         return None
-    if _cachelinesize is not None:
-        return _cachelinesize
     
     import pystencils as ps
     from pystencils.astnodes import SympyAssignment
@@ -117,5 +106,4 @@ def get_cacheline_size(instruction_set):
     ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
     kernel = ast.compile()
     kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
-    _cachelinesize = int(arr[0, 0])
-    return _cachelinesize
+    return int(arr[0, 0])
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index aebefec91d1b2f392b849f79960bf72dee666bf2..4c3febe2981562f6b0c923e8d68417c722ca7d61 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -146,9 +146,7 @@ def read_config():
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
-        if platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '')
-        elif platform.machine().startswith('ppc64'):
+        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
             default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
                                                                                         '-mcpu=native')
     elif platform.system().lower() == 'windows':
@@ -159,6 +157,9 @@ def read_config():
             ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
             ('restrict_qualifier', '__restrict')
         ])
+        if platform.machine() == 'ARM64':
+            default_compiler_config['arch'] = 'ARM64'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
     elif platform.system().lower() == 'darwin':
         default_compiler_config = OrderedDict([
             ('os', 'darwin'),
@@ -174,8 +175,8 @@ def read_config():
                 default_compiler_config['flags'] += ' ' + libomp
                 break
     else:
-        raise ValueError("The detection of the platform with platform.system() did not work. "
-                         "Pystencils is only supported for linux, windows, and darwin platforms.")
+        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
+                                  (platform.system(),))
 
     default_cache_config = OrderedDict([
         ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
@@ -393,7 +394,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                             has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
                                                                       loop.atoms(VectorMemoryAccess)])
                         if has_openmp and has_nontemporal:
-                            byte_width = ast_node.instruction_set['cachelineSize']
+                            cl_size = ast_node.instruction_set['cachelineSize']
+                            byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
                     offset = max(max(ast_node.ghost_layers)) * item_size
                     offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
 
diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h
index a27b8ff6fa9e7244a8a0467315ed06d3985ed7b6..5ffc6a4f17d46cc5495cfc82d81ed16e7636f040 100644
--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define __ARM_NEON
+#endif
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
 #endif
 
 inline void cachelineZero(void * p) {
+#if !defined(_MSC_VER) || defined(__clang__)
 	__asm__ volatile("dc zva, %0"::"r"(p):"memory");
+#endif
 }
 
 inline size_t _cachelineSize() {
+#if !defined(_MSC_VER) || defined(__clang__)
 	// check that dc zva is permitted
 	uint64_t dczid;
 	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
@@ -72,6 +79,7 @@ inline size_t _cachelineSize() {
 			return size;
 		}
 	}
+#endif
 	
 	// too much was zeroed
 	return SIZE_MAX;
diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h
index 6c1d9d4d02636bc73a56ea5f0896eb128a86fbd1..eb1fe4dc41f2851660723a3c2ddd57fafb06a22a 100644
--- a/pystencils/include/myintrin.h
+++ b/pystencils/include/myintrin.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
 #ifdef __AVX512VL__
@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
 }
 #endif
 
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 0571eb39d114c5ea29c974e9630744d0bbcc1540..fab94146889a854f09537b0395cbee5607355c1e 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -1,16 +1,20 @@
 #ifndef __OPENCL_VERSION__
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
 #include <immintrin.h> // AVX*
-#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <smmintrin.h>  // SSE4
 #ifdef __FMA__
 #include <immintrin.h> // FMA
 #endif
 #endif
 
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#define __ARM_NEON
+#endif
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 }
 
 #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
     __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
@@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
     philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
 
+#ifndef _MSC_VER
 QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
 {
     philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
+#endif
 
 QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
@@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }
 
+#ifndef _MSC_VER
 QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
                                float64x2_t & rnd1, float64x2_t & rnd2)
@@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
 #endif
+#endif
 
 
 #if defined(__ARM_FEATURE_SVE)
diff --git a/pystencils/typing/utilities.py b/pystencils/typing/utilities.py
index da40c510ef91c7ca7fee0e6a0259b3eef50f0ab8..223da701a4d5c133715eb30f99366c44b13f16b2 100644
--- a/pystencils/typing/utilities.py
+++ b/pystencils/typing/utilities.py
@@ -187,18 +187,15 @@ def get_type_of_expression(expr,
 
 # Fix for sympy versions from 1.9
 sympy_version = sp.__version__.split('.')
-if int(sympy_version[0]) * 100 + int(sympy_version[1]) >= 109:
+sympy_version_int = int(sympy_version[0]) * 100 + int(sympy_version[1])
+if sympy_version_int >= 109:
     # __setstate__ would bypass the contructor, so we remove it
-    sp.Number.__getstate__ = sp.Basic.__getstate__
-    del sp.Basic.__getstate__
-
-    class FunctorWithStoredKwargs:
-        def __init__(self, func, **kwargs):
-            self.func = func
-            self.kwargs = kwargs
-
-        def __call__(self, *args):
-            return self.func(*args, **self.kwargs)
+    if sympy_version_int >= 111:
+        del sp.Basic.__setstate__
+        del sp.Symbol.__setstate__
+    else:
+        sp.Number.__getstate__ = sp.Basic.__getstate__
+        del sp.Basic.__getstate__
 
     # __reduce_ex__ would strip kwargs, so we override it
     def basic_reduce_ex(self, protocol):
@@ -210,9 +207,7 @@ if int(sympy_version[0]) * 100 + int(sympy_version[1]) >= 109:
             state = self.__getstate__()
         else:
             state = None
-        return FunctorWithStoredKwargs(type(self), **kwargs), args, state
-
-    sp.Number.__reduce_ex__ = sp.Basic.__reduce_ex__
+        return partial(type(self), **kwargs), args, state
     sp.Basic.__reduce_ex__ = basic_reduce_ex