diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py
index 067a26d58370460beb1852add9ae46b9709b0595..63bdb3a5f1324a099bbd82fd666bfaec11eeb5af 100644
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
@@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
             byte_alignment = 64
         elif byte_alignment == 'cacheline':
             cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
-            if all([s is None for s in cacheline_sizes]):
+            if all([s is None for s in cacheline_sizes]) or \
+                    max([s for s in cacheline_sizes if s is not None]) > 0x100000:
                 widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                           for is_name in instruction_sets
                           if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 7d0d028c0691e48252a287dd81b46fd0d0a420cc..678a44f39d31465632a3b5f9b75f4acb839a7209 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -45,6 +45,9 @@ def get_supported_instruction_sets():
     if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64':
         # not supported by cpuinfo
         return ['neon']
+    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
+        # not supported by cpuinfo
+        return ['neon']
     elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
         libc = CDLL('libc.so.6')
         hwcap = libc.getauxval(16)  # AT_HWCAP
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index aebefec91d1b2f392b849f79960bf72dee666bf2..e761412375fb88d959f411e1e34c2b0afb83dd33 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -159,6 +159,9 @@ def read_config():
             ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
             ('restrict_qualifier', '__restrict')
         ])
+    if platform.machine() == 'ARM64':
+        default_compiler_config['arch'] = 'ARM64'
+        default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
     elif platform.system().lower() == 'darwin':
         default_compiler_config = OrderedDict([
             ('os', 'darwin'),
@@ -393,7 +396,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                             has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
                                                                       loop.atoms(VectorMemoryAccess)])
                         if has_openmp and has_nontemporal:
-                            byte_width = ast_node.instruction_set['cachelineSize']
+                            cl_size = ast_node.instruction_set['cachelineSize']
+                            byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
                     offset = max(max(ast_node.ghost_layers)) * item_size
                     offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
 
diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h
index a27b8ff6fa9e7244a8a0467315ed06d3985ed7b6..5900543b7ace3e032cf53232fd0d83a8c4306d2b 100644
--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define __ARM_NEON
+#endif
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
 #endif
 
 inline void cachelineZero(void * p) {
+#ifndef _MSC_VER
 	__asm__ volatile("dc zva, %0"::"r"(p):"memory");
+#endif
 }
 
 inline size_t _cachelineSize() {
+#ifndef _MSC_VER
 	// check that dc zva is permitted
 	uint64_t dczid;
 	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
@@ -72,6 +79,7 @@ inline size_t _cachelineSize() {
 			return size;
 		}
 	}
+#endif
 	
 	// too much was zeroed
 	return SIZE_MAX;
diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h
index 6c1d9d4d02636bc73a56ea5f0896eb128a86fbd1..eb1fe4dc41f2851660723a3c2ddd57fafb06a22a 100644
--- a/pystencils/include/myintrin.h
+++ b/pystencils/include/myintrin.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
 #ifdef __AVX512VL__
@@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
 }
 #endif
 
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index 0571eb39d114c5ea29c974e9630744d0bbcc1540..fab94146889a854f09537b0395cbee5607355c1e 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -1,16 +1,20 @@
 #ifndef __OPENCL_VERSION__
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
 #include <immintrin.h> // AVX*
-#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <smmintrin.h>  // SSE4
 #ifdef __FMA__
 #include <immintrin.h> // FMA
 #endif
 #endif
 
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#define __ARM_NEON
+#endif
+
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 }
 
 #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
     __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
@@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
     philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
 
+#ifndef _MSC_VER
 QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
 {
     philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
+#endif
 
 QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
@@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }
 
+#ifndef _MSC_VER
 QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
                                float64x2_t & rnd1, float64x2_t & rnd2)
@@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
 #endif
+#endif
 
 
 #if defined(__ARM_FEATURE_SVE)