From 587b23f659f6feed98756d186bfd1818cca4f8b4 Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Wed, 31 May 2023 20:33:33 +0200 Subject: [PATCH] Support Windows on ARM64 --- pystencils/alignedarray.py | 3 ++- pystencils/backends/simd_instruction_sets.py | 3 +++ pystencils/cpu/cpujit.py | 6 +++++- pystencils/include/arm_neon_helpers.h | 8 ++++++++ pystencils/include/myintrin.h | 4 ++-- pystencils/include/philox_rand.h | 14 +++++++++++--- 6 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py index 067a26d58..63bdb3a5f 100644 --- a/pystencils/alignedarray.py +++ b/pystencils/alignedarray.py @@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o byte_alignment = 64 elif byte_alignment == 'cacheline': cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] - if all([s is None for s in cacheline_sizes]): + if all([s is None for s in cacheline_sizes]) or \ + max([s for s in cacheline_sizes if s is not None]) > 0x100000: widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize for is_name in instruction_sets if type(get_vector_instruction_set(dtype, is_name)['width']) is int] diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 7d0d028c0..678a44f39 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -45,6 +45,9 @@ def get_supported_instruction_sets(): if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64': # not supported by cpuinfo return ['neon'] + elif platform.system() == 'Windows' and platform.machine() == 'ARM64': + # not supported by cpuinfo + return ['neon'] elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo libc = CDLL('libc.so.6') hwcap = libc.getauxval(16) # AT_HWCAP diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index aebefec91..e76141237 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -159,6 +159,9 @@ def read_config(): ('flags', '/Ox /fp:fast /OpenMP /arch:avx'), ('restrict_qualifier', '__restrict') ]) + if platform.machine() == 'ARM64': + default_compiler_config['arch'] = 'ARM64' + default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '') elif platform.system().lower() == 'darwin': default_compiler_config = OrderedDict([ ('os', 'darwin'), @@ -393,7 +396,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in loop.atoms(VectorMemoryAccess)]) if has_openmp and has_nontemporal: - byte_width = ast_node.instruction_set['cachelineSize'] + cl_size = ast_node.instruction_set['cachelineSize'] + byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})" offset = max(max(ast_node.ghost_layers)) * item_size offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0" diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h index a27b8ff6f..5900543b7 100644 --- a/pystencils/include/arm_neon_helpers.h +++ b/pystencils/include/arm_neon_helpers.h @@ -1,3 +1,7 @@ +#if defined(_MSC_VER) +#define __ARM_NEON +#endif + #ifdef __ARM_NEON #include <arm_neon.h> #endif @@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) #endif inline void cachelineZero(void * p) { +#ifndef _MSC_VER __asm__ volatile("dc zva, %0"::"r"(p):"memory"); +#endif } inline size_t _cachelineSize() { +#ifndef _MSC_VER // check that dc zva is permitted uint64_t dczid; __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid)); @@ -72,6 +79,7 @@ inline size_t _cachelineSize() { return size; } } +#endif // too much was zeroed return SIZE_MAX; diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h index 6c1d9d4d0..eb1fe4dc4 100644 --- a/pystencils/include/myintrin.h +++ b/pystencils/include/myintrin.h @@ -1,6 +1,6 @@ #pragma once -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) { #ifdef __AVX512VL__ @@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ } #endif -#if defined(__SSE4_1__) || defined(_MSC_VER) +#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) __attribute__((optimize("no-associative-math"))) #endif diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index 0571eb39d..fab941468 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -1,16 +1,20 @@ #ifndef __OPENCL_VERSION__ -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <emmintrin.h> // SSE2 #endif #ifdef __AVX2__ #include <immintrin.h> // AVX* -#elif defined(__SSE4_1__) || defined(_MSC_VER) +#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <smmintrin.h> // SSE4 #ifdef __FMA__ #include <immintrin.h> // FMA #endif #endif +#if defined(_MSC_VER) && defined(_M_ARM64) +#define __ARM_NEON +#endif + #ifdef __ARM_NEON #include <arm_neon.h> #endif @@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 } #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) -#if defined(__SSE4_1__) || defined(_MSC_VER) +#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) { __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0)); @@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); } +#ifndef _MSC_VER QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4) { philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); } +#endif QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, @@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); } +#ifndef _MSC_VER QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float64x2_t & rnd1, float64x2_t & rnd2) @@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); } #endif +#endif #if defined(__ARM_FEATURE_SVE) -- GitLab