diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py index 067a26d58370460beb1852add9ae46b9709b0595..63bdb3a5f1324a099bbd82fd666bfaec11eeb5af 100644 --- a/pystencils/alignedarray.py +++ b/pystencils/alignedarray.py @@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o byte_alignment = 64 elif byte_alignment == 'cacheline': cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] - if all([s is None for s in cacheline_sizes]): + if all([s is None for s in cacheline_sizes]) or \ + max([s for s in cacheline_sizes if s is not None]) > 0x100000: widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize for is_name in instruction_sets if type(get_vector_instruction_set(dtype, is_name)['width']) is int] diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 7d0d028c0691e48252a287dd81b46fd0d0a420cc..678a44f39d31465632a3b5f9b75f4acb839a7209 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -45,6 +45,9 @@ def get_supported_instruction_sets(): if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64': # not supported by cpuinfo return ['neon'] + elif platform.system() == 'Windows' and platform.machine() == 'ARM64': + # not supported by cpuinfo + return ['neon'] elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo libc = CDLL('libc.so.6') hwcap = libc.getauxval(16) # AT_HWCAP diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index aebefec91d1b2f392b849f79960bf72dee666bf2..e761412375fb88d959f411e1e34c2b0afb83dd33 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -159,6 +159,9 @@ def read_config(): ('flags', '/Ox /fp:fast /OpenMP /arch:avx'), ('restrict_qualifier', '__restrict') ]) + if platform.machine() == 'ARM64': + default_compiler_config['arch'] = 'ARM64' + default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '') elif platform.system().lower() == 'darwin': default_compiler_config = OrderedDict([ ('os', 'darwin'), @@ -393,7 +396,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in loop.atoms(VectorMemoryAccess)]) if has_openmp and has_nontemporal: - byte_width = ast_node.instruction_set['cachelineSize'] + cl_size = ast_node.instruction_set['cachelineSize'] + byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})" offset = max(max(ast_node.ghost_layers)) * item_size offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0" diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h index a27b8ff6fa9e7244a8a0467315ed06d3985ed7b6..5900543b7ace3e032cf53232fd0d83a8c4306d2b 100644 --- a/pystencils/include/arm_neon_helpers.h +++ b/pystencils/include/arm_neon_helpers.h @@ -1,3 +1,7 @@ +#if defined(_MSC_VER) +#define __ARM_NEON +#endif + #ifdef __ARM_NEON #include <arm_neon.h> #endif @@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) #endif inline void cachelineZero(void * p) { +#ifndef _MSC_VER __asm__ volatile("dc zva, %0"::"r"(p):"memory"); +#endif } inline size_t _cachelineSize() { +#ifndef _MSC_VER // check that dc zva is permitted uint64_t dczid; __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid)); @@ -72,6 +79,7 @@ inline size_t _cachelineSize() { return size; } } +#endif // too much was zeroed return SIZE_MAX; diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h index 6c1d9d4d02636bc73a56ea5f0896eb128a86fbd1..eb1fe4dc41f2851660723a3c2ddd57fafb06a22a 100644 --- a/pystencils/include/myintrin.h +++ b/pystencils/include/myintrin.h @@ -1,6 +1,6 @@ #pragma once -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) { #ifdef __AVX512VL__ @@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ } #endif -#if defined(__SSE4_1__) || defined(_MSC_VER) +#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) __attribute__((optimize("no-associative-math"))) #endif diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index 0571eb39d114c5ea29c974e9630744d0bbcc1540..fab94146889a854f09537b0395cbee5607355c1e 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -1,16 +1,20 @@ #ifndef __OPENCL_VERSION__ -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <emmintrin.h> // SSE2 #endif #ifdef __AVX2__ #include <immintrin.h> // AVX* -#elif defined(__SSE4_1__) || defined(_MSC_VER) +#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <smmintrin.h> // SSE4 #ifdef __FMA__ #include <immintrin.h> // FMA #endif #endif +#if defined(_MSC_VER) && defined(_M_ARM64) +#define __ARM_NEON +#endif + #ifdef __ARM_NEON #include <arm_neon.h> #endif @@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 } #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) -#if defined(__SSE4_1__) || defined(_MSC_VER) +#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) { __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0)); @@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); } +#ifndef _MSC_VER QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4) { philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); } +#endif QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, @@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); } +#ifndef _MSC_VER QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float64x2_t & rnd1, float64x2_t & rnd2) @@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); } #endif +#endif #if defined(__ARM_FEATURE_SVE)