diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f333e761d5568ba256c3d5c48a2c6dfd6f4660b5..f338f2740a72d98e046a0b14c300e3d74688f839 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,7 +21,7 @@ tests-and-coverage: - mkdir -p ~/.config/matplotlib - echo "backend:template" > ~/.config/matplotlib/matplotlibrc - mkdir public - - py.test -v -n $NUM_CORES --cov-report html --cov-report term --cov=. -m "not longrun" --html test-report/index.html --junitxml=report.xml + - py.test -v -n $NUM_CORES --cov-report html --cov-report xml --cov-report term --cov=. -m "not longrun" --html test-report/index.html --junitxml=report.xml - python3 -m coverage xml tags: - docker @@ -156,7 +156,7 @@ arm64v8: extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 variables: - PYSTENCILS_SIMD: "neon" + QEMU_CPU: "cortex-a76" before_script: - *multiarch_before_script - sed -i s/march=native/march=armv8-a/g ~/.config/pystencils/config.json @@ -164,8 +164,6 @@ arm64v8: ppc64le: extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/ppc64le - variables: - PYSTENCILS_SIMD: "vsx" before_script: - *multiarch_before_script - sed -i s/mcpu=native/mcpu=power8/g ~/.config/pystencils/config.json @@ -174,8 +172,6 @@ arm64v9: # SVE support is still unreliable in GCC 11 (incorrect code for fixed-width vectors, internal compiler errors). extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/arm64 - variables: - PYSTENCILS_SIMD: "sve128,sve256,sve512,sve" before_script: - *multiarch_before_script - sed -i s/march=native/march=armv8-a+sve/g ~/.config/pystencils/config.json @@ -187,6 +183,7 @@ riscv64: extends: .multiarch_template image: i10git.cs.fau.de:5005/pycodegen/pycodegen/riscv64 variables: + # explicitly set SIMD as detection does not appear to work on QEMU PYSTENCILS_SIMD: "rvv" QEMU_CPU: "rv64,v=true" before_script: diff --git a/doc/conf.py b/doc/conf.py old mode 100644 new mode 100755 index c230cc945b3e58403af84146081aea79e83ca6c7..c493f806640eba010d1afee128b9bcd3ee5add3e --- a/doc/conf.py +++ b/doc/conf.py @@ -33,7 +33,7 @@ version = re.sub(r'(\d+\.\d+)\.\d+(.*)', r'\1\2', pystencils.__version__) version = re.sub(r'(\.dev\d+).*?$', r'\1', version) # The full version, including alpha/beta/rc tags. release = pystencils.__version__ -language = None +language = 'en' exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] default_role = 'any' pygments_style = 'sphinx' diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py index 067a26d58370460beb1852add9ae46b9709b0595..63bdb3a5f1324a099bbd82fd666bfaec11eeb5af 100644 --- a/pystencils/alignedarray.py +++ b/pystencils/alignedarray.py @@ -25,7 +25,8 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o byte_alignment = 64 elif byte_alignment == 'cacheline': cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] - if all([s is None for s in cacheline_sizes]): + if all([s is None for s in cacheline_sizes]) or \ + max([s for s in cacheline_sizes if s is not None]) > 0x100000: widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize for is_name in instruction_sets if type(get_vector_instruction_set(dtype, is_name)['width']) is int] diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 7d0d028c0691e48252a287dd81b46fd0d0a420cc..8024d58c3960235611020cd05f3ea3755375cf5b 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -9,6 +9,7 @@ from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_ from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv +from pystencils.cache import memorycache from pystencils.typing import numpy_name_to_c @@ -31,80 +32,68 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'): return get_vector_instruction_set_x86(type_name, instruction_set) -_cache = None -_cachelinesize = None - - +@memorycache def get_supported_instruction_sets(): """List of supported instruction sets on current hardware, or None if query failed.""" - global _cache - if _cache is not None: - return _cache.copy() if 'PYSTENCILS_SIMD' in os.environ: return os.environ['PYSTENCILS_SIMD'].split(',') - if (platform.system() == 'Darwin' or platform.system() == 'Linux') and platform.machine() == 'arm64': - # not supported by cpuinfo + if platform.system() == 'Darwin' and platform.machine() == 'arm64': return ['neon'] - elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo + elif platform.system() == 'Windows' and platform.machine() == 'ARM64': + return ['neon'] + elif platform.system() == 'Linux' and platform.machine() == 'aarch64': + result = ['neon'] # Neon is mandatory on 64-bit ARM libc = CDLL('libc.so.6') hwcap = libc.getauxval(16) # AT_HWCAP - hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V - return ['rvv'] if hwcap & hwcap_isa_v else [] - elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo - import subprocess - import tempfile - from pystencils.cpu.cpujit import get_compiler_config - f = tempfile.NamedTemporaryFile(suffix='.cpp') - command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name] - macros = subprocess.check_output(command, input='', text=True) - if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros: - _cache = ['vsx'] - else: - _cache = [] - return _cache.copy() - try: - from cpuinfo import get_cpu_info - except ImportError: - return None - - result = [] - required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} - required_avx_flags = {'avx', 'avx2'} - required_avx512_flags = {'avx512f'} - required_neon_flags = {'neon'} - required_sve_flags = {'sve'} - flags = set(get_cpu_info()['flags']) - if flags.issuperset(required_sse_flags): - result.append("sse") - if flags.issuperset(required_avx_flags): - result.append("avx") - if flags.issuperset(required_avx512_flags): - result.append("avx512") - if flags.issuperset(required_neon_flags): - result.append("neon") - if flags.issuperset(required_sve_flags): - if platform.system() == 'Linux': - libc = CDLL('libc.so.6') + if hwcap & (1 << 22): # HWCAP_SVE length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL if length < 0: raise OSError("SVE length query failed") - while length > 128: + while length >= 128: result.append(f"sve{length}") length //= 2 - result.append("sve") - return result + result.append("sve") + return result + elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): + libc = CDLL('libc.so.6') + hwcap = libc.getauxval(16) # AT_HWCAP + hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V + return ['rvv'] if hwcap & hwcap_isa_v else [] + elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'): + libc = CDLL('libc.so.6') + hwcap = libc.getauxval(16) # AT_HWCAP + return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX + elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']: + try: + from cpuinfo import get_cpu_info + except ImportError: + return None + + result = [] + required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} + required_avx_flags = {'avx', 'avx2'} + required_avx512_flags = {'avx512f'} + flags = set(get_cpu_info()['flags']) + if flags.issuperset(required_sse_flags): + result.append("sse") + if flags.issuperset(required_avx_flags): + result.append("avx") + if flags.issuperset(required_avx512_flags): + result.append("avx512") + return result + else: + raise NotImplementedError('Instruction set detection for %s on %s is not implemented' % + (platform.system(), platform.machine())) +@memorycache def get_cacheline_size(instruction_set): """Get the size (in bytes) of a cache block that can be zeroed without memory access. Usually, this is identical to the cache line size.""" - global _cachelinesize instruction_sets = get_vector_instruction_set('double', instruction_set) if 'cachelineSize' not in instruction_sets: return None - if _cachelinesize is not None: - return _cachelinesize import pystencils as ps from pystencils.astnodes import SympyAssignment @@ -117,5 +106,4 @@ def get_cacheline_size(instruction_set): ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(**{f.name: arr, CachelineSize.symbol.name: 0}) - _cachelinesize = int(arr[0, 0]) - return _cachelinesize + return int(arr[0, 0]) diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index aebefec91d1b2f392b849f79960bf72dee666bf2..4c3febe2981562f6b0c923e8d68417c722ca7d61 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -146,9 +146,7 @@ def read_config(): ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'), ('restrict_qualifier', '__restrict__') ]) - if platform.machine() == 'arm64': - default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '') - elif platform.machine().startswith('ppc64'): + if platform.machine().startswith('ppc64') or platform.machine() == 'arm64': default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', '-mcpu=native') elif platform.system().lower() == 'windows': @@ -159,6 +157,9 @@ def read_config(): ('flags', '/Ox /fp:fast /OpenMP /arch:avx'), ('restrict_qualifier', '__restrict') ]) + if platform.machine() == 'ARM64': + default_compiler_config['arch'] = 'ARM64' + default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '') elif platform.system().lower() == 'darwin': default_compiler_config = OrderedDict([ ('os', 'darwin'), @@ -174,8 +175,8 @@ def read_config(): default_compiler_config['flags'] += ' ' + libomp break else: - raise ValueError("The detection of the platform with platform.system() did not work. " - "Pystencils is only supported for linux, windows, and darwin platforms.") + raise NotImplementedError('Generation of default compiler flags for %s is not implemented' % + (platform.system(),)) default_cache_config = OrderedDict([ ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')), @@ -393,7 +394,8 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in loop.atoms(VectorMemoryAccess)]) if has_openmp and has_nontemporal: - byte_width = ast_node.instruction_set['cachelineSize'] + cl_size = ast_node.instruction_set['cachelineSize'] + byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})" offset = max(max(ast_node.ghost_layers)) * item_size offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0" diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h index a27b8ff6fa9e7244a8a0467315ed06d3985ed7b6..5ffc6a4f17d46cc5495cfc82d81ed16e7636f040 100644 --- a/pystencils/include/arm_neon_helpers.h +++ b/pystencils/include/arm_neon_helpers.h @@ -1,3 +1,7 @@ +#if defined(_MSC_VER) +#define __ARM_NEON +#endif + #ifdef __ARM_NEON #include <arm_neon.h> #endif @@ -32,10 +36,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) #endif inline void cachelineZero(void * p) { +#if !defined(_MSC_VER) || defined(__clang__) __asm__ volatile("dc zva, %0"::"r"(p):"memory"); +#endif } inline size_t _cachelineSize() { +#if !defined(_MSC_VER) || defined(__clang__) // check that dc zva is permitted uint64_t dczid; __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid)); @@ -72,6 +79,7 @@ inline size_t _cachelineSize() { return size; } } +#endif // too much was zeroed return SIZE_MAX; diff --git a/pystencils/include/myintrin.h b/pystencils/include/myintrin.h index 6c1d9d4d02636bc73a56ea5f0896eb128a86fbd1..eb1fe4dc41f2851660723a3c2ddd57fafb06a22a 100644 --- a/pystencils/include/myintrin.h +++ b/pystencils/include/myintrin.h @@ -1,6 +1,6 @@ #pragma once -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) { #ifdef __AVX512VL__ @@ -28,7 +28,7 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ } #endif -#if defined(__SSE4_1__) || defined(_MSC_VER) +#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) __attribute__((optimize("no-associative-math"))) #endif diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h index 0571eb39d114c5ea29c974e9630744d0bbcc1540..fab94146889a854f09537b0395cbee5607355c1e 100644 --- a/pystencils/include/philox_rand.h +++ b/pystencils/include/philox_rand.h @@ -1,16 +1,20 @@ #ifndef __OPENCL_VERSION__ -#if defined(__SSE2__) || defined(_MSC_VER) +#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <emmintrin.h> // SSE2 #endif #ifdef __AVX2__ #include <immintrin.h> // AVX* -#elif defined(__SSE4_1__) || defined(_MSC_VER) +#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) #include <smmintrin.h> // SSE4 #ifdef __FMA__ #include <immintrin.h> // FMA #endif #endif +#if defined(_MSC_VER) && defined(_M_ARM64) +#define __ARM_NEON +#endif + #ifdef __ARM_NEON #include <arm_neon.h> #endif @@ -183,7 +187,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 } #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) -#if defined(__SSE4_1__) || defined(_MSC_VER) +#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64)) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) { __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0)); @@ -665,12 +669,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); } +#ifndef _MSC_VER QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4) { philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); } +#endif QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, @@ -695,6 +701,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); } +#ifndef _MSC_VER QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, uint32 key0, uint32 key1, float64x2_t & rnd1, float64x2_t & rnd2) @@ -702,6 +709,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); } #endif +#endif #if defined(__ARM_FEATURE_SVE) diff --git a/pystencils/typing/utilities.py b/pystencils/typing/utilities.py index da40c510ef91c7ca7fee0e6a0259b3eef50f0ab8..223da701a4d5c133715eb30f99366c44b13f16b2 100644 --- a/pystencils/typing/utilities.py +++ b/pystencils/typing/utilities.py @@ -187,18 +187,15 @@ def get_type_of_expression(expr, # Fix for sympy versions from 1.9 sympy_version = sp.__version__.split('.') -if int(sympy_version[0]) * 100 + int(sympy_version[1]) >= 109: +sympy_version_int = int(sympy_version[0]) * 100 + int(sympy_version[1]) +if sympy_version_int >= 109: # __setstate__ would bypass the contructor, so we remove it - sp.Number.__getstate__ = sp.Basic.__getstate__ - del sp.Basic.__getstate__ - - class FunctorWithStoredKwargs: - def __init__(self, func, **kwargs): - self.func = func - self.kwargs = kwargs - - def __call__(self, *args): - return self.func(*args, **self.kwargs) + if sympy_version_int >= 111: + del sp.Basic.__setstate__ + del sp.Symbol.__setstate__ + else: + sp.Number.__getstate__ = sp.Basic.__getstate__ + del sp.Basic.__getstate__ # __reduce_ex__ would strip kwargs, so we override it def basic_reduce_ex(self, protocol): @@ -210,9 +207,7 @@ if int(sympy_version[0]) * 100 + int(sympy_version[1]) >= 109: state = self.__getstate__() else: state = None - return FunctorWithStoredKwargs(type(self), **kwargs), args, state - - sp.Number.__reduce_ex__ = sp.Basic.__reduce_ex__ + return partial(type(self), **kwargs), args, state sp.Basic.__reduce_ex__ = basic_reduce_ex