diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py
index 1f2f51cd6154a63beab7c4b6939e8f41b6ad976d..6c388f3e4cb798297f1902b06ef82b0909d191a5 100644
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
@@ -33,21 +33,21 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
         '<': 'clt[0, 1]',
         '>=': 'cge[0, 1]',
         '>': 'cgt[0, 1]',
-        # '&': 'and[0, 1]', -> only for integer values available
-        # '|': 'orr[0, 1]'
-
     }
 
     bits = {'double': 64,
-            'float': 32}
+            'float': 32,
+            'int': 32}
 
     if q_registers is True:
         q_reg = 'q'
         width = 128 // bits[data_type]
+        intwidth = 128 // bits['int']  # integer lanes per register, independent of data_type
         suffix = f'q_f{bits[data_type]}'
     else:
         q_reg = ''
         width = 64 // bits[data_type]
+        intwidth = 64 // bits['int']  # integer lanes per register, independent of data_type
         suffix = f'_f{bits[data_type]}'
 
     result = dict()
@@ -60,16 +60,26 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
 
         result[intrinsic_id] = 'v' + name + suffix + arg_string
 
-    result['makeVecConst'] = 'vdup' + q_reg + '_n_f' + str(bits[data_type]) + '({0})'
-    result['makeVec'] = 'vdup' + q_reg + '_n_f' + str(bits[data_type]) + '({0})'
+    result['makeVecConst'] = f'vdup{q_reg}_n_f{bits[data_type]}' + '({0})'
+    result['makeVec'] = f'vdup{q_reg}_n_f{bits[data_type]}' + '({0})'
+    result['makeVecConstInt'] = f'vdup{q_reg}_n_s{bits["int"]}' + '({0})'
+    result['makeVecInt'] = f'vdup{q_reg}_n_s{bits["int"]}' + '({0})'
+
+    result['+int'] = f"vadd{q_reg}_s{bits['int']}" + "({0}, {1})"  # honour q_registers like makeVecInt
 
     result['rsqrt'] = None
 
     result['width'] = width
-    result['double'] = 'float64x' + str(width) + '_t'
-    result['float'] = 'float32x' + str(width * 2) + '_t'
+    result['intwidth'] = intwidth
+    result[data_type] = f'float{bits[data_type]}x{width}_t'
+    result['int'] = f'int{bits["int"]}x{intwidth}_t'  # lane count, e.g. int32x4_t with q registers
+    result['bool'] = f'uint{bits[data_type]}x{width}_t'
     result['headers'] = ['<arm_neon.h>']
 
-    result['!='] = 'vmvnq_u%d(%s)' % (bits[data_type], result['=='])
+    result['!='] = f'vmvn{q_reg}_u{bits[data_type]}({result["=="]})'
+
+    result['&'] = f'vand{q_reg}_u{bits[data_type]}' + '({0}, {1})'
+    result['|'] = f'vorr{q_reg}_u{bits[data_type]}' + '({0}, {1})'
+    result['blendv'] = f'vbsl{q_reg}_f{bits[data_type]}' + '({2}, {1}, {0})'
 
     return result
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 44284d6bbbfd3fec99306408c22c5d34493eb508..3bcbaee1be5bdf9ac9d5c50bc9e0d109f4a21b3f 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -1,3 +1,5 @@
+import platform
+
 from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
 
@@ -11,6 +13,8 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx', q_regi
 
 def get_supported_instruction_sets():
     """List of supported instruction sets on current hardware, or None if query failed."""
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
+        return ['neon']
     try:
         from cpuinfo import get_cpu_info
     except ImportError:
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 84908dcb29b38fc89a1b381d1491fbde59a09db7..ab4f8f84e628577d0b2343744e74954b541d4ce0 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -171,6 +171,13 @@ def read_config():
             ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
             ('restrict_qualifier', '__restrict__')
         ])
+        if platform.machine() == 'arm64':
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', '')
+            for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
+                           '/opt/homebrew/lib/libomp.dylib']:
+                if os.path.exists(libomp):
+                    default_compiler_config['flags'] += ' ' + libomp
+                    break
     default_cache_config = OrderedDict([
         ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
         ('clear_cache_on_start', False),
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index 51fa1a807db17b8ba58fda8bf080658015b33ef8..0af12adaf81584c93bff037705c1edcf67bbab93 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -6,7 +6,7 @@ import sympy as sp
 from sympy.logic.boolalg import BooleanFunction
 
 import pystencils.astnodes as ast
-from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.data_types import (
     PointerType, TypedSymbol, VectorType, cast_func, collate_types, get_type_of_expression, vector_memory_access)
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
@@ -26,7 +26,7 @@ class vec_all(sp.Function):
     nargs = (1,)
 
 
-def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
+def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
               assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
               assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
     """Explicit vectorization using SIMD vectorization via intrinsics.
@@ -51,6 +51,10 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                         depending on the access pattern there might be additional padding
                                         required at the end of the array
     """
+    if instruction_set == 'best':
+        # query the hardware only once; the result is None or empty when detection fails
+        supported_sets = get_supported_instruction_sets()
+        instruction_set = supported_sets[-1] if supported_sets else 'avx'
     if instruction_set is None:
         return
 
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index b4c83669d0fda05aee1a7e018904c927a59c7ad1..2950717738d93f19410dfcc17786a687b8ee8226 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -1,11 +1,11 @@
 #include <cstdint>
 
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE2__) || defined(_MSC_VER)
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
 #include <immintrin.h> // AVX*
-#else
+#elif defined(__SSE4_1__) || defined(_MSC_VER)
 #include <smmintrin.h> // SSE4
 #ifdef __FMA__
 #include <immintrin.h> // FMA
diff --git a/pystencils_tests/test_basic_usage_llvm.ipynb b/pystencils_tests/test_basic_usage_llvm.ipynb
index 4b7f843410cee16c39ee96c106ebbf1a7f9ac4ac..623bad6601f64a3f26521df74bb747359ee207ff 100644
--- a/pystencils_tests/test_basic_usage_llvm.ipynb
+++ b/pystencils_tests/test_basic_usage_llvm.ipynb
@@ -10,6 +10,16 @@
     "In this example a simple weighted Jacobi kernel is generated, so the focus remains on the part of LLVM generation."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pytest\n",
+    "pytest.importorskip('llvmlite')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py
index d3fc3dc66b4254e4b78a7699ce23316afb830132..5f22a230d501f1df26d605beba6fad5d1d251971 100644
--- a/pystencils_tests/test_conditional_vec.py
+++ b/pystencils_tests/test_conditional_vec.py
@@ -1,11 +1,15 @@
 import numpy as np
 import sympy as sp
+import pytest
 
 import pystencils as ps
 from pystencils.astnodes import Block, Conditional
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
 from pystencils.cpu.vectorization import vec_all, vec_any
 
 
+@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
+@pytest.mark.skipif('neon' in (get_supported_instruction_sets() or []), reason='ARM does not have collective instructions')
 def test_vec_any():
     data_arr = np.zeros((15, 15))
 
@@ -19,12 +23,14 @@ def test_vec_any():
         ]))
     ]
     ast = ps.create_kernel(c, target='cpu',
-                           cpu_vectorize_info={'instruction_set': 'avx'})
+                           cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]})
     kernel = ast.compile()
     kernel(data=data_arr)
     np.testing.assert_equal(data_arr[3:9, 0:8], 2.0)
 
 
+@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
+@pytest.mark.skipif('neon' in (get_supported_instruction_sets() or []), reason='ARM does not have collective instructions')
 def test_vec_all():
     data_arr = np.zeros((15, 15))
 
@@ -37,13 +43,14 @@ def test_vec_all():
         ]))
     ]
     ast = ps.create_kernel(c, target='cpu',
-                           cpu_vectorize_info={'instruction_set': 'avx'})
+                           cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]})
     kernel = ast.compile()
     before = data_arr.copy()
     kernel(data=data_arr)
     np.testing.assert_equal(data_arr, before)
 
 
+@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
 def test_boolean_before_loop():
     t1, t2 = sp.symbols('t1, t2')
     f_arr = np.ones((10, 10))
@@ -55,7 +62,7 @@ def test_boolean_before_loop():
         ps.Assignment(g[0, 0], sp.Piecewise((f[0, 0], t1), (42, True)))
     ]
 
-    ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': 'avx'})
+    ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': get_supported_instruction_sets()[-1]})
     kernel = ast.compile()
     kernel(f=f_arr, g=g_arr, t2=1.0)
     print(g)
diff --git a/pystencils_tests/test_dot_printer.ipynb b/pystencils_tests/test_dot_printer.ipynb
index 8b61525f520fddfdc37ac028241c897ffd8c02fd..67c0e14a947167b13ba012cc71fa6d46841f9aba 100644
--- a/pystencils_tests/test_dot_printer.ipynb
+++ b/pystencils_tests/test_dot_printer.ipynb
@@ -1,5 +1,15 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pytest\n",
+    "pytest.importorskip('graphviz')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
diff --git a/pystencils_tests/test_field_coordinates.py b/pystencils_tests/test_field_coordinates.py
index 1aeed343bc0c77d6e6b56b3d48e68c100be3f56c..5d95a308ca5829ba4d5f4ffc4f1022334ac5fd45 100644
--- a/pystencils_tests/test_field_coordinates.py
+++ b/pystencils_tests/test_field_coordinates.py
@@ -27,7 +27,7 @@ try:
     import skimage.io
     lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float32)
 except Exception:
-    lenna = np.random.rand(20, 30)
+    lenna = np.random.rand(20, 30).astype(np.float32)
 
 
 def test_rotate_center():
diff --git a/pystencils_tests/test_fvm.py b/pystencils_tests/test_fvm.py
index c289ac54e7492cef31d74867847e2aefb2663d67..c863c8f45fdb31386eda547f6774b16e864879b5 100644
--- a/pystencils_tests/test_fvm.py
+++ b/pystencils_tests/test_fvm.py
@@ -3,7 +3,6 @@ import pystencils as ps
 import numpy as np
 import pytest
 from itertools import product
-from scipy.optimize import curve_fit
 
 
 def advection_diffusion(dim: int):
@@ -94,6 +93,8 @@ def advection_diffusion(dim: int):
     calc_density = density(pos - velocity * time, time, D)
 
     target = [time, D]
+    pytest.importorskip('scipy.optimize')
+    from scipy.optimize import curve_fit
     popt, _ = curve_fit(lambda x, t, D: density(x - velocity * time, t, D),
                         pos.reshape(-1, dim),
                         sim_density.reshape(-1),
diff --git a/pystencils_tests/test_random.py b/pystencils_tests/test_random.py
index 856bfd2c5af53f7c01476136173439931b5a347a..30b55a66be082454e48de57284e1ed9678e6e19e 100644
--- a/pystencils_tests/test_random.py
+++ b/pystencils_tests/test_random.py
@@ -175,7 +175,7 @@ def test_staggered(vectorized):
     dh = ps.create_data_handling((8, 8), default_ghost_layers=0, default_target="cpu")
     j = dh.add_array("j", values_per_cell=dh.dim, field_type=ps.FieldType.STAGGERED_FLUX)
     a = ps.AssignmentCollection([ps.Assignment(j.staggered_access(n), 0) for n in j.staggered_stencil])
-    rng_symbol_gen = random_symbol(a.subexpressions, dim=dh.dim, rng_node=AESNITwoDoubles)
+    rng_symbol_gen = random_symbol(a.subexpressions, dim=dh.dim, rng_node=PhiloxTwoDoubles)
     a.main_assignments[0] = ps.Assignment(a.main_assignments[0].lhs, next(rng_symbol_gen))
     kernel = ps.create_staggered_kernel(a, target=dh.default_target).compile()
 
diff --git a/pystencils_tests/test_small_block_benchmark.ipynb b/pystencils_tests/test_small_block_benchmark.ipynb
index 45ab56bbe194952bfbadf937be43952a9ed4a43a..81101c5a0d33e45300300ab24c70c4c464eb5eac 100644
--- a/pystencils_tests/test_small_block_benchmark.ipynb
+++ b/pystencils_tests/test_small_block_benchmark.ipynb
@@ -1,5 +1,15 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pytest\n",
+    "pytest.importorskip('waLBerla')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 7b1891a4def311f20ae4a7f7932f83a5b0c821e2..40432f5b6b8e6a17dd6c03ef25d9c9800dfa8a45 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -203,7 +203,7 @@ def test_logical_operators():
 
 def test_hardware_query():
     instruction_sets = get_supported_instruction_sets()
-    assert 'sse' in instruction_sets
+    assert 'sse' in instruction_sets or 'neon' in instruction_sets
 
 
 def test_vectorised_pow():