From bcd2d628663a756186ca4cfa6bab68731d597610 Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Thu, 1 Apr 2021 22:41:17 +0200
Subject: [PATCH] Allow fields to be aligned to cacheline size

---
Reviewer notes (free text after the three-dash line; not part of the commit):

* aligned_empty(): byte_alignment='cacheline' now takes the same branch as
  byte_alignment=True. If get_supported_instruction_sets() returns None the
  alignment is 64; otherwise it is the largest non-None value returned by
  get_cacheline_size(); if every value is None it falls back to the widest
  vector ('width' * itemsize), exactly like the byte_alignment=True path.
* get_cacheline_size(): returns None when the instruction set dictionary has
  no 'cachelineSize' entry. Otherwise the value is obtained by compiling and
  running a one-cell kernel and is stored in the module-level _cachelinesize.
  That cache is a single value and is not keyed by instruction_set, so after
  the first measurement every instruction set that defines 'cachelineSize'
  gets the same number back.
* cbackend.py: both fallbacks replace numpy_dtype.type(0).nbytes with
  numpy_dtype.itemsize.
* NOTE(review): this copy of the patch had its line breaks and indentation
  collapsed. Line breaks were re-derived from the hunk line counts; leading
  whitespace and the position of blank context lines follow Python syntax
  and are assumptions - verify against the upstream commit before applying.

 pystencils/alignedarray.py                    | 12 +++++++--
 pystencils/backends/cbackend.py               |  4 +--
 pystencils/backends/simd_instruction_sets.py  | 25 +++++++++++++++++++
 pystencils_tests/test_vectorization.py        |  6 +----
 .../test_vectorization_specific.py            | 17 +++++++++++--
 5 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py
index 70271b0c0..eda9fcaeb 100644
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
@@ -10,20 +10,28 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
         shape: size of the array
         byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
                         By default, use the maximum required by the CPU (or 512 bits if this cannot be detected).
+                        When 'cacheline' is specified, the size of a cache line is used.
         dtype: numpy data type
         byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
                     typically used to align first inner cell instead of ghost layer
         order: storage linearization order
         align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
     """
-    if byte_alignment is True:
-        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets,
+    if byte_alignment is True or byte_alignment == 'cacheline':
+        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
                                                                get_vector_instruction_set)
 
         type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
         instruction_sets = get_supported_instruction_sets()
         if instruction_sets is None:
             byte_alignment = 64
+        elif byte_alignment == 'cacheline':
+            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
+            if all([s is None for s in cacheline_sizes]):
+                byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+                                      for is_name in instruction_sets])
+            else:
+                byte_alignment = max([s for s in cacheline_sizes if s is not None])
         else:
             byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
                                   for is_name in instruction_sets])
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index a924b67fd..9ff44bbef 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -317,7 +317,7 @@ class CBackend:
         if self._vector_instruction_set:
             align = self._vector_instruction_set['bytes']
         else:
-            align = node.symbol.dtype.base_type.numpy_dtype.type(0).nbytes
+            align = node.symbol.dtype.base_type.numpy_dtype.itemsize
 
         np_dtype = node.symbol.dtype.base_type.numpy_dtype
         required_size = np_dtype.itemsize * node.size + align
@@ -341,7 +341,7 @@ class CBackend:
         if self._vector_instruction_set:
             align = self._vector_instruction_set['bytes']
         else:
-            align = node.symbol.dtype.base_type.numpy_dtype.type(0).nbytes
+            align = node.symbol.dtype.base_type.numpy_dtype.itemsize
 
         code = "#if defined(_MSC_VER)\n"
         code += "_aligned_free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 850f8ff6d..9469dc59e 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -15,6 +15,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
 
 
 _cache = None
+_cachelinesize = None
 
 
 def get_supported_instruction_sets():
@@ -56,3 +57,27 @@ def get_supported_instruction_sets():
         if flags.issuperset(required_neon_flags):
             result.append("neon")
     return result
+
+
+def get_cacheline_size(instruction_set):
+    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
+       Usually, this is identical to the cache line size."""
+    global _cachelinesize
+
+    instruction_sets = get_vector_instruction_set('double', instruction_set)
+    if 'cachelineSize' not in instruction_sets:
+        return None
+    if _cachelinesize is not None:
+        return _cachelinesize
+
+    import pystencils as ps
+    import numpy as np
+
+    arr = np.zeros((1, 1), dtype=np.float32)
+    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
+    ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)]
+    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
+    kernel = ast.compile()
+    kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0})
+    _cachelinesize = int(arr[0, 0])
+    return _cachelinesize
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 59c85a425..783b9bb34 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -38,12 +38,8 @@ def test_aligned_and_nt_stores(openmp=False):
     # create a datahandling object
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
 
-    if openmp:
-        alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True
-    else:
-        alignment = True
-
     # fields
+    alignment = 'cacheline' if openmp else True
     g = dh.add_array("g", values_per_cell=1, alignment=alignment)
     dh.fill("g", 1.0, ghost_layers=True)
     f = dh.add_array("f", values_per_cell=1, alignment=alignment)
diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py
index 4476e5bf4..fca50949e 100644
--- a/pystencils_tests/test_vectorization_specific.py
+++ b/pystencils_tests/test_vectorization_specific.py
@@ -4,7 +4,8 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
+from pystencils.backends.simd_instruction_sets import (get_cacheline_size, get_supported_instruction_sets,
+                                                       get_vector_instruction_set)
 from pystencils.data_types import cast_func, VectorType
 
 supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else []
@@ -76,4 +77,16 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
         with pytest.raises(ValueError):
             dh.run_kernel(kernel)
     else:
-        dh.run_kernel(kernel)
\ No newline at end of file
+        dh.run_kernel(kernel)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+def test_cacheline_size(instruction_set):
+    cacheline_size = get_cacheline_size(instruction_set)
+    if cacheline_size is None:
+        pytest.skip()
+    instruction_set = get_vector_instruction_set('double', instruction_set)
+    vector_size = instruction_set['bytes']
+    assert cacheline_size > 8 and cacheline_size < 0x100000, "Cache line size is implausible"
+    assert cacheline_size % vector_size == 0, "Cache line size should be multiple of vector size"
+    assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2"
-- 
GitLab