diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py index 70271b0c0f3b50f04ceed671cabb16411554cdac..eda9fcaeba9915edeada8a1f72559821c2a0062b 100644 --- a/pystencils/alignedarray.py +++ b/pystencils/alignedarray.py @@ -10,20 +10,28 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o shape: size of the array byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0 By default, use the maximum required by the CPU (or 512 bits if this cannot be detected). + When 'cacheline' is specified, the size of a cache line is used. dtype: numpy data type byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0 typically used to align first inner cell instead of ghost layer order: storage linearization order align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well """ - if byte_alignment is True: - from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, + if byte_alignment is True or byte_alignment == 'cacheline': + from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size, get_vector_instruction_set) type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name) instruction_sets = get_supported_instruction_sets() if instruction_sets is None: byte_alignment = 64 + elif byte_alignment == 'cacheline': + cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] + if all([s is None for s in cacheline_sizes]): + byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize + for is_name in instruction_sets]) + else: + byte_alignment = max([s for s in cacheline_sizes if s is not None]) else: byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize for is_name in instruction_sets]) diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index b874db9b0c2f869131fa4986714aa172b8bfafdc..53731645797afeb62a4a5ff803c2ee7b4e5d8e16 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -313,7 +313,9 @@ class Block(Node): self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes] return self - def insert_front(self, node): + def insert_front(self, node, if_not_exists=False): + if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node: + return if isinstance(node, collections.abc.Iterable): node = list(node) for n in node: @@ -324,7 +326,7 @@ class Block(Node): node.parent = self self._nodes.insert(0, node) - def insert_before(self, new_node, insert_before): + def insert_before(self, new_node, insert_before, if_not_exists=False): new_node.parent = self assert self._nodes.count(insert_before) == 1 idx = self._nodes.index(insert_before) @@ -337,7 +339,25 @@ class Block(Node): idx -= 1 else: break - self._nodes.insert(idx, new_node) + if not if_not_exists or self._nodes[idx] != new_node: + self._nodes.insert(idx, new_node) + + def insert_after(self, new_node, insert_after, if_not_exists=False): + new_node.parent = self + assert self._nodes.count(insert_after) == 1 + idx = self._nodes.index(insert_after) + 1 + + # move all assignment (definitions to the top) + if isinstance(new_node, SympyAssignment) and new_node.is_declaration: + while idx > 0: + pn = self._nodes[idx - 1] + if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional): + idx -= 1 + else: + break + if not if_not_exists or not (self._nodes[idx - 1] == new_node + or (idx < len(self._nodes) and self._nodes[idx] == new_node)): + self._nodes.insert(idx, new_node) def append(self, node): if isinstance(node, list) or isinstance(node, tuple): @@ -816,3 +836,47 @@ class ConditionalFieldAccess(sp.Function): def __getnewargs__(self): return self.access, self.outofbounds_condition, self.outofbounds_value + + +class NontemporalFence(Node): + def __init__(self): + super(NontemporalFence, self).__init__(parent=None) + + @property + def symbols_defined(self): + return set() + + @property + def undefined_symbols(self): + return set() + + @property + def args(self): + return [] + + def __eq__(self, other): + return isinstance(other, NontemporalFence) + + +class CachelineSize(Node): + symbol = sp.Symbol("_clsize") + mask_symbol = sp.Symbol("_clsize_mask") + last_symbol = sp.Symbol("_cl_lastvec") + + def __init__(self): + super(CachelineSize, self).__init__(parent=None) + + @property + def symbols_defined(self): + return set([self.symbol, self.mask_symbol, self.last_symbol]) + + @property + def undefined_symbols(self): + return set() + + @property + def args(self): + return [] + + def __eq__(self, other): + return isinstance(other, CachelineSize) diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py index 201451fb8c7d39f1396fa6c7c799051c4d2d8102..a386253a098be738f9a6bc932144edebeaa4a1ea 100644 --- a/pystencils/backends/arm_instruction_sets.py +++ b/pystencils/backends/arm_instruction_sets.py @@ -28,7 +28,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): 'loadA': 'ld1[0]', 'storeU': 'st1[0, 1]', 'storeA': 'st1[0, 1]', - 'stream': 'st1[0, 1]', 'abs': 'abs[0]', '==': 'ceq[0, 1]', @@ -47,6 +46,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): suffix = f'q_f{bits[data_type]}' result = dict() + result['bytes'] = 16 for intrinsic_id, function_shortcut in base_names.items(): function_shortcut = function_shortcut.strip() @@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' + result['cachelineSize'] = 'cachelineSize()' + result['cachelineZero'] = 'cachelineZero((void*) {0})' + return result diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 2a15ef74f9ee5a528538d5976cd4a40d55426c82..9d7aa3cc30ba465fc97b97b75d7227776366ae51 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -1,5 +1,6 @@ import re from collections import namedtuple +import hashlib from typing import Set import numpy as np @@ -7,7 +8,7 @@ import sympy as sp from sympy.core import S from sympy.logic.boolalg import BooleanFalse, BooleanTrue -from pystencils.astnodes import KernelFunction, Node +from pystencils.astnodes import KernelFunction, Node, CachelineSize from pystencils.cpu.vectorization import vec_all, vec_any from pystencils.data_types import ( PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression, @@ -258,7 +259,7 @@ class CBackend: arg, data_type, aligned, nontemporal, mask = node.lhs.args instr = 'storeU' if aligned: - instr = 'stream' if nontemporal else 'storeA' + instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA' if mask != True: # NOQA instr = 'maskStore' if aligned else 'maskStoreU' printed_mask = self.sympy_printer.doprint(mask) @@ -271,21 +272,60 @@ class CBackend: else: rhs = node.rhs - return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]), - self.sympy_printer.doprint(rhs), + ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0]) + pre_code = '' + if nontemporal and 'cachelineZero' in self._vector_instruction_set: + pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "{\n\t" + \ + self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n}\n' + + code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), printed_mask) + ';' + flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}" + if nontemporal and 'flushCacheline' in self._vector_instruction_set: + code2 = self._vector_instruction_set['flushCacheline'].format( + ptr, self.sympy_printer.doprint(rhs)) + ';' + code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}" + elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: + tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8] + code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \ + + self.sympy_printer.doprint(rhs) + ';' + code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask) + ';' + code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask) \ + + ';' + code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}" + return pre_code + code else: return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};" + def _print_NontemporalFence(self, _): + if 'streamFence' in self._vector_instruction_set: + return self._vector_instruction_set['streamFence'] + ';' + else: + return '' + + def _print_CachelineSize(self, node): + if 'cachelineSize' in self._vector_instruction_set: + code = f'const size_t {node.symbol} = {self._vector_instruction_set["cachelineSize"]};\n' + code += f'const size_t {node.mask_symbol} = {node.symbol} - 1;\n' + vectorsize = self._vector_instruction_set['bytes'] + code += f'const size_t {node.last_symbol} = {node.symbol} - {vectorsize};\n' + return code + else: + return '' + def _print_TemporaryMemoryAllocation(self, node): - align = 64 + if self._vector_instruction_set: + align = self._vector_instruction_set['bytes'] + else: + align = node.symbol.dtype.base_type.numpy_dtype.itemsize + np_dtype = node.symbol.dtype.base_type.numpy_dtype required_size = np_dtype.itemsize * node.size + align size = modulo_ceil(required_size, align) - code = "#if __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n" - code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n" - code += "#elif defined(_MSC_VER)\n" + code = "#if defined(_MSC_VER)\n" code += "{dtype} {name}=({dtype})_aligned_malloc({size}, {align}) + {offset};\n" + code += "#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n" + code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n" code += "#else\n" code += "{dtype} {name};\n" code += "posix_memalign((void**) &{name}, {align}, {size});\n" @@ -298,7 +338,11 @@ class CBackend: align=align) def _print_TemporaryMemoryFree(self, node): - align = 64 + if self._vector_instruction_set: + align = self._vector_instruction_set['bytes'] + else: + align = node.symbol.dtype.base_type.numpy_dtype.itemsize + code = "#if defined(_MSC_VER)\n" code += "_aligned_free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align)) code += "#else\n" diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py index bb9e3e85113023c0f2c82ddce432a464891ca756..f792127618b025b1bc9cf272e9c2c59d93422318 100644 --- a/pystencils/backends/ppc_instruction_sets.py +++ b/pystencils/backends/ppc_instruction_sets.py @@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): 'loadA': 'ld[0x0, 0]', 'storeU': 'xst[1, 0x0, 0]', 'storeA': 'st[1, 0x0, 0]', - 'stream': 'stl[1, 0x0, 0]', + 'storeAAndFlushCacheline': 'stl[1, 0x0, 0]', 'abs': 'abs[0]', '==': 'cmpeq[0, 1]', @@ -64,6 +64,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): intwidth = 128 // bits['int'] result = dict() + result['bytes'] = 16 for intrinsic_id, function_shortcut in base_names.items(): function_shortcut = function_shortcut.strip() @@ -78,6 +79,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}') result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}') result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}') + result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}', + '(__vector float) {1}') result['+int'] = "vec_add({0}, {1})" @@ -98,4 +101,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' + result['cachelineSize'] = 'cachelineSize()' + result['cachelineZero'] = 'cachelineZero((void*) {0})' + return result diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index 850f8ff6d4a9ae168a78c6588a69fd87f5e5f03e..9469dc59eb1b4d5d2189c138d42f4c1f233e5259 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -15,6 +15,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'): _cache = None +_cachelinesize = None def get_supported_instruction_sets(): @@ -56,3 +57,27 @@ def get_supported_instruction_sets(): if flags.issuperset(required_neon_flags): result.append("neon") return result + + +def get_cacheline_size(instruction_set): + """Get the size (in bytes) of a cache block that can be zeroed without memory access. + Usually, this is identical to the cache line size.""" + global _cachelinesize + + instruction_sets = get_vector_instruction_set('double', instruction_set) + if 'cachelineSize' not in instruction_sets: + return None + if _cachelinesize is not None: + return _cachelinesize + + import pystencils as ps + import numpy as np + + arr = np.zeros((1, 1), dtype=np.float32) + f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0) + ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)] + ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set}) + kernel = ast.compile() + kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0}) + _cachelinesize = int(arr[0, 0]) + return _cachelinesize diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py index 836ffc57906b503c03164715fbed6e19aabfe760..86196aad7047c8f2bcc16a94b487537d3147d36d 100644 --- a/pystencils/backends/x86_instruction_sets.py +++ b/pystencils/backends/x86_instruction_sets.py @@ -110,7 +110,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): result = { 'width': width[(data_type, instruction_set)], - 'intwidth': width[('int', instruction_set)] + 'intwidth': width[('int', instruction_set)], + 'bytes': 4 * width[("float", instruction_set)] } pre = prefix[instruction_set] for intrinsic_id, function_shortcut in base_names.items(): @@ -164,4 +165,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})" + result['streamFence'] = '_mm_mfence()' + return result diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index 6f2c9a5b0ea502e16393565b82638524caaa6dcd..68ca259024e26cd2d97aa16bf5698e163c5c8373 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -58,8 +58,9 @@ import numpy as np from appdirs import user_cache_dir, user_config_dir from pystencils import FieldType +from pystencils.astnodes import LoopOverCoordinate from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.data_types import cast_func, VectorType +from pystencils.data_types import cast_func, VectorType, vector_memory_access from pystencils.include import get_pystencils_include_path from pystencils.kernel_wrapper import KernelWrapper from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update @@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec if ast_node.instruction_set and aligned: byte_width = ast_node.instruction_set['width'] * item_size + if 'cachelineZero' in ast_node.instruction_set: + has_openmp, has_nontemporal = False, False + for loop in ast_node.atoms(LoopOverCoordinate): + has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines]) + has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in + loop.atoms(vector_memory_access)]) + if has_openmp and has_nontemporal: + byte_width = ast_node.instruction_set['cachelineSize'] offset = max(max(ast_node.ghost_layers)) * item_size offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0" @@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec "the kernel creation is not specified it will choose a suitable value " \ "automatically. This value might not " \ "be compatible with the allocated arrays." + if type(byte_width) is not int: + message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\ + "cacheline size is required." pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name, expected=message) diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index a7d2b76d8981732da019b2bc9f9acfc52a104d2f..0de34b40bf2c68a2b12f29c6ad1016618b5c34ac 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -148,6 +148,14 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a if hasattr(indexed, 'field'): nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields) substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True) + if nontemporal: + # insert NontemporalFence after the outermost loop + parent = loop_node.parent + while type(parent.parent.parent) is not ast.KernelFunction: + parent = parent.parent + parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True) + # insert CachelineSize at the beginning of the kernel + parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True) if not successful: warnings.warn("Could not vectorize loop because of non-consecutive memory access") continue diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h index ba6cbc2d7bae45591bcec580b98394c4f6830339..3d06d69bfc2dd866370e98968dacce3aaef3975c 100644 --- a/pystencils/include/arm_neon_helpers.h +++ b/pystencils/include/arm_neon_helpers.h @@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) alignas(16) int data[4] = {a, b, c, d}; return vld1q_s32(data); } + +inline void cachelineZero(void * p) { + __asm__ volatile("dc zva, %0"::"r"(p)); +} + +inline size_t _cachelineSize() { + // check that dc zva is permitted + uint64_t dczid; + __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid)); + if ((dczid & (1 << 4)) != 0) { + return SIZE_MAX; + } + + // allocate and fill with ones + const size_t max_size = 0x100000; + uint8_t data[2*max_size]; + for (size_t i = 0; i < 2*max_size; ++i) { + data[i] = 0xff; + } + + // find alignment offset + size_t offset = max_size - ((uintptr_t) data) % max_size; + + // zero a cacheline + cachelineZero((void*) (data + offset)); + + // make sure that at least one byte was zeroed + if (data[offset] != 0) { + return SIZE_MAX; + } + + // make sure that nothing was zeroed before the pointer + if (data[offset-1] == 0) { + return SIZE_MAX; + } + + // find the last byte that was zeroed + for (size_t size = 1; size < max_size; ++size) { + if (data[offset + size] != 0) { + return size; + } + } + + // too much was zeroed + return SIZE_MAX; +} + +inline size_t cachelineSize() { + static size_t size = _cachelineSize(); + return size; +} diff --git a/pystencils/include/ppc_altivec_helpers.h b/pystencils/include/ppc_altivec_helpers.h index d2edb437a4bb945bf1abec95df34e6d518594fd7..ac4ebd2e1f1b3a65421c228516b7768e794a03a8 100644 --- a/pystencils/include/ppc_altivec_helpers.h +++ b/pystencils/include/ppc_altivec_helpers.h @@ -1,3 +1,51 @@ #include <altivec.h> #undef vector #undef bool + +inline void cachelineZero(void * p) { +#ifdef __xlC__ + __dcbz(p); +#else + __asm__ volatile("dcbz 0, %0"::"r"(p):"memory"); +#endif +} + +inline size_t _cachelineSize() { + // allocate and fill with ones + const size_t max_size = 0x100000; + uint8_t data[2*max_size]; + for (size_t i = 0; i < 2*max_size; ++i) { + data[i] = 0xff; + } + + // find alignment offset + size_t offset = max_size - ((uintptr_t) data) % max_size; + + // zero a cacheline + cachelineZero((void*) (data + offset)); + + // make sure that at least one byte was zeroed + if (data[offset] != 0) { + return SIZE_MAX; + } + + // make sure that nothing was zeroed before the pointer + if (data[offset-1] == 0) { + return SIZE_MAX; + } + + // find the last byte that was zeroed + for (size_t size = 1; size < max_size; ++size) { + if (data[offset + size] != 0) { + return size; + } + } + + // too much was zeroed + return SIZE_MAX; +} + +inline size_t cachelineSize() { + static size_t size = _cachelineSize(); + return size; +} diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 0889ab468f095796833a52e6dd1536ef031b578c..880a009a294569f004f7f8a0cba3a02b98ec5bd2 100644 --- a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -33,25 +33,39 @@ def test_vector_type_propagation(): np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3) -def test_aligned_and_nt_stores(): +def test_aligned_and_nt_stores(openmp=False): domain_size = (24, 24) # create a datahandling object dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu') # fields - g = dh.add_array("g", values_per_cell=1, alignment=True) + alignment = 'cacheline' if openmp else True + g = dh.add_array("g", values_per_cell=1, alignment=alignment) dh.fill("g", 1.0, ghost_layers=True) - f = dh.add_array("f", values_per_cell=1, alignment=True) + f = dh.add_array("f", values_per_cell=1, alignment=alignment) dh.fill("f", 0.0, ghost_layers=True) opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True} update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))] - ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt) + ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp) + if instruction_set in ['sse'] or instruction_set.startswith('avx'): + assert 'stream' in ast.instruction_set + assert 'streamFence' in ast.instruction_set + if instruction_set in ['neon', 'vsx'] or instruction_set.startswith('sve'): + assert 'cachelineZero' in ast.instruction_set + if instruction_set in ['vsx']: + assert 'storeAAndFlushCacheline' in ast.instruction_set + for instruction in ['stream', 'streamFence', 'cachelineZero', 'storeAAndFlushCacheline', 'flushCacheline']: + if instruction in ast.instruction_set: + assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast) kernel = ast.compile() dh.run_kernel(kernel) np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size)) +def test_aligned_and_nt_stores_openmp(): + test_aligned_and_nt_stores(True) + def test_inplace_update(): shape = (9, 9, 3) diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index 4476e5bf4f68b82056ecb3242237b1c3ce879191..fca50949e6dbed63cc97c6a2bff33aad7dbe2968 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -4,7 +4,8 @@ import numpy as np import sympy as sp import pystencils as ps -from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set +from pystencils.backends.simd_instruction_sets import (get_cacheline_size, get_supported_instruction_sets, + get_vector_instruction_set) from pystencils.data_types import cast_func, VectorType supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else [] @@ -76,4 +77,16 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set with pytest.raises(ValueError): dh.run_kernel(kernel) else: - dh.run_kernel(kernel) \ No newline at end of file + dh.run_kernel(kernel) + + +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +def test_cacheline_size(instruction_set): + cacheline_size = get_cacheline_size(instruction_set) + if cacheline_size is None: + pytest.skip() + instruction_set = get_vector_instruction_set('double', instruction_set) + vector_size = instruction_set['bytes'] + assert cacheline_size > 8 and cacheline_size < 0x100000, "Cache line size is implausible" + assert cacheline_size % vector_size == 0, "Cache line size should be multiple of vector size" + assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2"