diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index 74ca259da727d24c02946716d7015cbf42cd505d..a1f282b9de584ae0f1b452379f5acf99c9373ded 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -313,7 +313,9 @@ class Block(Node): self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes] return self - def insert_front(self, node): + def insert_front(self, node, if_not_exists=False): + if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node: + return if isinstance(node, collections.abc.Iterable): node = list(node) for n in node: @@ -854,3 +856,25 @@ class NontemporalFence(Node): def __eq__(self, other): return isinstance(other, NontemporalFence) + + +class CachelineSize(Node): + mask_symbol = sp.Symbol("_clsize_mask") + + def __init__(self): + super(CachelineSize, self).__init__(parent=None) + + @property + def symbols_defined(self): + return set([self.mask_symbol]) + + @property + def undefined_symbols(self): + return set() + + @property + def args(self): + return [] + + def __eq__(self, other): + return isinstance(other, CachelineSize) diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py index 201451fb8c7d39f1396fa6c7c799051c4d2d8102..c6da59dfda5621154f292c4c50b75a2d3dad7a5b 100644 --- a/pystencils/backends/arm_instruction_sets.py +++ b/pystencils/backends/arm_instruction_sets.py @@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' + result['cachelineSize'] = 'cachelineSize()' + result['cachelineZero'] = 'cachelineZero((void*) {0})' + return result diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 08fcede5eff66b532768964333c021fdd6060e81..b469b13f15a9e14032514d473b3ce2e07e037f5d 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -7,7 +7,7 @@ import sympy as sp from sympy.core import S from sympy.logic.boolalg import BooleanFalse, BooleanTrue -from pystencils.astnodes import KernelFunction, Node +from pystencils.astnodes import KernelFunction, Node, CachelineSize from pystencils.cpu.vectorization import vec_all, vec_any from pystencils.data_types import ( PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression, @@ -271,15 +271,27 @@ class CBackend: else: rhs = node.rhs - return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]), - self.sympy_printer.doprint(rhs), + ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0]) + pre_code = '' + if instr == 'stream' and 'cachelineZero' in self._vector_instruction_set: + pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "\n\t" + \ + self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n' + + code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), printed_mask) + ';' + return pre_code + code else: return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};" def _print_NontemporalFence(self, _): - if 'stream_fence' in self._vector_instruction_set: - return self._vector_instruction_set['stream_fence'] + ';' + if 'streamFence' in self._vector_instruction_set: + return self._vector_instruction_set['streamFence'] + ';' + else: + return '' + + def _print_CachelineSize(self, node): + if 'cachelineSize' in self._vector_instruction_set: + return f'const size_t {node.mask_symbol} = {self._vector_instruction_set["cachelineSize"]} - 1;' else: return '' diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py index bb9e3e85113023c0f2c82ddce432a464891ca756..a1c481ae41ca2036b90dc5c338cefd7c71dc9fc4 100644 --- a/pystencils/backends/ppc_instruction_sets.py +++ b/pystencils/backends/ppc_instruction_sets.py @@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): 'loadA': 'ld[0x0, 0]', 'storeU': 'xst[1, 0x0, 0]', 'storeA': 'st[1, 0x0, 0]', - 'stream': 'stl[1, 0x0, 0]', + 'stream': 'st[1, 0x0, 0]', # stl would flush the cacheline, which only makes sense for the last item 'abs': 'abs[0]', '==': 'cmpeq[0, 1]', @@ -98,4 +98,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))' + result['cachelineSize'] = 'cachelineSize()' + result['cachelineZero'] = 'cachelineZero((void*) {0})' + return result diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py index 0454621eb1a20c039b327f1f9282d8f5e8542851..7809a89790e9f6b98cf70765a4124097adf40562 100644 --- a/pystencils/backends/x86_instruction_sets.py +++ b/pystencils/backends/x86_instruction_sets.py @@ -164,6 +164,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})" - result['stream_fence'] = '_mm_mfence()' + result['streamFence'] = '_mm_mfence()' return result diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index 13d705b36d5ab19224209ddc3d81dc01241d85f1..0de34b40bf2c68a2b12f29c6ad1016618b5c34ac 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -149,10 +149,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields) substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True) if nontemporal: + # insert NontemporalFence after the outermost loop parent = loop_node.parent while type(parent.parent.parent) is not ast.KernelFunction: parent = parent.parent parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True) + # insert CachelineSize at the beginning of the kernel + parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True) if not successful: warnings.warn("Could not vectorize loop because of non-consecutive memory access") continue diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h index ba6cbc2d7bae45591bcec580b98394c4f6830339..3d06d69bfc2dd866370e98968dacce3aaef3975c 100644 --- a/pystencils/include/arm_neon_helpers.h +++ b/pystencils/include/arm_neon_helpers.h @@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d) alignas(16) int data[4] = {a, b, c, d}; return vld1q_s32(data); } + +inline void cachelineZero(void * p) { + __asm__ volatile("dc zva, %0"::"r"(p)); +} + +inline size_t _cachelineSize() { + // check that dc zva is permitted + uint64_t dczid; + __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid)); + if ((dczid & (1 << 4)) != 0) { + return SIZE_MAX; + } + + // allocate and fill with ones + const size_t max_size = 0x100000; + uint8_t data[2*max_size]; + for (size_t i = 0; i < 2*max_size; ++i) { + data[i] = 0xff; + } + + // find alignment offset + size_t offset = max_size - ((uintptr_t) data) % max_size; + + // zero a cacheline + cachelineZero((void*) (data + offset)); + + // make sure that at least one byte was zeroed + if (data[offset] != 0) { + return SIZE_MAX; + } + + // make sure that nothing was zeroed before the pointer + if (data[offset-1] == 0) { + return SIZE_MAX; + } + + // find the last byte that was zeroed + for (size_t size = 1; size < max_size; ++size) { + if (data[offset + size] != 0) { + return size; + } + } + + // too much was zeroed + return SIZE_MAX; +} + +inline size_t cachelineSize() { + static size_t size = _cachelineSize(); + return size; +} diff --git a/pystencils/include/ppc_altivec_helpers.h b/pystencils/include/ppc_altivec_helpers.h index d2edb437a4bb945bf1abec95df34e6d518594fd7..ac4ebd2e1f1b3a65421c228516b7768e794a03a8 100644 --- a/pystencils/include/ppc_altivec_helpers.h +++ b/pystencils/include/ppc_altivec_helpers.h @@ -1,3 +1,51 @@ #include <altivec.h> #undef vector #undef bool + +inline void cachelineZero(void * p) { +#ifdef __xlC__ + __dcbz(p); +#else + __asm__ volatile("dcbz 0, %0"::"r"(p):"memory"); +#endif +} + +inline size_t _cachelineSize() { + // allocate and fill with ones + const size_t max_size = 0x100000; + uint8_t data[2*max_size]; + for (size_t i = 0; i < 2*max_size; ++i) { + data[i] = 0xff; + } + + // find alignment offset + size_t offset = max_size - ((uintptr_t) data) % max_size; + + // zero a cacheline + cachelineZero((void*) (data + offset)); + + // make sure that at least one byte was zeroed + if (data[offset] != 0) { + return SIZE_MAX; + } + + // make sure that nothing was zeroed before the pointer + if (data[offset-1] == 0) { + return SIZE_MAX; + } + + // find the last byte that was zeroed + for (size_t size = 1; size < max_size; ++size) { + if (data[offset + size] != 0) { + return size; + } + } + + // too much was zeroed + return SIZE_MAX; +} + +inline size_t cachelineSize() { + static size_t size = _cachelineSize(); + return size; +} diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 782ea28df03f352aea2e8fe015114b08c64d810c..d05c37c6bfd15f6935ec29523321fcd45dff86f1 100644 --- a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -33,7 +33,7 @@ def test_vector_type_propagation(): np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3) -def test_aligned_and_nt_stores(): +def test_aligned_and_nt_stores(openmp=False): domain_size = (24, 24) # create a datahandling object dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu') @@ -41,19 +41,29 @@ def test_aligned_and_nt_stores(): # fields g = dh.add_array("g", values_per_cell=1, alignment=True) dh.fill("g", 1.0, ghost_layers=True) - f = dh.add_array("f", values_per_cell=1, alignment=True) + if openmp: + # TODO: throw error when not cacheline-aligned + alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True + else: + alignment = True + f = dh.add_array("f", values_per_cell=1, alignment=alignment) dh.fill("f", 0.0, ghost_layers=True) opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True} update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))] - ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt) - if 'stream_fence' in ast.instruction_set: - assert ast.instruction_set['stream_fence'] in ps.get_code_str(ast) + ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp) + if 'streamFence' in ast.instruction_set: + assert ast.instruction_set['streamFence'] in ps.get_code_str(ast) + if 'cachelineZero' in ast.instruction_set: + assert ast.instruction_set['cachelineZero'].split('{0}')[0] in ps.get_code_str(ast) kernel = ast.compile() dh.run_kernel(kernel) np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size)) +def test_aligned_and_nt_stores_openmp(): + test_aligned_and_nt_stores(True) + def test_inplace_update(): shape = (9, 9, 3)