diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index b874db9b0c2f869131fa4986714aa172b8bfafdc..74ca259da727d24c02946716d7015cbf42cd505d 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -324,7 +324,15 @@ class Block(Node):
             node.parent = self
             self._nodes.insert(0, node)
 
-    def insert_before(self, new_node, insert_before):
+    def insert_before(self, new_node, insert_before, if_not_exists=False):
+        """Insert ``new_node`` in front of the node ``insert_before``.
+
+        Args:
+            new_node: node to insert; its parent is set to this block
+            insert_before: node of this block (must occur exactly once) that marks the insertion position
+            if_not_exists: if True, skip the insertion when the node at the insertion position
+                           already compares equal to ``new_node``
+        """
         new_node.parent = self
         assert self._nodes.count(insert_before) == 1
         idx = self._nodes.index(insert_before)
@@ -337,7 +345,38 @@ class Block(Node):
                     idx -= 1
                 else:
                     break
-        self._nodes.insert(idx, new_node)
+        if not if_not_exists or self._nodes[idx] != new_node:
+            self._nodes.insert(idx, new_node)
+
+    def insert_after(self, new_node, insert_after, if_not_exists=False):
+        """Insert ``new_node`` behind the node ``insert_after``.
+
+        Args:
+            new_node: node to insert; its parent is set to this block
+            insert_after: node of this block (must occur exactly once) that marks the insertion position
+            if_not_exists: if True, skip the insertion when a node directly next to the insertion
+                           position already compares equal to ``new_node``
+        """
+        new_node.parent = self
+        assert self._nodes.count(insert_after) == 1
+        idx = self._nodes.index(insert_after) + 1
+
+        # move all assignment (definitions to the top)
+        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
+            while idx > 0:
+                pn = self._nodes[idx - 1]
+                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
+                    idx -= 1
+                else:
+                    break
+        if if_not_exists:
+            # idx may have been moved to 0 above: guard the predecessor lookup, since index -1 would wrap
+            # around to the last node; idx may also equal len(self._nodes), so guard the successor as well
+            has_equal_predecessor = idx > 0 and self._nodes[idx - 1] == new_node
+            has_equal_successor = idx < len(self._nodes) and self._nodes[idx] == new_node
+            if has_equal_predecessor or has_equal_successor:
+                return
+        self._nodes.insert(idx, new_node)
 
     def append(self, node):
         if isinstance(node, list) or isinstance(node, tuple):
@@ -816,3 +855,33 @@ class ConditionalFieldAccess(sp.Function):
 
     def __getnewargs__(self):
         return self.access, self.outofbounds_condition, self.outofbounds_value
+
+
+class NontemporalFence(Node):
+    """Node without arguments or symbols that the C backend prints as the 'stream_fence' instruction.
+
+    All instances compare equal, which allows ``Block.insert_after(..., if_not_exists=True)`` to avoid
+    inserting a second fence right next to an existing one.
+    """
+
+    def __init__(self):
+        super(NontemporalFence, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set()
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, NontemporalFence)
+
+    def __hash__(self):
+        # defining __eq__ alone would set __hash__ to None; hash consistently with __eq__ instead
+        return hash(NontemporalFence)
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 2a15ef74f9ee5a528538d5976cd4a40d55426c82..08fcede5eff66b532768964333c021fdd6060e81 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -277,6 +277,13 @@ class CBackend:
             else:
                 return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
 
+    def _print_NontemporalFence(self, _):
+        """Print the 'stream_fence' instruction of the vector instruction set, or nothing if it has none."""
+        if 'stream_fence' in self._vector_instruction_set:
+            return self._vector_instruction_set['stream_fence'] + ';'
+        else:
+            return ''
+
     def _print_TemporaryMemoryAllocation(self, node):
         align = 64
         np_dtype = node.symbol.dtype.base_type.numpy_dtype
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 836ffc57906b503c03164715fbed6e19aabfe760..0454621eb1a20c039b327f1f9282d8f5e8542851 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -164,4 +164,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
 
     result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
 
+    result['stream_fence'] = '_mm_mfence()'
+
     return result
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index a7d2b76d8981732da019b2bc9f9acfc52a104d2f..13d705b36d5ab19224209ddc3d81dc01241d85f1 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -148,6 +148,14 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
                 if hasattr(indexed, 'field'):
                     nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
                 substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
+                if nontemporal:
+                    # insert a NontemporalFence behind the ancestor whose parent belongs directly to the
+                    # KernelFunction; start at the loop itself, since it may already be that node
+                    outermost = loop_node
+                    while outermost.parent is not None and type(outermost.parent.parent) is not ast.KernelFunction:
+                        outermost = outermost.parent
+                    if outermost.parent is not None:
+                        outermost.parent.insert_after(ast.NontemporalFence(), outermost, if_not_exists=True)
         if not successful:
             warnings.warn("Could not vectorize loop because of non-consecutive memory access")
             continue
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 0889ab468f095796833a52e6dd1536ef031b578c..782ea28df03f352aea2e8fe015114b08c64d810c 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -47,6 +47,8 @@ def test_aligned_and_nt_stores():
            'assume_inner_stride_one': True}
     update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
     ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
+    if 'stream_fence' in ast.instruction_set:
+        assert ast.instruction_set['stream_fence'] in ps.get_code_str(ast)
     kernel = ast.compile()
 
     dh.run_kernel(kernel)