diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index b874db9b0c2f869131fa4986714aa172b8bfafdc..74ca259da727d24c02946716d7015cbf42cd505d 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -324,7 +324,7 @@ class Block(Node):
             node.parent = self
             self._nodes.insert(0, node)
 
-    def insert_before(self, new_node, insert_before):
+    def insert_before(self, new_node, insert_before, if_not_exists=False):
         new_node.parent = self
         assert self._nodes.count(insert_before) == 1
         idx = self._nodes.index(insert_before)
@@ -337,7 +337,25 @@ class Block(Node):
                     idx -= 1
                 else:
                     break
-        self._nodes.insert(idx, new_node)
+        if not if_not_exists or self._nodes[idx] != new_node:
+            self._nodes.insert(idx, new_node)
+
+    def insert_after(self, new_node, insert_after, if_not_exists=False):
+        """Insert new_node after insert_after; with if_not_exists, skip if an equal node is already adjacent."""
+        new_node.parent = self
+        assert self._nodes.count(insert_after) == 1
+        idx = self._nodes.index(insert_after) + 1
+        # a declaration is moved in front of any loops/conditionals directly preceding the insertion point
+        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
+            while idx > 0:
+                pn = self._nodes[idx - 1]
+                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
+                    idx -= 1
+                else:
+                    break
+        # idx can reach 0 above; clamp so idx - 1 never wraps around to the last node
+        if not (if_not_exists and any(n == new_node for n in self._nodes[max(idx - 1, 0):idx + 1])):
+            self._nodes.insert(idx, new_node)
 
     def append(self, node):
         if isinstance(node, list) or isinstance(node, tuple):
@@ -816,3 +834,32 @@ class ConditionalFieldAccess(sp.Function):
 
     def __getnewargs__(self):
         return self.access, self.outofbounds_condition, self.outofbounds_value
+
+
+class NontemporalFence(Node):
+    """AST node marking where a fence for non-temporal (streaming) stores is emitted.
+
+    The node defines and reads no symbols and has no children; all instances compare equal.
+    """
+
+    def __init__(self):
+        super(NontemporalFence, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set()
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, NontemporalFence)
+
+    def __hash__(self):
+        # Defining __eq__ alone sets __hash__ to None; all fences are equal, so they share one hash.
+        return hash(NontemporalFence)
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 2a15ef74f9ee5a528538d5976cd4a40d55426c82..08fcede5eff66b532768964333c021fdd6060e81 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -277,6 +277,12 @@ class CBackend:
             else:
                 return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
 
+    def _print_NontemporalFence(self, _):
+        """Print the 'stream_fence' instruction as a statement, or nothing if undefined."""
+        if 'stream_fence' not in self._vector_instruction_set:
+            return ''
+        return self._vector_instruction_set['stream_fence'] + ';'
+
     def _print_TemporaryMemoryAllocation(self, node):
         align = 64
         np_dtype = node.symbol.dtype.base_type.numpy_dtype
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 836ffc57906b503c03164715fbed6e19aabfe760..0454621eb1a20c039b327f1f9282d8f5e8542851 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -164,4 +164,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
 
     result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
 
+    result['stream_fence'] = '_mm_mfence()'
+
     return result
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index a7d2b76d8981732da019b2bc9f9acfc52a104d2f..13d705b36d5ab19224209ddc3d81dc01241d85f1 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -148,6 +148,11 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
                 if hasattr(indexed, 'field'):
                     nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
                 substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
+                if nontemporal:
+                    parent = loop_node.parent
+                    while type(parent.parent.parent) is not ast.KernelFunction:
+                        parent = parent.parent
+                    parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
         if not successful:
             warnings.warn("Could not vectorize loop because of non-consecutive memory access")
             continue
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 0889ab468f095796833a52e6dd1536ef031b578c..782ea28df03f352aea2e8fe015114b08c64d810c 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -47,6 +47,8 @@ def test_aligned_and_nt_stores():
            'assume_inner_stride_one': True}
     update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
     ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
+    if 'stream_fence' in ast.instruction_set:
+        assert ast.instruction_set['stream_fence'] in ps.get_code_str(ast)
     kernel = ast.compile()
 
     dh.run_kernel(kernel)