diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index 74ca259da727d24c02946716d7015cbf42cd505d..a1f282b9de584ae0f1b452379f5acf99c9373ded 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -313,7 +313,9 @@ class Block(Node):
         self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes]
         return self
 
-    def insert_front(self, node):
+    def insert_front(self, node, if_not_exists=False):
+        if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node:
+            return
         if isinstance(node, collections.abc.Iterable):
             node = list(node)
             for n in node:
@@ -854,3 +856,25 @@ class NontemporalFence(Node):
 
     def __eq__(self, other):
         return isinstance(other, NontemporalFence)
+
+
+class CachelineSize(Node):
+    mask_symbol = sp.Symbol("_clsize_mask")
+    
+    def __init__(self):
+        super(CachelineSize, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set([self.mask_symbol])
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, CachelineSize)
diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py
index 201451fb8c7d39f1396fa6c7c799051c4d2d8102..c6da59dfda5621154f292c4c50b75a2d3dad7a5b 100644
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
@@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
     result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
     result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
 
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+
     return result
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 08fcede5eff66b532768964333c021fdd6060e81..b469b13f15a9e14032514d473b3ce2e07e037f5d 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -7,7 +7,7 @@ import sympy as sp
 from sympy.core import S
 from sympy.logic.boolalg import BooleanFalse, BooleanTrue
 
-from pystencils.astnodes import KernelFunction, Node
+from pystencils.astnodes import KernelFunction, Node, CachelineSize
 from pystencils.cpu.vectorization import vec_all, vec_any
 from pystencils.data_types import (
     PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
@@ -271,15 +271,27 @@ class CBackend:
                 else:
                     rhs = node.rhs
 
-                return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
-                                                                  self.sympy_printer.doprint(rhs),
+                ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
+                pre_code = ''
+                if instr == 'stream' and 'cachelineZero' in self._vector_instruction_set:
+                    pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "\n\t" + \
+                        self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n'
+
+                code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                   printed_mask) + ';'
+                return pre_code + code
             else:
                 return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
 
     def _print_NontemporalFence(self, _):
-        if 'stream_fence' in self._vector_instruction_set:
-            return self._vector_instruction_set['stream_fence'] + ';'
+        if 'streamFence' in self._vector_instruction_set:
+            return self._vector_instruction_set['streamFence'] + ';'
+        else:
+            return ''
+
+    def _print_CachelineSize(self, node):
+        if 'cachelineSize' in self._vector_instruction_set:
+            return f'const size_t {node.mask_symbol} = {self._vector_instruction_set["cachelineSize"]} - 1;'
         else:
             return ''
 
diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py
index bb9e3e85113023c0f2c82ddce432a464891ca756..a1c481ae41ca2036b90dc5c338cefd7c71dc9fc4 100644
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
         'loadA': 'ld[0x0, 0]',
         'storeU': 'xst[1, 0x0, 0]',
         'storeA': 'st[1, 0x0, 0]',
-        'stream': 'stl[1, 0x0, 0]',
+        'stream': 'st[1, 0x0, 0]',  # stl would flush the cacheline, which only makes sense for the last item
 
         'abs': 'abs[0]',
         '==': 'cmpeq[0, 1]',
@@ -98,4 +98,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
     result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
     result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
 
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+
     return result
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 0454621eb1a20c039b327f1f9282d8f5e8542851..7809a89790e9f6b98cf70765a4124097adf40562 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -164,6 +164,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
 
     result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
 
-    result['stream_fence'] = '_mm_mfence()'
+    result['streamFence'] = '_mm_mfence()'
 
     return result
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index 13d705b36d5ab19224209ddc3d81dc01241d85f1..0de34b40bf2c68a2b12f29c6ad1016618b5c34ac 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -149,10 +149,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
                     nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
                 substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
                 if nontemporal:
+                    # insert NontemporalFence after the outermost loop
                     parent = loop_node.parent
                     while type(parent.parent.parent) is not ast.KernelFunction:
                         parent = parent.parent
                     parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
+                    # insert CachelineSize at the beginning of the kernel
+                    parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True)
         if not successful:
             warnings.warn("Could not vectorize loop because of non-consecutive memory access")
             continue
diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h
index ba6cbc2d7bae45591bcec580b98394c4f6830339..3d06d69bfc2dd866370e98968dacce3aaef3975c 100644
--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
@@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
     alignas(16) int data[4] = {a, b, c, d};
     return vld1q_s32(data);
 }
+
+inline void cachelineZero(void * p) {
+	__asm__ volatile("dc zva, %0"::"r"(p));
+}
+
+inline size_t _cachelineSize() {
+	// check that dc zva is permitted
+	uint64_t dczid;
+	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+	if ((dczid & (1 << 4)) != 0) {
+		return SIZE_MAX;
+	}
+
+	// allocate and fill with ones
+	const size_t max_size = 0x100000;
+	uint8_t data[2*max_size];
+	for (size_t i = 0; i < 2*max_size; ++i) {
+		data[i] = 0xff;
+	}
+	
+	// find alignment offset
+	size_t offset = max_size - ((uintptr_t) data) % max_size;
+
+	// zero a cacheline
+	cachelineZero((void*) (data + offset));
+
+	// make sure that at least one byte was zeroed
+	if (data[offset] != 0) {
+		return SIZE_MAX;
+	}
+
+	// make sure that nothing was zeroed before the pointer
+	if (data[offset-1] == 0) {
+		return SIZE_MAX;
+	}
+
+	// find the last byte that was zeroed
+	for (size_t size = 1; size < max_size; ++size) {
+		if (data[offset + size] != 0) {
+			return size;
+		}
+	}
+	
+	// too much was zeroed
+	return SIZE_MAX;
+}
+
+inline size_t cachelineSize() {
+	static size_t size = _cachelineSize();
+	return size;
+}
diff --git a/pystencils/include/ppc_altivec_helpers.h b/pystencils/include/ppc_altivec_helpers.h
index d2edb437a4bb945bf1abec95df34e6d518594fd7..ac4ebd2e1f1b3a65421c228516b7768e794a03a8 100644
--- a/pystencils/include/ppc_altivec_helpers.h
+++ b/pystencils/include/ppc_altivec_helpers.h
@@ -1,3 +1,51 @@
 #include <altivec.h>
 #undef vector
 #undef bool
+
+inline void cachelineZero(void * p) {
+#ifdef __xlC__
+	__dcbz(p);
+#else
+	__asm__ volatile("dcbz 0, %0"::"r"(p):"memory");
+#endif
+}
+
+inline size_t _cachelineSize() {
+	// allocate and fill with ones
+	const size_t max_size = 0x100000;
+	uint8_t data[2*max_size];
+	for (size_t i = 0; i < 2*max_size; ++i) {
+		data[i] = 0xff;
+	}
+	
+	// find alignment offset
+	size_t offset = max_size - ((uintptr_t) data) % max_size;
+
+	// zero a cacheline
+	cachelineZero((void*) (data + offset));
+
+	// make sure that at least one byte was zeroed
+	if (data[offset] != 0) {
+		return SIZE_MAX;
+	}
+
+	// make sure that nothing was zeroed before the pointer
+	if (data[offset-1] == 0) {
+		return SIZE_MAX;
+	}
+
+	// find the last byte that was zeroed
+	for (size_t size = 1; size < max_size; ++size) {
+		if (data[offset + size] != 0) {
+			return size;
+		}
+	}
+	
+	// too much was zeroed
+	return SIZE_MAX;
+}
+
+inline size_t cachelineSize() {
+	static size_t size = _cachelineSize();
+	return size;
+}
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 782ea28df03f352aea2e8fe015114b08c64d810c..d05c37c6bfd15f6935ec29523321fcd45dff86f1 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -33,7 +33,7 @@ def test_vector_type_propagation():
     np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
 
 
-def test_aligned_and_nt_stores():
+def test_aligned_and_nt_stores(openmp=False):
     domain_size = (24, 24)
     # create a datahandling object
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
@@ -41,19 +41,29 @@ def test_aligned_and_nt_stores():
     # fields
     g = dh.add_array("g", values_per_cell=1, alignment=True)
     dh.fill("g", 1.0, ghost_layers=True)
-    f = dh.add_array("f", values_per_cell=1, alignment=True)
+    if openmp:
+        # TODO: throw error when not cacheline-aligned
+        alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True
+    else:
+        alignment = True
+    f = dh.add_array("f", values_per_cell=1, alignment=alignment)
     dh.fill("f", 0.0, ghost_layers=True)
     opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,
            'assume_inner_stride_one': True}
     update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
-    ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
-    if 'stream_fence' in ast.instruction_set:
-        assert ast.instruction_set['stream_fence'] in ps.get_code_str(ast)
+    ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
+    if 'streamFence' in ast.instruction_set:
+        assert ast.instruction_set['streamFence'] in ps.get_code_str(ast)
+    if 'cachelineZero' in ast.instruction_set:
+        assert ast.instruction_set['cachelineZero'].split('{0}')[0] in ps.get_code_str(ast)
     kernel = ast.compile()
 
     dh.run_kernel(kernel)
     np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size))
 
+def test_aligned_and_nt_stores_openmp():
+    test_aligned_and_nt_stores(True)
+
 
 def test_inplace_update():
     shape = (9, 9, 3)