diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py
index 70271b0c0f3b50f04ceed671cabb16411554cdac..eda9fcaeba9915edeada8a1f72559821c2a0062b 100644
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
@@ -10,20 +10,28 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
         shape: size of the array
         byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
                         By default, use the maximum required by the CPU (or 512 bits if this cannot be detected).
+                        When 'cacheline' is specified, the size of a cache line is used.
         dtype: numpy data type
         byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
                     typically used to align first inner cell instead of ghost layer
         order: storage linearization order
         align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
     """
-    if byte_alignment is True:
-        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets,
+    if byte_alignment is True or byte_alignment == 'cacheline':
+        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
                                                                get_vector_instruction_set)
 
         type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
         instruction_sets = get_supported_instruction_sets()
         if instruction_sets is None:
             byte_alignment = 64
+        elif byte_alignment == 'cacheline':
+            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
+            if all([s is None for s in cacheline_sizes]):
+                byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+                                      for is_name in instruction_sets])
+            else:
+                byte_alignment = max([s for s in cacheline_sizes if s is not None])
         else:
             byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
                                   for is_name in instruction_sets])
diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index b874db9b0c2f869131fa4986714aa172b8bfafdc..53731645797afeb62a4a5ff803c2ee7b4e5d8e16 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -313,7 +313,9 @@ class Block(Node):
         self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes]
         return self
 
-    def insert_front(self, node):
+    def insert_front(self, node, if_not_exists=False):
+        if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node:
+            return
         if isinstance(node, collections.abc.Iterable):
             node = list(node)
             for n in node:
@@ -324,7 +326,7 @@ class Block(Node):
             node.parent = self
             self._nodes.insert(0, node)
 
-    def insert_before(self, new_node, insert_before):
+    def insert_before(self, new_node, insert_before, if_not_exists=False):
         new_node.parent = self
         assert self._nodes.count(insert_before) == 1
         idx = self._nodes.index(insert_before)
@@ -337,7 +339,25 @@ class Block(Node):
                     idx -= 1
                 else:
                     break
-        self._nodes.insert(idx, new_node)
+        if not if_not_exists or self._nodes[idx] != new_node:
+            self._nodes.insert(idx, new_node)
+
+    def insert_after(self, new_node, insert_after, if_not_exists=False):
+        new_node.parent = self
+        assert self._nodes.count(insert_after) == 1
+        idx = self._nodes.index(insert_after) + 1
+
+        # move all assignment (definitions to the top)
+        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
+            while idx > 0:
+                pn = self._nodes[idx - 1]
+                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
+                    idx -= 1
+                else:
+                    break
+        if not if_not_exists or not (self._nodes[idx - 1] == new_node
+                                     or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
+            self._nodes.insert(idx, new_node)
 
     def append(self, node):
         if isinstance(node, list) or isinstance(node, tuple):
@@ -816,3 +836,47 @@ class ConditionalFieldAccess(sp.Function):
 
     def __getnewargs__(self):
         return self.access, self.outofbounds_condition, self.outofbounds_value
+
+
+class NontemporalFence(Node):
+    def __init__(self):
+        super(NontemporalFence, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set()
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, NontemporalFence)
+
+
+class CachelineSize(Node):
+    symbol = sp.Symbol("_clsize")
+    mask_symbol = sp.Symbol("_clsize_mask")
+    last_symbol = sp.Symbol("_cl_lastvec")
+    
+    def __init__(self):
+        super(CachelineSize, self).__init__(parent=None)
+
+    @property
+    def symbols_defined(self):
+        return set([self.symbol, self.mask_symbol, self.last_symbol])
+
+    @property
+    def undefined_symbols(self):
+        return set()
+
+    @property
+    def args(self):
+        return []
+
+    def __eq__(self, other):
+        return isinstance(other, CachelineSize)
diff --git a/pystencils/backends/arm_instruction_sets.py b/pystencils/backends/arm_instruction_sets.py
index 201451fb8c7d39f1396fa6c7c799051c4d2d8102..a386253a098be738f9a6bc932144edebeaa4a1ea 100644
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
@@ -28,7 +28,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         'loadA': 'ld1[0]',
         'storeU': 'st1[0, 1]',
         'storeA': 'st1[0, 1]',
-        'stream': 'st1[0, 1]',
 
         'abs': 'abs[0]',
         '==': 'ceq[0, 1]',
@@ -47,6 +46,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
     suffix = f'q_f{bits[data_type]}'
 
     result = dict()
+    result['bytes'] = 16
 
     for intrinsic_id, function_shortcut in base_names.items():
         function_shortcut = function_shortcut.strip()
@@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
     result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
     result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
 
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+
     return result
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 2a15ef74f9ee5a528538d5976cd4a40d55426c82..9d7aa3cc30ba465fc97b97b75d7227776366ae51 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -1,5 +1,6 @@
 import re
 from collections import namedtuple
+import hashlib
 from typing import Set
 
 import numpy as np
@@ -7,7 +8,7 @@ import sympy as sp
 from sympy.core import S
 from sympy.logic.boolalg import BooleanFalse, BooleanTrue
 
-from pystencils.astnodes import KernelFunction, Node
+from pystencils.astnodes import KernelFunction, Node, CachelineSize
 from pystencils.cpu.vectorization import vec_all, vec_any
 from pystencils.data_types import (
     PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
@@ -258,7 +259,7 @@ class CBackend:
                 arg, data_type, aligned, nontemporal, mask = node.lhs.args
                 instr = 'storeU'
                 if aligned:
-                    instr = 'stream' if nontemporal else 'storeA'
+                    instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                 if mask != True:  # NOQA
                     instr = 'maskStore' if aligned else 'maskStoreU'
                     printed_mask = self.sympy_printer.doprint(mask)
@@ -271,21 +272,60 @@ class CBackend:
                 else:
                     rhs = node.rhs
 
-                return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
-                                                                  self.sympy_printer.doprint(rhs),
+                ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
+                pre_code = ''
+                if nontemporal and 'cachelineZero' in self._vector_instruction_set:
+                    pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "{\n\t" + \
+                        self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n}\n'
+
+                code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                   printed_mask) + ';'
+                flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}"
+                if nontemporal and 'flushCacheline' in self._vector_instruction_set:
+                    code2 = self._vector_instruction_set['flushCacheline'].format(
+                        ptr, self.sympy_printer.doprint(rhs)) + ';'
+                    code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
+                elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
+                    tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
+                    code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
+                        + self.sympy_printer.doprint(rhs) + ';'
+                    code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask) + ';'
+                    code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask) \
+                        + ';'
+                    code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
+                return pre_code + code
             else:
                 return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
 
+    def _print_NontemporalFence(self, _):
+        if 'streamFence' in self._vector_instruction_set:
+            return self._vector_instruction_set['streamFence'] + ';'
+        else:
+            return ''
+
+    def _print_CachelineSize(self, node):
+        if 'cachelineSize' in self._vector_instruction_set:
+            code = f'const size_t {node.symbol} = {self._vector_instruction_set["cachelineSize"]};\n'
+            code += f'const size_t {node.mask_symbol} = {node.symbol} - 1;\n'
+            vectorsize = self._vector_instruction_set['bytes']
+            code += f'const size_t {node.last_symbol} = {node.symbol} - {vectorsize};\n'
+            return code
+        else:
+            return ''
+
     def _print_TemporaryMemoryAllocation(self, node):
-        align = 64
+        if self._vector_instruction_set:
+            align = self._vector_instruction_set['bytes']
+        else:
+            align = node.symbol.dtype.base_type.numpy_dtype.itemsize
+
         np_dtype = node.symbol.dtype.base_type.numpy_dtype
         required_size = np_dtype.itemsize * node.size + align
         size = modulo_ceil(required_size, align)
-        code = "#if __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n"
-        code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n"
-        code += "#elif defined(_MSC_VER)\n"
+        code = "#if defined(_MSC_VER)\n"
         code += "{dtype} {name}=({dtype})_aligned_malloc({size}, {align}) + {offset};\n"
+        code += "#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n"
+        code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n"
         code += "#else\n"
         code += "{dtype} {name};\n"
         code += "posix_memalign((void**) &{name}, {align}, {size});\n"
@@ -298,7 +338,11 @@ class CBackend:
                            align=align)
 
     def _print_TemporaryMemoryFree(self, node):
-        align = 64
+        if self._vector_instruction_set:
+            align = self._vector_instruction_set['bytes']
+        else:
+            align = node.symbol.dtype.base_type.numpy_dtype.itemsize
+
         code = "#if defined(_MSC_VER)\n"
         code += "_aligned_free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
         code += "#else\n"
diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py
index bb9e3e85113023c0f2c82ddce432a464891ca756..f792127618b025b1bc9cf272e9c2c59d93422318 100644
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
         'loadA': 'ld[0x0, 0]',
         'storeU': 'xst[1, 0x0, 0]',
         'storeA': 'st[1, 0x0, 0]',
-        'stream': 'stl[1, 0x0, 0]',
+        'storeAAndFlushCacheline': 'stl[1, 0x0, 0]',
 
         'abs': 'abs[0]',
         '==': 'cmpeq[0, 1]',
@@ -64,6 +64,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
     intwidth = 128 // bits['int']
 
     result = dict()
+    result['bytes'] = 16
 
     for intrinsic_id, function_shortcut in base_names.items():
         function_shortcut = function_shortcut.strip()
@@ -78,6 +79,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
         result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
         result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
         result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
+        result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
+                                                                                     '(__vector float) {1}')
 
     result['+int'] = "vec_add({0}, {1})"
 
@@ -98,4 +101,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
     result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
     result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
 
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+
     return result
diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py
index 850f8ff6d4a9ae168a78c6588a69fd87f5e5f03e..9469dc59eb1b4d5d2189c138d42f4c1f233e5259 100644
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
@@ -15,6 +15,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
 
 
 _cache = None
+_cachelinesize = None
 
 
 def get_supported_instruction_sets():
@@ -56,3 +57,27 @@ def get_supported_instruction_sets():
     if flags.issuperset(required_neon_flags):
         result.append("neon")
     return result
+
+
+def get_cacheline_size(instruction_set):
+    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
+       Usually, this is identical to the cache line size."""
+    global _cachelinesize
+    
+    instruction_sets = get_vector_instruction_set('double', instruction_set)
+    if 'cachelineSize' not in instruction_sets:
+        return None
+    if _cachelinesize is not None:
+        return _cachelinesize
+    
+    import pystencils as ps
+    import numpy as np
+    
+    arr = np.zeros((1, 1), dtype=np.float32)
+    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
+    ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)]
+    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
+    kernel = ast.compile()
+    kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0})
+    _cachelinesize = int(arr[0, 0])
+    return _cachelinesize
diff --git a/pystencils/backends/x86_instruction_sets.py b/pystencils/backends/x86_instruction_sets.py
index 836ffc57906b503c03164715fbed6e19aabfe760..86196aad7047c8f2bcc16a94b487537d3147d36d 100644
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -110,7 +110,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
 
     result = {
         'width': width[(data_type, instruction_set)],
-        'intwidth': width[('int', instruction_set)]
+        'intwidth': width[('int', instruction_set)],
+        'bytes': 4 * width[("float", instruction_set)]
     }
     pre = prefix[instruction_set]
     for intrinsic_id, function_shortcut in base_names.items():
@@ -164,4 +165,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
 
     result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
 
+    result['streamFence'] = '_mm_mfence()'
+
     return result
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 6f2c9a5b0ea502e16393565b82638524caaa6dcd..68ca259024e26cd2d97aa16bf5698e163c5c8373 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -58,8 +58,9 @@ import numpy as np
 from appdirs import user_cache_dir, user_config_dir
 
 from pystencils import FieldType
+from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.data_types import cast_func, VectorType
+from pystencils.data_types import cast_func, VectorType, vector_memory_access
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
 from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update
@@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
 
                 if ast_node.instruction_set and aligned:
                     byte_width = ast_node.instruction_set['width'] * item_size
+                    if 'cachelineZero' in ast_node.instruction_set:
+                        has_openmp, has_nontemporal = False, False
+                        for loop in ast_node.atoms(LoopOverCoordinate):
+                            has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
+                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
+                                                                      loop.atoms(vector_memory_access)])
+                        if has_openmp and has_nontemporal:
+                            byte_width = ast_node.instruction_set['cachelineSize']
                     offset = max(max(ast_node.ghost_layers)) * item_size
                     offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0"
 
@@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                                             "the kernel creation is not specified it will choose a suitable value " \
                                             "automatically. This value might not " \
                                             "be compatible with the allocated arrays."
+                    if type(byte_width) is not int:
+                        message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\
+                                   "cacheline size is required."
                     pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
                                                                  expected=message)
 
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index a7d2b76d8981732da019b2bc9f9acfc52a104d2f..0de34b40bf2c68a2b12f29c6ad1016618b5c34ac 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -148,6 +148,14 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
                 if hasattr(indexed, 'field'):
                     nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
                 substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
+                if nontemporal:
+                    # insert NontemporalFence after the outermost loop
+                    parent = loop_node.parent
+                    while type(parent.parent.parent) is not ast.KernelFunction:
+                        parent = parent.parent
+                    parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
+                    # insert CachelineSize at the beginning of the kernel
+                    parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True)
         if not successful:
             warnings.warn("Could not vectorize loop because of non-consecutive memory access")
             continue
diff --git a/pystencils/include/arm_neon_helpers.h b/pystencils/include/arm_neon_helpers.h
index ba6cbc2d7bae45591bcec580b98394c4f6830339..3d06d69bfc2dd866370e98968dacce3aaef3975c 100644
--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
@@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
     alignas(16) int data[4] = {a, b, c, d};
     return vld1q_s32(data);
 }
+
+inline void cachelineZero(void * p) {
+	__asm__ volatile("dc zva, %0"::"r"(p));
+}
+
+inline size_t _cachelineSize() {
+	// check that dc zva is permitted
+	uint64_t dczid;
+	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+	if ((dczid & (1 << 4)) != 0) {
+		return SIZE_MAX;
+	}
+
+	// allocate and fill with ones
+	const size_t max_size = 0x100000;
+	uint8_t data[2*max_size];
+	for (size_t i = 0; i < 2*max_size; ++i) {
+		data[i] = 0xff;
+	}
+	
+	// find alignment offset
+	size_t offset = max_size - ((uintptr_t) data) % max_size;
+
+	// zero a cacheline
+	cachelineZero((void*) (data + offset));
+
+	// make sure that at least one byte was zeroed
+	if (data[offset] != 0) {
+		return SIZE_MAX;
+	}
+
+	// make sure that nothing was zeroed before the pointer
+	if (data[offset-1] == 0) {
+		return SIZE_MAX;
+	}
+
+	// find the last byte that was zeroed
+	for (size_t size = 1; size < max_size; ++size) {
+		if (data[offset + size] != 0) {
+			return size;
+		}
+	}
+	
+	// too much was zeroed
+	return SIZE_MAX;
+}
+
+inline size_t cachelineSize() {
+	static size_t size = _cachelineSize();
+	return size;
+}
diff --git a/pystencils/include/ppc_altivec_helpers.h b/pystencils/include/ppc_altivec_helpers.h
index d2edb437a4bb945bf1abec95df34e6d518594fd7..ac4ebd2e1f1b3a65421c228516b7768e794a03a8 100644
--- a/pystencils/include/ppc_altivec_helpers.h
+++ b/pystencils/include/ppc_altivec_helpers.h
@@ -1,3 +1,51 @@
 #include <altivec.h>
 #undef vector
 #undef bool
+
+inline void cachelineZero(void * p) {
+#ifdef __xlC__
+	__dcbz(p);
+#else
+	__asm__ volatile("dcbz 0, %0"::"r"(p):"memory");
+#endif
+}
+
+inline size_t _cachelineSize() {
+	// allocate and fill with ones
+	const size_t max_size = 0x100000;
+	uint8_t data[2*max_size];
+	for (size_t i = 0; i < 2*max_size; ++i) {
+		data[i] = 0xff;
+	}
+	
+	// find alignment offset
+	size_t offset = max_size - ((uintptr_t) data) % max_size;
+
+	// zero a cacheline
+	cachelineZero((void*) (data + offset));
+
+	// make sure that at least one byte was zeroed
+	if (data[offset] != 0) {
+		return SIZE_MAX;
+	}
+
+	// make sure that nothing was zeroed before the pointer
+	if (data[offset-1] == 0) {
+		return SIZE_MAX;
+	}
+
+	// find the last byte that was zeroed
+	for (size_t size = 1; size < max_size; ++size) {
+		if (data[offset + size] != 0) {
+			return size;
+		}
+	}
+	
+	// too much was zeroed
+	return SIZE_MAX;
+}
+
+inline size_t cachelineSize() {
+	static size_t size = _cachelineSize();
+	return size;
+}
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 0889ab468f095796833a52e6dd1536ef031b578c..880a009a294569f004f7f8a0cba3a02b98ec5bd2 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -33,25 +33,39 @@ def test_vector_type_propagation():
     np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
 
 
-def test_aligned_and_nt_stores():
+def test_aligned_and_nt_stores(openmp=False):
     domain_size = (24, 24)
     # create a datahandling object
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
 
     # fields
-    g = dh.add_array("g", values_per_cell=1, alignment=True)
+    alignment = 'cacheline' if openmp else True
+    g = dh.add_array("g", values_per_cell=1, alignment=alignment)
     dh.fill("g", 1.0, ghost_layers=True)
-    f = dh.add_array("f", values_per_cell=1, alignment=True)
+    f = dh.add_array("f", values_per_cell=1, alignment=alignment)
     dh.fill("f", 0.0, ghost_layers=True)
     opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,
            'assume_inner_stride_one': True}
     update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
-    ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
+    ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
+    if instruction_set in ['sse'] or instruction_set.startswith('avx'):
+        assert 'stream' in ast.instruction_set
+        assert 'streamFence' in ast.instruction_set
+    if instruction_set in ['neon', 'vsx'] or instruction_set.startswith('sve'):
+        assert 'cachelineZero' in ast.instruction_set
+    if instruction_set in ['vsx']:
+        assert 'storeAAndFlushCacheline' in ast.instruction_set
+    for instruction in ['stream', 'streamFence', 'cachelineZero', 'storeAAndFlushCacheline', 'flushCacheline']:
+        if instruction in ast.instruction_set:
+            assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
     kernel = ast.compile()
 
     dh.run_kernel(kernel)
     np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size))
 
+def test_aligned_and_nt_stores_openmp():
+    test_aligned_and_nt_stores(True)
+
 
 def test_inplace_update():
     shape = (9, 9, 3)
diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py
index 4476e5bf4f68b82056ecb3242237b1c3ce879191..fca50949e6dbed63cc97c6a2bff33aad7dbe2968 100644
--- a/pystencils_tests/test_vectorization_specific.py
+++ b/pystencils_tests/test_vectorization_specific.py
@@ -4,7 +4,8 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
+from pystencils.backends.simd_instruction_sets import (get_cacheline_size, get_supported_instruction_sets,
+                                                       get_vector_instruction_set)
 from pystencils.data_types import cast_func, VectorType
 
 supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else []
@@ -76,4 +77,16 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
         with pytest.raises(ValueError):
             dh.run_kernel(kernel)
     else:
-        dh.run_kernel(kernel)
\ No newline at end of file
+        dh.run_kernel(kernel)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+def test_cacheline_size(instruction_set):
+    cacheline_size = get_cacheline_size(instruction_set)
+    if cacheline_size is None:
+        pytest.skip()
+    instruction_set = get_vector_instruction_set('double', instruction_set)
+    vector_size = instruction_set['bytes']
+    assert cacheline_size > 8 and cacheline_size < 0x100000, "Cache line size is implausible"
+    assert cacheline_size % vector_size == 0, "Cache line size should be multiple of vector size"
+    assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2"