From a100b5b069992b1d5496bfa22449a62da275195c Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Wed, 31 Mar 2021 23:08:29 +0200
Subject: [PATCH] cache line zero: require alignment to cache line size when
 OpenMP enabled

Otherwise, zeroing a cache line can delete data belonging to the next outer
loop, which in 2D would be the one that is OpenMP-parallelized.
---
 pystencils/backends/ppc_instruction_sets.py |  2 ++
 pystencils/cpu/cpujit.py                    | 14 +++++++++++++-
 pystencils_tests/test_vectorization.py      |  8 ++++----
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py
index b8116fd6a..938677f6a 100644
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -80,6 +80,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
         result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
         result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
         result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
+        result['streamAndFlushCacheline'] = result['streamAndFlushCacheline'].format('(float*) {0}',
+                                                                                     '(__vector float) {1}')
 
     result['+int'] = "vec_add({0}, {1})"
 
diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py
index 6f2c9a5b0..68ca25902 100644
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -58,8 +58,9 @@ import numpy as np
 from appdirs import user_cache_dir, user_config_dir
 
 from pystencils import FieldType
+from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.data_types import cast_func, VectorType
+from pystencils.data_types import cast_func, VectorType, vector_memory_access
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
 from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update
@@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
 
                 if ast_node.instruction_set and aligned:
                     byte_width = ast_node.instruction_set['width'] * item_size
+                    if 'cachelineZero' in ast_node.instruction_set:
+                        has_openmp, has_nontemporal = False, False
+                        for loop in ast_node.atoms(LoopOverCoordinate):
+                            has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
+                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
+                                                                      loop.atoms(vector_memory_access)])
+                        if has_openmp and has_nontemporal:
+                            byte_width = ast_node.instruction_set['cachelineSize']
                     offset = max(max(ast_node.ghost_layers)) * item_size
                     offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0"
 
@@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                                             "the kernel creation is not specified it will choose a suitable value " \
                                             "automatically. This value might not " \
                                             "be compatible with the allocated arrays."
+                    if type(byte_width) is not int:
+                        message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\
+                                   "cacheline size is required."
                     pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
                                                                  expected=message)
 
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 6852490bf..59c85a425 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -38,14 +38,14 @@ def test_aligned_and_nt_stores(openmp=False):
     # create a datahandling object
     dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
 
-    # fields
-    g = dh.add_array("g", values_per_cell=1, alignment=True)
-    dh.fill("g", 1.0, ghost_layers=True)
     if openmp:
-        # TODO: throw error when not cacheline-aligned
         alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True
     else:
         alignment = True
+
+    # fields
+    g = dh.add_array("g", values_per_cell=1, alignment=alignment)
+    dh.fill("g", 1.0, ghost_layers=True)
     f = dh.add_array("f", values_per_cell=1, alignment=alignment)
     dh.fill("f", 0.0, ghost_layers=True)
     opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,
-- 
GitLab