cache line zero: require alignment to the cache line size when OpenMP is enabled

Otherwise, zeroing a cache line can wipe out data that belongs to the next iteration of the outer loop, which in 2D is the loop that OpenMP parallelizes.

cache line zero: require alignment to the cache line size when OpenMP is enabled
Otherwise, zeroing a cache line can wipe out data that belongs to the next iteration of the outer loop, which in 2D is the loop that OpenMP parallelizes.
a100b5b0 · Michael Kuron · 31be359b · a100b5b0 · a100b5b0 · a100b5b0
Commit a100b5b0 authored 3 years ago by Michael Kuron
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -80,6 +80,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
        result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
        result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
        result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
+        result['streamAndFlushCacheline'] = result['streamAndFlushCacheline'].format('(float*) {0}',
+                                                                                     '(__vector float) {1}')

    result['+int'] = "vec_add({0}, {1})"


--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -58,8 +58,9 @@ import numpy as np
 from appdirs import user_cache_dir, user_config_dir

 from pystencils import FieldType
+from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.data_types import cast_func, VectorType
+from pystencils.data_types import cast_func, VectorType, vector_memory_access
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
 from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update
@@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec

                if ast_node.instruction_set and aligned:
                    byte_width = ast_node.instruction_set['width'] * item_size
+                    if 'cachelineZero' in ast_node.instruction_set:
+                        has_openmp, has_nontemporal = False, False
+                        for loop in ast_node.atoms(LoopOverCoordinate):
+                            has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
+                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
+                                                                      loop.atoms(vector_memory_access)])
+                        if has_openmp and has_nontemporal:
+                            byte_width = ast_node.instruction_set['cachelineSize']
                    offset = max(max(ast_node.ghost_layers)) * item_size
                    offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0"

@@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                                            "the kernel creation is not specified it will choose a suitable value " \
                                            "automatically. This value might not " \
                                            "be compatible with the allocated arrays."
+                    if type(byte_width) is not int:
+                        message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\
+                                   "cacheline size is required."
                    pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
                                                                 expected=message)


--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -38,14 +38,14 @@ def test_aligned_and_nt_stores(openmp=False):
    # create a datahandling object
    dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')

-    # fields
-    g = dh.add_array("g", values_per_cell=1, alignment=True)
-    dh.fill("g", 1.0, ghost_layers=True)
    if openmp:
-        # TODO: throw error when not cacheline-aligned
        alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True
    else:
        alignment = True
+
+    # fields
+    g = dh.add_array("g", values_per_cell=1, alignment=alignment)
+    dh.fill("g", 1.0, ghost_layers=True)
    f = dh.add_array("f", values_per_cell=1, alignment=alignment)
    dh.fill("f", 0.0, ghost_layers=True)
    opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,