From a100b5b069992b1d5496bfa22449a62da275195c Mon Sep 17 00:00:00 2001 From: Michael Kuron <m.kuron@gmx.de> Date: Wed, 31 Mar 2021 23:08:29 +0200 Subject: [PATCH] cache line zero: require alignment to cache line size when OpenMP enabled otherwise we can delete stuff from the next outer loop, which in 2D would be the one OpenMP-parallelized --- pystencils/backends/ppc_instruction_sets.py | 2 ++ pystencils/cpu/cpujit.py | 14 +++++++++++++- pystencils_tests/test_vectorization.py | 8 ++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pystencils/backends/ppc_instruction_sets.py b/pystencils/backends/ppc_instruction_sets.py index b8116fd6a..938677f6a 100644 --- a/pystencils/backends/ppc_instruction_sets.py +++ b/pystencils/backends/ppc_instruction_sets.py @@ -80,6 +80,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}') result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}') result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}') + result['streamAndFlushCacheline'] = result['streamAndFlushCacheline'].format('(float*) {0}', + '(__vector float) {1}') result['+int'] = "vec_add({0}, {1})" diff --git a/pystencils/cpu/cpujit.py b/pystencils/cpu/cpujit.py index 6f2c9a5b0..68ca25902 100644 --- a/pystencils/cpu/cpujit.py +++ b/pystencils/cpu/cpujit.py @@ -58,8 +58,9 @@ import numpy as np from appdirs import user_cache_dir, user_config_dir from pystencils import FieldType +from pystencils.astnodes import LoopOverCoordinate from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.data_types import cast_func, VectorType +from pystencils.data_types import cast_func, VectorType, vector_memory_access from pystencils.include import get_pystencils_include_path from pystencils.kernel_wrapper import KernelWrapper from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update @@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec if ast_node.instruction_set and aligned: byte_width = ast_node.instruction_set['width'] * item_size + if 'cachelineZero' in ast_node.instruction_set: + has_openmp, has_nontemporal = False, False + for loop in ast_node.atoms(LoopOverCoordinate): + has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines]) + has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in + loop.atoms(vector_memory_access)]) + if has_openmp and has_nontemporal: + byte_width = ast_node.instruction_set['cachelineSize'] offset = max(max(ast_node.ghost_layers)) * item_size offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0" @@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec "the kernel creation is not specified it will choose a suitable value " \ "automatically. This value might not " \ "be compatible with the allocated arrays." + if type(byte_width) is not int: + message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\ + "cacheline size is required." pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name, expected=message) diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 6852490bf..59c85a425 100644 --- a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -38,14 +38,14 @@ def test_aligned_and_nt_stores(openmp=False): # create a datahandling object dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu') - # fields - g = dh.add_array("g", values_per_cell=1, alignment=True) - dh.fill("g", 1.0, ghost_layers=True) if openmp: - # TODO: throw error when not cacheline-aligned alignment = 128 if instruction_set == 'vsx' else 64 if instruction_set == 'neon' else True else: alignment = True + + # fields + g = dh.add_array("g", values_per_cell=1, alignment=alignment) + dh.fill("g", 1.0, ghost_layers=True) f = dh.add_array("f", values_per_cell=1, alignment=alignment) dh.fill("f", 0.0, ghost_layers=True) opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True, -- GitLab