From 35e7428fad7701a943e10d11e94caa8f4ffe8c97 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Wed, 10 Oct 2018 09:15:51 +0200 Subject: [PATCH] pystencils: vectorization - bugfixes and small performance improvements - bugfix in vector-typing of piecewise functions - cast_function is now a sympy atom - fixes problems with sympy > 1.1 - replace_inner_stride_with_one is a bit faster now --- cpu/vectorization.py | 4 +++- data_types.py | 1 + kernelcreation.py | 10 +++++++++- transformations.py | 16 +++++++++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/cpu/vectorization.py b/cpu/vectorization.py index 8f8829061..32b47f262 100644 --- a/cpu/vectorization.py +++ b/cpu/vectorization.py @@ -28,7 +28,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx', If true, nontemporal access instructions are used for all fields. assume_inner_stride_one: kernels with non-constant inner loop bound and strides can not be vectorized since the inner loop stride is a runtime variable and thus might not be always 1. - If this parameter is set to true, the the inner stride is assumed to be always one. + If this parameter is set to true, the inner stride is assumed to be always one. This has to be ensured at runtime! assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created but loop is extended by at most (vector_width-1) elements @@ -141,6 +141,8 @@ def insert_vector_casts(ast_node): condition_target_type = collate_types(types_of_conditions) if type(condition_target_type) is VectorType and type(result_target_type) is not VectorType: result_target_type = VectorType(result_target_type, width=condition_target_type.width) + if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType: + condition_target_type = VectorType(condition_target_type, width=result_target_type.width) casted_results = [cast_func(a, result_target_type) if t != result_target_type else a for a, t in zip(new_results, types_of_results)] diff --git a/data_types.py b/data_types.py index 320103bd7..63d2ae560 100644 --- a/data_types.py +++ b/data_types.py @@ -16,6 +16,7 @@ from sympy.logic.boolalg import Boolean # noinspection PyPep8Naming class cast_func(sp.Function, Boolean): # to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well + is_Atom = True @property def canonical(self): diff --git a/kernelcreation.py b/kernelcreation.py index f4854a603..d9bf1c7d9 100644 --- a/kernelcreation.py +++ b/kernelcreation.py @@ -76,7 +76,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice add_openmp(ast, num_threads=cpu_openmp) if cpu_vectorize_info: if cpu_vectorize_info is True: - vectorize(ast, instruction_set='avx', assume_aligned=False, nontemporal=None) + vectorize(ast) elif isinstance(cpu_vectorize_info, dict): vectorize(ast, **cpu_vectorize_info) else: @@ -207,7 +207,15 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar ghost_layers = [(1, 0)] * dim + cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None) + if cpu_vectorize_info: + del kwargs['cpu_vectorize_info'] ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs) + if target == 'cpu': remove_conditionals_in_staggered_kernel(ast) + if cpu_vectorize_info is True: + vectorize(ast) + elif isinstance(cpu_vectorize_info, dict): + vectorize(ast, **cpu_vectorize_info) return ast diff --git a/transformations.py b/transformations.py index 64efc0788..b80cc37f6 100644 --- a/transformations.py +++ b/transformations.py @@ -302,6 +302,8 @@ def substitute_array_accesses_with_constants(ast_node): # get all indexed expressions that are not field accesses indexed_expressions = [e for e in expr.atoms(sp.Indexed) if not isinstance(e, ast.ResolvedFieldAccess)] + if len(indexed_expressions) == 0: + return expr # special case: right hand side is a single indexed expression, then nothing has to be done if len(indexed_expressions) == 1 and expr == indexed_expressions[0]: @@ -1054,14 +1056,22 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None: Warning: the assumption is not checked at runtime! """ - inner_loop_counters = {l.coordinate_to_loop_over - for l in ast_node.atoms(ast.LoopOverCoordinate) if l.is_innermost_loop} + inner_loops = [] + inner_loop_counters = set() + for loop in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment): + if loop.is_innermost_loop: + inner_loops.append(loop) + inner_loop_counters.add(loop.coordinate_to_loop_over) + if len(inner_loop_counters) != 1: raise ValueError("Inner loops iterate over different coordinates") + inner_loop_counter = inner_loop_counters.pop() stride_params = [p for p in ast_node.parameters if p.is_field_stride_argument] + subs_dict = {} for stride_param in stride_params: stride_symbol = stride_param.symbol - subs_dict = {IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1} + subs_dict.update({IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1}) + if subs_dict: ast_node.subs(subs_dict) -- GitLab