diff --git a/cpu/vectorization.py b/cpu/vectorization.py index 8f8829061050467d0af0a1acf2e62d2da91de9ff..32b47f2622747f7d2586bc0d5a0b14cbfd1008b3 100644 --- a/cpu/vectorization.py +++ b/cpu/vectorization.py @@ -28,7 +28,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx', If true, nontemporal access instructions are used for all fields. assume_inner_stride_one: kernels with non-constant inner loop bound and strides can not be vectorized since the inner loop stride is a runtime variable and thus might not be always 1. - If this parameter is set to true, the the inner stride is assumed to be always one. + If this parameter is set to true, the inner stride is assumed to be always one. This has to be ensured at runtime! assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created but loop is extended by at most (vector_width-1) elements @@ -141,6 +141,8 @@ def insert_vector_casts(ast_node): condition_target_type = collate_types(types_of_conditions) if type(condition_target_type) is VectorType and type(result_target_type) is not VectorType: result_target_type = VectorType(result_target_type, width=condition_target_type.width) + if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType: + condition_target_type = VectorType(condition_target_type, width=result_target_type.width) casted_results = [cast_func(a, result_target_type) if t != result_target_type else a for a, t in zip(new_results, types_of_results)] diff --git a/data_types.py b/data_types.py index 320103bd78bf0788063953a2e2ba84a9e68bf3b0..63d2ae5605cb6920e0aec4b931d628cb3bc3f3d7 100644 --- a/data_types.py +++ b/data_types.py @@ -16,6 +16,7 @@ from sympy.logic.boolalg import Boolean # noinspection PyPep8Naming class cast_func(sp.Function, Boolean): # to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well + is_Atom = True @property def canonical(self): diff --git a/kernelcreation.py b/kernelcreation.py index f4854a6036af27798145b8de3387551bb6ad4235..d9bf1c7d9a0392d1c930dc52b49fce92573da563 100644 --- a/kernelcreation.py +++ b/kernelcreation.py @@ -76,7 +76,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice add_openmp(ast, num_threads=cpu_openmp) if cpu_vectorize_info: if cpu_vectorize_info is True: - vectorize(ast, instruction_set='avx', assume_aligned=False, nontemporal=None) + vectorize(ast) elif isinstance(cpu_vectorize_info, dict): vectorize(ast, **cpu_vectorize_info) else: @@ -207,7 +207,15 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar ghost_layers = [(1, 0)] * dim + cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None) + if cpu_vectorize_info: + del kwargs['cpu_vectorize_info'] ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs) + if target == 'cpu': remove_conditionals_in_staggered_kernel(ast) + if cpu_vectorize_info is True: + vectorize(ast) + elif isinstance(cpu_vectorize_info, dict): + vectorize(ast, **cpu_vectorize_info) return ast diff --git a/transformations.py b/transformations.py index 64efc0788073f5ceac96d2600f858205e679c87a..b80cc37f6ef222fcfb5c4a2f31c83c0bdd5c817c 100644 --- a/transformations.py +++ b/transformations.py @@ -302,6 +302,8 @@ def substitute_array_accesses_with_constants(ast_node): # get all indexed expressions that are not field accesses indexed_expressions = [e for e in expr.atoms(sp.Indexed) if not isinstance(e, ast.ResolvedFieldAccess)] + if len(indexed_expressions) == 0: + return expr # special case: right hand side is a single indexed expression, then nothing has to be done if len(indexed_expressions) == 1 and expr == indexed_expressions[0]: @@ -1054,14 +1056,22 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None: Warning: the assumption is not checked at runtime! """ - inner_loop_counters = {l.coordinate_to_loop_over - for l in ast_node.atoms(ast.LoopOverCoordinate) if l.is_innermost_loop} + inner_loops = [] + inner_loop_counters = set() + for loop in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment): + if loop.is_innermost_loop: + inner_loops.append(loop) + inner_loop_counters.add(loop.coordinate_to_loop_over) + if len(inner_loop_counters) != 1: raise ValueError("Inner loops iterate over different coordinates") + inner_loop_counter = inner_loop_counters.pop() stride_params = [p for p in ast_node.parameters if p.is_field_stride_argument] + subs_dict = {} for stride_param in stride_params: stride_symbol = stride_param.symbol - subs_dict = {IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1} + subs_dict.update({IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1}) + if subs_dict: ast_node.subs(subs_dict)