Commit 35e7428f authored by Martin Bauer
Browse files

pystencils: vectorization - bugfixes and small performance improvements

- bugfix in vector-typing of piecewise functions
- cast_func is now a sympy atom - fixes problems with sympy > 1.1
- replace_inner_stride_with_one is a bit faster now
parent 65ddbe06
......@@ -28,7 +28,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
If true, nontemporal access instructions are used for all fields.
assume_inner_stride_one: kernels with non-constant inner loop bound and strides can not be vectorized since
the inner loop stride is a runtime variable and thus might not be always 1.
If this parameter is set to true, the the inner stride is assumed to be always one.
If this parameter is set to true, the inner stride is assumed to be always one.
This has to be ensured at runtime!
assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created but loop is
extended by at most (vector_width-1) elements
......@@ -141,6 +141,8 @@ def insert_vector_casts(ast_node):
condition_target_type = collate_types(types_of_conditions)
if type(condition_target_type) is VectorType and type(result_target_type) is not VectorType:
result_target_type = VectorType(result_target_type, width=condition_target_type.width)
if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
condition_target_type = VectorType(condition_target_type, width=result_target_type.width)
casted_results = [cast_func(a, result_target_type) if t != result_target_type else a
for a, t in zip(new_results, types_of_results)]
......
......@@ -16,6 +16,7 @@ from sympy.logic.boolalg import Boolean
# noinspection PyPep8Naming
class cast_func(sp.Function, Boolean):
# to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well
is_Atom = True
@property
def canonical(self):
......
......@@ -76,7 +76,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
add_openmp(ast, num_threads=cpu_openmp)
if cpu_vectorize_info:
if cpu_vectorize_info is True:
vectorize(ast, instruction_set='avx', assume_aligned=False, nontemporal=None)
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
else:
......@@ -207,7 +207,15 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
ghost_layers = [(1, 0)] * dim
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info:
del kwargs['cpu_vectorize_info']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast)
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
return ast
......@@ -302,6 +302,8 @@ def substitute_array_accesses_with_constants(ast_node):
# get all indexed expressions that are not field accesses
indexed_expressions = [e for e in expr.atoms(sp.Indexed) if not isinstance(e, ast.ResolvedFieldAccess)]
if len(indexed_expressions) == 0:
return expr
# special case: right hand side is a single indexed expression, then nothing has to be done
if len(indexed_expressions) == 1 and expr == indexed_expressions[0]:
......@@ -1054,14 +1056,22 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None:
Warning: the assumption is not checked at runtime!
"""
inner_loop_counters = {l.coordinate_to_loop_over
for l in ast_node.atoms(ast.LoopOverCoordinate) if l.is_innermost_loop}
inner_loops = []
inner_loop_counters = set()
for loop in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment):
if loop.is_innermost_loop:
inner_loops.append(loop)
inner_loop_counters.add(loop.coordinate_to_loop_over)
if len(inner_loop_counters) != 1:
raise ValueError("Inner loops iterate over different coordinates")
inner_loop_counter = inner_loop_counters.pop()
stride_params = [p for p in ast_node.parameters if p.is_field_stride_argument]
subs_dict = {}
for stride_param in stride_params:
stride_symbol = stride_param.symbol
subs_dict = {IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1}
subs_dict.update({IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1})
if subs_dict:
ast_node.subs(subs_dict)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment