From 35e7428fad7701a943e10d11e94caa8f4ffe8c97 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Wed, 10 Oct 2018 09:15:51 +0200
Subject: [PATCH] pystencils: vectorization - bugfixes and small performance
 improvements

- bugfix in vector-typing of piecewise functions
- cast_function is now a sympy atom - fixes problems with sympy > 1.1
- replace_inner_stride_with_one is a bit faster now
---
 cpu/vectorization.py |  4 +++-
 data_types.py        |  1 +
 kernelcreation.py    | 10 +++++++++-
 transformations.py   | 16 +++++++++++++---
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/cpu/vectorization.py b/cpu/vectorization.py
index 8f8829061..32b47f262 100644
--- a/cpu/vectorization.py
+++ b/cpu/vectorization.py
@@ -28,7 +28,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                      If true, nontemporal access instructions are used for all fields.
         assume_inner_stride_one: kernels with non-constant inner loop bound and strides can not be vectorized since
                                  the inner loop stride is a runtime variable and thus might not be always 1.
-                                 If this parameter is set to true, the the inner stride is assumed to be always one.
+                                 If this parameter is set to true, the inner stride is assumed to be always one.
                                  This has to be ensured at runtime!
         assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created but loop is
                                         extended by at most (vector_width-1) elements
@@ -141,6 +141,8 @@ def insert_vector_casts(ast_node):
             condition_target_type = collate_types(types_of_conditions)
             if type(condition_target_type) is VectorType and type(result_target_type) is not VectorType:
                 result_target_type = VectorType(result_target_type, width=condition_target_type.width)
+            if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
+                condition_target_type = VectorType(condition_target_type, width=result_target_type.width)
 
             casted_results = [cast_func(a, result_target_type) if t != result_target_type else a
                               for a, t in zip(new_results, types_of_results)]
diff --git a/data_types.py b/data_types.py
index 320103bd7..63d2ae560 100644
--- a/data_types.py
+++ b/data_types.py
@@ -16,6 +16,7 @@ from sympy.logic.boolalg import Boolean
 # noinspection PyPep8Naming
 class cast_func(sp.Function, Boolean):
     # to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well
+    is_Atom = True
 
     @property
     def canonical(self):
diff --git a/kernelcreation.py b/kernelcreation.py
index f4854a603..d9bf1c7d9 100644
--- a/kernelcreation.py
+++ b/kernelcreation.py
@@ -76,7 +76,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
             add_openmp(ast, num_threads=cpu_openmp)
         if cpu_vectorize_info:
             if cpu_vectorize_info is True:
-                vectorize(ast, instruction_set='avx', assume_aligned=False, nontemporal=None)
+                vectorize(ast)
             elif isinstance(cpu_vectorize_info, dict):
                 vectorize(ast, **cpu_vectorize_info)
             else:
@@ -207,7 +207,15 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
 
     ghost_layers = [(1, 0)] * dim
 
+    cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
+    if cpu_vectorize_info:
+        del kwargs['cpu_vectorize_info']
     ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
+
     if target == 'cpu':
         remove_conditionals_in_staggered_kernel(ast)
+        if cpu_vectorize_info is True:
+            vectorize(ast)
+        elif isinstance(cpu_vectorize_info, dict):
+            vectorize(ast, **cpu_vectorize_info)
     return ast
diff --git a/transformations.py b/transformations.py
index 64efc0788..b80cc37f6 100644
--- a/transformations.py
+++ b/transformations.py
@@ -302,6 +302,8 @@ def substitute_array_accesses_with_constants(ast_node):
 
         # get all indexed expressions that are not field accesses
         indexed_expressions = [e for e in expr.atoms(sp.Indexed) if not isinstance(e, ast.ResolvedFieldAccess)]
+        if len(indexed_expressions) == 0:
+            return expr
 
         # special case: right hand side is a single indexed expression, then nothing has to be done
         if len(indexed_expressions) == 1 and expr == indexed_expressions[0]:
@@ -1054,14 +1056,22 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None:
 
     Warning: the assumption is not checked at runtime!
     """
-    inner_loop_counters = {l.coordinate_to_loop_over
-                           for l in ast_node.atoms(ast.LoopOverCoordinate) if l.is_innermost_loop}
+    inner_loops = []
+    inner_loop_counters = set()
+    for loop in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment):
+        if loop.is_innermost_loop:
+            inner_loops.append(loop)
+            inner_loop_counters.add(loop.coordinate_to_loop_over)
+
     if len(inner_loop_counters) != 1:
         raise ValueError("Inner loops iterate over different coordinates")
+
     inner_loop_counter = inner_loop_counters.pop()
 
     stride_params = [p for p in ast_node.parameters if p.is_field_stride_argument]
+    subs_dict = {}
     for stride_param in stride_params:
         stride_symbol = stride_param.symbol
-        subs_dict = {IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1}
+        subs_dict.update({IndexedBase(stride_symbol, shape=(1,))[inner_loop_counter]: 1})
+    if subs_dict:
         ast_node.subs(subs_dict)
-- 
GitLab