From c99ea544609dca7e4ee864ad33bc7b0b234c791a Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 5 Jun 2018 14:10:39 +0200
Subject: [PATCH] Vectorization - advanced parameters

- option to allocate more memory at the end of each line and skip the
  tail loop if the loop range is not divisible by the SIMD width
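
A rough sketch of the idea (plain Python, illustrative only; modulo_ceil_py and the
concrete numbers are not part of the patch): the loop stop is rounded up to the next
multiple of the vector width, so the last vector iteration runs into the padded region
instead of requiring a scalar tail loop.

    def modulo_ceil_py(value, divisor):
        # round value up to the next multiple of divisor, e.g. 9 -> 12 for divisor 4
        return value if value % divisor == 0 else (value // divisor + 1) * divisor

    line_length = 9            # elements that actually need to be processed
    vector_width = 4
    padded_stop = modulo_ceil_py(line_length, vector_width)   # 12
    # the vectorized loop runs i = 0, 4, 8 and touches 3 padded elements past the
    # line end instead of handling elements 8..9 in a scalar tail loop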
---
 alignedarray.py      | 21 ++++++++++++---------
 cpu/vectorization.py | 28 ++++++++++++++++++++--------
 integer_functions.py |  4 ++--
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/alignedarray.py b/alignedarray.py
index 65807042e..02a4f3599 100644
--- a/alignedarray.py
+++ b/alignedarray.py
@@ -4,19 +4,22 @@ import numpy as np
 def aligned_empty(shape, byte_alignment=32, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
     """
     Creates an aligned empty numpy array
-    :param shape: size of the array
-    :param byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
-    :param dtype: numpy data type
-    :param byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
-                       typically used to align first inner cell instead of ghost layer
-    :param order: storage linearization order
-    :param align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
-    :return:
+
+    Args:
+        shape: size of the array
+        byte_alignment: alignment in bytes; for the start address a of the array, (a % byte_alignment) == 0 holds
+        dtype: numpy data type
+        byte_offset: offset in bytes of the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0,
+                    typically used to align the first inner cell instead of the ghost layer
+        order: storage linearization order
+        align_inner_coordinate: if True, the start of each innermost coordinate line is aligned as well
     """
     if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
         size = np.prod(shape)
         d = np.dtype(dtype)
-        tmp = np.empty(size * d.itemsize + byte_alignment, dtype=np.uint8)
+        # allocate 2 * byte_alignment instead of 1 * byte_alignment to leave slack at the end, so that
+        # vectorized loops may access up to vector_width elements past the line end without needing a tail loop
+        tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
         address = tmp.__array_interface__['data'][0]
         offset = (byte_alignment - (address + byte_offset) % byte_alignment) % byte_alignment
         return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(shape, order=order)
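
A hedged usage sketch of the over-allocation (assuming aligned_empty is importable as
pystencils.alignedarray.aligned_empty; the concrete sizes are illustrative): with 32-byte
alignment and 8-byte doubles, the extra 2 * 32 bytes leave at least 32 bytes of slack
behind the requested elements, enough for one full AVX vector of doubles to over-run.

    import numpy as np
    from pystencils.alignedarray import aligned_empty

    arr = aligned_empty(129, byte_alignment=32, dtype=np.float64)
    assert arr.__array_interface__['data'][0] % 32 == 0   # start address is aligned
    # a vectorized loop of width 4 may safely process indices 0..131 here, since at
    # least byte_alignment bytes of slack follow the 129 requested elements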
diff --git a/cpu/vectorization.py b/cpu/vectorization.py
index ecc4ebc1a..8f8829061 100644
--- a/cpu/vectorization.py
+++ b/cpu/vectorization.py
@@ -2,7 +2,7 @@ import sympy as sp
 import warnings
 from typing import Union, Container
 from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
-from pystencils.integer_functions import modulo_floor
+from pystencils.integer_functions import modulo_floor, modulo_ceil
 from pystencils.sympyextensions import fast_subs
 from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
     collate_types, PointerType
@@ -13,7 +13,7 @@ from pystencils.field import Field
 
 def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
               assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
-              assume_inner_stride_one: bool = False):
+              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
     """Explicit vectorization using SIMD vectorization via intrinsics.
 
     Args:
@@ -30,6 +30,11 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                  the inner loop stride is a runtime variable and thus might not be always 1.
                                  If this parameter is set to true, the inner stride is assumed to be always one.
                                  This has to be ensured at runtime!
+        assume_sufficient_line_padding: if True and assume_inner_stride_one is set, no tail loop is created;
+                                        instead the loop is extended by at most (vector_width - 1) elements.
+                                        This assumes that each line ends with enough padding of dummy data.
+                                        Depending on the access pattern, additional padding may be required
+                                        at the end of the array.
     """
     all_fields = kernel_ast.fields_accessed
     if nontemporal is None or nontemporal is False:
@@ -51,11 +56,13 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
     vector_width = vector_is['width']
     kernel_ast.instruction_set = vector_is
 
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal)
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
+                                                nontemporal, assume_sufficient_line_padding)
     insert_vector_casts(kernel_ast)
 
 
-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields):
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
+                                                assume_sufficient_line_padding):
     """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
     all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
     inner_loops = [n for n in all_loops if n.is_innermost_loop]
@@ -65,10 +72,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
         loop_range = loop_node.stop - loop_node.start
 
         # cut off the loop tail that is not a multiple of the vector width
-        cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-        loop_nodes = cut_loop(loop_node, [cutting_point])
-        assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
-        loop_node = loop_nodes[0]
+        if assume_aligned and assume_sufficient_line_padding:
+            loop_range = loop_node.stop - loop_node.start
+            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
+            loop_node.stop = new_stop
+        else:
+            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
+            loop_nodes = cut_loop(loop_node, [cutting_point])
+            assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
+            loop_node = loop_nodes[0]
         
         # Find all array accesses (indexed) that depend on the loop counter as offset
         loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
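
A hedged, standalone sketch (plain Python stand-ins for the sympy modulo_floor /
modulo_ceil used above; the numbers are illustrative) of the two strategies for a
loop range of 10 with vector_width = 4:

    vector_width = 4
    start, stop = 0, 10
    loop_range = stop - start

    # default path: cut at the largest multiple of vector_width
    # -> vectorized main loop over [0, 8), scalar tail loop over [8, 10)
    cutting_point = start + (loop_range // vector_width) * vector_width    # modulo_floor -> 8

    # with assume_aligned and assume_sufficient_line_padding: extend the stop to the
    # next multiple of vector_width -> a single vectorized loop over [0, 12),
    # where the last two elements fall into the line padding
    new_stop = start + -(-loop_range // vector_width) * vector_width       # modulo_ceil -> 12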
diff --git a/integer_functions.py b/integer_functions.py
index db8358ff2..c60a5fb2c 100644
--- a/integer_functions.py
+++ b/integer_functions.py
@@ -43,7 +43,7 @@ class modulo_floor(sp.Function):
 
 # noinspection PyPep8Naming
 class modulo_ceil(sp.Function):
-    """Returns the next smaller integer divisible by given divisor.
+    """Returns the next bigger integer divisible by given divisor.
 
     Examples:
         >>> modulo_ceil(9, 4)
@@ -68,5 +68,5 @@ class modulo_ceil(sp.Function):
     def to_c(self, print_func):
         dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
         assert dtype.is_int()
-        code = "({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1})"
+        code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
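
A hedged illustration (plain Python, not part of the patch) of why the added outer
parentheses matter: once the formatted ternary is embedded in a larger C expression,
e.g. prefixed with "ctr_0_start + ", the low precedence of ?: and == in C would
otherwise regroup the expression and change its meaning.

    # template before / after the fix, with a hypothetical int64_t result type
    old = "({0}) % ({1}) == 0 ? {0} : ((int64_t)(({0}) / ({1}))+1) * ({1})"
    new = "(({0}) % ({1}) == 0 ? {0} : ((int64_t)(({0}) / ({1}))+1) * ({1}))"

    # without the outer parentheses this would parse in C as
    # "(ctr_0_start + _size_0 % 4) == 0 ? ... : ...", which is not what is intended
    embedded = "ctr_0_start + " + new.format("_size_0", "4")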
-- 
GitLab