Commit c99ea544 authored by Martin Bauer

Vectorization - advanced parameters

- option to allocate more memory at the end of each line and to skip the
  tail loop when the loop range is not divisible by the SIMD width (see the sketch below)
parent 80334597
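The idea in plain Python, before the diff (a sketch with made-up loop bounds; modulo_floor and modulo_ceil below are plain-Python models of the sympy helpers in pystencils.integer_functions changed further down, not the real classes):

    def modulo_floor(x, divisor):
        return (x // divisor) * divisor

    def modulo_ceil(x, divisor):
        return x if x % divisor == 0 else (x // divisor + 1) * divisor

    start, stop, vector_width = 1, 18, 4
    loop_range = stop - start                                    # 17, not divisible by 4

    # old behavior: SIMD main loop plus scalar tail loop
    main_stop = start + modulo_floor(loop_range, vector_width)   # 17: main loop [1, 17), tail loop [17, 18)

    # new behavior: one SIMD loop extended into the padding, no tail loop
    padded_stop = start + modulo_ceil(loop_range, vector_width)  # 21: loop [1, 21), 3 iterations run on padding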
@@ -4,19 +4,22 @@ import numpy as np
 def aligned_empty(shape, byte_alignment=32, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
     """
     Creates an aligned empty numpy array
-    :param shape: size of the array
-    :param byte_alignment: alignment in bytes; for the start address a of the array, (a % byte_alignment) == 0 holds
-    :param dtype: numpy data type
-    :param byte_offset: offset in bytes for the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0,
-                        typically used to align the first inner cell instead of the ghost layer
-    :param order: storage linearization order
-    :param align_inner_coordinate: if True, the start of the innermost coordinate lines is aligned as well
-    :return:
+
+    Args:
+        shape: size of the array
+        byte_alignment: alignment in bytes; for the start address a of the array, (a % byte_alignment) == 0 holds
+        dtype: numpy data type
+        byte_offset: offset in bytes for the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0,
+                     typically used to align the first inner cell instead of the ghost layer
+        order: storage linearization order
+        align_inner_coordinate: if True, the start of the innermost coordinate lines is aligned as well
     """
     if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
         size = np.prod(shape)
         d = np.dtype(dtype)
-        tmp = np.empty(size * d.itemsize + byte_alignment, dtype=np.uint8)
+        # 2 * byte_alignment instead of 1 * byte_alignment to leave slack at the end, so that
+        # vectorized loops can access up to vector_width elements further and don't require a tail loop
+        tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
         address = tmp.__array_interface__['data'][0]
         offset = (byte_alignment - (address + byte_offset) % byte_alignment) % byte_alignment
         return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(shape, order=order)
...
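A quick sanity check of the alignment guarantee stated in the docstring above (hypothetical usage, not part of this commit; a scalar shape is used so that only the branch visible in this hunk is exercised):

    import numpy as np

    # assumes aligned_empty has been imported from the module changed above
    arr = aligned_empty(128, byte_alignment=32, dtype=np.float64, byte_offset=8)
    address = arr.__array_interface__['data'][0]
    # the docstring's contract: (a + byte_offset) % byte_alignment == 0
    assert (address + 8) % 32 == 0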
@@ -2,7 +2,7 @@ import sympy as sp
 import warnings
 from typing import Union, Container
 from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
-from pystencils.integer_functions import modulo_floor
+from pystencils.integer_functions import modulo_floor, modulo_ceil
 from pystencils.sympyextensions import fast_subs
 from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
     collate_types, PointerType
@@ -13,7 +13,7 @@ from pystencils.field import Field
 def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
               assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
-              assume_inner_stride_one: bool = False):
+              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
     """Explicit vectorization using SIMD vectorization via intrinsics.

     Args:
@@ -30,6 +30,11 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
             the inner loop stride is a runtime variable and thus might not always be 1.
             If this parameter is set to true, the inner stride is assumed to always be one.
             This has to be ensured at runtime!
+        assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created; instead the
+            loop is extended by at most (vector_width - 1) elements. This assumes that there is enough padding
+            with dummy data at the end of each line; depending on the access pattern, additional padding may be
+            required at the end of the array.
     """
     all_fields = kernel_ast.fields_accessed
     if nontemporal is None or nontemporal is False:
@@ -51,11 +56,13 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
     vector_width = vector_is['width']
     kernel_ast.instruction_set = vector_is
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal)
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
+                                                nontemporal, assume_sufficient_line_padding)
     insert_vector_casts(kernel_ast)


-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields):
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
+                                                assume_sufficient_line_padding):
     """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
     all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
     inner_loops = [n for n in all_loops if n.is_innermost_loop]
@@ -65,10 +72,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
         loop_range = loop_node.stop - loop_node.start
-        # cut off loop tail, that is not a multiple of four
-        cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-        loop_nodes = cut_loop(loop_node, [cutting_point])
-        assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
-        loop_node = loop_nodes[0]
+        if assume_aligned and assume_sufficient_line_padding:
+            # extend the loop to the next multiple of vector_width; the line padding absorbs the overshoot
+            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
+            loop_node.stop = new_stop
+        else:
+            # cut off the loop tail that is not a multiple of the vector width
+            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
+            loop_nodes = cut_loop(loop_node, [cutting_point])
+            assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
+            loop_node = loop_nodes[0]

     # Find all array accesses (indexed) that depend on the loop counter as offset
     loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
...
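To make the padding requirement concrete, an illustrative simulation (ad-hoc names, not pystencils API) of which element indices one line's extended SIMD loop touches:

    def modulo_ceil(x, divisor):  # plain-Python model of the sympy helper
        return x if x % divisor == 0 else (x // divisor + 1) * divisor

    vector_width = 4
    line_length = 18                                      # valid elements in one line
    padded_stop = modulo_ceil(line_length, vector_width)  # 20

    touched = set()
    for i in range(0, padded_stop, vector_width):         # one vector load/store per step
        touched.update(range(i, i + vector_width))

    # the last vector covers indices 16..19, i.e. two dummy elements past the
    # line end, which the extra slack in aligned_empty is meant to absorb
    assert max(touched) == 19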
@@ -43,7 +43,7 @@ class modulo_floor(sp.Function):
 # noinspection PyPep8Naming
 class modulo_ceil(sp.Function):
-    """Returns the next smaller integer divisible by given divisor.
+    """Returns the smallest integer >= the argument that is divisible by the given divisor.

     Examples:
         >>> modulo_ceil(9, 4)
@@ -68,5 +68,5 @@ class modulo_ceil(sp.Function):
     def to_c(self, print_func):
         dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
         assert dtype.is_int()
-        code = "({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1})"
+        code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
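The added outer parentheses matter because C's conditional operator binds more loosely than the arithmetic operators. A quick illustration of the string the fixed format produces (the format string is copied from the diff; the argument values are made up):

    code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
    print(code.format("n", "4", dtype="int64_t"))
    # prints: ((n) % (4) == 0 ? n : ((int64_t)((n) / (4))+1) * (4))

    # Without the outer parentheses, embedding the result as "2 * <expr>" would
    # parse in C as "(2 * (n) % (4) == 0) ? ... : ...", since * and % bind
    # tighter than ?: does.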