Commit c99ea544 authored by Martin Bauer

Vectorization - advanced parameters

- option to allocate more memory at the end of each line and to skip the
  tail loop when the loop range is not divisible by the SIMD width (see the sketch below)
parent 80334597
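The idea in plain Python, before the diff (a sketch with made-up loop bounds; modulo_floor and modulo_ceil below are plain-Python models of the sympy helpers in pystencils.integer_functions changed further down, not the real classes):

    def modulo_floor(x, divisor):
        return (x // divisor) * divisor

    def modulo_ceil(x, divisor):
        return x if x % divisor == 0 else (x // divisor + 1) * divisor

    start, stop, vector_width = 1, 18, 4
    loop_range = stop - start                                    # 17, not divisible by 4

    # old behavior: SIMD main loop plus scalar tail loop
    main_stop = start + modulo_floor(loop_range, vector_width)   # 17: main loop [1, 17), tail loop [17, 18)

    # new behavior: one SIMD loop extended into the padding, no tail loop
    padded_stop = start + modulo_ceil(loop_range, vector_width)  # 21: loop [1, 21), 3 iterations run on padding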
@@ -4,19 +4,22 @@ import numpy as np
 def aligned_empty(shape, byte_alignment=32, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
     """
     Creates an aligned empty numpy array
-    :param shape: size of the array
-    :param byte_alignment: alignment in bytes; for the start address a of the array, (a % byte_alignment) == 0 holds
-    :param dtype: numpy data type
-    :param byte_offset: offset in bytes for the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0,
-                        typically used to align the first inner cell instead of the ghost layer
-    :param order: storage linearization order
-    :param align_inner_coordinate: if True, the start of the innermost coordinate lines is aligned as well
-    :return:
+
+    Args:
+        shape: size of the array
+        byte_alignment: alignment in bytes; for the start address a of the array, (a % byte_alignment) == 0 holds
+        dtype: numpy data type
+        byte_offset: offset in bytes for the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0,
+                     typically used to align the first inner cell instead of the ghost layer
+        order: storage linearization order
+        align_inner_coordinate: if True, the start of the innermost coordinate lines is aligned as well
     """
     if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
         size = np.prod(shape)
         d = np.dtype(dtype)
-        tmp = np.empty(size * d.itemsize + byte_alignment, dtype=np.uint8)
+        # 2 * byte_alignment instead of 1 * byte_alignment to leave slack at the end, so that
+        # vectorized loops can access up to vector_width elements further and don't require a tail loop
+        tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
         address = tmp.__array_interface__['data'][0]
         offset = (byte_alignment - (address + byte_offset) % byte_alignment) % byte_alignment
         return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(shape, order=order)
...
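A quick sanity check of the alignment guarantee stated in the docstring above (hypothetical usage, not part of this commit; a scalar shape is used so that only the branch visible in this hunk is exercised):

    import numpy as np

    # assumes aligned_empty has been imported from the module changed above
    arr = aligned_empty(128, byte_alignment=32, dtype=np.float64, byte_offset=8)
    address = arr.__array_interface__['data'][0]
    # the docstring's contract: (a + byte_offset) % byte_alignment == 0
    assert (address + 8) % 32 == 0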
@@ -2,7 +2,7 @@ import sympy as sp
 import warnings
 from typing import Union, Container
 from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
-from pystencils.integer_functions import modulo_floor
+from pystencils.integer_functions import modulo_floor, modulo_ceil
 from pystencils.sympyextensions import fast_subs
 from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
     collate_types, PointerType
@@ -13,7 +13,7 @@ from pystencils.field import Field
 def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
               assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
-              assume_inner_stride_one: bool = False):
+              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
     """Explicit vectorization using SIMD vectorization via intrinsics.

     Args:
@@ -30,6 +30,11 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
             the inner loop stride is a runtime variable and thus might not always be 1.
             If this parameter is set to true, the inner stride is assumed to always be one.
             This has to be ensured at runtime!
+        assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created; instead the
+            loop is extended by at most (vector_width - 1) elements. This assumes that there is enough padding
+            with dummy data at the end of each line; depending on the access pattern, additional padding may be
+            required at the end of the array.
     """
     all_fields = kernel_ast.fields_accessed
     if nontemporal is None or nontemporal is False:
@@ -51,11 +56,13 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
     vector_width = vector_is['width']
     kernel_ast.instruction_set = vector_is
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal)
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
+                                                nontemporal, assume_sufficient_line_padding)
     insert_vector_casts(kernel_ast)


-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields):
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
+                                                assume_sufficient_line_padding):
     """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
     all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
     inner_loops = [n for n in all_loops if n.is_innermost_loop]
@@ -65,10 +72,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
         loop_range = loop_node.stop - loop_node.start
-        # cut off loop tail, that is not a multiple of four
-        cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-        loop_nodes = cut_loop(loop_node, [cutting_point])
-        assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
-        loop_node = loop_nodes[0]
+        if assume_aligned and assume_sufficient_line_padding:
+            # extend the loop to the next multiple of vector_width; the line padding absorbs the overshoot
+            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
+            loop_node.stop = new_stop
+        else:
+            # cut off the loop tail that is not a multiple of the vector width
+            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
+            loop_nodes = cut_loop(loop_node, [cutting_point])
+            assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
+            loop_node = loop_nodes[0]

     # Find all array accesses (indexed) that depend on the loop counter as offset
     loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
...
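To make the padding requirement concrete, an illustrative simulation (ad-hoc names, not pystencils API) of which element indices one line's extended SIMD loop touches:

    def modulo_ceil(x, divisor):  # plain-Python model of the sympy helper
        return x if x % divisor == 0 else (x // divisor + 1) * divisor

    vector_width = 4
    line_length = 18                                      # valid elements in one line
    padded_stop = modulo_ceil(line_length, vector_width)  # 20

    touched = set()
    for i in range(0, padded_stop, vector_width):         # one vector load/store per step
        touched.update(range(i, i + vector_width))

    # the last vector covers indices 16..19, i.e. two dummy elements past the
    # line end, which the extra slack in aligned_empty is meant to absorb
    assert max(touched) == 19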
@@ -43,7 +43,7 @@ class modulo_floor(sp.Function):
 # noinspection PyPep8Naming
 class modulo_ceil(sp.Function):
-    """Returns the next smaller integer divisible by given divisor.
+    """Returns the smallest integer >= the argument that is divisible by the given divisor.

     Examples:
         >>> modulo_ceil(9, 4)
@@ -68,5 +68,5 @@ class modulo_ceil(sp.Function):
     def to_c(self, print_func):
         dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
         assert dtype.is_int()
-        code = "({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1})"
+        code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
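The added outer parentheses matter because C's conditional operator binds more loosely than the arithmetic operators. A quick illustration of the string the fixed format produces (the format string is copied from the diff; the argument values are made up):

    code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
    print(code.format("n", "4", dtype="int64_t"))
    # prints: ((n) % (4) == 0 ? n : ((int64_t)((n) / (4))+1) * (4))

    # Without the outer parentheses, embedding the result as "2 * <expr>" would
    # parse in C as "(2 * (n) % (4) == 0) ? ... : ...", since * and % bind
    # tighter than ?: does.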