Commit c99ea544 authored by Martin Bauer's avatar Martin Bauer
Browse files

Vectorization - advanced parameters

- option to allocate more memory at end of line and do not generate a
  tail loop, if loop range is not divisible by SIMD width
parent 80334597
......@@ -4,19 +4,22 @@ import numpy as np
def aligned_empty(shape, byte_alignment=32, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
"""
Creates an aligned empty numpy array
:param shape: size of the array
:param byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
:param dtype: numpy data type
:param byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
typically used to align first inner cell instead of ghost layer
:param order: storage linearization order
:param align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
:return:
Args:
shape: size of the array
byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
dtype: numpy data type
byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
typically used to align first inner cell instead of ghost layer
order: storage linearization order
align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
"""
if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
size = np.prod(shape)
d = np.dtype(dtype)
tmp = np.empty(size * d.itemsize + byte_alignment, dtype=np.uint8)
# 2 * byte_alignment instead of 1 * byte_alignment to have slack in the end such that
# vectorized loops can access vector_width elements further and don't require a tail loop
tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
address = tmp.__array_interface__['data'][0]
offset = (byte_alignment - (address + byte_offset) % byte_alignment) % byte_alignment
return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(shape, order=order)
......
......@@ -2,7 +2,7 @@ import sympy as sp
import warnings
from typing import Union, Container
from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
from pystencils.integer_functions import modulo_floor
from pystencils.integer_functions import modulo_floor, modulo_ceil
from pystencils.sympyextensions import fast_subs
from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
collate_types, PointerType
......@@ -13,7 +13,7 @@ from pystencils.field import Field
def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
assume_inner_stride_one: bool = False):
assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
"""Explicit vectorization using SIMD vectorization via intrinsics.
Args:
......@@ -30,6 +30,11 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
the inner loop stride is a runtime variable and thus might not be always 1.
If this parameter is set to true, the the inner stride is assumed to be always one.
This has to be ensured at runtime!
assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created but loop is
extended by at most (vector_width-1) elements
assumes that at the end of each line there is enough padding with dummy data
depending on the access pattern there might be additional padding
required at the end of the array
"""
all_fields = kernel_ast.fields_accessed
if nontemporal is None or nontemporal is False:
......@@ -51,11 +56,13 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
vector_width = vector_is['width']
kernel_ast.instruction_set = vector_is
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal)
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
nontemporal, assume_sufficient_line_padding)
insert_vector_casts(kernel_ast)
def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields):
def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
assume_sufficient_line_padding):
"""Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
inner_loops = [n for n in all_loops if n.is_innermost_loop]
......@@ -65,10 +72,15 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
loop_range = loop_node.stop - loop_node.start
# cut off loop tail, that is not a multiple of four
cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
loop_nodes = cut_loop(loop_node, [cutting_point])
assert len(loop_nodes) in (1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width
loop_node = loop_nodes[0]
if assume_aligned and assume_sufficient_line_padding:
loop_range = loop_node.stop - loop_node.start
new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
loop_node.stop = new_stop
else:
cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
loop_nodes = cut_loop(loop_node, [cutting_point])
assert len(loop_nodes) in (1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width
loop_node = loop_nodes[0]
# Find all array accesses (indexed) that depend on the loop counter as offset
loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
......
......@@ -43,7 +43,7 @@ class modulo_floor(sp.Function):
# noinspection PyPep8Naming
class modulo_ceil(sp.Function):
"""Returns the next smaller integer divisible by given divisor.
"""Returns the next bigger integer divisible by given divisor.
Examples:
>>> modulo_ceil(9, 4)
......@@ -68,5 +68,5 @@ class modulo_ceil(sp.Function):
def to_c(self, print_func):
dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
assert dtype.is_int()
code = "({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1})"
code = "(({0}) % ({1}) == 0 ? {0} : (({dtype})(({0}) / ({1}))+1) * ({1}))"
return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment