Commit 729989d5 authored by Martin Bauer's avatar Martin Bauer
Browse files

Cache Blocking Transformation

parent a274d662
...@@ -184,6 +184,11 @@ class KernelFunction(Node): ...@@ -184,6 +184,11 @@ class KernelFunction(Node):
def body(self): def body(self):
return self._body return self._body
@body.setter
def body(self, value):
self._body = value
self._body.parent = self
@property @property
def args(self): def args(self):
return [self._body] return [self._body]
...@@ -325,8 +330,9 @@ class PragmaBlock(Block): ...@@ -325,8 +330,9 @@ class PragmaBlock(Block):
class LoopOverCoordinate(Node): class LoopOverCoordinate(Node):
LOOP_COUNTER_NAME_PREFIX = "ctr" LOOP_COUNTER_NAME_PREFIX = "ctr"
BlOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
def __init__(self, body, coordinate_to_loop_over, start, stop, step=1): def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False):
super(LoopOverCoordinate, self).__init__(parent=None) super(LoopOverCoordinate, self).__init__(parent=None)
self.body = body self.body = body
body.parent = self body.parent = self
...@@ -336,9 +342,11 @@ class LoopOverCoordinate(Node): ...@@ -336,9 +342,11 @@ class LoopOverCoordinate(Node):
self.step = step self.step = step
self.body.parent = self self.body.parent = self
self.prefix_lines = [] self.prefix_lines = []
self.is_block_loop = is_block_loop
def new_loop_with_different_body(self, new_body): def new_loop_with_different_body(self, new_body):
result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, self.step) result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
self.step, self.is_block_loop)
result.prefix_lines = [l for l in self.prefix_lines] result.prefix_lines = [l for l in self.prefix_lines]
return result return result
...@@ -385,9 +393,16 @@ class LoopOverCoordinate(Node): ...@@ -385,9 +393,16 @@ class LoopOverCoordinate(Node):
def get_loop_counter_name(coordinate_to_loop_over): def get_loop_counter_name(coordinate_to_loop_over):
return "%s_%s" % (LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over) return "%s_%s" % (LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over)
@staticmethod
def get_block_loop_counter_name(coordinate_to_loop_over):
return "%s_%s" % (LoopOverCoordinate.BlOCK_LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over)
@property @property
def loop_counter_name(self): def loop_counter_name(self):
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over) if self.is_block_loop:
return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
else:
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
@staticmethod @staticmethod
def is_loop_counter_symbol(symbol): def is_loop_counter_symbol(symbol):
...@@ -403,9 +418,16 @@ class LoopOverCoordinate(Node): ...@@ -403,9 +418,16 @@ class LoopOverCoordinate(Node):
def get_loop_counter_symbol(coordinate_to_loop_over): def get_loop_counter_symbol(coordinate_to_loop_over):
return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int') return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int')
@staticmethod
def get_block_loop_counter_symbol(coordinate_to_loop_over):
return TypedSymbol(LoopOverCoordinate.get_block_loop_counter_name(coordinate_to_loop_over), 'int')
@property @property
def loop_counter_symbol(self): def loop_counter_symbol(self):
return LoopOverCoordinate.get_loop_counter_symbol(self.coordinate_to_loop_over) if self.is_block_loop:
return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
else:
return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
@property @property
def is_outermost_loop(self): def is_outermost_loop(self):
......
...@@ -152,13 +152,14 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu ...@@ -152,13 +152,14 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
return ast_node return ast_node
def add_openmp(ast_node, schedule="static", num_threads=True): def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
"""Parallelize the outer loop with OpenMP. """Parallelize the outer loop with OpenMP.
Args: Args:
ast_node: abstract syntax tree created e.g. by :func:`create_kernel` ast_node: abstract syntax tree created e.g. by :func:`create_kernel`
schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic' schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
num_threads: explicitly specify number of threads num_threads: explicitly specify number of threads
collapse: number of nested loops to include in parallel region (see OpenMP collapse)
""" """
if not num_threads: if not num_threads:
return return
...@@ -182,7 +183,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True): ...@@ -182,7 +183,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True):
import multiprocessing import multiprocessing
num_threads = multiprocessing.cpu_count() num_threads = multiprocessing.cpu_count()
if loop_range is not None and loop_range < num_threads: if loop_range is not None and loop_range < num_threads and not collapse:
contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)] contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
if len(contained_loops) == 1: if len(contained_loops) == 1:
contained_loop = contained_loops[0] contained_loop = contained_loops[0]
...@@ -193,4 +194,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True): ...@@ -193,4 +194,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True):
except TypeError: except TypeError:
pass pass
loop_to_parallelize.prefix_lines.append("#pragma omp for schedule(%s)" % (schedule,)) prefix = "#pragma omp for schedule(%s)" % (schedule,)
if collapse:
prefix += " collapse(%d)" % (collapse, )
loop_to_parallelize.prefix_lines.append(prefix)
...@@ -6,12 +6,12 @@ from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAss ...@@ -6,12 +6,12 @@ from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAss
from pystencils.cpu.vectorization import vectorize from pystencils.cpu.vectorization import vectorize
from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.gpucuda.indexing import indexing_creator_from_params from pystencils.gpucuda.indexing import indexing_creator_from_params
from pystencils.transformations import remove_conditionals_in_staggered_kernel from pystencils.transformations import remove_conditionals_in_staggered_kernel, loop_blocking
def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None, def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
skip_independence_check=False, skip_independence_check=False,
cpu_openmp=False, cpu_vectorize_info=None, cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
gpu_indexing='block', gpu_indexing_params=MappingProxyType({})): gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a list of update equations.
...@@ -32,6 +32,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice ...@@ -32,6 +32,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal' cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example: for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}' '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
cpu_blocking: a tuple of block sizes or None if no blocking should be applied
gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing` gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class) gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }' e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
...@@ -72,8 +73,11 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice ...@@ -72,8 +73,11 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups, ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers, iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check) skip_independence_check=skip_independence_check)
omp_collapse = None
if cpu_blocking:
omp_collapse = loop_blocking(ast, cpu_blocking)
if cpu_openmp: if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp) add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse)
if cpu_vectorize_info: if cpu_vectorize_info:
if cpu_vectorize_info is True: if cpu_vectorize_info is True:
vectorize(ast) vectorize(ast)
...@@ -232,6 +236,10 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar ...@@ -232,6 +236,10 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
ghost_layers = [(1, 0)] * dim ghost_layers = [(1, 0)] * dim
blocking = kwargs.get('cpu_blocking', None)
if blocking:
del kwargs['cpu_blocking']
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None) cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info: if cpu_vectorize_info:
del kwargs['cpu_vectorize_info'] del kwargs['cpu_vectorize_info']
...@@ -239,6 +247,8 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar ...@@ -239,6 +247,8 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
if target == 'cpu': if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast) remove_conditionals_in_staggered_kernel(ast)
if blocking:
loop_blocking(ast, blocking)
if cpu_vectorize_info is True: if cpu_vectorize_info is True:
vectorize(ast) vectorize(ast)
elif isinstance(cpu_vectorize_info, dict): elif isinstance(cpu_vectorize_info, dict):
......
...@@ -1008,3 +1008,52 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None: ...@@ -1008,3 +1008,52 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None:
subs_dict = {stride_param: 1 for stride_param in stride_params} subs_dict = {stride_param: 1 for stride_param in stride_params}
if subs_dict: if subs_dict:
ast_node.subs(subs_dict) ast_node.subs(subs_dict)
def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int:
"""Blocking of loops to enhance cache locality. Modifies the ast node in-place.
Args:
ast_node: kernel function node before vectorization transformation has been applied
block_size: sequence defining block size in x, y, (z) direction
Returns:
number of dimensions blocked
"""
loops = [l for l in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)]
body = ast_node.body
coordinates = []
loop_starts = {}
loop_stops = {}
for loop in loops:
coord = loop.coordinate_to_loop_over
if coord not in coordinates:
coordinates.append(coord)
loop_starts[coord] = loop.start
loop_stops[coord] = loop.stop
else:
assert loop.start == loop_starts[coord] and loop.stop == loop_stops[coord], \
"Multiple loops over coordinate {} with different loop bounds".format(coord)
# Create the outer loops that iterate over the blocks
outer_loop = None
for coord in reversed(coordinates):
body = ast.Block([outer_loop]) if outer_loop else body
outer_loop = ast.LoopOverCoordinate(body, coord, loop_starts[coord], loop_stops[coord],
step=block_size[coord], is_block_loop=True)
ast_node.body = ast.Block([outer_loop])
# modify the existing loops to only iterate within one block
for inner_loop in loops:
coord = inner_loop.coordinate_to_loop_over
block_ctr = ast.LoopOverCoordinate.get_block_loop_counter_symbol(coord)
loop_range = inner_loop.stop - inner_loop.start
if sp.sympify(loop_range).is_number and loop_range % block_size[coord] == 0:
stop = block_ctr + block_size[coord]
else:
stop = sp.Min(inner_loop.stop, block_ctr + block_size[coord])
inner_loop.start = block_ctr
inner_loop.stop = stop
return len(coordinates)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment