diff --git a/astnodes.py b/astnodes.py index 09564178c142efdfad3bcafb68aafebc7c5561aa..992cbf25a0fd30aa0b1dd3d72952a715da0c4bdf 100644 --- a/astnodes.py +++ b/astnodes.py @@ -184,6 +184,11 @@ class KernelFunction(Node): def body(self): return self._body + @body.setter + def body(self, value): + self._body = value + self._body.parent = self + @property def args(self): return [self._body] @@ -325,8 +330,9 @@ class PragmaBlock(Block): class LoopOverCoordinate(Node): LOOP_COUNTER_NAME_PREFIX = "ctr" + BlOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr" - def __init__(self, body, coordinate_to_loop_over, start, stop, step=1): + def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False): super(LoopOverCoordinate, self).__init__(parent=None) self.body = body body.parent = self @@ -336,9 +342,11 @@ class LoopOverCoordinate(Node): self.step = step self.body.parent = self self.prefix_lines = [] + self.is_block_loop = is_block_loop def new_loop_with_different_body(self, new_body): - result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, self.step) + result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, + self.step, self.is_block_loop) result.prefix_lines = [l for l in self.prefix_lines] return result @@ -385,9 +393,16 @@ class LoopOverCoordinate(Node): def get_loop_counter_name(coordinate_to_loop_over): return "%s_%s" % (LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over) + @staticmethod + def get_block_loop_counter_name(coordinate_to_loop_over): + return "%s_%s" % (LoopOverCoordinate.BlOCK_LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over) + @property def loop_counter_name(self): - return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over) + if self.is_block_loop: + return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over) + else: + return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over) @staticmethod def is_loop_counter_symbol(symbol): @@ -403,9 +418,16 @@ class LoopOverCoordinate(Node): def get_loop_counter_symbol(coordinate_to_loop_over): return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int') + @staticmethod + def get_block_loop_counter_symbol(coordinate_to_loop_over): + return TypedSymbol(LoopOverCoordinate.get_block_loop_counter_name(coordinate_to_loop_over), 'int') + @property def loop_counter_symbol(self): - return LoopOverCoordinate.get_loop_counter_symbol(self.coordinate_to_loop_over) + if self.is_block_loop: + return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over) + else: + return self.get_loop_counter_symbol(self.coordinate_to_loop_over) @property def is_outermost_loop(self): diff --git a/cpu/kernelcreation.py b/cpu/kernelcreation.py index 136cd7f87b6dab6185271a82d51a144ace513d73..a3f02113b063d9f1f629e0bb73e6c2c9809f9271 100644 --- a/cpu/kernelcreation.py +++ b/cpu/kernelcreation.py @@ -152,13 +152,14 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu return ast_node -def add_openmp(ast_node, schedule="static", num_threads=True): +def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None): """Parallelize the outer loop with OpenMP. Args: ast_node: abstract syntax tree created e.g. by :func:`create_kernel` schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic' num_threads: explicitly specify number of threads + collapse: number of nested loops to include in parallel region (see OpenMP collapse) """ if not num_threads: return @@ -182,7 +183,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True): import multiprocessing num_threads = multiprocessing.cpu_count() - if loop_range is not None and loop_range < num_threads: + if loop_range is not None and loop_range < num_threads and not collapse: contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)] if len(contained_loops) == 1: contained_loop = contained_loops[0] @@ -193,4 +194,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True): except TypeError: pass - loop_to_parallelize.prefix_lines.append("#pragma omp for schedule(%s)" % (schedule,)) + prefix = "#pragma omp for schedule(%s)" % (schedule,) + if collapse: + prefix += " collapse(%d)" % (collapse, ) + loop_to_parallelize.prefix_lines.append(prefix) diff --git a/kernelcreation.py b/kernelcreation.py index 81a32b3a263f0d0a6d012d868c4de858ac642950..69a80cba1478c992e4d9d1849a5cad9f81c37dce 100644 --- a/kernelcreation.py +++ b/kernelcreation.py @@ -6,12 +6,12 @@ from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAss from pystencils.cpu.vectorization import vectorize from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.gpucuda.indexing import indexing_creator_from_params -from pystencils.transformations import remove_conditionals_in_staggered_kernel +from pystencils.transformations import remove_conditionals_in_staggered_kernel, loop_blocking def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None, skip_independence_check=False, - cpu_openmp=False, cpu_vectorize_info=None, + cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})): """ Creates abstract syntax tree (AST) of kernel, using a list of update equations. @@ -32,6 +32,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal' for documentation of these parameters see vectorize function. Example: '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}' + cpu_blocking: a tuple of block sizes or None if no blocking should be applied gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing` gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class) e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }' @@ -72,8 +73,11 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups, iteration_slice=iteration_slice, ghost_layers=ghost_layers, skip_independence_check=skip_independence_check) + omp_collapse = None + if cpu_blocking: + omp_collapse = loop_blocking(ast, cpu_blocking) if cpu_openmp: - add_openmp(ast, num_threads=cpu_openmp) + add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse) if cpu_vectorize_info: if cpu_vectorize_info is True: vectorize(ast) @@ -232,6 +236,10 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar ghost_layers = [(1, 0)] * dim + blocking = kwargs.get('cpu_blocking', None) + if blocking: + del kwargs['cpu_blocking'] + cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None) if cpu_vectorize_info: del kwargs['cpu_vectorize_info'] @@ -239,6 +247,8 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar if target == 'cpu': remove_conditionals_in_staggered_kernel(ast) + if blocking: + loop_blocking(ast, blocking) if cpu_vectorize_info is True: vectorize(ast) elif isinstance(cpu_vectorize_info, dict): diff --git a/transformations.py b/transformations.py index fb48c4a738d3ace2b603f7f838a8698283af43cc..787438f37cf0f959567f338f70dd51f57dfb2f97 100644 --- a/transformations.py +++ b/transformations.py @@ -1008,3 +1008,52 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None: subs_dict = {stride_param: 1 for stride_param in stride_params} if subs_dict: ast_node.subs(subs_dict) + + +def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: + """Blocking of loops to enhance cache locality. Modifies the ast node in-place. + + Args: + ast_node: kernel function node before vectorization transformation has been applied + block_size: sequence defining block size in x, y, (z) direction + + Returns: + number of dimensions blocked + """ + loops = [l for l in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)] + body = ast_node.body + + coordinates = [] + loop_starts = {} + loop_stops = {} + for loop in loops: + coord = loop.coordinate_to_loop_over + if coord not in coordinates: + coordinates.append(coord) + loop_starts[coord] = loop.start + loop_stops[coord] = loop.stop + else: + assert loop.start == loop_starts[coord] and loop.stop == loop_stops[coord], \ + "Multiple loops over coordinate {} with different loop bounds".format(coord) + + # Create the outer loops that iterate over the blocks + outer_loop = None + for coord in reversed(coordinates): + body = ast.Block([outer_loop]) if outer_loop else body + outer_loop = ast.LoopOverCoordinate(body, coord, loop_starts[coord], loop_stops[coord], + step=block_size[coord], is_block_loop=True) + + ast_node.body = ast.Block([outer_loop]) + + # modify the existing loops to only iterate within one block + for inner_loop in loops: + coord = inner_loop.coordinate_to_loop_over + block_ctr = ast.LoopOverCoordinate.get_block_loop_counter_symbol(coord) + loop_range = inner_loop.stop - inner_loop.start + if sp.sympify(loop_range).is_number and loop_range % block_size[coord] == 0: + stop = block_ctr + block_size[coord] + else: + stop = sp.Min(inner_loop.stop, block_ctr + block_size[coord]) + inner_loop.start = block_ctr + inner_loop.stop = stop + return len(coordinates)