From 729989d561eed520a0597451cfddaa700291b380 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Thu, 14 Mar 2019 12:48:16 +0100
Subject: [PATCH] Cache Blocking Transformation

---
 astnodes.py           | 30 ++++++++++++++++++++++----
 cpu/kernelcreation.py | 10 ++++++---
 kernelcreation.py     | 16 +++++++++++---
 transformations.py    | 49 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/astnodes.py b/astnodes.py
index 0956417..992cbf2 100644
--- a/astnodes.py
+++ b/astnodes.py
@@ -184,6 +184,11 @@ class KernelFunction(Node):
     def body(self):
         return self._body
 
+    @body.setter
+    def body(self, value):
+        self._body = value
+        self._body.parent = self
+
     @property
     def args(self):
         return [self._body]
@@ -325,8 +330,9 @@ class PragmaBlock(Block):
 
 class LoopOverCoordinate(Node):
     LOOP_COUNTER_NAME_PREFIX = "ctr"
+    BlOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
 
-    def __init__(self, body, coordinate_to_loop_over, start, stop, step=1):
+    def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False):
         super(LoopOverCoordinate, self).__init__(parent=None)
         self.body = body
         body.parent = self
@@ -336,9 +342,11 @@ class LoopOverCoordinate(Node):
         self.step = step
         self.body.parent = self
         self.prefix_lines = []
+        self.is_block_loop = is_block_loop
 
     def new_loop_with_different_body(self, new_body):
-        result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, self.step)
+        result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
+                                    self.step, self.is_block_loop)
         result.prefix_lines = [l for l in self.prefix_lines]
         return result
 
@@ -385,9 +393,16 @@ class LoopOverCoordinate(Node):
     def get_loop_counter_name(coordinate_to_loop_over):
         return "%s_%s" % (LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over)
 
+    @staticmethod
+    def get_block_loop_counter_name(coordinate_to_loop_over):
+        return "%s_%s" % (LoopOverCoordinate.BlOCK_LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over)
+
     @property
     def loop_counter_name(self):
-        return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
+        if self.is_block_loop:
+            return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
+        else:
+            return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
 
     @staticmethod
     def is_loop_counter_symbol(symbol):
@@ -403,9 +418,16 @@ class LoopOverCoordinate(Node):
     def get_loop_counter_symbol(coordinate_to_loop_over):
         return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int')
 
+    @staticmethod
+    def get_block_loop_counter_symbol(coordinate_to_loop_over):
+        return TypedSymbol(LoopOverCoordinate.get_block_loop_counter_name(coordinate_to_loop_over), 'int')
+
     @property
     def loop_counter_symbol(self):
-        return LoopOverCoordinate.get_loop_counter_symbol(self.coordinate_to_loop_over)
+        if self.is_block_loop:
+            return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
+        else:
+            return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
 
     @property
     def is_outermost_loop(self):
diff --git a/cpu/kernelcreation.py b/cpu/kernelcreation.py
index 136cd7f..a3f0211 100644
--- a/cpu/kernelcreation.py
+++ b/cpu/kernelcreation.py
@@ -152,13 +152,14 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
     return ast_node
 
 
-def add_openmp(ast_node, schedule="static", num_threads=True):
+def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
     """Parallelize the outer loop with OpenMP.
 
     Args:
         ast_node: abstract syntax tree created e.g. by :func:`create_kernel`
         schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
         num_threads: explicitly specify number of threads
+        collapse: number of nested loops to include in parallel region (see OpenMP collapse)
     """
     if not num_threads:
         return
@@ -182,7 +183,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True):
         import multiprocessing
         num_threads = multiprocessing.cpu_count()
 
-    if loop_range is not None and loop_range < num_threads:
+    if loop_range is not None and loop_range < num_threads and not collapse:
         contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
         if len(contained_loops) == 1:
             contained_loop = contained_loops[0]
@@ -193,4 +194,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True):
             except TypeError:
                 pass
 
-    loop_to_parallelize.prefix_lines.append("#pragma omp for schedule(%s)" % (schedule,))
+    prefix = "#pragma omp for schedule(%s)" % (schedule,)
+    if collapse:
+        prefix += " collapse(%d)" % (collapse, )
+    loop_to_parallelize.prefix_lines.append(prefix)
diff --git a/kernelcreation.py b/kernelcreation.py
index 81a32b3..69a80cb 100644
--- a/kernelcreation.py
+++ b/kernelcreation.py
@@ -6,12 +6,12 @@ from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAss
 from pystencils.cpu.vectorization import vectorize
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.gpucuda.indexing import indexing_creator_from_params
-from pystencils.transformations import remove_conditionals_in_staggered_kernel
+from pystencils.transformations import remove_conditionals_in_staggered_kernel, loop_blocking
 
 
 def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
                   skip_independence_check=False,
-                  cpu_openmp=False, cpu_vectorize_info=None,
+                  cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
                   gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.
@@ -32,6 +32,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
         cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
                             for documentation of these parameters see vectorize function. Example:
                             '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
+        cpu_blocking: a tuple of block sizes or None if no blocking should be applied
         gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
         gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
                              e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
@@ -72,8 +73,11 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
         ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                             iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                             skip_independence_check=skip_independence_check)
+        omp_collapse = None
+        if cpu_blocking:
+            omp_collapse = loop_blocking(ast, cpu_blocking)
         if cpu_openmp:
-            add_openmp(ast, num_threads=cpu_openmp)
+            add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse)
         if cpu_vectorize_info:
             if cpu_vectorize_info is True:
                 vectorize(ast)
@@ -232,6 +236,10 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
 
     ghost_layers = [(1, 0)] * dim
 
+    blocking = kwargs.get('cpu_blocking', None)
+    if blocking:
+        del kwargs['cpu_blocking']
+
     cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
     if cpu_vectorize_info:
         del kwargs['cpu_vectorize_info']
@@ -239,6 +247,8 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
 
     if target == 'cpu':
         remove_conditionals_in_staggered_kernel(ast)
+        if blocking:
+            loop_blocking(ast, blocking)
         if cpu_vectorize_info is True:
             vectorize(ast)
         elif isinstance(cpu_vectorize_info, dict):
diff --git a/transformations.py b/transformations.py
index fb48c4a..787438f 100644
--- a/transformations.py
+++ b/transformations.py
@@ -1008,3 +1008,52 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None:
     subs_dict = {stride_param: 1 for stride_param in stride_params}
     if subs_dict:
         ast_node.subs(subs_dict)
+
+
+def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int:
+    """Blocking of loops to enhance cache locality. Modifies the ast node in-place.
+
+    Args:
+        ast_node: kernel function node before vectorization transformation has been applied
+        block_size: sequence defining block size in x, y, (z) direction
+
+    Returns:
+        number of dimensions blocked
+    """
+    loops = [l for l in filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)]
+    body = ast_node.body
+
+    coordinates = []
+    loop_starts = {}
+    loop_stops = {}
+    for loop in loops:
+        coord = loop.coordinate_to_loop_over
+        if coord not in coordinates:
+            coordinates.append(coord)
+            loop_starts[coord] = loop.start
+            loop_stops[coord] = loop.stop
+        else:
+            assert loop.start == loop_starts[coord] and loop.stop == loop_stops[coord], \
+                "Multiple loops over coordinate {} with different loop bounds".format(coord)
+
+    # Create the outer loops that iterate over the blocks
+    outer_loop = None
+    for coord in reversed(coordinates):
+        body = ast.Block([outer_loop]) if outer_loop else body
+        outer_loop = ast.LoopOverCoordinate(body, coord, loop_starts[coord], loop_stops[coord],
+                                            step=block_size[coord], is_block_loop=True)
+
+    ast_node.body = ast.Block([outer_loop])
+
+    # modify the existing loops to only iterate within one block
+    for inner_loop in loops:
+        coord = inner_loop.coordinate_to_loop_over
+        block_ctr = ast.LoopOverCoordinate.get_block_loop_counter_symbol(coord)
+        loop_range = inner_loop.stop - inner_loop.start
+        if sp.sympify(loop_range).is_number and loop_range % block_size[coord] == 0:
+            stop = block_ctr + block_size[coord]
+        else:
+            stop = sp.Min(inner_loop.stop, block_ctr + block_size[coord])
+        inner_loop.start = block_ctr
+        inner_loop.stop = stop
+    return len(coordinates)
-- 
GitLab