From 771a9b2224723d931313e80ed0f576a290f280da Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 21 Mar 2017 12:40:33 +0100
Subject: [PATCH] Caching for jitted CPU and GPU kernels (big speedup for small
 work sizes)

---
 cpu/cpujit.py             |  17 ++--
 field.py                  |  41 +++++++--
 gpucuda/cudajit.py        |  27 ++++--
 gpucuda/indexing.py       | 171 ++++++++++++++++++++++++--------------
 gpucuda/kernelcreation.py |   6 +-
 5 files changed, 178 insertions(+), 84 deletions(-)

diff --git a/cpu/cpujit.py b/cpu/cpujit.py
index 75db5a790..7e1891e48 100644
--- a/cpu/cpujit.py
+++ b/cpu/cpujit.py
@@ -428,12 +428,19 @@ def makePythonFunctionIncompleteParams(kernelFunctionNode, argumentDict):
     func.restype = None
     parameters = kernelFunctionNode.parameters
 
+    cache = {}
+
     def wrapper(**kwargs):
-        from copy import copy
-        fullArguments = copy(argumentDict)
-        fullArguments.update(kwargs)
-        args = buildCTypeArgumentList(parameters, fullArguments)
-        func(*args)
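+        # key the cache on argument names and object identities: cheap to compute,
+        # but a hit requires that the same objects are passed again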
+        key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
+        try:
+            args = cache[key]
+            func(*args)
+        except KeyError:
+            fullArguments = argumentDict.copy()
+            fullArguments.update(kwargs)
+            args = buildCTypeArgumentList(parameters, fullArguments)
+            cache[key] = args
+            func(*args)
     return wrapper
 
 
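The cached wrapper pays off when the same kernel is called many times with the
same arrays. A minimal usage sketch (hypothetical array and kernel names; the
kernel ast would come from pystencils' CPU kernel creation):

    import numpy as np

    src = np.zeros((16, 16))
    dst = np.zeros((16, 16))
    kernel = makePythonFunctionIncompleteParams(kernelAst, argumentDict={})
    for _ in range(10000):
        # cache hit after the first call: buildCTypeArgumentList is skipped
        kernel(src=src, dst=dst)

Since the key is derived from id(v), a hit requires passing the very same
array objects again; freshly allocated arrays of equal shape and dtype miss
the cache, and the caller must keep the passed arrays alive so that their
ids are not reused.
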
diff --git a/field.py b/field.py
index 83a6ba4ff..3d4b6aae9 100644
--- a/field.py
+++ b/field.py
@@ -95,7 +95,7 @@ class Field(object):
         if spatialDimensions < 1:
             raise ValueError("Too many index dimensions. At least one spatial dimension required")
 
-        fullLayout = getLayoutFromNumpyArray(npArray)
+        fullLayout = getLayoutOfArray(npArray)
         spatialLayout = tuple([i for i in fullLayout if i < spatialDimensions])
         assert len(spatialLayout) == spatialDimensions
 
@@ -349,11 +349,11 @@ def extractCommonSubexpressions(equations):
     return equations
 
 
-def getLayoutFromNumpyArray(arr, indexDimensionIds=[]):
+def getLayoutOfArray(arr, indexDimensionIds=[]):
     """
     Returns a tuple indicating the memory layout (linearization order) of the numpy array.
     Example:
-    >>> getLayoutFromNumpyArray(np.zeros([3,3,3]))
+    >>> getLayoutOfArray(np.zeros([3,3,3]))
     (0, 1, 2)
 
     In this example the loop over the zeroth coordinate should be the outermost loop,
@@ -369,6 +369,37 @@ def getLayoutFromNumpyArray(arr, indexDimensionIds=[]):
     return normalizeLayout(result)
 
 
+def createNumpyArrayWithLayout(shape, layout):
+    """
+    Creates a numpy array with the given shape and memory layout.
+    :param shape: shape of the resulting array
+    :param layout: layout as tuple, where the coordinates are ordered from slow to fast
+    >>> res = createNumpyArrayWithLayout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
+    >>> res.shape
+    (2, 3, 4, 5)
+    >>> getLayoutOfArray(res)
+    (3, 2, 0, 1)
+    """
+    assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
+    currentLayout = list(range(len(shape)))
+    swaps = []
+    for i in range(len(layout)):
+        if currentLayout[i] != layout[i]:
+            indexToSwapWith = currentLayout.index(layout[i])
+            swaps.append((i, indexToSwapWith))
+            currentLayout[i], currentLayout[indexToSwapWith] = currentLayout[indexToSwapWith], currentLayout[i]
+    assert tuple(currentLayout) == tuple(layout)
+
+    shape = list(shape)
+    for a, b in swaps:
+        shape[a], shape[b] = shape[b], shape[a]
+
+    res = np.empty(shape, order='C')
+    for a, b in reversed(swaps):
+        res = res.swapaxes(a, b)
+    return res
+
+
 def normalizeLayout(layout):
     """Takes a layout tuple and subtracts the minimum from all entries"""
     minEntry = min(layout)
@@ -382,14 +413,12 @@ def computeStrides(shape, layout):
     :param layout: layout specification as tuple
     :return: strides in elements, not in bytes
     """
-    layout = list(reversed(layout))
     N = len(shape)
     assert len(layout) == N
     assert len(set(layout)) == N
     strides = [0] * N
     product = 1
-    for i in range(N):
-        j = layout.index(i)
+    for j in reversed(layout):
         strides[j] = product
         product *= shape[j]
     return tuple(strides)
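
The rewritten computeStrides walks the layout from the fastest to the slowest
coordinate and accumulates the stride product directly, instead of searching
the reversed layout in every iteration. A worked example in the doctest style
the module already uses (layout entries are ordered from slowest to fastest):

    >>> computeStrides((2, 3, 4), (0, 1, 2))   # C order: last dimension fastest
    (12, 4, 1)
    >>> computeStrides((2, 3, 4), (2, 0, 1))   # dimension 1 fastest, dimension 2 slowest
    (3, 1, 6)

In the second call, reversed(layout) visits dimensions 1, 0, 2: strides[1] = 1,
then strides[0] = 3 (the product grew by shape[1] = 3), then strides[2] = 6.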
diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py
index a99aa69cc..041949357 100644
--- a/gpucuda/cudajit.py
+++ b/gpucuda/cudajit.py
@@ -28,16 +28,24 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
 
     parameters = kernelFunctionNode.parameters
 
+    cache = {}
+
     def wrapper(**kwargs):
-        from copy import copy
-        fullArguments = copy(argumentDict)
-        fullArguments.update(kwargs)
-        shape = _checkArguments(parameters, fullArguments)
-
-        indexing = kernelFunctionNode.indexing
-        dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
-        args = _buildNumpyArgumentList(parameters, fullArguments)
-        func(*args, **dictWithBlockAndThreadNumbers)
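+        # same identity-based cache key as in cpu/cpujit.py; the block/grid
+        # configuration is cached along with the argument list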
+        key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
+        try:
+            args, dictWithBlockAndThreadNumbers = cache[key]
+            func(*args, **dictWithBlockAndThreadNumbers)
+        except KeyError:
+            fullArguments = argumentDict.copy()
+            fullArguments.update(kwargs)
+            shape = _checkArguments(parameters, fullArguments)
+
+            indexing = kernelFunctionNode.indexing
+            dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
+
+            args = _buildNumpyArgumentList(parameters, fullArguments)
+            cache[key] = (args, dictWithBlockAndThreadNumbers)
+            func(*args, **dictWithBlockAndThreadNumbers)
         #cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
     return wrapper
 
@@ -64,6 +72,7 @@ def _buildNumpyArgumentList(parameters, argumentDict):
             param = argumentDict[arg.name]
             expectedType = arg.dtype.numpyDtype
             result.append(expectedType(param))
+    assert len(result) == len(parameters)
     return result
 
 
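For GPU kernels the cache also skips _checkArguments and the block/grid size
computation, which make up much of the launch overhead for small work sizes.
A sketch of the intended call pattern (hypothetical names; assumes the kernel
ast was built by createCUDAKernel and the buffers are pycuda gpuarrays):

    import numpy as np
    import pycuda.gpuarray as gpuarray

    src = gpuarray.zeros((32, 32), np.float64)
    dst = gpuarray.zeros((32, 32), np.float64)
    kernel = makePythonFunction(kernelAst)
    for _ in range(10000):
        kernel(src=src, dst=dst)   # cached after the first call
        # swapping references keeps both arrays alive, so their ids stay valid;
        # each of the two argument orderings gets its own cache entry
        src, dst = dst, src
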
diff --git a/gpucuda/indexing.py b/gpucuda/indexing.py
index f63e633de..7cc46c133 100644
--- a/gpucuda/indexing.py
+++ b/gpucuda/indexing.py
@@ -1,3 +1,4 @@
+import abc
 import sympy as sp
 import math
 import pycuda.driver as cuda
@@ -8,68 +9,108 @@ from pystencils.astnodes import Conditional, Block
 BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
 THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))
 
-# Part 1:
-#  given a field and the number of ghost layers, return the x, y and z coordinates
-#  dependent on CUDA thread and block indices
 
-# Part 2:
-#  given the actual field size, determine the call parameters i.e. # of blocks and threads
+class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):  # py2/py3 compatible ABC
+    """
+    Abstract base class for all indexing classes. An indexing class defines how a multidimensional
+    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
+    and computes the number of blocks and threads a kernel is launched with. An indexing class is constructed from
+    a pystencils field and the number of ghost layers, plus optional scheme-specific parameters that must have
+    default values.
+    """
 
-
-class LineIndexing:
-    def __init__(self, field, ghostLayers):
-        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
-        if field.spatialDimensions > 4:
-            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
-
-        coordinates = availableIndices[:field.spatialDimensions]
-
-        fastestCoordinate = field.layout[-1]
-        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
-
-        self._coordiantesNoGhostLayer = coordinates
-        self._coordinates = [i + ghostLayers for i in coordinates]
-        self._ghostLayers = ghostLayers
+    @abc.abstractproperty
+    def coordinates(self):
+        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
+        These symbolic indices can be obtained with the method `indexVariables` """
 
     @property
-    def coordinates(self):
-        return self._coordinates
+    def indexVariables(self):
+        """Sympy symbols for CUDA's block and thread indices"""
+        return BLOCK_IDX + THREAD_IDX
 
+    @abc.abstractmethod
     def getCallParameters(self, arrShape):
-        def getShapeOfCudaIdx(cudaIdx):
-            if cudaIdx not in self._coordiantesNoGhostLayer:
-                return 1
-            else:
-                return arrShape[self._coordiantesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers
+        """
+        Determine grid and block size for kernel call
+        :param arrShape: the numeric (not symbolic) shape of the array
+        :return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
+                 the kernel should be started with
+        """
+
+    @abc.abstractmethod
+    def guard(self, kernelContent, arrShape):
+        """
+        In some indexing schemes not all threads of a block execute the kernel content.
+        This function can return a Conditional ast node defining this execution guard.
+        :param kernelContent: the actual kernel contents, which can e.g. be put into the Conditional node as its true block
+        :param arrShape: the numeric or symbolic shape of the field
+        :return: ast node, which is put inside the kernel function
+        """
 
-        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
-                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
 
-    def guard(self, kernelContent, arrShape):
-        return kernelContent
+# -------------------------------------------- Implementations ---------------------------------------------------------
 
-    @property
-    def indexVariables(self):
-        return BLOCK_IDX + THREAD_IDX
 
+class BlockIndexing(AbstractIndexing):
+    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
 
-class BlockIndexing:
     def __init__(self, field, ghostLayers, blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
+        """
+        Creates
+        :param field: pystencils field (common to all Indexing classes)
+        :param ghostLayers: number of ghost layers (common to all Indexing classes)
+        :param blockSize: size of a CUDA block
+        :param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
+                                                  gets the largest amount of threads
+        """
         if field.spatialDimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
 
         if permuteBlockSizeDependentOnLayout:
             blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)
 
-        self._blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
+        blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
+        self._blockSize = blockSize
+
         self._coordinates = [blockIndex * bs + threadIndex + ghostLayers
                              for blockIndex, bs, threadIndex in zip(BLOCK_IDX, blockSize, THREAD_IDX)]
 
         self._coordinates = self._coordinates[:field.spatialDimensions]
         self._ghostLayers = ghostLayers
 
+    @property
+    def coordinates(self):
+        return self._coordinates
+
+    def getCallParameters(self, arrShape):
+        dim = len(self._coordinates)
+        arrShape = arrShape[:dim]
+        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
+        extendBs = (1,) * (3 - len(self._blockSize))
+        extendGr = (1,) * (3 - len(grid))
+        return {'block': self._blockSize + extendBs,
+                'grid': grid + extendGr}
+
+    def guard(self, kernelContent, arrShape):
+        dim = len(self._coordinates)
+        arrShape = arrShape[:dim]
+        conditions = [c < shapeComponent - self._ghostLayers
+                      for c, shapeComponent in zip(self._coordinates, arrShape)]
+        condition = conditions[0]
+        for c in conditions[1:]:
+            condition = sp.And(condition, c)
+        return Block([Conditional(condition, kernelContent)])
+
     @staticmethod
     def limitBlockSizeToDeviceMaximum(blockSize):
+        """
+        Changes blocksize according to match device limits according to the following rules:
+        1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
+        2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
+           component is multiplied by 2, such that the total amount of threads stays the same
+        Returns the altered blockSize
+        """
         # Get device limits
         da = cuda.device_attribute
         device = cuda.Context.get_device()
@@ -117,7 +158,7 @@ class BlockIndexing:
 
     @staticmethod
     def permuteBlockSizeAccordingToLayout(blockSize, layout):
-        """The fastest coordinate gets the biggest block dimension"""
+        """Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
         sortedBlockSize = list(sorted(blockSize, reverse=True))
         while len(sortedBlockSize) > len(layout):
             sortedBlockSize[0] *= sortedBlockSize[-1]
@@ -128,34 +169,42 @@ class BlockIndexing:
             result[l] = bs
         return tuple(result[:len(layout)])
 
+
+class LineIndexing(AbstractIndexing):
+    """
+    Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a 1D CUDA block.
+    The fastest coordinate is indexed by threadIdx.x; the remaining coordinates are mapped to blockIdx.{x,y,z}.
+    This indexing scheme supports up to 4 spatial dimensions, provided the innermost dimension is not larger than
+    the maximum number of threads per CUDA block (which depends on the device).
+    """
+
+    def __init__(self, field, ghostLayers):
+        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
+        if field.spatialDimensions > 4:
+            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
+
+        coordinates = availableIndices[:field.spatialDimensions]
+
+        fastestCoordinate = field.layout[-1]
+        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
+
+        self._coordinatesNoGhostLayer = coordinates
+        self._coordinates = [i + ghostLayers for i in coordinates]
+        self._ghostLayers = ghostLayers
+
     @property
     def coordinates(self):
         return self._coordinates
 
     def getCallParameters(self, arrShape):
-        dim = len(self._coordinates)
-        arrShape = arrShape[:dim]
-        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
-        extendBs = (1,) * (3 - len(self._blockSize))
-        extendGr = (1,) * (3 - len(grid))
-        return {'block': self._blockSize + extendBs,
-                'grid': grid + extendGr}
-
-    def guard(self, kernelContent, arrShape):
-        dim = len(self._coordinates)
-        arrShape = arrShape[:dim]
-        conditions = [c < shapeComponent - self._ghostLayers
-                      for c, shapeComponent in zip(self._coordinates, arrShape)]
-        condition = conditions[0]
-        for c in conditions[1:]:
-            condition = sp.And(condition, c)
-        return Block([Conditional(condition, kernelContent)])
+        def getShapeOfCudaIdx(cudaIdx):
+            if cudaIdx not in self._coordinatesNoGhostLayer:
+                return 1
+            else:
+                return arrShape[self._coordinatesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers
 
-    @property
-    def indexVariables(self):
-        return BLOCK_IDX + THREAD_IDX
+        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
+                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
 
-if __name__ == '__main__':
-    bs = BlockIndexing.permuteBlockSizeAccordingToLayout((256, 8, 1), (0,))
-    bs = BlockIndexing.limitBlockSizeToDeviceMaximum(bs)
-    print(bs)
+    def guard(self, kernelContent, arrShape):
+        return kernelContent
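
The grid size in BlockIndexing.getCallParameters is a plain ceil-division of
the (dimension-truncated) array shape by the block size, padded with 1s up to
CUDA's three dimensions. A pure-Python mirror of that arithmetic, shown here
only as a sketch because constructing a real BlockIndexing requires an active
CUDA context:

    import math

    def callParameters2D(arrShape, blockSize):
        # mirrors BlockIndexing.getCallParameters for a 2D kernel
        arrShape = arrShape[:2]
        grid = tuple(math.ceil(length / bs) for length, bs in zip(arrShape, blockSize))
        return {'block': blockSize + (1,) * (3 - len(blockSize)),
                'grid': grid + (1,) * (3 - len(grid))}

    >>> callParameters2D((62, 62), (256, 8))
    {'block': (256, 8, 1), 'grid': (1, 8, 1)}

Threads whose coordinates land outside the shape minus ghost layers are masked
by the Conditional node that guard() wraps around the kernel body.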
diff --git a/gpucuda/kernelcreation.py b/gpucuda/kernelcreation.py
index 5d4a3ac5b..ea06701ca 100644
--- a/gpucuda/kernelcreation.py
+++ b/gpucuda/kernelcreation.py
@@ -1,4 +1,4 @@
-from pystencils.gpucuda.indexing import BlockIndexing, LineIndexing
+from pystencils.gpucuda.indexing import BlockIndexing
 from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo, getCommonShape
 from pystencils.astnodes import Block, KernelFunction, SympyAssignment
 from pystencils.types import TypedSymbol, BasicType, StructType
@@ -15,7 +15,7 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
         fieldAccesses.update(eq.atoms(Field.Access))
 
     requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
-    indexing = indexingCreator(list(fieldsRead)[0], requiredGhostLayers)
+    indexing = indexingCreator(field=list(fieldsRead)[0], ghostLayers=requiredGhostLayers)
 
     block = Block(assignments)
     block = indexing.guard(block, getCommonShape(allFields))
@@ -63,7 +63,7 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
     coordinateSymbolAssignments = [getCoordinateSymbolAssignment(n) for n in coordinateNames[:spatialCoordinates]]
     coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
 
-    indexing = indexingCreator(list(indexFields)[0], ghostLayers=0)
+    indexing = indexingCreator(field=list(indexFields)[0], ghostLayers=0)
 
     functionBody = Block(coordinateSymbolAssignments + assignments)
     functionBody = indexing.guard(functionBody, getCommonShape(indexFields))
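
Calling the indexingCreator with keyword arguments means custom indexing
parameters can be bound beforehand with functools.partial. A sketch (assuming
indexingCreator is the corresponding parameter of createCUDAKernel):

    from functools import partial
    from pystencils.gpucuda.indexing import BlockIndexing

    # field and ghostLayers are supplied later by createCUDAKernel via keywords
    myIndexing = partial(BlockIndexing, blockSize=(64, 8, 2),
                         permuteBlockSizeDependentOnLayout=False)
    # kernelAst = createCUDAKernel(equations, indexingCreator=myIndexing)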
-- 
GitLab