pystencils: CUDA kernels can now also iterate over slice

fbf355ec · Martin Bauer · 882c84e6 · fbf355ec · fbf355ec
Commit fbf355ec authored 8 years ago by Martin Bauer
--- a/gpucuda/indexing.py
+++ b/gpucuda/indexing.py
@@ -5,6 +5,7 @@ import pycuda.driver as cuda
 import pycuda.autoinit
 from pystencils.astnodes import Conditional, Block
+from pystencils.slicing import normalizeSlice
 BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
 THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))
@@ -15,8 +16,7 @@ class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
-    a pystencils field and the number of ghost layers as well as further optional parameters that must have default
+    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
-    values.
    """
    @abc.abstractproperty
@@ -55,12 +55,12 @@ class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
 class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
-    def __init__(self, field, ghostLayers, blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
+    def __init__(self, field, iterationSlice=None,
+                 blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
        """
        Creates
        :param field: pystencils field (common to all Indexing classes)
-        :param ghostLayers: number of ghost layers (common to all Indexing classes)
+        :param iterationSlice: slice that defines rectangular subarea which is iterated over
-        :param blockSize: size of a CUDA block
        :param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
                                                  gets the largest amount of threads
        """
@@ -73,20 +73,21 @@ class BlockIndexing(AbstractIndexing):
        blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
        self._blockSize = blockSize
-        self._coordinates = [blockIndex * bs + threadIndex + gl[0]
+        self._iterationSlice = iterationSlice
-                             for blockIndex, bs, threadIndex, gl in zip(BLOCK_IDX, blockSize, THREAD_IDX, ghostLayers)]
+        offsets = _getStartFromSlice(self._iterationSlice)
+        self._coordinates = [blockIndex * bs + threadIndex + off
+                             for blockIndex, bs, threadIndex, off in zip(BLOCK_IDX, blockSize, THREAD_IDX, offsets)]
        self._coordinates = self._coordinates[:field.spatialDimensions]
-        self._ghostLayers = ghostLayers
    @property
    def coordinates(self):
        return self._coordinates
    def getCallParameters(self, arrShape):
-        dim = len(self._coordinates)
+        widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
-        arrShape = [s - (gl[0] + gl[1]) for s, gl in zip(arrShape[:dim], self._ghostLayers)]
+                                                    _getEndFromSlice(self._iterationSlice, arrShape))]
-        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
+        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(widths, self._blockSize))
        extendBs = (1,) * (3 - len(self._blockSize))
        extendGr = (1,) * (3 - len(grid))
        return {'block': self._blockSize + extendBs,
@@ -95,8 +96,8 @@ class BlockIndexing(AbstractIndexing):
    def guard(self, kernelContent, arrShape):
        dim = len(self._coordinates)
        arrShape = arrShape[:dim]
-        conditions = [c < shapeComponent - gl[1]
+        conditions = [c < end
-                      for c, shapeComponent, gl in zip(self._coordinates, arrShape, self._ghostLayers)]
+                      for c, end in zip(self._coordinates, _getEndFromSlice(self._iterationSlice, arrShape))]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
@@ -178,7 +179,7 @@ class LineIndexing(AbstractIndexing):
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """
-    def __init__(self, field, ghostLayers):
+    def __init__(self, field, iterationSlice=None):
        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatialDimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@@ -188,24 +189,51 @@ class LineIndexing(AbstractIndexing):
        fastestCoordinate = field.layout[-1]
        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
-        self._coordiantesNoGhostLayer = coordinates
+        self._coordinates = coordinates
-        self._coordinates = [i + gl[0] for i, gl in zip(coordinates, ghostLayers)]
+        self._iterationSlice = iterationSlice
-        self._ghostLayers = ghostLayers
    @property
    def coordinates(self):
-        return self._coordinates
+        return [i + offset for i, offset in zip(self._coordinates, _getStartFromSlice(self._iterationSlice))]
    def getCallParameters(self, arrShape):
+        widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
+                                                    _getEndFromSlice(self._iterationSlice, arrShape))]
        def getShapeOfCudaIdx(cudaIdx):
-            if cudaIdx not in self._coordiantesNoGhostLayer:
+            if cudaIdx not in self._coordinates:
                return 1
            else:
-                idx = self._coordiantesNoGhostLayer.index(cudaIdx)
+                idx = self._coordinates.index(cudaIdx)
-                return arrShape[idx] - (self._ghostLayers[idx][0] + self._ghostLayers[idx][1])
+                return widths[idx]
        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
    def guard(self, kernelContent, arrShape):
        return kernelContent
+# -------------------------------------- Helper functions --------------------------------------------------------------
+def _getStartFromSlice(iterationSlice):
+    res = []
+    for sliceComponent in iterationSlice:
+        if type(sliceComponent) is slice:
+            res.append(sliceComponent.start if sliceComponent.start is not None else 0)
+        else:
+            assert isinstance(sliceComponent, int)
+            res.append(sliceComponent)
+    return res
+def _getEndFromSlice(iterationSlice, arrShape):
+    iterSlice = normalizeSlice(iterationSlice, arrShape)
+    res = []
+    for sliceComponent in iterSlice:
+        if type(sliceComponent) is slice:
+            res.append(sliceComponent.stop)
+        else:
+            assert isinstance(sliceComponent, int)
+            res.append(sliceComponent + 1)
+    return res
\ No newline at end of file
--- a/gpucuda/kernelcreation.py
+++ b/gpucuda/kernelcreation.py
@@ -5,8 +5,8 @@ from pystencils.types import TypedSymbol, BasicType, StructType
 from pystencils import Field
-def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None, indexingCreator=BlockIndexing,
+def createCUDAKernel(listOfEquations, functionName="kernel",  typeForSymbol=None, indexingCreator=BlockIndexing,
-                     ghostLayers=None):
+                     iterationSlice=None, ghostLayers=None):
    fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol)
    allFields = fieldsRead.union(fieldsWritten)
    readOnlyFields = set([f.name for f in fieldsRead - fieldsWritten])
@@ -16,13 +16,22 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
        fieldAccesses.update(eq.atoms(Field.Access))
    commonShape = getCommonShape(allFields)
-    if ghostLayers is None:
+    if iterationSlice is None:
-        requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
+        # determine iteration slice from ghost layers
-        ghostLayers = [(requiredGhostLayers, requiredGhostLayers)] * len(commonShape)
+        if ghostLayers is None:
-    if isinstance(ghostLayers, int):
+            # determine required number of ghost layers from field access
-        ghostLayers = [(ghostLayers, ghostLayers)] * len(commonShape)
+            requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
+            ghostLayers = [(requiredGhostLayers, requiredGhostLayers)] * len(commonShape)
-    indexing = indexingCreator(field=list(fieldsRead)[0], ghostLayers=ghostLayers)
+        iterationSlice = []
+        if isinstance(ghostLayers, int):
+            for i in range(len(commonShape)):
+                iterationSlice.append(slice(ghostLayers[i], -ghostLayers[i]))
+        else:
+            for i in range(len(commonShape)):
+                iterationSlice.append(slice(ghostLayers[i][0], -ghostLayers[i][1]))
+    indexing = indexingCreator(field=list(fieldsRead)[0], iterationSlice=iterationSlice)
    block = Block(assignments)
    block = indexing.guard(block, commonShape)
@@ -71,7 +80,7 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
    coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
    idxField = list(indexFields)[0]
-    indexing = indexingCreator(field=idxField, ghostLayers=[(0, 0)] * len(idxField.shape))
+    indexing = indexingCreator(field=idxField, iterationSlice=[slice(None, None, None)] * len(idxField.spatialShape))
    functionBody = Block(coordinateSymbolAssignments + assignments)
    functionBody = indexing.guard(functionBody, getCommonShape(indexFields))