From 882c84e6465e44463cbd70d3435f5e1793e77d2e Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Wed, 22 Mar 2017 17:09:53 +0100
Subject: [PATCH] More flexible ghost layer specification for CUDA kernels

---
 gpucuda/indexing.py       | 15 ++++++++-------
 gpucuda/kernelcreation.py | 18 +++++++++++++-----
 transformations.py        |  2 ++
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/gpucuda/indexing.py b/gpucuda/indexing.py
index 7cc46c133..6db21f806 100644
--- a/gpucuda/indexing.py
+++ b/gpucuda/indexing.py
@@ -73,8 +73,8 @@ class BlockIndexing(AbstractIndexing):
         blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
         self._blockSize = blockSize
 
-        self._coordinates = [blockIndex * bs + threadIndex + ghostLayers
-                             for blockIndex, bs, threadIndex in zip(BLOCK_IDX, blockSize, THREAD_IDX)]
+        self._coordinates = [blockIndex * bs + threadIndex + gl[0]
+                             for blockIndex, bs, threadIndex, gl in zip(BLOCK_IDX, blockSize, THREAD_IDX, ghostLayers)]
 
         self._coordinates = self._coordinates[:field.spatialDimensions]
         self._ghostLayers = ghostLayers
@@ -85,7 +85,7 @@ class BlockIndexing(AbstractIndexing):
 
     def getCallParameters(self, arrShape):
         dim = len(self._coordinates)
-        arrShape = arrShape[:dim]
+        arrShape = [s - (gl[0] + gl[1]) for s, gl in zip(arrShape[:dim], self._ghostLayers)]
         grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
         extendBs = (1,) * (3 - len(self._blockSize))
         extendGr = (1,) * (3 - len(grid))
@@ -95,8 +95,8 @@ class BlockIndexing(AbstractIndexing):
     def guard(self, kernelContent, arrShape):
         dim = len(self._coordinates)
         arrShape = arrShape[:dim]
-        conditions = [c < shapeComponent - self._ghostLayers
-                      for c, shapeComponent in zip(self._coordinates, arrShape)]
+        conditions = [c < shapeComponent - gl[1]
+                      for c, shapeComponent, gl in zip(self._coordinates, arrShape, self._ghostLayers)]
         condition = conditions[0]
         for c in conditions[1:]:
             condition = sp.And(condition, c)
@@ -189,7 +189,7 @@ class LineIndexing(AbstractIndexing):
         coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
 
         self._coordiantesNoGhostLayer = coordinates
-        self._coordinates = [i + ghostLayers for i in coordinates]
+        self._coordinates = [i + gl[0] for i, gl in zip(coordinates, ghostLayers)]
         self._ghostLayers = ghostLayers
 
     @property
@@ -201,7 +201,8 @@ class LineIndexing(AbstractIndexing):
             if cudaIdx not in self._coordiantesNoGhostLayer:
                 return 1
             else:
-                return arrShape[self._coordiantesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers
+                idx = self._coordiantesNoGhostLayer.index(cudaIdx)
+                return arrShape[idx] - (self._ghostLayers[idx][0] + self._ghostLayers[idx][1])
 
         return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
                 'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
diff --git a/gpucuda/kernelcreation.py b/gpucuda/kernelcreation.py
index ea06701ca..03a490aad 100644
--- a/gpucuda/kernelcreation.py
+++ b/gpucuda/kernelcreation.py
@@ -5,7 +5,8 @@ from pystencils.types import TypedSymbol, BasicType, StructType
 from pystencils import Field
 
 
-def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None, indexingCreator=BlockIndexing):
+def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None, indexingCreator=BlockIndexing,
+                     ghostLayers=None):
     fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol)
     allFields = fieldsRead.union(fieldsWritten)
     readOnlyFields = set([f.name for f in fieldsRead - fieldsWritten])
@@ -14,11 +15,17 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
     for eq in listOfEquations:
         fieldAccesses.update(eq.atoms(Field.Access))
 
-    requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
-    indexing = indexingCreator(field=list(fieldsRead)[0], ghostLayers=requiredGhostLayers)
+    commonShape = getCommonShape(allFields)
+    if ghostLayers is None:
+        requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
+        ghostLayers = [(requiredGhostLayers, requiredGhostLayers)] * len(commonShape)
+    if isinstance(ghostLayers, int):
+        ghostLayers = [(ghostLayers, ghostLayers)] * len(commonShape)
+
+    indexing = indexingCreator(field=list(fieldsRead)[0], ghostLayers=ghostLayers)
 
     block = Block(assignments)
-    block = indexing.guard(block, getCommonShape(allFields))
+    block = indexing.guard(block, commonShape)
     ast = KernelFunction(block, allFields, functionName)
     ast.globalVariables.update(indexing.indexVariables)
 
@@ -63,7 +70,8 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
     coordinateSymbolAssignments = [getCoordinateSymbolAssignment(n) for n in coordinateNames[:spatialCoordinates]]
     coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
 
-    indexing = indexingCreator(field=list(indexFields)[0], ghostLayers=0)
+    idxField = list(indexFields)[0]
+    indexing = indexingCreator(field=idxField, ghostLayers=[(0, 0)] * len(idxField.shape))
 
     functionBody = Block(coordinateSymbolAssignments + assignments)
     functionBody = indexing.guard(functionBody, getCommonShape(indexFields))
diff --git a/transformations.py b/transformations.py
index a48125dea..19aebd937 100644
--- a/transformations.py
+++ b/transformations.py
@@ -77,6 +77,8 @@ def makeLoopOverDomain(body, functionName, iterationSlice=None, ghostLayers=None
     if ghostLayers is None:
         requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
         ghostLayers = [(requiredGhostLayers, requiredGhostLayers)] * len(loopOrder)
+    if isinstance(ghostLayers, int):
+        ghostLayers = [(ghostLayers, ghostLayers)] * len(loopOrder)
 
     currentBody = body
     lastLoop = None
-- 
GitLab