From 3f45aed6ba232d0aabfad09d3ca96da26cb39d9a Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Sun, 19 Mar 2017 22:04:11 +0100
Subject: [PATCH] GPU bugfixes and lbmpy GPU support

- bugfix for CUDA kernels with variable field sizes
- extended tests for pystencils gpu kernels
---
 gpucuda/cudajit.py        | 11 +++++------
 gpucuda/kernelcreation.py | 17 ++++-------------
 2 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py
index c754afcd6..a0355d26c 100644
--- a/gpucuda/cudajit.py
+++ b/gpucuda/cudajit.py
@@ -4,7 +4,7 @@ import pycuda.autoinit
 from pycuda.compiler import SourceModule
 from pystencils.backends.cbackend import generateC
 from pystencils.transformations import symbolNameToVariableName
-from pystencils.types import StructType
+from pystencils.types import StructType, getBaseType
 
 
 def makePythonFunction(kernelFunctionNode, argumentDict={}):
@@ -36,6 +36,7 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
 
         args = _buildNumpyArgumentList(kernelFunctionNode, fullArguments)
         func(*args, **dictWithBlockAndThreadNumbers)
+        # cuda.Context.synchronize()  # useful for debugging: surfaces errors right after the kernel was called
     return wrapper
 
 
@@ -47,12 +48,10 @@ def _buildNumpyArgumentList(kernelFunctionNode, argumentDict):
             field = argumentDict[arg.fieldName]
             if arg.isFieldPtrArgument:
                 result.append(field.gpudata)
-            elif arg.isFieldShapeArgument:
-                strideArr = np.array(field.strides, dtype=np.int32) / field.dtype.itemsize
-                result.append(cuda.In(strideArr))
             elif arg.isFieldStrideArgument:
-                shapeArr = np.array(field.shape, dtype=np.int32)
-                result.append(cuda.In(shapeArr))
+                dtype = getBaseType(arg.dtype).numpyDtype
+                strideArr = np.array(field.strides, dtype=dtype) // field.dtype.itemsize
+                result.append(cuda.In(strideArr))
             else:
                 assert False
         else:
diff --git a/gpucuda/kernelcreation.py b/gpucuda/kernelcreation.py
index 512334898..664752935 100644
--- a/gpucuda/kernelcreation.py
+++ b/gpucuda/kernelcreation.py
@@ -1,8 +1,7 @@
 import sympy as sp
 
-from pystencils.transformations import resolveFieldAccesses, typeAllEquations, \
-    parseBasePointerInfo, typingFromSympyInspection
-from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
+from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo
+from pystencils.astnodes import Block, KernelFunction, SympyAssignment
 from pystencils import Field
 from pystencils.types import TypedSymbol, BasicType, StructType
 
@@ -82,20 +81,12 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
 
     coordinateSymbolAssignments = [getCoordinateSymbolAssignment(n) for n in coordinateNames[:spatialCoordinates]]
     coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
-    assignments = coordinateSymbolAssignments + assignments
 
-    # make 1D loop over index fields
-    loopBody = Block([])
-    loopNode = LoopOverCoordinate(loopBody, coordinateToLoopOver=0, start=0, stop=indexFields[0].shape[0])
-
-    for assignment in assignments:
-        loopBody.append(assignment)
-
-    functionBody = Block([loopNode])
+    functionBody = Block(coordinateSymbolAssignments + assignments)
     ast = KernelFunction(functionBody, allFields, functionName)
     ast.globalVariables.update(BLOCK_IDX + THREAD_IDX)
 
-    coordMapping, getCallParameters = getLinewiseCoordinates(list(fieldsRead)[0], ghostLayers=0)
+    coordMapping, getCallParameters = getLinewiseCoordinates(list(indexFields)[0], ghostLayers=0)
     basePointerInfo = [['spatialInner0']]
     basePointerInfos = {f.name: parseBasePointerInfo(basePointerInfo, [2, 1, 0], f) for f in allFields}
 
-- 
GitLab