diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py index 29239b63ec2496fa171ff5d12993b9b4203638d4..513b8eda0ac66860b5f16f7fc015f3cdb075ab45 100644 --- a/gpucuda/cudajit.py +++ b/gpucuda/cudajit.py @@ -1,5 +1,6 @@ import numpy as np import pycuda.driver as cuda +import pycuda.autoinit from pycuda.compiler import SourceModule @@ -52,7 +53,7 @@ def buildNumpyArgumentList(kernelFunctionNode, argumentDict): return result -def makePythonFunction(kernelFunctionNode, argumentDict): +def makePythonFunction(kernelFunctionNode, argumentDict={}): mod = SourceModule(str(kernelFunctionNode.generateC())) func = mod.get_function(kernelFunctionNode.functionName) @@ -60,6 +61,7 @@ def makePythonFunction(kernelFunctionNode, argumentDict): args = buildNumpyArgumentList(kernelFunctionNode, argumentDict) # 2) determine block and grid tuples - + dictWithBlockAndThreadNumbers = kernelFunctionNode.getCallParameters() + # TODO prepare the function here diff --git a/gpucuda/kernelcreation.py b/gpucuda/kernelcreation.py index 1251e76a5143d95c7e38e6a7176db698eddd5b91..b01886bea3699f20ad7d09bdb506b1be0ed4188a 100644 --- a/gpucuda/kernelcreation.py +++ b/gpucuda/kernelcreation.py @@ -4,55 +4,60 @@ import sympy as sp from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo from pystencils.ast import Block, KernelFunction +from pystencils import Field BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z")) THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z")) -""" -GPU Access Patterns -- knows about the iteration range -- know about mapping of field indices to CUDA block and thread indices -- iterates over spatial coordinates - constructed with a specific number of coordinates -- can -""" - - -def getLinewiseCoordinateAccessExpression(field, indexCoordinate): +def getLinewiseCoordinates(field, ghostLayers): availableIndices = [THREAD_IDX[0]] + BLOCK_IDX + assert field.spatialDimensions <= 4, "This indexing scheme supports at most 4 spatial dimensions" + result = availableIndices[:field.spatialDimensions] + fastestCoordinate = field.layout[-1] - availableIndices[fastestCoordinate], availableIndices[0] = availableIndices[0], availableIndices[fastestCoordinate] - cudaIndices = availableIndices[:field.spatialDimensions] + result[0], result[fastestCoordinate] = result[fastestCoordinate], result[0] - offsetToCell = sum([cudaIdx * stride for cudaIdx, stride in zip(cudaIndices, field.spatialStrides)]) - indexOffset = sum([idx * indexStride for idx, indexStride in zip(indexCoordinate, field.indexStrides)]) - return sp.simplify(offsetToCell + indexOffset) + def getCallParameters(arrShape): + def getShapeOfCudaIdx(cudaIdx): + if cudaIdx not in result: + return 1 + else: + return arrShape[result.index[cudaIdx]] - 2 * ghostLayers + return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]), + 'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX]) } -def getLinewiseCoordinates(field): - availableIndices = [THREAD_IDX[0]] + BLOCK_IDX - d = field.spatialDimensions + field.indexDimensions - fastestCoordinate = field.layout[-1] - result = availableIndices[:d] - result[0], result[fastestCoordinate] = result[fastestCoordinate], result[0] - return result + # add ghost layer offset + for i in range(len(result)): + result[i] += ghostLayers + + return result, getCallParameters def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=defaultdict(lambda: "double")): fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol) - for f in fieldsRead - fieldsWritten: - f.setReadOnly() + allFields = fieldsRead.union(fieldsWritten) + for field in allFields: + field.setReadOnly(False) + for field in fieldsRead - fieldsWritten: + field.setReadOnly() code = KernelFunction(Block(assignments), functionName) - code.qualifierPrefix = "__global__ " code.variablesToIgnore.update(BLOCK_IDX + THREAD_IDX) - coordMapping = getLinewiseCoordinates(list(fieldsRead)[0]) + fieldAccesses = code.atoms(Field.Access) + requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses]) + + coordMapping, getCallParameters = getLinewiseCoordinates(list(fieldsRead)[0], requiredGhostLayers) allFields = fieldsRead.union(fieldsWritten) basePointerInfo = [['spatialInner0']] basePointerInfos = {f.name: parseBasePointerInfo(basePointerInfo, [0, 1, 2], f) for f in allFields} resolveFieldAccesses(code, fieldToFixedCoordinates={'src': coordMapping, 'dst': coordMapping}, fieldToBasePointerInfo=basePointerInfos) + # add the function which determines #blocks and #threads as additional member to KernelFunction node + # this is used by the jit + code.getCallParameters = getCallParameters return code @@ -61,12 +66,17 @@ if __name__ == "__main__": from lbmpy.stencils import getStencil from lbmpy.collisionoperator import makeSRT from lbmpy.lbmgenerator import createLbmEquations + from pystencils.backends.cbackend import generateCUDA latticeModel = makeSRT(getStencil("D2Q9"), order=2, compressible=False) r = createLbmEquations(latticeModel, doCSE=True) kernel = createCUDAKernel(r) + import pycuda.driver as cuda + import pycuda.autoinit from pycuda.compiler import SourceModule + print(generateCUDA(kernel)) + - mod = SourceModule(str(kernel.generateC())) + mod = SourceModule(str(generateCUDA(kernel))) func = mod.get_function("kernel")