Skip to content
Snippets Groups Projects
Commit 8c693cd1 authored by Martin Bauer's avatar Martin Bauer
Browse files

Worked on CUDA code generation

parent 411af476
Branches
Tags
No related merge requests found
import numpy as np import numpy as np
import pycuda.driver as cuda import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
...@@ -52,7 +53,7 @@ def buildNumpyArgumentList(kernelFunctionNode, argumentDict): ...@@ -52,7 +53,7 @@ def buildNumpyArgumentList(kernelFunctionNode, argumentDict):
return result return result
def makePythonFunction(kernelFunctionNode, argumentDict): def makePythonFunction(kernelFunctionNode, argumentDict={}):
mod = SourceModule(str(kernelFunctionNode.generateC())) mod = SourceModule(str(kernelFunctionNode.generateC()))
func = mod.get_function(kernelFunctionNode.functionName) func = mod.get_function(kernelFunctionNode.functionName)
...@@ -60,6 +61,7 @@ def makePythonFunction(kernelFunctionNode, argumentDict): ...@@ -60,6 +61,7 @@ def makePythonFunction(kernelFunctionNode, argumentDict):
args = buildNumpyArgumentList(kernelFunctionNode, argumentDict) args = buildNumpyArgumentList(kernelFunctionNode, argumentDict)
# 2) determine block and grid tuples # 2) determine block and grid tuples
dictWithBlockAndThreadNumbers = kernelFunctionNode.getCallParameters()
# TODO prepare the function here # TODO prepare the function here
...@@ -4,55 +4,60 @@ import sympy as sp ...@@ -4,55 +4,60 @@ import sympy as sp
from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo
from pystencils.ast import Block, KernelFunction from pystencils.ast import Block, KernelFunction
from pystencils import Field
BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z")) BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z")) THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))
"""
GPU Access Patterns
- knows about the iteration range def getLinewiseCoordinates(field, ghostLayers):
- know about mapping of field indices to CUDA block and thread indices
- iterates over spatial coordinates - constructed with a specific number of coordinates
- can
"""
def getLinewiseCoordinateAccessExpression(field, indexCoordinate):
availableIndices = [THREAD_IDX[0]] + BLOCK_IDX availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
assert field.spatialDimensions <= 4, "This indexing scheme supports at most 4 spatial dimensions"
result = availableIndices[:field.spatialDimensions]
fastestCoordinate = field.layout[-1] fastestCoordinate = field.layout[-1]
availableIndices[fastestCoordinate], availableIndices[0] = availableIndices[0], availableIndices[fastestCoordinate] result[0], result[fastestCoordinate] = result[fastestCoordinate], result[0]
cudaIndices = availableIndices[:field.spatialDimensions]
offsetToCell = sum([cudaIdx * stride for cudaIdx, stride in zip(cudaIndices, field.spatialStrides)]) def getCallParameters(arrShape):
indexOffset = sum([idx * indexStride for idx, indexStride in zip(indexCoordinate, field.indexStrides)]) def getShapeOfCudaIdx(cudaIdx):
return sp.simplify(offsetToCell + indexOffset) if cudaIdx not in result:
return 1
else:
return arrShape[result.index[cudaIdx]] - 2 * ghostLayers
return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX]) }
def getLinewiseCoordinates(field): # add ghost layer offset
availableIndices = [THREAD_IDX[0]] + BLOCK_IDX for i in range(len(result)):
d = field.spatialDimensions + field.indexDimensions result[i] += ghostLayers
fastestCoordinate = field.layout[-1]
result = availableIndices[:d] return result, getCallParameters
result[0], result[fastestCoordinate] = result[fastestCoordinate], result[0]
return result
def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=defaultdict(lambda: "double")): def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=defaultdict(lambda: "double")):
fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol) fieldsRead, fieldsWritten, assignments = typeAllEquations(listOfEquations, typeForSymbol)
for f in fieldsRead - fieldsWritten: allFields = fieldsRead.union(fieldsWritten)
f.setReadOnly() for field in allFields:
field.setReadOnly(False)
for field in fieldsRead - fieldsWritten:
field.setReadOnly()
code = KernelFunction(Block(assignments), functionName) code = KernelFunction(Block(assignments), functionName)
code.qualifierPrefix = "__global__ "
code.variablesToIgnore.update(BLOCK_IDX + THREAD_IDX) code.variablesToIgnore.update(BLOCK_IDX + THREAD_IDX)
coordMapping = getLinewiseCoordinates(list(fieldsRead)[0]) fieldAccesses = code.atoms(Field.Access)
requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
coordMapping, getCallParameters = getLinewiseCoordinates(list(fieldsRead)[0], requiredGhostLayers)
allFields = fieldsRead.union(fieldsWritten) allFields = fieldsRead.union(fieldsWritten)
basePointerInfo = [['spatialInner0']] basePointerInfo = [['spatialInner0']]
basePointerInfos = {f.name: parseBasePointerInfo(basePointerInfo, [0, 1, 2], f) for f in allFields} basePointerInfos = {f.name: parseBasePointerInfo(basePointerInfo, [0, 1, 2], f) for f in allFields}
resolveFieldAccesses(code, fieldToFixedCoordinates={'src': coordMapping, 'dst': coordMapping}, resolveFieldAccesses(code, fieldToFixedCoordinates={'src': coordMapping, 'dst': coordMapping},
fieldToBasePointerInfo=basePointerInfos) fieldToBasePointerInfo=basePointerInfos)
# add the function which determines #blocks and #threads as additional member to KernelFunction node
# this is used by the jit
code.getCallParameters = getCallParameters
return code return code
...@@ -61,12 +66,17 @@ if __name__ == "__main__": ...@@ -61,12 +66,17 @@ if __name__ == "__main__":
from lbmpy.stencils import getStencil from lbmpy.stencils import getStencil
from lbmpy.collisionoperator import makeSRT from lbmpy.collisionoperator import makeSRT
from lbmpy.lbmgenerator import createLbmEquations from lbmpy.lbmgenerator import createLbmEquations
from pystencils.backends.cbackend import generateCUDA
latticeModel = makeSRT(getStencil("D2Q9"), order=2, compressible=False) latticeModel = makeSRT(getStencil("D2Q9"), order=2, compressible=False)
r = createLbmEquations(latticeModel, doCSE=True) r = createLbmEquations(latticeModel, doCSE=True)
kernel = createCUDAKernel(r) kernel = createCUDAKernel(r)
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
print(generateCUDA(kernel))
mod = SourceModule(str(kernel.generateC())) mod = SourceModule(str(generateCUDA(kernel)))
func = mod.get_function("kernel") func = mod.get_function("kernel")
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment