Commit 771a9b22 authored by Martin Bauer

Caching for jitted cpu and gpu kernels (big speedup for small work sizes)

parent ff641ec9
@@ -428,12 +428,19 @@ def makePythonFunctionIncompleteParams(kernelFunctionNode, argumentDict):
func.restype = None
parameters = kernelFunctionNode.parameters
cache = {}
def wrapper(**kwargs):
from copy import copy
fullArguments = copy(argumentDict)
fullArguments.update(kwargs)
args = buildCTypeArgumentList(parameters, fullArguments)
func(*args)
key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
fullArguments = argumentDict.copy()
fullArguments.update(kwargs)
args = buildCTypeArgumentList(parameters, fullArguments)
cache[key] = args
func(*args)
return wrapper
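The change above memoizes the ctypes argument list per call signature: the cache key is built from the names and object identities of the keyword arguments, so a repeated call with the same numpy arrays skips buildCTypeArgumentList entirely. A minimal standalone sketch of the same pattern (names are illustrative, not taken from pystencils):

import numpy as np

builds = []

def buildArgs(kwargs):
    builds.append(1)               # count how often the argument list is rebuilt
    return [kwargs['src']]

def makeCachedCaller(func):
    cache = {}
    def wrapper(**kwargs):
        # identity-based key: cheap to compute, stable for repeated objects
        key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
        try:
            args = cache[key]
        except KeyError:
            args = cache[key] = buildArgs(kwargs)
        func(*args)
    return wrapper

caller = makeCachedCaller(lambda arr: None)    # stand-in for the compiled kernel
src = np.zeros((16, 16))
caller(src=src)
caller(src=src)
print(len(builds))                 # 1 -> the second call was served from the cache

Since id() values can be recycled after garbage collection, the cache implicitly assumes the caller keeps passing the same live argument objects, which holds in the typical pattern of calling one kernel repeatedly on the same fields, the small-work-size scenario this commit targets.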
@@ -95,7 +95,7 @@ class Field(object):
if spatialDimensions < 1:
raise ValueError("Too many index dimensions. At least one spatial dimension required")
fullLayout = getLayoutFromNumpyArray(npArray)
fullLayout = getLayoutOfArray(npArray)
spatialLayout = tuple([i for i in fullLayout if i < spatialDimensions])
assert len(spatialLayout) == spatialDimensions
@@ -349,11 +349,11 @@ def extractCommonSubexpressions(equations):
return equations
def getLayoutFromNumpyArray(arr, indexDimensionIds=[]):
def getLayoutOfArray(arr, indexDimensionIds=[]):
"""
Returns a tuple indicating the memory layout (linearization order) of the numpy array.
Example:
>>> getLayoutFromNumpyArray(np.zeros([3,3,3]))
>>> getLayoutOfArray(np.zeros([3,3,3]))
(0, 1, 2)
In this example the loop over the zeroth coordinate should be the outermost loop,
@@ -369,6 +369,37 @@ def getLayoutFromNumpyArray(arr, indexDimensionIds=[]):
return normalizeLayout(result)
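For intuition: the layout is essentially the stride order of the array. A compact stride-based sketch of the idea (a simplification; the committed function additionally handles indexDimensionIds and normalizes the result):

import numpy as np

def layoutFromStrides(arr):
    # coordinates ordered from largest to smallest stride,
    # i.e. from slowest to fastest varying index
    return tuple(int(i) for i in np.argsort(arr.strides)[::-1])

print(layoutFromStrides(np.zeros((3, 4, 5))))             # (0, 1, 2)  C order
print(layoutFromStrides(np.zeros((3, 4, 5), order='F')))  # (2, 1, 0)  Fortran order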
def createNumpyArrayWithLayout(shape, layout):
"""
Creates a numpy array with the given shape and the given memory layout.
:param shape: shape of the resulting array
:param layout: layout as tuple, where the coordinates are ordered from slow to fast
>>> res = createNumpyArrayWithLayout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
>>> res.shape
(2, 3, 4, 5)
>>> getLayoutOfArray(res)
(3, 2, 0, 1)
"""
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
currentLayout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if currentLayout[i] != layout[i]:
indexToSwapWith = currentLayout.index(layout[i])
swaps.append((i, indexToSwapWith))
currentLayout[i], currentLayout[indexToSwapWith] = currentLayout[indexToSwapWith], currentLayout[i]
assert tuple(currentLayout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
res = np.empty(shape, order='c')
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
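The swap bookkeeping above should be equivalent to allocating a C-contiguous array whose axes are permuted into memory order and then viewing it transposed back. A sketch of that equivalence (illustrative, not the committed code):

import numpy as np

def arrayWithLayout(shape, layout):
    # allocate with axes ordered slow-to-fast, then transpose so that
    # axis i of the view is coordinate i of the requested shape again
    permutedShape = [shape[i] for i in layout]
    return np.empty(permutedShape).transpose(np.argsort(layout))

res = arrayWithLayout((2, 3, 4, 5), (3, 2, 0, 1))
assert res.shape == (2, 3, 4, 5)
assert tuple(int(i) for i in np.argsort(res.strides)[::-1]) == (3, 2, 0, 1)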
def normalizeLayout(layout):
"""Takes a layout tuple and subtracts the minimum from all entries"""
minEntry = min(layout)
@@ -382,14 +413,12 @@ def computeStrides(shape, layout):
:param layout: layout specification as tuple
:return: strides in elements, not in bytes
"""
layout = list(reversed(layout))
N = len(shape)
assert len(layout) == N
assert len(set(layout)) == N
strides = [0] * N
product = 1
for i in range(N):
j = layout.index(i)
for j in reversed(layout):
strides[j] = product
product *= shape[j]
return tuple(strides)
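A quick sanity check of the simplified loop: visiting the layout from fastest to slowest coordinate accumulates exactly the element strides numpy reports. Worked numbers for a small shape:

import numpy as np

def computeStrides(shape, layout):
    strides = [0] * len(shape)
    product = 1
    for j in reversed(layout):      # fastest coordinate first -> stride 1
        strides[j] = product
        product *= shape[j]
    return tuple(strides)

shape = (3, 4, 5)
print(computeStrides(shape, (0, 1, 2)))   # (20, 5, 1)   C order
print(computeStrides(shape, (2, 1, 0)))   # (1, 3, 12)   Fortran order
arr = np.zeros(shape)                     # C-ordered reference array
print(tuple(s // arr.itemsize for s in arr.strides))   # (20, 5, 1)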
@@ -28,16 +28,24 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
parameters = kernelFunctionNode.parameters
cache = {}
def wrapper(**kwargs):
from copy import copy
fullArguments = copy(argumentDict)
fullArguments.update(kwargs)
shape = _checkArguments(parameters, fullArguments)
indexing = kernelFunctionNode.indexing
dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
args = _buildNumpyArgumentList(parameters, fullArguments)
func(*args, **dictWithBlockAndThreadNumbers)
key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
try:
args, dictWithBlockAndThreadNumbers = cache[key]
func(*args, **dictWithBlockAndThreadNumbers)
except KeyError:
fullArguments = argumentDict.copy()
fullArguments.update(kwargs)
shape = _checkArguments(parameters, fullArguments)
indexing = kernelFunctionNode.indexing
dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
args = _buildNumpyArgumentList(parameters, fullArguments)
cache[key] = (args, dictWithBlockAndThreadNumbers)
func(*args, **dictWithBlockAndThreadNumbers)
#cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
return wrapper
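The GPU wrapper caches the prepared argument list together with the launch configuration, so a repeated call costs little more than the kernel launch itself. One property of the identity-based key worth noting: arrays that are equal in content but distinct objects do not share cache entries. A small standalone illustration:

import numpy as np

def cacheKey(kwargs):
    return hash(tuple((k, id(v)) for k, v in kwargs.items()))

a = np.zeros(10)
b = np.zeros(10)                                 # equal contents, different object
print(cacheKey({'f': a}) == cacheKey({'f': a}))  # True  -> cache hit
print(cacheKey({'f': a}) == cacheKey({'f': b}))  # False -> cache miss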
@@ -64,6 +72,7 @@ def _buildNumpyArgumentList(parameters, argumentDict):
param = argumentDict[arg.name]
expectedType = arg.dtype.numpyDtype
result.append(expectedType(param))
assert len(result) == len(parameters)
return result
import abc
import sympy as sp
import math
import pycuda.driver as cuda
@@ -8,68 +9,108 @@ from pystencils.astnodes import Conditional, Block
BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))
# Part 1:
# given a field and the number of ghost layers, return the x, y and z coordinates
# dependent on CUDA thread and block indices
# Part 2:
# given the actual field size, determine the call parameters i.e. # of blocks and threads
class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
"""
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
a pystencils field and the number of ghost layers as well as further optional parameters that must have default
values.
"""
class LineIndexing:
def __init__(self, field, ghostLayers):
availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
if field.spatialDimensions > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = availableIndices[:field.spatialDimensions]
fastestCoordinate = field.layout[-1]
coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
self._coordiantesNoGhostLayer = coordinates
self._coordinates = [i + ghostLayers for i in coordinates]
self._ghostLayers = ghostLayers
@abc.abstractproperty
def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
These symbolic indices can be obtained with the method `indexVariables` """
@property
def coordinates(self):
return self._coordinates
def indexVariables(self):
"""Sympy symbols for CUDA's block and thread indices"""
return BLOCK_IDX + THREAD_IDX
@abc.abstractmethod
def getCallParameters(self, arrShape):
def getShapeOfCudaIdx(cudaIdx):
if cudaIdx not in self._coordiantesNoGhostLayer:
return 1
else:
return arrShape[self._coordiantesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers
"""
Determine grid and block size for kernel call
:param arrShape: the numeric (not symbolic) shape of the array
:return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
the kernel should be started with
"""
@abc.abstractmethod
def guard(self, kernelContent, arrShape):
"""
In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
:param kernelContent: the actual kernel contents, which can e.g. be put into the Conditional node as the true block
:param arrShape: the numeric or symbolic shape of the field
:return: ast node, which is put inside the kernel function
"""
return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
def guard(self, kernelContent, arrShape):
return kernelContent
# -------------------------------------------- Implementations ---------------------------------------------------------
@property
def indexVariables(self):
return BLOCK_IDX + THREAD_IDX
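The `abc.ABCMeta('ABC', (object,), {})` base is a Python 2/3 compatible spelling of `class AbstractIndexing(metaclass=ABCMeta)`. Any new indexing scheme then only has to supply the three abstract members. A hypothetical minimal subclass, to make the contract concrete (SingleBlockIndexing is invented for illustration):

import abc
import sympy as sp

BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))

class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
    @abc.abstractproperty
    def coordinates(self):
        """Coordinate expressions in terms of block/thread indices"""
    @abc.abstractmethod
    def getCallParameters(self, arrShape):
        """Dict with 'block' and 'grid' launch tuples"""
    @abc.abstractmethod
    def guard(self, kernelContent, arrShape):
        """Optionally wraps the kernel body in an execution guard"""

class SingleBlockIndexing(AbstractIndexing):
    """Invented scheme: one CUDA block handles a whole 1D field."""
    def __init__(self, field, ghostLayers):
        self._ghostLayers = ghostLayers
    @property
    def coordinates(self):
        return [THREAD_IDX[0] + self._ghostLayers]
    def getCallParameters(self, arrShape):
        inner = arrShape[0] - 2 * self._ghostLayers
        return {'block': (inner, 1, 1), 'grid': (1, 1, 1)}
    def guard(self, kernelContent, arrShape):
        return kernelContent   # every thread is valid, no guard needed

Forgetting one of the abstract members makes instantiation fail with a TypeError, which is the point of the refactoring: BlockIndexing and LineIndexing below now implement one checked interface.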
class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
class BlockIndexing:
def __init__(self, field, ghostLayers, blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
"""
Creates a block indexing object for the given field.
:param field: pystencils field (common to all Indexing classes)
:param ghostLayers: number of ghost layers (common to all Indexing classes)
:param blockSize: size of a CUDA block
:param permuteBlockSizeDependentOnLayout: if True, the blockSize is permuted such that the fastest coordinate
gets the largest number of threads
"""
if field.spatialDimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if permuteBlockSizeDependentOnLayout:
blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)
self._blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
self._blockSize = blockSize
self._coordinates = [blockIndex * bs + threadIndex + ghostLayers
for blockIndex, bs, threadIndex in zip(BLOCK_IDX, blockSize, THREAD_IDX)]
self._coordinates = self._coordinates[:field.spatialDimensions]
self._ghostLayers = ghostLayers
@property
def coordinates(self):
return self._coordinates
def getCallParameters(self, arrShape):
dim = len(self._coordinates)
arrShape = arrShape[:dim]
grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
extendBs = (1,) * (3 - len(self._blockSize))
extendGr = (1,) * (3 - len(grid))
return {'block': self._blockSize + extendBs,
'grid': grid + extendGr}
def guard(self, kernelContent, arrShape):
dim = len(self._coordinates)
arrShape = arrShape[:dim]
conditions = [c < shapeComponent - self._ghostLayers
for c, shapeComponent in zip(self._coordinates, arrShape)]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernelContent)])
@staticmethod
def limitBlockSizeToDeviceMaximum(blockSize):
"""
Changes blockSize to match device limits, according to the following rules:
1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
2) next, if one component is still too big, that component is divided by 2 and the smallest
component is multiplied by 2, such that the total number of threads stays the same
Returns the altered blockSize
"""
# Get device limits
da = cuda.device_attribute
device = cuda.Context.get_device()
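BlockIndexing.getCallParameters above reduces to a per-axis ceiling division, padded to the three components CUDA expects. Worked numbers under the default block size on a 1024x1024 2D field (a simplification of the class logic, which keeps the full 3-tuple and lets zip truncate):

import math

blockSize = (256, 8)      # default (256, 8, 1), truncated to 2 spatial dimensions
arrShape = (1024, 1024)
grid = tuple(math.ceil(length / bs) for length, bs in zip(arrShape, blockSize))
extendBs = (1,) * (3 - len(blockSize))
extendGr = (1,) * (3 - len(grid))
print({'block': blockSize + extendBs, 'grid': grid + extendGr})
# {'block': (256, 8, 1), 'grid': (4, 128, 1)}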
@@ -117,7 +158,7 @@ class BlockIndexing:
@staticmethod
def permuteBlockSizeAccordingToLayout(blockSize, layout):
"""The fastest coordinate gets the biggest block dimension"""
"""Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
sortedBlockSize = list(sorted(blockSize, reverse=True))
while len(sortedBlockSize) > len(layout):
sortedBlockSize[0] *= sortedBlockSize[-1]
@@ -128,34 +169,42 @@ class BlockIndexing:
result[l] = bs
return tuple(result[:len(layout)])
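The middle of permuteBlockSizeAccordingToLayout is collapsed in the diff above, so here is a hedged reimplementation of just the documented behaviour: the biggest block dimension goes to the fastest coordinate (the last layout entry), the next biggest to the next fastest, and so on:

def permuteBlockSize(blockSize, layout):
    # invented name; mirrors the documented behaviour of
    # BlockIndexing.permuteBlockSizeAccordingToLayout for len(layout) == len(blockSize)
    result = [1] * len(layout)
    for coord, size in zip(reversed(layout), sorted(blockSize, reverse=True)):
        result[coord] = size
    return tuple(result)

print(permuteBlockSize((256, 8, 1), (0, 1, 2)))  # (1, 8, 256): coordinate 2 is fastest
print(permuteBlockSize((256, 8, 1), (2, 1, 0)))  # (256, 8, 1): coordinate 0 already fastest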
class LineIndexing(AbstractIndexing):
"""
Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a 1D CUDA block.
The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are mapped to blockIdx.{x,y,z}.
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger
than the maximum number of threads allowed in a CUDA block (which depends on the device).
"""
def __init__(self, field, ghostLayers):
availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
if field.spatialDimensions > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = availableIndices[:field.spatialDimensions]
fastestCoordinate = field.layout[-1]
coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
self._coordiantesNoGhostLayer = coordinates
self._coordinates = [i + ghostLayers for i in coordinates]
self._ghostLayers = ghostLayers
@property
def coordinates(self):
return self._coordinates
def getCallParameters(self, arrShape):
dim = len(self._coordinates)
arrShape = arrShape[:dim]
grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
extendBs = (1,) * (3 - len(self._blockSize))
extendGr = (1,) * (3 - len(grid))
return {'block': self._blockSize + extendBs,
'grid': grid + extendGr}
def guard(self, kernelContent, arrShape):
dim = len(self._coordinates)
arrShape = arrShape[:dim]
conditions = [c < shapeComponent - self._ghostLayers
for c, shapeComponent in zip(self._coordinates, arrShape)]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernelContent)])
def getShapeOfCudaIdx(cudaIdx):
if cudaIdx not in self._coordiantesNoGhostLayer:
return 1
else:
return arrShape[self._coordiantesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers
@property
def indexVariables(self):
return BLOCK_IDX + THREAD_IDX
return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
if __name__ == '__main__':
bs = BlockIndexing.permuteBlockSizeAccordingToLayout((256, 8, 1), (0,))
bs = BlockIndexing.limitBlockSizeToDeviceMaximum(bs)
print(bs)
def guard(self, kernelContent, arrShape):
return kernelContent
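To see the scheme at work: LineIndexing gives threadIdx.x to the fastest coordinate and distributes the remaining coordinates over the block grid, so each CUDA block processes exactly one memory-contiguous line. A sketch mirroring __init__ plus getCallParameters, assuming a C-ordered 3D field of shape (34, 34, 34) with one ghost layer:

import sympy as sp

BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))

def lineIndexingCallParameters(arrShape, layout, ghostLayers):
    coords = ([THREAD_IDX[0]] + BLOCK_IDX)[:len(arrShape)]
    fastest = layout[-1]
    coords[0], coords[fastest] = coords[fastest], coords[0]
    def sizeOfIdx(idx):   # 1 for unused CUDA indices, inner extent otherwise
        if idx not in coords:
            return 1
        return arrShape[coords.index(idx)] - 2 * ghostLayers
    return {'block': tuple(sizeOfIdx(i) for i in THREAD_IDX),
            'grid': tuple(sizeOfIdx(i) for i in BLOCK_IDX)}

print(lineIndexingCallParameters((34, 34, 34), layout=(0, 1, 2), ghostLayers=1))
# {'block': (32, 1, 1), 'grid': (32, 32, 1)}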
from pystencils.gpucuda.indexing import BlockIndexing, LineIndexing
from pystencils.gpucuda.indexing import BlockIndexing
from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo, getCommonShape
from pystencils.astnodes import Block, KernelFunction, SympyAssignment
from pystencils.types import TypedSymbol, BasicType, StructType
@@ -15,7 +15,7 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
fieldAccesses.update(eq.atoms(Field.Access))
requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
indexing = indexingCreator(list(fieldsRead)[0], requiredGhostLayers)
indexing = indexingCreator(field=list(fieldsRead)[0], ghostLayers=requiredGhostLayers)
block = Block(assignments)
block = indexing.guard(block, getCommonShape(allFields))
@@ -63,7 +63,7 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
coordinateSymbolAssignments = [getCoordinateSymbolAssignment(n) for n in coordinateNames[:spatialCoordinates]]
coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
indexing = indexingCreator(list(indexFields)[0], ghostLayers=0)
indexing = indexingCreator(field=list(indexFields)[0], ghostLayers=0)
functionBody = Block(coordinateSymbolAssignments + assignments)
functionBody = indexing.guard(functionBody, getCommonShape(indexFields))
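Calling the indexingCreator with keyword arguments (field=..., ghostLayers=...) makes it easy to plug in preconfigured creators. A hypothetical usage sketch, assuming createCUDAKernel accepts the indexingCreator parameter used in the hunk above:

from functools import partial
from pystencils.gpucuda.indexing import BlockIndexing

# any callable accepting the keywords `field` and `ghostLayers` works now,
# e.g. BlockIndexing with a custom, non-permuted block size:
myIndexingCreator = partial(BlockIndexing, blockSize=(32, 4, 2),
                            permuteBlockSizeDependentOnLayout=False)

# kernel = createCUDAKernel(equations, indexingCreator=myIndexingCreator)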