diff --git a/cpu/cpujit.py b/cpu/cpujit.py
index 75db5a790bea85c00c72d4a49916a23d867b55ee..7e1891e481f2898f504d6dd3952ddd901d918a88 100644
--- a/cpu/cpujit.py
+++ b/cpu/cpujit.py
@@ -428,12 +428,19 @@ def makePythonFunctionIncompleteParams(kernelFunctionNode, argumentDict):
     func.restype = None
 
     parameters = kernelFunctionNode.parameters
 
+    cache = {}
+
     def wrapper(**kwargs):
-        from copy import copy
-        fullArguments = copy(argumentDict)
-        fullArguments.update(kwargs)
-        args = buildCTypeArgumentList(parameters, fullArguments)
-        func(*args)
+        key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
+        try:
+            args = cache[key]
+            func(*args)
+        except KeyError:
+            fullArguments = argumentDict.copy()
+            fullArguments.update(kwargs)
+            args = buildCTypeArgumentList(parameters, fullArguments)
+            cache[key] = args
+            func(*args)
 
     return wrapper
diff --git a/field.py b/field.py
index 83a6ba4ff1eee90e7ddf5226182456176429253d..3d4b6aae96630a567ebfbc9ceb3a29bdbbb39493 100644
--- a/field.py
+++ b/field.py
@@ -95,7 +95,7 @@ class Field(object):
         if spatialDimensions < 1:
             raise ValueError("Too many index dimensions. At least one spatial dimension required")
 
-        fullLayout = getLayoutFromNumpyArray(npArray)
+        fullLayout = getLayoutOfArray(npArray)
         spatialLayout = tuple([i for i in fullLayout if i < spatialDimensions])
         assert len(spatialLayout) == spatialDimensions
 
@@ -349,11 +349,11 @@ def extractCommonSubexpressions(equations):
     return equations
 
 
-def getLayoutFromNumpyArray(arr, indexDimensionIds=[]):
+def getLayoutOfArray(arr, indexDimensionIds=[]):
     """
     Returns a list indicating the memory layout (linearization order) of the numpy array.
     Example:
-    >>> getLayoutFromNumpyArray(np.zeros([3,3,3]))
+    >>> getLayoutOfArray(np.zeros([3,3,3]))
     (0, 1, 2)
 
     In this example the loop over the zeroth coordinate should be the outermost loop,
@@ -369,6 +369,37 @@ def getLayoutOfArray(arr, indexDimensionIds=[]):
     return normalizeLayout(result)
 
 
+def createNumpyArrayWithLayout(shape, layout):
+    """
+    Creates a numpy array with the given shape and memory layout
+    :param shape: shape of the resulting array
+    :param layout: layout as tuple, where the coordinates are ordered from slow to fast
+    >>> res = createNumpyArrayWithLayout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
+    >>> res.shape
+    (2, 3, 4, 5)
+    >>> getLayoutOfArray(res)
+    (3, 2, 0, 1)
+    """
+    assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
+    currentLayout = list(range(len(shape)))
+    swaps = []
+    for i in range(len(layout)):
+        if currentLayout[i] != layout[i]:
+            indexToSwapWith = currentLayout.index(layout[i])
+            swaps.append((i, indexToSwapWith))
+            currentLayout[i], currentLayout[indexToSwapWith] = currentLayout[indexToSwapWith], currentLayout[i]
+    assert tuple(currentLayout) == tuple(layout)
+
+    shape = list(shape)
+    for a, b in swaps:
+        shape[a], shape[b] = shape[b], shape[a]
+
+    res = np.empty(shape, order='c')
+    for a, b in reversed(swaps):
+        res = res.swapaxes(a, b)
+    return res
+
+
 def normalizeLayout(layout):
     """Takes a layout tuple and subtracts the minimum from all entries"""
     minEntry = min(layout)
@@ -382,14 +413,12 @@ def computeStrides(shape, layout):
     :param layout: layout specification as tuple
     :return: strides in elements, not in bytes
     """
-    layout = list(reversed(layout))
     N = len(shape)
     assert len(layout) == N
     assert len(set(layout)) == N
     strides = [0] * N
     product = 1
-    for i in range(N):
-        j = layout.index(i)
+    for j in reversed(layout):
        strides[j] = product
        product *= shape[j]
    return tuple(strides)
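# Illustrative sketch (not part of the patch): how the layout helpers above fit
# together. The shape, the layout tuple and the import path are example
# assumptions. A layout tuple orders the coordinates from slowest to fastest
# varying, so the innermost coordinate of the layout gets element stride 1.
import numpy as np
from pystencils.field import createNumpyArrayWithLayout, getLayoutOfArray, computeStrides

shape, layout = (2, 3, 4), (2, 0, 1)   # coordinate 2 outermost, coordinate 1 innermost
arr = createNumpyArrayWithLayout(shape, layout)
assert arr.shape == shape
assert getLayoutOfArray(arr) == layout
# computeStrides returns element strides consistent with numpy's byte strides
assert computeStrides(shape, layout) == tuple(s // arr.itemsize for s in arr.strides)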
diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py
index a99aa69cc293c6087c9390cafba4acc099e4dbc4..0419493572675beefdf6704d12e50afa1ef88ced 100644
--- a/gpucuda/cudajit.py
+++ b/gpucuda/cudajit.py
@@ -28,16 +28,24 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
 
     parameters = kernelFunctionNode.parameters
 
+    cache = {}
+
     def wrapper(**kwargs):
-        from copy import copy
-        fullArguments = copy(argumentDict)
-        fullArguments.update(kwargs)
-        shape = _checkArguments(parameters, fullArguments)
-
-        indexing = kernelFunctionNode.indexing
-        dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
-        args = _buildNumpyArgumentList(parameters, fullArguments)
-        func(*args, **dictWithBlockAndThreadNumbers)
+        key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
+        try:
+            args, dictWithBlockAndThreadNumbers = cache[key]
+            func(*args, **dictWithBlockAndThreadNumbers)
+        except KeyError:
+            fullArguments = argumentDict.copy()
+            fullArguments.update(kwargs)
+            shape = _checkArguments(parameters, fullArguments)
+
+            indexing = kernelFunctionNode.indexing
+            dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
+
+            args = _buildNumpyArgumentList(parameters, fullArguments)
+            cache[key] = (args, dictWithBlockAndThreadNumbers)
+            func(*args, **dictWithBlockAndThreadNumbers)
         #cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
     return wrapper
 
@@ -64,6 +72,7 @@ def _buildNumpyArgumentList(parameters, argumentDict):
             param = argumentDict[arg.name]
            expectedType = arg.dtype.numpyDtype
            result.append(expectedType(param))
+    assert len(result) == len(parameters)
    return result
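# Standalone sketch of the memoization pattern shared by both wrappers above
# (illustrative; the names here are invented, not from the patch). The cache
# key hashes keyword names together with the id() of each value, so repeated
# calls that pass the *same* argument objects skip argument checking and
# marshalling; new or re-created objects miss the cache and take the slow path.
def makeMemoizingWrapper(prepareArgs, func):
    cache = {}

    def wrapper(**kwargs):
        key = hash(tuple((k, id(v)) for k, v in kwargs.items()))
        try:
            args = cache[key]            # fast path: arguments already prepared
        except KeyError:
            args = prepareArgs(kwargs)   # slow path: prepare and remember
            cache[key] = args
        func(*args)
    return wrapper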
+ """ - -class LineIndexing: - def __init__(self, field, ghostLayers): - availableIndices = [THREAD_IDX[0]] + BLOCK_IDX - if field.spatialDimensions > 4: - raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") - - coordinates = availableIndices[:field.spatialDimensions] - - fastestCoordinate = field.layout[-1] - coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0] - - self._coordiantesNoGhostLayer = coordinates - self._coordinates = [i + ghostLayers for i in coordinates] - self._ghostLayers = ghostLayers + @abc.abstractproperty + def coordinates(self): + """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices. + These symbolic indices can be obtained with the method `indexVariables` """ @property - def coordinates(self): - return self._coordinates + def indexVariables(self): + """Sympy symbols for CUDA's block and thread indices""" + return BLOCK_IDX + THREAD_IDX + @abc.abstractmethod def getCallParameters(self, arrShape): - def getShapeOfCudaIdx(cudaIdx): - if cudaIdx not in self._coordiantesNoGhostLayer: - return 1 - else: - return arrShape[self._coordiantesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers + """ + Determine grid and block size for kernel call + :param arrShape: the numeric (not symbolic) shape of the array + :return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks + the kernel should be started with + """ + + @abc.abstractmethod + def guard(self, kernelContent, arrShape): + """ + In some indexing schemes not all threads of a block execute the kernel content. + This function can return a Conditional ast node, defining this execution guard. + :param kernelContent: the actual kernel contents which can e.g. 
 
-        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
-                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
 
-    def guard(self, kernelContent, arrShape):
-        return kernelContent
+# -------------------------------------------- Implementations ---------------------------------------------------------
 
-    @property
-    def indexVariables(self):
-        return BLOCK_IDX + THREAD_IDX
 
+class BlockIndexing(AbstractIndexing):
+    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
 
-class BlockIndexing:
     def __init__(self, field, ghostLayers, blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
+        """
+        Creates a BlockIndexing instance
+        :param field: pystencils field (common to all Indexing classes)
+        :param ghostLayers: number of ghost layers (common to all Indexing classes)
+        :param blockSize: size of a CUDA block
+        :param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
+                                                  gets the largest number of threads
+        """
         if field.spatialDimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
 
         if permuteBlockSizeDependentOnLayout:
             blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)
 
-        self._blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
+        blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
+        self._blockSize = blockSize
+
         self._coordinates = [blockIndex * bs + threadIndex + ghostLayers
                              for blockIndex, bs, threadIndex in zip(BLOCK_IDX, blockSize, THREAD_IDX)]
 
         self._coordinates = self._coordinates[:field.spatialDimensions]
         self._ghostLayers = ghostLayers
 
+    @property
+    def coordinates(self):
+        return self._coordinates
+
+    def getCallParameters(self, arrShape):
+        dim = len(self._coordinates)
+        arrShape = arrShape[:dim]
+        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
+        extendBs = (1,) * (3 - len(self._blockSize))
+        extendGr = (1,) * (3 - len(grid))
+        return {'block': self._blockSize + extendBs,
+                'grid': grid + extendGr}
+
+    def guard(self, kernelContent, arrShape):
+        dim = len(self._coordinates)
+        arrShape = arrShape[:dim]
+        conditions = [c < shapeComponent - self._ghostLayers
+                      for c, shapeComponent in zip(self._coordinates, arrShape)]
+        condition = conditions[0]
+        for c in conditions[1:]:
+            condition = sp.And(condition, c)
+        return Block([Conditional(condition, kernelContent)])
+
     @staticmethod
     def limitBlockSizeToDeviceMaximum(blockSize):
+        """
+        Changes the blockSize to match device limits, according to the following rules:
+        1) if the total number of threads is too big for the current device, the biggest coordinate is divided by 2.
+        2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
+           component is multiplied by 2, such that the total number of threads stays the same.
+        Returns the altered blockSize.
+        """
         # Get device limits
         da = cuda.device_attribute
         device = cuda.Context.get_device()
@@ -117,7 +158,7 @@ class BlockIndexing:
 
     @staticmethod
     def permuteBlockSizeAccordingToLayout(blockSize, layout):
-        """The fastest coordinate gets the biggest block dimension"""
+        """Returns a modified blockSize such that the fastest coordinate gets the biggest block dimension"""
         sortedBlockSize = list(sorted(blockSize, reverse=True))
         while len(sortedBlockSize) > len(layout):
             sortedBlockSize[0] *= sortedBlockSize[-1]
@@ -128,34 +169,42 @@ class BlockIndexing:
             result[l] = bs
         return tuple(result[:len(layout)])
 
+
+class LineIndexing(AbstractIndexing):
+    """
+    Indexing scheme that assigns the innermost 'line', i.e. the elements which are adjacent in memory, to a 1D CUDA block.
+    The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are mapped to blockIdx.{x,y,z}.
+    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger than the
+    maximum number of threads allowed in a CUDA block (which depends on the device).
+    """
+
+    def __init__(self, field, ghostLayers):
+        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
+        if field.spatialDimensions > 4:
+            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
+
+        coordinates = availableIndices[:field.spatialDimensions]
+
+        fastestCoordinate = field.layout[-1]
+        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]
+
+        self._coordinatesNoGhostLayer = coordinates
+        self._coordinates = [i + ghostLayers for i in coordinates]
+        self._ghostLayers = ghostLayers
+
     @property
     def coordinates(self):
         return self._coordinates
 
     def getCallParameters(self, arrShape):
-        dim = len(self._coordinates)
-        arrShape = arrShape[:dim]
-        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
-        extendBs = (1,) * (3 - len(self._blockSize))
-        extendGr = (1,) * (3 - len(grid))
-        return {'block': self._blockSize + extendBs,
-                'grid': grid + extendGr}
-
-    def guard(self, kernelContent, arrShape):
-        dim = len(self._coordinates)
-        arrShape = arrShape[:dim]
-        conditions = [c < shapeComponent - self._ghostLayers
-                      for c, shapeComponent in zip(self._coordinates, arrShape)]
-        condition = conditions[0]
-        for c in conditions[1:]:
-            condition = sp.And(condition, c)
-        return Block([Conditional(condition, kernelContent)])
+        def getShapeOfCudaIdx(cudaIdx):
+            if cudaIdx not in self._coordinatesNoGhostLayer:
+                return 1
+            else:
+                return arrShape[self._coordinatesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers
 
-    @property
-    def indexVariables(self):
-        return BLOCK_IDX + THREAD_IDX
+        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
+                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
 
-if __name__ == '__main__':
-    bs = BlockIndexing.permuteBlockSizeAccordingToLayout((256, 8, 1), (0,))
-    bs = BlockIndexing.limitBlockSizeToDeviceMaximum(bs)
-    print(bs)
+    def guard(self, kernelContent, arrShape):
+        return kernelContent
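# Worked example (illustrative numbers) for BlockIndexing.getCallParameters:
# with the default blockSize (256, 8, 1) and a 3D array of shape (512, 512, 100),
# the grid size is the per-coordinate ceiling of shape / blockSize.
import math

arrShape, blockSize = (512, 512, 100), (256, 8, 1)
grid = tuple(math.ceil(length / bs) for length, bs in zip(arrShape, blockSize))
assert grid == (2, 64, 100)   # launched as {'block': (256, 8, 1), 'grid': (2, 64, 100)}
# Threads of the partially filled blocks are masked out by the guard() condition.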
diff --git a/gpucuda/kernelcreation.py b/gpucuda/kernelcreation.py
index 5d4a3ac5b53a05ad06ce1fe14b02ddb58c99ffab..ea06701ca9ccba1ec517024a647afa60c70ffe2e 100644
--- a/gpucuda/kernelcreation.py
+++ b/gpucuda/kernelcreation.py
@@ -1,4 +1,4 @@
-from pystencils.gpucuda.indexing import BlockIndexing, LineIndexing
+from pystencils.gpucuda.indexing import BlockIndexing
 from pystencils.transformations import resolveFieldAccesses, typeAllEquations, parseBasePointerInfo, getCommonShape
 from pystencils.astnodes import Block, KernelFunction, SympyAssignment
 from pystencils.types import TypedSymbol, BasicType, StructType
@@ -15,7 +15,7 @@ def createCUDAKernel(listOfEquations, functionName="kernel", typeForSymbol=None,
         fieldAccesses.update(eq.atoms(Field.Access))
 
     requiredGhostLayers = max([fa.requiredGhostLayers for fa in fieldAccesses])
-    indexing = indexingCreator(list(fieldsRead)[0], requiredGhostLayers)
+    indexing = indexingCreator(field=list(fieldsRead)[0], ghostLayers=requiredGhostLayers)
 
     block = Block(assignments)
     block = indexing.guard(block, getCommonShape(allFields))
@@ -63,7 +63,7 @@ def createdIndexedCUDAKernel(listOfEquations, indexFields, functionName="kernel"
     coordinateSymbolAssignments = [getCoordinateSymbolAssignment(n) for n in coordinateNames[:spatialCoordinates]]
     coordinateTypedSymbols = [eq.lhs for eq in coordinateSymbolAssignments]
 
-    indexing = indexingCreator(list(indexFields)[0], ghostLayers=0)
+    indexing = indexingCreator(field=list(indexFields)[0], ghostLayers=0)
 
     functionBody = Block(coordinateSymbolAssignments + assignments)
     functionBody = indexing.guard(functionBody, getCommonShape(indexFields))
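# Because the indexing object is now constructed with keyword arguments
# (field=..., ghostLayers=...), any callable accepting these keywords can serve
# as indexingCreator. A hypothetical sketch using functools.partial to preset
# the optional BlockIndexing parameters (the parameter values are examples):
import functools
from pystencils.gpucuda.indexing import BlockIndexing

myIndexingCreator = functools.partial(BlockIndexing, blockSize=(128, 4, 2),
                                      permuteBlockSizeDependentOnLayout=False)
# e.g. createCUDAKernel(..., indexingCreator=myIndexingCreator) would then call
# myIndexingCreator(field=..., ghostLayers=...) internally.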