Commit eee767f9 authored by Martin Bauer
Browse files

Fixed CUDA resource problems in GPU test (too many registers used)

-> smaller block
parent 0ff9a94e
......@@ -41,7 +41,7 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
shape = _checkArguments(parameters, fullArguments)
indexing = kernelFunctionNode.indexing
dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape, func)
args = _buildNumpyArgumentList(parameters, fullArguments)
cache[key] = (args, dictWithBlockAndThreadNumbers)
......
import abc
import sympy as sp
import math
import pycuda.driver as cuda
......@@ -30,10 +31,12 @@ class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
return BLOCK_IDX + THREAD_IDX
@abc.abstractmethod
def getCallParameters(self, arrShape):
def getCallParameters(self, arrShape, functionToCall):
"""
Determine grid and block size for kernel call
:param arrShape: the numeric (not symbolic) shape of the array
:param functionToCall: compiled kernel function that should be called. Use this object to get information
about required resources like the number of registers
:return: dict with keys 'block' and 'grid' with tuple values for the number of (x,y,z) threads per block and
blocks per grid the kernel should be started with
"""
......@@ -84,7 +87,7 @@ class BlockIndexing(AbstractIndexing):
return coordinates[:self._dim]
def getCallParameters(self, arrShape):
def getCallParameters(self, arrShape, functionToCall):
substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}
widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
......@@ -94,6 +97,7 @@ class BlockIndexing(AbstractIndexing):
grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(widths, self._blockSize))
extendBs = (1,) * (3 - len(self._blockSize))
extendGr = (1,) * (3 - len(grid))
return {'block': self._blockSize + extendBs,
'grid': grid + extendGr}
......@@ -160,6 +164,32 @@ class BlockIndexing(AbstractIndexing):
return tuple(blockSize)
@staticmethod
def limitBlockSizeByRegisterRestriction(blockSize, requiredRegistersPerThread, device=None):
    """Shrinks the blockSize if there are too many registers used per multiprocessor.

    This is not done automatically, since the requiredRegistersPerThread are not known before compilation.
    They can be obtained by ``func.num_regs`` from a pycuda function.

    The passed blockSize is never modified: a tuple input yields a new tuple, any other sequence
    yields a new list.

    :param blockSize: sequence with the number of threads per block in each dimension
    :param requiredRegistersPerThread: number of registers a single thread of the kernel needs
    :param device: pycuda device whose register limit is queried; if None, the device returned by
                   ``cuda.Context.get_device()`` is used
    :return: blockSize with its largest entry halved repeatedly until the registers needed by all threads
             of one block fit into the registers available per multiprocessor
    :raises ValueError: if the largest entry can not be halved any further (it is smaller than 2) and
                        the block still needs too many registers
    """
    if device is None:
        device = cuda.Context.get_device()
    availableRegistersPerMP = device.get_attribute(cuda.device_attribute.MAX_REGISTERS_PER_MULTIPROCESSOR)

    # Work on a private list: tuples do not support item assignment, and shrinking the caller's own
    # list in place would be a hidden side effect.
    block = list(blockSize)
    while True:
        numThreads = 1
        for t in block:
            numThreads *= t
        if numThreads * requiredRegistersPerThread <= availableRegistersPerMP:
            return tuple(block) if isinstance(blockSize, tuple) else block

        largestEntryIdx = max(range(len(block)), key=block.__getitem__)
        # Explicit check instead of assert: with asserts disabled (python -O) an entry of 1 would be
        # halved to 0, which makes the thread count 0 and returns an invalid block.
        if block[largestEntryIdx] < 2:
            raise ValueError("Cannot shrink block size %s any further: %d registers per thread exceed the %d "
                             "registers available per multiprocessor"
                             % (tuple(blockSize), requiredRegistersPerThread, availableRegistersPerMP))
        block[largestEntryIdx] //= 2
@staticmethod
def permuteBlockSizeAccordingToLayout(blockSize, layout):
"""Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
......@@ -200,7 +230,7 @@ class LineIndexing(AbstractIndexing):
def coordinates(self):
return [i + offset for i, offset in zip(self._coordinates, _getStartFromSlice(self._iterationSlice))]
def getCallParameters(self, arrShape):
def getCallParameters(self, arrShape, functionToCall):
substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}
widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
......@@ -243,4 +273,5 @@ def _getEndFromSlice(iterationSlice, arrShape):
else:
assert isinstance(sliceComponent, int)
res.append(sliceComponent + 1)
return res
\ No newline at end of file
return res
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment