From eee767f9e825a0097f07cbfda9b6e09578c770a7 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 11 Apr 2017 09:27:52 +0200
Subject: [PATCH] Fixed CUDA resource problems in GPU test (too many registers
 used)

-> smaller block
---
 gpucuda/cudajit.py  |  2 +-
 gpucuda/indexing.py | 39 +++++++++++++++++++++++++++++++++++----
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py
index f57ded792..0b18c1359 100644
--- a/gpucuda/cudajit.py
+++ b/gpucuda/cudajit.py
@@ -41,7 +41,7 @@ def makePythonFunction(kernelFunctionNode, argumentDict={}):
             shape = _checkArguments(parameters, fullArguments)
 
             indexing = kernelFunctionNode.indexing
-            dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape)
+            dictWithBlockAndThreadNumbers = indexing.getCallParameters(shape, func)
 
             args = _buildNumpyArgumentList(parameters, fullArguments)
             cache[key] = (args, dictWithBlockAndThreadNumbers)
diff --git a/gpucuda/indexing.py b/gpucuda/indexing.py
index 651340d6d..34de23a2e 100644
--- a/gpucuda/indexing.py
+++ b/gpucuda/indexing.py
@@ -1,4 +1,5 @@
 import abc
+
 import sympy as sp
 import math
 import pycuda.driver as cuda
@@ -30,10 +31,12 @@ class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
         return BLOCK_IDX + THREAD_IDX
 
     @abc.abstractmethod
-    def getCallParameters(self, arrShape):
+    def getCallParameters(self, arrShape, functionToCall):
         """
         Determine grid and block size for kernel call
         :param arrShape: the numeric (not symbolic) shape of the array
+        :param functionToCall: compile kernel function that should be called. Use this object to get information
+                               about required resources like number of registers
         :return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
                  the kernel should be started with
         """
@@ -84,7 +87,7 @@ class BlockIndexing(AbstractIndexing):
 
         return coordinates[:self._dim]
 
-    def getCallParameters(self, arrShape):
+    def getCallParameters(self, arrShape, functionToCall):
         substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}
 
         widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
@@ -94,6 +97,7 @@ class BlockIndexing(AbstractIndexing):
         grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(widths, self._blockSize))
         extendBs = (1,) * (3 - len(self._blockSize))
         extendGr = (1,) * (3 - len(grid))
+
         return {'block': self._blockSize + extendBs,
                 'grid': grid + extendGr}
 
@@ -160,6 +164,32 @@ class BlockIndexing(AbstractIndexing):
 
         return tuple(blockSize)
 
+    @staticmethod
+    def limitBlockSizeByRegisterRestriction(blockSize, requiredRegistersPerThread, device=None):
+        """Shrinks the blockSize if there are too many registers used per multiprocessor.
+        This is not done automatically, since the requiredRegistersPerThread are not known before compilation.
+        They can be obtained by ``func.num_regs`` from a pycuda function.
+        :returns smaller blockSize if too many registers are used.
+        """
+        da = cuda.device_attribute
+        if device is None:
+            device = cuda.Context.get_device()
+        availableRegistersPerMP = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
+
+        block = blockSize
+
+        while True:
+            numThreads = 1
+            for t in block:
+                numThreads *= t
+            requiredRegistersPerMT = numThreads * requiredRegistersPerThread
+            if requiredRegistersPerMT <= availableRegistersPerMP:
+                return block
+            else:
+                largestGridEntryIdx = max(range(len(block)), key=lambda e: block[e])
+                assert block[largestGridEntryIdx] >= 2
+                block[largestGridEntryIdx] //= 2
+
     @staticmethod
     def permuteBlockSizeAccordingToLayout(blockSize, layout):
         """Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
@@ -200,7 +230,7 @@ class LineIndexing(AbstractIndexing):
     def coordinates(self):
         return [i + offset for i, offset in zip(self._coordinates, _getStartFromSlice(self._iterationSlice))]
 
-    def getCallParameters(self, arrShape):
+    def getCallParameters(self, arrShape, functionToCall):
         substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}
 
         widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
@@ -243,4 +273,5 @@ def _getEndFromSlice(iterationSlice, arrShape):
         else:
             assert isinstance(sliceComponent, int)
             res.append(sliceComponent + 1)
-    return res
\ No newline at end of file
+    return res
+
-- 
GitLab