indexing.py 12.7 KB
Newer Older
1
import abc
2

3
4
5
import sympy as sp

from pystencils.astnodes import Conditional, Block
6
from pystencils.slicing import normalizeSlice
Martin Bauer's avatar
Martin Bauer committed
7
from pystencils.data_types import TypedSymbol, create_type
Martin Bauer's avatar
Martin Bauer committed
8
from functools import partial
9

10
11
AUTO_BLOCKSIZE_LIMITING = True

Martin Bauer's avatar
Martin Bauer committed
12
13
14
15
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
16
17


18
19
20
21
22
class AbstractIndexing(abc.ABCMeta('ABC', (object,), {})):
    """
    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
23
    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
24
    """
25

26
27
28
29
    @abc.abstractproperty
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
        These symbolic indices can be obtained with the method `indexVariables` """
30
31

    @property
32
    def indexVariables(self):
33
34
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
35

36
    @abc.abstractmethod
37
    def getCallParameters(self, arrShape):
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
        """
        Determine grid and block size for kernel call
        :param arrShape: the numeric (not symbolic) shape of the array
        :return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
                 the kernel should be started with
        """

    @abc.abstractmethod
    def guard(self, kernelContent, arrShape):
        """
        In some indexing schemes not all threads of a block execute the kernel content.
        This function can return a Conditional ast node, defining this execution guard.
        :param kernelContent: the actual kernel contents which can e.g. be put into the Conditional node as true block
        :param arrShape: the numeric or symbolic shape of the field
        :return: ast node, which is put inside the kernel function
        """
54
55


56
# -------------------------------------------- Implementations ---------------------------------------------------------
57
58


59
60
class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""
61

62
63
    def __init__(self, field, iterationSlice=None,
                 blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
64
65
66
        """
        Creates
        :param field: pystencils field (common to all Indexing classes)
67
        :param iterationSlice: slice that defines rectangular subarea which is iterated over
68
69
70
        :param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
                                                  gets the largest amount of threads
        """
71
72
73
74
75
76
        if field.spatialDimensions > 3:
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

        if permuteBlockSizeDependentOnLayout:
            blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)

77
78
79
        if AUTO_BLOCKSIZE_LIMITING:
            blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
            
80
        self._blockSize = blockSize
81
82
83
        self._iterationSlice = normalizeSlice(iterationSlice, field.spatialShape)
        self._dim = field.spatialDimensions
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatialShape]
84

85
86
    @property
    def coordinates(self):
87
88
89
90
91
        offsets = _getStartFromSlice(self._iterationSlice)
        coordinates = [blockIndex * bs + threadIdx + off
                       for blockIndex, bs, threadIdx, off in zip(BLOCK_IDX, self._blockSize, THREAD_IDX, offsets)]

        return coordinates[:self._dim]
92

93
    def getCallParameters(self, arrShape):
94
95
        substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}

96
97
        widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
                                                    _getEndFromSlice(self._iterationSlice, arrShape))]
98
99
        widths = sp.Matrix(widths).subs(substitutionDict)

100
        grid = tuple(sp.ceiling(length / blockSize) for length, blockSize in zip(widths, self._blockSize))
101
102
        extendBs = (1,) * (3 - len(self._blockSize))
        extendGr = (1,) * (3 - len(grid))
103

104
105
106
107
        return {'block': self._blockSize + extendBs,
                'grid': grid + extendGr}

    def guard(self, kernelContent, arrShape):
108
        arrShape = arrShape[:self._dim]
109
        conditions = [c < end
110
                      for c, end in zip(self.coordinates, _getEndFromSlice(self._iterationSlice, arrShape))]
111
112
113
114
115
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernelContent)])

116
117
    @staticmethod
    def limitBlockSizeToDeviceMaximum(blockSize):
118
119
120
121
122
123
124
        """
        Changes blocksize according to match device limits according to the following rules:
        1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
        2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
           component is multiplied by 2, such that the total amount of threads stays the same
        Returns the altered blockSize
        """
125
        # Get device limits
126
127
128
        import pycuda.driver as cuda
        import pycuda.autoinit

129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
        da = cuda.device_attribute
        device = cuda.Context.get_device()

        blockSize = list(blockSize)
        maxThreads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        maxBlockSize = [device.get_attribute(a)
                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

        def getIndexOfTooBigElement(blockSize):
            for i, bs in enumerate(blockSize):
                if bs > maxBlockSize[i]:
                    return i
            return None

        def getIndexOfTooSmallElement(blockSize):
            for i, bs in enumerate(blockSize):
                if bs // 2 <= maxBlockSize[i]:
                    return i
            return None

        # Reduce the total number of threads if necessary
        while prod(blockSize) > maxThreads:
            itemToReduce = blockSize.index(max(blockSize))
            for i, bs in enumerate(blockSize):
                if bs > maxBlockSize[i]:
                    itemToReduce = i
            blockSize[itemToReduce] //= 2

        # Cap individual elements
        tooBigElementIndex = getIndexOfTooBigElement(blockSize)
        while tooBigElementIndex is not None:
            tooSmallElementIndex = getIndexOfTooSmallElement(blockSize)
            blockSize[tooSmallElementIndex] *= 2
            blockSize[tooBigElementIndex] //= 2
            tooBigElementIndex = getIndexOfTooBigElement(blockSize)

        return tuple(blockSize)

173
174
175
176
177
178
179
    @staticmethod
    def limitBlockSizeByRegisterRestriction(blockSize, requiredRegistersPerThread, device=None):
        """Shrinks the blockSize if there are too many registers used per multiprocessor.
        This is not done automatically, since the requiredRegistersPerThread are not known before compilation.
        They can be obtained by ``func.num_regs`` from a pycuda function.
        :returns smaller blockSize if too many registers are used.
        """
180
181
182
        import pycuda.driver as cuda
        import pycuda.autoinit

183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
        availableRegistersPerMP = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)

        block = blockSize

        while True:
            numThreads = 1
            for t in block:
                numThreads *= t
            requiredRegistersPerMT = numThreads * requiredRegistersPerThread
            if requiredRegistersPerMT <= availableRegistersPerMP:
                return block
            else:
                largestGridEntryIdx = max(range(len(block)), key=lambda e: block[e])
                assert block[largestGridEntryIdx] >= 2
                block[largestGridEntryIdx] //= 2

202
203
    @staticmethod
    def permuteBlockSizeAccordingToLayout(blockSize, layout):
204
        """Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
205
206
207
208
209
210
211
212
213
214
        sortedBlockSize = list(sorted(blockSize, reverse=True))
        while len(sortedBlockSize) > len(layout):
            sortedBlockSize[0] *= sortedBlockSize[-1]
            sortedBlockSize = sortedBlockSize[:-1]

        result = list(blockSize)
        for l, bs in zip(reversed(layout), sortedBlockSize):
            result[l] = bs
        return tuple(result[:len(layout)])

215
216
217
218
219
220
221
222
223

class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
    The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are mapped to blockIdx.{x,y,z}
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """

224
    def __init__(self, field, iterationSlice=None):
225
226
227
228
229
230
231
232
233
        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatialDimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        coordinates = availableIndices[:field.spatialDimensions]

        fastestCoordinate = field.layout[-1]
        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]

234
        self._coordinates = coordinates
235
236
        self._iterationSlice = normalizeSlice(iterationSlice, field.spatialShape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatialShape]
237

238
239
    @property
    def coordinates(self):
240
        return [i + offset for i, offset in zip(self._coordinates, _getStartFromSlice(self._iterationSlice))]
241

242
    def getCallParameters(self, arrShape):
243
244
        substitutionDict = {sym: value for sym, value in zip(self._symbolicShape, arrShape) if sym is not None}

245
246
        widths = [end - start for start, end in zip(_getStartFromSlice(self._iterationSlice),
                                                    _getEndFromSlice(self._iterationSlice, arrShape))]
247
        widths = sp.Matrix(widths).subs(substitutionDict)
248

249
        def getShapeOfCudaIdx(cudaIdx):
250
            if cudaIdx not in self._coordinates:
251
252
                return 1
            else:
253
                idx = self._coordinates.index(cudaIdx)
254
                return widths[idx]
255

256
257
        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}
258

259
260
    def guard(self, kernelContent, arrShape):
        return kernelContent
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284


# -------------------------------------- Helper functions --------------------------------------------------------------

def _getStartFromSlice(iterationSlice):
    res = []
    for sliceComponent in iterationSlice:
        if type(sliceComponent) is slice:
            res.append(sliceComponent.start if sliceComponent.start is not None else 0)
        else:
            assert isinstance(sliceComponent, int)
            res.append(sliceComponent)
    return res


def _getEndFromSlice(iterationSlice, arrShape):
    iterSlice = normalizeSlice(iterationSlice, arrShape)
    res = []
    for sliceComponent in iterSlice:
        if type(sliceComponent) is slice:
            res.append(sliceComponent.stop)
        else:
            assert isinstance(sliceComponent, int)
            res.append(sliceComponent + 1)
285
286
    return res

Martin Bauer's avatar
Martin Bauer committed
287
288
289
290
291
292
293
294
295
296
297
298
299
300

def indexingCreatorFromParams(gpuIndexing, gpuIndexingParams):
    if isinstance(gpuIndexing, str):
        if gpuIndexing == 'block':
            indexingCreator = BlockIndexing
        elif gpuIndexing == 'line':
            indexingCreator = LineIndexing
        else:
            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpuIndexing,))
        if gpuIndexingParams:
            indexingCreator = partial(indexingCreator, **gpuIndexingParams)
        return indexingCreator
    else:
        return gpuIndexing