import abc
import math

import sympy as sp
import pycuda.driver as cuda
import pycuda.autoinit

from pystencils.astnodes import Conditional, Block

BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))


class AbstractIndexing(abc.ABC):
    """
    Abstract base class for all indexing classes. An indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system: it calculates field indices from CUDA's thread and
    block indices, and it computes the number of blocks and threads a kernel is started with. Every
    indexing class is constructed with a pystencils field and the number of ghost layers; any further
    optional parameters must have default values. A usage sketch follows this class definition.
    """

    @property
    @abc.abstractmethod
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
        These symbolic indices can be obtained via the property `indexVariables`"""

    @property
    def indexVariables(self):
        """Sympy symbols for CUDA's block and thread indices"""
        return BLOCK_IDX + THREAD_IDX

    @abc.abstractmethod
    def getCallParameters(self, arrShape):
        """
        Determines grid and block size for the kernel call
        :param arrShape: the numeric (not symbolic) shape of the array
        :return: dict with keys 'block' and 'grid' with tuple values for the number of (x,y,z) threads
                 and blocks the kernel should be started with
        """

    @abc.abstractmethod
    def guard(self, kernelContent, arrShape):
        """
        In some indexing schemes not all threads of a block execute the kernel content.
        This function can return a Conditional ast node, defining this execution guard.
        :param kernelContent: the actual kernel contents which can e.g. be put into the Conditional node as true block
        :param arrShape: the numeric or symbolic shape of the field
        :return: ast node, which is put inside the kernel function
        """


# -------------------------------------------- Implementations ---------------------------------------------------------


class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks."""

    def __init__(self, field, ghostLayers, blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
        """
        Creates
        :param field: pystencils field (common to all Indexing classes)
        :param ghostLayers: number of ghost layers (common to all Indexing classes)
        :param blockSize: size of a CUDA block
        :param permuteBlockSizeDependentOnLayout: if True the blockSize is permuted such that the fastest coordinate
                                                  gets the largest amount of threads
        """
        if field.spatialDimensions > 3:
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

        if permuteBlockSizeDependentOnLayout:
            blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)

        blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
        self._blockSize = blockSize

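        # Global coordinate = block offset + thread index + lower ghost layer offset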
        self._coordinates = [blockIndex * bs + threadIndex + gl[0]
                             for blockIndex, bs, threadIndex, gl in zip(BLOCK_IDX, blockSize, THREAD_IDX, ghostLayers)]

        self._coordinates = self._coordinates[:field.spatialDimensions]
        self._ghostLayers = ghostLayers

    @property
    def coordinates(self):
        return self._coordinates

    def getCallParameters(self, arrShape):
        dim = len(self._coordinates)
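        # Shrink the shape by the ghost layers on both sides to get the inner iteration domain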
        arrShape = [s - (gl[0] + gl[1]) for s, gl in zip(arrShape[:dim], self._ghostLayers)]
        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
        extendBs = (1,) * (3 - len(self._blockSize))
        extendGr = (1,) * (3 - len(grid))
        return {'block': self._blockSize + extendBs,
                'grid': grid + extendGr}

    def guard(self, kernelContent, arrShape):
        dim = len(self._coordinates)
        arrShape = arrShape[:dim]
        conditions = [c < shapeComponent - gl[1]
                      for c, shapeComponent, gl in zip(self._coordinates, arrShape, self._ghostLayers)]
        condition = sp.And(*conditions)
        return Block([Conditional(condition, kernelContent)])
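
    # A hypothetical worked example for this scheme: for a 2D field of shape
    # (514, 514) with one ghost layer on each side and the default block size
    # (256, 8, 1), the inner domain is 512x512, so
    #
    #     grid = (ceil(512 / 256), ceil(512 / 8)) = (2, 64)
    #
    # and getCallParameters returns {'block': (256, 8, 1), 'grid': (2, 64, 1)}.
    # The guard masks the threads of the last block in each direction that would
    # otherwise run past the upper ghost layer boundary.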

    @staticmethod
    def limitBlockSizeToDeviceMaximum(blockSize):
        """
        Changes blocksize according to match device limits according to the following rules:
        1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
        2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
           component is multiplied by 2, such that the total amount of threads stays the same
        Returns the altered blockSize
        """
        # Get device limits
        da = cuda.device_attribute
        device = cuda.Context.get_device()

        blockSize = list(blockSize)
        maxThreads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        maxBlockSize = [device.get_attribute(a)
                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

        def getIndexOfTooBigElement(blockSize):
            for i, bs in enumerate(blockSize):
                if bs > maxBlockSize[i]:
                    return i
            return None

        def getIndexOfTooSmallElement(blockSize):
            # Find an entry that can still be doubled without exceeding its per-dimension limit
            for i, bs in enumerate(blockSize):
                if 2 * bs <= maxBlockSize[i]:
                    return i
            return None

        # Reduce the total number of threads if necessary
        while prod(blockSize) > maxThreads:
            itemToReduce = blockSize.index(max(blockSize))
            for i, bs in enumerate(blockSize):
                if bs > maxBlockSize[i]:
                    itemToReduce = i
            blockSize[itemToReduce] //= 2

        # Cap individual elements
        tooBigElementIndex = getIndexOfTooBigElement(blockSize)
        while tooBigElementIndex is not None:
            tooSmallElementIndex = getIndexOfTooSmallElement(blockSize)
            blockSize[tooSmallElementIndex] *= 2
            blockSize[tooBigElementIndex] //= 2
            tooBigElementIndex = getIndexOfTooBigElement(blockSize)

        return tuple(blockSize)
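
    # A hypothetical trace of the two rules above, assuming a device with
    # MAX_THREADS_PER_BLOCK = 1024 and per-dimension limits (1024, 1024, 64):
    #
    #     (1024, 8, 1) -> 8192 threads in total, so rule 1 halves the biggest
    #                     entry until the limit holds      -> (128, 8, 1)
    #     (1, 1, 128)  -> within the total limit, but 128 > 64 in z, so rule 2
    #                     halves z and doubles the x entry -> (2, 1, 64)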

    @staticmethod
    def permuteBlockSizeAccordingToLayout(blockSize, layout):
        """Returns modified blockSize such that the fastest coordinate gets the biggest block dimension"""
        sortedBlockSize = list(sorted(blockSize, reverse=True))
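        # If the block has more entries than the field has dimensions, fold the
        # surplus entries into the largest entry so no threads are lost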
        while len(sortedBlockSize) > len(layout):
            sortedBlockSize[0] *= sortedBlockSize[-1]
            sortedBlockSize = sortedBlockSize[:-1]

        result = list(blockSize)
        for l, bs in zip(reversed(layout), sortedBlockSize):
            result[l] = bs
        return tuple(result[:len(layout)])
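
    # Hypothetical examples for the permutation above, using the default block size:
    #
    #     permuteBlockSizeAccordingToLayout((256, 8, 1), layout=(0, 1, 2)) == (1, 8, 256)
    #     permuteBlockSizeAccordingToLayout((256, 8, 1), layout=(0, 1))    == (8, 256)
    #
    # With C layout (0, 1, 2) the last coordinate is the fastest and receives the
    # 256-thread dimension; for a 2D field the surplus entry is folded in first.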


class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a 1D CUDA block.
    The fastest coordinate is indexed with threadIdx.x, the remaining coordinates are mapped to blockIdx.{x,y,z}.
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger
    than the maximum number of threads per block (which depends on the device).
    """

    def __init__(self, field, ghostLayers):
        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatialDimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        coordinates = availableIndices[:field.spatialDimensions]

        fastestCoordinate = field.layout[-1]
        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]

        self._coordinatesNoGhostLayer = coordinates
        self._coordinates = [i + gl[0] for i, gl in zip(coordinates, ghostLayers)]
        self._ghostLayers = ghostLayers

    @property
    def coordinates(self):
        return self._coordinates

    def getCallParameters(self, arrShape):
        def getShapeOfCudaIdx(cudaIdx):
            if cudaIdx not in self._coordinatesNoGhostLayer:
                return 1
            else:
                idx = self._coordinatesNoGhostLayer.index(cudaIdx)
                return arrShape[idx] - (self._ghostLayers[idx][0] + self._ghostLayers[idx][1])

        return {'block': tuple(getShapeOfCudaIdx(idx) for idx in THREAD_IDX),
                'grid': tuple(getShapeOfCudaIdx(idx) for idx in BLOCK_IDX)}

    def guard(self, kernelContent, arrShape):
        return kernelContent
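
# A hypothetical, runnable usage sketch (it requires a CUDA device, since importing
# this module initializes pycuda). The stand-in field below only mimics the two
# attributes this module reads (spatialDimensions, layout); real code would pass
# a pystencils field instead.
if __name__ == '__main__':
    from collections import namedtuple

    MockField = namedtuple('MockField', ['spatialDimensions', 'layout'])
    mockField = MockField(spatialDimensions=2, layout=(0, 1))  # C layout, second coordinate is fastest

    lineIndexing = LineIndexing(mockField, ghostLayers=[(1, 1), (1, 1)])
    # For a 514x514 array with one ghost layer on each side, every inner line of
    # 512 elements becomes one 512-thread block, one block per line:
    print(lineIndexing.getCallParameters((514, 514)))
    # -> {'block': (512, 1, 1), 'grid': (512, 1, 1)}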