indexing.py 5.85 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import sympy as sp
import math
import pycuda.driver as cuda
import pycuda.autoinit

from pystencils.astnodes import Conditional, Block

BLOCK_IDX = list(sp.symbols("blockIdx.x blockIdx.y blockIdx.z"))
THREAD_IDX = list(sp.symbols("threadIdx.x threadIdx.y threadIdx.z"))

# Part 1:
#  given a field and the number of ghost layers, return the x, y and z coordinates
#  dependent on CUDA thread and block indices

# Part 2:
#  given the actual field size, determine the call parameters i.e. # of blocks and threads


class LineIndexing:
    def __init__(self, field, ghostLayers):
        availableIndices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatialDimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        coordinates = availableIndices[:field.spatialDimensions]

        fastestCoordinate = field.layout[-1]
        coordinates[0], coordinates[fastestCoordinate] = coordinates[fastestCoordinate], coordinates[0]

        self._coordiantesNoGhostLayer = coordinates
        self._coordinates = [i + ghostLayers for i in coordinates]
        self._ghostLayers = ghostLayers

    @property
    def coordinates(self):
        return self._coordinates

    def getCallParameters(self, arrShape):
        def getShapeOfCudaIdx(cudaIdx):
            if cudaIdx not in self._coordiantesNoGhostLayer:
                return 1
            else:
                return arrShape[self._coordiantesNoGhostLayer.index(cudaIdx)] - 2 * self._ghostLayers

        return {'block': tuple([getShapeOfCudaIdx(idx) for idx in THREAD_IDX]),
                'grid': tuple([getShapeOfCudaIdx(idx) for idx in BLOCK_IDX])}

    def guard(self, kernelContent, arrShape):
        return kernelContent

    @property
    def indexVariables(self):
        return BLOCK_IDX + THREAD_IDX


class BlockIndexing:
    def __init__(self, field, ghostLayers, blockSize=(256, 8, 1), permuteBlockSizeDependentOnLayout=True):
        if field.spatialDimensions > 3:
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

        if permuteBlockSizeDependentOnLayout:
            blockSize = self.permuteBlockSizeAccordingToLayout(blockSize, field.layout)

        self._blockSize = self.limitBlockSizeToDeviceMaximum(blockSize)
        self._coordinates = [blockIndex * bs + threadIndex + ghostLayers
                             for blockIndex, bs, threadIndex in zip(BLOCK_IDX, blockSize, THREAD_IDX)]

        self._coordinates = self._coordinates[:field.spatialDimensions]
        self._ghostLayers = ghostLayers

    @staticmethod
    def limitBlockSizeToDeviceMaximum(blockSize):
        # Get device limits
        da = cuda.device_attribute
        device = cuda.Context.get_device()

        blockSize = list(blockSize)
        maxThreads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        maxBlockSize = [device.get_attribute(a)
                        for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

        def getIndexOfTooBigElement(blockSize):
            for i, bs in enumerate(blockSize):
                if bs > maxBlockSize[i]:
                    return i
            return None

        def getIndexOfTooSmallElement(blockSize):
            for i, bs in enumerate(blockSize):
                if bs // 2 <= maxBlockSize[i]:
                    return i
            return None

        # Reduce the total number of threads if necessary
        while prod(blockSize) > maxThreads:
            itemToReduce = blockSize.index(max(blockSize))
            for i, bs in enumerate(blockSize):
                if bs > maxBlockSize[i]:
                    itemToReduce = i
            blockSize[itemToReduce] //= 2

        # Cap individual elements
        tooBigElementIndex = getIndexOfTooBigElement(blockSize)
        while tooBigElementIndex is not None:
            tooSmallElementIndex = getIndexOfTooSmallElement(blockSize)
            blockSize[tooSmallElementIndex] *= 2
            blockSize[tooBigElementIndex] //= 2
            tooBigElementIndex = getIndexOfTooBigElement(blockSize)

        return tuple(blockSize)

    @staticmethod
    def permuteBlockSizeAccordingToLayout(blockSize, layout):
        """The fastest coordinate gets the biggest block dimension"""
        sortedBlockSize = list(sorted(blockSize, reverse=True))
        while len(sortedBlockSize) > len(layout):
            sortedBlockSize[0] *= sortedBlockSize[-1]
            sortedBlockSize = sortedBlockSize[:-1]

        result = list(blockSize)
        for l, bs in zip(reversed(layout), sortedBlockSize):
            result[l] = bs
        return tuple(result[:len(layout)])

    @property
    def coordinates(self):
        return self._coordinates

    def getCallParameters(self, arrShape):
        dim = len(self._coordinates)
        arrShape = arrShape[:dim]
        grid = tuple(math.ceil(length / blockSize) for length, blockSize in zip(arrShape, self._blockSize))
        extendBs = (1,) * (3 - len(self._blockSize))
        extendGr = (1,) * (3 - len(grid))
        return {'block': self._blockSize + extendBs,
                'grid': grid + extendGr}

    def guard(self, kernelContent, arrShape):
        dim = len(self._coordinates)
        arrShape = arrShape[:dim]
        conditions = [c < shapeComponent - self._ghostLayers
                      for c, shapeComponent in zip(self._coordinates, arrShape)]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernelContent)])

    @property
    def indexVariables(self):
        return BLOCK_IDX + THREAD_IDX

if __name__ == '__main__':
    bs = BlockIndexing.permuteBlockSizeAccordingToLayout((256, 8, 1), (0,))
    bs = BlockIndexing.limitBlockSizeToDeviceMaximum(bs)
    print(bs)