indexing.py 13 KB
Newer Older
1
import abc
Martin Bauer's avatar
Martin Bauer committed
2
from typing import Tuple
3

4
5
6
import sympy as sp

from pystencils.astnodes import Conditional, Block
Martin Bauer's avatar
Martin Bauer committed
7
from pystencils.slicing import normalize_slice
Martin Bauer's avatar
Martin Bauer committed
8
from pystencils.data_types import TypedSymbol, create_type
Martin Bauer's avatar
Martin Bauer committed
9
from functools import partial
10

Martin Bauer's avatar
Martin Bauer committed
11
AUTO_BLOCK_SIZE_LIMITING = True
12

Martin Bauer's avatar
Martin Bauer committed
13
14
15
16
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
17
18


Martin Bauer's avatar
Martin Bauer committed
19
class AbstractIndexing(abc.ABC):
20
21
22
23
    """
    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
24
    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
25
    """
26

Martin Bauer's avatar
Martin Bauer committed
27
28
    @property
    @abc.abstractmethod
29
30
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
Martin Bauer's avatar
Martin Bauer committed
31
        These symbolic indices can be obtained with the method `index_variables` """
32
33

    @property
Martin Bauer's avatar
Martin Bauer committed
34
    def index_variables(self):
35
36
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
37

38
    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
39
    def call_parameters(self, arr_shape):
40
41
        """
        Determine grid and block size for kernel call
Martin Bauer's avatar
Martin Bauer committed
42
        :param arr_shape: the numeric (not symbolic) shape of the array
43
44
45
46
47
        :return: dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
                 the kernel should be started with
        """

    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
48
    def guard(self, kernel_content, arr_shape):
49
50
51
        """
        In some indexing schemes not all threads of a block execute the kernel content.
        This function can return a Conditional ast node, defining this execution guard.
Martin Bauer's avatar
Martin Bauer committed
52
53
        :param kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
        :param arr_shape: the numeric or symbolic shape of the field
54
55
        :return: ast node, which is put inside the kernel function
        """
56
57


58
# -------------------------------------------- Implementations ---------------------------------------------------------
59
60


61
class BlockIndexing(AbstractIndexing):
Martin Bauer's avatar
Martin Bauer committed
62
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
63

Martin Bauer's avatar
Martin Bauer committed
64
65
66
67
68
69
70
71
72
73
    Args:
        field: pystencils field (common to all Indexing classes)
        iteration_slice: slice that defines rectangular subarea which is iterated over
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
    """

    def __init__(self, field, iteration_slice=None,
                 block_size=(256, 8, 1), permute_block_size_dependent_on_layout=True):
        if field.spatial_dimensions > 3:
74
75
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
76
77
        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
78

Martin Bauer's avatar
Martin Bauer committed
79
80
        if AUTO_BLOCK_SIZE_LIMITING:
            block_size = self.limit_block_size_to_device_maximum(block_size)
81
            
Martin Bauer's avatar
Martin Bauer committed
82
83
84
85
        self._blockSize = block_size
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
86

87
88
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
89
        offsets = _get_start_from_slice(self._iterationSlice)
Martin Bauer's avatar
Martin Bauer committed
90
91
        coordinates = [block_index * bs + thread_idx + off
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, self._blockSize, THREAD_IDX, offsets)]
92
93

        return coordinates[:self._dim]
94

Martin Bauer's avatar
Martin Bauer committed
95
96
    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
97

Martin Bauer's avatar
Martin Bauer committed
98
99
100
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
101

Martin Bauer's avatar
Martin Bauer committed
102
103
        grid: Tuple[int, ...] = tuple(sp.ceiling(length / block_size)
                                      for length, block_size in zip(widths, self._blockSize))
Martin Bauer's avatar
Martin Bauer committed
104
105
        extend_bs = (1,) * (3 - len(self._blockSize))
        extend_gr = (1,) * (3 - len(grid))
106

Martin Bauer's avatar
Martin Bauer committed
107
108
        return {'block': self._blockSize + extend_bs,
                'grid': grid + extend_gr}
109

Martin Bauer's avatar
Martin Bauer committed
110
111
    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
112
        conditions = [c < end
Martin Bauer's avatar
Martin Bauer committed
113
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
114
115
116
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
Martin Bauer's avatar
Martin Bauer committed
117
        return Block([Conditional(condition, kernel_content)])
118

119
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
120
    def limit_block_size_to_device_maximum(block_size):
121
        """
Martin Bauer's avatar
Martin Bauer committed
122
        Changes block size according to match device limits according to the following rules:
123
124
125
        1) if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
        2) next, if one component is still too big, the component which is too big is divided by 2 and the smallest
           component is multiplied by 2, such that the total amount of threads stays the same
Martin Bauer's avatar
Martin Bauer committed
126
        Returns the altered block_size
127
        """
128
        # Get device limits
129
130
131
        import pycuda.driver as cuda
        import pycuda.autoinit

132
133
134
        da = cuda.device_attribute
        device = cuda.Context.get_device()

Martin Bauer's avatar
Martin Bauer committed
135
136
137
138
        block_size = list(block_size)
        max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        max_block_size = [device.get_attribute(a)
                          for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
139
140
141
142
143
144
145

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

Martin Bauer's avatar
Martin Bauer committed
146
147
148
        def get_index_of_too_big_element():
            for i, bs in enumerate(block_size):
                if bs > max_block_size[i]:
149
150
151
                    return i
            return None

Martin Bauer's avatar
Martin Bauer committed
152
153
154
        def get_index_of_too_small_element():
            for i, bs in enumerate(block_size):
                if bs // 2 <= max_block_size[i]:
155
156
157
158
                    return i
            return None

        # Reduce the total number of threads if necessary
Martin Bauer's avatar
Martin Bauer committed
159
160
161
162
163
164
        while prod(block_size) > max_threads:
            item_to_reduce = block_size.index(max(block_size))
            for j, block_size_entry in enumerate(block_size):
                if block_size_entry > max_block_size[j]:
                    item_to_reduce = j
            block_size[item_to_reduce] //= 2
165
166

        # Cap individual elements
Martin Bauer's avatar
Martin Bauer committed
167
168
169
170
171
172
        too_big_element_index = get_index_of_too_big_element()
        while too_big_element_index is not None:
            too_small_element_index = get_index_of_too_small_element()
            block_size[too_small_element_index] *= 2
            block_size[too_big_element_index] //= 2
            too_big_element_index = get_index_of_too_big_element()
173

Martin Bauer's avatar
Martin Bauer committed
174
        return tuple(block_size)
175

176
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
177
178
179
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if there are too many registers used per multiprocessor.
        This is not done automatically, since the required_registers_per_thread are not known before compilation.
180
        They can be obtained by ``func.num_regs`` from a pycuda function.
Martin Bauer's avatar
Martin Bauer committed
181
        :returns smaller block_size if too many registers are used.
182
        """
183
184
185
        import pycuda.driver as cuda
        import pycuda.autoinit

186
187
188
        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
Martin Bauer's avatar
Martin Bauer committed
189
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
190

Martin Bauer's avatar
Martin Bauer committed
191
        block = block_size
192
193

        while True:
Martin Bauer's avatar
Martin Bauer committed
194
            num_threads = 1
195
            for t in block:
Martin Bauer's avatar
Martin Bauer committed
196
197
198
                num_threads *= t
            required_registers_per_mt = num_threads * required_registers_per_thread
            if required_registers_per_mt <= available_registers_per_mp:
199
200
                return block
            else:
Martin Bauer's avatar
Martin Bauer committed
201
202
203
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2
204

205
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
206
207
208
209
210
211
212
213
214
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
        sorted_block_size = list(sorted(block_size, reverse=True))
        while len(sorted_block_size) > len(layout):
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
        for l, bs in zip(reversed(layout), sorted_block_size):
215
216
217
            result[l] = bs
        return tuple(result[:len(layout)])

218
219
220
221

class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
Martin Bauer's avatar
Martin Bauer committed
222
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
223
224
225
226
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """

Martin Bauer's avatar
Martin Bauer committed
227
228
229
    def __init__(self, field, iteration_slice=None):
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatial_dimensions > 4:
230
231
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
232
        coordinates = available_indices[:field.spatial_dimensions]
233

Martin Bauer's avatar
Martin Bauer committed
234
235
        fastest_coordinate = field.layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
236

237
        self._coordinates = coordinates
Martin Bauer's avatar
Martin Bauer committed
238
239
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
240

241
242
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
243
        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
244

Martin Bauer's avatar
Martin Bauer committed
245
246
    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
247

Martin Bauer's avatar
Martin Bauer committed
248
249
250
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
251

Martin Bauer's avatar
Martin Bauer committed
252
253
        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self._coordinates:
254
255
                return 1
            else:
Martin Bauer's avatar
Martin Bauer committed
256
                idx = self._coordinates.index(cuda_idx)
257
                return widths[idx]
258

Martin Bauer's avatar
Martin Bauer committed
259
260
        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
261

Martin Bauer's avatar
Martin Bauer committed
262
263
    def guard(self, kernel_content, arr_shape):
        return kernel_content
264
265
266
267


# -------------------------------------- Helper functions --------------------------------------------------------------

Martin Bauer's avatar
Martin Bauer committed
268
def _get_start_from_slice(iteration_slice):
269
    res = []
Martin Bauer's avatar
Martin Bauer committed
270
271
272
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
273
        else:
Martin Bauer's avatar
Martin Bauer committed
274
275
            assert isinstance(slice_component, int)
            res.append(slice_component)
276
277
278
    return res


Martin Bauer's avatar
Martin Bauer committed
279
280
def _get_end_from_slice(iteration_slice, arr_shape):
    iter_slice = normalize_slice(iteration_slice, arr_shape)
281
    res = []
Martin Bauer's avatar
Martin Bauer committed
282
283
284
    for slice_component in iter_slice:
        if type(slice_component) is slice:
            res.append(slice_component.stop)
285
        else:
Martin Bauer's avatar
Martin Bauer committed
286
287
            assert isinstance(slice_component, int)
            res.append(slice_component + 1)
288
289
    return res

Martin Bauer's avatar
Martin Bauer committed
290

Martin Bauer's avatar
Martin Bauer committed
291
292
293
294
295
296
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    if isinstance(gpu_indexing, str):
        if gpu_indexing == 'block':
            indexing_creator = BlockIndexing
        elif gpu_indexing == 'line':
            indexing_creator = LineIndexing
Martin Bauer's avatar
Martin Bauer committed
297
        else:
Martin Bauer's avatar
Martin Bauer committed
298
299
300
301
            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
        if gpu_indexing_params:
            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
        return indexing_creator
Martin Bauer's avatar
Martin Bauer committed
302
    else:
Martin Bauer's avatar
Martin Bauer committed
303
        return gpu_indexing