indexing.py 14.2 KB
Newer Older
1
import abc
Martin Bauer's avatar
Martin Bauer committed
2
from typing import Tuple  # noqa
3
4
import sympy as sp
from pystencils.astnodes import Conditional, Block
5
from pystencils.integer_functions import div_ceil, div_floor
Martin Bauer's avatar
Martin Bauer committed
6
from pystencils.slicing import normalize_slice
Martin Bauer's avatar
Martin Bauer committed
7
from pystencils.data_types import TypedSymbol, create_type
Martin Bauer's avatar
Martin Bauer committed
8
from functools import partial
9

10
11
from pystencils.sympyextensions import prod

12
AUTO_BLOCK_SIZE_LIMITING = False
13

Martin Bauer's avatar
Martin Bauer committed
14
15
16
17
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
18
19


Martin Bauer's avatar
Martin Bauer committed
20
class AbstractIndexing(abc.ABC):
21
22
23
24
    """
    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
25
    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
26
    """
27

Martin Bauer's avatar
Martin Bauer committed
28
29
    @property
    @abc.abstractmethod
30
31
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
Martin Bauer's avatar
Martin Bauer committed
32
        These symbolic indices can be obtained with the method `index_variables` """
33
34

    @property
Martin Bauer's avatar
Martin Bauer committed
35
    def index_variables(self):
36
37
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
38

39
    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
40
    def call_parameters(self, arr_shape):
41
42
43
44
45
46
47
        """Determine grid and block size for kernel call.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array
        Returns:
            dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
            the kernel should be started with
48
49
50
        """

    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
51
    def guard(self, kernel_content, arr_shape):
52
53
        """In some indexing schemes not all threads of a block execute the kernel content.

54
        This function can return a Conditional ast node, defining this execution guard.
55
56
57
58
59
60
61

        Args:
            kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
            arr_shape: the numeric or symbolic shape of the field

        Returns:
            ast node, which is put inside the kernel function
62
        """
63

64
65
66
67
68
    @abc.abstractmethod
    def max_threads_per_block(self):
        """Return maximal number of threads per block for launch bounds. If this cannot be determined without
        knowing the array shape return None for unknown """

69

70
# -------------------------------------------- Implementations ---------------------------------------------------------
71
72


73
class BlockIndexing(AbstractIndexing):
Martin Bauer's avatar
Martin Bauer committed
74
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
75

Martin Bauer's avatar
Martin Bauer committed
76
77
78
79
80
    Args:
        field: pystencils field (common to all Indexing classes)
        iteration_slice: slice that defines rectangular subarea which is iterated over
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
81
        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
Martin Bauer's avatar
Martin Bauer committed
82
    """
83
    def __init__(self, field, iteration_slice,
84
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
Martin Bauer's avatar
Martin Bauer committed
85
        if field.spatial_dimensions > 3:
86
87
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
88
89
        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
90

Martin Bauer's avatar
Martin Bauer committed
91
92
        if AUTO_BLOCK_SIZE_LIMITING:
            block_size = self.limit_block_size_to_device_maximum(block_size)
Martin Bauer's avatar
Martin Bauer committed
93

94
        self._block_size = block_size
Martin Bauer's avatar
Martin Bauer committed
95
96
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
97
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
98
        self._compile_time_block_size = compile_time_block_size
99

100
101
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
102
        offsets = _get_start_from_slice(self._iterationSlice)
103
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
Martin Bauer's avatar
Martin Bauer committed
104
        coordinates = [block_index * bs + thread_idx + off
105
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
106
107

        return coordinates[:self._dim]
108

Martin Bauer's avatar
Martin Bauer committed
109
    def call_parameters(self, arr_shape):
110
        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
111

Martin Bauer's avatar
Martin Bauer committed
112
113
114
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
115
116
117
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
118
119
120
121
122
123
124
125
            assert len(block_size) == 3
            adapted_block_size = []
            for i in range(len(widths)):
                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
            block_size = tuple(adapted_block_size) + extend_bs

        grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
Martin Bauer's avatar
Martin Bauer committed
126
        extend_gr = (1,) * (3 - len(grid))
127

128
        return {'block': block_size,
Martin Bauer's avatar
Martin Bauer committed
129
                'grid': grid + extend_gr}
130

Martin Bauer's avatar
Martin Bauer committed
131
132
    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
133
        conditions = [c < end
Martin Bauer's avatar
Martin Bauer committed
134
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
135
136
137
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
Martin Bauer's avatar
Martin Bauer committed
138
        return Block([Conditional(condition, kernel_content)])
139

140
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
141
    def limit_block_size_to_device_maximum(block_size):
142
        """Changes block size to match device limits.
143
144
145
146
147
148
149

        * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
        * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
          component is multiplied by 2, such that the total amount of threads stays the same

        Returns:
            the altered block_size
150
        """
151
        # Get device limits
152
        import pycuda.driver as cuda
Martin Bauer's avatar
Martin Bauer committed
153
154
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA
155

156
157
158
        da = cuda.device_attribute
        device = cuda.Context.get_device()

Martin Bauer's avatar
Martin Bauer committed
159
160
161
162
        block_size = list(block_size)
        max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        max_block_size = [device.get_attribute(a)
                          for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
163
164
165
166
167
168
169

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

Martin Bauer's avatar
Martin Bauer committed
170
171
172
        def get_index_of_too_big_element():
            for i, bs in enumerate(block_size):
                if bs > max_block_size[i]:
173
174
175
                    return i
            return None

Martin Bauer's avatar
Martin Bauer committed
176
177
178
        def get_index_of_too_small_element():
            for i, bs in enumerate(block_size):
                if bs // 2 <= max_block_size[i]:
179
180
181
182
                    return i
            return None

        # Reduce the total number of threads if necessary
Martin Bauer's avatar
Martin Bauer committed
183
184
185
186
187
188
        while prod(block_size) > max_threads:
            item_to_reduce = block_size.index(max(block_size))
            for j, block_size_entry in enumerate(block_size):
                if block_size_entry > max_block_size[j]:
                    item_to_reduce = j
            block_size[item_to_reduce] //= 2
189
190

        # Cap individual elements
Martin Bauer's avatar
Martin Bauer committed
191
192
193
194
195
196
        too_big_element_index = get_index_of_too_big_element()
        while too_big_element_index is not None:
            too_small_element_index = get_index_of_too_small_element()
            block_size[too_small_element_index] *= 2
            block_size[too_big_element_index] //= 2
            too_big_element_index = get_index_of_too_big_element()
197

Martin Bauer's avatar
Martin Bauer committed
198
        return tuple(block_size)
199

200
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
201
202
203
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if there are too many registers used per multiprocessor.
        This is not done automatically, since the required_registers_per_thread are not known before compilation.
204
        They can be obtained by ``func.num_regs`` from a pycuda function.
Martin Bauer's avatar
Martin Bauer committed
205
        :returns smaller block_size if too many registers are used.
206
        """
207
        import pycuda.driver as cuda
Martin Bauer's avatar
Martin Bauer committed
208
209
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA
210

211
212
213
        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
Martin Bauer's avatar
Martin Bauer committed
214
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
215

Martin Bauer's avatar
Martin Bauer committed
216
        block = block_size
217
218

        while True:
Martin Bauer's avatar
Martin Bauer committed
219
            num_threads = 1
220
            for t in block:
Martin Bauer's avatar
Martin Bauer committed
221
222
223
                num_threads *= t
            required_registers_per_mt = num_threads * required_registers_per_thread
            if required_registers_per_mt <= available_registers_per_mp:
224
225
                return block
            else:
Martin Bauer's avatar
Martin Bauer committed
226
227
228
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2
229

230
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
231
232
233
234
235
236
237
238
239
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
        sorted_block_size = list(sorted(block_size, reverse=True))
        while len(sorted_block_size) > len(layout):
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
        for l, bs in zip(reversed(layout), sorted_block_size):
240
241
242
            result[l] = bs
        return tuple(result[:len(layout)])

243
244
245
    def max_threads_per_block(self):
        return prod(self._block_size)

246
247
248
249

class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
Martin Bauer's avatar
Martin Bauer committed
250
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
251
252
253
254
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """

255
    def __init__(self, field, iteration_slice):
Martin Bauer's avatar
Martin Bauer committed
256
257
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatial_dimensions > 4:
258
259
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
260
        coordinates = available_indices[:field.spatial_dimensions]
261

Martin Bauer's avatar
Martin Bauer committed
262
263
        fastest_coordinate = field.layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
264

265
        self._coordinates = coordinates
Martin Bauer's avatar
Martin Bauer committed
266
267
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
268

269
270
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
271
        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
272

Martin Bauer's avatar
Martin Bauer committed
273
274
    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
275

Martin Bauer's avatar
Martin Bauer committed
276
277
278
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
279

Martin Bauer's avatar
Martin Bauer committed
280
281
        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self._coordinates:
282
283
                return 1
            else:
Martin Bauer's avatar
Martin Bauer committed
284
                idx = self._coordinates.index(cuda_idx)
285
                return widths[idx]
286

Martin Bauer's avatar
Martin Bauer committed
287
288
        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
289

Martin Bauer's avatar
Martin Bauer committed
290
291
    def guard(self, kernel_content, arr_shape):
        return kernel_content
292

293
294
295
    def max_threads_per_block(self):
        return None

296
297
298

# -------------------------------------- Helper functions --------------------------------------------------------------

Martin Bauer's avatar
Martin Bauer committed
299
def _get_start_from_slice(iteration_slice):
300
    res = []
Martin Bauer's avatar
Martin Bauer committed
301
302
303
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
304
        else:
Martin Bauer's avatar
Martin Bauer committed
305
306
            assert isinstance(slice_component, int)
            res.append(slice_component)
307
308
309
    return res


Martin Bauer's avatar
Martin Bauer committed
310
311
def _get_end_from_slice(iteration_slice, arr_shape):
    iter_slice = normalize_slice(iteration_slice, arr_shape)
312
    res = []
Martin Bauer's avatar
Martin Bauer committed
313
314
315
    for slice_component in iter_slice:
        if type(slice_component) is slice:
            res.append(slice_component.stop)
316
        else:
Martin Bauer's avatar
Martin Bauer committed
317
318
            assert isinstance(slice_component, int)
            res.append(slice_component + 1)
319
320
    return res

Martin Bauer's avatar
Martin Bauer committed
321

Martin Bauer's avatar
Martin Bauer committed
322
323
324
325
326
327
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    if isinstance(gpu_indexing, str):
        if gpu_indexing == 'block':
            indexing_creator = BlockIndexing
        elif gpu_indexing == 'line':
            indexing_creator = LineIndexing
Martin Bauer's avatar
Martin Bauer committed
328
        else:
Martin Bauer's avatar
Martin Bauer committed
329
330
331
332
            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
        if gpu_indexing_params:
            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
        return indexing_creator
Martin Bauer's avatar
Martin Bauer committed
333
    else:
Martin Bauer's avatar
Martin Bauer committed
334
        return gpu_indexing