indexing.py 13.6 KB
Newer Older
1
import abc
Martin Bauer's avatar
Martin Bauer committed
2
from typing import Tuple  # noqa
3
4
import sympy as sp
from pystencils.astnodes import Conditional, Block
5
from pystencils.integer_functions import div_ceil
Martin Bauer's avatar
Martin Bauer committed
6
from pystencils.slicing import normalize_slice
Martin Bauer's avatar
Martin Bauer committed
7
from pystencils.data_types import TypedSymbol, create_type
Martin Bauer's avatar
Martin Bauer committed
8
from functools import partial
9

Martin Bauer's avatar
Martin Bauer committed
10
# When True, BlockIndexing automatically shrinks the requested block size to the
# current device's hardware limits (queried through pycuda in
# BlockIndexing.limit_block_size_to_device_maximum).
AUTO_BLOCK_SIZE_LIMITING = True
11

Martin Bauer's avatar
Martin Bauer committed
12
13
14
15
# Typed sympy symbols named after CUDA's built-in index/dimension variables,
# one per (x, y, z) component. Indexing classes combine these symbols into
# coordinate expressions and expose them via `index_variables`.
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
16
17


Martin Bauer's avatar
Martin Bauer committed
18
class AbstractIndexing(abc.ABC):
    """Base interface for mapping a multidimensional field onto CUDA's block/grid hierarchy.

    An Indexing class turns CUDA's symbolic thread and block indices into field coordinates
    and determines how many blocks and threads a kernel has to be launched with. Concrete
    subclasses are constructed from a pystencils field, a slice to iterate over, and further
    optional parameters that must carry default values.
    """

    @property
    @abc.abstractmethod
    def coordinates(self):
        """Sequence of coordinate expressions for (x,y,z) in terms of the symbolic CUDA indices.

        The symbolic indices themselves can be obtained via `index_variables`."""

    @property
    def index_variables(self):
        """Sympy symbols for CUDA's block/thread indices and block/grid dimensions."""
        return [*BLOCK_IDX, *THREAD_IDX, *BLOCK_DIM, *GRID_DIM]

    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determine grid and block size for kernel call.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array
        Returns:
            dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
            the kernel should be started with
        """

    @abc.abstractmethod
    def guard(self, kernel_content, arr_shape):
        """In some indexing schemes not all threads of a block execute the kernel content.

        This function can return a Conditional ast node, defining this execution guard.

        Args:
            kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
            arr_shape: the numeric or symbolic shape of the field

        Returns:
            ast node, which is put inside the kernel function
        """
61
62


63
# -------------------------------------------- Implementations ---------------------------------------------------------
64
65


66
class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.

    Args:
        field: pystencils field (common to all Indexing classes)
        iteration_slice: slice that defines rectangular subarea which is iterated over
        block_size: threads per CUDA block as an (x, y, z) tuple
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
    """
    def __init__(self, field, iteration_slice=None,
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
        if field.spatial_dimensions > 3:
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)

        if AUTO_BLOCK_SIZE_LIMITING:
            block_size = self.limit_block_size_to_device_maximum(block_size)

        self._block_size = block_size
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
        # Keep the symbolic shape entries so call_parameters can substitute the concrete values.
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size

    @property
    def coordinates(self):
        """Cell coordinates as blockIdx * blockDim + threadIdx, shifted by the iteration slice start."""
        offsets = _get_start_from_slice(self._iterationSlice)
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
        coordinates = [block_index * bs + thread_idx + off
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]

        return coordinates[:self._dim]

    def call_parameters(self, arr_shape):
        """Compute the launch configuration for a concrete array shape.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array
        Returns:
            dict with keys 'block' and 'grid' holding 3-tuples
        """
        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}

        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)

        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
            # Shrink every block dimension to the iterated width so small fields don't waste threads.
            block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs

        grid = tuple(div_ceil(length, bs)
                     for length, bs in zip(widths, block_size))
        extend_gr = (1,) * (3 - len(grid))

        return {'block': block_size,
                'grid': grid + extend_gr}

    def guard(self, kernel_content, arr_shape):
        """Wrap kernel_content in a Conditional so threads outside the iteration area do nothing."""
        arr_shape = arr_shape[:self._dim]
        conditions = [c < end
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])

    @staticmethod
    def limit_block_size_to_device_maximum(block_size):
        """Changes block size according to match device limits.

        * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
        * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
          component is multiplied by 2, such that the total amount of threads stays the same

        Returns:
            the altered block_size
        """
        # Get device limits
        import pycuda.driver as cuda
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA

        da = cuda.device_attribute
        device = cuda.Context.get_device()

        block_size = list(block_size)
        max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        max_block_size = [device.get_attribute(a)
                          for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

        def get_index_of_too_big_element():
            for i, bs in enumerate(block_size):
                if bs > max_block_size[i]:
                    return i
            return None

        def get_index_of_doublable_element():
            # An element may only be doubled if the doubled value still fits the device limit.
            # BUGFIX: the previous condition 'bs // 2 <= max_block_size[i]' could select the
            # over-sized element itself, doubling and halving the same entry forever.
            for i, bs in enumerate(block_size):
                if bs * 2 <= max_block_size[i]:
                    return i
            return None

        # Reduce the total number of threads if necessary
        while prod(block_size) > max_threads:
            item_to_reduce = block_size.index(max(block_size))
            for j, block_size_entry in enumerate(block_size):
                if block_size_entry > max_block_size[j]:
                    item_to_reduce = j
            block_size[item_to_reduce] //= 2

        # Cap individual elements
        too_big_element_index = get_index_of_too_big_element()
        while too_big_element_index is not None:
            too_small_element_index = get_index_of_doublable_element()
            block_size[too_small_element_index] *= 2
            block_size[too_big_element_index] //= 2
            too_big_element_index = get_index_of_too_big_element()

        return tuple(block_size)

    @staticmethod
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if there are too many registers used per multiprocessor.
        This is not done automatically, since the required_registers_per_thread are not known before compilation.
        They can be obtained by ``func.num_regs`` from a pycuda function.

        Returns:
            smaller block_size (as tuple) if too many registers are used.
        """
        import pycuda.driver as cuda
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA

        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)

        # BUGFIX: copy into a list -- block_size is usually a tuple, and the in-place
        # item assignment below would raise TypeError on a tuple.
        block = list(block_size)

        while True:
            num_threads = 1
            for t in block:
                num_threads *= t
            required_registers_per_mt = num_threads * required_registers_per_thread
            if required_registers_per_mt <= available_registers_per_mp:
                return tuple(block)
            else:
                # Halve the largest dimension until the register budget fits.
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2

    @staticmethod
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
        sorted_block_size = list(sorted(block_size, reverse=True))
        # Fold surplus entries into the largest one when block_size has more entries than dimensions.
        while len(sorted_block_size) > len(layout):
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
        for l, bs in zip(reversed(layout), sorted_block_size):
            result[l] = bs
        return tuple(result[:len(layout)])

232
233
234
235

class LineIndexing(AbstractIndexing):
    """Maps the innermost 'line' of a field (elements adjacent in memory) to a 1D CUDA block.

    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """

    def __init__(self, field, iteration_slice=None):
        if field.spatial_dimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        # One thread index plus the three block indices are available to drive coordinates.
        usable_indices = [THREAD_IDX[0]] + BLOCK_IDX
        coordinates = usable_indices[:field.spatial_dimensions]

        # Swap so that the fastest (innermost) coordinate is driven by threadIdx.x.
        fastest = field.layout[-1]
        coordinates[0], coordinates[fastest] = coordinates[fastest], coordinates[0]

        self._coordinates = coordinates
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]

    @property
    def coordinates(self):
        """CUDA index symbols shifted by the start offsets of the iteration slice."""
        starts = _get_start_from_slice(self._iterationSlice)
        return [index + start for index, start in zip(self._coordinates, starts)]

    def call_parameters(self, arr_shape):
        """Launch configuration: each mapped CUDA index spans the width of its coordinate, unused ones are 1."""
        substitutions = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}

        starts = _get_start_from_slice(self._iterationSlice)
        ends = _get_end_from_slice(self._iterationSlice, arr_shape)
        widths = sp.Matrix([end - start for start, end in zip(starts, ends)]).subs(substitutions)

        # Width per CUDA index symbol; indices not used by this field default to 1.
        width_of = {cuda_idx: widths[pos] for pos, cuda_idx in enumerate(self._coordinates)}
        return {'block': tuple(width_of.get(idx, 1) for idx in THREAD_IDX),
                'grid': tuple(width_of.get(idx, 1) for idx in BLOCK_IDX)}

    def guard(self, kernel_content, arr_shape):
        """No execution guard required for this scheme; the launch sizes match the iteration area exactly."""
        return kernel_content
278
279
280
281


# -------------------------------------- Helper functions --------------------------------------------------------------

Martin Bauer's avatar
Martin Bauer committed
282
def _get_start_from_slice(iteration_slice):
283
    res = []
Martin Bauer's avatar
Martin Bauer committed
284
285
286
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
287
        else:
Martin Bauer's avatar
Martin Bauer committed
288
289
            assert isinstance(slice_component, int)
            res.append(slice_component)
290
291
292
    return res


Martin Bauer's avatar
Martin Bauer committed
293
294
def _get_end_from_slice(iteration_slice, arr_shape):
    """Return the exclusive stop index of every slice component after normalization.

    An integer component selects the single index i, so its stop is i + 1."""
    normalized = normalize_slice(iteration_slice, arr_shape)
    ends = []
    for component in normalized:
        if isinstance(component, slice):
            ends.append(component.stop)
        else:
            assert isinstance(component, int)
            ends.append(component + 1)
    return ends

Martin Bauer's avatar
Martin Bauer committed
304

Martin Bauer's avatar
Martin Bauer committed
305
306
307
308
309
310
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    """Resolve an indexing creator from a name plus parameter dict.

    The strings 'block' and 'line' select BlockIndexing / LineIndexing, bound to
    gpu_indexing_params via functools.partial when parameters are given. Any
    non-string argument is assumed to already be a creator and is returned as-is.

    Raises:
        ValueError: if gpu_indexing is a string other than 'block' or 'line'
    """
    if not isinstance(gpu_indexing, str):
        return gpu_indexing

    if gpu_indexing == 'block':
        creator = BlockIndexing
    elif gpu_indexing == 'line':
        creator = LineIndexing
    else:
        raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))

    return partial(creator, **gpu_indexing_params) if gpu_indexing_params else creator