indexing.py 12.9 KB
Newer Older
1
import abc
Martin Bauer's avatar
Martin Bauer committed
2
from typing import Tuple  # noqa
3
4
import sympy as sp
from pystencils.astnodes import Conditional, Block
5
from pystencils.integer_functions import div_ceil, div_floor
Martin Bauer's avatar
Martin Bauer committed
6
from pystencils.slicing import normalize_slice
Martin Bauer's avatar
Martin Bauer committed
7
from pystencils.data_types import TypedSymbol, create_type
Martin Bauer's avatar
Martin Bauer committed
8
from functools import partial
9

10
from pystencils.sympyextensions import prod, is_integer_sequence
11

Martin Bauer's avatar
Martin Bauer committed
12
13
14
15
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
16
17


Martin Bauer's avatar
Martin Bauer committed
18
class AbstractIndexing(abc.ABC):
19
20
21
22
    """
    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
23
    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
24
    """
25

Martin Bauer's avatar
Martin Bauer committed
26
27
    @property
    @abc.abstractmethod
28
29
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
Martin Bauer's avatar
Martin Bauer committed
30
        These symbolic indices can be obtained with the method `index_variables` """
31
32

    @property
Martin Bauer's avatar
Martin Bauer committed
33
    def index_variables(self):
34
35
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
36

37
    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
38
    def call_parameters(self, arr_shape):
39
40
41
42
43
44
45
        """Determine grid and block size for kernel call.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array
        Returns:
            dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
            the kernel should be started with
46
47
48
        """

    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
49
    def guard(self, kernel_content, arr_shape):
50
51
        """In some indexing schemes not all threads of a block execute the kernel content.

52
        This function can return a Conditional ast node, defining this execution guard.
53
54
55
56
57
58
59

        Args:
            kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
            arr_shape: the numeric or symbolic shape of the field

        Returns:
            ast node, which is put inside the kernel function
60
        """
61

62
63
64
65
66
    @abc.abstractmethod
    def max_threads_per_block(self):
        """Return maximal number of threads per block for launch bounds. If this cannot be determined without
        knowing the array shape return None for unknown """

67
68
69
    @abc.abstractmethod
    def symbolic_parameters(self):
        """Set of symbols required in call_parameters code"""
70

71
# -------------------------------------------- Implementations ---------------------------------------------------------
72
73


74
class BlockIndexing(AbstractIndexing):
Martin Bauer's avatar
Martin Bauer committed
75
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
76

Martin Bauer's avatar
Martin Bauer committed
77
78
79
80
81
    Args:
        field: pystencils field (common to all Indexing classes)
        iteration_slice: slice that defines rectangular subarea which is iterated over
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
82
        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
Martin Bauer's avatar
Martin Bauer committed
83
    """
84
    def __init__(self, field, iteration_slice,
85
86
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
                 maximum_block_size=(1024, 1024, 64)):
Martin Bauer's avatar
Martin Bauer committed
87
        if field.spatial_dimensions > 3:
88
89
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
90
91
        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
92

93
        self._block_size = block_size
94
95
96
97
98
99
100
101
102
103
104
        if maximum_block_size == 'auto':
            # Get device limits
            import pycuda.driver as cuda
            # noinspection PyUnresolvedReferences
            import pycuda.autoinit  # NOQA
            da = cuda.device_attribute
            device = cuda.Context.get_device()
            maximum_block_size = tuple(device.get_attribute(a)
                                       for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))

        self._maximum_block_size = maximum_block_size
Martin Bauer's avatar
Martin Bauer committed
105
106
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
107
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
108
        self._compile_time_block_size = compile_time_block_size
109

110
111
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
112
        offsets = _get_start_from_slice(self._iterationSlice)
113
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
Martin Bauer's avatar
Martin Bauer committed
114
        coordinates = [block_index * bs + thread_idx + off
115
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
116
117

        return coordinates[:self._dim]
118

Martin Bauer's avatar
Martin Bauer committed
119
    def call_parameters(self, arr_shape):
120
        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
121

Martin Bauer's avatar
Martin Bauer committed
122
123
124
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
125
126
127
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
128
129
130
131
132
133
134
            assert len(block_size) == 3
            adapted_block_size = []
            for i in range(len(widths)):
                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
            block_size = tuple(adapted_block_size) + extend_bs

135
        block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
136
        grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
Martin Bauer's avatar
Martin Bauer committed
137
        extend_gr = (1,) * (3 - len(grid))
138

139
        return {'block': block_size,
Martin Bauer's avatar
Martin Bauer committed
140
                'grid': grid + extend_gr}
141

Martin Bauer's avatar
Martin Bauer committed
142
143
    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
144
        conditions = [c < end
Martin Bauer's avatar
Martin Bauer committed
145
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
146
147
148
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
Martin Bauer's avatar
Martin Bauer committed
149
        return Block([Conditional(condition, kernel_content)])
150

151
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
152
153
154
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if there are too many registers used per multiprocessor.
        This is not done automatically, since the required_registers_per_thread are not known before compilation.
155
        They can be obtained by ``func.num_regs`` from a pycuda function.
Martin Bauer's avatar
Martin Bauer committed
156
        :returns smaller block_size if too many registers are used.
157
        """
158
        import pycuda.driver as cuda
Martin Bauer's avatar
Martin Bauer committed
159
160
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA
161

162
163
164
        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
Martin Bauer's avatar
Martin Bauer committed
165
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
166

Martin Bauer's avatar
Martin Bauer committed
167
        block = block_size
168
169

        while True:
Martin Bauer's avatar
Martin Bauer committed
170
            num_threads = 1
171
            for t in block:
Martin Bauer's avatar
Martin Bauer committed
172
173
174
                num_threads *= t
            required_registers_per_mt = num_threads * required_registers_per_thread
            if required_registers_per_mt <= available_registers_per_mp:
175
176
                return block
            else:
Martin Bauer's avatar
Martin Bauer committed
177
178
179
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2
180

181
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
182
183
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
184
185
        if not is_integer_sequence(block_size):
            return block_size
Martin Bauer's avatar
Martin Bauer committed
186
187
188
189
190
191
192
        sorted_block_size = list(sorted(block_size, reverse=True))
        while len(sorted_block_size) > len(layout):
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
        for l, bs in zip(reversed(layout), sorted_block_size):
193
194
195
            result[l] = bs
        return tuple(result[:len(layout)])

196
    def max_threads_per_block(self):
197
198
199
200
201
202
203
        if is_integer_sequence(self._block_size):
            return prod(self._block_size)
        else:
            return None

    def symbolic_parameters(self):
        return set(b for b in self._block_size if isinstance(b, sp.Symbol))
204

205
206
207
208

class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
Martin Bauer's avatar
Martin Bauer committed
209
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
210
211
212
213
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """

214
    def __init__(self, field, iteration_slice):
Martin Bauer's avatar
Martin Bauer committed
215
216
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatial_dimensions > 4:
217
218
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
219
        coordinates = available_indices[:field.spatial_dimensions]
220

Martin Bauer's avatar
Martin Bauer committed
221
222
        fastest_coordinate = field.layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
223

224
        self._coordinates = coordinates
Martin Bauer's avatar
Martin Bauer committed
225
226
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
227

228
229
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
230
        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
231

Martin Bauer's avatar
Martin Bauer committed
232
233
    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
234

Martin Bauer's avatar
Martin Bauer committed
235
236
237
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
238

Martin Bauer's avatar
Martin Bauer committed
239
240
        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self._coordinates:
241
242
                return 1
            else:
Martin Bauer's avatar
Martin Bauer committed
243
                idx = self._coordinates.index(cuda_idx)
244
                return widths[idx]
245

Martin Bauer's avatar
Martin Bauer committed
246
247
        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
248

Martin Bauer's avatar
Martin Bauer committed
249
250
    def guard(self, kernel_content, arr_shape):
        return kernel_content
251

252
253
254
    def max_threads_per_block(self):
        return None

255
256
    def symbolic_parameters(self):
        return set()
257

258

259
260
# -------------------------------------- Helper functions --------------------------------------------------------------

Martin Bauer's avatar
Martin Bauer committed
261
def _get_start_from_slice(iteration_slice):
262
    res = []
Martin Bauer's avatar
Martin Bauer committed
263
264
265
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
266
        else:
Martin Bauer's avatar
Martin Bauer committed
267
268
            assert isinstance(slice_component, int)
            res.append(slice_component)
269
270
271
    return res


Martin Bauer's avatar
Martin Bauer committed
272
273
def _get_end_from_slice(iteration_slice, arr_shape):
    iter_slice = normalize_slice(iteration_slice, arr_shape)
274
    res = []
Martin Bauer's avatar
Martin Bauer committed
275
276
277
    for slice_component in iter_slice:
        if type(slice_component) is slice:
            res.append(slice_component.stop)
278
        else:
Martin Bauer's avatar
Martin Bauer committed
279
280
            assert isinstance(slice_component, int)
            res.append(slice_component + 1)
281
282
    return res

Martin Bauer's avatar
Martin Bauer committed
283

Martin Bauer's avatar
Martin Bauer committed
284
285
286
287
288
289
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    if isinstance(gpu_indexing, str):
        if gpu_indexing == 'block':
            indexing_creator = BlockIndexing
        elif gpu_indexing == 'line':
            indexing_creator = LineIndexing
Martin Bauer's avatar
Martin Bauer committed
290
        else:
Martin Bauer's avatar
Martin Bauer committed
291
292
293
294
            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
        if gpu_indexing_params:
            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
        return indexing_creator
Martin Bauer's avatar
Martin Bauer committed
295
    else:
Martin Bauer's avatar
Martin Bauer committed
296
        return gpu_indexing