indexing.py 13.4 KB
Newer Older
1
import abc
Martin Bauer's avatar
Martin Bauer committed
2
3
from functools import partial

4
import sympy as sp
5
from sympy.core.cache import cacheit
Martin Bauer's avatar
Martin Bauer committed
6
7
8

from pystencils.astnodes import Block, Conditional
from pystencils.data_types import TypedSymbol, create_type
9
from pystencils.integer_functions import div_ceil, div_floor
Martin Bauer's avatar
Martin Bauer committed
10
from pystencils.slicing import normalize_slice
Martin Bauer's avatar
Martin Bauer committed
11
from pystencils.sympyextensions import is_integer_sequence, prod
12

13
14
15
16
17
18
19
20
21
22
23
24
25
26

class ThreadIndexingSymbol(TypedSymbol):
    def __new__(cls, *args, **kwds):
        obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds)
        return obj

    def __new_stage2__(cls, name, dtype, *args, **kwargs):
        obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs)
        return obj

    __xnew__ = staticmethod(__new_stage2__)
    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))


27
28
29
30
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
31
32


Martin Bauer's avatar
Martin Bauer committed
33
class AbstractIndexing(abc.ABC):
34
35
36
37
    """
    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
38
    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
39
    """
40

Martin Bauer's avatar
Martin Bauer committed
41
42
    @property
    @abc.abstractmethod
43
44
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
Martin Bauer's avatar
Martin Bauer committed
45
        These symbolic indices can be obtained with the method `index_variables` """
46
47

    @property
Martin Bauer's avatar
Martin Bauer committed
48
    def index_variables(self):
49
50
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
51

52
    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
53
    def call_parameters(self, arr_shape):
54
55
56
57
58
59
60
        """Determine grid and block size for kernel call.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array
        Returns:
            dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
            the kernel should be started with
61
62
63
        """

    @abc.abstractmethod
Martin Bauer's avatar
Martin Bauer committed
64
    def guard(self, kernel_content, arr_shape):
65
66
        """In some indexing schemes not all threads of a block execute the kernel content.

67
        This function can return a Conditional ast node, defining this execution guard.
68
69
70
71
72
73
74

        Args:
            kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
            arr_shape: the numeric or symbolic shape of the field

        Returns:
            ast node, which is put inside the kernel function
75
        """
76

77
78
79
80
81
    @abc.abstractmethod
    def max_threads_per_block(self):
        """Return maximal number of threads per block for launch bounds. If this cannot be determined without
        knowing the array shape return None for unknown """

82
83
84
    @abc.abstractmethod
    def symbolic_parameters(self):
        """Set of symbols required in call_parameters code"""
85

86

87
# -------------------------------------------- Implementations ---------------------------------------------------------
88
89


90
class BlockIndexing(AbstractIndexing):
Martin Bauer's avatar
Martin Bauer committed
91
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
92

Martin Bauer's avatar
Martin Bauer committed
93
94
95
96
97
    Args:
        field: pystencils field (common to all Indexing classes)
        iteration_slice: slice that defines rectangular subarea which is iterated over
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
98
        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
Martin Bauer's avatar
Martin Bauer committed
99
    """
100

101
    def __init__(self, field, iteration_slice,
102
103
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
                 maximum_block_size=(1024, 1024, 64)):
Martin Bauer's avatar
Martin Bauer committed
104
        if field.spatial_dimensions > 3:
105
106
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
107
108
        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
109

110
        self._block_size = block_size
111
112
113
114
115
116
117
118
119
120
121
        if maximum_block_size == 'auto':
            # Get device limits
            import pycuda.driver as cuda
            # noinspection PyUnresolvedReferences
            import pycuda.autoinit  # NOQA
            da = cuda.device_attribute
            device = cuda.Context.get_device()
            maximum_block_size = tuple(device.get_attribute(a)
                                       for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))

        self._maximum_block_size = maximum_block_size
Martin Bauer's avatar
Martin Bauer committed
122
123
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
124
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
125
        self._compile_time_block_size = compile_time_block_size
126

127
128
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
129
        offsets = _get_start_from_slice(self._iterationSlice)
130
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
Martin Bauer's avatar
Martin Bauer committed
131
        coordinates = [block_index * bs + thread_idx + off
132
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
133
134

        return coordinates[:self._dim]
135

Martin Bauer's avatar
Martin Bauer committed
136
    def call_parameters(self, arr_shape):
137
        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
138

Martin Bauer's avatar
Martin Bauer committed
139
140
141
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
142
143
144
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
145
146
147
148
149
150
151
            assert len(block_size) == 3
            adapted_block_size = []
            for i in range(len(widths)):
                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
            block_size = tuple(adapted_block_size) + extend_bs

152
        block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
153
        grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
Martin Bauer's avatar
Martin Bauer committed
154
        extend_gr = (1,) * (3 - len(grid))
155

156
        return {'block': block_size,
Martin Bauer's avatar
Martin Bauer committed
157
                'grid': grid + extend_gr}
158

Martin Bauer's avatar
Martin Bauer committed
159
160
    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
161
        conditions = [c < end
Martin Bauer's avatar
Martin Bauer committed
162
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
163
164
165
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
Martin Bauer's avatar
Martin Bauer committed
166
        return Block([Conditional(condition, kernel_content)])
167

168
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
169
170
171
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if there are too many registers used per multiprocessor.
        This is not done automatically, since the required_registers_per_thread are not known before compilation.
172
        They can be obtained by ``func.num_regs`` from a pycuda function.
Martin Bauer's avatar
Martin Bauer committed
173
        :returns smaller block_size if too many registers are used.
174
        """
175
        import pycuda.driver as cuda
Martin Bauer's avatar
Martin Bauer committed
176
177
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA
178

179
180
181
        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
Martin Bauer's avatar
Martin Bauer committed
182
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
183

Martin Bauer's avatar
Martin Bauer committed
184
        block = block_size
185
186

        while True:
Martin Bauer's avatar
Martin Bauer committed
187
            num_threads = 1
188
            for t in block:
Martin Bauer's avatar
Martin Bauer committed
189
190
191
                num_threads *= t
            required_registers_per_mt = num_threads * required_registers_per_thread
            if required_registers_per_mt <= available_registers_per_mp:
192
193
                return block
            else:
Martin Bauer's avatar
Martin Bauer committed
194
195
196
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2
197

198
    @staticmethod
Martin Bauer's avatar
Martin Bauer committed
199
200
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
201
202
        if not is_integer_sequence(block_size):
            return block_size
Martin Bauer's avatar
Martin Bauer committed
203
204
205
206
207
208
        sorted_block_size = list(sorted(block_size, reverse=True))
        while len(sorted_block_size) > len(layout):
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
Markus Holzer's avatar
Markus Holzer committed
209
210
        for lo, bs in zip(reversed(layout), sorted_block_size):
            result[lo] = bs
211
212
        return tuple(result[:len(layout)])

213
    def max_threads_per_block(self):
214
215
216
217
218
219
220
        if is_integer_sequence(self._block_size):
            return prod(self._block_size)
        else:
            return None

    def symbolic_parameters(self):
        return set(b for b in self._block_size if isinstance(b, sp.Symbol))
221

222
223
224
225

class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
Martin Bauer's avatar
Martin Bauer committed
226
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
227
228
229
230
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
    maximum amount of threads allowed in a CUDA block (which depends on device).
    """

231
    def __init__(self, field, iteration_slice):
Martin Bauer's avatar
Martin Bauer committed
232
233
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatial_dimensions > 4:
234
235
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

Martin Bauer's avatar
Martin Bauer committed
236
        coordinates = available_indices[:field.spatial_dimensions]
237

Martin Bauer's avatar
Martin Bauer committed
238
239
        fastest_coordinate = field.layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
240

241
        self._coordinates = coordinates
Martin Bauer's avatar
Martin Bauer committed
242
243
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
244

245
246
    @property
    def coordinates(self):
Martin Bauer's avatar
Martin Bauer committed
247
        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
248

Martin Bauer's avatar
Martin Bauer committed
249
250
    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
251

Martin Bauer's avatar
Martin Bauer committed
252
253
254
        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)
255

Martin Bauer's avatar
Martin Bauer committed
256
257
        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self._coordinates:
258
259
                return 1
            else:
Martin Bauer's avatar
Martin Bauer committed
260
                idx = self._coordinates.index(cuda_idx)
261
                return widths[idx]
262

Martin Bauer's avatar
Martin Bauer committed
263
264
        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
265

Martin Bauer's avatar
Martin Bauer committed
266
267
    def guard(self, kernel_content, arr_shape):
        return kernel_content
268

269
270
271
    def max_threads_per_block(self):
        return None

272
273
    def symbolic_parameters(self):
        return set()
274

275

276
277
# -------------------------------------- Helper functions --------------------------------------------------------------

Martin Bauer's avatar
Martin Bauer committed
278
def _get_start_from_slice(iteration_slice):
279
    res = []
Martin Bauer's avatar
Martin Bauer committed
280
281
282
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
283
        else:
Martin Bauer's avatar
Martin Bauer committed
284
285
            assert isinstance(slice_component, int)
            res.append(slice_component)
286
287
288
    return res


Martin Bauer's avatar
Martin Bauer committed
289
290
def _get_end_from_slice(iteration_slice, arr_shape):
    iter_slice = normalize_slice(iteration_slice, arr_shape)
291
    res = []
Martin Bauer's avatar
Martin Bauer committed
292
293
294
    for slice_component in iter_slice:
        if type(slice_component) is slice:
            res.append(slice_component.stop)
295
        else:
Martin Bauer's avatar
Martin Bauer committed
296
297
            assert isinstance(slice_component, int)
            res.append(slice_component + 1)
298
299
    return res

Martin Bauer's avatar
Martin Bauer committed
300

Martin Bauer's avatar
Martin Bauer committed
301
302
303
304
305
306
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    if isinstance(gpu_indexing, str):
        if gpu_indexing == 'block':
            indexing_creator = BlockIndexing
        elif gpu_indexing == 'line':
            indexing_creator = LineIndexing
Martin Bauer's avatar
Martin Bauer committed
307
        else:
Martin Bauer's avatar
Martin Bauer committed
308
309
310
311
            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
        if gpu_indexing_params:
            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
        return indexing_creator
Martin Bauer's avatar
Martin Bauer committed
312
    else:
Martin Bauer's avatar
Martin Bauer committed
313
        return gpu_indexing