From 1754ef2782b504895302e102bef189482e706ac5 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 18 Jun 2019 15:27:31 +0200
Subject: [PATCH] CUDA indexing: clip to maximum CUDA block size

- the previous method did not work for kernels generated for walberla,
  where the block size may be changed at runtime
- a device query does not always work, since the system the code is
  compiled on may have no GPU, or a different GPU than the target system
  -> the maximum block size is now passed as a parameter and is only
  optionally determined by a device query
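Usage sketch (illustration only, not part of the patch; it mirrors the
constructor signature introduced below and the fields/make_slice helpers
already used in test_cudagpu.py):

    import pystencils as ps
    from pystencils.gpucuda.indexing import BlockIndexing

    f = ps.fields("f: [3D]")

    # Cross-compiling on a machine without the target GPU: pass the
    # device limit explicitly (the values shown are the defaults)
    bi = BlockIndexing(f, ps.make_slice[:, :, :], block_size=(16, 16, 1),
                       maximum_block_size=(1024, 1024, 64))

    # Target GPU installed locally: query its limits via pycuda
    bi = BlockIndexing(f, ps.make_slice[:, :, :], maximum_block_size='auto')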
---
 pystencils/gpucuda/indexing.py   | 81 ++++++--------------------
 pystencils_tests/test_cudagpu.py |  5 --
 2 files changed, 15 insertions(+), 71 deletions(-)

diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index 23a7b51ec..5864c4f82 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -9,8 +9,6 @@ from functools import partial
 
 from pystencils.sympyextensions import prod, is_integer_sequence
 
-AUTO_BLOCK_SIZE_LIMITING = False
-
 BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
 THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
 BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
@@ -84,17 +82,26 @@ class BlockIndexing(AbstractIndexing):
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
     def __init__(self, field, iteration_slice,
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
+                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
+                 maximum_block_size=(1024, 1024, 64)):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
 
         if permute_block_size_dependent_on_layout:
             block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
 
-        if AUTO_BLOCK_SIZE_LIMITING:
-            block_size = self.limit_block_size_to_device_maximum(block_size)
-
         self._block_size = block_size
+        if maximum_block_size == 'auto':
+            # Get device limits
+            import pycuda.driver as cuda
+            # noinspection PyUnresolvedReferences
+            import pycuda.autoinit  # NOQA
+            da = cuda.device_attribute
+            device = cuda.Context.get_device()
+            maximum_block_size = tuple(device.get_attribute(a)
+                                       for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
+
+        self._maximum_block_size = maximum_block_size
         self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
         self._dim = field.spatial_dimensions
         self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
@@ -125,6 +132,7 @@ class BlockIndexing(AbstractIndexing):
             adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
         block_size = tuple(adapted_block_size) + extend_bs
 
+        block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
         grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
         extend_gr = (1,) * (3 - len(grid))
 
@@ -140,66 +148,6 @@ class BlockIndexing(AbstractIndexing):
             condition = sp.And(condition, c)
         return Block([Conditional(condition, kernel_content)])
 
-    @staticmethod
-    def limit_block_size_to_device_maximum(block_size):
-        """Changes block size to match device limits.
-
-        * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
-        * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
-          component is multiplied by 2, such that the total amount of threads stays the same
-
-        Returns:
-            the altered block_size
-        """
-        # Get device limits
-        import pycuda.driver as cuda
-        # noinspection PyUnresolvedReferences
-        import pycuda.autoinit  # NOQA
-
-        da = cuda.device_attribute
-        device = cuda.Context.get_device()
-
-        block_size = list(block_size)
-        max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
-        max_block_size = [device.get_attribute(a)
-                          for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
-
-        def prod(seq):
-            result = 1
-            for e in seq:
-                result *= e
-            return result
-
-        def get_index_of_too_big_element():
-            for i, bs in enumerate(block_size):
-                if bs > max_block_size[i]:
-                    return i
-            return None
-
-        def get_index_of_too_small_element():
-            for i, bs in enumerate(block_size):
-                if bs // 2 <= max_block_size[i]:
-                    return i
-            return None
-
-        # Reduce the total number of threads if necessary
-        while prod(block_size) > max_threads:
-            item_to_reduce = block_size.index(max(block_size))
-            for j, block_size_entry in enumerate(block_size):
-                if block_size_entry > max_block_size[j]:
-                    item_to_reduce = j
-            block_size[item_to_reduce] //= 2
-
-        # Cap individual elements
-        too_big_element_index = get_index_of_too_big_element()
-        while too_big_element_index is not None:
-            too_small_element_index = get_index_of_too_small_element()
-            block_size[too_small_element_index] *= 2
-            block_size[too_big_element_index] //= 2
-            too_big_element_index = get_index_of_too_big_element()
-
-        return tuple(block_size)
-
     @staticmethod
     def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
         """Shrinks the block_size if there are too many registers used per multiprocessor.
@@ -307,6 +255,7 @@ class LineIndexing(AbstractIndexing):
     def symbolic_parameters(self):
         return set()
 
+
 # -------------------------------------- Helper functions --------------------------------------------------------------
 
 def _get_start_from_slice(iteration_slice):
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 3afddb128..ad13ac96f 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -147,11 +147,6 @@ def test_periodicity():
     np.testing.assert_equal(cpu_result, gpu_result)
 
 
-def test_block_size_limiting():
-    res = BlockIndexing.limit_block_size_to_device_maximum((4096, 4096, 4096))
-    assert all(r < 4096 for r in res)
-
-
 def test_block_indexing():
     f = fields("f: [3D]")
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
-- 
GitLab