diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 95ca58e42c2eed4b00d9ad230f22b483230127d8..3d0f22457a22942d925b5a2f5c14f935c589a52b 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -131,7 +131,13 @@ class CBackend: def _print_KernelFunction(self, node): function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()] - func_declaration = "FUNC_PREFIX void %s(%s)" % (node.function_name, ", ".join(function_arguments)) + launch_bounds = "" + if self._dialect == 'cuda': + max_threads = node.indexing.max_threads_per_block() + if max_threads: + launch_bounds = "__launch_bounds__({}) ".format(max_threads) + func_declaration = "FUNC_PREFIX %svoid %s(%s)" % (launch_bounds, node.function_name, + ", ".join(function_arguments)) if self._signatureOnly: return func_declaration diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py index 3d55bce1b68ae2564886b84950e18d8a7626eaf8..90c23b133eba1a5e60e8ecfa80a0a849e2b3ab19 100644 --- a/pystencils/gpucuda/cudajit.py +++ b/pystencils/gpucuda/cudajit.py @@ -36,7 +36,7 @@ def make_python_function(kernel_function_node, argument_dict=None): code += "#define FUNC_PREFIX __global__\n" code += "#define RESTRICT __restrict__\n\n" code += str(generate_c(kernel_function_node, dialect='cuda')) - options = options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets", "-use_fast_math"] + options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] if USE_FAST_MATH: options.append("-use_fast_math") mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()]) diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py index 403d6790f7f7b06901c28f2d9e6c59c07e1421c1..80301a5bf70c24a80db50dbcc27da306d9cb62f2 100644 --- a/pystencils/gpucuda/indexing.py +++ b/pystencils/gpucuda/indexing.py @@ -2,11 +2,13 @@ import abc from typing import Tuple # noqa import sympy as sp from pystencils.astnodes import Conditional, Block -from pystencils.integer_functions import div_ceil +from pystencils.integer_functions import div_ceil, div_floor from pystencils.slicing import normalize_slice from pystencils.data_types import TypedSymbol, create_type from functools import partial +from pystencils.sympyextensions import prod + AUTO_BLOCK_SIZE_LIMITING = False BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')] @@ -59,6 +61,11 @@ class AbstractIndexing(abc.ABC): ast node, which is put inside the kernel function """ + @abc.abstractmethod + def max_threads_per_block(self): + """Return maximal number of threads per block for launch bounds. If this cannot be determined without + knowing the array shape return None for unknown """ + # -------------------------------------------- Implementations --------------------------------------------------------- @@ -73,7 +80,7 @@ class BlockIndexing(AbstractIndexing): gets the largest amount of threads compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used """ - def __init__(self, field, iteration_slice=None, + def __init__(self, field, iteration_slice, block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False): if field.spatial_dimensions > 3: raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions") @@ -108,10 +115,14 @@ class BlockIndexing(AbstractIndexing): extend_bs = (1,) * (3 - len(self._block_size)) block_size = self._block_size + extend_bs if not self._compile_time_block_size: - block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs - - grid = tuple(div_ceil(length, block_size) - for length, block_size in zip(widths, block_size)) + assert len(block_size) == 3 + adapted_block_size = [] + for i in range(len(widths)): + factor = div_floor(prod(block_size[:i]), prod(adapted_block_size)) + adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i])) + block_size = tuple(adapted_block_size) + extend_bs + + grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size)) extend_gr = (1,) * (3 - len(grid)) return {'block': block_size, @@ -128,7 +139,7 @@ class BlockIndexing(AbstractIndexing): @staticmethod def limit_block_size_to_device_maximum(block_size): - """Changes block size according to match device limits. + """Changes block size to match device limits. * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2. * next, if one component is still too big, the component which is too big is divided by 2 and the smallest @@ -229,6 +240,9 @@ class BlockIndexing(AbstractIndexing): result[l] = bs return tuple(result[:len(layout)]) + def max_threads_per_block(self): + return prod(self._block_size) + class LineIndexing(AbstractIndexing): """ @@ -238,7 +252,7 @@ class LineIndexing(AbstractIndexing): maximum amount of threads allowed in a CUDA block (which depends on device). """ - def __init__(self, field, iteration_slice=None): + def __init__(self, field, iteration_slice): available_indices = [THREAD_IDX[0]] + BLOCK_IDX if field.spatial_dimensions > 4: raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") @@ -276,6 +290,9 @@ class LineIndexing(AbstractIndexing): def guard(self, kernel_content, arr_shape): return kernel_content + def max_threads_per_block(self): + return None + # -------------------------------------- Helper functions -------------------------------------------------------------- diff --git a/pystencils/integer_functions.py b/pystencils/integer_functions.py index 9369f236c3cb574f1bac2a6f337878f03c44d089..b1ac944a977717405b1f8c39c2f5a0ceeea14e66 100644 --- a/pystencils/integer_functions.py +++ b/pystencils/integer_functions.py @@ -99,3 +99,32 @@ class div_ceil(sp.Function): assert dtype.is_int() code = "( ({0}) % ({1}) == 0 ? ({dtype})({0}) / ({dtype})({1}) : ( ({dtype})({0}) / ({dtype})({1}) ) +1 )" return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype) + + +# noinspection PyPep8Naming +class div_floor(sp.Function): + """Integer division + + Examples: + >>> div_floor(9, 4) + 2 + >>> div_floor(8, 4) + 2 + >>> from pystencils import TypedSymbol + >>> a, b = TypedSymbol("a", "int64"), TypedSymbol("b", "int32") + >>> div_floor(a, b).to_c(str) + '((int64_t)(a) / (int64_t)(b))' + """ + nargs = 2 + + def __new__(cls, integer, divisor): + if is_integer_sequence((integer, divisor)): + return integer // divisor + else: + return super().__new__(cls, integer, divisor) + + def to_c(self, print_func): + dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1]))) + assert dtype.is_int() + code = "(({dtype})({0}) / ({dtype})({1}))" + return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype) diff --git a/pystencils/kernelparameters.py b/pystencils/kernelparameters.py index 25869d2669400f645120cd595265905df827e73a..f6e19ef83def4331d0c3f4c3a45d08605263e984 100644 --- a/pystencils/kernelparameters.py +++ b/pystencils/kernelparameters.py @@ -76,7 +76,7 @@ class FieldPointerSymbol(TypedSymbol): def __new_stage2__(cls, field_name, field_dtype, const): name = "_data_{name}".format(name=field_name) - dtype = PointerType(get_base_type(field_dtype), const=const, restrict=False) + dtype = PointerType(get_base_type(field_dtype), const=const, restrict=True) obj = super(FieldPointerSymbol, cls).__xnew__(cls, name, dtype) obj.field_name = field_name return obj diff --git a/pystencils/sympyextensions.py b/pystencils/sympyextensions.py index e0906f84cca886e70a19eb548b36352d6385365d..86614a21224d5891e8890868771c6d9dc693f9c0 100644 --- a/pystencils/sympyextensions.py +++ b/pystencils/sympyextensions.py @@ -179,6 +179,7 @@ def is_constant(expr): """ return len(expr.free_symbols) == 0 + def subs_additive(expr: sp.Expr, replacement: sp.Expr, subexpression: sp.Expr, required_match_replacement: Optional[Union[int, float]] = 0.5, required_match_original: Optional[Union[int, float]] = None) -> sp.Expr: diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py index 8de52ba490bc2fdc7914d8a194776937562e9481..3afddb128d2bb649f6bc5454d3ddfcd71d2a61a2 100644 --- a/pystencils_tests/test_cudagpu.py +++ b/pystencils_tests/test_cudagpu.py @@ -1,6 +1,6 @@ import numpy as np import sympy as sp -from pystencils import Field, Assignment +from pystencils import Field, Assignment, fields from pystencils.simp import sympy_cse_on_assignment_list from pystencils.gpucuda.indexing import LineIndexing from pystencils.slicing import remove_ghost_layers, add_ghost_layers, make_slice @@ -150,3 +150,13 @@ def test_periodicity(): def test_block_size_limiting(): res = BlockIndexing.limit_block_size_to_device_maximum((4096, 4096, 4096)) assert all(r < 4096 for r in res) + + +def test_block_indexing(): + f = fields("f: [3D]") + bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False) + assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32) + assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8) + + bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False) + assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)