From f504b40f293b8af49b1ee0eb20f5bfc5045eae04 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Wed, 24 Apr 2019 13:03:26 +0200
Subject: [PATCH] Improvements for GPU code generation

- turned on restrict keyword by default (makes a large difference on GPUs)
- smarter block indexing: the block size is adapted to the domain size
  Example: previously there were (1, 1, 1) blocks when the requested block size
  was (64, 1, 1) and the domain size (1, 512, 512); now the block size is
  changed automatically to (1, 64, 1) in this case
- added __launch_bounds__ to kernels to allow better optimizations by the
  CUDA compiler
---
 pystencils/backends/cbackend.py  |  8 +++++++-
 pystencils/gpucuda/cudajit.py    |  2 +-
 pystencils/gpucuda/indexing.py   | 33 ++++++++++++++++++++++++--------
 pystencils/integer_functions.py  | 29 ++++++++++++++++++++++++++++
 pystencils/kernelparameters.py   |  2 +-
 pystencils/sympyextensions.py    |  1 +
 pystencils_tests/test_cudagpu.py | 12 +++++++++++-
 7 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 95ca58e..3d0f224 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -131,7 +131,13 @@ class CBackend:
 
     def _print_KernelFunction(self, node):
         function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()]
-        func_declaration = "FUNC_PREFIX void %s(%s)" % (node.function_name, ", ".join(function_arguments))
+        launch_bounds = ""
+        if self._dialect == 'cuda':
+            max_threads = node.indexing.max_threads_per_block()
+            if max_threads:
+                launch_bounds = "__launch_bounds__({}) ".format(max_threads)
+        func_declaration = "FUNC_PREFIX %svoid %s(%s)" % (launch_bounds, node.function_name,
+                                                          ", ".join(function_arguments))
         if self._signatureOnly:
             return func_declaration
 
diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index 3d55bce..90c23b1 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -36,7 +36,7 @@ def make_python_function(kernel_function_node, argument_dict=None):
     code += "#define FUNC_PREFIX __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_c(kernel_function_node, dialect='cuda'))
-    options = options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets", "-use_fast_math"]
+    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
     if USE_FAST_MATH:
         options.append("-use_fast_math")
     mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()])
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index 403d679..80301a5 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -2,11 +2,13 @@ import abc
 from typing import Tuple  # noqa
 import sympy as sp
 from pystencils.astnodes import Conditional, Block
-from pystencils.integer_functions import div_ceil
+from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
 from pystencils.data_types import TypedSymbol, create_type
 from functools import partial
 
+from pystencils.sympyextensions import prod
+
 AUTO_BLOCK_SIZE_LIMITING = False
 
 BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
@@ -59,6 +61,11 @@ class AbstractIndexing(abc.ABC):
             ast node, which is put inside the kernel function
         """
 
+    @abc.abstractmethod
+    def max_threads_per_block(self):
+        """Return maximal number of threads per block for launch bounds. If this cannot be determined without
+           knowing the array shape return None for unknown """
+
 
 
 # -------------------------------------------- Implementations ---------------------------------------------------------
 
@@ -73,7 +80,7 @@ class BlockIndexing(AbstractIndexing):
             gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
-    def __init__(self, field, iteration_slice=None,
+    def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -108,10 +115,14 @@ class BlockIndexing(AbstractIndexing):
         extend_bs = (1,) * (3 - len(self._block_size))
         block_size = self._block_size + extend_bs
         if not self._compile_time_block_size:
-            block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs
-
-        grid = tuple(div_ceil(length, block_size)
-                     for length, block_size in zip(widths, block_size))
+            assert len(block_size) == 3
+            adapted_block_size = []
+            for i in range(len(widths)):
+                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
+                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
+            block_size = tuple(adapted_block_size) + extend_bs
+
+        grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
         extend_gr = (1,) * (3 - len(grid))
 
         return {'block': block_size,
@@ -128,7 +139,7 @@ class BlockIndexing(AbstractIndexing):
 
     @staticmethod
     def limit_block_size_to_device_maximum(block_size):
-        """Changes block size according to match device limits.
+        """Changes block size to match device limits.
 
         * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
         * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
@@ -229,6 +240,9 @@ class BlockIndexing(AbstractIndexing):
                 result[l] = bs
         return tuple(result[:len(layout)])
 
+    def max_threads_per_block(self):
+        return prod(self._block_size)
+
 
 class LineIndexing(AbstractIndexing):
     """
     Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D
     CUDA block. The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to
     block_idx.{x,y,z}
     This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
-    def __init__(self, field, iteration_slice=None):
+    def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
         if field.spatial_dimensions > 4:
             raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@@ -276,6 +290,9 @@ class LineIndexing(AbstractIndexing):
 
     def guard(self, kernel_content, arr_shape):
         return kernel_content
 
+    def max_threads_per_block(self):
+        return None
+
 
 # -------------------------------------- Helper functions --------------------------------------------------------------
 
diff --git a/pystencils/integer_functions.py b/pystencils/integer_functions.py
index 9369f23..b1ac944 100644
--- a/pystencils/integer_functions.py
+++ b/pystencils/integer_functions.py
@@ -99,3 +99,32 @@ class div_ceil(sp.Function):
         assert dtype.is_int()
         code = "( ({0}) % ({1}) == 0 ? ({dtype})({0}) / ({dtype})({1}) : ( ({dtype})({0}) / ({dtype})({1}) ) +1 )"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
+
+
+# noinspection PyPep8Naming
+class div_floor(sp.Function):
+    """Integer division
+
+    Examples:
+        >>> div_floor(9, 4)
+        2
+        >>> div_floor(8, 4)
+        2
+        >>> from pystencils import TypedSymbol
+        >>> a, b = TypedSymbol("a", "int64"), TypedSymbol("b", "int32")
+        >>> div_floor(a, b).to_c(str)
+        '((int64_t)(a) / (int64_t)(b))'
+    """
+    nargs = 2
+
+    def __new__(cls, integer, divisor):
+        if is_integer_sequence((integer, divisor)):
+            return integer // divisor
+        else:
+            return super().__new__(cls, integer, divisor)
+
+    def to_c(self, print_func):
+        dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
+        assert dtype.is_int()
+        code = "(({dtype})({0}) / ({dtype})({1}))"
+        return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
diff --git a/pystencils/kernelparameters.py b/pystencils/kernelparameters.py
index 25869d2..f6e19ef 100644
--- a/pystencils/kernelparameters.py
+++ b/pystencils/kernelparameters.py
@@ -76,7 +76,7 @@ class FieldPointerSymbol(TypedSymbol):
 
     def __new_stage2__(cls, field_name, field_dtype, const):
         name = "_data_{name}".format(name=field_name)
-        dtype = PointerType(get_base_type(field_dtype), const=const, restrict=False)
+        dtype = PointerType(get_base_type(field_dtype), const=const, restrict=True)
         obj = super(FieldPointerSymbol, cls).__xnew__(cls, name, dtype)
         obj.field_name = field_name
         return obj
diff --git a/pystencils/sympyextensions.py b/pystencils/sympyextensions.py
index e0906f8..86614a2 100644
--- a/pystencils/sympyextensions.py
+++ b/pystencils/sympyextensions.py
@@ -179,6 +179,7 @@ def is_constant(expr):
     """
     return len(expr.free_symbols) == 0
 
+
 def subs_additive(expr: sp.Expr, replacement: sp.Expr, subexpression: sp.Expr,
                   required_match_replacement: Optional[Union[int, float]] = 0.5,
                   required_match_original: Optional[Union[int, float]] = None) -> sp.Expr:
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 8de52ba..3afddb1 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,6 +1,6 @@
 import numpy as np
 import sympy as sp
-from pystencils import Field, Assignment
+from pystencils import Field, Assignment, fields
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.gpucuda.indexing import LineIndexing
 from pystencils.slicing import remove_ghost_layers, add_ghost_layers, make_slice
@@ -150,3 +150,13 @@ def test_periodicity():
 def test_block_size_limiting():
     res = BlockIndexing.limit_block_size_to_device_maximum((4096, 4096, 4096))
     assert all(r < 4096 for r in res)
+
+
+def test_block_indexing():
+    f = fields("f: [3D]")
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
+    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
+    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)
+
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
+    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
--
GitLab
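
The block-size adaptation described in the commit message can be illustrated with a small, self-contained sketch. It uses plain Python ints and a made-up helper name (adapt_block_size); in the patch itself the equivalent logic lives inline in BlockIndexing.call_parameters and is written with sympy.Min, prod and div_floor so that it also works for symbolic array shapes.

from math import prod

def adapt_block_size(block_size, widths):
    # Dimensions capped by a small domain extent "donate" their surplus threads
    # to the following dimensions, keeping the total threads per block close to
    # the requested amount.
    adapted = []
    for i in range(len(widths)):
        # threads requested by the previous dimensions vs. threads actually used there
        factor = prod(block_size[:i]) // prod(adapted)
        adapted.append(min(block_size[i] * factor, widths[i]))
    return tuple(adapted)

# Example from the commit message: a (64, 1, 1) request on a (1, 512, 512) domain
# no longer degenerates to (1, 1, 1) blocks but becomes (1, 64, 1).
assert adapt_block_size((64, 1, 1), (1, 512, 512)) == (1, 64, 1)

Since the requested block size is fixed at code-generation time, BlockIndexing.max_threads_per_block() (the product of the requested block size, 64 in this example) is the value that ends up in the __launch_bounds__(...) qualifier emitted by the cbackend.py hunk above.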