Commit f504b40f authored by Martin Bauer

Improvements for GPU code generation

- turned on the restrict keyword by default (makes a large difference on GPUs)
- smarter block indexing: the block size is changed depending on the domain size.
  Example: previously there were (1, 1, 1) blocks when the requested
  block size was (64, 1, 1) and the domain size (1, 512, 512); now the
  block size is changed automatically to (1, 64, 1) in this case
  (a standalone sketch of this reshaping follows below)
- added __launch_bounds__ to kernels to allow better optimizations by
  the CUDA compiler
parent d8e498fa
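The block-size reshaping can be illustrated with a minimal standalone sketch that uses plain Python integers instead of the sympy expressions in BlockIndexing.call_parameters; reshape_block_size and prod below are illustrative helpers, not pystencils API:

from functools import reduce
import operator


def prod(seq):
    """Product of a sequence (1 for an empty sequence)."""
    return reduce(operator.mul, seq, 1)


def reshape_block_size(block_size, domain_size):
    """Shift threads out of dimensions that are larger than the domain and into
    dimensions where the domain still has room, keeping the total thread count
    roughly constant."""
    adapted = []
    for i in range(len(domain_size)):
        # threads freed up in the already processed dimensions can be spent here
        factor = prod(block_size[:i]) // prod(adapted)
        adapted.append(min(block_size[i] * factor, domain_size[i]))
    return tuple(adapted)


print(reshape_block_size((64, 1, 1), (1, 512, 512)))  # (1, 64, 1), the example above
print(reshape_block_size((16, 8, 2), (32, 2, 32)))    # (16, 2, 8), matches test_block_indexing below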
@@ -131,7 +131,13 @@ class CBackend:
     def _print_KernelFunction(self, node):
         function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()]
-        func_declaration = "FUNC_PREFIX void %s(%s)" % (node.function_name, ", ".join(function_arguments))
+        launch_bounds = ""
+        if self._dialect == 'cuda':
+            max_threads = node.indexing.max_threads_per_block()
+            if max_threads:
+                launch_bounds = "__launch_bounds__({}) ".format(max_threads)
+        func_declaration = "FUNC_PREFIX %svoid %s(%s)" % (launch_bounds, node.function_name,
+                                                          ", ".join(function_arguments))
         if self._signatureOnly:
             return func_declaration
...
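For illustration, a rough sketch of the declaration string the changed _print_KernelFunction now builds for the CUDA dialect; the kernel name and the argument string are invented for this example, and FUNC_PREFIX / RESTRICT are #defined as __global__ / __restrict__ by the CUDA jit module shown in the next hunk:

launch_bounds = "__launch_bounds__({}) ".format(256)  # e.g. a (16, 16, 1) block gives 256 threads
func_declaration = "FUNC_PREFIX %svoid %s(%s)" % (launch_bounds, "kernel", "double * RESTRICT _data_f")
print(func_declaration)
# FUNC_PREFIX __launch_bounds__(256) void kernel(double * RESTRICT _data_f)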
@@ -36,7 +36,7 @@ def make_python_function(kernel_function_node, argument_dict=None):
     code += "#define FUNC_PREFIX __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_c(kernel_function_node, dialect='cuda'))
-    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets", "-use_fast_math"]
+    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
     if USE_FAST_MATH:
         options.append("-use_fast_math")
     mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()])
...
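A minimal sketch of the resulting nvcc option list, assuming the module-level USE_FAST_MATH flag is True (its actual default is not visible in this diff):

USE_FAST_MATH = True
options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
if USE_FAST_MATH:
    options.append("-use_fast_math")
print(options)  # ['-w', '-std=c++11', '-Wno-deprecated-gpu-targets', '-use_fast_math']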
@@ -2,11 +2,13 @@ import abc
 from typing import Tuple  # noqa
 import sympy as sp
 from pystencils.astnodes import Conditional, Block
-from pystencils.integer_functions import div_ceil
+from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
 from pystencils.data_types import TypedSymbol, create_type
 from functools import partial
+from pystencils.sympyextensions import prod

 AUTO_BLOCK_SIZE_LIMITING = False

 BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
@@ -59,6 +61,11 @@ class AbstractIndexing(abc.ABC):
         ast node, which is put inside the kernel function
         """

+    @abc.abstractmethod
+    def max_threads_per_block(self):
+        """Return maximal number of threads per block for launch bounds. If this cannot be determined without
+        knowing the array shape return None for unknown"""
+

 # -------------------------------------------- Implementations ---------------------------------------------------------
@@ -73,7 +80,7 @@ class BlockIndexing(AbstractIndexing):
                     gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
-    def __init__(self, field, iteration_slice=None,
+    def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -108,10 +115,14 @@ class BlockIndexing(AbstractIndexing):
         extend_bs = (1,) * (3 - len(self._block_size))
         block_size = self._block_size + extend_bs
         if not self._compile_time_block_size:
-            block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs
+            assert len(block_size) == 3
+            adapted_block_size = []
+            for i in range(len(widths)):
+                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
+                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
+            block_size = tuple(adapted_block_size) + extend_bs

-        grid = tuple(div_ceil(length, block_size)
-                     for length, block_size in zip(widths, block_size))
+        grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
         extend_gr = (1,) * (3 - len(grid))

         return {'block': block_size,
@@ -128,7 +139,7 @@ class BlockIndexing(AbstractIndexing):
     @staticmethod
     def limit_block_size_to_device_maximum(block_size):
-        """Changes block size according to match device limits.
+        """Changes block size to match device limits.

         * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
         * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
@@ -229,6 +240,9 @@ class BlockIndexing(AbstractIndexing):
             result[l] = bs
         return tuple(result[:len(layout)])

+    def max_threads_per_block(self):
+        return prod(self._block_size)
+

 class LineIndexing(AbstractIndexing):
     """
@@ -238,7 +252,7 @@ class LineIndexing(AbstractIndexing):
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
-    def __init__(self, field, iteration_slice=None):
+    def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
         if field.spatial_dimensions > 4:
             raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@@ -276,6 +290,9 @@ class LineIndexing(AbstractIndexing):
     def guard(self, kernel_content, arr_shape):
         return kernel_content

+    def max_threads_per_block(self):
+        return None
+

 # -------------------------------------- Helper functions --------------------------------------------------------------
...
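A hedged end-to-end sketch of how the new indexing behaviour surfaces when generating a GPU kernel. The calls below (fields, Assignment, create_kernel with target='gpu', show_code) follow the pystencils API of this era, but the exact signatures at this commit may differ slightly:

import pystencils as ps

f, g = ps.fields("f, g: double[2D]")
update = ps.Assignment(g[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) * 0.25)
ast = ps.create_kernel([update], target='gpu')  # GPU kernels use BlockIndexing by default
ps.show_code(ast)  # the printed kernel declaration should now carry __launch_bounds__(...)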
@@ -99,3 +99,32 @@ class div_ceil(sp.Function):
         assert dtype.is_int()
         code = "( ({0}) % ({1}) == 0 ? ({dtype})({0}) / ({dtype})({1}) : ( ({dtype})({0}) / ({dtype})({1}) ) +1 )"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
+
+
+# noinspection PyPep8Naming
+class div_floor(sp.Function):
+    """Integer division
+
+    Examples:
+        >>> div_floor(9, 4)
+        2
+        >>> div_floor(8, 4)
+        2
+        >>> from pystencils import TypedSymbol
+        >>> a, b = TypedSymbol("a", "int64"), TypedSymbol("b", "int32")
+        >>> div_floor(a, b).to_c(str)
+        '((int64_t)(a) / (int64_t)(b))'
+    """
+    nargs = 2
+
+    def __new__(cls, integer, divisor):
+        if is_integer_sequence((integer, divisor)):
+            return integer // divisor
+        else:
+            return super().__new__(cls, integer, divisor)
+
+    def to_c(self, print_func):
+        dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
+        assert dtype.is_int()
+        code = "(({dtype})({0}) / ({dtype})({1}))"
+        return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
@@ -76,7 +76,7 @@ class FieldPointerSymbol(TypedSymbol):
     def __new_stage2__(cls, field_name, field_dtype, const):
         name = "_data_{name}".format(name=field_name)
-        dtype = PointerType(get_base_type(field_dtype), const=const, restrict=False)
+        dtype = PointerType(get_base_type(field_dtype), const=const, restrict=True)
         obj = super(FieldPointerSymbol, cls).__xnew__(cls, name, dtype)
         obj.field_name = field_name
         return obj
...
@@ -179,6 +179,7 @@ def is_constant(expr):
     """
     return len(expr.free_symbols) == 0


 def subs_additive(expr: sp.Expr, replacement: sp.Expr, subexpression: sp.Expr,
                   required_match_replacement: Optional[Union[int, float]] = 0.5,
                   required_match_original: Optional[Union[int, float]] = None) -> sp.Expr:
...
 import numpy as np
 import sympy as sp
-from pystencils import Field, Assignment
+from pystencils import Field, Assignment, fields
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.gpucuda.indexing import LineIndexing
 from pystencils.slicing import remove_ghost_layers, add_ghost_layers, make_slice
...
@@ -150,3 +150,13 @@ def test_periodicity():

 def test_block_size_limiting():
     res = BlockIndexing.limit_block_size_to_device_maximum((4096, 4096, 4096))
     assert all(r < 4096 for r in res)
+
+
+def test_block_indexing():
+    f = fields("f: [3D]")
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
+    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
+    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)
+
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
+    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
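As a hand-worked check of the second assertion above (block_size=(16, 8, 2), widths=(32, 2, 32)), the adapted-block-size loop added in this commit proceeds as: i=0: factor 1, min(16, 32) = 16; i=1: factor 16 // 16 = 1, min(8, 2) = 2; i=2: factor 128 // 32 = 4, min(2 * 4, 32) = 8; that is, the asserted (16, 2, 8).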