diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 95ca58e42c2eed4b00d9ad230f22b483230127d8..3d0f22457a22942d925b5a2f5c14f935c589a52b 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -131,7 +131,13 @@ class CBackend:
 
     def _print_KernelFunction(self, node):
         function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()]
-        func_declaration = "FUNC_PREFIX void %s(%s)" % (node.function_name, ", ".join(function_arguments))
+        launch_bounds = ""
+        if self._dialect == 'cuda':
+            max_threads = node.indexing.max_threads_per_block()
+            if max_threads:
+                launch_bounds = "__launch_bounds__({}) ".format(max_threads)
+        func_declaration = "FUNC_PREFIX %svoid %s(%s)" % (launch_bounds, node.function_name,
+                                                          ", ".join(function_arguments))
         if self._signatureOnly:
             return func_declaration
 
diff --git a/pystencils/gpucuda/cudajit.py b/pystencils/gpucuda/cudajit.py
index 3d55bce1b68ae2564886b84950e18d8a7626eaf8..90c23b133eba1a5e60e8ecfa80a0a849e2b3ab19 100644
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -36,7 +36,7 @@ def make_python_function(kernel_function_node, argument_dict=None):
     code += "#define FUNC_PREFIX __global__\n"
     code += "#define RESTRICT __restrict__\n\n"
     code += str(generate_c(kernel_function_node, dialect='cuda'))
-    options = options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets", "-use_fast_math"]
+    options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
     if USE_FAST_MATH:
         options.append("-use_fast_math")
     mod = SourceModule(code, options=options, include_dirs=[get_pystencils_include_path()])
diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py
index 403d6790f7f7b06901c28f2d9e6c59c07e1421c1..80301a5bf70c24a80db50dbcc27da306d9cb62f2 100644
--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
@@ -2,11 +2,13 @@ import abc
 from typing import Tuple  # noqa
 import sympy as sp
 from pystencils.astnodes import Conditional, Block
-from pystencils.integer_functions import div_ceil
+from pystencils.integer_functions import div_ceil, div_floor
 from pystencils.slicing import normalize_slice
 from pystencils.data_types import TypedSymbol, create_type
 from functools import partial
 
+from pystencils.sympyextensions import prod
+
 AUTO_BLOCK_SIZE_LIMITING = False
 
 BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
@@ -59,6 +61,11 @@ class AbstractIndexing(abc.ABC):
             ast node, which is put inside the kernel function
         """
 
+    @abc.abstractmethod
+    def max_threads_per_block(self):
+        """Return maximal number of threads per block for launch bounds. If this cannot be determined without
+        knowing the array shape return None for unknown """
+
 
 # -------------------------------------------- Implementations ---------------------------------------------------------
 
@@ -73,7 +80,7 @@ class BlockIndexing(AbstractIndexing):
                                                 gets the largest amount of threads
         compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
     """
-    def __init__(self, field, iteration_slice=None,
+    def __init__(self, field, iteration_slice,
                  block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
         if field.spatial_dimensions > 3:
             raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
@@ -108,10 +115,14 @@ class BlockIndexing(AbstractIndexing):
         extend_bs = (1,) * (3 - len(self._block_size))
         block_size = self._block_size + extend_bs
         if not self._compile_time_block_size:
-            block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs
-
-        grid = tuple(div_ceil(length, block_size)
-                     for length, block_size in zip(widths, block_size))
+            assert len(block_size) == 3
+            adapted_block_size = []
+            for i in range(len(widths)):
+                factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
+                adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
+            block_size = tuple(adapted_block_size) + extend_bs
+
+        grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
         extend_gr = (1,) * (3 - len(grid))
 
         return {'block': block_size,
@@ -128,7 +139,7 @@ class BlockIndexing(AbstractIndexing):
 
     @staticmethod
     def limit_block_size_to_device_maximum(block_size):
-        """Changes block size according to match device limits.
+        """Changes block size to match device limits.
 
         * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
         * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
@@ -229,6 +240,9 @@ class BlockIndexing(AbstractIndexing):
             result[l] = bs
         return tuple(result[:len(layout)])
 
+    def max_threads_per_block(self):
+        return prod(self._block_size)
+
 
 class LineIndexing(AbstractIndexing):
     """
@@ -238,7 +252,7 @@ class LineIndexing(AbstractIndexing):
     maximum amount of threads allowed in a CUDA block (which depends on device).
     """
 
-    def __init__(self, field, iteration_slice=None):
+    def __init__(self, field, iteration_slice):
         available_indices = [THREAD_IDX[0]] + BLOCK_IDX
         if field.spatial_dimensions > 4:
             raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@@ -276,6 +290,9 @@ class LineIndexing(AbstractIndexing):
     def guard(self, kernel_content, arr_shape):
         return kernel_content
 
+    def max_threads_per_block(self):
+        return None
+
 
 # -------------------------------------- Helper functions --------------------------------------------------------------
 
diff --git a/pystencils/integer_functions.py b/pystencils/integer_functions.py
index 9369f236c3cb574f1bac2a6f337878f03c44d089..b1ac944a977717405b1f8c39c2f5a0ceeea14e66 100644
--- a/pystencils/integer_functions.py
+++ b/pystencils/integer_functions.py
@@ -99,3 +99,32 @@ class div_ceil(sp.Function):
         assert dtype.is_int()
         code = "( ({0}) % ({1}) == 0 ? ({dtype})({0}) / ({dtype})({1}) : ( ({dtype})({0}) / ({dtype})({1}) ) +1 )"
         return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
+
+
+# noinspection PyPep8Naming
+class div_floor(sp.Function):
+    """Integer division
+
+    Examples:
+        >>> div_floor(9, 4)
+        2
+        >>> div_floor(8, 4)
+        2
+        >>> from pystencils import TypedSymbol
+        >>> a, b = TypedSymbol("a", "int64"), TypedSymbol("b", "int32")
+        >>> div_floor(a, b).to_c(str)
+        '((int64_t)(a) / (int64_t)(b))'
+    """
+    nargs = 2
+
+    def __new__(cls, integer, divisor):
+        if is_integer_sequence((integer, divisor)):
+            return integer // divisor
+        else:
+            return super().__new__(cls, integer, divisor)
+
+    def to_c(self, print_func):
+        dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
+        assert dtype.is_int()
+        code = "(({dtype})({0}) / ({dtype})({1}))"
+        return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
diff --git a/pystencils/kernelparameters.py b/pystencils/kernelparameters.py
index 25869d2669400f645120cd595265905df827e73a..f6e19ef83def4331d0c3f4c3a45d08605263e984 100644
--- a/pystencils/kernelparameters.py
+++ b/pystencils/kernelparameters.py
@@ -76,7 +76,7 @@ class FieldPointerSymbol(TypedSymbol):
 
     def __new_stage2__(cls, field_name, field_dtype, const):
         name = "_data_{name}".format(name=field_name)
-        dtype = PointerType(get_base_type(field_dtype), const=const, restrict=False)
+        dtype = PointerType(get_base_type(field_dtype), const=const, restrict=True)
         obj = super(FieldPointerSymbol, cls).__xnew__(cls, name, dtype)
         obj.field_name = field_name
         return obj
diff --git a/pystencils/sympyextensions.py b/pystencils/sympyextensions.py
index e0906f84cca886e70a19eb548b36352d6385365d..86614a21224d5891e8890868771c6d9dc693f9c0 100644
--- a/pystencils/sympyextensions.py
+++ b/pystencils/sympyextensions.py
@@ -179,6 +179,7 @@ def is_constant(expr):
     """
     return len(expr.free_symbols) == 0
 
+
 def subs_additive(expr: sp.Expr, replacement: sp.Expr, subexpression: sp.Expr,
                   required_match_replacement: Optional[Union[int, float]] = 0.5,
                   required_match_original: Optional[Union[int, float]] = None) -> sp.Expr:
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 8de52ba490bc2fdc7914d8a194776937562e9481..3afddb128d2bb649f6bc5454d3ddfcd71d2a61a2 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,6 +1,6 @@
 import numpy as np
 import sympy as sp
-from pystencils import Field, Assignment
+from pystencils import Field, Assignment, fields
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.gpucuda.indexing import LineIndexing
 from pystencils.slicing import remove_ghost_layers, add_ghost_layers, make_slice
@@ -150,3 +150,13 @@ def test_periodicity():
 def test_block_size_limiting():
     res = BlockIndexing.limit_block_size_to_device_maximum((4096, 4096, 4096))
     assert all(r < 4096 for r in res)
+
+
+def test_block_indexing():
+    f = fields("f: [3D]")
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
+    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
+    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)
+
+    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
+    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)