diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 5725eac23a734f88dbe3ea215a460d2037cde53c..890d77f124479016436f5b0de7ba3f9eb49384fa 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -6,7 +6,11 @@ waLBerla_link_files_to_builddir( "simulation_setup" )
 
 foreach(streaming_pattern pull push aa esotwist)
     foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
+        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
+	    # KBC (entropic) methods are only defined for the D2Q9 and D3Q27 stencils
+	    if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
+		    continue()
+	    endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
             set(config ${stencil}_${streaming_pattern}_${collision_setup})
             waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
                     FILE UniformGridGPU.py
@@ -36,4 +40,4 @@ foreach(streaming_pattern pull push aa esotwist)
 
         endforeach ()
     endforeach()
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index 86f5ae3b73357f0217b24285a882d4426e58b70a..ce61f5f0e926d2e865dcf47e56e0ee7ef9379f47 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -13,6 +13,7 @@ from lbmpy.advanced_streaming.utility import streaming_patterns
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.moments import get_default_moment_set_for_stencil
 from lbmpy.updatekernels import create_stream_only_kernel
 from lbmpy.fieldaccess import *
 
@@ -22,6 +23,7 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info
 omega = sp.symbols("omega")
 omega_free = sp.Symbol("omega_free")
 compile_time_block_size = False
+max_threads = None
 
 if compile_time_block_size:
     sweep_block_size = (128, 1, 1)
@@ -41,34 +43,48 @@ options_dict = {
     'trt': {
         'method': Method.TRT,
         'relaxation_rate': omega,
+        'compressible': False,
     },
     'mrt': {
         'method': Method.MRT,
         'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
+        'compressible': False,
     },
     'mrt-overrelax': {
         'method': Method.MRT,
         'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
+        'compressible': False,
+    },
+    'central': {
+        'method': Method.CENTRAL_MOMENT,
+        'relaxation_rate': omega,
+        'compressible': True,
+    },
+    'central-overrelax': {
+        'method': Method.CENTRAL_MOMENT,
+        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
+        'compressible': True,
     },
     'cumulant': {
-        'method': Method.CUMULANT,
+        'method': Method.MONOMIAL_CUMULANT,
         'relaxation_rate': omega,
         'compressible': True,
     },
     'cumulant-overrelax': {
-        'method': Method.CUMULANT,
-        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
+        'method': Method.MONOMIAL_CUMULANT,
+        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
         'compressible': True,
     },
     'entropic': {
-        'method': Method.MRT,
+        'method': Method.TRT_KBC_N4,
         'compressible': True,
-        'relaxation_rates': [omega, omega] + [omega_free] * 6,
+        'relaxation_rates': [omega, omega_free],
         'entropic': True,
+        'entropic_newton_iterations': False
     },
     'smagorinsky': {
         'method': Method.SRT,
-        'smagorinsky': True,
+        'smagorinsky': False,
         'relaxation_rate': omega,
     }
 }
@@ -96,10 +112,12 @@ with CodeGeneration() as ctx:
     if len(config_tokens) >= 4:
         optimize = (config_tokens[3] != 'noopt')
 
-    if stencil_str == "D3Q27":
+    if stencil_str == "d3q27":
         stencil = LBStencil(Stencil.D3Q27)
-    else:
+    elif stencil_str == "d3q19":
         stencil = LBStencil(Stencil.D3Q19)
+    else:
+        raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
 
     assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
 
@@ -114,6 +132,9 @@ with CodeGeneration() as ctx:
     lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
     lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
 
+    if lbm_config.method == Method.CENTRAL_MOMENT:
+        lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
+
     if not is_inplace(streaming_pattern):
         lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
         field_swaps = [(pdfs, pdfs_tmp)]
@@ -145,18 +166,21 @@ with CodeGeneration() as ctx:
 
     generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, lbm_config=lbm_config,
                                    lbm_optimisation=lbm_opt, target=ps.Target.GPU,
-                                   inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps)
+                                   gpu_indexing_params=gpu_indexing_params,
+                                   inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps,
+                                   max_threads=max_threads)
 
     # getter & setter
     setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector,
                                                    pdfs=pdfs,
                                                    streaming_pattern=streaming_pattern,
                                                    previous_timestep=Timestep.EVEN)
-    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU)
+    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU, max_threads=max_threads)
 
     # Stream only kernel
     generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
-                   gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU)
+                   gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU,
+                   max_threads=max_threads)
 
     # Boundaries
     noslip = NoSlip()
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
index a1816eba1569722be2edf70c04a70078bbabf4f0..50d9bfd756b4bc5d463ac9848e084945e32da1bd 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -18,8 +18,8 @@ from math import prod
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
 # increase this to get more reliable measurements
-TIME_STEPS_FOR_128_BLOCK = 500
-DB_FILE = "gpu_benchmark.sqlite3"
+TIME_STEPS_FOR_128_BLOCK = 1000
+DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
 
 BASE_CONFIG = {
     'DomainSetup': {
@@ -129,6 +129,7 @@ class Scenario:
         num_tries = 4
         # check multiple times e.g. may fail when multiple benchmark processes are running
         table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
+        table_name = table_name.replace("-", "_")
         for num_try in range(num_tries):
             try:
                 checkAndUpdateSchema(result, table_name, DB_FILE)
@@ -193,7 +194,7 @@ def single_gpu_benchmark():
         additional_info['gpu_type'] = gpu_type
 
     scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (64, 128, 256, 320, 384, 448, 512)]
+    block_sizes = [(i, i, i) for i in (32, 64, 128, 256)]
     cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
                    (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
                    (32, 4, 1), (64, 4, 1), (128, 4, 1),
@@ -201,6 +202,9 @@ def single_gpu_benchmark():
                    (32, 16, 1)]
     for block_size in block_sizes:
         for cuda_block_size in cuda_blocks:
+            # e.g. cuda_block_size = (256, 1, 1) with block_size = (64, 64, 64) would be cut down to (64, 1, 1), so skip oversized CUDA blocks (NOTE(review): the comparison below is lexicographic tuple comparison — confirm this matches the intent for all listed cuda_blocks)
+            if cuda_block_size > block_size:
+                continue
             if not cuda_block_size_ok(cuda_block_size):
                 wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
                 continue
@@ -210,7 +214,7 @@ def single_gpu_benchmark():
             scenario = Scenario(cells_per_block=block_size,
                                 cuda_blocks=cuda_block_size,
                                 time_step_strategy='kernelOnly',
-                                timesteps=num_time_steps(block_size),
+                                timesteps=num_time_steps(block_size, 2000),
                                 additional_info=additional_info)
             scenarios.add(scenario)
 
diff --git a/python/lbmpy_walberla/alternating_sweeps.py b/python/lbmpy_walberla/alternating_sweeps.py
index 587217690cb464a8e8f6dbaa2a535aa0c1835f0b..dbcc1ab54e618101658a2c2262dac946f9d99805 100644
--- a/python/lbmpy_walberla/alternating_sweeps.py
+++ b/python/lbmpy_walberla/alternating_sweeps.py
@@ -63,7 +63,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
                                    namespace='lbm', field_swaps=(), varying_parameters=(),
                                    inner_outer_split=False, ghost_layers_to_include=0,
                                    target=Target.CPU, data_type=None,
-                                   cpu_openmp=None, cpu_vectorize_info=None,
+                                   cpu_openmp=None, cpu_vectorize_info=None, max_threads=None,
                                    **kernel_parameters):
     """Generates an Alternating lattice Boltzmann sweep class. This is in particular meant for
     in-place streaming patterns, but can of course also be used with two-fields patterns (why make it
@@ -87,6 +87,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
         data_type: default datatype for the kernel creation. Default is double
         cpu_openmp: if loops should use openMP or not.
         cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set.
+        max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`.
         kernel_parameters: other parameters passed to the creation of a pystencils.CreateKernelConfig
     """
     config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp,
@@ -120,4 +121,4 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
                              target=target, namespace=namespace,
                              field_swaps=field_swaps, varying_parameters=varying_parameters,
                              inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include,
-                             cpu_vectorize_info=vec_info, cpu_openmp=openmp)
+                             cpu_vectorize_info=vec_info, cpu_openmp=openmp, max_threads=max_threads)
diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
index 7e27e19cc4ca7e531286d98d181425ce78e563a5..6816ac87ec90e22bc93b5086b77f1ff8d259d760 100644
--- a/python/pystencils_walberla/codegen.py
+++ b/python/pystencils_walberla/codegen.py
@@ -29,7 +29,7 @@ __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field
 def generate_sweep(generation_context, class_name, assignments,
                    namespace='pystencils', field_swaps=(), staggered=False, varying_parameters=(),
                    inner_outer_split=False, ghost_layers_to_include=0,
-                   target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None,
+                   target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None,
                    **create_kernel_params):
     """Generates a waLBerla sweep from a pystencils representation.
 
@@ -59,6 +59,7 @@ def generate_sweep(generation_context, class_name, assignments,
         data_type: default datatype for the kernel creation. Default is double
         cpu_openmp: if loops should use openMP or not.
         cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set.
+        max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`
         **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
     """
     if staggered:
@@ -83,13 +84,13 @@ def generate_sweep(generation_context, class_name, assignments,
                              field_swaps=field_swaps, varying_parameters=varying_parameters,
                              inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include,
                              cpu_vectorize_info=config.cpu_vectorize_info,
-                             cpu_openmp=config.cpu_openmp)
+                             cpu_openmp=config.cpu_openmp, max_threads=max_threads)
 
 
 def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None,
                              namespace='pystencils', field_swaps=(), varying_parameters=(),
                              inner_outer_split=False, ghost_layers_to_include=0,
-                             cpu_vectorize_info=None, cpu_openmp=False):
+                             cpu_vectorize_info=None, cpu_openmp=False, max_threads=None):
     """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple
     pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`.
 
@@ -107,6 +108,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
         ghost_layers_to_include: see documentation of `generate_sweep`
         cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels
         cpu_openmp: Whether or not CPU kernels use OpenMP parallelization
+        max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`
     """
     def to_name(f):
         return f.name if isinstance(f, Field) else f
@@ -144,7 +146,8 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
         'interface_spec': interface_spec,
         'generate_functor': True,
         'cpu_vectorize_info': cpu_vectorize_info,
-        'cpu_openmp': cpu_openmp
+        'cpu_openmp': cpu_openmp,
+        'max_threads': max_threads
     }
     header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
     source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
index c615243a7a2c17b90d7eb3cd924ecdcfeb10a5a7..c728bcf46a13bf114de2f0ce05c12abc7edd01df 100644
--- a/python/pystencils_walberla/jinja_filters.py
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -43,6 +43,16 @@ delete_loop = """
     }}
 """
 
+# The target reaches the jinja filters as a string, because it is not easy to
+# work with the enum type inside the template files.
+def translate_target(target):
+    if isinstance(target, Target):
+        return target
+    elif isinstance(target, str):
+        return Target[target.upper()]
+    else:
+        raise ValueError(f"The type of the target {type(target)} is not supported")
+
 
 def make_field_type(dtype, f_size, is_gpu):
     if is_gpu:
@@ -82,6 +92,7 @@ def get_field_stride(param):
 
 def generate_declaration(kernel_info, target=Target.CPU):
     """Generates the declaration of the kernel function"""
+    target = translate_target(target)
     ast = kernel_info.ast
     result = generate_c(ast, signature_only=True, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + ";"
     result = "namespace internal_%s {\n%s\n}" % (ast.function_name, result,)
@@ -90,6 +101,7 @@ def generate_declaration(kernel_info, target=Target.CPU):
 
 def generate_definition(kernel_info, target=Target.CPU):
     """Generates the definition (i.e. implementation) of the kernel function"""
+    target = translate_target(target)
     ast = kernel_info.ast
     result = generate_c(ast, dialect=Backend.CUDA if target == Target.GPU else Backend.C)
     result = "namespace internal_%s {\nstatic %s\n}" % (ast.function_name, result)
@@ -97,6 +109,7 @@ def generate_definition(kernel_info, target=Target.CPU):
 
 
 def generate_declarations(kernel_family, target=Target.CPU):
+    target = translate_target(target)
     declarations = []
     for ast in kernel_family.all_asts:
         code = generate_c(ast, signature_only=True, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + ";"
@@ -105,10 +118,15 @@ def generate_declarations(kernel_family, target=Target.CPU):
     return "\n".join(declarations)
 
 
-def generate_definitions(kernel_family, target=Target.CPU):
+def generate_definitions(kernel_family, target=Target.CPU, max_threads=None):
+    target = translate_target(target)
     definitions = []
     for ast in kernel_family.all_asts:
         code = generate_c(ast, dialect=Backend.CUDA if target == Target.GPU else Backend.C)
+        if max_threads is not None and target == Target.GPU:
+            assert isinstance(max_threads, int), "maximal number of threads should be an integer value"
+            index = code.find('FUNC_PREFIX') + len("FUNC_PREFIX ")
+            code = code[:index] + f'__launch_bounds__({max_threads}) ' + code[index:]
         code = "namespace internal_%s {\nstatic %s\n}\n" % (ast.function_name, code)
         definitions.append(code)
     return "\n".join(definitions)
@@ -177,10 +195,12 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno
     normal_fields = {f for f in field_parameters if f.name not in kernel_info.temporary_fields}
     temporary_fields = {f for f in field_parameters if f.name in kernel_info.temporary_fields}
 
+    target = translate_target(ctx['target'])
+
     args = {
         'declaration_only': declarations_only,
         'no_declaration': no_declarations,
-        'is_gpu': ctx['target'] == 'gpu',
+        'is_gpu': target == Target.GPU,
     }
     result = "\n".join(
         field_extraction_code(field=field, is_temporary=False, update_member=update_member, **args) for field in
@@ -389,7 +409,8 @@ def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=Fals
 
     params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields)
     params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters)
-    is_gpu = ctx['target'] == 'gpu'
+    target = translate_target(ctx['target'])
+    is_gpu = target == Target.GPU
 
     result = []
     for param in kernel_info.parameters:
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
index d70096e97faa2a9ff0da26fbb996b5f5b5764ac1..7b095e32877f295f8f39f74501a7ecadcd6c02e8 100644
--- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
@@ -54,7 +54,7 @@ namespace walberla {
 namespace {{namespace}} {
 
 
-{{kernel|generate_definitions(target)}}
+{{kernel|generate_definitions(target, max_threads)}}
 
 void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} )
 {