diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 5725eac23a734f88dbe3ea215a460d2037cde53c..890d77f124479016436f5b0de7ba3f9eb49384fa 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -6,7 +6,11 @@ waLBerla_link_files_to_builddir( "simulation_setup" ) foreach(streaming_pattern pull push aa esotwist) foreach(stencil d3q19 d3q27) - foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax) + foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky) + # KBC methods only for D2Q9 and D3Q27 defined + if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") + continue() + endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") set(config ${stencil}_${streaming_pattern}_${collision_setup}) waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} FILE UniformGridGPU.py @@ -36,4 +40,4 @@ foreach(streaming_pattern pull push aa esotwist) endforeach () endforeach() -endforeach() \ No newline at end of file +endforeach() diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index 86f5ae3b73357f0217b24285a882d4426e58b70a..ce61f5f0e926d2e865dcf47e56e0ee7ef9379f47 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -13,6 +13,7 @@ from lbmpy.advanced_streaming.utility import streaming_patterns from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import create_lb_collision_rule from lbmpy.macroscopic_value_kernels import macroscopic_values_setter +from lbmpy.moments import get_default_moment_set_for_stencil from lbmpy.updatekernels import create_stream_only_kernel from lbmpy.fieldaccess import * @@ -22,6 +23,7 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info omega = sp.symbols("omega") omega_free = sp.Symbol("omega_free") compile_time_block_size = False +max_threads = None if compile_time_block_size: sweep_block_size = (128, 1, 1) @@ -41,34 +43,48 @@ options_dict = { 'trt': { 'method': Method.TRT, 'relaxation_rate': omega, + 'compressible': False, }, 'mrt': { 'method': Method.MRT, 'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1], + 'compressible': False, }, 'mrt-overrelax': { 'method': Method.MRT, 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], + 'compressible': False, + }, + 'central': { + 'method': Method.CENTRAL_MOMENT, + 'relaxation_rate': omega, + 'compressible': True, + }, + 'central-overrelax': { + 'method': Method.CENTRAL_MOMENT, + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], + 'compressible': True, }, 'cumulant': { - 'method': Method.CUMULANT, + 'method': Method.MONOMIAL_CUMULANT, 'relaxation_rate': omega, 'compressible': True, }, 'cumulant-overrelax': { - 'method': Method.CUMULANT, - 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], + 'method': Method.MONOMIAL_CUMULANT, + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)], 'compressible': True, }, 'entropic': { - 'method': Method.MRT, + 'method': Method.TRT_KBC_N4, 'compressible': True, - 'relaxation_rates': [omega, omega] + [omega_free] * 6, + 'relaxation_rates': [omega, omega_free], 'entropic': True, + 'entropic_newton_iterations': False }, 'smagorinsky': { 'method': Method.SRT, - 'smagorinsky': True, + 'smagorinsky': False, 'relaxation_rate': omega, } } @@ -96,10 +112,12 @@ with CodeGeneration() as ctx: if len(config_tokens) >= 4: optimize = (config_tokens[3] != 'noopt') - if stencil_str == "D3Q27": + if stencil_str == "d3q27": stencil = LBStencil(Stencil.D3Q27) - else: + elif stencil_str == "d3q19": stencil = LBStencil(Stencil.D3Q19) + else: + raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment") assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}" @@ -114,6 +132,9 @@ with CodeGeneration() as ctx: lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') + if lbm_config.method == Method.CENTRAL_MOMENT: + lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil)) + if not is_inplace(streaming_pattern): lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp) field_swaps = [(pdfs, pdfs_tmp)] @@ -145,18 +166,21 @@ with CodeGeneration() as ctx: generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, lbm_config=lbm_config, lbm_optimisation=lbm_opt, target=ps.Target.GPU, - inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps) + gpu_indexing_params=gpu_indexing_params, + inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps, + max_threads=max_threads) # getter & setter setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector, pdfs=pdfs, streaming_pattern=streaming_pattern, previous_timestep=Timestep.EVEN) - generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU) + generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU, max_threads=max_threads) # Stream only kernel generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, - gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU) + gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU, + max_threads=max_threads) # Boundaries noslip = NoSlip() diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index a1816eba1569722be2edf70c04a70078bbabf4f0..50d9bfd756b4bc5d463ac9848e084945e32da1bd 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -18,8 +18,8 @@ from math import prod # Number of time steps run for a workload of 128^3 per GPU # if double as many cells are on the GPU, half as many time steps are run etc. # increase this to get more reliable measurements -TIME_STEPS_FOR_128_BLOCK = 500 -DB_FILE = "gpu_benchmark.sqlite3" +TIME_STEPS_FOR_128_BLOCK = 1000 +DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3") BASE_CONFIG = { 'DomainSetup': { @@ -129,6 +129,7 @@ class Scenario: num_tries = 4 # check multiple times e.g. may fail when multiple benchmark processes are running table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}" + table_name = table_name.replace("-", "_") for num_try in range(num_tries): try: checkAndUpdateSchema(result, table_name, DB_FILE) @@ -193,7 +194,7 @@ def single_gpu_benchmark(): additional_info['gpu_type'] = gpu_type scenarios = wlb.ScenarioManager() - block_sizes = [(i, i, i) for i in (64, 128, 256, 320, 384, 448, 512)] + block_sizes = [(i, i, i) for i in (32, 64, 128, 256)] cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1), (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1), (32, 4, 1), (64, 4, 1), (128, 4, 1), @@ -201,6 +202,9 @@ def single_gpu_benchmark(): (32, 16, 1)] for block_size in block_sizes: for cuda_block_size in cuda_blocks: + # cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1) + if cuda_block_size > block_size: + continue if not cuda_block_size_ok(cuda_block_size): wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.") continue @@ -210,7 +214,7 @@ def single_gpu_benchmark(): scenario = Scenario(cells_per_block=block_size, cuda_blocks=cuda_block_size, time_step_strategy='kernelOnly', - timesteps=num_time_steps(block_size), + timesteps=num_time_steps(block_size, 2000), additional_info=additional_info) scenarios.add(scenario) diff --git a/python/lbmpy_walberla/alternating_sweeps.py b/python/lbmpy_walberla/alternating_sweeps.py index 587217690cb464a8e8f6dbaa2a535aa0c1835f0b..dbcc1ab54e618101658a2c2262dac946f9d99805 100644 --- a/python/lbmpy_walberla/alternating_sweeps.py +++ b/python/lbmpy_walberla/alternating_sweeps.py @@ -63,7 +63,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul namespace='lbm', field_swaps=(), varying_parameters=(), inner_outer_split=False, ghost_layers_to_include=0, target=Target.CPU, data_type=None, - cpu_openmp=None, cpu_vectorize_info=None, + cpu_openmp=None, cpu_vectorize_info=None, max_threads=None, **kernel_parameters): """Generates an Alternating lattice Boltzmann sweep class. This is in particular meant for in-place streaming patterns, but can of course also be used with two-fields patterns (why make it @@ -87,6 +87,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul data_type: default datatype for the kernel creation. Default is double cpu_openmp: if loops should use openMP or not. cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set. + max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`. kernel_parameters: other parameters passed to the creation of a pystencils.CreateKernelConfig """ config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, @@ -120,4 +121,4 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul target=target, namespace=namespace, field_swaps=field_swaps, varying_parameters=varying_parameters, inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, - cpu_vectorize_info=vec_info, cpu_openmp=openmp) + cpu_vectorize_info=vec_info, cpu_openmp=openmp, max_threads=max_threads) diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index 7e27e19cc4ca7e531286d98d181425ce78e563a5..6816ac87ec90e22bc93b5086b77f1ff8d259d760 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -29,7 +29,7 @@ __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field def generate_sweep(generation_context, class_name, assignments, namespace='pystencils', field_swaps=(), staggered=False, varying_parameters=(), inner_outer_split=False, ghost_layers_to_include=0, - target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, + target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None, **create_kernel_params): """Generates a waLBerla sweep from a pystencils representation. @@ -59,6 +59,7 @@ def generate_sweep(generation_context, class_name, assignments, data_type: default datatype for the kernel creation. Default is double cpu_openmp: if loops should use openMP or not. cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set. + max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__` **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` """ if staggered: @@ -83,13 +84,13 @@ def generate_sweep(generation_context, class_name, assignments, field_swaps=field_swaps, varying_parameters=varying_parameters, inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, cpu_vectorize_info=config.cpu_vectorize_info, - cpu_openmp=config.cpu_openmp) + cpu_openmp=config.cpu_openmp, max_threads=max_threads) def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None, namespace='pystencils', field_swaps=(), varying_parameters=(), inner_outer_split=False, ghost_layers_to_include=0, - cpu_vectorize_info=None, cpu_openmp=False): + cpu_vectorize_info=None, cpu_openmp=False, max_threads=None): """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`. @@ -107,6 +108,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int ghost_layers_to_include: see documentation of `generate_sweep` cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels cpu_openmp: Whether or not CPU kernels use OpenMP parallelization + max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__` """ def to_name(f): return f.name if isinstance(f, Field) else f @@ -144,7 +146,8 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int 'interface_spec': interface_spec, 'generate_functor': True, 'cpu_vectorize_info': cpu_vectorize_info, - 'cpu_openmp': cpu_openmp + 'cpu_openmp': cpu_openmp, + 'max_threads': max_threads } header = env.get_template("Sweep.tmpl.h").render(**jinja_context) source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py index c615243a7a2c17b90d7eb3cd924ecdcfeb10a5a7..c728bcf46a13bf114de2f0ce05c12abc7edd01df 100644 --- a/python/pystencils_walberla/jinja_filters.py +++ b/python/pystencils_walberla/jinja_filters.py @@ -43,6 +43,16 @@ delete_loop = """ }} """ +# the target will enter the jinja filters as string. The reason for that is, that is not easy to work with the +# enum in the template files. +def translate_target(target): + if isinstance(target, Target): + return target + elif isinstance(target, str): + return Target[target.upper()] + else: + raise ValueError(f"The type of the target {type(target)} is not supported") + def make_field_type(dtype, f_size, is_gpu): if is_gpu: @@ -82,6 +92,7 @@ def get_field_stride(param): def generate_declaration(kernel_info, target=Target.CPU): """Generates the declaration of the kernel function""" + target = translate_target(target) ast = kernel_info.ast result = generate_c(ast, signature_only=True, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + ";" result = "namespace internal_%s {\n%s\n}" % (ast.function_name, result,) @@ -90,6 +101,7 @@ def generate_declaration(kernel_info, target=Target.CPU): def generate_definition(kernel_info, target=Target.CPU): """Generates the definition (i.e. implementation) of the kernel function""" + target = translate_target(target) ast = kernel_info.ast result = generate_c(ast, dialect=Backend.CUDA if target == Target.GPU else Backend.C) result = "namespace internal_%s {\nstatic %s\n}" % (ast.function_name, result) @@ -97,6 +109,7 @@ def generate_definition(kernel_info, target=Target.CPU): def generate_declarations(kernel_family, target=Target.CPU): + target = translate_target(target) declarations = [] for ast in kernel_family.all_asts: code = generate_c(ast, signature_only=True, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + ";" @@ -105,10 +118,15 @@ def generate_declarations(kernel_family, target=Target.CPU): return "\n".join(declarations) -def generate_definitions(kernel_family, target=Target.CPU): +def generate_definitions(kernel_family, target=Target.CPU, max_threads=None): + target = translate_target(target) definitions = [] for ast in kernel_family.all_asts: code = generate_c(ast, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + if max_threads is not None and target == Target.GPU: + assert isinstance(max_threads, int), "maximal number of threads should be an integer value" + index = code.find('FUNC_PREFIX') + len("FUNC_PREFIX ") + code = code[:index] + f'__launch_bounds__({max_threads}) ' + code[index:] code = "namespace internal_%s {\nstatic %s\n}\n" % (ast.function_name, code) definitions.append(code) return "\n".join(definitions) @@ -177,10 +195,12 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno normal_fields = {f for f in field_parameters if f.name not in kernel_info.temporary_fields} temporary_fields = {f for f in field_parameters if f.name in kernel_info.temporary_fields} + target = translate_target(ctx['target']) + args = { 'declaration_only': declarations_only, 'no_declaration': no_declarations, - 'is_gpu': ctx['target'] == 'gpu', + 'is_gpu': target == Target.GPU, } result = "\n".join( field_extraction_code(field=field, is_temporary=False, update_member=update_member, **args) for field in @@ -389,7 +409,8 @@ def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=Fals params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields) params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters) - is_gpu = ctx['target'] == 'gpu' + target = translate_target(ctx['target']) + is_gpu = target == Target.GPU result = [] for param in kernel_info.parameters: diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp index d70096e97faa2a9ff0da26fbb996b5f5b5764ac1..7b095e32877f295f8f39f74501a7ecadcd6c02e8 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp +++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp @@ -54,7 +54,7 @@ namespace walberla { namespace {{namespace}} { -{{kernel|generate_definitions(target)}} +{{kernel|generate_definitions(target, max_threads)}} void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} ) {