Commit 84b1bade authored by Helen Schottenhamml
Browse files

Merge branch 'UpdateUniGridGPU' into 'master'

Update UniformGridGPU

See merge request walberla/walberla!517
parents a6df42e9 b5de846f
......@@ -6,7 +6,11 @@ waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
# KBC methods are only defined for the D2Q9 and D3Q27 stencils
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py
......@@ -36,4 +40,4 @@ foreach(streaming_pattern pull push aa esotwist)
endforeach ()
endforeach()
endforeach()
\ No newline at end of file
endforeach()
......@@ -13,6 +13,7 @@ from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.moments import get_default_moment_set_for_stencil
from lbmpy.updatekernels import create_stream_only_kernel
from lbmpy.fieldaccess import *
......@@ -22,6 +23,7 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info
omega = sp.symbols("omega")
omega_free = sp.Symbol("omega_free")
compile_time_block_size = False
max_threads = None
if compile_time_block_size:
sweep_block_size = (128, 1, 1)
......@@ -41,34 +43,48 @@ options_dict = {
'trt': {
'method': Method.TRT,
'relaxation_rate': omega,
'compressible': False,
},
'mrt': {
'method': Method.MRT,
'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'compressible': False,
},
'mrt-overrelax': {
'method': Method.MRT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': False,
},
'central': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rate': omega,
'compressible': True,
},
'central-overrelax': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
},
'cumulant': {
'method': Method.CUMULANT,
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rate': omega,
'compressible': True,
},
'cumulant-overrelax': {
'method': Method.CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'entropic': {
'method': Method.MRT,
'method': Method.TRT_KBC_N4,
'compressible': True,
'relaxation_rates': [omega, omega] + [omega_free] * 6,
'relaxation_rates': [omega, omega_free],
'entropic': True,
'entropic_newton_iterations': False
},
'smagorinsky': {
'method': Method.SRT,
'smagorinsky': True,
'smagorinsky': False,
'relaxation_rate': omega,
}
}
......@@ -96,10 +112,12 @@ with CodeGeneration() as ctx:
if len(config_tokens) >= 4:
optimize = (config_tokens[3] != 'noopt')
if stencil_str == "D3Q27":
if stencil_str == "d3q27":
stencil = LBStencil(Stencil.D3Q27)
else:
elif stencil_str == "d3q19":
stencil = LBStencil(Stencil.D3Q19)
else:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
......@@ -114,6 +132,9 @@ with CodeGeneration() as ctx:
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
if lbm_config.method == Method.CENTRAL_MOMENT:
lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
field_swaps = [(pdfs, pdfs_tmp)]
......@@ -145,18 +166,21 @@ with CodeGeneration() as ctx:
generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, lbm_config=lbm_config,
lbm_optimisation=lbm_opt, target=ps.Target.GPU,
inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps)
gpu_indexing_params=gpu_indexing_params,
inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps,
max_threads=max_threads)
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target=ps.Target.GPU, max_threads=max_threads)
# Stream only kernel
generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU)
gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target=ps.Target.GPU,
max_threads=max_threads)
# Boundaries
noslip = NoSlip()
......
......@@ -18,8 +18,8 @@ from math import prod
# Number of time steps run for a workload of 128^3 per GPU
# if double as many cells are on the GPU, half as many time steps are run etc.
# increase this to get more reliable measurements
TIME_STEPS_FOR_128_BLOCK = 500
DB_FILE = "gpu_benchmark.sqlite3"
TIME_STEPS_FOR_128_BLOCK = 1000
DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
BASE_CONFIG = {
'DomainSetup': {
......@@ -129,6 +129,7 @@ class Scenario:
num_tries = 4
# check multiple times e.g. may fail when multiple benchmark processes are running
table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
......@@ -193,7 +194,7 @@ def single_gpu_benchmark():
additional_info['gpu_type'] = gpu_type
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (64, 128, 256, 320, 384, 448, 512)]
block_sizes = [(i, i, i) for i in (32, 64, 128, 256)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
......@@ -201,6 +202,9 @@ def single_gpu_benchmark():
(32, 16, 1)]
for block_size in block_sizes:
for cuda_block_size in cuda_blocks:
# cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
if cuda_block_size > block_size:
continue
if not cuda_block_size_ok(cuda_block_size):
wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
continue
......@@ -210,7 +214,7 @@ def single_gpu_benchmark():
scenario = Scenario(cells_per_block=block_size,
cuda_blocks=cuda_block_size,
time_step_strategy='kernelOnly',
timesteps=num_time_steps(block_size),
timesteps=num_time_steps(block_size, 2000),
additional_info=additional_info)
scenarios.add(scenario)
......
......@@ -63,7 +63,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
namespace='lbm', field_swaps=(), varying_parameters=(),
inner_outer_split=False, ghost_layers_to_include=0,
target=Target.CPU, data_type=None,
cpu_openmp=None, cpu_vectorize_info=None,
cpu_openmp=None, cpu_vectorize_info=None, max_threads=None,
**kernel_parameters):
"""Generates an Alternating lattice Boltzmann sweep class. This is in particular meant for
in-place streaming patterns, but can of course also be used with two-fields patterns (why make it
......@@ -87,6 +87,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
data_type: default datatype for the kernel creation. Default is double
cpu_openmp: if loops should use openMP or not.
cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set.
max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`.
kernel_parameters: other parameters passed to the creation of a pystencils.CreateKernelConfig
"""
config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp,
......@@ -120,4 +121,4 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
target=target, namespace=namespace,
field_swaps=field_swaps, varying_parameters=varying_parameters,
inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include,
cpu_vectorize_info=vec_info, cpu_openmp=openmp)
cpu_vectorize_info=vec_info, cpu_openmp=openmp, max_threads=max_threads)
......@@ -29,7 +29,7 @@ __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field
def generate_sweep(generation_context, class_name, assignments,
namespace='pystencils', field_swaps=(), staggered=False, varying_parameters=(),
inner_outer_split=False, ghost_layers_to_include=0,
target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None,
target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None,
**create_kernel_params):
"""Generates a waLBerla sweep from a pystencils representation.
......@@ -59,6 +59,7 @@ def generate_sweep(generation_context, class_name, assignments,
data_type: default datatype for the kernel creation. Default is double
cpu_openmp: if loops should use openMP or not.
cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set.
max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`
**create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
"""
if staggered:
......@@ -83,13 +84,13 @@ def generate_sweep(generation_context, class_name, assignments,
field_swaps=field_swaps, varying_parameters=varying_parameters,
inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include,
cpu_vectorize_info=config.cpu_vectorize_info,
cpu_openmp=config.cpu_openmp)
cpu_openmp=config.cpu_openmp, max_threads=max_threads)
def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None,
namespace='pystencils', field_swaps=(), varying_parameters=(),
inner_outer_split=False, ghost_layers_to_include=0,
cpu_vectorize_info=None, cpu_openmp=False):
cpu_vectorize_info=None, cpu_openmp=False, max_threads=None):
"""Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple
pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`.
......@@ -107,6 +108,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
ghost_layers_to_include: see documentation of `generate_sweep`
cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels
cpu_openmp: Whether or not CPU kernels use OpenMP parallelization
max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`
"""
def to_name(f):
return f.name if isinstance(f, Field) else f
......@@ -144,7 +146,8 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
'interface_spec': interface_spec,
'generate_functor': True,
'cpu_vectorize_info': cpu_vectorize_info,
'cpu_openmp': cpu_openmp
'cpu_openmp': cpu_openmp,
'max_threads': max_threads
}
header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
......
......@@ -43,6 +43,16 @@ delete_loop = """
}}
"""
# the target will enter the jinja filters as string. The reason for that is, that is not easy to work with the
# enum in the template files.
def translate_target(target):
if isinstance(target, Target):
return target
elif isinstance(target, str):
return Target[target.upper()]
else:
raise ValueError(f"The type of the target {type(target)} is not supported")
def make_field_type(dtype, f_size, is_gpu):
if is_gpu:
......@@ -82,6 +92,7 @@ def get_field_stride(param):
def generate_declaration(kernel_info, target=Target.CPU):
"""Generates the declaration of the kernel function"""
target = translate_target(target)
ast = kernel_info.ast
result = generate_c(ast, signature_only=True, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + ";"
result = "namespace internal_%s {\n%s\n}" % (ast.function_name, result,)
......@@ -90,6 +101,7 @@ def generate_declaration(kernel_info, target=Target.CPU):
def generate_definition(kernel_info, target=Target.CPU):
"""Generates the definition (i.e. implementation) of the kernel function"""
target = translate_target(target)
ast = kernel_info.ast
result = generate_c(ast, dialect=Backend.CUDA if target == Target.GPU else Backend.C)
result = "namespace internal_%s {\nstatic %s\n}" % (ast.function_name, result)
......@@ -97,6 +109,7 @@ def generate_definition(kernel_info, target=Target.CPU):
def generate_declarations(kernel_family, target=Target.CPU):
target = translate_target(target)
declarations = []
for ast in kernel_family.all_asts:
code = generate_c(ast, signature_only=True, dialect=Backend.CUDA if target == Target.GPU else Backend.C) + ";"
......@@ -105,10 +118,15 @@ def generate_declarations(kernel_family, target=Target.CPU):
return "\n".join(declarations)
def generate_definitions(kernel_family, target=Target.CPU, max_threads=None):
    """Generate the kernel function definitions for every AST in *kernel_family*.

    Args:
        kernel_family: object exposing ``all_asts``, the pystencils kernel ASTs
            to emit code for.
        target: ``Target`` enum member or its string name; selects the C or
            CUDA backend dialect.
        max_threads: only relevant for GPU kernels. If given, it is emitted as
            the argument of ``__launch_bounds__`` on each kernel.

    Returns:
        All definitions, each wrapped in its ``internal_<name>`` namespace,
        joined into a single string.

    Raises:
        TypeError: if *max_threads* is given for a GPU target but is not an int.
        ValueError: if the generated code lacks the ``FUNC_PREFIX`` marker
            needed to place ``__launch_bounds__``.
    """
    target = translate_target(target)
    definitions = []
    for ast in kernel_family.all_asts:
        code = generate_c(ast, dialect=Backend.CUDA if target == Target.GPU else Backend.C)
        if max_threads is not None and target == Target.GPU:
            # Validate with a real exception: an `assert` here would be
            # silently stripped under `python -O`.
            if not isinstance(max_threads, int):
                raise TypeError("maximal number of threads should be an integer value")
            # Inject __launch_bounds__ directly after the FUNC_PREFIX macro so
            # the compiler can cap the threads per block for this kernel.
            # Guard against find() returning -1, which would otherwise splice
            # the attribute into the middle of the code.
            marker = 'FUNC_PREFIX '
            index = code.find(marker)
            if index < 0:
                raise ValueError("generated kernel code is missing the FUNC_PREFIX marker")
            index += len(marker)
            code = code[:index] + f'__launch_bounds__({max_threads}) ' + code[index:]
        code = "namespace internal_%s {\nstatic %s\n}\n" % (ast.function_name, code)
        definitions.append(code)
    return "\n".join(definitions)
......@@ -177,10 +195,12 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno
normal_fields = {f for f in field_parameters if f.name not in kernel_info.temporary_fields}
temporary_fields = {f for f in field_parameters if f.name in kernel_info.temporary_fields}
target = translate_target(ctx['target'])
args = {
'declaration_only': declarations_only,
'no_declaration': no_declarations,
'is_gpu': ctx['target'] == 'gpu',
'is_gpu': target == Target.GPU,
}
result = "\n".join(
field_extraction_code(field=field, is_temporary=False, update_member=update_member, **args) for field in
......@@ -389,7 +409,8 @@ def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=Fals
params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields)
params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters)
is_gpu = ctx['target'] == 'gpu'
target = translate_target(ctx['target'])
is_gpu = target == Target.GPU
result = []
for param in kernel_info.parameters:
......
......@@ -54,7 +54,7 @@ namespace walberla {
namespace {{namespace}} {
{{kernel|generate_definitions(target)}}
{{kernel|generate_definitions(target, max_threads)}}
void {{class_name}}::run( {{- ["IBlock * block", kernel.kernel_selection_parameters, ["cudaStream_t stream"] if target == 'gpu' else []] | type_identifier_list -}} )
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment