Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Markus Holzer
waLBerla
Commits
84b1bade
Commit
84b1bade
authored
Mar 07, 2022
by
Helen Schottenhamml
Browse files
Merge branch 'UpdateUniGridGPU' into 'master'
Update UniformGridGPU See merge request
walberla/walberla!517
parents
a6df42e9
b5de846f
Pipeline
#38236
failed with stages
in 240 minutes and 38 seconds
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
apps/benchmarks/UniformGridGPU/CMakeLists.txt
View file @
84b1bade
...
...
@@ -6,7 +6,11 @@ waLBerla_link_files_to_builddir( "simulation_setup" )
foreach
(
streaming_pattern pull push aa esotwist
)
foreach
(
stencil d3q19 d3q27
)
foreach
(
collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax
)
foreach
(
collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky
)
# KBC methods only for D2Q9 and D3Q27 defined
if
(
${
collision_setup
}
STREQUAL
"entropic"
AND
${
stencil
}
STREQUAL
"d3q19"
)
continue
()
endif
(
${
collision_setup
}
STREQUAL
"entropic"
AND
${
stencil
}
STREQUAL
"d3q19"
)
set
(
config
${
stencil
}
_
${
streaming_pattern
}
_
${
collision_setup
}
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_
${
config
}
FILE UniformGridGPU.py
...
...
@@ -36,4 +40,4 @@ foreach(streaming_pattern pull push aa esotwist)
endforeach
()
endforeach
()
endforeach
()
\ No newline at end of file
endforeach
()
apps/benchmarks/UniformGridGPU/UniformGridGPU.py
View file @
84b1bade
...
...
@@ -13,6 +13,7 @@ from lbmpy.advanced_streaming.utility import streaming_patterns
from
lbmpy.boundaries
import
NoSlip
,
UBB
from
lbmpy.creationfunctions
import
create_lb_collision_rule
from
lbmpy.macroscopic_value_kernels
import
macroscopic_values_setter
from
lbmpy.moments
import
get_default_moment_set_for_stencil
from
lbmpy.updatekernels
import
create_stream_only_kernel
from
lbmpy.fieldaccess
import
*
...
...
@@ -22,6 +23,7 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info
omega
=
sp
.
symbols
(
"omega"
)
omega_free
=
sp
.
Symbol
(
"omega_free"
)
compile_time_block_size
=
False
max_threads
=
None
if
compile_time_block_size
:
sweep_block_size
=
(
128
,
1
,
1
)
...
...
@@ -41,34 +43,48 @@ options_dict = {
'trt'
:
{
'method'
:
Method
.
TRT
,
'relaxation_rate'
:
omega
,
'compressible'
:
False
,
},
'mrt'
:
{
'method'
:
Method
.
MRT
,
'relaxation_rates'
:
[
omega
,
1
,
1
,
1
,
1
,
1
,
1
],
'compressible'
:
False
,
},
'mrt-overrelax'
:
{
'method'
:
Method
.
MRT
,
'relaxation_rates'
:
[
omega
]
+
[
1
+
x
*
1e-2
for
x
in
range
(
1
,
11
)],
'compressible'
:
False
,
},
'central'
:
{
'method'
:
Method
.
CENTRAL_MOMENT
,
'relaxation_rate'
:
omega
,
'compressible'
:
True
,
},
'central-overrelax'
:
{
'method'
:
Method
.
CENTRAL_MOMENT
,
'relaxation_rates'
:
[
omega
]
+
[
1
+
x
*
1e-2
for
x
in
range
(
1
,
11
)],
'compressible'
:
True
,
},
'cumulant'
:
{
'method'
:
Method
.
CUMULANT
,
'method'
:
Method
.
MONOMIAL_
CUMULANT
,
'relaxation_rate'
:
omega
,
'compressible'
:
True
,
},
'cumulant-overrelax'
:
{
'method'
:
Method
.
CUMULANT
,
'relaxation_rates'
:
[
omega
]
+
[
1
+
x
*
1e-2
for
x
in
range
(
1
,
1
1
)],
'method'
:
Method
.
MONOMIAL_
CUMULANT
,
'relaxation_rates'
:
[
omega
]
+
[
1
+
x
*
1e-2
for
x
in
range
(
1
,
1
8
)],
'compressible'
:
True
,
},
'entropic'
:
{
'method'
:
Method
.
M
RT
,
'method'
:
Method
.
T
RT
_KBC_N4
,
'compressible'
:
True
,
'relaxation_rates'
:
[
omega
,
omega
]
+
[
omega_free
]
*
6
,
'relaxation_rates'
:
[
omega
,
omega_free
],
'entropic'
:
True
,
'entropic_newton_iterations'
:
False
},
'smagorinsky'
:
{
'method'
:
Method
.
SRT
,
'smagorinsky'
:
Tru
e
,
'smagorinsky'
:
Fals
e
,
'relaxation_rate'
:
omega
,
}
}
...
...
@@ -96,10 +112,12 @@ with CodeGeneration() as ctx:
if
len
(
config_tokens
)
>=
4
:
optimize
=
(
config_tokens
[
3
]
!=
'noopt'
)
if
stencil_str
==
"
D3Q
27"
:
if
stencil_str
==
"
d3q
27"
:
stencil
=
LBStencil
(
Stencil
.
D3Q27
)
el
se
:
el
if
stencil_str
==
"d3q19"
:
stencil
=
LBStencil
(
Stencil
.
D3Q19
)
else
:
raise
ValueError
(
"Only D3Q27 and D3Q19 stencil are supported at the moment"
)
assert
streaming_pattern
in
streaming_patterns
,
f
"Invalid streaming pattern:
{
streaming_pattern
}
"
...
...
@@ -114,6 +132,9 @@ with CodeGeneration() as ctx:
lbm_config
=
LBMConfig
(
stencil
=
stencil
,
field_name
=
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
,
**
options
)
lbm_opt
=
LBMOptimisation
(
cse_global
=
True
,
cse_pdfs
=
False
,
symbolic_field
=
pdfs
,
field_layout
=
'fzyx'
)
if
lbm_config
.
method
==
Method
.
CENTRAL_MOMENT
:
lbm_config
=
replace
(
lbm_config
,
nested_moments
=
get_default_moment_set_for_stencil
(
stencil
))
if
not
is_inplace
(
streaming_pattern
):
lbm_opt
=
replace
(
lbm_opt
,
symbolic_temporary_field
=
pdfs_tmp
)
field_swaps
=
[(
pdfs
,
pdfs_tmp
)]
...
...
@@ -145,18 +166,21 @@ with CodeGeneration() as ctx:
generate_alternating_lbm_sweep
(
ctx
,
'UniformGridGPU_LbKernel'
,
collision_rule
,
lbm_config
=
lbm_config
,
lbm_optimisation
=
lbm_opt
,
target
=
ps
.
Target
.
GPU
,
inner_outer_split
=
True
,
varying_parameters
=
vp
,
field_swaps
=
field_swaps
)
gpu_indexing_params
=
gpu_indexing_params
,
inner_outer_split
=
True
,
varying_parameters
=
vp
,
field_swaps
=
field_swaps
,
max_threads
=
max_threads
)
# getter & setter
setter_assignments
=
macroscopic_values_setter
(
lb_method
,
density
=
1.0
,
velocity
=
velocity_field
.
center_vector
,
pdfs
=
pdfs
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
Timestep
.
EVEN
)
generate_sweep
(
ctx
,
'UniformGridGPU_MacroSetter'
,
setter_assignments
,
target
=
ps
.
Target
.
GPU
)
generate_sweep
(
ctx
,
'UniformGridGPU_MacroSetter'
,
setter_assignments
,
target
=
ps
.
Target
.
GPU
,
max_threads
=
max_threads
)
# Stream only kernel
generate_sweep
(
ctx
,
'UniformGridGPU_StreamOnlyKernel'
,
stream_only_kernel
,
field_swaps
=
field_swaps_stream_only
,
gpu_indexing_params
=
gpu_indexing_params
,
varying_parameters
=
vp
,
target
=
ps
.
Target
.
GPU
)
gpu_indexing_params
=
gpu_indexing_params
,
varying_parameters
=
vp
,
target
=
ps
.
Target
.
GPU
,
max_threads
=
max_threads
)
# Boundaries
noslip
=
NoSlip
()
...
...
apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
View file @
84b1bade
...
...
@@ -18,8 +18,8 @@ from math import prod
# Number of time steps run for a workload of 128^3 per GPU
# if double as many cells are on the GPU, half as many time steps are run etc.
# increase this to get more reliable measurements
TIME_STEPS_FOR_128_BLOCK
=
5
00
DB_FILE
=
"gpu_benchmark.sqlite3"
TIME_STEPS_FOR_128_BLOCK
=
10
00
DB_FILE
=
os
.
environ
.
get
(
'DB_FILE'
,
"gpu_benchmark.sqlite3"
)
BASE_CONFIG
=
{
'DomainSetup'
:
{
...
...
@@ -129,6 +129,7 @@ class Scenario:
num_tries
=
4
# check multiple times e.g. may fail when multiple benchmark processes are running
table_name
=
f
"runs_
{
data
[
'stencil'
]
}
_
{
data
[
'streamingPattern'
]
}
_
{
data
[
'collisionSetup'
]
}
_
{
prod
(
self
.
blocks
)
}
"
table_name
=
table_name
.
replace
(
"-"
,
"_"
)
for
num_try
in
range
(
num_tries
):
try
:
checkAndUpdateSchema
(
result
,
table_name
,
DB_FILE
)
...
...
@@ -193,7 +194,7 @@ def single_gpu_benchmark():
additional_info
[
'gpu_type'
]
=
gpu_type
scenarios
=
wlb
.
ScenarioManager
()
block_sizes
=
[(
i
,
i
,
i
)
for
i
in
(
64
,
128
,
256
,
320
,
384
,
448
,
512
)]
block_sizes
=
[(
i
,
i
,
i
)
for
i
in
(
32
,
64
,
128
,
256
)]
cuda_blocks
=
[(
32
,
1
,
1
),
(
64
,
1
,
1
),
(
128
,
1
,
1
),
(
256
,
1
,
1
),
(
512
,
1
,
1
),
(
32
,
2
,
1
),
(
64
,
2
,
1
),
(
128
,
2
,
1
),
(
256
,
2
,
1
),
(
32
,
4
,
1
),
(
64
,
4
,
1
),
(
128
,
4
,
1
),
...
...
@@ -201,6 +202,9 @@ def single_gpu_benchmark():
(
32
,
16
,
1
)]
for
block_size
in
block_sizes
:
for
cuda_block_size
in
cuda_blocks
:
# cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
if
cuda_block_size
>
block_size
:
continue
if
not
cuda_block_size_ok
(
cuda_block_size
):
wlb
.
log_info_on_root
(
f
"Cuda block size
{
cuda_block_size
}
would exceed register limit. Skipping."
)
continue
...
...
@@ -210,7 +214,7 @@ def single_gpu_benchmark():
scenario
=
Scenario
(
cells_per_block
=
block_size
,
cuda_blocks
=
cuda_block_size
,
time_step_strategy
=
'kernelOnly'
,
timesteps
=
num_time_steps
(
block_size
),
timesteps
=
num_time_steps
(
block_size
,
2000
),
additional_info
=
additional_info
)
scenarios
.
add
(
scenario
)
...
...
python/lbmpy_walberla/alternating_sweeps.py
View file @
84b1bade
...
...
@@ -63,7 +63,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
namespace
=
'lbm'
,
field_swaps
=
(),
varying_parameters
=
(),
inner_outer_split
=
False
,
ghost_layers_to_include
=
0
,
target
=
Target
.
CPU
,
data_type
=
None
,
cpu_openmp
=
None
,
cpu_vectorize_info
=
None
,
cpu_openmp
=
None
,
cpu_vectorize_info
=
None
,
max_threads
=
None
,
**
kernel_parameters
):
"""Generates an Alternating lattice Boltzmann sweep class. This is in particular meant for
in-place streaming patterns, but can of course also be used with two-fields patterns (why make it
...
...
@@ -87,6 +87,7 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
data_type: default datatype for the kernel creation. Default is double
cpu_openmp: if loops should use openMP or not.
cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set.
max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`.
kernel_parameters: other parameters passed to the creation of a pystencils.CreateKernelConfig
"""
config
=
config_from_context
(
generation_context
,
target
=
target
,
data_type
=
data_type
,
cpu_openmp
=
cpu_openmp
,
...
...
@@ -120,4 +121,4 @@ def generate_alternating_lbm_sweep(generation_context, class_name, collision_rul
target
=
target
,
namespace
=
namespace
,
field_swaps
=
field_swaps
,
varying_parameters
=
varying_parameters
,
inner_outer_split
=
inner_outer_split
,
ghost_layers_to_include
=
ghost_layers_to_include
,
cpu_vectorize_info
=
vec_info
,
cpu_openmp
=
openmp
)
cpu_vectorize_info
=
vec_info
,
cpu_openmp
=
openmp
,
max_threads
=
max_threads
)
python/pystencils_walberla/codegen.py
View file @
84b1bade
...
...
@@ -29,7 +29,7 @@ __all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field
def
generate_sweep
(
generation_context
,
class_name
,
assignments
,
namespace
=
'pystencils'
,
field_swaps
=
(),
staggered
=
False
,
varying_parameters
=
(),
inner_outer_split
=
False
,
ghost_layers_to_include
=
0
,
target
=
Target
.
CPU
,
data_type
=
None
,
cpu_openmp
=
None
,
cpu_vectorize_info
=
None
,
target
=
Target
.
CPU
,
data_type
=
None
,
cpu_openmp
=
None
,
cpu_vectorize_info
=
None
,
max_threads
=
None
,
**
create_kernel_params
):
"""Generates a waLBerla sweep from a pystencils representation.
...
...
@@ -59,6 +59,7 @@ def generate_sweep(generation_context, class_name, assignments,
data_type: default datatype for the kernel creation. Default is double
cpu_openmp: if loops should use openMP or not.
cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set.
max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`
**create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
"""
if
staggered
:
...
...
@@ -83,13 +84,13 @@ def generate_sweep(generation_context, class_name, assignments,
field_swaps
=
field_swaps
,
varying_parameters
=
varying_parameters
,
inner_outer_split
=
inner_outer_split
,
ghost_layers_to_include
=
ghost_layers_to_include
,
cpu_vectorize_info
=
config
.
cpu_vectorize_info
,
cpu_openmp
=
config
.
cpu_openmp
)
cpu_openmp
=
config
.
cpu_openmp
,
max_threads
=
max_threads
)
def
generate_selective_sweep
(
generation_context
,
class_name
,
selection_tree
,
interface_mappings
=
(),
target
=
None
,
namespace
=
'pystencils'
,
field_swaps
=
(),
varying_parameters
=
(),
inner_outer_split
=
False
,
ghost_layers_to_include
=
0
,
cpu_vectorize_info
=
None
,
cpu_openmp
=
False
):
cpu_vectorize_info
=
None
,
cpu_openmp
=
False
,
max_threads
=
None
):
"""Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple
pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`.
...
...
@@ -107,6 +108,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
ghost_layers_to_include: see documentation of `generate_sweep`
cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels
cpu_openmp: Whether or not CPU kernels use OpenMP parallelization
max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__`
"""
def
to_name
(
f
):
return
f
.
name
if
isinstance
(
f
,
Field
)
else
f
...
...
@@ -144,7 +146,8 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int
'interface_spec'
:
interface_spec
,
'generate_functor'
:
True
,
'cpu_vectorize_info'
:
cpu_vectorize_info
,
'cpu_openmp'
:
cpu_openmp
'cpu_openmp'
:
cpu_openmp
,
'max_threads'
:
max_threads
}
header
=
env
.
get_template
(
"Sweep.tmpl.h"
).
render
(
**
jinja_context
)
source
=
env
.
get_template
(
"Sweep.tmpl.cpp"
).
render
(
**
jinja_context
)
...
...
python/pystencils_walberla/jinja_filters.py
View file @
84b1bade
...
...
@@ -43,6 +43,16 @@ delete_loop = """
}}
"""
# the target will enter the jinja filters as string. The reason for that is, that is not easy to work with the
# enum in the template files.
def
translate_target
(
target
):
if
isinstance
(
target
,
Target
):
return
target
elif
isinstance
(
target
,
str
):
return
Target
[
target
.
upper
()]
else
:
raise
ValueError
(
f
"The type of the target
{
type
(
target
)
}
is not supported"
)
def
make_field_type
(
dtype
,
f_size
,
is_gpu
):
if
is_gpu
:
...
...
@@ -82,6 +92,7 @@ def get_field_stride(param):
def
generate_declaration
(
kernel_info
,
target
=
Target
.
CPU
):
"""Generates the declaration of the kernel function"""
target
=
translate_target
(
target
)
ast
=
kernel_info
.
ast
result
=
generate_c
(
ast
,
signature_only
=
True
,
dialect
=
Backend
.
CUDA
if
target
==
Target
.
GPU
else
Backend
.
C
)
+
";"
result
=
"namespace internal_%s {
\n
%s
\n
}"
%
(
ast
.
function_name
,
result
,)
...
...
@@ -90,6 +101,7 @@ def generate_declaration(kernel_info, target=Target.CPU):
def
generate_definition
(
kernel_info
,
target
=
Target
.
CPU
):
"""Generates the definition (i.e. implementation) of the kernel function"""
target
=
translate_target
(
target
)
ast
=
kernel_info
.
ast
result
=
generate_c
(
ast
,
dialect
=
Backend
.
CUDA
if
target
==
Target
.
GPU
else
Backend
.
C
)
result
=
"namespace internal_%s {
\n
static %s
\n
}"
%
(
ast
.
function_name
,
result
)
...
...
@@ -97,6 +109,7 @@ def generate_definition(kernel_info, target=Target.CPU):
def
generate_declarations
(
kernel_family
,
target
=
Target
.
CPU
):
target
=
translate_target
(
target
)
declarations
=
[]
for
ast
in
kernel_family
.
all_asts
:
code
=
generate_c
(
ast
,
signature_only
=
True
,
dialect
=
Backend
.
CUDA
if
target
==
Target
.
GPU
else
Backend
.
C
)
+
";"
...
...
@@ -105,10 +118,15 @@ def generate_declarations(kernel_family, target=Target.CPU):
return
"
\n
"
.
join
(
declarations
)
def
generate_definitions
(
kernel_family
,
target
=
Target
.
CPU
):
def
generate_definitions
(
kernel_family
,
target
=
Target
.
CPU
,
max_threads
=
None
):
target
=
translate_target
(
target
)
definitions
=
[]
for
ast
in
kernel_family
.
all_asts
:
code
=
generate_c
(
ast
,
dialect
=
Backend
.
CUDA
if
target
==
Target
.
GPU
else
Backend
.
C
)
if
max_threads
is
not
None
and
target
==
Target
.
GPU
:
assert
isinstance
(
max_threads
,
int
),
"maximal number of threads should be an integer value"
index
=
code
.
find
(
'FUNC_PREFIX'
)
+
len
(
"FUNC_PREFIX "
)
code
=
code
[:
index
]
+
f
'__launch_bounds__(
{
max_threads
}
) '
+
code
[
index
:]
code
=
"namespace internal_%s {
\n
static %s
\n
}
\n
"
%
(
ast
.
function_name
,
code
)
definitions
.
append
(
code
)
return
"
\n
"
.
join
(
definitions
)
...
...
@@ -177,10 +195,12 @@ def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_igno
normal_fields
=
{
f
for
f
in
field_parameters
if
f
.
name
not
in
kernel_info
.
temporary_fields
}
temporary_fields
=
{
f
for
f
in
field_parameters
if
f
.
name
in
kernel_info
.
temporary_fields
}
target
=
translate_target
(
ctx
[
'target'
])
args
=
{
'declaration_only'
:
declarations_only
,
'no_declaration'
:
no_declarations
,
'is_gpu'
:
ctx
[
'
target
'
]
==
'gpu'
,
'is_gpu'
:
target
==
Target
.
GPU
,
}
result
=
"
\n
"
.
join
(
field_extraction_code
(
field
=
field
,
is_temporary
=
False
,
update_member
=
update_member
,
**
args
)
for
field
in
...
...
@@ -389,7 +409,8 @@ def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=Fals
params_to_skip
=
tuple
(
parameters_to_ignore
)
+
tuple
(
kernel_info
.
temporary_fields
)
params_to_skip
+=
tuple
(
e
[
1
]
for
e
in
kernel_info
.
varying_parameters
)
is_gpu
=
ctx
[
'target'
]
==
'gpu'
target
=
translate_target
(
ctx
[
'target'
])
is_gpu
=
target
==
Target
.
GPU
result
=
[]
for
param
in
kernel_info
.
parameters
:
...
...
python/pystencils_walberla/templates/Sweep.tmpl.cpp
View file @
84b1bade
...
...
@@ -54,7 +54,7 @@ namespace walberla {
namespace
{{
namespace
}}
{
{{
kernel
|
generate_definitions
(
target
)}}
{{
kernel
|
generate_definitions
(
target
,
max_threads
)}}
void
{{
class_name
}}
::
run
(
{{
-
[
"IBlock * block"
,
kernel
.
kernel_selection_parameters
,
[
"cudaStream_t stream"
]
if
target
==
'
gpu
'
else
[]]
|
type_identifier_list
-
}}
)
{
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment