Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Jan Hönig
waLBerla
Commits
d5c5fac4
Commit
d5c5fac4
authored
Jun 29, 2021
by
Frederik Hennig
Committed by
Markus Holzer
Jun 29, 2021
Browse files
Revamp LB GPU Benchmark App for In-Place Streaming
parent
803a82cb
Changes
21
Expand all
Hide whitespace changes
Inline
Side-by-side
apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
View file @
d5c5fac4
from
pystencils.field
import
fields
from
lbmpy.advanced_streaming.utility
import
get_timesteps
,
Timestep
from
lbmpy.advanced_streaming.utility
import
get_timesteps
from
lbmpy.macroscopic_value_kernels
import
macroscopic_values_setter
from
lbmpy.stencils
import
get_stencil
from
lbmpy.creationfunctions
import
create_lb_collision_rule
,
create_lb_method
,
create_lb_update_rule
from
lbmpy.creationfunctions
import
create_lb_collision_rule
from
lbmpy.boundaries
import
NoSlip
,
UBB
,
ExtrapolationOutflow
from
pystencils_walberla
import
CodeGeneration
,
generate_sweep
,
generate_info_header
...
...
apps/benchmarks/UniformGridGPU/CMakeLists.txt
View file @
d5c5fac4
...
...
@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir
(
"simulation_setup"
)
foreach
(
config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4
entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt
cumulant cumulant_d3q27
srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_
${
config
}
FILE UniformGridGPU.py
CODEGEN_CFG
${
config
}
OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
UniformGridGPU_Defines.h
)
waLBerla_add_executable
(
NAME UniformGridBenchmarkGPU_
${
config
}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_
${
config
}
)
set_target_properties
(
UniformGridBenchmarkGPU_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
endforeach
()
foreach
(
config srt trt mrt smagorinsky entropic
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_AA_
${
config
}
FILE UniformGridGPU_AA.py
CODEGEN_CFG
${
config
}
OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
UniformGridGPU_AA_Defines.h
)
waLBerla_add_executable
(
NAME UniformGridBenchmarkGPU_AA_
${
config
}
FILES UniformGridGPU_AA.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_
${
config
}
)
set_target_properties
(
UniformGridBenchmarkGPU_AA_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
endforeach
()
foreach
(
streaming_pattern aa
)
# choose from {pull, push, aa, esotwist}
foreach
(
stencil d3q27
)
# choose from {d3q19 d3q27}
foreach
(
collision_setup srt trt mrt cumulant
)
# choose from {srt trt mrt cumulant entropic smagorinsky}
set
(
config
${
stencil
}
_
${
streaming_pattern
}
_
${
collision_setup
}
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_
${
config
}
FILE UniformGridGPU.py
CODEGEN_CFG
${
config
}
OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_InfoHeader.h
)
waLBerla_add_executable
(
NAME UniformGridGPU_
${
config
}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_
${
config
}
)
set_target_properties
(
UniformGridGPU_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
endforeach
()
endforeach
()
endforeach
()
\ No newline at end of file
apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
View file @
d5c5fac4
This diff is collapsed.
Click to expand it.
apps/benchmarks/UniformGridGPU/UniformGridGPU.py
View file @
d5c5fac4
import
sympy
as
sp
import
numpy
as
np
import
pystencils
as
ps
from
lbmpy.creationfunctions
import
create_lb_method
,
create_lb_update_rule
,
create_lb_collision_rule
from
lbmpy.boundaries
import
NoSlip
,
UBB
from
lbmpy.fieldaccess
import
StreamPullTwoFieldsAccessor
from
pystencils_walberla
import
generate_pack_info_from_kernel
from
lbmpy_walberla
import
generate_lattice_model
,
generate_boundary
from
pystencils_walberla
import
CodeGeneration
,
generate_sweep
from
pystencils.data_types
import
TypedSymbol
from
pystencils.fast_approximation
import
insert_fast_sqrts
,
insert_fast_divisions
from
lbmpy.macroscopic_value_kernels
import
macroscopic_values_getter
,
macroscopic_values_setter
from
lbmpy.advanced_streaming
import
Timestep
,
is_inplace
from
lbmpy.advanced_streaming.utility
import
streaming_patterns
from
lbmpy.boundaries
import
NoSlip
,
UBB
from
lbmpy.creationfunctions
import
create_lb_collision_rule
from
lbmpy.macroscopic_value_kernels
import
macroscopic_values_setter
from
lbmpy.stencils
import
get_stencil
from
pystencils_walberla
import
CodeGeneration
,
generate_info_header
,
generate_sweep
from
lbmpy_walberla
import
generate_alternating_lbm_sweep
,
generate_lb_pack_info
,
generate_alternating_lbm_boundary
omega
=
sp
.
symbols
(
"omega"
)
omega_free
=
sp
.
Symbol
(
"omega_free"
)
omega_fill
=
sp
.
symbols
(
"omega_:10"
)
compile_time_block_size
=
False
if
compile_time_block_size
:
...
...
@@ -21,156 +24,158 @@ if compile_time_block_size:
else
:
sweep_block_size
=
(
TypedSymbol
(
"cudaBlockSize0"
,
np
.
int32
),
TypedSymbol
(
"cudaBlockSize1"
,
np
.
int32
),
1
)
TypedSymbol
(
"cudaBlockSize2"
,
np
.
int32
)
)
sweep
_params
=
{
'block_size'
:
sweep_block_size
}
gpu_indexing
_params
=
{
'block_size'
:
sweep_block_size
}
options_dict
=
{
'srt'
:
{
'method'
:
'srt'
,
'stencil'
:
'D3Q19'
,
'relaxation_rate'
:
omega
,
'compressible'
:
False
,
},
'trt'
:
{
'method'
:
'trt'
,
'stencil'
:
'D3Q19'
,
'relaxation_rate'
:
omega
,
},
'mrt'
:
{
'method'
:
'mrt'
,
'stencil'
:
'D3Q19'
,
'relaxation_rates'
:
[
omega
,
1.3
,
1.4
,
1.2
,
1.1
,
1.15
,
1.234
,
1.4235
],
'relaxation_rates'
:
[
omega
,
1
,
1
,
1
,
1
,
1
,
1
],
},
'mrt
_full
'
:
{
'mrt
-overrelax
'
:
{
'method'
:
'mrt'
,
'stencil'
:
'D3Q19'
,
'relaxation_rates'
:
[
omega_fill
[
0
],
omega
,
omega_fill
[
1
],
omega_fill
[
2
],
omega_fill
[
3
],
omega_fill
[
4
],
omega_fill
[
5
]],
'relaxation_rates'
:
[
omega
,
1.3
,
1.4
,
omega
,
1.2
,
1.1
],
},
'
entropic
'
:
{
'method'
:
'
mr
t'
,
'
stencil'
:
'D3Q19'
,
'
cumulant
'
:
{
'method'
:
'
cumulan
t'
,
'
relaxation_rate'
:
omega
,
'compressible'
:
True
,
'relaxation_rates'
:
[
omega
,
omega
,
omega_free
,
omega_free
,
omega_free
,
omega_free
],
'entropic'
:
True
,
},
'
entropic_kbc_n4
'
:
{
'method'
:
'
trt-kbc-n4
'
,
'
stencil'
:
'D3Q27'
,
'
cumulant-overrelax
'
:
{
'method'
:
'
cumulant
'
,
'
relaxation_rates'
:
[
omega
]
+
[
1
+
x
*
1e-2
for
x
in
range
(
1
,
11
)]
,
'compressible'
:
True
,
'relaxation_rates'
:
[
omega
,
omega_free
],
},
'entropic'
:
{
'method'
:
'mrt'
,
'compressible'
:
True
,
'relaxation_rates'
:
[
omega
,
omega
,
omega_free
,
omega_free
,
omega_free
],
'entropic'
:
True
,
},
'smagorinsky'
:
{
'method'
:
'srt'
,
'stencil'
:
'D3Q19'
,
'smagorinsky'
:
True
,
'relaxation_rate'
:
omega
,
},
'cumulant'
:
{
'method'
:
'cumulant'
,
'stencil'
:
'D3Q19'
,
'compressible'
:
True
,
'relaxation_rate'
:
omega
,
},
}
}
info_header
=
"""
#include "stencil/D3Q{q}.h"
\n
using Stencil_T = walberla::stencil::D3Q{q};
const char * infoStencil = "{stencil}";
const char * infoConfigName = "{configName}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
# DEFAULTS
optimize
=
True
with
CodeGeneration
()
as
ctx
:
accessor
=
StreamPullTwoFieldsAccessor
()
# accessor = StreamPushTwoFieldsAccessor()
assert
not
accessor
.
is_inplace
,
"This app does not work for inplace accessors"
config_tokens
=
ctx
.
config
.
split
(
'_'
)
assert
len
(
config_tokens
)
>=
3
stencil_str
=
config_tokens
[
0
]
streaming_pattern
=
config_tokens
[
1
]
collision_setup
=
config_tokens
[
2
]
if
len
(
config_tokens
)
>=
4
:
optimize
=
(
config_tokens
[
3
]
!=
'noopt'
)
stencil
=
get_stencil
(
stencil_str
)
assert
streaming_pattern
in
streaming_patterns
,
f
"Invalid streaming pattern:
{
streaming_pattern
}
"
options
=
options_dict
[
collision_setup
]
q
=
len
(
stencil
)
dim
=
len
(
stencil
[
0
])
assert
dim
==
3
,
"This app supports only three-dimensional stencils"
pdfs
,
pdfs_tmp
,
velocity_field
=
ps
.
fields
(
f
"pdfs(
{
q
}
), pdfs_tmp(
{
q
}
), velocity(3) : double[3D]"
,
layout
=
'fzyx'
)
common_options
=
{
'field_name'
:
'pdfs'
,
'temporary_field_name'
:
'pdfs_tmp'
,
'kernel_type'
:
accessor
,
'optimization'
:
{
'cse_global'
:
True
,
'cse_pdfs'
:
False
}
'stencil'
:
stencil
,
'field_name'
:
pdfs
.
name
,
'optimization'
:
{
'target'
:
'gpu'
,
'cse_global'
:
True
,
'cse_pdfs'
:
False
,
'symbolic_field'
:
pdfs
,
'field_layout'
:
'fzyx'
,
'gpu_indexing_params'
:
gpu_indexing_params
,
}
}
config_name
=
ctx
.
config
noopt
=
False
d3q27
=
False
if
config_name
.
endswith
(
"_noopt"
):
noopt
=
True
config_name
=
config_name
[:
-
len
(
"_noopt"
)]
if
config_name
.
endswith
(
"_d3q27"
):
d3q27
=
True
config_name
=
config_name
[:
-
len
(
"_d3q27"
)]
options
=
options_dict
[
config_name
]
options
.
update
(
common_options
)
options
=
options
.
copy
()
if
noopt
:
options
[
'optimization'
][
'cse_global'
]
=
False
options
[
'optimization'
][
'cse_pdfs'
]
=
False
if
d3q27
:
options
[
'stencil'
]
=
'D3Q27'
options
.
update
(
common_options
)
stencil_str
=
options
[
'stencil'
]
q
=
int
(
stencil_str
[
stencil_str
.
find
(
'Q'
)
+
1
:])
pdfs
,
velocity_field
=
ps
.
fields
(
"pdfs({q}), velocity(3) : double[3D]"
.
format
(
q
=
q
),
layout
=
'fzyx'
)
options
[
'optimization'
][
'symbolic_field'
]
=
pdfs
if
not
is_inplace
(
streaming_pattern
):
options
[
'optimization'
][
'symbolic_temporary_field'
]
=
pdfs_tmp
field_swaps
=
[(
pdfs
,
pdfs_tmp
)]
else
:
field_swaps
=
[]
vp
=
[
(
'double'
,
'omega_0'
),
(
'double'
,
'omega_1'
),
(
'double'
,
'omega_2'
),
(
'double'
,
'omega_3'
),
(
'double'
,
'omega_4'
),
(
'double'
,
'omega_5'
),
(
'double'
,
'omega_6'
),
(
'int32_t'
,
'cudaBlockSize0'
),
(
'int32_t'
,
'cudaBlockSize1'
),
(
'int32_t'
,
'cudaBlockSize2'
)
]
lb_method
=
create_lb_method
(
**
options
)
update_rule
=
create_lb_update_rule
(
lb_method
=
lb_method
,
**
options
)
if
not
noopt
:
update_rule
=
insert_fast_divisions
(
update_rule
)
update_rule
=
insert_fast_sqrts
(
update_rule
)
# CPU lattice model - required for macroscopic value computation, VTK output etc.
options_without_opt
=
options
.
copy
()
del
options_without_opt
[
'optimization'
]
generate_lattice_model
(
ctx
,
'UniformGridGPU_LatticeModel'
,
create_lb_collision_rule
(
lb_method
=
lb_method
,
**
options_without_opt
))
# gpu LB sweep & boundaries
generate_sweep
(
ctx
,
'UniformGridGPU_LbKernel'
,
update_rule
,
field_swaps
=
[(
'pdfs'
,
'pdfs_tmp'
)],
inner_outer_split
=
True
,
target
=
'gpu'
,
gpu_indexing_params
=
sweep_params
,
varying_parameters
=
vp
)
generate_boundary
(
ctx
,
'UniformGridGPU_NoSlip'
,
NoSlip
(),
lb_method
,
target
=
'gpu'
)
generate_boundary
(
ctx
,
'UniformGridGPU_UBB'
,
UBB
([
0.05
,
0
,
0
]),
lb_method
,
target
=
'gpu'
)
# LB Sweep
collision_rule
=
create_lb_collision_rule
(
**
options
)
if
optimize
:
collision_rule
=
insert_fast_divisions
(
collision_rule
)
collision_rule
=
insert_fast_sqrts
(
collision_rule
)
lb_method
=
collision_rule
.
method
generate_alternating_lbm_sweep
(
ctx
,
'UniformGridGPU_LbKernel'
,
collision_rule
,
streaming_pattern
,
optimization
=
options
[
'optimization'
],
inner_outer_split
=
True
,
varying_parameters
=
vp
,
field_swaps
=
field_swaps
)
# getter & setter
setter_assignments
=
macroscopic_values_setter
(
lb_method
,
velocity
=
velocity_field
.
center_vector
,
pdfs
=
pdfs
.
center_vector
,
density
=
1.0
)
getter_assignments
=
macroscopic_values_getter
(
lb_method
,
velocity
=
velocity_field
.
center_vector
,
pdfs
=
pdfs
.
center_vector
,
density
=
None
)
generate_sweep
(
ctx
,
'UniformGridGPU_MacroSetter'
,
setter_assignments
)
generate_sweep
(
ctx
,
'UniformGridGPU_MacroGetter'
,
getter_assignments
)
setter_assignments
=
macroscopic_values_setter
(
lb_method
,
density
=
1.0
,
velocity
=
velocity_field
.
center_vector
,
pdfs
=
pdfs
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
Timestep
.
EVEN
)
generate_sweep
(
ctx
,
'UniformGridGPU_MacroSetter'
,
setter_assignments
,
target
=
'gpu'
)
# Boundaries
noslip
=
NoSlip
()
ubb
=
UBB
((
0.05
,
0
,
0
))
generate_alternating_lbm_boundary
(
ctx
,
'UniformGridGPU_NoSlip'
,
noslip
,
lb_method
,
field_name
=
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
,
target
=
'gpu'
)
generate_alternating_lbm_boundary
(
ctx
,
'UniformGridGPU_UBB'
,
ubb
,
lb_method
,
field_name
=
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
,
target
=
'gpu'
)
# communication
generate_pack_info_from_kernel
(
ctx
,
'UniformGridGPU_PackInfo'
,
update_rule
,
target
=
'gpu'
)
generate_lb_pack_info
(
ctx
,
'UniformGridGPU_PackInfo'
,
stencil
,
pdfs
,
streaming_pattern
=
streaming_pattern
,
target
=
'gpu'
,
always_generate_separate_classes
=
True
)
infoHeaderParams
=
{
'stencil'
:
stencil_str
,
'
q'
:
q
,
'co
nfigName'
:
ctx
.
config
,
'
streaming_pattern'
:
streaming_pattern
,
'co
llision_setup'
:
collision_setup
,
'cse_global'
:
int
(
options
[
'optimization'
][
'cse_global'
]),
'cse_pdfs'
:
int
(
options
[
'optimization'
][
'cse_pdfs'
]),
}
ctx
.
write_file
(
"UniformGridGPU_Defines.h"
,
info_header
.
format
(
**
infoHeaderParams
))
stencil_typedefs
=
{
'Stencil_T'
:
stencil
,
'CommunicationStencil_T'
:
stencil
}
field_typedefs
=
{
'PdfField_T'
:
pdfs
,
'VelocityField_T'
:
velocity_field
}
# Info header containing correct template definitions for stencil and field
generate_info_header
(
ctx
,
'UniformGridGPU_InfoHeader'
,
stencil_typedefs
=
stencil_typedefs
,
field_typedefs
=
field_typedefs
,
additional_code
=
info_header
.
format
(
**
infoHeaderParams
))
apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
deleted
100644 → 0
View file @
803a82cb
#include
"core/Environment.h"
#include
"core/logging/Initialization.h"
#include
"python_coupling/CreateConfig.h"
#include
"python_coupling/PythonCallback.h"
#include
"python_coupling/DictWrapper.h"
#include
"blockforest/Initialization.h"
#include
"field/FlagField.h"
#include
"field/AddToStorage.h"
#include
"field/vtk/VTKWriter.h"
#include
"field/communication/PackInfo.h"
#include
"lbm/PerformanceLogger.h"
#include
"blockforest/communication/UniformBufferedScheme.h"
#include
"timeloop/all.h"
#include
"geometry/all.h"
#include
"cuda/HostFieldAllocator.h"
#include
"cuda/communication/GPUPackInfo.h"
#include
"cuda/ParallelStreams.h"
#include
"core/timing/TimingPool.h"
#include
"core/timing/RemainingTimeLogger.h"
#include
"cuda/AddGPUFieldToStorage.h"
#include
"cuda/communication/UniformGPUScheme.h"
#include
"cuda/DeviceSelectMPI.h"
#include
"domain_decomposition/SharedSweep.h"
#include
"InitShearVelocity.h"
#include
"gui/Gui.h"
#ifdef WALBERLA_ENABLE_GUI
#include
"lbm/gui/PdfFieldDisplayAdaptor.h"
#endif
#include
"UniformGridGPU_AA_PackInfoPush.h"
#include
"UniformGridGPU_AA_PackInfoPull.h"
#include
"UniformGridGPU_AA_MacroSetter.h"
#include
"UniformGridGPU_AA_MacroGetter.h"
#include
"UniformGridGPU_AA_LbKernelEven.h"
#include
"UniformGridGPU_AA_LbKernelOdd.h"
#include
"UniformGridGPU_AA_Defines.h"
#include
<cmath>
using
namespace
walberla
;
using
CommunicationStencil_T
=
Stencil_T
;
using
PdfField_T
=
GhostLayerField
<
real_t
,
Stencil_T
::
Q
>
;
using
VelocityField_T
=
GhostLayerField
<
real_t
,
3
>
;
int
main
(
int
argc
,
char
**
argv
)
{
mpi
::
Environment
env
(
argc
,
argv
);
cuda
::
selectDeviceBasedOnMpiRank
();
for
(
auto
cfg
=
python_coupling
::
configBegin
(
argc
,
argv
);
cfg
!=
python_coupling
::
configEnd
();
++
cfg
)
{
WALBERLA_MPI_WORLD_BARRIER
();
WALBERLA_CUDA_CHECK
(
cudaPeekAtLastError
()
);
auto
config
=
*
cfg
;
logging
::
configureLogging
(
config
);
auto
blocks
=
blockforest
::
createUniformBlockGridFromConfig
(
config
);
Vector3
<
uint_t
>
cellsPerBlock
=
config
->
getBlock
(
"DomainSetup"
).
getParameter
<
Vector3
<
uint_t
>
>
(
"cellsPerBlock"
);
// Reading parameters
auto
parameters
=
config
->
getOneBlock
(
"Parameters"
);
const
real_t
omega
=
parameters
.
getParameter
<
real_t
>
(
"omega"
,
real_c
(
1.4
));
const
uint_t
timesteps
=
parameters
.
getParameter
<
uint_t
>
(
"timesteps"
,
uint_c
(
50
));
// Creating fields
BlockDataID
pdfFieldCpuID
=
field
::
addToStorage
<
PdfField_T
>
(
blocks
,
"pdfs cpu"
,
real_t
(
std
::
nan
(
""
)
),
field
::
fzyx
);
BlockDataID
velFieldCpuID
=
field
::
addToStorage
<
VelocityField_T
>
(
blocks
,
"vel"
,
real_t
(
0
),
field
::
fzyx
);
WALBERLA_LOG_INFO_ON_ROOT
(
"Initializing shear flow"
);
initShearVelocity
(
blocks
,
velFieldCpuID
);
pystencils
::
UniformGridGPU_AA_MacroGetter
getterSweep
(
pdfFieldCpuID
,
velFieldCpuID
);
pystencils
::
UniformGridGPU_AA_MacroSetter
setterSweep
(
pdfFieldCpuID
,
velFieldCpuID
);
for
(
auto
&
block
:
*
blocks
)
setterSweep
(
&
block
);
BlockDataID
pdfFieldGpuID
=
cuda
::
addGPUFieldToStorage
<
PdfField_T
>
(
blocks
,
pdfFieldCpuID
,
"pdfs on GPU"
,
true
);
Vector3
<
int
>
innerOuterSplit
=
parameters
.
getParameter
<
Vector3
<
int
>
>
(
"innerOuterSplit"
,
Vector3
<
int
>
(
1
,
1
,
1
));
for
(
uint_t
i
=
0
;
i
<
3
;
++
i
)
{
if
(
int_c
(
cellsPerBlock
[
i
])
<=
innerOuterSplit
[
i
]
*
2
)
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"innerOuterSplit too large - make it smaller or increase cellsPerBlock"
);
}
}
Cell
innerOuterSplitCell
(
innerOuterSplit
[
0
],
innerOuterSplit
[
1
],
innerOuterSplit
[
2
]);
bool
cudaEnabledMPI
=
parameters
.
getParameter
<
bool
>
(
"cudaEnabledMPI"
,
false
);
Vector3
<
int32_t
>
gpuBlockSize
=
parameters
.
getParameter
<
Vector3
<
int32_t
>
>
(
"gpuBlockSize"
,
Vector3
<
int32_t
>
(
256
,
1
,
1
));
int
streamHighPriority
=
0
;
int
streamLowPriority
=
0
;
WALBERLA_CUDA_CHECK
(
cudaDeviceGetStreamPriorityRange
(
&
streamLowPriority
,
&
streamHighPriority
));
WALBERLA_CHECK
(
gpuBlockSize
[
2
]
==
1
);
using
KernelEven
=
pystencils
::
UniformGridGPU_AA_LbKernelEven
;
using
KernelOdd
=
pystencils
::
UniformGridGPU_AA_LbKernelOdd
;
using
PackInfoPull
=
pystencils
::
UniformGridGPU_AA_PackInfoPull
;
using
PackInfoPush
=
pystencils
::
UniformGridGPU_AA_PackInfoPush
;
using
cuda
::
communication
::
UniformGPUScheme
;
KernelEven
kernelEven
(
pdfFieldGpuID
,
omega
,
gpuBlockSize
[
0
],
gpuBlockSize
[
1
],
innerOuterSplitCell
);
KernelOdd
kernelOdd
(
pdfFieldGpuID
,
omega
,
gpuBlockSize
[
0
],
gpuBlockSize
[
1
],
innerOuterSplitCell
);
kernelEven
.
setOuterPriority
(
streamHighPriority
);
kernelOdd
.
setOuterPriority
(
streamHighPriority
);
auto
pullScheme
=
make_shared
<
UniformGPUScheme
<
Stencil_T
>
>
(
blocks
,
cudaEnabledMPI
);
auto
pushScheme
=
make_shared
<
UniformGPUScheme
<
Stencil_T
>
>
(
blocks
,
cudaEnabledMPI
);
pullScheme
->
addPackInfo
(
make_shared
<
PackInfoPull
>
(
pdfFieldGpuID
)
);
pushScheme
->
addPackInfo
(
make_shared
<
PackInfoPush
>
(
pdfFieldGpuID
)
);
auto
defaultStream
=
cuda
::
StreamRAII
::
newPriorityStream
(
streamLowPriority
);
auto
setupPhase
=
[
&
]()
{
for
(
auto
&
block
:
*
blocks
)
kernelEven
(
&
block
);
pullScheme
->
communicate
();
for
(
auto
&
block
:
*
blocks
)
kernelOdd
(
&
block
);
};
auto
tearDownPhase
=
[
&
]()
{
pushScheme
->
communicate
();
cuda
::
fieldCpy
<
PdfField_T
,
cuda
::
GPUField
<
real_t
>
>
(
blocks
,
pdfFieldCpuID
,
pdfFieldGpuID
);
for
(
auto
&
block
:
*
blocks
)
getterSweep
(
&
block
);
};
auto
simpleOverlapTimeStep
=
[
&
]()
{
// Even
pushScheme
->
startCommunication
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelEven
.
inner
(
&
block
,
defaultStream
);
pushScheme
->
wait
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelEven
.
outer
(
&
block
,
defaultStream
);
// Odd
pullScheme
->
startCommunication
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelOdd
.
inner
(
&
block
,
defaultStream
);
pullScheme
->
wait
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelOdd
.
outer
(
&
block
,
defaultStream
);
};
auto
normalTimeStep
=
[
&
]()
{
pushScheme
->
communicate
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelEven
(
&
block
,
defaultStream
);
pullScheme
->
communicate
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelOdd
(
&
block
,
defaultStream
);
};
auto
kernelOnlyFunc
=
[
&
]()
{
for
(
auto
&
block
:
*
blocks
)
kernelEven
(
&
block
,
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
kernelOdd
(
&
block
,
defaultStream
);
};
SweepTimeloop
timeLoop
(
blocks
->
getBlockStorage
(),
timesteps
/
2
);
const
std
::
string
timeStepStrategy
=
parameters
.
getParameter
<
std
::
string
>
(
"timeStepStrategy"
,
"normal"
);
std
::
function
<
void
()
>
timeStep
;
if
(
timeStepStrategy
==
"noOverlap"
)
timeStep
=
std
::
function
<
void
()
>
(
normalTimeStep
);
else
if
(
timeStepStrategy
==
"simpleOverlap"
)
timeStep
=
simpleOverlapTimeStep
;
else
if
(
timeStepStrategy
==
"kernelOnly"
)
{
WALBERLA_LOG_INFO_ON_ROOT
(
"Running only compute kernel without boundary - this makes only sense for benchmarking!"
)
timeStep
=
kernelOnlyFunc
;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'"
);
}
timeLoop
.
add
()
<<
BeforeFunction
(
timeStep
)
<<
Sweep
(
[](
IBlock
*
)
{},
"time step"
);
// VTK
uint_t
vtkWriteFrequency
=
parameters
.
getParameter
<
uint_t
>
(
"vtkWriteFrequency"
,
0
);
if
(
vtkWriteFrequency
>
0
)
{
auto
vtkOutput
=
vtk
::
createVTKOutput_BlockData
(
*
blocks
,
"vtk"
,
vtkWriteFrequency
,
0
,
false
,
"vtk_out"
,
"simulation_step"
,
false
,
true