Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
waLBerla
waLBerla
Commits
d5c5fac4
Commit
d5c5fac4
authored
Jun 29, 2021
by
Frederik Hennig
Committed by
Markus Holzer
Jun 29, 2021
Browse files
Revamp LB GPU Benchmark App for In-Place Streaming
parent
803a82cb
Changes
21
Hide whitespace changes
Inline
Side-by-side
apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
View file @
d5c5fac4
from
pystencils.field
import
fields
from
lbmpy.advanced_streaming.utility
import
get_timesteps
,
Timestep
from
lbmpy.advanced_streaming.utility
import
get_timesteps
from
lbmpy.macroscopic_value_kernels
import
macroscopic_values_setter
from
lbmpy.stencils
import
get_stencil
from
lbmpy.creationfunctions
import
create_lb_collision_rule
,
create_lb_method
,
create_lb_update_rule
from
lbmpy.creationfunctions
import
create_lb_collision_rule
from
lbmpy.boundaries
import
NoSlip
,
UBB
,
ExtrapolationOutflow
from
pystencils_walberla
import
CodeGeneration
,
generate_sweep
,
generate_info_header
...
...
apps/benchmarks/UniformGridGPU/CMakeLists.txt
View file @
d5c5fac4
...
...
@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir
(
"simulation_setup"
)
foreach
(
config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4
entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt
cumulant cumulant_d3q27
srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_
${
config
}
FILE UniformGridGPU.py
CODEGEN_CFG
${
config
}
OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
UniformGridGPU_Defines.h
)
waLBerla_add_executable
(
NAME UniformGridBenchmarkGPU_
${
config
}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_
${
config
}
)
set_target_properties
(
UniformGridBenchmarkGPU_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
endforeach
()
foreach
(
config srt trt mrt smagorinsky entropic
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_AA_
${
config
}
FILE UniformGridGPU_AA.py
CODEGEN_CFG
${
config
}
OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
UniformGridGPU_AA_Defines.h
)
waLBerla_add_executable
(
NAME UniformGridBenchmarkGPU_AA_
${
config
}
FILES UniformGridGPU_AA.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_
${
config
}
)
set_target_properties
(
UniformGridBenchmarkGPU_AA_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
endforeach
()
foreach
(
streaming_pattern aa
)
# choose from {pull, push, aa, esotwist}
foreach
(
stencil d3q27
)
# choose from {d3q19 d3q27}
foreach
(
collision_setup srt trt mrt cumulant
)
# choose from {srt trt mrt cumulant entropic smagorinsky}
set
(
config
${
stencil
}
_
${
streaming_pattern
}
_
${
collision_setup
}
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_
${
config
}
FILE UniformGridGPU.py
CODEGEN_CFG
${
config
}
OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_InfoHeader.h
)
waLBerla_add_executable
(
NAME UniformGridGPU_
${
config
}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_
${
config
}
)
set_target_properties
(
UniformGridGPU_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
endforeach
()
endforeach
()
endforeach
()
\ No newline at end of file
apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
View file @
d5c5fac4
#include "blockforest/Initialization.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/math/Random.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "cuda/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/communication/PackInfo.h"
#include "field/vtk/VTKWriter.h"
using
namespace
walberla
;
#include "geometry/InitBoundaryHandling.h"
using
LatticeModel_T
=
lbm
::
UniformGridGPU_LatticeModel
;
#include "lbm/inplace_streaming/TimestepTracker.h"
const
auto
Q
=
LatticeModel_T
::
Stencil
::
Q
;
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include "timeloop/SweepTimeloop.h"
using
Stencil_T
=
LatticeModel_T
::
Stencil
;
using
CommunicationStencil_T
=
LatticeModel_T
::
CommunicationStencil
;
using
PdfField_T
=
GhostLayerField
<
real_t
,
Q
>
;
using
CommScheme_T
=
cuda
::
communication
::
UniformGPUScheme
<
CommunicationStencil_T
>
;
using
VelocityField_T
=
GhostLayerField
<
real_t
,
3
>
;
using
flag_t
=
walberla
::
uint8_t
;
using
FlagField_T
=
FlagField
<
flag_t
>
;
#include "InitShearVelocity.h"
#include <cmath>
void
initShearVelocity
(
const
shared_ptr
<
StructuredBlockStorage
>
&
blocks
,
BlockDataID
velFieldID
,
const
real_t
xMagnitude
=
real_t
(
0.1
),
const
real_t
fluctuationMagnitude
=
real_t
(
0.05
)
)
{
math
::
seedRandomGenerator
(
0
);
auto
halfZ
=
blocks
->
getDomainCellBB
().
zMax
()
/
2
;
for
(
auto
&
block
:
*
blocks
)
{
auto
velField
=
block
.
getData
<
VelocityField_T
>
(
velFieldID
);
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ
(
velField
,
Cell
globalCell
;
blocks
->
transformBlockLocalToGlobalCell
(
globalCell
,
block
,
Cell
(
x
,
y
,
z
));
real_t
randomReal
=
xMagnitude
*
math
::
realRandom
<
real_t
>
(
-
fluctuationMagnitude
,
fluctuationMagnitude
);
velField
->
get
(
x
,
y
,
z
,
1
)
=
real_t
(
0
);
velField
->
get
(
x
,
y
,
z
,
2
)
=
randomReal
;
if
(
globalCell
[
2
]
>=
halfZ
)
{
velField
->
get
(
x
,
y
,
z
,
0
)
=
xMagnitude
;
}
else
{
velField
->
get
(
x
,
y
,
z
,
0
)
=
-
xMagnitude
;
}
);
}
}
#include "UniformGridGPU_InfoHeader.h"
using
namespace
walberla
;
using
FlagField_T
=
FlagField
<
uint8_t
>
;
int
main
(
int
argc
,
char
**
argv
)
int
main
(
int
argc
,
char
**
argv
)
{
mpi
::
Environment
env
(
argc
,
argv
);
mpi
::
Environment
env
(
argc
,
argv
);
cuda
::
selectDeviceBasedOnMpiRank
();
for
(
auto
cfg
=
python_coupling
::
configBegin
(
argc
,
argv
);
cfg
!=
python_coupling
::
configEnd
();
++
cfg
)
for
(
auto
cfg
=
python_coupling
::
configBegin
(
argc
,
argv
);
cfg
!=
python_coupling
::
configEnd
();
++
cfg
)
{
WALBERLA_MPI_WORLD_BARRIER
();
WALBERLA_MPI_WORLD_BARRIER
()
WALBERLA_CUDA_CHECK
(
cudaPeekAtLastError
())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto
config
=
*
cfg
;
logging
::
configureLogging
(
config
);
auto
blocks
=
blockforest
::
createUniformBlockGridFromConfig
(
config
);
logging
::
configureLogging
(
config
);
auto
blocks
=
blockforest
::
createUniformBlockGridFromConfig
(
config
);
Vector3
<
uint_t
>
cellsPerBlock
=
config
->
getBlock
(
"DomainSetup"
).
getParameter
<
Vector3
<
uint_t
>
>
(
"cellsPerBlock"
);
Vector3
<
uint_t
>
cellsPerBlock
=
config
->
getBlock
(
"DomainSetup"
).
getParameter
<
Vector3
<
uint_t
>
>
(
"cellsPerBlock"
);
// Reading parameters
auto
parameters
=
config
->
getOneBlock
(
"Parameters"
);
const
std
::
string
timeStepStrategy
=
parameters
.
getParameter
<
std
::
string
>
(
"timeStepStrategy"
,
"normal"
);
const
real_t
omega
=
parameters
.
getParameter
<
real_t
>
(
"omega"
,
real_c
(
1.4
));
const
uint_t
timesteps
=
parameters
.
getParameter
<
uint_t
>
(
"timesteps"
,
uint_c
(
50
));
auto
parameters
=
config
->
getOneBlock
(
"Parameters"
);
const
real_t
omega
=
parameters
.
getParameter
<
real_t
>
(
"omega"
,
real_c
(
1.4
));
const
uint_t
timesteps
=
parameters
.
getParameter
<
uint_t
>
(
"timesteps"
,
uint_c
(
50
));
const
bool
initShearFlow
=
parameters
.
getParameter
<
bool
>
(
"initShearFlow"
,
true
);
// Creating fields
BlockDataID
pdfFieldCpuID
=
field
::
addToStorage
<
PdfField_T
>
(
blocks
,
"pdfs cpu"
,
real_t
(
0
),
field
::
fzyx
);
BlockDataID
velFieldCpuID
=
field
::
addToStorage
<
VelocityField_T
>
(
blocks
,
"vel"
,
real_t
(
0
),
field
::
fzyx
);
BlockDataID
pdfFieldCpuID
=
field
::
addToStorage
<
PdfField_T
>
(
blocks
,
"pdfs cpu"
,
real_t
(
std
::
nan
(
""
)),
field
::
fzyx
);
BlockDataID
velFieldCpuID
=
field
::
addToStorage
<
VelocityField_T
>
(
blocks
,
"vel"
,
real_t
(
0
),
field
::
fzyx
);
// Initialize velocity on cpu
if
(
initShearFlow
){
WALBERLA_LOG_INFO_ON_ROOT
(
"Initializing shear flow"
)
initShearVelocity
(
blocks
,
velFieldCpuID
);
}
BlockDataID
pdfFieldGpuID
=
cuda
::
addGPUFieldToStorage
<
PdfField_T
>
(
blocks
,
pdfFieldCpuID
,
"pdfs on GPU"
,
true
);
// Velocity field is copied to the GPU
BlockDataID
velFieldGpuID
=
cuda
::
addGPUFieldToStorage
<
VelocityField_T
>
(
blocks
,
velFieldCpuID
,
"velocity on GPU"
,
true
);
if
(
timeStepStrategy
!=
"kernelOnlyNoInit"
)
pystencils
::
UniformGridGPU_MacroSetter
setterSweep
(
pdfFieldGpuID
,
velFieldGpuID
);
// Set up initial PDF values
for
(
auto
&
block
:
*
blocks
)
setterSweep
(
&
block
);
Vector3
<
int
>
innerOuterSplit
=
parameters
.
getParameter
<
Vector3
<
int
>
>
(
"innerOuterSplit"
,
Vector3
<
int
>
(
1
,
1
,
1
));
for
(
uint_t
i
=
0
;
i
<
3
;
++
i
)
{
if
(
initShearFlow
)
{
WALBERLA_LOG_INFO_ON_ROOT
(
"Initializing shear flow"
);
initShearVelocity
(
blocks
,
velFieldCpuID
);
}
pystencils
::
UniformGridGPU_MacroSetter
setterSweep
(
pdfFieldCpuID
,
velFieldCpuID
);
for
(
auto
&
block
:
*
blocks
)
setterSweep
(
&
block
);
// setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
blockforest
::
communication
::
UniformBufferedScheme
<
CommunicationStencil_T
>
initialComm
(
blocks
);
initialComm
.
addPackInfo
(
make_shared
<
field
::
communication
::
PackInfo
<
PdfField_T
>
>
(
pdfFieldCpuID
)
);
initialComm
();
if
(
int_c
(
cellsPerBlock
[
i
])
<=
innerOuterSplit
[
i
]
*
2
)
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"innerOuterSplit too large - make it smaller or increase cellsPerBlock"
)
}
}
BlockDataID
pdfFieldGpuID
=
cuda
::
addGPUFieldToStorage
<
PdfField_T
>
(
blocks
,
pdfFieldCpuID
,
"pdfs on GPU"
,
true
);
BlockDataID
flagFieldID
=
field
::
addFlagFieldToStorage
<
FlagField_T
>
(
blocks
,
"flag field"
);
Cell
innerOuterSplitCell
(
innerOuterSplit
[
0
],
innerOuterSplit
[
1
],
innerOuterSplit
[
2
]);
bool
cudaEnabledMPI
=
parameters
.
getParameter
<
bool
>
(
"cudaEnabledMPI"
,
false
);
Vector3
<
int32_t
>
gpuBlockSize
=
parameters
.
getParameter
<
Vector3
<
int32_t
>
>
(
"gpuBlockSize"
,
Vector3
<
int32_t
>
(
256
,
1
,
1
));
int
streamHighPriority
=
0
;
int
streamLowPriority
=
0
;
WALBERLA_CUDA_CHECK
(
cudaDeviceGetStreamPriorityRange
(
&
streamLowPriority
,
&
streamHighPriority
))
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// LB SWEEPS AND BOUNDARY HANDLING ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
using
LbSweep
=
lbm
::
UniformGridGPU_LbKernel
;
using
PackInfoEven
=
lbm
::
UniformGridGPU_PackInfoEven
;
using
PackInfoOdd
=
lbm
::
UniformGridGPU_PackInfoOdd
;
using
cuda
::
communication
::
UniformGPUScheme
;
LbSweep
lbSweep
(
pdfFieldGpuID
,
omega
,
gpuBlockSize
[
0
],
gpuBlockSize
[
1
],
gpuBlockSize
[
2
],
innerOuterSplitCell
);
lbSweep
.
setOuterPriority
(
streamHighPriority
);
// Boundaries
const
FlagUID
fluidFlagUID
(
"Fluid"
);
BlockDataID
flagFieldID
=
field
::
addFlagFieldToStorage
<
FlagField_T
>
(
blocks
,
"Boundary Flag Field"
);
auto
boundariesConfig
=
config
->
getBlock
(
"Boundaries"
);
bool
disableB
oundaries
=
tru
e
;
bool
b
oundaries
=
fals
e
;
if
(
boundariesConfig
)
{
disableB
oundaries
=
fals
e
;
geometry
::
initBoundaryHandling
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
boundariesConfig
);
geometry
::
setNonBoundaryCellsToDomain
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
fluidFlagUID
);
b
oundaries
=
tru
e
;
geometry
::
initBoundaryHandling
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
boundariesConfig
);
geometry
::
setNonBoundaryCellsToDomain
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
fluidFlagUID
);
}
lbm
::
UniformGridGPU_UBB
ubb
(
blocks
,
pdfFieldGpuID
);
lbm
::
UniformGridGPU_NoSlip
noSlip
(
blocks
,
pdfFieldGpuID
);
noSlip
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"NoSlip"
),
fluidFlagUID
);
ubb
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"UBB"
),
fluidFlagUID
);
noSlip
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"NoSlip"
),
fluidFlagUID
);
// Communication setup
bool
cudaEnabledMPI
=
parameters
.
getParameter
<
bool
>
(
"cudaEnabledMPI"
,
false
);
Vector3
<
int32_t
>
gpuBlockSize
=
parameters
.
getParameter
<
Vector3
<
int32_t
>
>
(
"gpuBlockSize"
,
Vector3
<
int32_t
>
(
256
,
1
,
1
));
const
std
::
string
communicationSchemeStr
=
parameters
.
getParameter
<
std
::
string
>
(
"communicationScheme"
,
"UniformGPUScheme_Baseline"
);
CommunicationSchemeType
communicationScheme
;
if
(
communicationSchemeStr
==
"GPUPackInfo_Baseline"
)
communicationScheme
=
GPUPackInfo_Baseline
;
else
if
(
communicationSchemeStr
==
"GPUPackInfo_Streams"
)
communicationScheme
=
GPUPackInfo_Streams
;
else
if
(
communicationSchemeStr
==
"UniformGPUScheme_Baseline"
)
communicationScheme
=
UniformGPUScheme_Baseline
;
else
if
(
communicationSchemeStr
==
"UniformGPUScheme_Memcpy"
)
communicationScheme
=
UniformGPUScheme_Memcpy
;
else
if
(
communicationSchemeStr
==
"MPIDatatypes"
)
communicationScheme
=
MPIDatatypes
;
else
if
(
communicationSchemeStr
==
"MPIDatatypesFull"
)
communicationScheme
=
MPIDatatypesFull
;
else
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"Invalid choice for communicationScheme"
)
}
lbm
::
UniformGridGPU_UBB
ubb
(
blocks
,
pdfFieldGpuID
);
ubb
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"UBB"
),
fluidFlagUID
);
Vector3
<
int
>
innerOuterSplit
=
parameters
.
getParameter
<
Vector3
<
int
>
>
(
"innerOuterSplit"
,
Vector3
<
int
>
(
1
,
1
,
1
));
for
(
uint_t
i
=
0
;
i
<
3
;
++
i
)
{
if
(
int_c
(
cellsPerBlock
[
i
])
<=
innerOuterSplit
[
i
]
*
2
)
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"innerOuterSplit too large - make it smaller or increase cellsPerBlock"
);
}
}
// Initial setup is the post-collision state of an even time step
auto
tracker
=
make_shared
<
lbm
::
TimestepTracker
>
(
0
);
int
streamHighPriority
=
0
;
int
streamLowPriority
=
0
;
WALBERLA_CUDA_CHECK
(
cudaDeviceGetStreamPriorityRange
(
&
streamLowPriority
,
&
streamHighPriority
)
);
WALBERLA_CHECK
(
gpuBlockSize
[
2
]
==
1
);
pystencils
::
UniformGridGPU_LbKernel
lbKernel
(
pdfFieldGpuID
,
omega
,
1.1
,
1.2
,
1.3
,
1.4
,
1.5
,
1.6
,
1.7
,
gpuBlockSize
[
0
],
gpuBlockSize
[
1
],
Cell
(
innerOuterSplit
[
0
],
innerOuterSplit
[
1
],
innerOuterSplit
[
2
])
);
lbKernel
.
setOuterPriority
(
streamHighPriority
);
UniformGridGPU_Communication
<
CommunicationStencil_T
,
cuda
::
GPUField
<
double
>
>
gpuComm
(
blocks
,
pdfFieldGpuID
,
(
CommunicationSchemeType
)
communicationScheme
,
cudaEnabledMPI
);
auto
defaultStream
=
cuda
::
StreamRAII
::
newPriorityStream
(
streamLowPriority
);
auto
innerOuterStreams
=
cuda
::
ParallelStreams
(
streamHighPriority
);
auto
boundaryOuterStreams
=
cuda
::
ParallelStreams
(
streamHighPriority
);
auto
boundaryInnerStreams
=
cuda
::
ParallelStreams
(
streamHighPriority
);
uint_t
currentTimeStep
=
0
;
auto
simpleOverlapTimeStep
=
[
&
]
()
{
gpuComm
.
startCommunication
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
lbKernel
.
inner
(
&
block
,
defaultStream
);
gpuComm
.
wait
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
lbKernel
.
outer
(
&
block
,
defaultStream
);
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto
overlapTimeStep
=
[
&
]()
{
cuda
::
NvtxRange
namedRange
(
"timestep"
);
auto
innerOuterSection
=
innerOuterStreams
.
parallelSection
(
defaultStream
);
UniformGPUScheme
<
Stencil_T
>
comm
(
blocks
,
cudaEnabledMPI
);
auto
packInfo
=
make_shared
<
lbm
::
CombinedInPlaceGpuPackInfo
<
PackInfoEven
,
PackInfoOdd
>
>
(
tracker
,
pdfFieldGpuID
);
comm
.
addPackInfo
(
packInfo
);
innerOuterSection
.
run
([
&
](
auto
innerStream
)
{
cuda
::
nameStream
(
innerStream
,
"inner stream"
);
for
(
auto
&
block
:
*
blocks
)
{
if
(
!
disableBoundaries
)
{
auto
p
=
boundaryInnerStreams
.
parallelSection
(
innerStream
);
p
.
run
(
[
&
block
,
&
ubb
](
cudaStream_t
s
)
{
ubb
.
inner
(
&
block
,
s
);
}
);
p
.
run
(
[
&
block
,
&
noSlip
](
cudaStream_t
s
)
{
noSlip
.
inner
(
&
block
,
s
);
}
);
}
lbKernel
.
inner
(
&
block
,
innerStream
);
}
});
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
innerOuterSection
.
run
([
&
](
auto
outerStream
)
{
cuda
::
nameStream
(
outerStream
,
"outer stream"
);
gpuComm
(
outerStream
);
auto
defaultStream
=
cuda
::
StreamRAII
::
newPriorityStream
(
streamLowPriority
);
for
(
auto
&
block
:
*
blocks
)
{
if
(
!
disableBoundaries
)
{
auto
p
=
boundaryOuterStreams
.
parallelSection
(
outerStream
);
p
.
run
(
[
&
block
,
&
ubb
](
cudaStream_t
s
)
{
ubb
.
outer
(
&
block
,
s
);
}
);
p
.
run
(
[
&
block
,
&
noSlip
](
cudaStream_t
s
)
{
noSlip
.
outer
(
&
block
,
s
);
}
);
}
lbKernel
.
outer
(
&
block
,
outerStream
);
}
});
currentTimeStep
+=
1
;
auto
boundarySweep
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
){
noSlip
.
run
(
block
,
t
,
stream
);
ubb
.
run
(
block
,
t
,
stream
);
};
auto
boundaryInner
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
){
noSlip
.
inner
(
block
,
t
,
stream
);
ubb
.
inner
(
block
,
t
,
stream
);
};
auto
boundaryStreams
=
cuda
::
ParallelStreams
(
streamHighPriority
);
auto
normalTimeStep
=
[
&
]()
{
gpuComm
();
for
(
auto
&
block
:
*
blocks
)
{
if
(
!
disableBoundaries
)
{
auto
p
=
boundaryStreams
.
parallelSection
(
defaultStream
);
p
.
run
(
[
&
block
,
&
ubb
](
cudaStream_t
s
)
{
ubb
(
&
block
,
s
);
}
);
p
.
run
(
[
&
block
,
&
noSlip
](
cudaStream_t
s
)
{
noSlip
(
&
block
,
s
);
}
);
}
lbKernel
(
&
block
);
auto
boundaryOuter
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
){
noSlip
.
outer
(
block
,
t
,
stream
);
ubb
.
outer
(
block
,
t
,
stream
);
};
auto
simpleOverlapTimeStep
=
[
&
]()
{
// Communicate post-collision values of previous timestep...
comm
.
startCommunication
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
){
if
(
boundaries
)
boundaryInner
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
lbSweep
.
inner
(
&
block
,
tracker
->
getCounterPlusOne
(),
defaultStream
);
}
comm
.
wait
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
){
if
(
boundaries
)
boundaryOuter
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
lbSweep
.
outer
(
&
block
,
tracker
->
getCounterPlusOne
(),
defaultStream
);
}
tracker
->
advance
();
};
auto
kernelOnlyFunc
=
[
&
]
()
{
for
(
auto
&
block
:
*
blocks
)
lbKernel
(
&
block
);
auto
normalTimeStep
=
[
&
]()
{
comm
.
communicate
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
){
if
(
boundaries
)
boundarySweep
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
lbSweep
(
&
block
,
tracker
->
getCounterPlusOne
(),
defaultStream
);
}
tracker
->
advance
();
};
SweepTimeloop
timeLoop
(
blocks
->
getBlockStorage
(),
timesteps
);
// With two-fields patterns, ghost layer cells act as constant stream-in boundaries;
// with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
auto
kernelOnlyFunc
=
[
&
]()
{
tracker
->
advance
();
for
(
auto
&
block
:
*
blocks
)
lbSweep
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
};
std
::
function
<
void
()
>
timeStep
;
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop
timeLoop
(
blocks
->
getBlockStorage
(),
timesteps
);
const
std
::
string
timeStepStrategy
=
parameters
.
getParameter
<
std
::
string
>
(
"timeStepStrategy"
,
"normal"
);
std
::
function
<
void
()
>
timeStep
;
if
(
timeStepStrategy
==
"noOverlap"
)
timeStep
=
std
::
function
<
void
()
>
(
normalTimeStep
);
else
if
(
timeStepStrategy
==
"complexOverlap"
)
timeStep
=
std
::
function
<
void
()
>
(
overlapTimeStep
);
timeStep
=
std
::
function
<
void
()
>
(
normalTimeStep
);
else
if
(
timeStepStrategy
==
"simpleOverlap"
)
timeStep
=
simpleOverlapTimeStep
;
else
if
(
timeStepStrategy
==
"kernelOnly"
or
timeStepStrategy
==
"kernelOnlyNoInit"
)
{
WALBERLA_LOG_INFO_ON_ROOT
(
"Running only compute kernel without boundary - this makes only sense for benchmarking!"
)
timeStep
=
kernelOnlyFunc
;
timeStep
=
simpleOverlapTimeStep
;
else
if
(
timeStepStrategy
==
"kernelOnly"
)
{
WALBERLA_LOG_INFO_ON_ROOT
(
"Running only compute kernel without boundary - this makes only sense for benchmarking!"
)
// Run initial communication once to provide any missing stream-in populations
comm
.
communicate
();
timeStep
=
kernelOnlyFunc
;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'"
);
else
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
"'simpleOverlap', 'kernelOnly'"
)
}
timeLoop
.
add
()
<<
BeforeFunction
(
timeStep
)
<<
Sweep
(
[](
IBlock
*
)
{},
"time step"
);
pystencils
::
UniformGridGPU_MacroGetter
getterSweep
(
pdfFieldCpuID
,
velFieldCpuID
);
timeLoop
.
add
()
<<
BeforeFunction
(
timeStep
)
<<
Sweep
([](
IBlock
*
)
{},
"time step"
);
// VTK
uint_t
vtkWriteFrequency
=
parameters
.
getParameter
<
uint_t
>
(
"vtkWriteFrequency"
,
0
);
if
(
vtkWriteFrequency
>
0
)
uint_t
vtkWriteFrequency
=
parameters
.
getParameter
<
uint_t
>
(
"vtkWriteFrequency"
,
0
);
if
(
vtkWriteFrequency
>
0
)
{
auto
vtkOutput
=
vtk
::
createVTKOutput_BlockData
(
*
blocks
,
"vtk"
,
vtkWriteFrequency
,
0
,
false
,
"vtk_out"
,
"simulation_step"
,
false
,
true
,
true
,
false
,
0
);
auto
velWriter
=
make_shared
<
field
::
VTKWriter
<
VelocityField_T
>
>
(
velFieldCpuID
,
"vel"
);
auto
vtkOutput
=
vtk
::
createVTKOutput_BlockData
(
*
blocks
,
"vtk"
,
vtkWriteFrequency
,
0
,
false
,
"vtk_out"
,
"simulation_step"
,
false
,
true
,
true
,
false
,
0
);
auto
velWriter
=
make_shared
<
field
::
VTKWriter
<
VelocityField_T
>
>
(
velFieldCpuID
,
"vel"
);
vtkOutput
->
addCellDataWriter
(
velWriter
);
vtkOutput
->
addBeforeFunction
(
[
&
]()
{
cuda
::
fieldCpy
<
PdfField_T
,
cuda
::
GPUField
<
real_t
>
>
(
blocks
,
pdfFieldCpuID
,
pdfFieldGpuID
);
for
(
auto
&
block
:
*
blocks
)
getterSweep
(
&
block
);
vtkOutput
->
addBeforeFunction
([
&
]()
{
cuda
::
fieldCpy
<
VelocityField_T
,
cuda
::
GPUField
<
real_t
>
>
(
blocks
,
velFieldCpuID
,
velFieldGpuID
);
});
timeLoop
.
addFuncAfterTimeStep
(
vtk
::
writeFiles
(
vtkOutput
),
"VTK Output"
);
timeLoop
.
addFuncAfterTimeStep
(
vtk
::
writeFiles
(
vtkOutput
),
"VTK Output"
);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int
warmupSteps
=
parameters
.
getParameter
<
int
>
(
"warmupSteps"
,
2
);
int
outerIterations
=
parameters
.
getParameter
<
int
>
(
"outerIterations"
,
1
);
for
(
int
i
=
0
;
i
<
warmupSteps
;
++
i
)