Commit d5c5fac4 authored by Frederik Hennig, committed by Markus Holzer
Browse files

Revamp LB GPU Benchmark App for In-Place Streaming

parent 803a82cb
from pystencils.field import fields from pystencils.field import fields
from lbmpy.advanced_streaming.utility import get_timesteps, Timestep from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil from lbmpy.stencils import get_stencil
from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
......
...@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" ) ...@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" ) waLBerla_link_files_to_builddir( "simulation_setup" )
foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt foreach(stencil d3q27) # choose from {d3q19 d3q27}
cumulant cumulant_d3q27 foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt) set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} FILE UniformGridGPU.py
FILE UniformGridGPU.py CODEGEN_CFG ${config}
CODEGEN_CFG ${config} OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h UniformGridGPU_InfoHeader.h
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h )
UniformGridGPU_Defines.h
)
waLBerla_add_executable(NAME UniformGridGPU_${config}
FILES UniformGridGPU.cpp
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config} DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
FILES UniformGridGPU.cpp set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config}) endforeach ()
set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) endforeach()
endforeach () endforeach()
\ No newline at end of file
# For every configuration name in the list, register one code-generation target
# and one benchmark executable for the AA variant of the uniform-grid GPU app.
foreach (config srt trt mrt smagorinsky entropic)
# Code-generation target: passes UniformGridGPU_AA.py as FILE and the current
# configuration name as CODEGEN_CFG.
# NOTE(review): OUT_FILES presumably lists the files the script generates
# (pack infos, even/odd LB kernels, macroscopic setter/getter, defines header);
# the macro is defined elsewhere in waLBerla - confirm its exact semantics there.
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config}
FILE UniformGridGPU_AA.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
UniformGridGPU_AA_Defines.h
)
# Benchmark executable built from UniformGridGPU_AA.cpp; its DEPENDS list names
# the waLBerla modules plus the per-configuration generated target from above.
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config})
# Set the CXX_VISIBILITY_PRESET target property to "hidden" for this executable.
set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach ()
import sympy as sp import sympy as sp
import numpy as np import numpy as np
import pystencils as ps import pystencils as ps
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
from pystencils_walberla import generate_pack_info_from_kernel
from lbmpy_walberla import generate_lattice_model, generate_boundary
from pystencils_walberla import CodeGeneration, generate_sweep
from pystencils.data_types import TypedSymbol from pystencils.data_types import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
from lbmpy.advanced_streaming import Timestep, is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
omega = sp.symbols("omega") omega = sp.symbols("omega")
omega_free = sp.Symbol("omega_free") omega_free = sp.Symbol("omega_free")
omega_fill = sp.symbols("omega_:10")
compile_time_block_size = False compile_time_block_size = False
if compile_time_block_size: if compile_time_block_size:
...@@ -21,156 +24,158 @@ if compile_time_block_size: ...@@ -21,156 +24,158 @@ if compile_time_block_size:
else: else:
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32), TypedSymbol("cudaBlockSize1", np.int32),
1) TypedSymbol("cudaBlockSize2", np.int32))
sweep_params = {'block_size': sweep_block_size} gpu_indexing_params = {'block_size': sweep_block_size}
options_dict = { options_dict = {
'srt': { 'srt': {
'method': 'srt', 'method': 'srt',
'stencil': 'D3Q19',
'relaxation_rate': omega, 'relaxation_rate': omega,
'compressible': False, 'compressible': False,
}, },
'trt': { 'trt': {
'method': 'trt', 'method': 'trt',
'stencil': 'D3Q19',
'relaxation_rate': omega, 'relaxation_rate': omega,
}, },
'mrt': { 'mrt': {
'method': 'mrt', 'method': 'mrt',
'stencil': 'D3Q19', 'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
}, },
'mrt_full': { 'mrt-overrelax': {
'method': 'mrt', 'method': 'mrt',
'stencil': 'D3Q19', 'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
omega_fill[3], omega_fill[4], omega_fill[5]],
}, },
'entropic': { 'cumulant': {
'method': 'mrt', 'method': 'cumulant',
'stencil': 'D3Q19', 'relaxation_rate': omega,
'compressible': True, 'compressible': True,
'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free],
'entropic': True,
}, },
'entropic_kbc_n4': { 'cumulant-overrelax': {
'method': 'trt-kbc-n4', 'method': 'cumulant',
'stencil': 'D3Q27', 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True, 'compressible': True,
'relaxation_rates': [omega, omega_free], },
'entropic': {
'method': 'mrt',
'compressible': True,
'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
'entropic': True, 'entropic': True,
}, },
'smagorinsky': { 'smagorinsky': {
'method': 'srt', 'method': 'srt',
'stencil': 'D3Q19',
'smagorinsky': True, 'smagorinsky': True,
'relaxation_rate': omega, 'relaxation_rate': omega,
}, }
'cumulant': {
'method': 'cumulant',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rate': omega,
},
} }
info_header = """ info_header = """
#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
const char * infoStencil = "{stencil}"; const char * infoStencil = "{stencil}";
const char * infoConfigName = "{configName}"; const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global}; const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs}; const bool infoCsePdfs = {cse_pdfs};
""" """
# DEFAULTS
optimize = True
with CodeGeneration() as ctx: with CodeGeneration() as ctx:
accessor = StreamPullTwoFieldsAccessor() config_tokens = ctx.config.split('_')
# accessor = StreamPushTwoFieldsAccessor()
assert not accessor.is_inplace, "This app does not work for inplace accessors" assert len(config_tokens) >= 3
stencil_str = config_tokens[0]
streaming_pattern = config_tokens[1]
collision_setup = config_tokens[2]
if len(config_tokens) >= 4:
optimize = (config_tokens[3] != 'noopt')
stencil = get_stencil(stencil_str)
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup]
q = len(stencil)
dim = len(stencil[0])
assert dim == 3, "This app supports only three-dimensional stencils"
pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx')
common_options = { common_options = {
'field_name': 'pdfs', 'stencil': stencil,
'temporary_field_name': 'pdfs_tmp', 'field_name': pdfs.name,
'kernel_type': accessor, 'optimization': {
'optimization': {'cse_global': True, 'target': 'gpu',
'cse_pdfs': False} 'cse_global': True,
'cse_pdfs': False,
'symbolic_field': pdfs,
'field_layout': 'fzyx',
'gpu_indexing_params': gpu_indexing_params,
}
} }
config_name = ctx.config
noopt = False
d3q27 = False
if config_name.endswith("_noopt"):
noopt = True
config_name = config_name[:-len("_noopt")]
if config_name.endswith("_d3q27"):
d3q27 = True
config_name = config_name[:-len("_d3q27")]
options = options_dict[config_name]
options.update(common_options)
options = options.copy()
if noopt: options.update(common_options)
options['optimization']['cse_global'] = False
options['optimization']['cse_pdfs'] = False
if d3q27:
options['stencil'] = 'D3Q27'
stencil_str = options['stencil'] if not is_inplace(streaming_pattern):
q = int(stencil_str[stencil_str.find('Q') + 1:]) options['optimization']['symbolic_temporary_field'] = pdfs_tmp
pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') field_swaps = [(pdfs, pdfs_tmp)]
options['optimization']['symbolic_field'] = pdfs else:
field_swaps = []
vp = [ vp = [
('double', 'omega_0'),
('double', 'omega_1'),
('double', 'omega_2'),
('double', 'omega_3'),
('double', 'omega_4'),
('double', 'omega_5'),
('double', 'omega_6'),
('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1'), ('int32_t', 'cudaBlockSize1'),
('int32_t', 'cudaBlockSize2')
] ]
lb_method = create_lb_method(**options)
update_rule = create_lb_update_rule(lb_method=lb_method, **options) # LB Sweep
collision_rule = create_lb_collision_rule(**options)
if not noopt:
update_rule = insert_fast_divisions(update_rule) if optimize:
update_rule = insert_fast_sqrts(update_rule) collision_rule = insert_fast_divisions(collision_rule)
collision_rule = insert_fast_sqrts(collision_rule)
# CPU lattice model - required for macroscopic value computation, VTK output etc.
options_without_opt = options.copy() lb_method = collision_rule.method
del options_without_opt['optimization']
generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method, generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, streaming_pattern,
**options_without_opt)) optimization=options['optimization'],
inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps)
# gpu LB sweep & boundaries
generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
field_swaps=[('pdfs', 'pdfs_tmp')],
inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
varying_parameters=vp)
generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
# getter & setter # getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1.0) pdfs=pdfs,
getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector, streaming_pattern=streaming_pattern,
pdfs=pdfs.center_vector, density=None) previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments) generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
# Boundaries
noslip = NoSlip()
ubb = UBB((0.05, 0, 0))
generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_NoSlip', noslip, lb_method, field_name=pdfs.name,
streaming_pattern=streaming_pattern, target='gpu')
generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_UBB', ubb, lb_method, field_name=pdfs.name,
streaming_pattern=streaming_pattern, target='gpu')
# communication # communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu') generate_lb_pack_info(ctx, 'UniformGridGPU_PackInfo', stencil, pdfs,
streaming_pattern=streaming_pattern, target='gpu',
always_generate_separate_classes=True)
infoHeaderParams = { infoHeaderParams = {
'stencil': stencil_str, 'stencil': stencil_str,
'q': q, 'streaming_pattern': streaming_pattern,
'configName': ctx.config, 'collision_setup': collision_setup,
'cse_global': int(options['optimization']['cse_global']), 'cse_global': int(options['optimization']['cse_global']),
'cse_pdfs': int(options['optimization']['cse_pdfs']), 'cse_pdfs': int(options['optimization']['cse_pdfs']),
} }
ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams))
stencil_typedefs = {'Stencil_T': stencil,
'CommunicationStencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field}
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'UniformGridGPU_InfoHeader',
stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
additional_code=info_header.format(**infoHeaderParams))
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "InitShearVelocity.h"
#include "gui/Gui.h"
#ifdef WALBERLA_ENABLE_GUI
#include "lbm/gui/PdfFieldDisplayAdaptor.h"
#endif
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
#include "UniformGridGPU_AA_MacroSetter.h"
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
#include "UniformGridGPU_AA_Defines.h"
#include <cmath>
// Pull all walberla names into scope for the rest of this translation unit.
using namespace walberla;
// The communication stencil is an alias of Stencil_T.
// NOTE(review): Stencil_T is not declared in this file; presumably it comes from
// the included generated header UniformGridGPU_AA_Defines.h - confirm.
using CommunicationStencil_T = Stencil_T;
// Field type used for the PDFs: real_t entries, second template argument Stencil_T::Q
// (presumably the number of values stored per cell - confirm against GhostLayerField).
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
// Field type used for the velocity: real_t entries, second template argument 3.
using VelocityField_T = GhostLayerField< real_t, 3 >;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
cuda::selectDeviceBasedOnMpiRank();
for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
for ( auto &block : *blocks )
setterSweep( &block );
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
for(uint_t i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
WALBERLA_CHECK( gpuBlockSize[2] == 1 );
using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
using cuda::communication::UniformGPUScheme;
KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
kernelEven.setOuterPriority( streamHighPriority );
kernelOdd .setOuterPriority( streamHighPriority );
auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto setupPhase = [&]() {
for ( auto &block: *blocks )
kernelEven( &block );
pullScheme->communicate();
for ( auto &block: *blocks )
kernelOdd( &block );
};
auto tearDownPhase = [&]() {
pushScheme->communicate();
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for ( auto &block : *blocks )
getterSweep( &block );
};
auto simpleOverlapTimeStep = [&]()
{
// Even
pushScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelEven.inner( &block, defaultStream );
pushScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelEven.outer( &block, defaultStream );
// Odd
pullScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelOdd.inner( &block, defaultStream );
pullScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelOdd.outer( &block, defaultStream );
};
auto normalTimeStep = [&]()
{
pushScheme->communicate( defaultStream );
for ( auto &block: *blocks )