Commit d5c5fac4 authored by Frederik Hennig's avatar Frederik Hennig Committed by Markus Holzer

Revamp LB GPU Benchmark App for In-Place Streaming

parent 803a82cb
from pystencils.field import fields
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach (streaming_pattern aa) # choose from {pull, push, aa, esotwist}
    foreach (stencil d3q27) # choose from {d3q19 d3q27}
        foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
                    FILE UniformGridGPU.py
                    CODEGEN_CFG ${config}
                    OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
                    UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
                    UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
                    UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
                    UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
                    UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
                    UniformGridGPU_InfoHeader.h
                    )
            waLBerla_add_executable(NAME UniformGridGPU_${config}
                    FILES UniformGridGPU.cpp
                    DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
            set_target_properties(UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
        endforeach ()
    endforeach ()
endforeach ()
#include "blockforest/Initialization.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/math/Random.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "cuda/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/communication/PackInfo.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm/inplace_streaming/TimestepTracker.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include "timeloop/SweepTimeloop.h"
#include "InitShearVelocity.h"
#include <cmath>
#include "UniformGridGPU_InfoHeader.h"
using namespace walberla;
using FlagField_T = FlagField<uint8_t>;
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
cuda::selectDeviceBasedOnMpiRank();
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto config = *cfg;
logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldCpuID =
field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_t(std::nan("")), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
// Initialize velocity on cpu
if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldCpuID);
}
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true);
// Velocity field is copied to the GPU
BlockDataID velFieldGpuID =
cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
if( timeStepStrategy != "kernelOnlyNoInit")
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID);
// Set up initial PDF values
for (auto& block : *blocks)
setterSweep(&block);
Vector3< int > innerOuterSplit =
parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
for (uint_t i = 0; i < 3; ++i)
{
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
}
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
Vector3< int32_t > gpuBlockSize =
parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// LB SWEEPS AND BOUNDARY HANDLING ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
using LbSweep = lbm::UniformGridGPU_LbKernel;
using PackInfoEven = lbm::UniformGridGPU_PackInfoEven;
using PackInfoOdd = lbm::UniformGridGPU_PackInfoOdd;
using cuda::communication::UniformGPUScheme;
LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
lbSweep.setOuterPriority(streamHighPriority);
// Boundaries
const FlagUID fluidFlagUID( "Fluid" );
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock( "Boundaries" );
bool boundaries = false;
if( boundariesConfig )
{
boundaries = true;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
}
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
// Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0);
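// The tracker keeps the even/odd parity of the in-place streaming pattern. Generated
// kernels, boundary sweeps and pack infos all take this parity as a parameter, so a
// single PDF field suffices where two-field schemes need a temporary copy.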
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
comm.addPackInfo(packInfo);
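// CombinedInPlaceGpuPackInfo wraps the generated PackInfoEven/PackInfoOdd and consults
// the tracker on every exchange to dispatch to the packing kernels of the current parity.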
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
noSlip.run(block, t, stream);
ubb.run(block, t, stream);
};
auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
noSlip.inner(block, t, stream);
ubb.inner(block, t, stream);
};
auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
noSlip.outer(block, t, stream);
ubb.outer(block, t, stream);
};
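// Parity convention for the time step functions below: boundaries act on the populations
// written by the previous step (tracker->getCounter()), while the LB sweep already
// streams with the next parity (tracker->getCounterPlusOne()); tracker->advance() then
// flips the parity once the step is complete.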
auto simpleOverlapTimeStep = [&]() {
// Communicate post-collision values of previous timestep...
comm.startCommunication(defaultStream);
for (auto& block : *blocks){
if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
}
comm.wait(defaultStream);
for (auto& block : *blocks){
if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
}
tracker->advance();
};
auto normalTimeStep = [&]() {
comm.communicate(defaultStream);
for (auto& block : *blocks){
if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
}
tracker->advance();
};
// With two-fields patterns, ghost layer cells act as constant stream-in boundaries;
// with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
auto kernelOnlyFunc = [&]() {
tracker->advance();
for (auto& block : *blocks)
lbSweep(&block, tracker->getCounter(), defaultStream);
};
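// Advancing the tracker first and then streaming with getCounter() is equivalent to
// streaming with getCounterPlusOne() and advancing afterwards, as the other strategies do.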
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
std::function< void() > timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function< void() >(normalTimeStep);
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only the compute kernel without boundary handling - this only makes sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
comm.communicate();
timeStep = kernelOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
"'simpleOverlap', 'kernelOnly'")
}
timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
});
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
for(int i=0; i < warmupSteps; ++i )