diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt index 836baeefa42b5cdfe5c668eb5cdb14e4e288e93d..5d3f2dca099b2b2d3a92d6a61f2036f2452fd927 100644 --- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt @@ -5,6 +5,7 @@ waLBerla_python_file_generates(UniformGridGenerated.py GenMacroGetter.cpp GenMacroSetter.cpp GenPackInfo.cpp GenPackInfoAAPush.cpp GenPackInfoAAPull.cpp GenLbKernel.cpp GenLbKernelAAEven.cpp GenLbKernelAAOdd.cpp + GenMpiDtypeInfo.h GenMpiDtypeInfoAAPull.h GenMpiDtypeInfoAAPush.h GenDefines.h) diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm index ae22ab6b594604258e5f8fbf08a90b727684d46d..e53b3cd4f4ccdc0c1c1abd14fc3a05c0f6d8c8f7 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm +++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm @@ -1,21 +1,22 @@ DomainSetup { blocks < 1, 1, 1 >; - cellsPerBlock < 256, 128, 128 >; + cellsPerBlock < 64, 64, 64 >; periodic < 1, 1, 1 >; } Parameters { - timesteps 400; // time steps of one performance measurement - warmupSteps 1; // number of steps to run before measurement starts + timesteps 3000; // time steps of one performance measurement + warmupSteps 1; // number of steps to run before measurement starts outerIterations 1; // how many measurements to conduct - vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled + vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation - timeStepMode aaKernelOnly; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly + timeStepMode aa; // can be e.g.: twoField, twoFieldKernelOnly, aa, aaKernelOnly remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time + directComm 1; 
omega 1.8; useGui 0; diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp index bbc904de35e6a8167ffb5be76224327f639de2e4..d6671b3b2f68662ba0971a8a4bdb94d6343f1098 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp +++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp @@ -7,6 +7,7 @@ #include "field/vtk/VTKWriter.h" #include "field/AddToStorage.h" #include "blockforest/communication/UniformBufferedScheme.h" +#include "blockforest/communication/UniformDirectScheme.h" #include "timeloop/all.h" #include "core/timing/TimingPool.h" #include "core/timing/RemainingTimeLogger.h" @@ -25,6 +26,9 @@ #include "GenPackInfo.h" #include "GenPackInfoAAPush.h" #include "GenPackInfoAAPull.h" +#include "GenMpiDtypeInfo.h" +#include "GenMpiDtypeInfoAAPull.h" +#include "GenMpiDtypeInfoAAPush.h" using namespace walberla; @@ -52,7 +56,7 @@ int main( int argc, char **argv ) const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 )); const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.08); - + const bool directComm = parameters.getParameter<bool>("directComm", false); auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) { return new PdfField_T(storage->getNumberOfXCells(*block), @@ -74,6 +78,7 @@ int main( int argc, char **argv ) for( auto & b : *blocks) setterKernel(&b); + // Buffered Comm blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks ); twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) ); @@ -83,12 +88,23 @@ int main( int argc, char **argv ) blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks); aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId)); + // Direct Comm + 
blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks); + twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId)); + + blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks); + aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId)); + + blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks); + aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId)); + + using F = std::function<void()>; SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 ); if( timeStepMode == "twoField") { - timeLoop.add() << BeforeFunction(twoFieldComm, "communication" ) + timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" ) << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" ); - timeLoop.add() << BeforeFunction(twoFieldComm, "communication" ) + timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" ) << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" ); } else if ( timeStepMode == "twoFieldKernelOnly") { @@ -96,9 +112,9 @@ int main( int argc, char **argv ) timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" ); } else if ( timeStepMode == "aa") { timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" ); - timeLoop.add() << BeforeFunction( aaPullComm ) + timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) ) << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd") - << AfterFunction( aaPushComm ); + << AfterFunction( directComm ? 
F(aaPushCommDirect) : F(aaPushComm) ); } else if ( timeStepMode == "aaKernelOnly") { timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" ); timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd"); diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py index 74315df21ff29588b48bc503eb378378ef95a714..16b04e122bd61f7496433d1adca4bd42668a1b73 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py +++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py @@ -1,7 +1,7 @@ import sympy as sp import pystencils as ps from lbmpy.creationfunctions import create_lb_update_rule -from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep +from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep, generate_mpidtype_info_from_kernel from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor @@ -135,6 +135,10 @@ with CodeGeneration() as ctx: generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push', cpu_vectorize_info={'instruction_set': None}) + generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfo', update_rule_two_field) + generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPull', update_rule_aa_odd, kind='pull') + generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPush', update_rule_aa_odd, kind='push') + # Info Header infoHeaderParams = { 'stencil': stencil_str,