diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
index 22493880997cfecd041cb36e99c7a76fe1e2f122..41d29bbccba9df9ed7850b0daa7e0a71a4e157a8 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
@@ -1,13 +1,13 @@
 DomainSetup
 {
    blocks        < 1, 1, 1 >;
-   cellsPerBlock < 64, 64, 64 >;
+   cellsPerBlock < 300, 64, 64 >;
    periodic      < 1, 1, 1 >;
 }
 
 Parameters
 {
-   timeStepMode twoField;
+   timeStepMode aa;
       // twoField:           normal src-dst update with two fields [default]
      // twoFieldKernelOnly: same as above but without communication and periodicity
      // aa:                 AA single-field update pattern
@@ -22,11 +22,13 @@ Parameters
      // manualD3Q19:   manual D3Q19
 
-   timesteps 200;       // time steps of one performance measurement, default 60
+
+   timesteps 2000;      // time steps of one performance measurement, default 60
    warmupSteps 1;       // number of steps to run before measurement starts
-   outerIterations 4;   // how many measurements to conduct
-   vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
+   outerIterations 1;   // how many measurements to conduct
+   vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled
    remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time
+   fPadding 3;          // padding elements inserted after each PDF component, 0 disables padding
 
    useGui 0;
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
index e1b013e0adf606b2763ba54d69102c438d92f185..5ca6398df9172f0de561e8250fc97d5f3c171b86 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -43,31 +43,91 @@
 using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
 using VelocityField_T = GhostLayerField< real_t, 3 >;
 
-void pinOpenMP(const char * pinString)
+template <typename T>
+class OuterPaddingFieldAllocator : public field::FieldAllocator<T>
+{
+public:
+   OuterPaddingFieldAllocator( uint_t paddingElements )
+      : padding_(paddingElements)
+   {}
+
+   virtual T * allocate(const field::Layout & layout,
+                        uint_t xSize, uint_t ySize, uint_t zSize, uint_t fSize,
+                        uint_t & xAllocSize, uint_t & yAllocSize, uint_t & zAllocSize, uint_t & fAllocSize,
+                        cell_idx_t & xStride, cell_idx_t & yStride, cell_idx_t & zStride, cell_idx_t & fStride)
+   {
+      T * ptr;
+
+      if (layout == field::fzyx ) {
+         ptr = field::FieldAllocator<T>::allocateField(fSize, zSize, ySize, xSize, fAllocSize, zAllocSize, yAllocSize, xAllocSize);
+
+         WALBERLA_CHECK_LESS_EQUAL( fSize * ( xAllocSize * yAllocSize * zAllocSize + padding_ ) + xSize + ySize * xAllocSize + zSize * xAllocSize * yAllocSize,
+                                    std::numeric_limits< cell_idx_t >::max(),
+                                    "The data type 'cell_idx_t' is too small for your field size!\nYou may have to set 'cell_idx_t' to 'int64_t'."
+         );
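+         // With fzyx layout one zyx slice per PDF component is stored contiguously;
+         // adding padding_ to the f-stride shifts each component by a few extra
+         // elements, so component offsets are no longer exact multiples of the
+         // (often power-of-two) slice size.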
+
+         fStride = cell_idx_c(xAllocSize * yAllocSize * zAllocSize + padding_);
+         zStride = cell_idx_c(xAllocSize * yAllocSize);
+         yStride = cell_idx_c(xAllocSize);
+         xStride = 1;
+      } else {
+         WALBERLA_ABORT("OuterPaddingFieldAllocator works only for fzyx layout");
+      }
+
+      return ptr;
+   }
+
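+   // Raw-memory hook, expected to be called back by allocateField() above:
+   // one extra run of padding_ elements is reserved per f-component
+   // (size0 is the f-size for fzyx layout).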
+   virtual T * allocateMemory ( uint_t size0, uint_t size1, uint_t size2, uint_t size3,
+                                uint_t & allocSize0, uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 )
+   {
+      allocSize0 = size0;
+      allocSize1 = size1;
+      allocSize2 = size2;
+      allocSize3 = size3;
+      return new T[allocSize0 * allocSize1 * allocSize2 * allocSize3 + padding_ * size0];
+   }
+
+   virtual T * allocateMemory ( uint_t size )
+   {
+      return new T[size];
+   }
+
+   virtual void deallocate(T *& values) {
+      delete[] values;
+      values = nullptr;
+   }
+private:
+   uint_t padding_;
+};
+
+
+void pinOpenMP( const char *pinString )
 {
 #ifdef WALBERLA_BUILD_WITH_OPENMP
-    if (pinString != NULL) {
-        #pragma omp parallel
+   if ( pinString != NULL )
+   {
+#pragma omp parallel
    {
       int threadId = omp_get_thread_num();
       int err;
-        err = PinCurrentThreadByCpuList(pinString, threadId);
+      err = PinCurrentThreadByCpuList( pinString, threadId );
 
-        if (err) {
-            WALBERLA_ABORT("Pinning of " << threadId << "failed");
+      if ( err )
+      {
+         WALBERLA_ABORT( "Pinning of " << threadId << " failed" );
      }
 
-        const char * cpuList = PinCpuListAsString();
-        WALBERLA_ASSERT(cpuList != NULL);
+      const char *cpuList = PinCpuListAsString();
+      WALBERLA_ASSERT( cpuList != NULL );
 
      // Not so nice hack to print the thread ids ordered.
-        #pragma omp for ordered
-        for (int i = 0; i < omp_get_num_threads(); ++i) {
-            #pragma omp ordered
-            WALBERLA_LOG_INFO("Thread " << threadId << " pinned to core(s) " << cpuList);
+#pragma omp for ordered
+      for ( int i = 0; i < omp_get_num_threads(); ++i )
+      {
+#pragma omp ordered
+         WALBERLA_LOG_INFO( "Thread " << threadId << " pinned to core(s) " << cpuList );
      }
 
-        free((void *)cpuList);
+      free((void *) cpuList );
    }
 }
 #endif
@@ -75,194 +135,219 @@ void pinOpenMP(const char * pinString)
 
 int main( int argc, char **argv )
 {
-   mpi::Environment env( argc, argv );
-
-   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
-   {
-      WALBERLA_MPI_WORLD_BARRIER();
-
-      auto config = *cfg;
-      logging::configureLogging( config );
-      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
-
-      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
-      // Reading parameters
-      auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
-      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-      uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
-      const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.02);
-      const bool directComm = parameters.getParameter<bool>("directComm", false);
-
-      const std::string pinning = parameters.getParameter<std::string>("pinning", "");
-      if( !pinning.empty() )
-         pinOpenMP(pinning.c_str());
-
-      auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
-         return new PdfField_T(storage->getNumberOfXCells(*block),
-                               storage->getNumberOfYCells(*block),
-                               storage->getNumberOfZCells(*block),
-                               uint_t(1),
-                               field::fzyx,
-                               make_shared<field::AllocateAligned<real_t, 64>>());
-      };
-
-      // Creating fields
-      BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
-
-      pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
-      pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
-
-      if( shearVelocityMagnitude > 0 )
-         initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
-      for( auto & b : *blocks)
-         setterKernel(&b);
-
-      // Buffered Comm
-      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
-      twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
-
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
-      aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
-
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
-      aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
-
-      // Direct Comm
-      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
-      twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
-
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
-      aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
-
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
-      aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
-
-
-      const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated");
-      std::function<void(IBlock*)> twoFieldKernel;
-      if( twoFieldKernelType == "generated") {
-         twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualGeneric") {
-         using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-         twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualD3Q19") {
-         using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-         twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else {
-         WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", "
-                                      "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"");
-      }
-
-      using F = std::function<void()>;
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
-      if( timeStepMode == "twoField")
-      {
-         timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                        << Sweep( twoFieldKernel, "LB stream & collide1" );
-         timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                        << Sweep( twoFieldKernel, "LB stream & collide2" );
-
-      } else if ( timeStepMode == "twoFieldKernelOnly") {
-         timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
-         timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
-      } else if ( timeStepMode == "aa") {
-         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-         timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
-                        << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
-                        << AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
-      } else if ( timeStepMode == "aaKernelOnly") {
-         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-         timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
-      } else {
-         WALBERLA_ABORT("Invalid value for timeStepMode");
-      }
-
-
-      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
-      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
-      for(int i=0; i < warmupSteps; ++i )
-         timeLoop.singleStep();
-
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-      if (remainingTimeLoggerFrequency > 0) {
-         auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
-         timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-      }
-
-      // VTK
-      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
-      if( vtkWriteFrequency > 0 )
-      {
-         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                          "simulation_step", false, true, true, false, 0 );
-         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
-         vtkOutput->addCellDataWriter( velWriter );
-         vtkOutput->addBeforeFunction( [&]()
-                                       { for( auto & b : *blocks)
-                                            getterKernel(&b);
-                                       } );
-         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
-      }
-
-
-      bool useGui = parameters.getParameter<bool>( "useGui", false );
-      if( useGui )
-      {
-         GUI gui( timeLoop, blocks, argc, argv);
-         gui.run();
-      }
-      else
-      {
-         for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-         {
-            timeLoop.setCurrentTimeStepToZero();
-            WcTimer simTimer;
-
-            auto threads = omp_get_max_threads();
-
-            simTimer.start();
-            timeLoop.run();
-            simTimer.end();
-            auto time = simTimer.last();
-            auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-            auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-
-            using std::setw;
-            WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
-                                      " procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
-                                      " threads: " << threads <<
-                                      " direct_comm: " << directComm <<
-                                      " time steps: " << timesteps <<
-                                      setw(15) << " block size: " << cellsPerBlock <<
-                                      " mlups/core: " << int(mlupsPerProcess/ threads) <<
-                                      " mlups: " << int(mlupsPerProcess) * MPIManager::instance()->numProcesses());
-
-            WALBERLA_ROOT_SECTION()
-            {
-               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-               if ( pythonCallbackResults.isCallable())
-               {
-                  pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                  pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                  pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                  pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
-                  pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
-                  pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
-                  pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
-                  pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
-                  pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
-
-                  // Call Python function to report results
-                  pythonCallbackResults();
-               }
-            }
-         }
-      }
-   }
-
-   return 0;
+   mpi::Environment env( argc, argv );
+
+   for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      WALBERLA_MPI_WORLD_BARRIER();
+
+      auto config = *cfg;
+      logging::configureLogging( config );
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+      Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
+      // Reading parameters
+      auto parameters = config->getOneBlock( "Parameters" );
+      const std::string timeStepMode = parameters.getParameter< std::string >( "timeStepMode", "twoField" );
+      const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
+      uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 60 ));
+      const real_t shearVelocityMagnitude = parameters.getParameter< real_t >( "shearVelocityMagnitude", real_c( 0.02 ));
+      const bool directComm = parameters.getParameter< bool >( "directComm", false );
+      const uint_t fPadding = parameters.getParameter< uint_t >( "fPadding", uint_c( 0 ));
+
+      const std::string pinning = parameters.getParameter< std::string >( "pinning", "" );
+      if ( !pinning.empty())
+         pinOpenMP( pinning.c_str());
+
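+      // For fPadding > 0 the PDF field uses the padding allocator defined above,
+      // which allocates with plain new[] and therefore gives no 64-byte alignment
+      // guarantee, unlike the field::AllocateAligned allocator used otherwise.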
+      auto pdfFieldAdder = [fPadding]( IBlock *const block, StructuredBlockStorage *const storage )
+      {
+         shared_ptr< field::FieldAllocator<real_t> > allocator;
+         if ( fPadding > 0 )
+            allocator = make_shared< OuterPaddingFieldAllocator<real_t> >( fPadding );
+         else
+            allocator = make_shared< field::AllocateAligned< real_t, 64 > >();
+
+         return new PdfField_T( storage->getNumberOfXCells( *block ),
+                                storage->getNumberOfYCells( *block ),
+                                storage->getNumberOfZCells( *block ),
+                                uint_t( 1 ),
+                                field::fzyx,
+                                allocator );
+      };
+
+      // Creating fields
+      BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
+
+      pystencils::GenMacroSetter setterKernel( pdfFieldId, velFieldId );
+      pystencils::GenMacroGetter getterKernel( pdfFieldId, velFieldId );
+
+      if ( shearVelocityMagnitude > 0 )
+         initShearVelocity( blocks, velFieldId, shearVelocityMagnitude );
+      for ( auto &b : *blocks )
+         setterKernel( &b );
+
+      // Buffered Comm
+      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm( blocks );
+      twoFieldComm.addPackInfo( make_shared< pystencils::GenPackInfo >( pdfFieldId ));
+
+      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm( blocks );
+      aaPullComm.addPackInfo( make_shared< pystencils::GenPackInfoAAPull >( pdfFieldId ));
+
+      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm( blocks );
+      aaPushComm.addPackInfo( make_shared< pystencils::GenPackInfoAAPush >( pdfFieldId ));
+
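+      // The buffered schemes above pack ghost-layer data into send buffers; the
+      // direct schemes below describe the same exchange via MPI derived datatypes.
+      // Which variant runs is chosen at runtime through the directComm parameter.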
+      // Direct Comm
+      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect( blocks );
+      twoFieldCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfo >( pdfFieldId ));
+
+      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect( blocks );
+      aaPullCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfoAAPull >( pdfFieldId ));
+
+      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect( blocks );
+      aaPushCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfoAAPush >( pdfFieldId ));
+
+
+      const std::string twoFieldKernelType = parameters.getParameter< std::string >( "twoFieldKernelType", "generated" );
+      std::function< void( IBlock * ) > twoFieldKernel;
+      if ( twoFieldKernelType == "generated" )
+      {
+         twoFieldKernel = pystencils::GenLbKernel( pdfFieldId, omega );
+      }
+      else if ( twoFieldKernelType == "manualGeneric" )
+      {
+         using MyLM = lbm::D3Q19< lbm::collision_model::SRT >;
+         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+         twoFieldKernel = StreamPullCollideGeneric< MyLM >( pdfFieldId, tmpPdfFieldId, omega );
+      }
+      else if ( twoFieldKernelType == "manualD3Q19" )
+      {
+         using MyLM = lbm::D3Q19< lbm::collision_model::SRT >;
+         BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+         twoFieldKernel = StreamPullCollideD3Q19< MyLM >( pdfFieldId, tmpPdfFieldId, omega );
+      }
+      else
+      {
+         WALBERLA_ABORT_NO_DEBUG_INFO( "Invalid option for \"twoFieldKernelType\", "
+                                       "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"" );
+      }
+
+      using F = std::function< void() >;
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
+      if ( timeStepMode == "twoField" )
+      {
+         timeLoop.add() << BeforeFunction( directComm ? F( twoFieldCommDirect ) : F( twoFieldComm ), "communication" )
+                        << Sweep( twoFieldKernel, "LB stream & collide1" );
+         timeLoop.add() << BeforeFunction( directComm ? F( twoFieldCommDirect ) : F( twoFieldComm ), "communication" )
+                        << Sweep( twoFieldKernel, "LB stream & collide2" );
+
+      }
+      else if ( timeStepMode == "twoFieldKernelOnly" )
+      {
+         timeLoop.add() << Sweep( pystencils::GenLbKernel( pdfFieldId, omega ), "LB stream & collide1" );
+         timeLoop.add() << Sweep( pystencils::GenLbKernel( pdfFieldId, omega ), "LB stream & collide2" );
+      }
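+      // AA pattern: the PDFs are updated in place, alternating between an even
+      // and an odd kernel; only the odd step communicates (pull before, push
+      // after). Each time loop iteration therefore advances two time steps,
+      // matching the timesteps / 2 loop length above.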
+      else if ( timeStepMode == "aa" )
+      {
+         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven( pdfFieldId, omega ), "AA Even" );
+         timeLoop.add() << BeforeFunction( directComm ? F( aaPullCommDirect ) : F( aaPullComm ))
+                        << Sweep( pystencils::GenLbKernelAAOdd( pdfFieldId, omega ), "AA Odd" )
+                        << AfterFunction( directComm ? F( aaPushCommDirect ) : F( aaPushComm ));
+      }
+      else if ( timeStepMode == "aaKernelOnly" )
+      {
+         timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven( pdfFieldId, omega ), "AA Even" );
+         timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd( pdfFieldId, omega ), "AA Odd" );
+      }
+      else
+      {
+         WALBERLA_ABORT( "Invalid value for timeStepMode" );
+      }
+
+
+      int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
+      int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
+      for ( int i = 0; i < warmupSteps; ++i )
+         timeLoop.singleStep();
+
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+      if ( remainingTimeLoggerFrequency > 0 )
+      {
+         auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
+         timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+      }
+
+      // VTK
+      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
+      if ( vtkWriteFrequency > 0 )
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                          "simulation_step", false, true, true, false, 0 );
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
+         vtkOutput->addCellDataWriter( velWriter );
+         vtkOutput->addBeforeFunction( [&]()
+                                       {
+                                          for ( auto &b : *blocks )
+                                             getterKernel( &b );
+                                       } );
+         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+      }
+
+
+      bool useGui = parameters.getParameter< bool >( "useGui", false );
+      if ( useGui )
+      {
+         GUI gui( timeLoop, blocks, argc, argv );
+         gui.run();
+      }
+      else
+      {
+         for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+         {
+            timeLoop.setCurrentTimeStepToZero();
+            WcTimer simTimer;
+
+            auto threads = omp_get_max_threads();
+
+            simTimer.start();
+            timeLoop.run();
+            simTimer.end();
+            auto time = simTimer.last();
+            auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+            auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+
+            using std::setw;
+            WALBERLA_LOG_INFO_ON_ROOT( setw( 18 ) << timeStepMode <<
+                                       " procs: " << setw( 6 ) << MPIManager::instance()->numProcesses() <<
+                                       " threads: " << threads <<
+                                       " direct_comm: " << directComm <<
+                                       " time steps: " << timesteps <<
+                                       setw( 15 ) << " block size: " << cellsPerBlock <<
+                                       " mlups/core: " << int( mlupsPerProcess / threads ) <<
+                                       " mlups: " << int( mlupsPerProcess ) * MPIManager::instance()->numProcesses());
+
+            WALBERLA_ROOT_SECTION()
+            {
+               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+               if ( pythonCallbackResults.isCallable())
+               {
+                  pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                  pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
+                  pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
+                  pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
+                  pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
+                  pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
+                  pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1());
+                  pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags());
+                  pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine());
+
+                  // Call Python function to report results
+                  pythonCallbackResults();
+               }
+            }
+         }
+      }
+   }
+
+   return 0;
 }
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
index 21ee9f0c80c3494be684aa69391b55c912e0b88f..0c8b7ff3c39003fa7efb65ba7e9376c72d7738b5 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -150,16 +150,19 @@ with CodeGeneration() as ctx:
                                                 'cse_global': opts['aa_odd_cse_global'],
                                                 'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options)
 
-    vec = { 'assume_aligned': True, 'assume_inner_stride_one': True}
+    vec = {'assume_aligned': True, 'assume_inner_stride_one': True}
 
     # Sweeps
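+    # Alignment is only assumed when nontemporal stores are emitted, since those
+    # require aligned addresses; runs with fPadding > 0 allocate the field without
+    # the usual 64-byte alignment, so plain stores must not assume it.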
     vec['nontemporal'] = opts['two_field_nt_stores']
+    vec['assume_aligned'] = opts['two_field_nt_stores']
     generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')], cpu_vectorize_info=vec)
 
     vec['nontemporal'] = opts['aa_even_nt_stores']
+    vec['assume_aligned'] = opts['aa_even_nt_stores']
     generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
 
     vec['nontemporal'] = opts['aa_odd_nt_stores']
+    vec['assume_aligned'] = opts['aa_odd_nt_stores']
     generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
diff --git a/apps/benchmarks/UniformGridGenerated/params.py b/apps/benchmarks/UniformGridGenerated/params.py
index 724238f4bb456be75906dafeff6b0683e4dbe3a4..f701bed0b2368491119c76e5144b64855d7e5198 100644
--- a/apps/benchmarks/UniformGridGenerated/params.py
+++ b/apps/benchmarks/UniformGridGenerated/params.py
@@ -51,7 +51,7 @@ def domain_decomposition_func_full(processes, threads, block_size):
 class BenchmarkScenario:
     def __init__(self, block_size=(256, 128, 128), direct_comm=True,
                  time_step_mode='aa', two_field_kernel_type='generated',
-                 domain_decomposition_func=domain_decomposition_func_z,
+                 domain_decomposition_func=domain_decomposition_func_z, pinning="", f_padding=0,
                  db_file_name='uniform_grid_gen.sqlite'):
         self.block_size = block_size
         self.direct_comm = direct_comm
@@ -61,6 +61,8 @@ class BenchmarkScenario:
         self.threads = int(os.environ['OMP_NUM_THREADS'])
         self.processes = wlb.mpi.numProcesses()
         self.db_file_name = db_file_name
+        self.pinning = pinning
+        self.f_padding = f_padding
 
     @wlb.member_callback
     def config(self, **kwargs):
@@ -81,6 +83,8 @@ class BenchmarkScenario:
                 'timeStepMode': self.time_step_mode,
                 'twoFieldKernelType': self.two_field_kernel_type,
                 'directComm': self.direct_comm,
+                'pinning': self.pinning,
+                'fPadding': self.f_padding,
             }
         }
         cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size))
@@ -168,4 +172,20 @@ def weak_scaling():
                 continue
             scenarios.add(sc)
 
-single_node_benchmark()
+
+def padding_test():
+    # Sweep fPadding from 0 to 15 for two block sizes to measure how outer
+    # padding affects the in-place (aa) kernels, with and without communication.
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(300, 100, 100), (500, 100, 100)]:
+        for direct_comm in (False,):
+            for time_step_mode in ['aa', 'aaKernelOnly']:
+                for f_padding in range(16):
+                    sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
+                                           time_step_mode=time_step_mode,
+                                           domain_decomposition_func=domain_decomposition_func_z,
+                                           f_padding=f_padding, pinning="0")
+                    if not block_size_ok(sc):
+                        continue
+                    scenarios.add(sc)
+
+
+#single_node_benchmark()
+padding_test()