
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing changes with 3139 additions and 872 deletions
@@ -204,25 +204,25 @@ template< typename LatticeModel_T, class Enable = void >
 struct StencilString;

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D2Q9 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D2Q9 > > >
 {
    static const char * str() { return "D2Q9"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q15 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q15 > > >
 {
    static const char * str() { return "D3Q15"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q19 > > >
 {
    static const char * str() { return "D3Q19"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 > > >
 {
    static const char * str() { return "D3Q27"; }
 };
@@ -232,22 +232,22 @@ template< typename LatticeModel_T, class Enable = void >
 struct CollisionModelString;

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                     lbm::collision_model::SRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                lbm::collision_model::SRT_tag > > >
 {
    static const char * str() { return "SRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                     lbm::collision_model::TRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                lbm::collision_model::TRT_tag > > >
 {
    static const char * str() { return "TRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                     lbm::collision_model::MRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                lbm::collision_model::MRT_tag > > >
 {
    static const char * str() { return "MRT"; }
 };
@@ -355,7 +355,7 @@ bool Cylinder::contains( const AABB & aabb ) const
 {
    if( setup_.circularCrossSection )
    {
-      Vector3< real_t > p[8];
+      std::array< Vector3< real_t >, 8 > p;
       p[0].set( aabb.xMin(), aabb.yMin(), aabb.zMin() );
       p[1].set( aabb.xMax(), aabb.yMin(), aabb.zMin() );
       p[2].set( aabb.xMin(), aabb.yMax(), aabb.zMin() );
@@ -521,12 +521,12 @@ public:
    void operator()( SetupBlockForest & forest )
    {
-      for( auto block = forest.begin(); block != forest.end(); ++block )
+      for(auto & block : forest)
       {
-         const AABB & aabb = block->getAABB();
-         if( block->getLevel() < level_ && cylinder_.intersects( aabb, bufferDistance_ ) && !cylinder_.contains( aabb ) )
-            block->setMarker( true );
+         const AABB & aabb = block.getAABB();
+         if( block.getLevel() < level_ && cylinder_.intersects( aabb, bufferDistance_ ) && !cylinder_.contains( aabb ) )
+            block.setMarker( true );
       }
    }
@@ -633,10 +633,10 @@ static void workloadMemoryAndSUIDAssignment( SetupBlockForest & forest, const me
    }
    else
    {
-      for( auto block = forest.begin(); block != forest.end(); ++block )
+      for(auto & block : forest)
       {
-         block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) );
-         block->setMemory( memoryPerBlock );
+         block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+         block.setMemory( memoryPerBlock );
       }
    }
 }
@@ -656,7 +656,8 @@ static shared_ptr< SetupBlockForest > createSetupBlockForest( const blockforest:
                                  ( setup.xCells + uint_t(2) * FieldGhostLayers ) ) * memoryPerCell;

    forest->addRefinementSelectionFunction( refinementSelectionFunctions );
-   forest->addWorkloadMemorySUIDAssignmentFunction( std::bind( workloadMemoryAndSUIDAssignment, std::placeholders::_1, memoryPerBlock, std::cref( setup ) ) );
+   forest->addWorkloadMemorySUIDAssignmentFunction([memoryPerBlock, setup_cref = std::cref(setup)](SetupBlockForest & sbf) {
+      return workloadMemoryAndSUIDAssignment(sbf, memoryPerBlock, setup_cref); } );

    forest->init( AABB( real_c(0), real_c(0), real_c(0),
                        setup.H * ( real_c(setup.xBlocks) * real_c(setup.xCells) ) / ( real_c(setup.yzBlocks) * real_c(setup.yzCells) ), setup.H, setup.H ),
@@ -1034,11 +1035,11 @@ Set<SUID> Pseudo2DBlockStateDetermination::operator()( const std::vector< std::p
    auto aabb = forest_.getAABBFromBlockId( destintation );

    Set<SUID> state;
-   for( auto it = source.begin(); it != source.end(); ++it )
+   for(const auto & it : source)
    {
-      for( auto suid = it->second.begin(); suid != it->second.end(); ++suid )
-         if( *suid != state_ )
-            state += *suid;
+      for(auto suid : it.second)
+         if( suid != state_ )
+            state += suid;
    }

    if( markEmptyBlocks_ )
@@ -1064,16 +1065,16 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
    uint_t maxInflowLevel( uint_t(0) );
    uint_t maxOutflowLevel( uint_t(0) );

-   // In addtion to keeping in- and outflow blocks at the same level, this callback also
+   // In addition to keeping in- and outflow blocks at the same level, this callback also
    // prevents these blocks from coarsening.
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      const Block * const block = it->first;
+      const Block * const block = minTargetLevel.first;
       if( forest.atDomainXMinBorder(*block) )
       {
-         it->second = std::max( it->second, block->getLevel() );
-         maxInflowLevel = std::max( maxInflowLevel, it->second );
+         minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+         maxInflowLevel = std::max( maxInflowLevel, minTargetLevel.second );
       }
       if( setup.strictlyObeyL )
       {
@@ -1083,19 +1084,18 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
          p[0] = setup.L;
          if( aabb.contains(p) )
          {
-            it->second = std::max( it->second, block->getLevel() );
-            maxOutflowLevel = std::max( maxOutflowLevel, it->second );
+            minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+            maxOutflowLevel = std::max( maxOutflowLevel, minTargetLevel.second );
          }
       }
       else if( forest.atDomainXMaxBorder(*block) )
       {
-         it->second = std::max( it->second, block->getLevel() );
-         maxOutflowLevel = std::max( maxOutflowLevel, it->second );
+         minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+         maxOutflowLevel = std::max( maxOutflowLevel, minTargetLevel.second );
       }
    }

-   for( auto it = blocksAlreadyMarkedForRefinement.begin(); it != blocksAlreadyMarkedForRefinement.end(); ++it )
+   for(auto block : blocksAlreadyMarkedForRefinement)
    {
-      const Block * const block = *it;
       if( forest.atDomainXMinBorder(*block) )
       {
          maxInflowLevel = std::max( maxInflowLevel, block->getTargetLevel() );
@@ -1116,12 +1116,12 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
    mpi::allReduceInplace( maxInflowLevel, mpi::MAX );
    mpi::allReduceInplace( maxOutflowLevel, mpi::MAX );

-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      const Block * const block = it->first;
+      const Block * const block = minTargetLevel.first;
       if( forest.atDomainXMinBorder(*block) )
       {
-         it->second = maxInflowLevel;
+         minTargetLevel.second = maxInflowLevel;
       }
       if( setup.strictlyObeyL )
       {
@@ -1130,10 +1130,10 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
          auto p = aabb.center();
          p[0] = setup.L;
          if( aabb.contains(p) )
-            it->second = maxOutflowLevel;
+            minTargetLevel.second = maxOutflowLevel;
       }
       else if( forest.atDomainXMaxBorder(*block) )
-         it->second = maxOutflowLevel;
+         minTargetLevel.second = maxOutflowLevel;
    }
 }
@@ -1143,10 +1143,10 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
 void pseudo2DTargetLevelCorrection( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
                                     std::vector< const Block * > &, const blockforest::BlockForest & forest )
 {
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      if( ! forest.atDomainZMinBorder( *(it->first ) ) )
-         it->second = uint_t(0);
+      if( ! forest.atDomainZMinBorder( *(minTargetLevel.first ) ) )
+         minTargetLevel.second = uint_t(0);
    }
 }
@@ -1177,12 +1177,12 @@ public:
    void operator()( std::vector< std::pair< const PhantomBlock *, walberla::any > > & blockData, const PhantomBlockForest & )
    {
-      for( auto it = blockData.begin(); it != blockData.end(); ++it )
+      for(auto & it : blockData)
       {
-         if( it->first->getState().contains( state_ ) )
-            it->second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(0) : uint8_t(1) );
+         if( it.first->getState().contains( state_ ) )
+            it.second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(0) : uint8_t(1) );
          else
-            it->second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(1) : uint8_t(0) );
+            it.second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(1) : uint8_t(0) );
       }
    }
@@ -1297,10 +1297,9 @@ public:
               const bool logToStream = true, const bool logToFile = true, const std::string& filename = std::string("SchaeferTurek.txt"),
               const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(),
               const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) :
-      initialized_( false ), blocks_( blocks ),
-      executionCounter_( uint_t(0) ), checkFrequency_( checkFrequency ), pdfFieldId_( pdfFieldId ), flagFieldId_( flagFieldId ),
-      fluid_( fluid ), obstacle_( obstacle ), setup_( setup ), D_( uint_t(0) ), AD_( real_t(0) ), AL_( real_t(0) ), forceEvaluationExecutionCount_( uint_t(0) ),
-      strouhalRising_( false ), strouhalNumberRealD_( real_t(0) ), strouhalNumberDiscreteD_( real_t(0) ), strouhalEvaluationExecutionCount_( uint_t(0) ),
+      blocks_( blocks ),
+      checkFrequency_( checkFrequency ), pdfFieldId_( pdfFieldId ), flagFieldId_( flagFieldId ),
+      fluid_( fluid ), obstacle_( obstacle ), setup_( setup ),
       logToStream_( logToStream ), logToFile_( logToFile ), filename_( filename ),
       requiredSelectors_( requiredSelectors ), incompatibleSelectors_( incompatibleSelectors )
    {
@@ -1345,13 +1344,13 @@ protected:
-   bool initialized_;
+   bool initialized_{ false };

    weak_ptr< StructuredBlockStorage > blocks_;

    std::map< IBlock *, std::vector< std::pair< Cell, stencil::Direction > > > directions_;

-   uint_t executionCounter_;
-   uint_t checkFrequency_;
+   uint_t executionCounter_{ uint_c(0) };
+   uint_t checkFrequency_{ uint_c(0) };

    BlockDataID pdfFieldId_;
    BlockDataID flagFieldId_;
@@ -1361,23 +1360,23 @@ protected:
    Setup setup_;

-   uint_t D_;
-   real_t AD_;
-   real_t AL_;
+   uint_t D_{ uint_c(0) };
+   real_t AD_{ real_c(0) };
+   real_t AL_{ real_c(0) };

    Vector3< real_t > force_;
    std::vector< math::Sample > forceSample_;
-   uint_t forceEvaluationExecutionCount_;
+   uint_t forceEvaluationExecutionCount_{ uint_c(0) };

    std::vector< std::deque< real_t > > coefficients_;
    std::vector< std::pair< real_t, real_t > > coefficientExtremas_;

    std::vector< real_t > strouhalVelocities_;
    std::vector< uint_t > strouhalTimeStep_;
-   bool strouhalRising_;
-   real_t strouhalNumberRealD_;
-   real_t strouhalNumberDiscreteD_;
-   uint_t strouhalEvaluationExecutionCount_;
+   bool strouhalRising_{ false };
+   real_t strouhalNumberRealD_{ real_c(0) };
+   real_t strouhalNumberDiscreteD_{ real_c(0) };
+   uint_t strouhalEvaluationExecutionCount_{ uint_c(0) };

    bool logToStream_;
    bool logToFile_;
@@ -1472,7 +1471,7 @@ void Evaluation< LatticeModel_T >::operator()()
 {
    WALBERLA_LOG_RESULT_ON_ROOT( "force acting on cylinder (in dimensionless lattice units of the coarsest grid - evaluated in time step "
                                 << forceEvaluationExecutionCount_ << "):\n " << force_ << oss.str() <<
-                                "\ndrag and lift coefficients (including extremas of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
+                                "\ndrag and lift coefficients (including extrema of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
                                 "\n \"real\" area:"
                                 "\n c_D: " << cDRealArea << " (min = " << coefficientExtremas_[0].first << ", max = " << coefficientExtremas_[0].second << ")" <<
                                 "\n c_L: " << cLRealArea << " (min = " << coefficientExtremas_[1].first << ", max = " << coefficientExtremas_[1].second << ")" <<
@@ -1604,10 +1603,10 @@ void Evaluation< LatticeModel_T >::operator()( IBlock * block, const uint_t leve
    const PdfField_T * const pdfField = block->template getData< PdfField_T >( pdfFieldId_ );

    const auto & directions = directions_[ block ];
-   for( auto pair = directions.begin(); pair != directions.end(); ++pair )
+   for(const auto & pair : directions)
    {
-      const Cell cell( pair->first );
-      const stencil::Direction direction( pair->second );
+      const Cell cell( pair.first );
+      const stencil::Direction direction( pair.second );

       const real_t scaleFactor = real_t(1) / real_c( uint_t(1) << ( (Is2D< LatticeModel_T >::value ? uint_t(1) : uint_t(2)) * level ) );
@@ -1712,8 +1711,8 @@ void Evaluation< LatticeModel_T >::getResultsForSQLOnRoot( std::map< std::string
 {
    WALBERLA_ROOT_SECTION()
    {
-      for( auto result = sqlResults_.begin(); result != sqlResults_.end(); ++result )
-         realProperties[ result->first ] = result->second;
+      for(const auto & sqlResult : sqlResults_)
+         realProperties[ sqlResult.first ] = sqlResult.second;

       integerProperties[ "forceEvaluationTimeStep" ] = int_c( forceEvaluationExecutionCount_ );
       integerProperties[ "strouhalEvaluationTimeStep" ] = int_c( strouhalEvaluationExecutionCount_ );
@@ -2050,9 +2049,7 @@ void Evaluation< LatticeModel_T >::evaluate( real_t & cDRealArea, real_t & cLRea
    // force on obstacle

-   mpi::reduceInplace( force_[0], mpi::SUM );
-   mpi::reduceInplace( force_[1], mpi::SUM );
-   mpi::reduceInplace( force_[2], mpi::SUM );
+   mpi::reduceInplace( force_, mpi::SUM );

    if( setup_.evaluateForceComponents )
    {
@@ -2157,7 +2154,7 @@ public:
    SnapshotSimulator( const weak_ptr<blockforest::StructuredBlockForest> & blocks,
                       const uint_t failAt, const uint_t failRangeBegin, const uint_t failRangeEnd, const bool failRebalance ) :
-      blocks_( blocks ), executionCounter_( 0 ),
+      blocks_( blocks ),
       failAt_( failAt ), failRangeBegin_( failRangeBegin ), failRangeEnd_( failRangeEnd ), failRebalance_( failRebalance )
    {}
@@ -2218,7 +2215,7 @@ private:
    weak_ptr<blockforest::StructuredBlockForest> blocks_;

-   uint_t executionCounter_;
+   uint_t executionCounter_{ uint_c(0) };

    uint_t failAt_;
    uint_t failRangeBegin_;
@@ -2304,11 +2301,11 @@ struct AddRefinementTimeStep
 };

 template< typename LatticeModel_T >
-struct AddRefinementTimeStep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, lbm::collision_model::MRT_tag >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D2Q9 >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q15 >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value
-                                                                     >::type >
+struct AddRefinementTimeStep< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag, lbm::collision_model::MRT_tag > ||
+                                                                std::is_same_v< typename LatticeModel_T::Stencil, stencil::D2Q9 > ||
+                                                                std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q15 > ||
+                                                                std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 >
+                                                              > >
 {
    static void add( SweepTimeloop & timeloop, shared_ptr< blockforest::StructuredBlockForest > & blocks,
                     const BlockDataID & pdfFieldId, const BlockDataID & flagFieldId, const BlockDataID & boundaryHandlingId,
@@ -2365,7 +2362,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    // add velocity field + initialize velocity field writer (only used for simulations with an adaptive block structure)
    using VelocityField_T = field::GhostLayerField<Vector3<real_t>, 1>;
-   BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::zyxf, FieldGhostLayers, true, None, Empty );
+   BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::fzyx, FieldGhostLayers, true, None, Empty );
    using VelocityFieldWriter_T = lbm::VelocityFieldWriter<typename Types<LatticeModel_T>::PdfField_T, VelocityField_T>;
    BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldId, velocityFieldId ), None, Empty );
@@ -2431,9 +2428,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    if( vtkBeforeTimeStep )
    {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncBeforeTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                         output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncBeforeTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                         output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
    }

    // evaluation
@@ -2492,8 +2489,8 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
       adaptiveRefinementLog = oss.str();
    }

-   minTargetLevelDeterminationFunctions.add( std::bind( keepInflowOutflowAtTheSameLevel, std::placeholders::_1, std::placeholders::_2,
-                                                        std::placeholders::_3, std::cref(setup) ) );
+   minTargetLevelDeterminationFunctions.add([setup_cref = std::cref(setup)](auto & minTargetLevels, auto & blocksAlreadyMarkedForRefinement, auto & bf) {
+      return keepInflowOutflowAtTheSameLevel(minTargetLevels, blocksAlreadyMarkedForRefinement, bf, setup_cref); } );

    if( Is2D< LatticeModel_T >::value )
       minTargetLevelDeterminationFunctions.add( pseudo2DTargetLevelCorrection );
@@ -2569,14 +2566,14 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
                                                  blockforest::DynamicDiffusionBalance< blockforest::NoPhantomData >( maxIterations, flowIterations ) );
    }

-   // add callback functions which are executed after all block data was unpakced after the dynamic load balancing
+   // add callback functions which are executed after all block data was unpacked after the dynamic load balancing

    // for blocks that have *not* migrated: store current flag field state (required for lbm::PostProcessing)
    blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::MarkerFieldGenerator< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
                                                                       pdfFieldId, markerDataId, flagFieldFilter ) );

    // (re)set boundaries = (re)initialize flag field for every block with respect to the new block structure (the size of neighbor blocks might have changed)
    blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( blockforest::BlockForest::RefreshCallbackWrappper( boundarySetter ) );

-   // treat boundary-fluid cell convertions
+   // treat boundary-fluid cell conversions
    blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::PostProcessing< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
                                                                       pdfFieldId, markerDataId, flagFieldFilter ) );

    // (re)set velocity field (velocity field data is not migrated!)
@@ -2612,9 +2609,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    if( !vtkBeforeTimeStep )
    {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncAfterTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                        output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncAfterTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                        output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
    }

    // stability check (non-finite values in the PDF field?)
@@ -2625,7 +2622,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    // remaining time logger
-   const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 );
+   const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) );
    timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" );

    // logging right before the simulation starts
@@ -2920,10 +2917,10 @@ int main( int argc, char **argv )
                             "// //\n"
                             "// Schaefer Turek Benchmark //\n"
                             "// //\n"
-                            "// Reference: Schaefer, M. and Turek, S. (1996) 'Benchmark computations of laminar flow around a cylinder (with support //\n"
+                            "// Reference: Schaefer, M. and Turek, S. (1996) Benchmark computations of laminar flow around a cylinder (with support //\n"
                             "// by F. Durst, E. Krause and R. Rannacher), in E. Hirschel (Ed.): Flow Simulation with High-Performance //\n"
-                            "// Computers II. DFG Priority Research Program Results 1993-1995, No. 52 in Notes Numer, Fluid Mech., //\n"
-                            "// pp.547-566, Vieweg, Weisbaden. //\n"
+                            "// Computers II. DFG Priority Research Program Results 1993-1995, No. 48 in Notes on Numerical Fluid //\n"
+                            "// Mechanics, pp.547-566, Vieweg, Weisbaden. //\n"
                             "// //\n"
                             "//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////" );
@@ -3062,7 +3059,7 @@ int main( int argc, char **argv )
       refinementSelectionFunctions.add( cylinderRefinementSelection );
    }

-   refinementSelectionFunctions.add( std::bind( setInflowOutflowToSameLevel, std::placeholders::_1, setup ) );
+   refinementSelectionFunctions.add([&setup](auto & bf) { return setInflowOutflowToSameLevel(bf, setup); });

    if( setup.pseudo2D )
       refinementSelectionFunctions.add( Pseudo2DRefinementSelectionCorrection );
waLBerla_link_files_to_builddir( *.prm )
if( WALBERLA_BUILD_WITH_CODEGEN )
# Turbulent Channel generation
walberla_generate_target_from_python( NAME TurbulentChannel_CodeGeneration
FILE TurbulentChannel.py
OUT_FILES CodegenIncludes.h
TurbulentChannel_Sweep.cpp TurbulentChannel_Sweep.h
TurbulentChannel_PackInfo.cpp TurbulentChannel_PackInfo.h
TurbulentChannel_Setter.cpp TurbulentChannel_Setter.h
TurbulentChannel_NoSlip.cpp TurbulentChannel_NoSlip.h
TurbulentChannel_FreeSlip_top.cpp TurbulentChannel_FreeSlip_top.h
TurbulentChannel_WFB_bottom.cpp TurbulentChannel_WFB_bottom.h
TurbulentChannel_WFB_top.cpp TurbulentChannel_WFB_top.h
TurbulentChannel_Welford.cpp TurbulentChannel_Welford.h
TurbulentChannel_Welford_TKE_SGS.cpp TurbulentChannel_Welford_TKE_SGS.h
TurbulentChannel_TKE_SGS_Writer.cpp TurbulentChannel_TKE_SGS_Writer.h
)
walberla_add_executable( NAME TurbulentChannel_Application
FILES TurbulentChannel.cpp
DEPENDS walberla::blockforest walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::timeloop
walberla::lbm walberla::stencil walberla::vtk TurbulentChannel_CodeGeneration )
endif()
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file TurbulentChannel.cpp
//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
//
//======================================================================================================================
#include <memory>
#include <cmath>
#include <string>
#include <iostream>
#include <blockforest/all.h>
#include <core/all.h>
#include <domain_decomposition/all.h>
#include <field/all.h>
#include <field/vtk/VTKWriter.h>
#include <geometry/all.h>
#include <timeloop/all.h>
#include <lbm/all.h>
// Codegen Includes
#include "CodegenIncludes.h"
namespace walberla {
///////////////////////
/// Typedef Aliases ///
///////////////////////
using Stencil_T = codegen::Stencil_T;
// Communication Pack Info
using PackInfo_T = pystencils::TurbulentChannel_PackInfo;
// PDF field type
using PdfField_T = field::GhostLayerField< real_t, Stencil_T::Size >;
// Field Types
using ScalarField_T = field::GhostLayerField< real_t, 1 >;
using VectorField_T = field::GhostLayerField< real_t, Stencil_T::D >;
using TensorField_T = field::GhostLayerField< real_t, Stencil_T::D*Stencil_T::D >;
using Setter_T = pystencils::TurbulentChannel_Setter;
using StreamCollideSweep_T = pystencils::TurbulentChannel_Sweep;
using WelfordSweep_T = pystencils::TurbulentChannel_Welford;
using TKEWelfordSweep_T = pystencils::TurbulentChannel_Welford_TKE_SGS;
using TkeSgsWriter_T = pystencils::TurbulentChannel_TKE_SGS_Writer;
// Boundary Handling
using flag_t = uint8_t;
using FlagField_T = FlagField< flag_t >;
using NoSlip_T = lbm::TurbulentChannel_NoSlip;
using FreeSlip_top_T = lbm::TurbulentChannel_FreeSlip_top;
using WFB_bottom_T = lbm::TurbulentChannel_WFB_bottom;
using WFB_top_T = lbm::TurbulentChannel_WFB_top;
/// DEAN CORRELATIONS
namespace dean_correlation {
real_t calculateFrictionReynoldsNumber(const real_t reynoldsBulk) {
return std::pow(0.073_r / 8_r, 1_r / 2_r) * std::pow(reynoldsBulk, 7_r / 8_r);
}
real_t calculateBulkReynoldsNumber(const real_t reynoldsFriction) {
return std::pow(8_r / 0.073_r, 4_r / 7_r) * std::pow(reynoldsFriction, 8_r / 7_r);
}
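// Rough orientation: a target Re_tau of 180 corresponds via these correlations to a bulk
// Reynolds number of roughly 5.5e3, i.e. the regime of the classical Re_tau ~ 180 channel DNS.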
} // namespace dean_correlation
/// VELOCITY FIELD INITIALISATION
/*
* Initialises the velocity field with a logarithmic profile and sinusoidal perturbations to trigger turbulence.
* This initialisation is provided by Henrik Asmuth.
*/
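// The mean profile below follows the log law u+ = ln(y+) / kappa + B (with y+ = u_tau * y / nu);
// the superimposed sinusoidal perturbations in the wall-normal and spanwise components seed the
// transition to turbulence.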
template<typename VelocityField_T>
void setVelocityFieldsAsmuth( const std::weak_ptr<StructuredBlockStorage>& forest,
const BlockDataID & velocityFieldId, const BlockDataID & meanVelocityFieldId,
const real_t frictionVelocity, const uint_t channel_half_width,
const real_t B, const real_t kappa, const real_t viscosity,
const uint_t wallAxis, const uint_t flowAxis ) {
auto blocks = forest.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
const auto domainSize = blocks->getDomain().max();
const auto delta = real_c(channel_half_width);
const auto remAxis = 3 - wallAxis - flowAxis;
for( auto block = blocks->begin(); block != blocks->end(); ++block ) {
auto * velocityField = block->template getData<VelocityField_T>(velocityFieldId);
WALBERLA_CHECK_NOT_NULLPTR(velocityField)
auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityFieldId);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
const auto ci = velocityField->xyzSizeWithGhostLayer();
for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
Cell globalCell(*cellIt);
blocks->transformBlockLocalToGlobalCell(globalCell, *block);
Vector3<real_t> cellCenter;
blocks->getCellCenter(cellCenter, globalCell);
const auto y = cellCenter[wallAxis];
const auto rel_x = cellCenter[flowAxis] / domainSize[flowAxis];
const auto rel_z = cellCenter[remAxis] / domainSize[remAxis];
const real_t pos = std::max(delta - std::abs(y - delta - 1_r), 0.05_r);
const auto rel_y = pos / delta;
auto initialVel = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B);
Vector3<real_t> vel;
vel[flowAxis] = initialVel;
vel[remAxis] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r * rel_x) *
std::sin(math::pi * 8_r * rel_y) / (std::pow(rel_y, 2_r) + 1_r);
vel[wallAxis] = 8_r * frictionVelocity / kappa *
(std::sin(math::pi * 8_r * rel_z) * std::sin(math::pi * 8_r * rel_y) +
std::sin(math::pi * 8_r * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r);
for(uint_t d = 0; d < 3; ++d) {
velocityField->get(*cellIt, d) = vel[d];
meanVelocityField->get(*cellIt, d) = vel[d];
}
}
}
} // function setVelocityFieldsAsmuth
/// SIMULATION PARAMETERS
struct SimulationParameters {
SimulationParameters(const Config::BlockHandle & config)
{
channelHalfWidth = config.getParameter<uint_t>("channel_half_width");
fullChannel = config.getParameter<bool>("full_channel", false);
/// TARGET QUANTITIES
targetFrictionReynolds = config.getParameter<real_t>("target_friction_reynolds");
targetBulkVelocity = config.getParameter<real_t>("target_bulk_velocity", 0.05_r);
targetBulkReynolds = dean_correlation::calculateBulkReynoldsNumber(targetFrictionReynolds);
viscosity = 2_r * real_c(channelHalfWidth) * targetBulkVelocity / targetBulkReynolds;
targetFrictionVelocity = targetFrictionReynolds * viscosity / real_c(channelHalfWidth);
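// The lattice viscosity follows from the target bulk Reynolds number Re_b = 2 * delta * u_b / nu,
// and the target friction velocity from Re_tau = u_tau * delta / nu.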
/// TIMESTEPS
timesteps = config.getParameter<uint_t>("timesteps", 0);
const uint_t turnoverPeriods = config.getParameter<uint_t>("turnover_periods", 0);
WALBERLA_ASSERT((timesteps != 0) != (turnoverPeriods != 0),
"Either timesteps OR turnover periods must be given.")
if(turnoverPeriods != 0) {
// turnover period defined by T = delta / u_tau
timesteps = turnoverPeriods * uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
}
/// DOMAIN DEFINITIONS
// obtained from codegen script -> adapt there
wallAxis = codegen::wallAxis;
flowAxis = codegen::flowAxis;
uint_t sizeFlowAxis = config.getParameter<uint_t>("size_flow_axis", 0);
uint_t sizeRemainingAxis = config.getParameter<uint_t>("size_remaining_axis", 0);
WALBERLA_ASSERT_NOT_IDENTICAL(wallAxis, flowAxis, "Wall and flow axis must be different.")
const auto sizeFactor = channelHalfWidth / uint_t(10);
if( !sizeFlowAxis) sizeFlowAxis = sizeFactor * 64;
if( !sizeRemainingAxis) sizeRemainingAxis = sizeFactor * 32;
domainSize[wallAxis] = fullChannel ? 2 * channelHalfWidth : channelHalfWidth;
domainSize[flowAxis] = sizeFlowAxis;
domainSize[3- wallAxis - flowAxis] = sizeRemainingAxis;
periodicity[wallAxis] = false;
boundaryCondition = config.getParameter<std::string>("wall_boundary_condition", "WFB");
/// OUTPUT
auto tsPerPeriod = uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
vtkFrequency = config.getParameter<uint_t>("vtk_frequency", 0);
vtkStart = config.getParameter<uint_t>("vtk_start", 0);
plotFrequency = config.getParameter<uint_t>("plot_frequency", 0);
plotStart = config.getParameter<uint_t>("plot_start", 0);
// vtk start
vtkStart = config.getParameter<uint_t>("vtk_start_timesteps", 0);
const uint_t vtkStartPeriods = config.getParameter<uint_t>("vtk_start_periods", 0);
if(vtkStart || vtkStartPeriods) {
WALBERLA_ASSERT((vtkStart != 0) != (vtkStartPeriods != 0),
"VTK start must be given in timesteps OR periods, not both.")
}
if(vtkStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
vtkStart = vtkStartPeriods * tsPerPeriod;
}
// plot start
plotStart = config.getParameter<uint_t>("plot_start_timesteps", 0);
const uint_t plotStartPeriods = config.getParameter<uint_t>("plot_start_periods", 0);
if(plotStart || plotStartPeriods) {
WALBERLA_ASSERT((plotStart != 0) != (plotStartPeriods != 0),
"Plotting start must be given in timesteps OR periods, not both.")
}
if(plotStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
plotStart = plotStartPeriods * tsPerPeriod;
}
// frequencies
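// round the total number of time steps up to a multiple of the plot frequency so that the final
// time step coincides with a plot output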
if(plotFrequency) {
timesteps = uint_c(std::ceil(real_c(timesteps) / real_c(plotFrequency))) * plotFrequency;
}
// sampling start & interval
samplingStart = config.getParameter<uint_t>("sampling_start_timesteps", 0);
const uint_t samplingStartPeriods = config.getParameter<uint_t>("sampling_start_periods", 0);
if(samplingStart || samplingStartPeriods) {
WALBERLA_ASSERT((samplingStart != 0) != (samplingStartPeriods != 0),
"Sampling start must be given in timesteps OR periods, not both.")
}
if(samplingStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
samplingStart = samplingStartPeriods * tsPerPeriod;
}
samplingInterval = config.getParameter<uint_t>("sampling_interval_timesteps", 0);
const uint_t samplingIntervalPeriods = config.getParameter<uint_t>("sampling_interval_periods", 0);
if(samplingInterval || samplingIntervalPeriods) {
WALBERLA_ASSERT((samplingInterval != 0) != (samplingIntervalPeriods != 0),
"Sampling start must be given in timesteps OR periods, not both.")
}
if(samplingIntervalPeriods != 0) {
// turnover period defined by T = delta / u_tau
samplingInterval = samplingIntervalPeriods * tsPerPeriod;
}
timesteps += 1;
}
uint_t channelHalfWidth{};
bool fullChannel{};
Vector3<uint_t> domainSize{};
Vector3<uint_t> periodicity{true};
real_t targetFrictionReynolds{};
real_t targetBulkReynolds{};
real_t targetFrictionVelocity{};
real_t targetBulkVelocity{};
real_t viscosity{};
const real_t density{1.0};
uint_t timesteps{};
std::string boundaryCondition{};
uint_t wallAxis{};
uint_t flowAxis{};
/// output
uint_t vtkFrequency{};
uint_t vtkStart{};
uint_t plotFrequency{};
uint_t plotStart{};
uint_t samplingStart{};
uint_t samplingInterval{};
};
namespace boundaries {
void createBoundaryConfig(const SimulationParameters & parameters, Config::Block & boundaryBlock) {
auto & bottomWall = boundaryBlock.createBlock("Border");
bottomWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, true)]);
bottomWall.addParameter("walldistance", "-1");
if(parameters.boundaryCondition == "NoSlip") {
bottomWall.addParameter("flag", "NoSlip");
} else if(parameters.boundaryCondition == "WFB") {
bottomWall.addParameter("flag", "WFB_bottom");
}
auto & topWall = boundaryBlock.createBlock("Border");
topWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, false)]);
topWall.addParameter("walldistance", "-1");
if(parameters.fullChannel) {
if (parameters.boundaryCondition == "NoSlip") {
topWall.addParameter("flag", "NoSlip");
} else if (parameters.boundaryCondition == "WFB") {
topWall.addParameter("flag", "WFB_top");
}
} else {
topWall.addParameter("flag", "FreeSlip");
}
}
}
/// BULK VELOCITY CALCULATION
template< typename VelocityField_T >
class ForceCalculator {
public:
ForceCalculator(const std::weak_ptr<StructuredBlockStorage> & blocks, const BlockDataID meanVelocityId,
const SimulationParameters & parameter)
: blocks_(blocks), meanVelocityId_(meanVelocityId), channelHalfWidth_(real_c(parameter.channelHalfWidth)),
targetBulkVelocity_(parameter.targetBulkVelocity), targetFrictionVelocity_(parameter.targetFrictionVelocity)
{
const auto & domainSize = parameter.domainSize;
Cell maxCell;
maxCell[parameter.wallAxis] = int_c(parameter.channelHalfWidth) - 1;
maxCell[flowDirection_] = int_c(domainSize[flowDirection_]) - 1;
const auto remainingIdx = 3 - parameter.wallAxis - flowDirection_;
maxCell[remainingIdx] = int_c(domainSize[remainingIdx]) - 1;
ci_ = CellInterval(Cell{}, maxCell);
numCells_ = real_c(parameter.channelHalfWidth * domainSize[flowDirection_] * domainSize[remainingIdx]);
}
real_t bulkVelocity() const { return bulkVelocity_; }
void setBulkVelocity(const real_t bulkVelocity) { bulkVelocity_ = bulkVelocity; }
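// Averages the flow-direction component of the mean velocity field over the cell interval ci_
// (one channel half) and reduces the result over all processes.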
void calculateBulkVelocity() {
// reset bulk velocity
bulkVelocity_ = 0_r;
auto blocks = blocks_.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
for( auto block = blocks->begin(); block != blocks->end(); ++block) {
auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityId_);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
auto fieldSize = meanVelocityField->xyzSize();
CellInterval localCi;
blocks->transformGlobalToBlockLocalCellInterval(localCi, *block, ci_);
fieldSize.intersect(localCi);
auto * slicedField = meanVelocityField->getSlicedField(fieldSize);
WALBERLA_CHECK_NOT_NULLPTR(slicedField)
for(auto fieldIt = slicedField->beginXYZ(); fieldIt != slicedField->end(); ++fieldIt) {
const auto localMean = fieldIt[flowDirection_];
bulkVelocity_ += localMean;
}
}
mpi::allReduceInplace< real_t >(bulkVelocity_, mpi::SUM);
bulkVelocity_ /= numCells_;
}
real_t calculateDrivingForce() const {
// forcing term as in Malaspinas (2014) "Wall model for large-eddy simulation based on the lattice Boltzmann method"
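// i.e. a base term u_tau^2 / delta plus a proportional correction that nudges the bulk velocity
// towards its target value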
const auto force = targetFrictionVelocity_ * targetFrictionVelocity_ / channelHalfWidth_
+ (targetBulkVelocity_ - bulkVelocity_) * targetBulkVelocity_ / channelHalfWidth_;
return force;
}
private:
const std::weak_ptr<StructuredBlockStorage> blocks_{};
const BlockDataID meanVelocityId_{};
const uint_t flowDirection_{};
const real_t channelHalfWidth_{};
const real_t targetBulkVelocity_{};
const real_t targetFrictionVelocity_{};
CellInterval ci_{};
real_t numCells_{};
real_t bulkVelocity_{};
};
template< typename Welford_T >
class TurbulentChannelPlotter {
public:
TurbulentChannelPlotter(SimulationParameters const * const parameters, Timeloop * const timeloop,
ForceCalculator<VectorField_T> const * const forceCalculator,
const std::weak_ptr<StructuredBlockStorage> & blocks,
const BlockDataID velocityFieldId, const BlockDataID meanVelocityFieldId,
const BlockDataID meanTkeSGSFieldId, const BlockDataID sosFieldId, Welford_T * velocityWelford,
const bool separateFile = false)
: parameters_(parameters), forceCalculator_(forceCalculator), timeloop_(timeloop), blocks_(blocks),
velocityWelford_(velocityWelford), velocityFieldId_(velocityFieldId), meanVelocityFieldId_(meanVelocityFieldId),
meanTkeSGSFieldId_(meanTkeSGSFieldId), sosFieldId_(sosFieldId), plotFrequency_(parameters->plotFrequency), plotStart_(parameters->plotStart),
separateFiles_(separateFile)
{
if(!plotFrequency_)
return;
// prepare output folder
const filesystem::path path(baseFolder_);
std::string fileSuffix = parameters->boundaryCondition + "_";
if(parameters->fullChannel)
fileSuffix += "full_D";
else
fileSuffix += "half_D";
fileSuffix += std::to_string(parameters->channelHalfWidth) + "_Re" +
std::to_string(int(parameters->targetFrictionReynolds)) ;
velocityProfilesFilePath_ = path / ("velocity_profiles_" + fileSuffix);
forcingDataFilePath_ = path / ("forcing_data_" + fileSuffix + "_t" +
std::to_string(parameters->timesteps-1) + ".txt");
WALBERLA_ROOT_SECTION() {
// create directory if not existent; empty if existent
if( !filesystem::exists(path) ) {
filesystem::create_directories(path);
} else {
for (const auto& entry : filesystem::directory_iterator(path))
std::filesystem::remove_all(entry.path());
}
}
// write force header
std::ofstream os (forcingDataFilePath_, std::ios::out | std::ios::trunc);
if(os.is_open()) {
os << "# timestep\t bulk_velocity\t driving_force\n";
os.close();
} else {
WALBERLA_ABORT("Could not open forcing data file.")
}
}
void operator()() {
const auto ts = timeloop_->getCurrentTimeStep();
if(ts < plotStart_)
return;
if(!plotFrequency_ || (ts % plotFrequency_))
return;
const auto channelHalfWidth = real_c(parameters_->channelHalfWidth);
const auto bulkVelocity = forceCalculator_->bulkVelocity();
/// write force data
WALBERLA_ROOT_SECTION() {
std::ofstream forceOS(forcingDataFilePath_, std::ios::out | std::ios::app);
if (forceOS.is_open())
{
forceOS << ts << "\t" << bulkVelocity << "\t" << forceCalculator_->calculateDrivingForce() << "\n";
forceOS.close();
}
}
/// write velocity data
// gather velocity data
std::vector<real_t> instantaneousVelocityVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> meanVelocityVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> tkeSGSVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> tkeResolvedVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> reynoldsStressVector(parameters_->channelHalfWidth * TensorField_T::F_SIZE, 0_r);
const auto idxFlow = int_c(parameters_->domainSize[parameters_->flowAxis] / uint_t(2));
const auto idxRem = int_c(parameters_->domainSize[3 - parameters_->flowAxis - parameters_->wallAxis] / uint_t(2));
Cell point;
point[parameters_->flowAxis] = idxFlow;
point[3 - parameters_->flowAxis - parameters_->wallAxis] = idxRem;
const auto flowAxis = int_c(parameters_->flowAxis);
auto blocks = blocks_.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
for(auto block = blocks->begin(); block != blocks->end(); ++block) {
const auto * const velocity = block->template getData<VectorField_T>(velocityFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(velocity)
const auto * const meanVelocity = block->template getData<VectorField_T>(meanVelocityFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocity)
const auto * const tkeSGS = block->template getData<ScalarField_T>(meanTkeSGSFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(tkeSGS)
const auto * const sos = block->template getData<TensorField_T>(sosFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(sos)
for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
point[parameters_->wallAxis] = int_c(idx);
Cell localCell;
blocks->transformGlobalToBlockLocalCell(localCell, *block, point);
if(velocity->xyzSize().contains(localCell)){
instantaneousVelocityVector[idx] = velocity->get(localCell, flowAxis);
meanVelocityVector[idx] = meanVelocity->get(localCell, flowAxis);
tkeSGSVector[idx] = tkeSGS->get(localCell);
for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
reynoldsStressVector[idx*TensorField_T::F_SIZE+i] = sos->get(localCell,i) / velocityWelford_->getCounter();
}
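// the trace of the resolved Reynolds-stress tensor (components 0, 4, 8) gives twice the resolved TKE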
tkeResolvedVector[idx] = real_c(0.5) * (
reynoldsStressVector[idx*TensorField_T::F_SIZE+0] +
reynoldsStressVector[idx*TensorField_T::F_SIZE+4] +
reynoldsStressVector[idx*TensorField_T::F_SIZE+8]
);
}
}
}
// MPI exchange information
mpi::reduceInplace(instantaneousVelocityVector, mpi::SUM);
mpi::reduceInplace(meanVelocityVector, mpi::SUM);
mpi::reduceInplace(tkeSGSVector, mpi::SUM);
mpi::reduceInplace(tkeResolvedVector, mpi::SUM);
mpi::reduceInplace(reynoldsStressVector, mpi::SUM);
WALBERLA_ROOT_SECTION()
{
std::ofstream velocityOS;
filesystem::path path = velocityProfilesFilePath_;
if (separateFiles_) {
path.concat("_t" + std::to_string(timeloop_->getCurrentTimeStep()) + ".txt");
velocityOS.open(path, std::ios::out | std::ios::trunc);
} else {
path.concat("_t" + std::to_string(parameters_->timesteps-1) + ".txt");
velocityOS.open(path, std::ios::out | std::ios::trunc);
}
if (velocityOS.is_open()) {
if (!separateFiles_) velocityOS << "# t = " << ts << "\n";
velocityOS << "# y/delta\t y+\t u+\t u_mean\t u_instantaneous\t TKE_SGS\t TKE_resolved\t uu_rms\t uv_rms\t uw_rms\t vu_rms\t vv_rms\t vw_rms\t wu_rms\t wv_rms\t ww_rms\n";
const auto & viscosity = parameters_->viscosity;
const auto bulkReynolds = 2_r * channelHalfWidth * bulkVelocity / viscosity;
const auto frictionReynolds = dean_correlation::calculateFrictionReynoldsNumber(bulkReynolds);
const auto frictionVelocity = frictionReynolds * viscosity / channelHalfWidth;
for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
// relative position
velocityOS << (real_c(idx)+0.5_r) / channelHalfWidth << "\t";
// y+
velocityOS << (real_c(idx)+0.5_r) * frictionVelocity / viscosity << "\t";
// u+
velocityOS << meanVelocityVector[idx] / frictionVelocity << "\t";
// mean velocity
velocityOS << meanVelocityVector[idx] << "\t";
// instantaneous velocity
velocityOS << instantaneousVelocityVector[idx] << "\t";
// subgrid-scale TKE
velocityOS << tkeSGSVector[idx] << "\t";
// resolved TKE
velocityOS << tkeResolvedVector[idx] << "\t";
// Reynolds stresses
for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
velocityOS << reynoldsStressVector[idx*TensorField_T::F_SIZE+i] << "\t";
}
velocityOS << "\n";
}
velocityOS.close();
} else{
WALBERLA_ABORT("Could not open velocity plot file " << path.generic_string())
}
}
}
private:
SimulationParameters const * const parameters_{};
ForceCalculator<VectorField_T> const * const forceCalculator_{};
Timeloop * const timeloop_{};
const std::weak_ptr<StructuredBlockStorage> blocks_;
Welford_T * const velocityWelford_{};
const BlockDataID velocityFieldId_{};
const BlockDataID meanVelocityFieldId_{};
const BlockDataID meanTkeSGSFieldId_{};
const BlockDataID sosFieldId_{};
const uint_t plotFrequency_{};
const uint_t plotStart_{};
const bool separateFiles_{false};
const std::string baseFolder_{"output"};
filesystem::path velocityProfilesFilePath_;
filesystem::path forcingDataFilePath_;
};
/////////////////////
/// Main Function ///
/////////////////////
int main(int argc, char** argv) {
Environment walberlaEnv(argc, argv);
if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!") }
///////////////////////////////////////////////////////
/// Block Storage Creation and Simulation Parameter ///
///////////////////////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating block forest...")
const auto channelParameter = walberlaEnv.config()->getOneBlock("TurbulentChannel");
SimulationParameters simulationParameters(channelParameter);
// domain creation
std::shared_ptr<StructuredBlockForest> blocks;
{
Vector3< uint_t > numBlocks;
Vector3< uint_t > cellsPerBlock;
blockforest::calculateCellDistribution(simulationParameters.domainSize,
uint_c(mpi::MPIManager::instance()->numProcesses()),
numBlocks, cellsPerBlock);
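// The decomposition may round the per-axis cell counts; the effective domain size is recomputed
// below and a warning is issued if it differs from the requested one.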
const auto & periodicity = simulationParameters.periodicity;
auto & domainSize = simulationParameters.domainSize;
const Vector3<uint_t> newDomainSize(numBlocks[0] * cellsPerBlock[0], numBlocks[1] * cellsPerBlock[1], numBlocks[2] * cellsPerBlock[2]);
if(domainSize != newDomainSize) {
domainSize = newDomainSize;
WALBERLA_LOG_WARNING_ON_ROOT("\nWARNING: Domain size has changed due to the chosen domain decomposition.\n")
}
SetupBlockForest sforest;
sforest.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );
sforest.init( AABB(0_r, 0_r, 0_r, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2])),
numBlocks[0], numBlocks[1], numBlocks[2], periodicity[0], periodicity[1], periodicity[2] );
// calculate process distribution
const memory_t memoryLimit = numeric_cast< memory_t >( sforest.getNumberOfBlocks() );
const blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig(
true, false, std::bind( blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2,
cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] ) );
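// distribute the blocks to processes along a Hilbert space-filling curve, using the
// cell-weighted communication cost model configured above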
sforest.calculateProcessDistribution_Default( uint_c( MPIManager::instance()->numProcesses() ), memoryLimit,
"hilbert", 10, false, metisConfig );
if( !MPIManager::instance()->rankValid() )
MPIManager::instance()->useWorldComm();
// create StructuredBlockForest (encapsulates a newly created BlockForest)
WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << sforest)
sforest.writeVTKOutput("domain_decomposition");
auto bf = std::make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, false );
blocks = std::make_shared< StructuredBlockForest >( bf, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] );
blocks->createCellBoundingBoxes();
}
////////////////////////////////////
/// PDF Field and Velocity Setup ///
////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating fields...")
// Common Fields
const BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), codegen::layout);
const BlockDataID meanVelocityFieldId = field::addToStorage< VectorField_T >(blocks, "mean velocity", real_c(0.0), codegen::layout);
const BlockDataID sosFieldId = field::addToStorage< TensorField_T >(blocks, "sum of squares", real_c(0.0), codegen::layout);
const BlockDataID tkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "tke_SGS", real_c(0.0), codegen::layout);
const BlockDataID meanTkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "mean_tke_SGS", real_c(0.0), codegen::layout);
const BlockDataID omegaFieldId = field::addToStorage< ScalarField_T >(blocks, "omega_out", real_c(0.0), codegen::layout);
const BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
// CPU Field for PDFs
const BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), codegen::layout);
///////////////////////////////////////////
/// Force and bulk velocity calculation ///
///////////////////////////////////////////
ForceCalculator<VectorField_T> forceCalculator(blocks, velocityFieldId, simulationParameters);
//////////////
/// Setter ///
//////////////
WALBERLA_LOG_INFO_ON_ROOT("Setting up fields...")
// Velocity field setup
setVelocityFieldsAsmuth<VectorField_T>(
blocks, velocityFieldId, meanVelocityFieldId,
simulationParameters.targetFrictionVelocity, simulationParameters.channelHalfWidth,
5.5_r, 0.41_r, simulationParameters.viscosity,
simulationParameters.wallAxis, simulationParameters.flowAxis );
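// 5.5 and 0.41 are the usual log-law constants B and kappa, consistent with the Spalding wall-function
// parameters used by the wall-function bounce (WFB) boundaries below.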
forceCalculator.setBulkVelocity(simulationParameters.targetBulkVelocity);
const auto initialForce = forceCalculator.calculateDrivingForce();
// pdfs setup
Setter_T pdfSetter(pdfFieldId, velocityFieldId, initialForce, simulationParameters.density);
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
pdfSetter(blockIt.get());
/////////////
/// Sweep ///
/////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating sweeps...")
const auto omega = lbm::collision_model::omegaFromViscosity(simulationParameters.viscosity);
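// standard lattice-units relation: omega = 1 / (3 * nu + 0.5), i.e. nu = (1 / omega - 0.5) / 3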
StreamCollideSweep_T streamCollideSweep(omegaFieldId, pdfFieldId, velocityFieldId, initialForce, omega);
WelfordSweep_T welfordSweep(meanVelocityFieldId, sosFieldId, velocityFieldId, 0_r);
TKEWelfordSweep_T welfordTKESweep(meanTkeSgsFieldId, tkeSgsFieldId, 0_r);
TkeSgsWriter_T tkeSgsWriter(omegaFieldId, pdfFieldId, tkeSgsFieldId, initialForce, omega);
/////////////////////////
/// Boundary Handling ///
/////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating boundary handling...")
const FlagUID fluidFlagUID("Fluid");
Config::Block boundaryBlock;
boundaries::createBoundaryConfig(simulationParameters, boundaryBlock);
std::unique_ptr<WFB_bottom_T> wfb_bottom_ptr = std::make_unique<WFB_bottom_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
std::unique_ptr<WFB_top_T > wfb_top_ptr = std::make_unique<WFB_top_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
NoSlip_T noSlip(blocks, pdfFieldId);
FreeSlip_top_T freeSlip_top(blocks, pdfFieldId);
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, Config::BlockHandle(&boundaryBlock));
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
freeSlip_top.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("FreeSlip"), fluidFlagUID);
wfb_bottom_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_bottom"), fluidFlagUID);
wfb_top_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_top"), fluidFlagUID);
//////////////
/// Output ///
//////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating field output...")
// vtk output
auto vtkWriter = vtk::createVTKOutput_BlockData(
blocks, "field_writer", simulationParameters.vtkFrequency, 0, false, "vtk_out", "simulation_step",
false, false, true, false
);
vtkWriter->setInitialWriteCallsToSkip(simulationParameters.vtkStart);
// velocity field writer
auto velocityWriter = std::make_shared<field::VTKWriter<VectorField_T>>(velocityFieldId, "instantaneous velocity");
vtkWriter->addCellDataWriter(velocityWriter);
auto meanVelocityFieldWriter = std::make_shared<field::VTKWriter<VectorField_T>>(meanVelocityFieldId, "mean velocity");
vtkWriter->addCellDataWriter(meanVelocityFieldWriter);
// vtk writer
{
auto flagOutput = vtk::createVTKOutput_BlockData(
blocks, "flag_writer", 1, 1, false, "vtk_out", "simulation_step",
false, true, true, false
);
auto flagWriter = std::make_shared<field::VTKWriter<FlagField_T>>(flagFieldId, "flag field");
flagOutput->addCellDataWriter(flagWriter);
flagOutput->write();
}
/////////////////
/// Time Loop ///
/////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating timeloop...")
SweepTimeloop timeloop(blocks->getBlockStorage(), simulationParameters.timesteps);
// Communication
blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
auto setNewForce = [&](const real_t newForce) {
streamCollideSweep.setF_x(newForce);
tkeSgsWriter.setF_x(newForce);
};
// plotting
const bool outputSeparateFiles = channelParameter.getParameter<bool>("separate_files", false);
const TurbulentChannelPlotter<WelfordSweep_T > plotter(&simulationParameters, &timeloop, &forceCalculator, blocks,
velocityFieldId, meanVelocityFieldId,
meanTkeSgsFieldId, sosFieldId, &welfordSweep,
outputSeparateFiles);
//NOTE: sweeps whose parameters are modified during the run must be wrapped in lambdas; otherwise the timeloop stores a copy and, e.g., the Welford counter would stay at 0
auto welfordLambda = [&welfordSweep, &welfordTKESweep](IBlock * block) {
welfordSweep(block);
welfordTKESweep(block);
};
auto wfbLambda = [&wfb_bottom_ptr, &wfb_top_ptr](IBlock * block) {
wfb_bottom_ptr->operator()(block);
wfb_top_ptr->operator()(block);
};
auto streamCollideLambda = [&streamCollideSweep](IBlock * block) {
streamCollideSweep(block);
};
// Timeloop
timeloop.add() << BeforeFunction(communication, "communication")
<< BeforeFunction([&](){forceCalculator.calculateBulkVelocity();}, "bulk velocity calculation")
<< BeforeFunction([&](){
const auto newForce = forceCalculator.calculateDrivingForce();
setNewForce(newForce);
}, "new force setter")
<< Sweep([](IBlock *){}, "new force setter"); // empty sweep: this timeloop entry only carries the force-update BeforeFunctions
timeloop.add() << Sweep(freeSlip_top, "freeSlip");
timeloop.add() << Sweep(noSlip, "noSlip");
timeloop.add() << Sweep(wfbLambda, "wall function bounce");
timeloop.add() << Sweep(streamCollideLambda, "stream and collide");
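// The BeforeFunction below restarts the running statistics at the sampling start and after each
// sampling interval: the Welford counters are reset and the sum-of-squares and SGS-TKE fields are
// zeroed, so every interval yields an independent set of averages.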
timeloop.add() << BeforeFunction([&](){
const uint_t velCtr = uint_c(welfordSweep.getCounter());
if((timeloop.getCurrentTimeStep() == simulationParameters.samplingStart) ||
(timeloop.getCurrentTimeStep() > simulationParameters.samplingStart && simulationParameters.samplingInterval && (velCtr % simulationParameters.samplingInterval == 0))) {
welfordSweep.setCounter(real_c(0.0));
welfordTKESweep.setCounter(real_c(0.0));
for(auto & block : *blocks) {
auto * sopField = block.template getData<TensorField_T >(sosFieldId);
sopField->setWithGhostLayer(0.0);
auto * tkeField = block.template getData<ScalarField_T>(tkeSgsFieldId);
tkeField->setWithGhostLayer(0.0);
}
}
welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
welfordTKESweep.setCounter(welfordTKESweep.getCounter() + real_c(1.0));
}, "welford sweep")
<< Sweep(welfordLambda, "welford sweep");
timeloop.add() << Sweep(tkeSgsWriter, "TKE_SGS writer");
timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkWriter), "VTK field output");
timeloop.addFuncAfterTimeStep(plotter, "Turbulent quantity plotting");
// LBM stability check
timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >(
walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID ) ),
"LBM stability check" );
// Time logger
timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), 5_r),
"remaining time logger");
WALBERLA_LOG_INFO_ON_ROOT("Running timeloop with " << timeloop.getNrOfTimeSteps() - 1 << " timesteps...")
WcTimingPool timing;
WcTimer timer;
timer.start();
timeloop.run(timing);
timer.end();
double time = timer.max();
walberla::mpi::reduceInplace(time, walberla::mpi::MAX);
const auto timeloopTiming = timing.getReduced();
WALBERLA_LOG_INFO_ON_ROOT("Timeloop timing:\n" << *timeloopTiming)
const walberla::lbm::PerformanceEvaluation<FlagField_T> performance(blocks, flagFieldId, fluidFlagUID);
performance.logResultOnRoot(simulationParameters.timesteps, time);
return EXIT_SUCCESS;
}
} // namespace walberla
int main(int argc, char** argv) { return walberla::main(argc, argv); }
TurbulentChannel {
channel_half_width 20;
full_channel 0;
wall_boundary_condition WFB;
target_friction_Reynolds 395;
target_bulk_velocity 0.1;
// turnover_periods 50;
timesteps 100;
// sampling_start_timesteps 50;
// sampling_start_periods 1;
// sampling_interval_timesteps 20;
// sampling_interval_periods 1;
// vtk_start_timesteps 50;
// vtk_start_periods 1000;
// plot_start_timesteps 50;
// plot_start_periods 1000;
vtk_frequency 10;
plot_frequency 10;
separate_files 0;
}
StabilityChecker
{
checkFrequency 10000;
streamOutput false;
vtkOutput true;
}
import sympy as sp
import pystencils as ps
from lbmpy.enums import SubgridScaleModel
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, ForceModel
from lbmpy.flow_statistics import welford_assignments
from lbmpy.relaxationrates import lattice_viscosity_from_relaxation_rate
from lbmpy.creationfunctions import create_lb_update_rule
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.boundaries import NoSlip, FreeSlip, WallFunctionBounce
from lbmpy.boundaries.wall_function_models import SpaldingsLaw, MoninObukhovSimilarityTheory
from lbmpy.utils import frobenius_norm, second_order_moment_tensor
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
from lbmpy_walberla import generate_boundary
# =====================
# Code Generation
# =====================
info_header = """
#ifndef TURBULENTCHANNEL_INCLUDES
#define TURBULENTCHANNEL_INCLUDES
#include <stencil/D{d}Q{q}.h>
#include "TurbulentChannel_Sweep.h"
#include "TurbulentChannel_PackInfo.h"
#include "TurbulentChannel_Setter.h"
#include "TurbulentChannel_Welford.h"
#include "TurbulentChannel_Welford_TKE_SGS.h"
#include "TurbulentChannel_TKE_SGS_Writer.h"
#include "TurbulentChannel_NoSlip.h"
#include "TurbulentChannel_FreeSlip_top.h"
#include "TurbulentChannel_WFB_top.h"
#include "TurbulentChannel_WFB_bottom.h"
namespace walberla {{
namespace codegen {{
using Stencil_T = walberla::stencil::D{d}Q{q};
static constexpr uint_t flowAxis = {flow_axis};
static constexpr uint_t wallAxis = {wall_axis};
static constexpr field::Layout layout = field::{layout};
}}
}}
#endif // TURBULENTCHANNEL_INCLUDES
"""
def check_axis(flow_axis, wall_axis):
assert flow_axis != wall_axis
assert flow_axis < 3
assert wall_axis < 3
with CodeGeneration() as ctx:
flow_axis = 0
wall_axis = 1
check_axis(flow_axis=flow_axis, wall_axis=wall_axis)
# ========================
# General Parameters
# ========================
target = ps.Target.CPU
data_type = "float64" if ctx.double_accuracy else "float32"
stencil = LBStencil(Stencil.D3Q19)
omega = sp.Symbol('omega')
F_x = sp.Symbol('F_x')
force_vector = [0] * 3
force_vector[flow_axis] = F_x
layout = 'fzyx'
normal_direction_top = [0] * 3
normal_direction_top[wall_axis] = -1
normal_direction_top = tuple(normal_direction_top)
normal_direction_bottom = [0] * 3
normal_direction_bottom[wall_axis] = 1
normal_direction_bottom = tuple(normal_direction_bottom)
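# with flow_axis = 0 and wall_axis = 1 as chosen above, this yields
# force_vector = [F_x, 0, 0], normal_direction_top = (0, -1, 0) and normal_direction_bottom = (0, 1, 0)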
# PDF Fields
pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[{stencil.D}D]', layout=layout)
# Output Fields
omega_field = ps.fields(f"omega_out: {data_type}[{stencil.D}D]", layout=layout)
sgs_tke = ps.fields(f"sgs_tke: {data_type}[{stencil.D}D]", layout=layout)
mean_sgs_tke = ps.fields(f"mean_sgs_tke: {data_type}[{stencil.D}D]", layout=layout)
velocity = ps.fields(f"velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout)
mean_velocity = ps.fields(f"mean_velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout)
sum_of_squares_field = ps.fields(f"sum_of_squares_field({stencil.D**2}): {data_type}[{stencil.D}D]", layout=layout)
# LBM Optimisation
lbm_opt = LBMOptimisation(cse_global=True,
symbolic_field=pdfs,
symbolic_temporary_field=pdfs_tmp,
field_layout=layout)
# ==================
# Method Setup
# ==================
lbm_config = LBMConfig(stencil=stencil,
method=Method.CUMULANT,
force_model=ForceModel.GUO,
force=tuple(force_vector),
relaxation_rate=omega,
subgrid_scale_model=SubgridScaleModel.QR,
# galilean_correction=True,
compressible=True,
omega_output_field=omega_field,
output={'velocity': velocity})
update_rule = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
lbm_method = update_rule.method
# ========================
# PDF Initialization
# ========================
initial_rho = sp.Symbol('rho_0')
pdfs_setter = macroscopic_values_setter(lbm_method,
initial_rho,
velocity.center_vector,
pdfs.center_vector)
# LBM Sweep
generate_sweep(ctx, "TurbulentChannel_Sweep", update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target)
# Pack Info
generate_pack_info_from_kernel(ctx, "TurbulentChannel_PackInfo", update_rule, target=target)
# Macroscopic Values Setter
generate_sweep(ctx, "TurbulentChannel_Setter", pdfs_setter, target=target, ghost_layers_to_include=1)
# Welford update
# welford_update = welford_assignments(vector_field=velocity, mean_vector_field=mean_velocity)
welford_update = welford_assignments(field=velocity, mean_field=mean_velocity,
sum_of_squares_field=sum_of_squares_field)
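# welford_assignments generates Welford's online update rules: per cell and component, with n samples already seen,
#   delta = x - mean;  mean += delta / (n + 1);  sum_of_squares += delta * (x - mean)
# so dividing sum_of_squares by the sample count gives a running (co)variance without storing individual samples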
generate_sweep(ctx, "TurbulentChannel_Welford", welford_update, target=target)
tke_welford_update = welford_assignments(field=sgs_tke, mean_field=mean_sgs_tke)
generate_sweep(ctx, "TurbulentChannel_Welford_TKE_SGS", tke_welford_update, target=target)
# subgrid TKE output
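# The kernel below estimates the subgrid-scale TKE per cell from the non-equilibrium PDFs: the strain-rate
# magnitude is computed from the second-order moments of f_neq, and the eddy viscosity is the difference
# between the locally relaxed lattice viscosity (omega_out) and the molecular lattice viscosity (omega).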
@ps.kernel
def tke_sgs_writer():
f_neq = sp.Matrix(pdfs.center_vector) - lbm_method.get_equilibrium_terms()
rho = lbm_method.conserved_quantity_computation.density_symbol
strain_rate = frobenius_norm(-3 * omega_field.center / (2 * rho) * second_order_moment_tensor(f_neq, lbm_method.stencil))
eddy_viscosity = lattice_viscosity_from_relaxation_rate(omega_field.center) - lattice_viscosity_from_relaxation_rate(omega)
sgs_tke.center @= (eddy_viscosity * strain_rate**2)**(2.0/3.0)
tke_sgs_ac = ps.AssignmentCollection(
[lbm_method.conserved_quantity_computation.equilibrium_input_equations_from_pdfs(pdfs.center_vector),
*tke_sgs_writer]
)
generate_sweep(ctx, "TurbulentChannel_TKE_SGS_Writer", tke_sgs_ac)
# Boundary conditions
nu = lattice_viscosity_from_relaxation_rate(omega)
u_tau_target = sp.Symbol("target_u_tau")
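# The wall-function bounce boundaries below use Spalding's implicit law of the wall,
#   y+ = u+ + exp(-kappa*B) * (exp(kappa*u+) - 1 - kappa*u+ - (kappa*u+)**2/2 - (kappa*u+)**3/6),
# with kappa = 0.41 and B = 5.5, solved with a fixed number of Newton iterations (newton_steps=5).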
noslip = NoSlip()
freeslip_top = FreeSlip(stencil, normal_direction=normal_direction_top)
wfb_top = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_top,
wall_function_model=SpaldingsLaw(viscosity=nu,
kappa=0.41, b=5.5, newton_steps=5),
mean_velocity=mean_velocity, data_type=data_type,
target_friction_velocity=u_tau_target)
wfb_bottom = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_bottom,
wall_function_model=SpaldingsLaw(viscosity=nu,
kappa=0.41, b=5.5, newton_steps=5),
mean_velocity=mean_velocity, data_type=data_type,
target_friction_velocity=u_tau_target)
generate_boundary(ctx, "TurbulentChannel_NoSlip", noslip, lbm_method, target=target)
generate_boundary(ctx, "TurbulentChannel_FreeSlip_top", freeslip_top, lbm_method, target=target)
generate_boundary(ctx, "TurbulentChannel_WFB_bottom", wfb_bottom, lbm_method, target=target)
generate_boundary(ctx, "TurbulentChannel_WFB_top", wfb_top, lbm_method, target=target)
info_header_params = {
'layout': layout,
'd': stencil.D,
'q': stencil.Q,
'flow_axis': flow_axis,
'wall_axis': wall_axis
}
ctx.write_file("CodegenIncludes.h", info_header.format(**info_header_params))
@@ -2,5 +2,5 @@
waLBerla_link_files_to_builddir( "*.dat" )
waLBerla_add_executable ( NAME UniformGridBenchmark
                          DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::lbm walberla::postprocessing walberla::timeloop walberla::vtk walberla::sqlite )
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridCPUStorageSpecification.h UniformGridCPUStorageSpecification.cpp
UniformGridCPUSweepCollection.h UniformGridCPUSweepCollection.cpp
NoSlip.cpp NoSlip.h
UBB.cpp UBB.h
UniformGridCPUBoundaryCollection.h
UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h
UniformGridCPU_InfoHeader.h
)
waLBerla_add_executable(NAME UniformGridCPU_${config}
FILES UniformGridCPU.cpp
DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::python_coupling walberla::timeloop walberla::vtk UniformGridCPUGenerated_${config} )
# all configurations are excluded from the ALL target except for pull + d3q27
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach ()
endforeach()
endforeach()
@@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks,
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridCPU.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "core/Environment.h"
#include "core/OpenMP.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "domain_decomposition/SharedSweep.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/field/PdfField.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include "timeloop/all.h"
#include <iomanip>
#include "InitShearVelocity.h"
#include "ManualKernels.h"
#include "UniformGridCPU_InfoHeader.h"
using namespace walberla;
using StorageSpecification_T = lbm::UniformGridCPUStorageSpecification;
using Stencil_T = lbm::UniformGridCPUStorageSpecification::Stencil;
using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
using FlagField_T = FlagField< uint8_t >;
using BoundaryCollection_T = lbm::UniformGridCPUBoundaryCollection< FlagField_T >;
using SweepCollection_T = lbm::UniformGridCPUSweepCollection;
using blockforest::communication::UniformBufferedScheme;
using macroFieldType = VelocityField_T::value_type;
using pdfFieldType = PdfField_T::value_type;
int main(int argc, char** argv)
{
const mpi::Environment env(argc, argv);
const std::string input_filename(argv[1]);
const bool inputIsPython = string_ends_with(input_filename, ".py");
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
auto config = *cfg;
logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig(config);
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields
const StorageSpecification_T StorageSpec = StorageSpecification_T();
auto fieldAllocator = make_shared< field::AllocateAligned< pdfFieldType, 64 > >();
const BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, field::fzyx, fieldAllocator);
const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx);
const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx);
const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
// Initialize velocity on cpu
if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldId);
}
const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
SweepCollection_T sweepCollection(blocks, pdfFieldId, densityFieldId, velFieldId, omega, innerOuterSplit);
for (auto& block : *blocks)
{
sweepCollection.initialise(&block);
}
const pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId);
// Boundaries
const FlagUID fluidFlagUID("Fluid");
auto boundariesConfig = config->getBlock("Boundaries");
if (boundariesConfig)
{
WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
}
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldId, fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId);
UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
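// supported strategies: "noOverlap" (communicate, then boundaries and stream-collide),
// "simpleOverlap" (overlap communication with the inner-frame stream-collide),
// "kernelOnly" (compute kernel only, no communication) and "StreamOnly" (stream step only)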
if (timeStepStrategy == "noOverlap") {
if (boundariesConfig) {
timeLoop.add() << BeforeFunction(communication, "communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
} else {
timeLoop.add() << BeforeFunction(communication, "communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
}
} else if (timeStepStrategy == "simpleOverlap") {
if (boundariesConfig) {
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame");
} else {
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame");
}
} else if (timeStepStrategy == "kernelOnly") {
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
} else if (timeStepStrategy == "StreamOnly") {
timeLoop.add() << Sweep(sweepCollection.stream(SweepCollection_T::ALL), "LBM Stream-Only");
} else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'")
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldId, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks) {
sweepCollection.calculateMacroscopicParameters(&block);
}
});
timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID);
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2));
const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1));
for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop.singleStep();
auto remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
if (remainingTimeLoggerFrequency > 0)
{
auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * outerIterations,
remainingTimeLoggerFrequency);
timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
}
for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
{
timeLoop.setCurrentTimeStepToZero();
WcTimingPool timeloopTiming;
WcTimer simTimer;
WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
simTimer.start();
timeLoop.run(timeloopTiming);
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_ROOT_SECTION()
{
if(inputIsPython){
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("vectorised", vectorised);
pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
}
return EXIT_SUCCESS;
}
from dataclasses import replace
import sympy as sp
import pystencils as ps
from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
from lbmpy.enums import Method, Stencil, SubgridScaleModel
from lbmpy.moments import get_default_moment_set_for_stencil
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
options_dict = {
'srt': {
'method': Method.SRT,
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': Method.TRT,
'relaxation_rate': omega,
'compressible': False,
},
'mrt': {
'method': Method.MRT,
'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'compressible': False,
},
'mrt-overrelax': {
'method': Method.MRT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': False,
},
'central': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rate': omega,
'compressible': True,
},
'central-overrelax': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
},
'cumulant': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rate': omega,
'compressible': True,
},
'cumulant-overrelax': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'cumulant-K17': {
'method': Method.CUMULANT,
'relaxation_rate': omega,
'compressible': True,
'fourth_order_correction': 0.01
},
'entropic': {
'method': Method.TRT_KBC_N4,
'compressible': True,
'zero_centered': False,
'relaxation_rates': [omega, omega_free],
'entropic': True,
'entropic_newton_iterations': False
},
'smagorinsky': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
'relaxation_rate': omega,
},
'qr': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.QR,
'relaxation_rate': omega,
}
}
info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool vectorised = {vec};
const bool nontemporal = {nt_stores};
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
with CodeGeneration() as ctx:
openmp = True if ctx.openmp else False
field_type = "float64" if ctx.double_accuracy else "float32"
# This base pointer specification introduces temporary pointers in the outer loop so that the inner loop
# only contains aligned memory addresses. This makes non-temporal (NT) stores much more effective, which gives
# large performance gains, especially for the pull scheme on Skylake architectures.
base_pointer_spec = None # [['spatialInner0'], ['spatialInner1']]
# cpu_vec = {"instruction_set": "best", "nontemporal": False,
# "assume_aligned": True, 'assume_sufficient_line_padding': True}
cpu_vec = {"instruction_set": None}
nt_stores = False
config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3
stencil_str = config_tokens[0]
streaming_pattern = config_tokens[1]
collision_setup = config_tokens[2]
if stencil_str == "d3q27":
stencil = LBStencil(Stencil.D3Q27)
elif stencil_str == "d3q19":
stencil = LBStencil(Stencil.D3Q19)
else:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup]
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
# This creates a simplified version of the central moment collision operator in which the bulk and shear
# viscosities are not separated. This is done to get a fair comparison with the monomial cumulants.
if lbm_config.method == Method.CENTRAL_MOMENT:
lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
# This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To prevent compiler
# optimisations, the PDFs are shuffled inside each cell; otherwise, for common streaming patterns, compilers
# would typically remove the copy of the centre PDF, which leads to an overestimation of the maximum performance.
stream_only_kernel = []
for i in range(stencil.Q):
stream_only_kernel.append(ps.Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
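# e.g. for D3Q19 this generates pdfs(0) <- pdfs(3), pdfs(1) <- pdfs(4), ..., pdfs(16) <- pdfs(0),
# pdfs(17) <- pdfs(1), pdfs(18) <- pdfs(2)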
# LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([0.05, 0, 0], data_type=field_type))
generate_lbm_package(ctx, name="UniformGridCPU",
collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec,
base_pointer_specification=base_pointer_spec)
# Stream only kernel
cpu_vec_stream = None
if ctx.optimize_for_localhost:
cpu_vec_stream = {"instruction_set": "best", "nontemporal": True,
"assume_aligned": True, 'assume_sufficient_line_padding': True,
"assume_inner_stride_one": True}
generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
target=ps.Target.CPU, cpu_openmp=openmp,
cpu_vectorize_info=cpu_vec_stream, base_pointer_specification=[['spatialInner0'], ['spatialInner1']])
infoHeaderParams = {
'stencil': stencil_str,
'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup,
'vec': int(True if cpu_vec else False),
'nt_stores': int(nt_stores),
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
field_typedefs = {'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'UniformGridCPU_InfoHeader',
field_typedefs=field_typedefs,
additional_code=info_header.format(**infoHeaderParams))
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 cells per process.
# If twice as many cells are on a process, half as many time steps are run, etc.
# Increase this to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = 10
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block
if time_steps < 5:
time_steps = 5
return int(time_steps)
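# e.g. with the default TIME_STEPS_FOR_128_BLOCK = 10: num_time_steps((128, 128, 128)) == 10
# and num_time_steps((64, 64, 64)) == 80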
ldc_setup = {'Border': [
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks_per_process = blocks_per_process
self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
self.time_step_strategy = time_step_strategy
self.omega = omega
self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
self.inner_outer_split = inner_outer_split
self.init_shear_flow = init_shear_flow
self.boundary_setup = boundary_setup
self.warmup_steps = warmup_steps
self.outer_iterations = outer_iterations
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
from pprint import pformat
config_dict = {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
'cartesianSetup': (self.blocks_per_process == 1)
},
'Parameters': {
'omega': self.omega,
'warmupSteps': self.warmup_steps,
'outerIterations': self.outer_iterations,
'timeStepStrategy': self.time_step_strategy,
'timesteps': self.timesteps,
'initShearFlow': self.init_shear_flow,
'innerOuterSplit': self.inner_outer_split,
'vtkWriteFrequency': self.vtk_write_frequency,
'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# retry a few times; the write may fail when multiple benchmark processes access the database concurrently
table_name = "runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
# -------------------------------------- Profiling -----------------------------------
def profiling():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running 2 timesteps for profiling")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells = (128, 128, 128)
scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
inner_outer_split=(1, 1, 1), timesteps=2,
outer_iterations=1, warmup_steps=0))
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def overlap_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running different communication overlap strategies")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
inner_outer_splits = [(1, 1, 1), (4, 1, 1)]
cells_per_block = [(i, i, i) for i in (16, 32, 64, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
for inner_outer_split in inner_outer_splits:
scenario = Scenario(time_step_strategy='simpleOverlap',
inner_outer_split=inner_outer_split,
cells_per_block=cell_per_block)
scenarios.add(scenario)
def weak_scaling_benchmark():
wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(time_step_strategy=t,
inner_outer_split=(1, 1, 1),
cells_per_block=(WeakX, WeakY, WeakZ),
boundary_setup=True,
outer_iterations=1,
db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
def strong_scaling_benchmark():
wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
domain_size = (StrongX, StrongY, StrongZ)
blocks = block_decomposition(wlb.mpi.numProcesses())
cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
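# e.g. a (128, 128, 128) domain on 8 processes (assuming block_decomposition yields (2, 2, 2))
# gives cells_per_block = (64, 64, 64)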
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
time_step_strategy=t,
outer_iterations=1,
timesteps=10,
boundary_setup=True,
db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
def single_node_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single Node benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(128, 128, 128),
time_step_strategy='kernelOnly',
outer_iterations=1,
timesteps=10)
scenarios.add(scenario)
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
wlb.log_info_on_root("")
time_step_strategy = "noOverlap" # "noOverlap"
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(64, 64, 64),
time_step_strategy=time_step_strategy,
timesteps=201,
outer_iterations=1,
warmup_steps=0,
init_shear_flow=False,
boundary_setup=True,
vtk_write_frequency=50,
remaining_time_logger_frequency=10)
scenarios.add(scenario)
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run
# single_node_benchmark() # measures single-node performance of the compute
# kernel only (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
# validation_run()
# scaling_benchmark()
if BENCHMARK == 0:
single_node_benchmark()
elif BENCHMARK == 1:
weak_scaling_benchmark()
elif BENCHMARK == 2:
strong_scaling_benchmark()
else:
validation_run()
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
from pprint import pformat
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 cells per process.
# If twice as many cells are on a process, half as many time steps are run, etc.
# Increase this to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = int(os.environ.get('TIME_STEPS_FOR_128_BLOCK', 100))
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
"""
Calculate the number of time steps based on the block size.
This function computes the number of time steps required for a given block size
by scaling the time steps that could be executed on one process within one second
for a 128x128x128 cells_per_block to the given cells_per_block size.
Parameters:
block_size (tuple): A tuple of three integers representing the dimensions of the cells_per_block (x, y, z).
time_steps_for_128_block (int, optional): The number of time steps for a 128x128x128 block. Default is 100.
Returns:
int: The calculated number of time steps, with a minimum value of 5.
"""
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block
if time_steps < 5:
time_steps = 5
return int(time_steps)
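# e.g. with the default of 100 time steps for a 128^3 block: num_time_steps((256, 256, 256)) == 12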
ldc_setup = {'Border': [
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks_per_process = blocks_per_process
self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
self.time_step_strategy = time_step_strategy
self.omega = omega
self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
self.inner_outer_split = inner_outer_split
self.init_shear_flow = init_shear_flow
self.boundary_setup = boundary_setup
self.warmup_steps = warmup_steps
self.outer_iterations = outer_iterations
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
config_dict = {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
'cartesianSetup': (self.blocks_per_process == 1)
},
'Parameters': {
'omega': self.omega,
'warmupSteps': self.warmup_steps,
'outerIterations': self.outer_iterations,
'timeStepStrategy': self.time_step_strategy,
'timesteps': self.timesteps,
'initShearFlow': self.init_shear_flow,
'innerOuterSplit': self.inner_outer_split,
'vtkWriteFrequency': self.vtk_write_frequency,
'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# retry a few times; the write may fail when multiple benchmark processes access the database concurrently
table_name = "runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def weak_scaling_benchmark():
wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
for t in ["simpleOverlap"]:
scenarios.add(Scenario(time_step_strategy=t,
inner_outer_split=(1, 1, 1),
cells_per_block=(WeakX, WeakY, WeakZ),
boundary_setup=True,
outer_iterations=1,
db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
def strong_scaling_benchmark():
wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
domain_size = (StrongX, StrongY, StrongZ)
blocks = block_decomposition(wlb.mpi.numProcesses())
cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
for t in ["simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
time_step_strategy=t,
outer_iterations=1,
timesteps=num_time_steps(cells_per_block),
boundary_setup=True,
db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
def single_node_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single Node benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(128, 128, 128),
time_step_strategy='kernelOnly',
outer_iterations=1,
timesteps=10)
scenarios.add(scenario)
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
wlb.log_info_on_root("")
time_step_strategy = "noOverlap" # "noOverlap"
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(64, 64, 64),
time_step_strategy=time_step_strategy,
timesteps=201,
outer_iterations=1,
warmup_steps=0,
init_shear_flow=False,
boundary_setup=True,
vtk_write_frequency=50,
remaining_time_logger_frequency=10)
scenarios.add(scenario)
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run
# single_node_benchmark() # measures single-node performance of the compute
# kernel only (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
# validation_run()
# scaling_benchmark()
if BENCHMARK == 0:
single_node_benchmark()
elif BENCHMARK == 1:
weak_scaling_benchmark()
elif BENCHMARK == 2:
strong_scaling_benchmark()
else:
validation_run()
waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" ) waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" ) waLBerla_link_files_to_builddir( "simulation_setup" )
foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 foreach(streaming_pattern pull push aa esotwist esopull esopush)
entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt foreach(stencil d3q19 d3q27)
cumulant cumulant_d3q27 foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt) # KBC methods only for D2Q9 and D3Q27 defined
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} continue()
FILE UniformGridGPU.py endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
CODEGEN_CFG ${config} if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h continue()
UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h set(config ${stencil}_${streaming_pattern}_${collision_setup})
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h FILE UniformGridGPU.py
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h CODEGEN_CFG ${config}
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h OUT_FILES UniformGridGPUStorageSpecification.h UniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX}
UniformGridGPU_Defines.h UniformGridGPUSweepCollection.h UniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX}
) NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
UBB.h UBB.${CODEGEN_FILE_SUFFIX}
UniformGridGPUBoundaryCollection.h
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config} UniformGridGPU_StreamOnlyKernel.h UniformGridGPU_StreamOnlyKernel.${CODEGEN_FILE_SUFFIX}
FILES UniformGridGPU.cpp UniformGridGPU_InfoHeader.h
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config}) )
set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach () waLBerla_add_executable(NAME UniformGridGPU_${config}
FILES UniformGridGPU.cpp
DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::gpu walberla::domain_decomposition walberla::field walberla::geometry walberla::lbm_generated walberla::python_coupling walberla::timeloop walberla::vtk UniformGridGPUGenerated_${config} )
foreach (config srt trt mrt smagorinsky entropic)
# all configs are excluded from all except for pull d3q27.
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config} if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
FILE UniformGridGPU_AA.py set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
CODEGEN_CFG ${config} set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h else()
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h endforeach ()
UniformGridGPU_AA_Defines.h endforeach()
) endforeach()
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config})
set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach ()
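
# Note (usage sketch; the target name is just an example): only the pull / d3q27 variants are part of the
# default "all" target, see EXCLUDE_FROM_ALL above. Any other variant can still be built explicitly, e.g.
#   cmake --build . --target UniformGridGPU_d3q19_aa_cumulant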
@@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks,
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
      Cell globalCell;
      blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
      const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
      velField->get(x, y, z, 1) = real_t(0);
      velField->get(x, y, z, 2) = randomReal;
...
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "core/Environment.h" #include "core/Environment.h"
#include "core/logging/Initialization.h" #include "core/logging/Initialization.h"
#include "core/math/Random.h" #include "core/timing/RemainingTimeLogger.h"
#include "python_coupling/CreateConfig.h" #include "core/timing/TimingPool.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h" #include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/vtk/VTKWriter.h" #include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "cuda/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "geometry/InitBoundaryHandling.h"
using namespace walberla; #include "gpu/AddGPUFieldToStorage.h"
#include "gpu/DeviceSelectMPI.h"
#include "gpu/FieldCopy.h"
#include "gpu/GPUWrapper.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/ParallelStreams.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "lbm_generated/field/PdfField.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
#include "lbm_generated/gpu/GPUPdfField.h"
#include "lbm_generated/gpu/AddToStorage.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
using LatticeModel_T = lbm::UniformGridGPU_LatticeModel; #include "timeloop/SweepTimeloop.h"
const auto Q = LatticeModel_T::Stencil::Q; #include <cmath>
#include "InitShearVelocity.h"
#include "UniformGridGPU_InfoHeader.h"
using Stencil_T = LatticeModel_T::Stencil; using namespace walberla;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
using StorageSpecification_T = lbm::UniformGridGPUStorageSpecification;
using Stencil_T = lbm::UniformGridGPUStorageSpecification::Stencil;
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID, using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) ) using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
{ using FlagField_T = FlagField< uint8_t >;
math::seedRandomGenerator(0); using BoundaryCollection_T = lbm::UniformGridGPUBoundaryCollection< FlagField_T >;
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks) using SweepCollection_T = lbm::UniformGridGPUSweepCollection;
{
auto velField = block.getData<VelocityField_T>( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
velField->get(x, y, z, 0) = xMagnitude;
} else {
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
}
using gpu::communication::UniformGPUScheme;
int main( int argc, char **argv ) using macroFieldType = VelocityField_T::value_type;
int main(int argc, char** argv)
{ {
mpi::Environment env( argc, argv ); mpi::Environment const env(argc, argv);
cuda::selectDeviceBasedOnMpiRank(); gpu::selectDeviceBasedOnMpiRank();
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{ {
WALBERLA_MPI_WORLD_BARRIER(); WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto config = *cfg; auto config = *cfg;
logging::configureLogging( config ); logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig( config ); auto blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters // Reading parameters
auto parameters = config->getOneBlock( "Parameters" ); auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
// Creating fields // Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx); const StorageSpecification_T StorageSpec = StorageSpecification_T();
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx); const BlockDataID pdfFieldCpuID = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx);
if( timeStepStrategy != "kernelOnlyNoInit")
{
if ( initShearFlow )
{
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
}
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
const BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
const BlockDataID densityFieldCpuID = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, uint_c(1), allocator);
const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
// Boundaries // Initialize velocity on cpu
const FlagUID fluidFlagUID( "Fluid" ); if (initShearFlow)
auto boundariesConfig = config->getBlock( "Boundaries" );
bool disableBoundaries = true;
if( boundariesConfig )
{ {
disableBoundaries = false; WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); initShearVelocity(blocks, velFieldCpuID);
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
} }
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); const BlockDataID pdfFieldGpuID = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); const BlockDataID velFieldGpuID =
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); const BlockDataID densityFieldGpuID =
noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "velocity on GPU", true);
// Communication setup
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
CommunicationSchemeType communicationScheme;
if( communicationSchemeStr == "GPUPackInfo_Baseline")
communicationScheme = GPUPackInfo_Baseline;
else if (communicationSchemeStr == "GPUPackInfo_Streams")
communicationScheme = GPUPackInfo_Streams;
else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
communicationScheme = UniformGPUScheme_Baseline;
else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
for(uint_t i=0; i< 3; ++i) Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit);
for (auto& block : *blocks)
{ {
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { sweepCollection.initialise(&block);
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
} }
int streamHighPriority = 0; int streamHighPriority = 0;
int streamLowPriority = 0; int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
WALBERLA_CHECK(gpuBlockSize[2] == 1); sweepCollection.setOuterPriority(streamHighPriority);
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, /// LB SWEEPS AND BOUNDARY HANDLING ///
gpuBlockSize[0], gpuBlockSize[1], //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID);
lbKernel.setOuterPriority( streamHighPriority );
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
uint_t currentTimeStep = 0;
auto simpleOverlapTimeStep = [&] ()
{
gpuComm.startCommunication(defaultStream);
for( auto &block: *blocks )
lbKernel.inner( &block, defaultStream );
gpuComm.wait(defaultStream);
for( auto &block: *blocks )
lbKernel.outer( &block, defaultStream );
};
auto overlapTimeStep = [&]()
{
cuda::NvtxRange namedRange("timestep");
auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
innerOuterSection.run([&]( auto innerStream ) // Boundaries
{ const FlagUID fluidFlagUID("Fluid");
cuda::nameStream(innerStream, "inner stream"); auto boundariesConfig = config->getBlock("Boundaries");
for( auto &block: *blocks ) if (boundariesConfig)
{
if(!disableBoundaries)
{
auto p = boundaryInnerStreams.parallelSection( innerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
}
lbKernel.inner( &block, innerStream );
}
});
innerOuterSection.run([&]( auto outerStream )
{
cuda::nameStream(outerStream, "outer stream");
gpuComm( outerStream );
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryOuterStreams.parallelSection( outerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
}
lbKernel.outer( &block, outerStream );
}
});
currentTimeStep += 1;
};
auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
auto normalTimeStep = [&]()
{
gpuComm();
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryStreams.parallelSection( defaultStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
}
lbKernel( &block );
}
};
auto kernelOnlyFunc = [&] ()
{ {
for( auto &block: *blocks ) WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
lbKernel( &block ); geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
std::function<void()> timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function<void()>( normalTimeStep );
else if (timeStepStrategy == "complexOverlap")
timeStep = std::function<void()>( overlapTimeStep );
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
} }
else { geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'"); BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme< Stencil_T > communication(blocks, cudaEnabledMPI);
auto packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUPdfField_T >>(pdfFieldGpuID);
communication.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
if (timeStepStrategy == "noOverlap") {
if (boundariesConfig){
timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");
}else {
timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");}
} else if (timeStepStrategy == "simpleOverlap") {
if (boundariesConfig){
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame");
}else{
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");}
} else if (timeStepStrategy == "kernelOnly") {
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");
} else if (timeStepStrategy == "StreamOnly") {
timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only");
} else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'")
} }
timeLoop.add() << BeforeFunction( timeStep ) //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
<< Sweep( []( IBlock * ) {}, "time step" ); /// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
// VTK // VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if( vtkWriteFrequency > 0 ) if (vtkWriteFrequency > 0)
{ {
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 ); "simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter); vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction( [&]() {
cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); vtkOutput->addBeforeFunction([&]() {
for( auto & block : *blocks ) for (auto& block : *blocks)
getterSweep( &block ); sweepCollection.calculateMacroscopicParameters(&block);
gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, velFieldCpuID, velFieldGpuID);
}); });
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID);
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 ); const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2));
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 ); const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1));
for(int i=0; i < warmupSteps; ++i ) for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop.singleStep(); timeLoop.singleStep();
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds auto remainingTimeLoggerFrequency =
if (remainingTimeLoggerFrequency > 0) { parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency ); if (remainingTimeLoggerFrequency > 0)
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{ {
GUI gui( timeLoop, blocks, argc, argv); auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
lbm::connectToGui<LatticeModel_T>(gui); remainingTimeLoggerFrequency);
gui.run(); timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
} }
else
for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
{ {
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) WALBERLA_GPU_CHECK(gpuPeekAtLastError())
{
timeLoop.setCurrentTimeStepToZero(); timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer; WcTimingPool timeloopTiming;
cudaDeviceSynchronize(); WcTimer simTimer;
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start(); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
timeLoop.run(); WALBERLA_GPU_CHECK( gpuPeekAtLastError() )
cudaDeviceSynchronize(); WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
simTimer.end(); simTimer.start();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); timeLoop.run(timeloopTiming);
auto time = simTimer.last(); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); simTimer.end();
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); double time = simTimer.max();
WALBERLA_ROOT_SECTION() WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
{ performance.logResultOnRoot(timesteps, time);
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable()) const auto reducedTimeloopTiming = timeloopTiming.getReduced();
{ WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
const char * storagePattern = "twofield";
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); WALBERLA_ROOT_SECTION()
pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); {
pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); python_coupling::PythonCallback pythonCallbackResults("results_callback");
pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern ); if (pythonCallbackResults.isCallable())
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); {
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
// Call Python function to report results pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults(); pythonCallbackResults.data().exposeValue("numCores", performance.cores());
} pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
} pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
} pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
} }
} }
return EXIT_SUCCESS;
return 0;
} }
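
// Note (assumption): the "results_callback" looked up above is presumably provided by the driving Python
// scenario script (see the benchmark scripts linked from simulation_setup), which stores the exposed values
// such as mlupsPerProcess, e.g. in the sqlite3 database configured via db_file_name.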
import sympy as sp
import numpy as np
import pystencils as ps

from dataclasses import replace

from pystencils import Assignment
from pystencils.typing import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions

from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, SubgridScaleModel
from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.moments import get_default_moment_set_for_stencil

from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator

omega = sp.symbols("omega")
omega_free = sp.Symbol("omega_free")

compile_time_block_size = False
max_threads = 256

if compile_time_block_size:
    sweep_block_size = (128, 1, 1)
else:
    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
                        TypedSymbol("cudaBlockSize1", np.int32),
                        TypedSymbol("cudaBlockSize2", np.int32))

gpu_indexing_params = {'block_size': sweep_block_size}

options_dict = {
    'srt': {
        'method': Method.SRT,
        'relaxation_rate': omega,
        'compressible': False,
    },
    'trt': {
        'method': Method.TRT,
        'relaxation_rate': omega,
        'compressible': False,
    },
    'mrt': {
        'method': Method.MRT,
        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
        'compressible': False,
    },
    'mrt-overrelax': {
        'method': Method.MRT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': False,
    },
    'central': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rate': omega,
        'compressible': True,
    },
    'central-overrelax': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': True,
    },
    'cumulant': {
        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
    },
    'cumulant-overrelax': {
        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
    'cumulant-K17': {
        'method': Method.CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
        'fourth_order_correction': 0.01
    },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
        'zero_centered': False,
        'relaxation_rates': [omega, omega_free],
        'entropic': True,
        'entropic_newton_iterations': False
    },
    'smagorinsky': {
        'method': Method.SRT,
        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
        'relaxation_rate': omega,
    },
    'qr': {
        'method': Method.SRT,
        'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
}

info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""

# DEFAULTS
optimize = True

with CodeGeneration() as ctx:
    pdf_data_type = "float64"
    field_data_type = "float64"

    config_tokens = ctx.config.split('_')
    assert len(config_tokens) >= 3
    stencil_str = config_tokens[0]
    streaming_pattern = config_tokens[1]
    collision_setup = config_tokens[2]

    if len(config_tokens) >= 4:
        optimize = (config_tokens[3] != 'noopt')

    if stencil_str == "d3q27":
        stencil = LBStencil(Stencil.D3Q27)
    elif stencil_str == "d3q19":
        stencil = LBStencil(Stencil.D3Q19)
    else:
        raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")

    assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"

    options = options_dict[collision_setup]

    assert stencil.D == 3, "This application supports only three-dimensional stencils"
    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_data_type}[3D]", layout='fzyx')
    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_data_type}[3D]", layout='fzyx')
    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

    lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
    lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')

    if lbm_config.method == Method.CENTRAL_MOMENT:
        lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))

    if not is_inplace(streaming_pattern):
        lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
        field_swaps = [(pdfs, pdfs_tmp)]
    else:
        field_swaps = []

    # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
    # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
    # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
    stream_only_kernel = []
    for i in range(stencil.Q):
        stream_only_kernel.append(Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))

    # LB Sweep
    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)

    if optimize:
        collision_rule = insert_fast_divisions(collision_rule)
        collision_rule = insert_fast_sqrts(collision_rule)

    lb_method = collision_rule.method

    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
                                     boundary_object=NoSlip(), field_data_type=pdf_data_type)
    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
                                 boundary_object=UBB([0.05, 0, 0], data_type=field_data_type),
                                 field_data_type=pdf_data_type)

    generate_lbm_package(ctx, name="UniformGridGPU",
                         collision_rule=collision_rule,
                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
                         nonuniform=False, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
                         target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
                         data_type=field_data_type, pdfs_data_type=pdf_data_type,
                         max_threads=max_threads)

    # Stream only kernel
    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel,
                   gpu_indexing_params={'block_size': (128, 1, 1)}, target=ps.Target.GPU,
                   max_threads=max_threads)

    infoHeaderParams = {
        'stencil': stencil_str,
        'streaming_pattern': streaming_pattern,
        'collision_setup': collision_setup,
        'cse_global': int(lbm_opt.cse_global),
        'cse_pdfs': int(lbm_opt.cse_pdfs),
    }

    field_typedefs = {'VelocityField_T': velocity_field,
                      'ScalarField_T': density_field}

    # Info header containing correct template definitions for stencil and field
    generate_info_header(ctx, 'UniformGridGPU_InfoHeader',
                         field_typedefs=field_typedefs,
                         additional_code=info_header.format(**infoHeaderParams))
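
# Usage sketch (config string is just an example): the CODEGEN_CFG value set in CMakeLists.txt is parsed
# above, so a config such as "d3q19_aa_smagorinsky" selects the D3Q19 stencil, the in-place "aa" streaming
# pattern (no temporary PDF field / field swap) and the Smagorinsky setup from options_dict; an optional
# fourth token "noopt" disables the fast-math insertions.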
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "InitShearVelocity.h"
#include "gui/Gui.h"
#ifdef WALBERLA_ENABLE_GUI
#include "lbm/gui/PdfFieldDisplayAdaptor.h"
#endif
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
#include "UniformGridGPU_AA_MacroSetter.h"
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
#include "UniformGridGPU_AA_Defines.h"
#include <cmath>
using namespace walberla;
using CommunicationStencil_T = Stencil_T;
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
using VelocityField_T = GhostLayerField< real_t, 3 >;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
cuda::selectDeviceBasedOnMpiRank();
for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
for ( auto &block : *blocks )
setterSweep( &block );
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
for(uint_t i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
WALBERLA_CHECK( gpuBlockSize[2] == 1 );
using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
using cuda::communication::UniformGPUScheme;
KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
kernelEven.setOuterPriority( streamHighPriority );
kernelOdd .setOuterPriority( streamHighPriority );
auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto setupPhase = [&]() {
for ( auto &block: *blocks )
kernelEven( &block );
pullScheme->communicate();
for ( auto &block: *blocks )
kernelOdd( &block );
};
auto tearDownPhase = [&]() {
pushScheme->communicate();
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for ( auto &block : *blocks )
getterSweep( &block );
};
auto simpleOverlapTimeStep = [&]()
{
// Even
pushScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelEven.inner( &block, defaultStream );
pushScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelEven.outer( &block, defaultStream );
// Odd
pullScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelOdd.inner( &block, defaultStream );
pullScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelOdd.outer( &block, defaultStream );
};
auto normalTimeStep = [&]()
{
pushScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
pullScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
auto kernelOnlyFunc = [&]()
{
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" );
std::function< void() > timeStep;
if ( timeStepStrategy == "noOverlap" )
timeStep = std::function< void() >( normalTimeStep );
else if ( timeStepStrategy == "simpleOverlap" )
timeStep = simpleOverlapTimeStep;
else if ( timeStepStrategy == "kernelOnly" )
{
WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" )
timeStep = kernelOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" );
}
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
if ( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" );
vtkOutput->addCellDataWriter( velWriter );
vtkOutput->addBeforeFunction( [&]()
{
tearDownPhase();
setupPhase();
} );
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
setupPhase();
for ( int i = 0; i < warmupSteps; ++i )
timeLoop.singleStep();
double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if ( remainingTimeLoggerFrequency > 0 )
{
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
#ifdef WALBERLA_ENABLE_GUI
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
GUI gui( timeLoop, blocks, argc, argv);
gui.registerDisplayAdaptorCreator(
[&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
if ( block.isDataOfType< PdfField_T >( blockDataID) )
return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
return nullptr;
});
gui.run();
#else
WALBERLA_ABORT_NO_DEBUG_INFO("Application was built without GUI. Set useGui to false or re-compile with GUI.")
#endif
}
else
{
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
const char * storagePattern = "aa";
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
}
return 0;
}
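
// Note: the AA ("in-place") variant above advances two lattice time steps per loop iteration
// (kernelEven followed by kernelOdd), which is why its SweepTimeloop is constructed with timesteps / 2.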
#!/usr/bin/env python3
import os
from waLBerla.tools.config import block_decomposition
job_script_header = """
#!/bin/bash -l
#SBATCH --job-name=scaling
#SBATCH --time=01:00:00
#SBATCH --nodes={nodes}
#SBATCH -o out_scaling_{nodes}_%j.txt
#SBATCH -e err_scaling_{nodes}_%j.txt
#SBATCH --ntasks-per-core=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=normal
#SBATCH --constraint=gpu
#SBATCH --account=s1042
source ~/env.sh
export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer
export CRAY_CUDA_MPS=1 # allow GPU sharing
export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export CRAY_CUDA_MPS=1
export MPICH_RANK_REORDER_METHOD=3
export PMI_MMAP_SYNC_WAIT_TIME=300
cd {folder}
# grid_order -R -H -c 1,1,8 -g 16,16,8
ulimit -c 0
"""
job_script_exe_part = """
export WALBERLA_SCENARIO_IDX=0
while srun -n {nodes} ./{app} {config}
do
((WALBERLA_SCENARIO_IDX++))
done
"""
streaming_patterns = ['pull', 'push', 'aa', 'esotwist']
stencils = ['d3q27', 'd3q19']
methods = ['srt', 'mrt', 'cumulant', 'entropic']
all_executables = []
for stencil in stencils:
for streaming_pattern in streaming_patterns:
for method in methods:
all_executables.append(f"UniformGridGPU_{stencil}_{streaming_pattern}_{method}")
all_executables = tuple(all_executables)
def generate_jobscripts(exe_names=all_executables):
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
folder_name = "scaling_{:04d}".format(node_count)
os.makedirs(folder_name, exist_ok=True)
# run grid_order
import subprocess
decomposition = block_decomposition(node_count)
decomposition_str = ",".join(str(e) for e in decomposition)
subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
for exe in exe_names:
job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count,
config='../communication_compare.py')
with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
f.write(job_script)
if __name__ == '__main__':
print("Called without waLBerla - generating job scripts for PizDaint")
generate_jobscripts()
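
# Usage sketch: running this file directly writes one scaling_<NNNN>/job.sh per node count listed above;
# each job.sh keeps incrementing WALBERLA_SCENARIO_IDX and re-launching the app until srun exits with a
# non-zero status (presumably once the ScenarioManager has no scenario left for that index).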
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/math/Random.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/communication/GPUPackInfo.h"
#include "gpu/ParallelStreams.h"
#include "gpu/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "gpu/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "InitShearVelocity.h"
using namespace walberla;
using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
const auto Q = LatticeModel_T::Stencil::Q;
using Stencil_T = LatticeModel_T::Stencil;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = gpu::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
gpu::selectDeviceBasedOnMpiRank();
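// each iteration of this loop runs one scenario provided by the configuration (e.g. the Python config script)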
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
if( timeStepStrategy != "kernelOnlyNoInit")
{
if ( initShearFlow )
{
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// the setter sweep only initializes the interior of the domain - an initial communication is required here for push schemes to work
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
}
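// create the GPU PDF field from the initialized CPU field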
BlockDataID pdfFieldGpuID = gpu::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
// Boundaries
const FlagUID fluidFlagUID( "Fluid" );
auto boundariesConfig = config->getBlock( "Boundaries" );
bool disableBoundaries = true;
if( boundariesConfig )
{
disableBoundaries = false;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
}
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
// Communication setup
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
CommunicationSchemeType communicationScheme;
if( communicationSchemeStr == "GPUPackInfo_Baseline")
communicationScheme = GPUPackInfo_Baseline;
else if (communicationSchemeStr == "GPUPackInfo_Streams")
communicationScheme = GPUPackInfo_Streams;
else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
communicationScheme = UniformGPUScheme_Baseline;
else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
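// the split must leave a non-empty inner region: each block needs more than 2 * innerOuterSplit cells per direction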
for(uint_t i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
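// query the CUDA stream priority range; the outer/communication work gets higher priority so it can overlap with the bulk (inner) kernel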
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
WALBERLA_CHECK(gpuBlockSize[2] == 1);
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
gpuBlockSize[0], gpuBlockSize[1],
Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
lbKernel.setOuterPriority( streamHighPriority );
UniformGridGPU_Communication< CommunicationStencil_T, gpu::GPUField< double > >
gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
auto defaultStream = gpu::StreamRAII::newPriorityStream( streamLowPriority );
auto innerOuterStreams = gpu::ParallelStreams( streamHighPriority );
auto boundaryOuterStreams = gpu::ParallelStreams( streamHighPriority );
auto boundaryInnerStreams = gpu::ParallelStreams( streamHighPriority );
uint_t currentTimeStep = 0;
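// 'simpleOverlap' strategy: start communication, run the inner kernel on all blocks, wait for communication, then run the outer kernel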
auto simpleOverlapTimeStep = [&] ()
{
gpuComm.startCommunication(defaultStream);
for( auto &block: *blocks )
lbKernel.inner( &block, defaultStream );
gpuComm.wait(defaultStream);
for( auto &block: *blocks )
lbKernel.outer( &block, defaultStream );
};
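// 'complexOverlap' strategy: inner and outer parts run in separate parallel CUDA streams, with boundary sweeps on additional nested streams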
auto overlapTimeStep = [&]()
{
gpu::NvtxRange namedRange("timestep");
auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
innerOuterSection.run([&]( auto innerStream )
{
gpu::nameStream(innerStream, "inner stream");
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryInnerStreams.parallelSection( innerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
}
lbKernel.inner( &block, innerStream );
}
});
innerOuterSection.run([&]( auto outerStream )
{
gpu::nameStream(outerStream, "outer stream");
gpuComm( outerStream );
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryOuterStreams.parallelSection( outerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
}
lbKernel.outer( &block, outerStream );
}
});
currentTimeStep += 1;
};
auto boundaryStreams = gpu::ParallelStreams( streamHighPriority );
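// 'noOverlap' strategy: communicate first, then run boundary sweeps and the LB kernel block by block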
auto normalTimeStep = [&]()
{
gpuComm();
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryStreams.parallelSection( defaultStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
}
lbKernel( &block );
}
};
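// 'kernelOnly' strategy: run just the LB kernel, without communication or boundary handling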
auto kernelOnlyFunc = [&] ()
{
for( auto &block: *blocks )
lbKernel( &block );
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
std::function<void()> timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function<void()>( normalTimeStep );
else if (timeStepStrategy == "complexOverlap")
timeStep = std::function<void()>( overlapTimeStep );
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
}
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
}
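// the chosen time step runs as a BeforeFunction; the empty sweep only satisfies the timeloop's need for a per-block sweep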
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction( [&]() {
gpu::fieldCpy<PdfField_T, gpu::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for( auto & block : *blocks )
getterSweep( &block );
});
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
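// run a few untimed warm-up steps before the measurements start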
for(int i=0; i < warmupSteps; ++i )
timeLoop.singleStep();
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if (remainingTimeLoggerFrequency > 0) {
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
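// timed benchmark: each outer iteration re-runs the time loop and reports MLUPS per process, optionally to the Python results callback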
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
const char * storagePattern = "twofield";
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return 0;
}