Commit 9cde0d9f authored by Martin Bauer's avatar Martin Bauer

execution tree module - a more flexible time loop implementation

- allows for nested execution flow (i.e. solver loop inside time loop)
- timing tree integration
- parallel CUDA stream support
parent c2b58592
......@@ -4,6 +4,7 @@
#
###################################################################################################
waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition python_coupling field stencil BUILD_ONLY_IF_FOUND CUDA )
waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree python_coupling field stencil
BUILD_ONLY_IF_FOUND CUDA )
###################################################################################################
\ No newline at end of file
//==============================================================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ExecutionTreeGPU.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//==============================================================================================================================================================
#pragma once
#include "executiontree/ExecutionTree.h"
#include "ParallelStreams.h"
#include <cuda_runtime.h>
namespace walberla {
namespace executiontree {
// -------------------------------------- Forward Declarations ------------------------------------------------------------------------------------------------
using executiontree::IFunctionNode;
using executiontree::IFunctionNodePtr;
using executiontree::TimingTreePtr;
class SequenceCUDA;
class IFunctionNodeCUDA;
template<typename FunctorClass> class FunctorCUDA;
using IFunctionNodeCUDAPtr = shared_ptr<IFunctionNodeCUDA>;
// -------------------------------------- Public Interface ------------------------------------------------------------------------------------------------
template<typename FunctorType>
IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr );
shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList,
const std::string &name, cudaStream_t defaultStream = 0, bool parallel = false, int priority = 0,
const TimingTreePtr &timingTree = nullptr );
// -------------------------------------- Node Classes --------------------------------------------------------------------------------------------------------
/*! Interface of execution tree nodes that can additionally be invoked with a CUDA stream. */
class IFunctionNodeCUDA : public IFunctionNode
{
public:
   //! Executes the node; the stream is handed on to the concrete implementation
   virtual void operator()( cudaStream_t stream ) = 0;
};
/*! Host callback that starts the timer of the node passed as opaque pointer.
 *
 *  \param data pointer to a FunctorClass instance; its timingTree_ member must be set
 */
template<typename FunctorClass>
void CUDART_CB functorCUDAStartTimer(void *data)
{
   FunctorClass * const node = static_cast<FunctorClass *>( data );
   node->timingTree_->start( node->getName() );
}
/*! Host callback that stops the timer of the node passed as opaque pointer.
 *
 *  \param data pointer to a FunctorClass instance; its timingTree_ member must be set
 */
template<typename FunctorClass>
void CUDART_CB functorCUDAStopTimer(void *data)
{
   FunctorClass * const node = static_cast<FunctorClass *>( data );
   node->timingTree_->stop( node->getName() );
}
/*! Execution tree node wrapping a callable that is invoked together with a CUDA stream.
 *
 *  The callable is copied into the node. If a timing tree was supplied, host callbacks that
 *  start and stop a timer named after this node are enqueued on the stream around the call.
 */
template<typename FunctorType>
class FunctorCUDA : public IFunctionNodeCUDA
{
public:
   FunctorCUDA( const FunctorType &functor,
                const std::string &name,
                const TimingTreePtr &timingTree )
      : functor_( functor ), name_( name ), timingTree_( timingTree )
   {}

   //! Runs the node on stream 0
   void operator() () override { (*this)( 0 ); }

   //! Runs the wrapped callable on the given stream, optionally surrounded by timer callbacks
   void operator() (cudaStream_t stream) override
   {
      using Self = FunctorCUDA<FunctorType>;

      if ( timingTree_ )
      {
         WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer<Self>, this ) );
      }

      executiontree::internal::Caller<FunctorType>::call( functor_, stream );

      if ( timingTree_ )
      {
         WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer<Self>, this ) );
      }
   }

   //! Name given at construction, or "FunctorCUDA" if that name is empty
   const std::string getName() const override
   {
      return name_.empty() ? std::string( "FunctorCUDA" ) : name_;
   }

private:
   // the timer callbacks read timingTree_ directly
   friend void CUDART_CB functorCUDAStartTimer<FunctorCUDA<FunctorType> >(void *data);
   friend void CUDART_CB functorCUDAStopTimer<FunctorCUDA<FunctorType> >(void *data);

   FunctorType functor_;
   std::string name_;
   shared_ptr< WcTimingTree > timingTree_;
};
class SequenceCUDA : public IFunctionNodeCUDA
{
public:
SequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, const std::string &name, cudaStream_t defaultStream,
bool parallel = false, int priority=0,
const TimingTreePtr &timingTree = nullptr)
: name_( name ), defaultStream_( defaultStream), timingTree_( timingTree ), parallelStreams_( priority ), parallel_( parallel ), priority_(priority)
{
for ( auto &e : initializerList )
children_.push_back( e );
}
void operator() (cudaStream_t stream) override
{
if ( timingTree_ ) {
WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer< SequenceCUDA >, this ));
}
if( parallel_ )
{
auto parallelSection = parallelStreams_.parallelSection( stream );
for ( auto &el : children_ )
{
( *el )( parallelSection.stream());
parallelSection.next();
}
}
else
for ( auto &el : children_ )
(*el)( stream );
if ( timingTree_ ) {
WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer< SequenceCUDA >, this ));
}
}
void operator() () override { (*this)( defaultStream_ ); }
void push_back( const IFunctionNodeCUDAPtr &fct ) { children_.push_back( fct ); }
void push_front( const IFunctionNodeCUDAPtr &fct ) { children_.push_front( fct ); }
const std::string getName() const override { return name_ != "" ? name_ : "ParallelSequenceCUDA"; };
const std::deque< IFunctionNodePtr > getChildren() const override {
std::deque< IFunctionNodePtr > result;
for( auto & c : children_ )
result.push_back( c );
return result;
};
private:
friend void CUDART_CB functorCUDAStartTimer< SequenceCUDA >( void *data );
friend void CUDART_CB functorCUDAStopTimer< SequenceCUDA >( void *data );
std::string name_;
cudaStream_t defaultStream_;
std::deque< IFunctionNodeCUDAPtr > children_;
shared_ptr< WcTimingTree > timingTree_;
cuda::ParallelStreams parallelStreams_;
bool parallel_;
int priority_;
};
/*! Creates a FunctorCUDA node around the given callable (the callable is copied into the node). */
template<typename FunctorType>
IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree )
{
   using NodeType = FunctorCUDA<FunctorType>;
   return make_shared< NodeType >( t, name, timingTree );
}
/*! Creates a SequenceCUDA node from a list of CUDA-aware child nodes.
 *
 *  \param initializerList children, executed in the passed order
 *  \param name            name of the sequence node
 *  \param defaultStream   stream used when the node is called without an explicit stream
 *  \param parallel        if true, children are distributed over the streams of a parallel section
 *  \param priority        forwarded to the cuda::ParallelStreams member of the node
 *  \param timingTree      optional timing tree
 *
 *  Declared inline: this is a non-template function defined in a header, so without inline
 *  every translation unit including the header would emit its own definition and linking fails.
 */
inline shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList,
                                                const std::string &name, cudaStream_t defaultStream, bool parallel, int priority,
                                                const TimingTreePtr &timingTree )
{
   return make_shared< SequenceCUDA >( initializerList, name, defaultStream, parallel, priority, timingTree );
}
} // namespace executiontree
} // namespace walberla
//==============================================================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ExecutionTreeSweepGPU.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//==============================================================================================================================================================
#include "domain_decomposition/IBlock.h"
#include "executiontree/ExecutionTree.h"
#include "ExecutionTreeGPU.h"
namespace walberla {
namespace executiontree {
template<typename FunctorType>
IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr );
template<typename FunctorType>
IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name = "",
const TimingTreePtr &tt = nullptr );
template<typename FunctorType>
class SweepCUDA : public IFunctionNodeCUDA
{
public:
SweepCUDA( BlockStorage &bs,
const FunctorType &functor,
const std::string &name,
const TimingTreePtr &timingTree )
: blockStorage_( bs ),
functor_( functor ),
name_( name ),
timingTree_( timingTree ) {}
SweepCUDA( const shared_ptr <StructuredBlockStorage> &bs,
const FunctorType &functor,
const std::string &name,
const TimingTreePtr &timingTree )
: blockStorage_( bs->getBlockStorage()),
functor_( functor ),
name_( name ),
timingTree_( timingTree ) {}
void operator() () override { (*this)( 0 ); }
void operator()( cudaStream_t stream ) override
{
if ( timingTree_ )
{
for ( auto &block: blockStorage_ )
{
timingTree_->start( name_ );
executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream );
timingTree_->stop( name_ );
}
}
else
for ( auto &block: blockStorage_ )
executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream );
}
const std::string getName() const override { return name_ != "" ? name_ : "Sweep"; };
private:
BlockStorage &blockStorage_;
FunctorType functor_;
std::string name_;
TimingTreePtr timingTree_;
};
/*! Creates a SweepCUDA node that calls the functor for every block of the given block storage.
 *
 *  \param bs         block storage to iterate over; the node keeps a reference to it
 *  \param t          callable, copied into the node
 *  \param name       optional node name
 *  \param timingTree optional timing tree
 *
 *  The functor is taken by const reference so that this definition matches the earlier
 *  declaration carrying the default arguments. Taking it by value defined a second, unrelated
 *  overload: the declared one stayed undefined and calls with all arguments were ambiguous.
 */
template<typename FunctorType>
IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, const FunctorType & t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree )
{
   return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree );
}
/*! Creates a SweepCUDA node for the block storage held by a structured block storage. */
template<typename FunctorType>
IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name,
                                const TimingTreePtr &timingTree )
{
   using NodeType = SweepCUDA<FunctorType>;
   return make_shared< NodeType >( bs, t, name, timingTree );
}
} // namespace executiontree
} // namespace walberla
......@@ -35,15 +35,15 @@ namespace cuda {
~ParallelSection();
void run( const std::function<void( cudaStream_t )> &f );
cudaStream_t stream();
void next();
private:
friend class ParallelStreams;
ParallelSection( ParallelStreams *parent, cudaStream_t mainStream );
void synchronize();
cudaStream_t stream();
void next();
ParallelStreams * parent_;
cudaStream_t mainStream_;
cudaEvent_t startEvent_;
......
###################################################################################################
#
# Module executiontree
#
###################################################################################################
# The executiontree module depends on the core, domain_decomposition and timeloop modules.
waLBerla_add_module( DEPENDS core domain_decomposition timeloop )
###################################################################################################
\ No newline at end of file
//==============================================================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ExecutionTree.cpp
//! \ingroup executiontree
//! \author Martin Bauer <martin.bauer@fau.de>
//
//==============================================================================================================================================================
#include <sstream>
#include <iostream>
#include "core/logging/Logging.h"
#include "core/OpenMP.h"
#include "ExecutionTree.h"
namespace walberla {
namespace executiontree {
using timeloop::ITimeloop;
// --------------------------- Logging Integration of Loop node -----------------------------------------------------------------------------------------------
class LoggingStamp : public logging::Logging::CustomStamp
{
public:
explicit LoggingStamp( const ITimeloop & timeloop ) : timeloop_( timeloop ) {}
std::string stamp() override
{
std::ostringstream oss;
int indention;
if( timeloop_.getNrOfTimeSteps() > 0 )
indention = int_c( std::ceil( std::log10( real_c( timeloop_.getNrOfTimeSteps() ) ) ) );
else if( timeloop_.getCurrentTimeStep() > 0 )
indention = int_c( std::ceil( std::log10( real_c( timeloop_.getCurrentTimeStep() ) ) ) );
else
indention = 0;
oss << std::setw( indention )
<< std::setfill(' ') << std::right << timeloop_.getCurrentTimeStep();
return std::string("[") + oss.str() + std::string("]");
}
uint_t maxStampWidth() override
{
if( timeloop_.getNrOfTimeSteps() > 0 )
return uint_c( std::ceil( std::log10( real_c( timeloop_.getNrOfTimeSteps() ) ) ) ) + uint_c(2);
else if( timeloop_.getCurrentTimeStep() > 0 )
return uint_c( std::ceil( std::log10( real_c( timeloop_.getCurrentTimeStep() ) ) ) ) + uint_c(2);
else
return uint_c(2);
}
private:
const ITimeloop & timeloop_;
};
/*! Scope guard: registers a custom logging stamp on construction and clears it on destruction.
 *
 *  Both steps are skipped when useCustomStamp is false.
 */
class LoggingStampManager
{
public:
   LoggingStampManager( const shared_ptr< LoggingStamp > & stamp, const bool useCustomStamp )
      : useCustomStamp_( useCustomStamp )
   {
      if( !useCustomStamp_ )
         return;

      logging::Logging::instance()->addCustomStamp( stamp );
   }

   ~LoggingStampManager()
   {
      if( !useCustomStamp_ )
         return;

      logging::Logging::instance()->clearCustomStamp();
   }

private:
   const bool useCustomStamp_;
};
// --------------------------- Printing ------------------------------------------------------------------------------------------------------------------------
void printNode( std::ostream &os, const IFunctionNode &node, uint_t indentation )
{
for ( uint_t i = 0; i < indentation; ++i )
os << " ";
os << node.getName() << std::endl;
for ( auto &c : node.getChildren())
printNode( os, *c, indentation + 4 );
}
std::ostream &operator<<( std::ostream &os, const IFunctionNode &node )
{
printNode( os, node, 0 );
return os;
}
// --------------------------- Node class implementation -------------------------------------------------------------------------------------------------------
//! Stores the wrapped node and the interval settings; the call counter starts at startValue
EveryNth::EveryNth( const IFunctionNodePtr &node, uint_t interval, bool onFirst, uint_t startValue )
   : wrapped_( node ),
     interval_( interval ),
     onFirst_( onFirst ),
     calls_( startValue )
{}
/*! Runs the wrapped node if the call counter is a multiple of the interval, then counts the call.
 *
 *  The call with counter value 0 is skipped when onFirst_ is false.
 */
void EveryNth::operator()()
{
   const bool skipInitialCall = ( calls_ == 0 ) && !onFirst_;

   // the modulo is only evaluated when the initial call is not skipped
   if ( !skipInitialCall && ( calls_ % interval_ ) == 0 )
      ( *wrapped_ )();

   ++calls_;
}
//! Returns a label of the form "every <interval>th step:"
const std::string EveryNth::getName() const
{
   std::ostringstream label;
   label << "every " << interval_ << "th step:";
   return label.str();
}
//! Copies the listed nodes into the child container, preserving their order
Sequence::Sequence( std::initializer_list< IFunctionNodePtr > initializerList, const std::string &name, const TimingTreePtr &timingTree, bool parallel )
   : name_( name ),
     children_( initializerList.begin(), initializerList.end() ),
     timingTree_( timingTree ),
     parallel_( parallel )
{}
/*! Executes all children, either one after another or (OpenMP builds, parallel_ set) concurrently.
 *
 *  If a timing tree is set, the whole execution is timed under name_.
 *
 *  The parallel path uses a work-shared loop with one child per chunk. OpenMP treats
 *  num_threads as a request only; with the loop every child is still executed exactly once
 *  when the runtime provides fewer threads (e.g. thread limits or disabled nested parallelism),
 *  whereas indexing children by thread number would silently skip children in that case.
 */
void Sequence::operator()()
{
#ifdef WALBERLA_BUILD_WITH_OPENMP
   if( parallel_ )
   {
      if ( timingTree_ )
         timingTree_->start( name_ );

      const int numChildren = int_c( children_.size() );

      // num_threads requires a positive value, so an empty sequence must not open a parallel region
      if ( numChildren > 0 )
      {
         #pragma omp parallel for schedule( static, 1 ) num_threads( numChildren )
         for ( int i = 0; i < numChildren; ++i )
         {
            ( *children_[ uint_c( i ) ] )();
         }
      }

      if ( timingTree_ )
         timingTree_->stop( name_ );
      return;
   }
#endif
   WALBERLA_UNUSED(parallel_);

   if ( timingTree_ )
      timingTree_->start( name_ );

   for ( auto &el : children_ )
   {
      ( *el )();
   }

   if ( timingTree_ )
      timingTree_->stop( name_ );
}
//! Creates a loop over the given body; the iteration counter starts at 0 and no stop is pending
Loop::Loop( const IFunctionNodePtr &body, uint_t iterations, bool logTimeStep )
   : body_( body ),
     currentIteration_( 0 ),
     iterations_( iterations ),
     stop_( false ),
     logTimeStep_( logTimeStep )
{}
void Loop::singleStep()
{
LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep_ );
( *body_ )();
++currentIteration_;
}
/*! Runs the body until the iteration counter reaches iterations_ or a stop was requested.
 *
 *  The counter is not reset on entry, so a call continues where a previous call ended.
 *  A pending stop request is consumed (reset to false) when it terminates the loop.
 *
 *  The counter is advanced before the stop check so that an iteration whose body has been
 *  executed is always counted, matching singleStep(). Breaking out before the increment
 *  left the counter one behind after a stop, so the same iteration index was run again
 *  on the next call.
 */
void Loop::operator()()
{
   LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep_ );

   while ( currentIteration_ < iterations_ )
   {
      ( *body_ )();
      ++currentIteration_;

      if ( stop_ )
      {
         stop_ = false;
         break;
      }
   }
}
// Stores the local stop request and then reduces it in place via mpi::allReduceInplace with
// mpi::LOGICAL_OR (presumably so that a stop requested on any process stops all of them -
// confirm against the mpi module). The flag is evaluated by Loop::operator() after each body call.
void Loop::synchronizedStop( bool stopVar )
{
stop_ = stopVar;
mpi::allReduceInplace( stop_, mpi::LOGICAL_OR );
}
//! Returns a label of the form "Loop [<iterations>]"
const std::string Loop::getName() const
{
   std::ostringstream label;
   label << "Loop [" << iterations_ << "]";
   return label.str();
}
} // namespace executiontree
} // namespace walberla
\ No newline at end of file
//==============================================================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ExecutionTree.h
//! \ingroup executiontree
//! \author Martin Bauer <martin.bauer@fau.de>
//
//==============================================================================================================================================================
#pragma once
#include "core/DataTypes.h"
#include "timeloop/ITimeloop.h"
#include "core/timing/TimingTree.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include <deque>
#include <string>
#include <initializer_list>
#include <functional>
namespace walberla {
namespace executiontree {
// -------------------------------------- Forward Declarations ------------------------------------------------------------------------------------------------
class IFunctionNode;
using IFunctionNodePtr = shared_ptr<IFunctionNode>;
using TimingTreePtr = shared_ptr<WcTimingTree>;
class EveryNth;
class Sequence;
class Loop;
template< typename FunctorType > class Functor;
template< typename FunctorType > class SharedFunctor;
template< typename FunctorType > class Sweep;
template< typename FunctorType > class SharedSweep;
// -------------------------------------- Public Interface ------------------------------------------------------------------------------------------------
/*! Creates a functor node around any callable object. The wrapped functor is copied.
*
* \param t any callable object. The object is copied - if its state has to be modified later, pass a shared_ptr to a functor instead
* \param name optional name of the functor node
* \param timingTree optional timing tree object to time all executions of this functor
*/
template<typename FunctorType>
IFunctionNodePtr functor( FunctorType t, const std::string &name = "", const shared_ptr< WcTimingTree > &timingTree = nullptr );
/*! Combine multiple task nodes into a (named) sequence
*
* \param initializerList list of tasks that are executed in the passed order
* \param name optional sequence name, used for printing and for labeling time measurements
* \param timingTree optional timing tree object
*/
shared_ptr< Sequence > sequence( std::initializer_list< IFunctionNodePtr > initializerList,
const std::string &name = "",
const TimingTreePtr &timingTree = nullptr );
/*! All subtasks of this region are executed in parallel using OpenMP */
shared_ptr< Sequence > parallelSequence( std::initializer_list< IFunctionNodePtr > initializerList,
const std::string &name = "",
const TimingTreePtr &timingTree = nullptr );
/*! Node that runs its contents only every n'th call
*
* \param node task that is only run every n'th call
* \param interval the interval i.e. "n"
* \param onFirst if false the task is not run at the first call
* \param startValue initial call counter
*/
shared_ptr< EveryNth > everyNth( const IFunctionNodePtr &node,
uint_t interval,
bool onFirst = false,
uint_t startValue = 0 );
/*! Runs the child node for the given amount of iterations */
shared_ptr< Loop > loop( const IFunctionNodePtr &body, uint_t iterations, bool logTimeStep = true );
std::ostream &operator<<( std::ostream &os, const IFunctionNode &node );
// -------------------------------------- Node Classes --------------------------------------------------------------------------------------------------------
/*! Base interface of all execution tree nodes: a named callable with optional child nodes. */
class IFunctionNode
{
public:
   virtual ~IFunctionNode() = default;

   //! Executes the node
   virtual void operator()() = 0;

   //! Name used when printing the tree
   virtual const std::string getName() const = 0;

   //! Child nodes; nodes without children keep this default, which returns an empty container
   virtual const std::deque< shared_ptr< IFunctionNode > > getChildren() const
   {
      return std::deque< shared_ptr< IFunctionNode > >();
   }
};
/*! Execution tree node that stores a callable by value, together with a name and an optional timing tree. */
template<typename FunctorType>
class Functor : public IFunctionNode
{
public:
   Functor( const FunctorType &functor,
            const std::string &name,
            const TimingTreePtr & timingTree );

   void operator() () override;

   //! Name given at construction, or "Functor" if that name is empty
   const std::string getName() const override
   {
      return name_.empty() ? std::string( "Functor" ) : name_;
   }

private:
   FunctorType functor_;
   std::string name_;
   shared_ptr< WcTimingTree > timingTree_;
};
/*! Execution tree node that forwards only some of its calls to a wrapped node, based on a call counter. */
class EveryNth : public IFunctionNode
{
public:
   EveryNth( const IFunctionNodePtr &node, uint_t interval, bool onFirst = false, uint_t startValue = 0 );

   void operator()() override;
   const std::string getName() const override;

   //! The wrapped node is the only child
   const std::deque< shared_ptr< IFunctionNode > > getChildren() const override
   {
      return std::deque< shared_ptr< IFunctionNode > >( 1, wrapped_ );
   }

private:
   IFunctionNodePtr wrapped_;   //!< node that is run on selected calls
   uint_t interval_;            //!< the "n" of every n'th call
   bool onFirst_;               //!< whether the call with counter value 0 runs the node
   uint_t calls_;               //!< call counter
};
/*! Execution tree node holding an ordered list of child nodes, with a name, an optional timing tree and a parallel flag. */
class Sequence : public IFunctionNode
{
public:
   Sequence( std::initializer_list< IFunctionNodePtr > initializerList, const std::string &name,
             const TimingTreePtr &timingTree = nullptr, bool parallel = false );

   void operator()() override;

   //! Appends a child at the end / inserts a child at the beginning
   void push_back ( const IFunctionNodePtr &fct ) { children_.push_back( fct );  }
   void push_front( const IFunctionNodePtr &fct ) { children_.push_front( fct ); }

   //! Name given at construction, or "Sequence" if that name is empty
   const std::string getName() const override
   {
      return name_.empty() ? std::string( "Sequence" ) : name_;
   }

   //! Returns a copy of the child list
   const std::deque< IFunctionNodePtr > getChildren() const override
   {
      return children_;
   }

private:
   std::string name_;
   std::deque< IFunctionNodePtr > children_;
   shared_ptr< WcTimingTree > timingTree_;
   bool parallel_;
};