From 9cde0d9fa7759409d2d2dc702c5c3ed6a65fc554 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Thu, 21 Feb 2019 08:25:37 +0100
Subject: [PATCH] execution tree module - a more flexible time loop
 implementation

- allows for nested execution flow (i.e. solver loop inside time loop)
- timing tree integration
- parallel CUDA stream support
---
 src/cuda/CMakeLists.txt                   |   3 +-
 src/cuda/ExecutionTreeGPU.h               | 190 +++++++++++++++++++
 src/cuda/ExecutionTreeSweepGPU.h          | 103 ++++++++++
 src/cuda/ParallelStreams.h                |   6 +-
 src/executiontree/CMakeLists.txt          |   9 +
 src/executiontree/ExecutionTree.cpp       | 220 ++++++++++++++++++++++
 src/executiontree/ExecutionTree.h         | 206 ++++++++++++++++++++
 src/executiontree/ExecutionTree.impl.h    | 109 +++++++++++
 src/executiontree/ExecutionTreeSweep.h    | 119 ++++++++++++
 tests/CMakeLists.txt                      |   1 +
 tests/executiontree/CMakeLists.txt        |   8 +
 tests/executiontree/ExecutionTreeTest.cpp |  68 +++++++
 12 files changed, 1038 insertions(+), 4 deletions(-)
 create mode 100644 src/cuda/ExecutionTreeGPU.h
 create mode 100644 src/cuda/ExecutionTreeSweepGPU.h
 create mode 100644 src/executiontree/CMakeLists.txt
 create mode 100644 src/executiontree/ExecutionTree.cpp
 create mode 100644 src/executiontree/ExecutionTree.h
 create mode 100644 src/executiontree/ExecutionTree.impl.h
 create mode 100644 src/executiontree/ExecutionTreeSweep.h
 create mode 100644 tests/executiontree/CMakeLists.txt
 create mode 100644 tests/executiontree/ExecutionTreeTest.cpp

diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index c4ff50133..98aa991f0 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -4,6 +4,7 @@
 #
 ###################################################################################################
 
-waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition python_coupling field stencil BUILD_ONLY_IF_FOUND CUDA )
+waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree python_coupling field stencil
+                     BUILD_ONLY_IF_FOUND CUDA )
 
 ###################################################################################################
\ No newline at end of file
diff --git a/src/cuda/ExecutionTreeGPU.h b/src/cuda/ExecutionTreeGPU.h
new file mode 100644
index 000000000..9f458f289
--- /dev/null
+++ b/src/cuda/ExecutionTreeGPU.h
@@ -0,0 +1,190 @@
+//==============================================================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ExecutionTreeGPU.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//==============================================================================================================================================================
+
+#pragma once
+
+#include "executiontree/ExecutionTree.h"
+#include "ParallelStreams.h"
+
+#include <cuda_runtime.h>
+
+namespace walberla {
+namespace executiontree {
+
+// -------------------------------------- Forward Declarations ------------------------------------------------------------------------------------------------
+
+using executiontree::IFunctionNode;
+using executiontree::IFunctionNodePtr;
+using executiontree::TimingTreePtr;
+
+class SequenceCUDA;
+class IFunctionNodeCUDA;
+template<typename FunctorClass> class FunctorCUDA;
+using IFunctionNodeCUDAPtr = shared_ptr<IFunctionNodeCUDA>;
+
+
+// -------------------------------------- Public Interface     ------------------------------------------------------------------------------------------------
+
+template<typename FunctorType>
+IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr );
+
+
+shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList,
+                                         const std::string &name, cudaStream_t defaultStream = 0, bool parallel = false, int priority = 0,
+                                         const TimingTreePtr &timingTree = nullptr );
+
+
+// -------------------------------------- Node Classes --------------------------------------------------------------------------------------------------------
+
+//! Interface of execution tree nodes that can additionally be invoked with an explicit CUDA stream
+class IFunctionNodeCUDA : public IFunctionNode
+{
+public:
+   virtual void operator()( cudaStream_t ) = 0;   //!< stream-taking call operator, implemented by the concrete CUDA nodes
+};
+
+template<typename FunctorClass>
+void CUDART_CB functorCUDAStartTimer(void *data)   // host function for cudaLaunchHostFunc: 'data' is the node whose timer is started
+{
+   FunctorClass * const node = static_cast<FunctorClass *>( data );
+   node->timingTree_->start( node->getName() );
+}
+
+template<typename FunctorClass>
+void CUDART_CB functorCUDAStopTimer(void *data)   // host function for cudaLaunchHostFunc: 'data' is the node whose timer is stopped
+{
+   FunctorClass * const node = static_cast<FunctorClass *>( data );
+   node->timingTree_->stop( node->getName() );
+}
+//! Leaf node that stores a copy of a callable and invokes it with a CUDA stream
+template<typename FunctorType>
+class FunctorCUDA : public IFunctionNodeCUDA
+{
+public:
+   FunctorCUDA( const FunctorType &functor,
+                const std::string &name,
+                const TimingTreePtr &timingTree )
+      : functor_( functor ), name_( name ), timingTree_( timingTree ) {}
+   // With a timing tree, timer start/stop are enqueued on 'stream' as host functions around the functor call.
+   void operator() (cudaStream_t stream) override
+   {
+      if ( timingTree_ )
+      {
+         WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer<FunctorCUDA<FunctorType> >, this ) );
+         executiontree::internal::Caller<FunctorType>::call( functor_, stream );
+         WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer<FunctorCUDA<FunctorType> >, this ) );
+      }
+      else
+         executiontree::internal::Caller<FunctorType>::call( functor_, stream );
+   }
+
+   const std::string getName() const override { return name_ != "" ? name_ : "FunctorCUDA"; };   // default label for unnamed nodes
+   void operator() () override {  (*this)( 0 );  }   // call without stream argument uses stream 0
+
+private:
+   friend void CUDART_CB functorCUDAStartTimer<FunctorCUDA<FunctorType> >(void *data);   // the timer callbacks access timingTree_
+   friend void CUDART_CB functorCUDAStopTimer<FunctorCUDA<FunctorType> >(void *data);
+
+   FunctorType functor_;
+   std::string name_;
+   shared_ptr< WcTimingTree > timingTree_;
+};
+
+//! Inner node that runs its CUDA children in order: all on the given stream, or - if 'parallel' - each on a stream handed out by a ParallelStreams section
+class SequenceCUDA : public IFunctionNodeCUDA
+{
+public:
+   SequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList, const std::string &name, cudaStream_t defaultStream,
+                 bool parallel = false, int priority=0,
+                 const TimingTreePtr &timingTree = nullptr)
+      : name_( name ), defaultStream_( defaultStream), timingTree_( timingTree ), parallelStreams_( priority ), parallel_( parallel ), priority_(priority)
+   {
+      for ( auto &e : initializerList )
+         children_.push_back( e );
+   }
+
+   // With a timing tree, timer start/stop are enqueued on 'stream' as host functions before and after the children.
+   void operator() (cudaStream_t stream) override
+   {
+      if ( timingTree_ ) {
+         WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStartTimer< SequenceCUDA >, this ));
+      }
+
+      if( parallel_ )
+      {
+         auto parallelSection = parallelStreams_.parallelSection( stream );   // section object obtained for 'stream'
+         for ( auto &el : children_ )
+         {
+            ( *el )( parallelSection.stream());
+            parallelSection.next();   // advance the section before the next child is called
+         }
+      }
+      else
+         for ( auto &el : children_ )
+            (*el)( stream );
+
+      if ( timingTree_ ) {
+         WALBERLA_CUDA_CHECK( cudaLaunchHostFunc( stream, functorCUDAStopTimer< SequenceCUDA >, this ));
+      }
+   }
+
+   void operator() () override {  (*this)( defaultStream_ );  }   // call without stream argument uses the default stream passed to the constructor
+   void push_back( const IFunctionNodeCUDAPtr &fct ) { children_.push_back( fct ); }
+   void push_front( const IFunctionNodeCUDAPtr &fct ) { children_.push_front( fct ); }
+   const std::string getName() const override { return name_ != "" ? name_ : "ParallelSequenceCUDA"; };   // NOTE(review): default label says "Parallel" for non-parallel sequences as well
+   const std::deque< IFunctionNodePtr > getChildren() const override {   // copies the children into a deque of base-class pointers
+      std::deque< IFunctionNodePtr > result;
+      for( auto & c : children_ )
+         result.push_back( c );
+      return result;
+   };
+
+private:
+   friend void CUDART_CB functorCUDAStartTimer< SequenceCUDA >( void *data );   // the timer callbacks access timingTree_
+   friend void CUDART_CB functorCUDAStopTimer< SequenceCUDA >( void *data );
+
+   std::string name_;
+   cudaStream_t defaultStream_;
+   std::deque< IFunctionNodeCUDAPtr > children_;
+   shared_ptr< WcTimingTree > timingTree_;
+   cuda::ParallelStreams parallelStreams_;
+   bool parallel_;
+   int priority_;
+};
+
+
+template<typename FunctorType>
+IFunctionNodeCUDAPtr functorCUDA( const FunctorType & t, const std::string &name, const shared_ptr< WcTimingTree > &timingTree )
+{
+   return make_shared<FunctorCUDA<FunctorType> >( t, name, timingTree );
+}
+
+// Factory for SequenceCUDA nodes. 'inline' is required: a non-template function defined in a header is otherwise multiply defined at link time.
+inline shared_ptr< SequenceCUDA > sequenceCUDA( std::initializer_list< IFunctionNodeCUDAPtr > initializerList,
+                                                const std::string &name, cudaStream_t defaultStream, bool parallel, int priority,
+                                                const TimingTreePtr &timingTree )
+{
+   return make_shared< SequenceCUDA >( initializerList, name, defaultStream, parallel, priority, timingTree );
+}
+
+
+} // namespace executiontree
+} // namespace walberla
diff --git a/src/cuda/ExecutionTreeSweepGPU.h b/src/cuda/ExecutionTreeSweepGPU.h
new file mode 100644
index 000000000..e5ad3d2a6
--- /dev/null
+++ b/src/cuda/ExecutionTreeSweepGPU.h
@@ -0,0 +1,103 @@
+//==============================================================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ExecutionTreeSweepGPU.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//==============================================================================================================================================================
+#pragma once
+
+#include "domain_decomposition/IBlock.h"
+#include "executiontree/ExecutionTree.h"
+#include "ExecutionTreeGPU.h"
+namespace walberla {
+namespace executiontree {
+
+
+template<typename FunctorType>
+IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, const FunctorType & t, const std::string &name = "", const TimingTreePtr &timingTree = nullptr );
+
+template<typename FunctorType>
+IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name = "",
+                                const TimingTreePtr &tt = nullptr );
+
+//! Node that calls a functor once per block of a BlockStorage, passing the block pointer and a CUDA stream
+template<typename FunctorType>
+class SweepCUDA : public IFunctionNodeCUDA
+{
+public:
+   SweepCUDA( BlockStorage &bs,
+              const FunctorType &functor,
+              const std::string &name,
+              const TimingTreePtr &timingTree )
+      : blockStorage_( bs ),
+        functor_( functor ),
+        name_( name ),
+        timingTree_( timingTree ) {}
+
+   SweepCUDA( const shared_ptr <StructuredBlockStorage> &bs,
+              const FunctorType &functor,
+              const std::string &name,
+              const TimingTreePtr &timingTree )
+      : blockStorage_( bs->getBlockStorage()),   // only the underlying BlockStorage is referenced, the shared_ptr itself is not kept
+        functor_( functor ),
+        name_( name ),
+        timingTree_( timingTree ) {}
+
+   void operator() () override {  (*this)( 0 );  }   // call without stream argument uses stream 0
+
+   void operator()( cudaStream_t stream ) override
+   {
+      if ( timingTree_ )
+      {
+         for ( auto &block: blockStorage_ )
+         {
+            timingTree_->start( name_ );   // NOTE(review): timed on the host, not enqueued on the stream as in FunctorCUDA - presumably covers only the launch of asynchronous functors, confirm
+            executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream );
+            timingTree_->stop( name_ );
+         }
+      }
+      else
+         for ( auto &block: blockStorage_ )
+            executiontree::internal::Caller<FunctorType>::call( functor_, &block, stream );
+   }
+
+   const std::string getName() const override { return name_ != "" ? name_ : "Sweep"; };   // default label for unnamed nodes
+
+private:
+   BlockStorage &blockStorage_;   // held by reference
+
+   FunctorType functor_;
+   std::string name_;
+   TimingTreePtr timingTree_;
+};
+// Factory for the BlockStorage overload. The signature has to match the declaration above (const FunctorType &); a by-value parameter would define a second, undeclared overload.
+template<typename FunctorType>
+IFunctionNodeCUDAPtr sweepCUDA( BlockStorage &bs, const FunctorType & t, const std::string &name, const TimingTreePtr &timingTree )
+{
+   return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree );
+}
+
+template<typename FunctorType>
+IFunctionNodeCUDAPtr sweepCUDA( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType & t, const std::string &name,
+                                const TimingTreePtr &timingTree )
+{
+   return make_shared<SweepCUDA<FunctorType> >( bs, t, name, timingTree );
+}
+
+
+} // namespace executiontree
+} // namespace walberla
diff --git a/src/cuda/ParallelStreams.h b/src/cuda/ParallelStreams.h
index 8f6348015..4116e0ef9 100644
--- a/src/cuda/ParallelStreams.h
+++ b/src/cuda/ParallelStreams.h
@@ -35,15 +35,15 @@ namespace cuda {
       ~ParallelSection();
       void run( const std::function<void( cudaStream_t )> &f );
 
+      cudaStream_t stream();
+      void next();
+
    private:
       friend class ParallelStreams;
 
       ParallelSection( ParallelStreams *parent, cudaStream_t mainStream );
       void synchronize();
 
-      cudaStream_t stream();
-      void next();
-
       ParallelStreams * parent_;
       cudaStream_t mainStream_;
       cudaEvent_t startEvent_;
diff --git a/src/executiontree/CMakeLists.txt b/src/executiontree/CMakeLists.txt
new file mode 100644
index 000000000..46737d9f9
--- /dev/null
+++ b/src/executiontree/CMakeLists.txt
@@ -0,0 +1,9 @@
+###################################################################################################
+#
+# Module executiontree
+#
+###################################################################################################
+
+waLBerla_add_module( DEPENDS core domain_decomposition timeloop )
+                        
+###################################################################################################                        
\ No newline at end of file
diff --git a/src/executiontree/ExecutionTree.cpp b/src/executiontree/ExecutionTree.cpp
new file mode 100644
index 000000000..b27389707
--- /dev/null
+++ b/src/executiontree/ExecutionTree.cpp
@@ -0,0 +1,220 @@
+//==============================================================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ExecutionTree.cpp
+//! \ingroup executiontree
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//==============================================================================================================================================================
+
+
+#include <sstream>
+#include <iostream>
+#include "core/logging/Logging.h"
+#include "core/OpenMP.h"
+#include "ExecutionTree.h"
+
+
+namespace walberla {
+namespace executiontree {
+
+using timeloop::ITimeloop;
+
+
+// --------------------------- Logging Integration of Loop node  -----------------------------------------------------------------------------------------------
+
+//! Custom log stamp of the form "[<current time step>]", right-aligned to a width derived from the number of time steps
+class LoggingStamp : public logging::Logging::CustomStamp
+{
+public:
+   explicit LoggingStamp( const ITimeloop & timeloop ) : timeloop_( timeloop ) {}   // only a reference to the time loop is stored
+   std::string stamp() override
+   {
+      std::ostringstream oss;
+      int indention;   // field width used for the step number
+
+      if( timeloop_.getNrOfTimeSteps() > 0 )
+         indention = int_c( std::ceil( std::log10( real_c( timeloop_.getNrOfTimeSteps() ) ) ) );   // width = ceil( log10( number of steps ) )
+      else if( timeloop_.getCurrentTimeStep() > 0 )
+         indention = int_c( std::ceil( std::log10( real_c( timeloop_.getCurrentTimeStep() ) ) ) );   // fallback: derive the width from the current step
+      else
+         indention = 0;
+
+      oss << std::setw( indention )
+          << std::setfill(' ') << std::right << timeloop_.getCurrentTimeStep();
+      return std::string("[") + oss.str() + std::string("]");
+   }
+   uint_t maxStampWidth() override
+   {
+      if( timeloop_.getNrOfTimeSteps() > 0 )
+         return uint_c( std::ceil( std::log10( real_c( timeloop_.getNrOfTimeSteps() ) ) ) ) + uint_c(2);   // same width as in stamp(), + 2 for the brackets
+      else if( timeloop_.getCurrentTimeStep() > 0 )
+         return uint_c( std::ceil( std::log10( real_c( timeloop_.getCurrentTimeStep() ) ) ) ) + uint_c(2);
+      else
+         return uint_c(2);
+   }
+private:
+   const ITimeloop & timeloop_;
+};
+//! Scope guard: adds the stamp to logging::Logging::instance() on construction and clears the custom stamp on destruction (both only if enabled)
+class LoggingStampManager
+{
+public:
+   LoggingStampManager( const shared_ptr< LoggingStamp > & stamp, const bool useCustomStamp )
+           : useCustomStamp_( useCustomStamp )
+   {
+      if( useCustomStamp_ )
+         logging::Logging::instance()->addCustomStamp( stamp );
+   }
+   ~LoggingStampManager()
+   {
+      if( useCustomStamp_ )
+         logging::Logging::instance()->clearCustomStamp();
+   }
+private:
+   const bool useCustomStamp_;   // if false, constructor and destructor do nothing
+};
+
+
+// --------------------------- Printing ------------------------------------------------------------------------------------------------------------------------
+
+void printNode( std::ostream &os, const IFunctionNode &node, uint_t indentation )
+{
+   for ( uint_t remaining = indentation; remaining > 0; --remaining )   // leading blanks
+      os << " ";
+   os << node.getName() << std::endl;
+   const auto children = node.getChildren();
+   for ( const auto &child : children )   // children are printed 4 columns deeper
+      printNode( os, *child, indentation + 4 );
+}
+
+std::ostream &operator<<( std::ostream &os, const IFunctionNode &node )
+{
+   printNode( os, node, 0 );
+   return os;
+}
+
+// --------------------------- Node class implementation -------------------------------------------------------------------------------------------------------
+
+
+EveryNth::EveryNth( const IFunctionNodePtr &node, uint_t interval, bool onFirst, uint_t startValue )
+        : wrapped_( node ), interval_( interval ), onFirst_( onFirst ), calls_( startValue ) {}
+
+// Counts every call and forwards to the wrapped node whenever the call counter is a multiple of interval_ (never for interval_ == 0).
+void EveryNth::operator()()
+{
+   if ( calls_ == 0 && !onFirst_ ) {   // optionally skip the call with counter 0
+      ++calls_;
+      return;
+   }
+
+   if ( interval_ != 0 && ( calls_ % interval_ ) == 0 )   // guard: modulo by zero is undefined behaviour
+      ( *wrapped_ )();
+   ++calls_;
+}
+
+const std::string EveryNth::getName() const
+{
+   std::ostringstream label;   // e.g. "every 5th step:"
+   label << "every " << interval_ << "th step:";
+   return label.str();
+}
+
+
+Sequence::Sequence( std::initializer_list< IFunctionNodePtr > initializerList, const std::string &name, const TimingTreePtr &timingTree, bool parallel )
+        : name_( name ),
+          children_( initializerList ),   // children keep the order of the list
+          timingTree_( timingTree ),
+          parallel_( parallel )
+{}
+// Runs all children: in an OpenMP work-sharing loop (one iteration per child) for parallel sequences, otherwise one after the other.
+void Sequence::operator()()
+{
+#ifdef WALBERLA_BUILD_WITH_OPENMP
+   if( parallel_ && !children_.empty() )   // num_threads( 0 ) is invalid, empty sequences take the serial path
+   {
+      if ( timingTree_ )
+         timingTree_->start( name_ );
+
+      // A work-sharing loop (instead of indexing the children by thread id) guarantees that every child
+      // runs even if the OpenMP runtime provides fewer threads than requested.
+      const int threads = int_c( children_.size() );
+      #pragma omp parallel for schedule( static, 1 ) num_threads( threads )
+      for ( int i = 0; i < threads; ++i )
+         ( *children_[ uint_c( i ) ] )();
+
+      if ( timingTree_ )
+         timingTree_->stop( name_ );
+
+      return;
+   }
+#endif
+   WALBERLA_UNUSED(parallel_);
+
+   if ( timingTree_ )
+      timingTree_->start( name_ );
+
+   for ( auto &el : children_ )
+   {
+      ( *el )();
+   }
+
+   if ( timingTree_ )
+      timingTree_->stop( name_ );
+}
+
+
+Loop::Loop( const IFunctionNodePtr &body, uint_t iterations, bool logTimeStep )
+        : body_( body ), currentIteration_( 0 ), iterations_( iterations ), stop_( false ), logTimeStep_( logTimeStep ) {}
+
+
+void Loop::singleStep()
+{
+   LoggingStampManager stampGuard( make_shared<LoggingStamp>( *this ), logTimeStep_ );   // stamp is active for this step only
+   body_->operator()();
+   currentIteration_ += uint_t( 1 );
+}
+// Runs the body until iterations_ is reached or a stop was requested; since the stop flag is reset, a stopped run can be continued by calling again.
+void Loop::operator()()
+{
+   LoggingStampManager raii( make_shared<LoggingStamp>( *this ), logTimeStep_ );
+   while ( currentIteration_ < iterations_ )
+   {
+      ( *body_ )();
+      ++currentIteration_;   // count the finished iteration before the stop check (as singleStep() does), so a continued run does not repeat it
+      if ( stop_ )
+      {
+         stop_ = false;
+         break;
+      }
+   }
+}
+
+void Loop::synchronizedStop( bool stopVar )
+{
+   stop_ = stopVar;
+   mpi::allReduceInplace( stop_, mpi::LOGICAL_OR );
+}
+
+const std::string Loop::getName() const
+{
+   std::ostringstream label;   // e.g. "Loop [100]"
+   label << "Loop [" << iterations_ << "]";
+   return label.str();
+}
+
+
+} // namespace executiontree
+} // namespace walberla
\ No newline at end of file
diff --git a/src/executiontree/ExecutionTree.h b/src/executiontree/ExecutionTree.h
new file mode 100644
index 000000000..99b3515a5
--- /dev/null
+++ b/src/executiontree/ExecutionTree.h
@@ -0,0 +1,206 @@
+//==============================================================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ExecutionTree.h
+//! \ingroup executiontree
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//==============================================================================================================================================================
+
+#pragma once
+
+#include "core/DataTypes.h"
+#include "timeloop/ITimeloop.h"
+#include "core/timing/TimingTree.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+
+#include <deque>
+#include <string>
+#include <initializer_list>
+#include <functional>
+
+
+namespace walberla {
+namespace executiontree {
+
+
+// -------------------------------------- Forward Declarations ------------------------------------------------------------------------------------------------
+
+class IFunctionNode;
+using IFunctionNodePtr = shared_ptr<IFunctionNode>;
+using TimingTreePtr = shared_ptr<WcTimingTree>;
+
+class EveryNth;
+class Sequence;
+class Loop;
+
+template< typename FunctorType > class Functor;
+template< typename FunctorType > class SharedFunctor;
+template< typename FunctorType > class Sweep;
+template< typename FunctorType > class SharedSweep;
+
+
+// -------------------------------------- Public Interface     ------------------------------------------------------------------------------------------------
+
+
+/*! Creates a functor node around any callable object. The wrapped functor is copied.
+ *
+ * \param t any callable object. The object is copied - if its state has to be modified later, pass a shared_ptr to a functor instead
+ * \param name optional name of the functor node
+ * \param timingTree optional timing tree object to time all executions of this functor
+ */
+template<typename FunctorType>
+IFunctionNodePtr functor( FunctorType t, const std::string &name = "", const shared_ptr< WcTimingTree > &timingTree = nullptr );
+
+
+/*! Combine multiple task nodes into a (named) sequence
+ *
+ * \param initializerList list of tasks that are executed in the passed order
+ * \param name optional sequence name, used for printing and for labeling time measurements
+ * \param timingTree optional timing tree object
+ */
+shared_ptr< Sequence > sequence( std::initializer_list< IFunctionNodePtr > initializerList,
+                                 const std::string &name = "",
+                                 const TimingTreePtr &timingTree = nullptr );
+
+
+/*! All subtasks of this region are executed in parallel using OpenMP */
+shared_ptr< Sequence > parallelSequence( std::initializer_list< IFunctionNodePtr > initializerList,
+                                         const std::string &name = "",
+                                         const TimingTreePtr &timingTree = nullptr );
+
+
+
+/*! Node that runs its contents only every n'th call
+ *
+ * \param node task that is only run every n'th call
+ * \param interval the interval i.e. "n"
+ * \param onFirst if false the task is not run at the first call
+ * \param startValue initial call counter
+ */
+shared_ptr< EveryNth > everyNth( const IFunctionNodePtr &node,
+                                 uint_t interval,
+                                 bool onFirst = false,
+                                 uint_t startValue = 0 );
+
+/*! Runs the child node for the given amount of iterations */
+shared_ptr< Loop > loop( const IFunctionNodePtr &body, uint_t iterations, bool logTimeStep = true );
+
+std::ostream &operator<<( std::ostream &os, const IFunctionNode &node );
+
+
+// -------------------------------------- Node Classes --------------------------------------------------------------------------------------------------------
+
+//! Common interface of all execution tree nodes: callable, named, and optionally with child nodes
+class IFunctionNode
+{
+public:
+   virtual ~IFunctionNode() {}
+   virtual void operator()() = 0;                  //!< executes the node
+   virtual const std::string getName() const = 0;  //!< label of the node, used e.g. when printing the tree
+   virtual const std::deque< shared_ptr< IFunctionNode > > getChildren() const { return {}; }   //!< default: no children
+};
+
+//! Leaf node that owns a copy of a callable; an optional timing tree can be attached
+template<typename FunctorType>
+class Functor : public IFunctionNode
+{
+public:
+   Functor(const FunctorType &functor,
+           const std::string &name,
+           const TimingTreePtr & timingTree );
+
+   const std::string getName() const override { return name_ != "" ? name_ : "Functor"; };   // default label for unnamed nodes
+   void operator() () override;
+
+private:
+   FunctorType functor_;                     // stored by value
+   std::string name_;
+   shared_ptr< WcTimingTree > timingTree_;   // may be null
+};
+
+//! Wrapper node that forwards a call to the wrapped node only when its call counter is a multiple of the interval
+class EveryNth : public IFunctionNode
+{
+public:
+   EveryNth( const IFunctionNodePtr &node, uint_t interval, bool onFirst = false, uint_t startValue = 0 );
+
+   void operator()() override;
+   const std::string getName() const override;
+   const std::deque< shared_ptr< IFunctionNode > > getChildren() const override { return { wrapped_ }; }   // the wrapped node is the only child
+
+private:
+   IFunctionNodePtr wrapped_;
+   uint_t interval_;
+   bool onFirst_;    // if false, the call with counter value 0 is skipped
+   uint_t calls_;    // call counter, initialised with startValue
+};
+//! Inner node holding an ordered list of child nodes that are executed when the sequence is called
+class Sequence : public IFunctionNode
+{
+public:
+   Sequence( std::initializer_list< IFunctionNodePtr > initializerList, const std::string &name,
+             const TimingTreePtr &timingTree = nullptr, bool parallel = false );
+
+   void operator()() override;
+
+   void push_back( const IFunctionNodePtr &fct ) { children_.push_back( fct ); }     // append a child
+   void push_front( const IFunctionNodePtr &fct ) { children_.push_front( fct ); }   // prepend a child
+   const std::string getName() const override { return name_ != "" ? name_ : "Sequence"; };   // default label for unnamed nodes
+   const std::deque< IFunctionNodePtr > getChildren() const override { return children_; };   // returns a copy of the child list
+
+private:
+   std::string name_;
+   std::deque< IFunctionNodePtr > children_;
+   shared_ptr< WcTimingTree > timingTree_;   // may be null
+   bool parallel_;                           // only evaluated in builds with WALBERLA_BUILD_WITH_OPENMP
+};
+
+//! Node that repeatedly executes a body node; also implements timeloop::ITimeloop
+class Loop : public IFunctionNode, public timeloop::ITimeloop
+{
+public:
+   Loop( const IFunctionNodePtr &body, uint_t iterations, bool logTimeStep = true );
+
+   void operator()() override;
+   void run() override { ( *this )(); }   // same as calling the node
+   void singleStep() override;
+
+   void synchronizedStop( bool stopVal ) override;
+   void stop() override { stop_ = true; }   // only sets the flag, which operator() checks after each body execution
+   void setBody( const IFunctionNodePtr &body ) { body_ = body; }
+   void setCurrentTimeStep( uint_t ts ) override { currentIteration_ = ts; };
+   uint_t getCurrentTimeStep() const override { return currentIteration_; }
+   uint_t getNrOfTimeSteps() const override { return iterations_; }
+
+   const std::deque< shared_ptr< IFunctionNode > > getChildren() const override { return { body_ }; }   // the body is the only child
+   const std::string getName()  const override;
+
+private:
+   IFunctionNodePtr body_;
+   uint_t currentIteration_;   // starts at 0
+   uint_t iterations_;         // upper bound for currentIteration_ in operator()
+   bool stop_;
+   bool logTimeStep_;          // if true, a LoggingStamp is installed while the loop runs
+};
+
+
+
+
+} // namespace executiontree
+} // namespace walberla
+
+
+#include "ExecutionTree.impl.h"
\ No newline at end of file
diff --git a/src/executiontree/ExecutionTree.impl.h b/src/executiontree/ExecutionTree.impl.h
new file mode 100644
index 000000000..8bdfb2bed
--- /dev/null
+++ b/src/executiontree/ExecutionTree.impl.h
@@ -0,0 +1,109 @@
+//==============================================================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ExecutionTree.impl.h
+//! \ingroup executiontree
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//==============================================================================================================================================================
+
+#pragma once
+
+#include "ExecutionTree.h"
+#include <utility>
+
+namespace walberla {
+namespace executiontree {
+
+namespace internal {
+
+// Helper to handle functors and shared_ptr's to functors the same way
+template<typename T>
+struct Caller
+{
+   template<typename ... Args>
+   static void call( T &t, Args&&... args )
+   {
+      t(std::forward<Args>(args)...);
+   }
+};
+
+template<typename T>
+struct Caller< shared_ptr < T > >
+{
+   template<typename ... Args>
+   static void call( shared_ptr <T> &t, Args&&... args )
+   {
+      ( *t )(std::forward<Args>(args)...);
+   }
+};
+
+
+} // namespace internal
+
+
+template<typename FunctorType>
+IFunctionNodePtr functor( FunctorType t, const std::string &name, const TimingTreePtr &timingTree )
+{
+   return make_shared< Functor< FunctorType > >( t, name, timingTree );
+}
+
+inline shared_ptr <Sequence> sequence( std::initializer_list< IFunctionNodePtr > initializerList, const std::string &name,
+                                       const TimingTreePtr &timingTree )
+{
+   return make_shared< Sequence >( initializerList, name, timingTree, false );
+}
+
+inline shared_ptr <Sequence> parallelSequence( std::initializer_list< IFunctionNodePtr > initializerList, const std::string &name,
+                                               const TimingTreePtr &timingTree )
+{
+   return make_shared< Sequence >( initializerList, name, timingTree, true );
+}
+
+
+inline shared_ptr< EveryNth > everyNth( const IFunctionNodePtr &node, uint_t interval, bool onFirst, uint_t startValue )
+{
+   return make_shared< EveryNth >( node, interval, onFirst, startValue );
+}
+
+
+inline shared_ptr< Loop > loop( const IFunctionNodePtr &body, uint_t iterations, bool logTimeStep )
+{
+   return make_shared< Loop >( body, iterations, logTimeStep );
+}
+
+
+template<typename FunctorType>
+Functor< FunctorType >::Functor( const FunctorType &functor, const std::string &name, const TimingTreePtr &timingTree )
+        :functor_( functor ), name_( name ), timingTree_( timingTree ) {}
+
+
+template<typename FunctorType>
+void Functor< FunctorType >::operator()()
+{
+   const bool timed = static_cast< bool >( timingTree_ );
+   if ( timed )
+      timingTree_->start( name_ );   // the timer is labelled with the raw name_, which may be empty
+
+   internal::Caller<FunctorType>::call(functor_);   // handles plain functors and shared_ptr's to functors
+
+   if ( timed )
+      timingTree_->stop( name_ );
+}
+
+
+
+} // namespace executiontree
+} // namespace walberla
\ No newline at end of file
diff --git a/src/executiontree/ExecutionTreeSweep.h b/src/executiontree/ExecutionTreeSweep.h
new file mode 100644
index 000000000..e450d9843
--- /dev/null
+++ b/src/executiontree/ExecutionTreeSweep.h
@@ -0,0 +1,119 @@
+//==============================================================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ExecutionTreeSweep.h
+//! \ingroup executiontree
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//==============================================================================================================================================================
+#pragma once
+
+#include "domain_decomposition/IBlock.h"
+#include "executiontree/ExecutionTree.h"
+namespace walberla {
+namespace executiontree {
+
+
+// -------------------------------------- Public Interface     ------------------------------------------------------------------------------------------------
+
+
+//! Creates a node that invokes \p functor (through internal::Caller) once per block of \p bs.
+template<typename FunctorType>
+IFunctionNodePtr sweep( BlockStorage &bs, const FunctorType &functor,
+                        const std::string &name,
+                        const TimingTreePtr &timingTree = nullptr );
+
+//! Overload for a StructuredBlockStorage: the node sweeps over bs->getBlockStorage().
+template<typename FunctorType>
+IFunctionNodePtr sweep( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType &functor,
+                        const std::string &name,
+                        const TimingTreePtr &timingTree = nullptr );
+
+
+// -------------------------------------- Implementation ------------------------------------------------------------------------------------------------------
+
+
+//! Execution tree node that applies a functor to every block of a BlockStorage.
+//! The storage is kept by reference only, so it has to outlive the node.
+//! \tparam FunctorType  callable type; invoked through internal::Caller with a pointer to the current block
+template<typename FunctorType>
+class Sweep : public IFunctionNode
+{
+public:
+   //! Sweeps over \p bs; functor, name and timing tree handle are copied into the node.
+   Sweep( BlockStorage &bs, const FunctorType &functor, const std::string &name,
+          const TimingTreePtr &timingTree )
+      : blockStorage_( bs ), functor_( functor ), name_( name ), timingTree_( timingTree ) {}
+
+   //! Same as above, sweeping over the storage returned by bs->getBlockStorage().
+   Sweep( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType &functor, const std::string &name,
+          const TimingTreePtr &timingTree )
+      : Sweep( bs->getBlockStorage(), functor, name, timingTree ) {}
+
+   //! Calls the functor once per block, in the iteration order of the storage.
+   //! With a timing tree attached, every single call is wrapped in start( name_ ) / stop( name_ ).
+   void operator()() override
+   {
+      if ( !timingTree_ )
+      {
+         for ( auto &currentBlock: blockStorage_ )
+            internal::Caller<FunctorType>::call( functor_, &currentBlock );
+         return;
+      }
+
+      // Timed path: one start/stop pair per block, always keyed by name_.
+      for ( auto &currentBlock: blockStorage_ )
+      {
+         timingTree_->start( name_ );
+         internal::Caller<FunctorType>::call( functor_, &currentBlock );
+         timingTree_->stop( name_ );
+      }
+   }
+
+   //! Name of this node; "Sweep" is returned when the stored name is empty.
+   const std::string getName() const override { return name_.empty() ? "Sweep" : name_; }
+
+private:
+   BlockStorage &blockStorage_;   //!< swept storage, referenced but not owned
+
+   FunctorType functor_;          //!< callable applied to each block
+   std::string name_;             //!< label used for timing and by getName()
+   TimingTreePtr timingTree_;     //!< a null pointer disables timing
+};
+
+
+
+//! Creates a Sweep< FunctorType > node over \p bs and returns it as a generic IFunctionNodePtr.
+template<typename FunctorType>
+IFunctionNodePtr sweep( BlockStorage &bs, const FunctorType &functor, const std::string &name,
+                        const TimingTreePtr &timingTree )
+{
+   IFunctionNodePtr node = make_shared< Sweep< FunctorType > >( bs, functor, name, timingTree );
+   return node;
+}
+
+//! Overload taking a StructuredBlockStorage; the Sweep constructor uses bs->getBlockStorage().
+template<typename FunctorType>
+IFunctionNodePtr sweep( const shared_ptr< StructuredBlockStorage > &bs, const FunctorType &functor,
+                        const std::string &name, const TimingTreePtr &timingTree )
+{
+   IFunctionNodePtr node = make_shared< Sweep< FunctorType > >( bs, functor, name, timingTree );
+   return node;
+}
+
+
+
+} // namespace executiontree
+} // namespace walberla
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 47e3b49ff..e259e3a7d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory( boundary )
 add_subdirectory( core )
 add_subdirectory( cuda )
 add_subdirectory( domain_decomposition )
+add_subdirectory( executiontree )
 add_subdirectory( fft )
 add_subdirectory( field )
 add_subdirectory( gather )
diff --git a/tests/executiontree/CMakeLists.txt b/tests/executiontree/CMakeLists.txt
new file mode 100644
index 000000000..37fcab24f
--- /dev/null
+++ b/tests/executiontree/CMakeLists.txt
@@ -0,0 +1,8 @@
+###################################################################################################
+#
+# Tests for executiontree module
+#
+###################################################################################################
+
+waLBerla_compile_test( NAME ExecutionTreeTest FILES ExecutionTreeTest.cpp )   # NOTE(review): presumably builds the test binary from ExecutionTreeTest.cpp -- confirm in the waLBerla CMake macros
+waLBerla_execute_test( NAME ExecutionTreeTest )   # NOTE(review): presumably registers that binary as a test run -- confirm
diff --git a/tests/executiontree/ExecutionTreeTest.cpp b/tests/executiontree/ExecutionTreeTest.cpp
new file mode 100644
index 000000000..c362dd11f
--- /dev/null
+++ b/tests/executiontree/ExecutionTreeTest.cpp
@@ -0,0 +1,68 @@
+#include "executiontree/ExecutionTree.h"
+
+#include <iostream>
+#include "core/logging/Logging.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/mpi/Environment.h"
+
+using namespace walberla;
+namespace et = executiontree;
+
+// Stateful test callable: logs the current value of i, then increments it.
+struct MyFunctor
+{
+   int i = 0;
+
+   void operator() ()
+   {
+      WALBERLA_LOG_RESULT( "i = " << i );
+      ++i;
+   }
+};
+
+
+int main( int argc, char **argv )
+{
+   mpi::Environment env( argc, argv );
+   debug::enterTestMode();
+
+   // One counter per counting callable; the lambdas capture them by reference.
+   int counter1 = 0;
+   int counter2 = 0;
+   int counter3 = 0;
+
+   auto func1 = [&counter1]() {
+      WALBERLA_LOG_RESULT("A");
+      ++counter1;
+   };
+   auto func2 = [&counter2]() { ++counter2; };
+   auto func3 = [&counter3]() { ++counter3; };
+   auto func4 = [] {  WALBERLA_LOG_RESULT("B"); };
+
+   auto myFunctor = make_shared<MyFunctor>();
+
+   // The children are created in the same order in which they are listed in the sequence.
+   auto nthOnFirst    = et::everyNth( et::functor( func2, "func2" ), 5, true );
+   auto nthNotOnFirst = et::everyNth( et::functor( func3, "func3" ), 5, false );
+   auto nodeFunc1     = et::functor( func1, "func1" );
+   auto nodeFunc4     = et::functor( func4, "func4" );
+   auto nodeMyFunctor = et::functor( myFunctor, "myFunctor");
+
+   auto s = et::parallelSequence( { nthOnFirst, nthNotOnFirst, nodeFunc1, nodeFunc4, nodeMyFunctor } );
+
+   auto l = et::loop( s, 20 );
+   myFunctor->i = 42;   // set after the tree is built: the node shares ownership of myFunctor, so run() starts from 42
+
+   // Stream the loop node to stdout, then run it.
+   std::cout << *l << std::endl;
+   l->run();
+
+   // Expectations: 20 calls of func1, 20/5 of func2, one fewer for func3, and i advanced by 20.
+   WALBERLA_CHECK_EQUAL( counter1, 20 );
+   WALBERLA_CHECK_EQUAL( counter2, 20 / 5 );
+   WALBERLA_CHECK_EQUAL( counter3, ( 20 / 5 ) - 1 );
+   WALBERLA_CHECK_EQUAL( myFunctor->i, 20 + 42 );
+
+   return 0;
+}
\ No newline at end of file
-- 
GitLab