diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fe6e951ed121cc905de3b099d8639594dd440795..4cf439fe24bd22b024875be726aef1d0bc137b96 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1554,7 +1554,7 @@ conda-py35-linux:
       - export PATH=$PATH:/usr/local/likwid/bin
       - likwid-setFrequencies -t 0
       - likwid-setFrequencies -g performance
-      - likwid-setFrequencies -x 3.3 -y 3.3 # set frequency to 3.3
+      - likwid-setFrequencies -f 3.3 # set frequency to 3.3
       - mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PeriodicGranularGas PeriodicGranularGas.cfg --DEM --syncNextNeighbor | tee PeriodicGranularGas_DEM_NN.txt
       - mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PeriodicGranularGas PeriodicGranularGas.cfg --DEM --syncShadowOwners | tee PeriodicGranularGas_DEM_SO.txt
       - mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PeriodicGranularGas PeriodicGranularGas.cfg --HCSITS --syncNextNeighbor --InelasticFrictionlessContact | tee PeriodicGranularGas_HCSITS_NN_IFC.txt
@@ -1588,4 +1588,4 @@ benchmark_gcc7:
 
 benchmark_clang6:
    <<: *benchmark_definition
-   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:6.0
\ No newline at end of file
+   image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:6.0
diff --git a/apps/benchmarks/PeriodicGranularGas/PeriodicGranularGas.cpp b/apps/benchmarks/PeriodicGranularGas/PeriodicGranularGas.cpp
index 7efaeb660c56c92b7467a83b17312125fa403d19..ea83552cba47f36c087390010e77d0fac41934d1 100644
--- a/apps/benchmarks/PeriodicGranularGas/PeriodicGranularGas.cpp
+++ b/apps/benchmarks/PeriodicGranularGas/PeriodicGranularGas.cpp
@@ -160,11 +160,11 @@ int main( int argc, char ** argv )
    std::function<void(void)> syncCallWithoutTT;
    if (bNN)
    {
-      syncCallWithoutTT = std::bind( pe::syncNextNeighbors<BodyTuple>, boost::ref(*forest), storageID, static_cast<WcTimingTree*>(NULL), real_c(0.0), false );
+      syncCallWithoutTT = std::bind( pe::syncNextNeighbors<BodyTuple>, boost::ref(*forest), storageID, static_cast<WcTimingTree*>(NULL), real_c(0.1), false );
       WALBERLA_LOG_INFO_ON_ROOT("Using NextNeighbor sync!");
    } else if (bSO)
    {
-      syncCallWithoutTT = std::bind( pe::syncShadowOwners<BodyTuple>, boost::ref(*forest), storageID, static_cast<WcTimingTree*>(NULL), real_c(0.0), false );
+      syncCallWithoutTT = std::bind( pe::syncShadowOwners<BodyTuple>, boost::ref(*forest), storageID, static_cast<WcTimingTree*>(NULL), real_c(0.1), false );
       WALBERLA_LOG_INFO_ON_ROOT("Using ShadowOwner sync!");
    } else
    {
diff --git a/src/blockforest/AABBRefinementSelection.h b/src/blockforest/AABBRefinementSelection.h
index 4637f36f7c7c51adccd412b66cd98ed4aefc0f7d..fc454bedc4de777788d6b4a25ceb202bf950a373 100644
--- a/src/blockforest/AABBRefinementSelection.h
+++ b/src/blockforest/AABBRefinementSelection.h
@@ -22,6 +22,7 @@
 #pragma once
 
 #include "SetupBlockForest.h"
+#include "BlockForest.h"
 
 #include "core/config/Config.h"
 #include "core/math/AABB.h"
@@ -39,6 +40,8 @@ class AABBRefinementSelection
 {
 public:
 
+   AABBRefinementSelection(){}
+
    AABBRefinementSelection( const Config::BlockHandle & configBlock )
    {
       if( configBlock )
@@ -79,25 +82,10 @@ public:
       regions_.push_back( std::make_pair( region, level ) );
    }
 
-   std::vector< std::pair< math::AABB, uint_t > > transformRegionsToAABBs( SetupBlockForest & forest ) const
-   {
-      std::vector< std::pair< math::AABB, uint_t > > aabbs;
-      math::AABB aabb = forest.getDomain();
-      for( auto region = regions_.begin(); region != regions_.end(); ++region )
-      {
-         aabbs.push_back( std::make_pair( math::AABB( aabb.xMin() + region->first.xMin() * aabb.xSize(),
-                                                      aabb.yMin() + region->first.yMin() * aabb.ySize(),
-                                                      aabb.zMin() + region->first.zMin() * aabb.zSize(),
-                                                      aabb.xMin() + region->first.xMax() * aabb.xSize(),
-                                                      aabb.yMin() + region->first.yMax() * aabb.ySize(),
-                                                      aabb.zMin() + region->first.zMax() * aabb.zSize() ), region->second ) );
-      }
-      return aabbs;
-   }
-
+   // for static refinement
    void operator()( SetupBlockForest & forest )
    {
-      std::vector< std::pair< math::AABB, uint_t > > aabbs = transformRegionsToAABBs( forest );
+      std::vector< std::pair< math::AABB, uint_t > > aabbs = transformRegionsToAABBs( forest.getDomain() );
       aabbs.insert( aabbs.end(), aabbs_.begin(), aabbs_.end() );
 
       if( aabbs.empty() )
@@ -113,8 +101,60 @@ public:
       }
    }
 
+   // for dynamic refinement
+   void operator()(std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
+                   std::vector< const Block * > &, const BlockForest & forest )
+   {
+      std::vector< std::pair< math::AABB, uint_t > > aabbs = transformRegionsToAABBs( forest.getDomain() );
+      aabbs.insert( aabbs.end(), aabbs_.begin(), aabbs_.end() );
+
+      for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+      {
+         uint_t currentLevelOfBlock = it->first->getLevel();
+         uint_t targetLevelOfBlock = currentLevelOfBlock;
+
+         for( auto aabb = aabbs.begin(); aabb != aabbs.end(); ++aabb )
+         {
+            if( it->first->getAABB().intersects( aabb->first ) )
+            {
+               uint_t targetLevelOfAABB = aabb->second;
+               if( currentLevelOfBlock > targetLevelOfAABB )
+               {
+                  targetLevelOfBlock = currentLevelOfBlock - uint_t(1);
+               }
+               else if ( currentLevelOfBlock < targetLevelOfBlock )
+               {
+                  targetLevelOfBlock = currentLevelOfBlock + uint_t(1);
+               }
+               // only the first found intersecting AABB is taken into account
+               break;
+            }
+         }
+
+         WALBERLA_CHECK_LESS_EQUAL(std::abs(int_c(targetLevelOfBlock) - int_c(currentLevelOfBlock)), uint_t(1), "Only level difference of maximum 1 allowed!");
+         it->second = targetLevelOfBlock;
+      }
+   }
+
+
 private:
 
+   std::vector< std::pair< math::AABB, uint_t > > transformRegionsToAABBs( const math::AABB & simulationDomain ) const
+   {
+      std::vector< std::pair< math::AABB, uint_t > > aabbs;
+      for( auto region = regions_.begin(); region != regions_.end(); ++region )
+      {
+         aabbs.push_back( std::make_pair( math::AABB( simulationDomain.xMin() + region->first.xMin() * simulationDomain.xSize(),
+                                                      simulationDomain.yMin() + region->first.yMin() * simulationDomain.ySize(),
+                                                      simulationDomain.zMin() + region->first.zMin() * simulationDomain.zSize(),
+                                                      simulationDomain.xMin() + region->first.xMax() * simulationDomain.xSize(),
+                                                      simulationDomain.yMin() + region->first.yMax() * simulationDomain.ySize(),
+                                                      simulationDomain.zMin() + region->first.zMax() * simulationDomain.zSize() ), region->second ) );
+      }
+      return aabbs;
+   }
+
+
    std::vector< std::pair< math::AABB, uint_t > > aabbs_;
    std::vector< std::pair< math::AABB, uint_t > > regions_;
 
diff --git a/src/blockforest/BlockForest.h b/src/blockforest/BlockForest.h
index 42d1081e1feb7d6af47aba14e398966cdf055066..5d9a28628a7e23aa9029c1707d435bb0ef82fad0 100644
--- a/src/blockforest/BlockForest.h
+++ b/src/blockforest/BlockForest.h
@@ -989,6 +989,62 @@ private:
 
 
 
+class CombinedMinTargetLevelDeterminationFunctions
+{
+public:
+
+   typedef blockforest::BlockForest::RefreshMinTargetLevelDeterminationFunction MinTargetLevelDeterminationFunction;
+
+   CombinedMinTargetLevelDeterminationFunctions(const std::function<uint_t(const std::vector<uint_t> &)> & targetLevelReductionFct = [](const std::vector<uint_t> & t){ return *std::max_element(t.begin(), t.end());})
+   : targetLevelReductionFct_( targetLevelReductionFct )
+   {
+
+   }
+
+   void add( const MinTargetLevelDeterminationFunction & function )
+   {
+      functions_.push_back( function );
+   }
+
+   void operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
+                    std::vector< const Block * > & blocksAlreadyMarkedForRefinement,
+                    const blockforest::BlockForest & forest )
+   {
+      const uint_t numberOfBlocks = minTargetLevels.size();
+
+      std::vector< std::vector< std::pair< const Block *, uint_t > > > minTargetLevelsPerFunction(functions_.size(), minTargetLevels);
+
+      // evaluate the different determination functions
+      auto iter = minTargetLevelsPerFunction.begin();
+      for( auto function = functions_.begin(); function != functions_.end(); ++function, ++iter )
+      {
+         (*function)( *iter, blocksAlreadyMarkedForRefinement, forest );
+         WALBERLA_ASSERT_EQUAL(iter->size(), numberOfBlocks, "Number of blocks has changed during min target level determination!");
+      }
+
+      // combine the outcome of the different functions into a single target level
+      std::vector<uint_t> targetLevels(functions_.size());
+      for( uint_t block = 0; block < numberOfBlocks; ++block )
+      {
+         for( uint_t fct = 0; fct < functions_.size(); ++fct)
+         {
+            WALBERLA_ASSERT_EQUAL(minTargetLevelsPerFunction[fct][block].first->getId(), minTargetLevels[block].first->getId());
+            targetLevels[fct] = minTargetLevelsPerFunction[fct][block].second;
+         }
+         minTargetLevels[block].second = targetLevelReductionFct_(targetLevels);
+      }
+
+   }
+
+private:
+
+   std::vector< MinTargetLevelDeterminationFunction > functions_;
+   std::function<uint_t(const std::vector<uint_t> &)> targetLevelReductionFct_;
+
+}; // class CombinedMinTargetLevelDeterminationFunctions
+
+
+
 } // namespace blockforest
 
 using blockforest::BlockForest;
diff --git a/src/blockforest/loadbalancing/DynamicParMetis.cpp b/src/blockforest/loadbalancing/DynamicParMetis.cpp
index 0e215b6f92bc7b8bd835d165b2b76b7892e9c06b..369339eee4c0fc726e8da72a6e0335157ae76fc8 100644
--- a/src/blockforest/loadbalancing/DynamicParMetis.cpp
+++ b/src/blockforest/loadbalancing/DynamicParMetis.cpp
@@ -101,19 +101,6 @@ std::map< blockforest::BlockID, uint_t > getBlockIdToSequenceMapping( const Phan
    return mapping;
 }
 
-template< typename T >
-T * ptr( std::vector<T> & v )
-{
-   if( v.empty() )
-      return NULL;
-   else
-      return &( v.front() );
-}
-
-typedef uint_t idx_t;
-
-
-
 bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *, uint_t > > & targetProcess,
                                   std::set< uint_t > & processesToRecvFrom,
                                   const PhantomBlockForest & phantomForest,
@@ -163,6 +150,7 @@ bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *,
       std::vector<double> xyz;
 
       uint_t blockIndex = 0;
+      int64_t ncon = int64_c(ncon_);
       for( auto it = targetProcess.begin(); it != targetProcess.end(); ++it, ++blockIndex )
       {
          WALBERLA_CHECK_EQUAL(blockIndex, mapping.find(it->first->getId())->second - blockSequenceRange.first);
@@ -181,7 +169,7 @@ bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *,
                WALBERLA_CHECK_LESS( mapIt->second, vtxdist.back() );
                adjncy.push_back( int64_c( mapIt->second ) );
                auto edgeWeightIt = bi.getEdgeWeights().find( nit->getId() );
-               //WALBERLA_CHECK_UNEQUAL( edgeWeightIt->second, 0 ); // perhaps WARNING
+               WALBERLA_CHECK_GREATER_EQUAL( edgeWeightIt->second, 0 );
                adjwgt.push_back( edgeWeightIt == bi.getEdgeWeights().end() ? int64_t( 1 ) : edgeWeightIt->second );
             }
             break;
@@ -193,14 +181,22 @@ bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *,
                WALBERLA_CHECK_GREATER_EQUAL( mapIt->second, 0 );
                WALBERLA_CHECK_LESS( mapIt->second, vtxdist.back() );
                adjncy.push_back( int64_c( mapIt->second ) );
-               //WALBERLA_CHECK_UNEQUAL( edgeIt->second, 0 ); // perhaps WARNING
+               WALBERLA_CHECK_GREATER_EQUAL( edgeIt->second, 0 );
                adjwgt.push_back( edgeIt->second );
             }
             break;
          }
-         WALBERLA_CHECK_UNEQUAL( bi.getVertexWeight(), 0 );
-         vwgt.push_back( bi.getVertexWeight() );
+
+         WALBERLA_CHECK_EQUAL(ncon, int64_c(bi.getNcon()), "Number of constraints on block does not fit to specified number of constraints in ParMetis setup!");
+
+         for( uint_t con = uint_t(0); con < bi.getNcon(); ++con )
+         {
+            WALBERLA_CHECK_GREATER_EQUAL( bi.getVertexWeight(con), 0 );
+            vwgt.push_back( bi.getVertexWeight(con) );
+         }
+
          vsize.push_back( bi.getVertexSize() );
+
          xyz.push_back( bi.getVertexCoords()[0] );
          xyz.push_back( bi.getVertexCoords()[1] );
          xyz.push_back( bi.getVertexCoords()[2] );
@@ -215,20 +211,19 @@ bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *,
       {
          WALBERLA_ASSERT_LESS( xadj[i-1], xadj[i] );
       }
-      WALBERLA_CHECK_EQUAL( vwgt.size(), targetProcess.size() );
+      WALBERLA_CHECK_EQUAL( int64_c(vwgt.size()), int64_c(targetProcess.size()) * ncon);
       WALBERLA_CHECK_EQUAL( vsize.size(), targetProcess.size() );
       WALBERLA_CHECK_EQUAL( xyz.size(), targetProcess.size() * 3 );
       WALBERLA_CHECK_EQUAL( adjncy.size(), adjwgt.size() );
       WALBERLA_CHECK_EQUAL( adjwgt.size(), xadj.back() );
 
-      int64_t wgtflag         = weightsToUse_;
-      int64_t numflag         = 0; // C-style ordering
-      int64_t ncon            = 1; // Number of constraints
-      int64_t ndims           = 3; // Number of dimensions
-      std::vector<double>  ubvec( uint_c(ncon), double_c( 1.05 ) ); // imbalance tolerance
-      int64_t nparts          = int64_c( MPIManager::instance()->numProcesses() ); // number of subdomains
-      double    ipc2redist    = double_c(ipc2redist_);
-      MPI_Comm comm           = subComm; //MPIManager::instance()->comm();
+      int64_t wgtflag           = weightsToUse_;
+      int64_t numflag           = 0; // C-style ordering
+      int64_t ndims             = 3; // Number of dimensions
+      std::vector<double> ubvec = ubvec_; // imbalance tolerance
+      int64_t nparts            = int64_c( MPIManager::instance()->numProcesses() ); // number of subdomains
+      double ipc2redist         = double_c(ipc2redist_);
+      MPI_Comm comm             = subComm; //MPIManager::instance()->comm();
       std::vector<double> tpwgts( uint_c(nparts * ncon), 1.0 / double_c( nparts ) ); // vertex weight fraction that is stored in a subdomain
       std::vector<int64_t> options = { int64_t( 1 ), int64_t( 0 ), int64_t( 23 ), int64_t( 1 ) };
 
@@ -237,34 +232,36 @@ bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *,
       switch( algorithm_ )
       {
       case PARMETIS_PART_GEOM:
+         WALBERLA_ASSERT_EQUAL(ncon, int64_t(1), "Chosen algorithm can only work with single constraints!");
          parmetisTimer.start();
-         metisResult = core::ParMETIS_V3_PartGeom( ptr( vtxdist ), &ndims, ptr( xyz ), ptr( part ), &comm );
+         metisResult = core::ParMETIS_V3_PartGeom( vtxdist.data(), &ndims, xyz.data(), part.data(), &comm );
          parmetisTimer.end();
          break;
       case PARMETIS_PART_GEOM_KWAY:
          parmetisTimer.start();
-         metisResult = core::ParMETIS_V3_PartGeomKway( ptr( vtxdist ), ptr( xadj ), ptr( adjncy ), ptr( vwgt ), ptr( adjwgt ), &wgtflag, &numflag, &ndims, ptr( xyz ), &ncon, &nparts, ptr( tpwgts ), ptr( ubvec ), ptr( options ), &edgecut, ptr( part ), &comm );
+         metisResult = core::ParMETIS_V3_PartGeomKway( vtxdist.data(), xadj.data(), adjncy.data(), vwgt.data(), adjwgt.data(), &wgtflag, &numflag, &ndims, xyz.data(), &ncon, &nparts, tpwgts.data(), ubvec.data(), options.data(), &edgecut, part.data(), &comm );
          parmetisTimer.end();
          break;
       case PARMETIS_PART_KWAY:
          parmetisTimer.start();
-         metisResult = core::ParMETIS_V3_PartKway( ptr( vtxdist ), ptr( xadj ), ptr( adjncy ), ptr( vwgt ), ptr( adjwgt ), &wgtflag, &numflag, &ncon, &nparts, ptr( tpwgts ), ptr(ubvec), ptr(options), &edgecut, ptr( part ), &comm );
+         metisResult = core::ParMETIS_V3_PartKway( vtxdist.data(), xadj.data(), adjncy.data(), vwgt.data(), adjwgt.data(), &wgtflag, &numflag, &ncon, &nparts, tpwgts.data(), ubvec.data(), options.data(), &edgecut, part.data(), &comm );
          parmetisTimer.end();
          break;
       case PARMETIS_ADAPTIVE_REPART:
          parmetisTimer.start();
-         metisResult = core::ParMETIS_V3_AdaptiveRepart( ptr( vtxdist ), ptr( xadj ), ptr( adjncy ), ptr( vwgt ), ptr( vsize ), ptr( adjwgt ), &wgtflag, &numflag, &ncon, &nparts, ptr( tpwgts ), ptr(ubvec), &ipc2redist, ptr(options), &edgecut, ptr( part ), &comm );
+         metisResult = core::ParMETIS_V3_AdaptiveRepart( vtxdist.data(), xadj.data(), adjncy.data(), vwgt.data(), vsize.data(), adjwgt.data(), &wgtflag, &numflag, &ncon, &nparts, tpwgts.data(), ubvec.data(), &ipc2redist, options.data(), &edgecut, part.data(), &comm );
          parmetisTimer.end();
          break;
       case PARMETIS_REFINE_KWAY:
          parmetisTimer.start();
-         metisResult = core::ParMETIS_V3_RefineKway( ptr( vtxdist ), ptr( xadj ), ptr( adjncy ), ptr( vwgt ), ptr( adjwgt ), &wgtflag, &numflag, &ncon, &nparts, ptr( tpwgts ), ptr(ubvec), ptr(options), &edgecut, ptr( part ), &comm );
+         metisResult = core::ParMETIS_V3_RefineKway( vtxdist.data(), xadj.data(), adjncy.data(), vwgt.data(), adjwgt.data(), &wgtflag, &numflag, &ncon, &nparts, tpwgts.data(), ubvec.data(), options.data(), &edgecut, part.data(), &comm );
          parmetisTimer.end();
          break;
       }
 
       if( metisResult != core::METIS_OK )
          WALBERLA_ABORT("ParMetis failed!");
+
    }
 
    // Determine which processes will receive a block from this process
@@ -278,7 +275,7 @@ bool DynamicParMetis::operator()( std::vector< std::pair< const PhantomBlock *,
    isSendingBlockToProcess[uint_c(MPIManager::instance()->rank())] = uint8_t( 0 );
 
    std::vector<uint8_t> isReceivingBlockFromProcess( uint_c(MPIManager::instance()->numProcesses()), uint8_t( 0 ) );
-   MPI_Alltoall( ptr( isSendingBlockToProcess ), 1, MPITrait<uint8_t>::type(), ptr( isReceivingBlockFromProcess ), 1, MPITrait<uint8_t>::type(), MPIManager::instance()->comm() );
+   MPI_Alltoall( isSendingBlockToProcess.data(), 1, MPITrait<uint8_t>::type(), isReceivingBlockFromProcess.data(), 1, MPITrait<uint8_t>::type(), MPIManager::instance()->comm() );
    for( uint_t i = 0; i < isReceivingBlockFromProcess.size(); ++i )
       if( isReceivingBlockFromProcess[i] == uint8_t( 1 ) )
          processesToRecvFrom.insert( i );
@@ -317,12 +314,12 @@ DynamicParMetis::Algorithm DynamicParMetis::stringToAlgorithm( std::string s )
       return PARMETIS_PART_GEOM;
    else if( s == "PART_KWAY" )
       return PARMETIS_PART_KWAY;
-   else if( s == "PART_ADAPTIVE_REPART" )
+   else if( s == "ADAPTIVE_REPART" )
       return PARMETIS_ADAPTIVE_REPART;
    else if( s == "REFINE_KWAY" )
       return PARMETIS_REFINE_KWAY;
    else
-      WALBERLA_ABORT( "Illegal ParMetis algorithm specified (" << s << ")! Valid choices are: \"PART_GEOM_KWAY\", \"PART_KWAY\", \"PART_ADAPTIVE_REPART\", or \"REFINE_KWAY\"." );
+      WALBERLA_ABORT( "Illegal ParMetis algorithm specified (" << s << ")! Valid choices are: \"PART_GEOM_KWAY\", \"PART_GEOM\", \"PART_KWAY\", \"ADAPTIVE_REPART\", or \"REFINE_KWAY\"." );
 }
 
 
@@ -369,7 +366,7 @@ std::string DynamicParMetis::algorithmToString( ) const
    case walberla::blockforest::DynamicParMetis::PARMETIS_PART_KWAY:
       return "PART_KWAY";
    case walberla::blockforest::DynamicParMetis::PARMETIS_ADAPTIVE_REPART:
-      return "PART_ADAPTIVE_REPART";
+      return "ADAPTIVE_REPART";
    case walberla::blockforest::DynamicParMetis::PARMETIS_REFINE_KWAY:
       return "PARMETIS_REFINE_KWAY";
    }
diff --git a/src/blockforest/loadbalancing/DynamicParMetis.h b/src/blockforest/loadbalancing/DynamicParMetis.h
index d1b3acd1f877c755279f8a4b3f0d2a2586549cd7..fa5663db5961e89cb57f4150cca10902706379d3 100644
--- a/src/blockforest/loadbalancing/DynamicParMetis.h
+++ b/src/blockforest/loadbalancing/DynamicParMetis.h
@@ -30,6 +30,7 @@
 
 #include <cmath>
 #include <map>
+#include <vector>
 
 namespace walberla {
 namespace blockforest {
@@ -48,8 +49,9 @@ public:
 
    DynamicParMetis( const Algorithm algorithm = PARMETIS_PART_GEOM_KWAY,
                     const WeightsToUse weightsToUse = PARMETIS_BOTH_WEIGHTS,
-                    const EdgeSource edgeSource = PARMETIS_EDGES_FROM_EDGE_WEIGHTS )
-      : algorithm_( algorithm ), weightsToUse_( weightsToUse ), edgeSource_( edgeSource ) {}
+                    const EdgeSource edgeSource = PARMETIS_EDGES_FROM_EDGE_WEIGHTS,
+                    const uint_t ncon = uint_t(1))
+      : algorithm_( algorithm ), weightsToUse_( weightsToUse ), edgeSource_( edgeSource ), ncon_( ncon ), ubvec_( ncon, real_t(1.05) ) {}
 
    bool operator()( std::vector< std::pair< const PhantomBlock *, uint_t > > & targetProcess,
                     std::set< uint_t > & processesToRecvFrom,
@@ -60,6 +62,10 @@ public:
    void   setipc2redist(double val) {ipc2redist_ = val;}
    double getipc2redist() const {return ipc2redist_;}
 
+   void setImbalanceTolerances(const std::vector<double> & tolerances){ WALBERLA_ASSERT_EQUAL(tolerances.size(), ncon_-uint_t(1)); ubvec_ = tolerances; }
+   void setImbalanceTolerance(double tolerance, const uint_t con = 0){ WALBERLA_ASSERT_LESS(con, ncon_); ubvec_[con] = tolerance; }
+   double getImbalanceTolerance(const uint_t con = 0) const { WALBERLA_ASSERT_LESS(con, ncon_); return ubvec_[con]; }
+
    bool edgeWeightsUsed()   const { return ( weightsToUse_ == PARMETIS_EDGE_WEIGHTS   ) || ( weightsToUse_ == PARMETIS_BOTH_WEIGHTS ); }
    bool vertexWeightsUsed() const { return ( weightsToUse_ == PARMETIS_VERTEX_WEIGHTS ) || ( weightsToUse_ == PARMETIS_BOTH_WEIGHTS ); }
    bool vertexSizeUsed()    const { return algorithm_ == PARMETIS_ADAPTIVE_REPART; }
@@ -77,6 +83,8 @@ protected:
    WeightsToUse weightsToUse_;
    EdgeSource edgeSource_;
 
+   uint_t ncon_ = uint_t(1); ///< number of constraints
+   std::vector<double> ubvec_; ///< imbalance tolerance for each constraint, value between 1 (perfect balance) and number of subdomains (= number of processes, perfect imbalance)
    double ipc2redist_ = real_t( 1000.0 ); ///< compute repartitioning with low edge cut (set lower (down to 0.000001) to get minimal repartitioning )
 };
 
@@ -87,8 +95,8 @@ public:
    typedef int64_t weight_t;
    typedef int64_t vsize_t;
 
-   DynamicParMetisBlockInfo( const weight_t vertexWeight )
-      : vertexWeight_(vertexWeight), vertexSize_(1)
+   DynamicParMetisBlockInfo( const weight_t vertexWeight, const uint_t ncon = 1 )
+      : vertexWeight_(ncon, vertexWeight), vertexSize_(1)
    { }
 
    DynamicParMetisBlockInfo( mpi::RecvBuffer & buffer )
@@ -96,13 +104,35 @@ public:
       buffer >> vertexWeight_ >> vertexSize_ >> vertexCoords_ >> edgeWeights_;
    }
 
-   weight_t getVertexWeight() const { return vertexWeight_; }
-   void     setVertexWeight( const weight_t vertexWeight ) { vertexWeight_ = vertexWeight; }
+   DynamicParMetisBlockInfo( const std::vector<weight_t> & vertexWeight )
+      : vertexWeight_(vertexWeight), vertexSize_(1)
+   { }
+
+   // get number of constraints ( = number of vertex weights), default: 1, > 1 for multi physics problems
+   uint_t getNcon() const
+   {
+      return vertexWeight_.size();
+   }
+
+   weight_t getVertexWeight( const uint_t con = 0 ) const
+   {
+      WALBERLA_ASSERT_LESS(con, getNcon());
+      return vertexWeight_[con];
+   }
+   void setVertexWeight( const weight_t vertexWeight, const uint_t con = 0 )
+   {
+      WALBERLA_ASSERT_LESS(con, getNcon());
+      vertexWeight_[con] = vertexWeight;
+   }
 
-   vsize_t getVertexSize() const { return vertexSize_; }
-   void setVertexSize( const vsize_t size ) { vertexSize_ = size; }
+   vsize_t getVertexSize( ) const { return vertexSize_; }
+   void setVertexSize( const vsize_t size) { vertexSize_ = size; }
 
-   const Vector3<double> & getVertexCoords() const { WALBERLA_ASSERT( !std::isnan(vertexCoords_[0]) && !std::isnan(vertexCoords_[1]) && !std::isnan(vertexCoords_[2]) ); return vertexCoords_; }
+   const Vector3<double> & getVertexCoords() const
+   {
+      WALBERLA_ASSERT( !std::isnan(vertexCoords_[0]) && !std::isnan(vertexCoords_[1]) && !std::isnan(vertexCoords_[2]) );
+      return vertexCoords_;
+   }
    void setVertexCoords( const Vector3<double> & p ) { vertexCoords_ = p; }
 
    void setEdgeWeight( const blockforest::BlockID & blockId, const weight_t edgeWeight ) { edgeWeights_[blockId] = edgeWeight; }
@@ -117,8 +147,8 @@ public:
 
 private:
 
-   weight_t vertexWeight_; /// Defines the weight of a block
-   vsize_t vertexSize_;    /// Defines the cost of rebalancing a block
+   std::vector<weight_t> vertexWeight_; /// Defines the ncon weights of a block
+   vsize_t vertexSize_;    /// Defines the cost of rebalancing a block. Needed by some ParMetis algorithms
    Vector3<double> vertexCoords_ = Vector3<double>(std::numeric_limits<double>::signaling_NaN()); /// Defines where the block is located in space. Needed by some ParMetis algorithms
    std::map< blockforest::BlockID, weight_t > edgeWeights_; /// Defines the cost of communication with other blocks
 };
diff --git a/src/blockforest/loadbalancing/all.h b/src/blockforest/loadbalancing/all.h
index a4a2cb31b899f0da423196d5203db0327d275894..b3c956f8b13ac1efc0c3a3de4369c04239a1cfb8 100644
--- a/src/blockforest/loadbalancing/all.h
+++ b/src/blockforest/loadbalancing/all.h
@@ -25,5 +25,8 @@
 #include "Cartesian.h"
 #include "DynamicCurve.h"
 #include "DynamicDiffusive.h"
+#include "DynamicParMetis.h"
 #include "NoPhantomData.h"
+#include "PODPhantomData.h"
 #include "StaticCurve.h"
+#include "StaticParMetis.h"
diff --git a/src/core/timing/TimingNode.h b/src/core/timing/TimingNode.h
index 2f0597f7a591b12907932942e76f107b4f90015d..6f880d7e457919a74a33ada2a53cbd2e19220ea3 100644
--- a/src/core/timing/TimingNode.h
+++ b/src/core/timing/TimingNode.h
@@ -111,7 +111,7 @@ void TimingNode<TP>::swap(TimingNode<TP>& tt)
     }
 }
 
-/// Finds the spezified timer in the timing hierarchy
+/// Finds the specified timer in the timing hierarchy
 /// \param name timer name which may include more than one hierarchy separated by "."
 /// \code findTimer(tn, "firstLevel.secondLevel.thirdLevel.timerName"); \endcode
 /// \relates TimingNode
@@ -130,6 +130,31 @@ const Timer<TP>& findTimer( const TimingNode<TP>& tn, const std::string& name)
    }
 }
 
+/// Checks if the specified timer exists in the timing hierarchy
+/// \param name timer name which may include more than one hierarchy separated by "."
+/// \code timerExists(tn, "firstLevel.secondLevel.thirdLevel.timerName"); \endcode
+/// \relates TimingNode
+template< typename TP >  // Timing policy
+bool timerExists( const TimingNode<TP>& tn, const std::string& name )
+{
+   auto pos = name.find_first_of(".");
+   if (pos != std::string::npos)
+   {
+      if( tn.tree_.find(name.substr(0, pos)) != tn.tree_.end() )
+      {
+         return timerExists( tn.tree_.at(name.substr(0, pos)), name.substr(pos+1, std::string::npos));
+      }
+      else
+      {
+         return false;
+      }
+   }
+   else
+   {
+      return tn.tree_.find(name) != tn.tree_.end();
+   }
+}
+
 /// Resets the timer in the TimingNode structure and all sub timers
 /// \relates TimingNode
 template< typename TP >  // Timing policy
diff --git a/src/core/timing/TimingTree.h b/src/core/timing/TimingTree.h
index f34fc7f8845dd0e96c91ceb8038aafc5c4fb9f10..7e1e59c5bde590d14dc0b26a0f3bfe0a26a07907 100644
--- a/src/core/timing/TimingTree.h
+++ b/src/core/timing/TimingTree.h
@@ -59,13 +59,13 @@ public:
 
    void swap(TimingTree<TP>& tt);
 
-   /// Starts a timer beneath the current hirarchy level
+   /// Starts a timer beneath the current hierarchy level
    void start(const std::string& name);
-   /// Stops the last started timer and jumps back one hirarchy level
+   /// Stops the last started timer and jumps back one hierarchy level
    void stop(const std::string& name);
    /// Checks if specified timer is currently running.
    bool isTimerRunning(const std::string& name) const;
-   /// Resets the the timing hirarchy
+   /// Resets the the timing hierarchy
    void reset();
 
    //** Reduction ******************************************************************************************************
@@ -81,7 +81,9 @@ public:
 
    /// Returns the raw tree data structure
    const TimingNode<TP>& getRawData() const;
+
    const Timer<TP>& operator[](const std::string& name) const;
+   inline bool timerExists ( const std::string & n ) const;
 
    /// Returns the name of the currently running timer
    /// Might be expensive due to value search.
@@ -89,7 +91,7 @@ public:
 private:
    /// Tree data structure
    TimingNode<TP>  root_;
-   /// Pointer to the current hirarchy level.
+   /// Pointer to the current hierarchy level.
    TimingNode<TP>* current_;
 };
 
@@ -204,7 +206,7 @@ const TimingNode<TP>& TimingTree<TP>::getRawData() const
    return root_;
 }
 
-/// Finds the spezified timer in the timing hierarchy
+/// Finds the specified timer in the timing hierarchy
 /// \param name timer name which may include more than one hierarchy separated by "."
 /// \code tt["firstLevel.secondLevel.thirdLevel.timerName"].total(); \endcode
 template< typename TP >  // Timing policy
@@ -213,6 +215,15 @@ const Timer<TP>& TimingTree<TP>::operator[](const std::string& name) const
    return findTimer(root_, name);
 }
 
+/// Checks if the specified timer exists in the timing hierarchy
+/// \param name timer name which may include more than one hierarchy separated by "."
+/// \code tt.timerExists("firstLevel.secondLevel.thirdLevel.timerName"); \endcode
+template< typename TP >  // Timing policy
+bool TimingTree<TP>::timerExists(const std::string& name) const
+{
+   return walberla::timing::timerExists(root_, name);
+}
+
 template< typename TP >  // Timing policy
 std::string TimingTree<TP>::getCurrentTimerName() const
 {
diff --git a/src/field/allocation/FieldAllocator.h b/src/field/allocation/FieldAllocator.h
index 4ab58c0fe1d82195f28a64627d4577fdd9db3336..418fb6c0bd6d520f4a6a213a957000c27da23dc7 100644
--- a/src/field/allocation/FieldAllocator.h
+++ b/src/field/allocation/FieldAllocator.h
@@ -242,7 +242,7 @@ namespace field {
 
          virtual T * allocateMemory (  uint_t size )
          {
-            void * ptr = aligned_malloc_with_offset(size * sizeof(T), alignment, offset_ % alignment );
+            void * ptr = aligned_malloc_with_offset(size * sizeof(T) + alignment, alignment, offset_ % alignment );
             if(!ptr)
                throw std::bad_alloc();
 
diff --git a/src/field/python/FieldExport.impl.h b/src/field/python/FieldExport.impl.h
index 4dc9d0c3b0444561945dcfca2f1ed64fa5566bbd..a092b4103ecabc0882777e4e297be736cda3ef35 100644
--- a/src/field/python/FieldExport.impl.h
+++ b/src/field/python/FieldExport.impl.h
@@ -490,6 +490,65 @@ namespace internal {
    }
 
 
+   //===================================================================================================================
+   //
+   //  Aligned Allocation
+   //
+   //===================================================================================================================
+
+   template<typename T>
+   shared_ptr<field::FieldAllocator<T> > getAllocator(uint_t alignment)
+   {
+      if( alignment == 0 )
+         return shared_ptr<field::FieldAllocator<T> >(); // leave to default - auto-detection of alignment
+      else if ( alignment == 16 )
+         return make_shared< field::AllocateAligned<T, 16> >();
+      else if ( alignment == 32 )
+         return make_shared< field::AllocateAligned<T, 32> >();
+      else if ( alignment == 64 )
+         return make_shared< field::AllocateAligned<T, 64> >();
+      else if ( alignment == 128 )
+         return make_shared< field::AllocateAligned<T, 128> >();
+      else {
+         PyErr_SetString( PyExc_ValueError, "Alignment parameter has to be one of 0, 16, 32, 64, 128." );
+         throw boost::python::error_already_set();
+         return shared_ptr<field::FieldAllocator<T> >();
+      }
+   }
+
+   template< typename GhostLayerField_T >
+   class GhostLayerFieldDataHandling : public blockforest::AlwaysInitializeBlockDataHandling< GhostLayerField_T >
+   {
+   public:
+      typedef typename GhostLayerField_T::value_type Value_T;
+
+      GhostLayerFieldDataHandling( const weak_ptr<StructuredBlockStorage> &blocks, const uint_t nrOfGhostLayers,
+                                   const Value_T &initValue, const Layout layout, uint_t alignment = 0 ) :
+              blocks_( blocks ), nrOfGhostLayers_( nrOfGhostLayers ), initValue_( initValue ), layout_( layout ),
+              alignment_( alignment ) {}
+
+      GhostLayerField_T * initialize( IBlock * const block )
+      {
+         auto blocks = blocks_.lock();
+         WALBERLA_CHECK_NOT_NULLPTR( blocks, "Trying to access 'AlwaysInitializeBlockDataHandling' for a block "
+                                             "storage object that doesn't exist anymore" );
+         GhostLayerField_T * field = new GhostLayerField_T ( blocks->getNumberOfXCells( *block ),
+                                                             blocks->getNumberOfYCells( *block ),
+                                                             blocks->getNumberOfZCells( *block ),
+                                                             nrOfGhostLayers_, initValue_, layout_,
+                                                             getAllocator<Value_T>(alignment_) );
+         return field;
+      }
+
+   private:
+      weak_ptr< StructuredBlockStorage > blocks_;
+
+      uint_t  nrOfGhostLayers_;
+      Value_T initValue_;
+      Layout  layout_;
+      uint_t alignment_;
+   };
+
 
    //===================================================================================================================
    //
@@ -843,10 +902,10 @@ namespace internal {
    {
    public:
       CreateFieldExporter( uint_t xs, uint_t ys, uint_t zs, uint_t fs, uint_t gl,
-                           Layout layout, const boost::python::object & type,
-                           const shared_ptr<boost::python::object> & resultPointer )
+                           Layout layout, const boost::python::object & type, uint_t alignment,
+                           const shared_ptr<boost::python::object> & resultPointer  )
          : xs_( xs ), ys_(ys), zs_(zs), fs_(fs), gl_(gl),
-           layout_( layout),  type_( type ), resultPointer_( resultPointer )
+           layout_( layout),  type_( type ), alignment_(alignment), resultPointer_( resultPointer )
       {}
 
       template< typename FieldType>
@@ -862,7 +921,8 @@ namespace internal {
          if( python_coupling::isCppEqualToPythonType<T>( (PyTypeObject *)type_.ptr() )  )
          {
             T initVal = T(); //extract<T> ( initValue_ );
-            *resultPointer_ = object( make_shared< GhostLayerField<T,F_SIZE> >( xs_,ys_,zs_, gl_, initVal, layout_ )  );
+            *resultPointer_ = object( make_shared< GhostLayerField<T,F_SIZE> >( xs_,ys_,zs_, gl_, initVal, layout_,
+                                                                                getAllocator<T>(alignment_)));
          }
       }
 
@@ -874,6 +934,7 @@ namespace internal {
       uint_t gl_;
       Layout layout_;
       boost::python::object type_;
+      uint_t alignment_;
       shared_ptr<boost::python::object> resultPointer_;
    };
 
@@ -881,7 +942,8 @@ namespace internal {
    boost::python::object createPythonField( boost::python::list size,
                                             boost::python::object type,
                                             uint_t ghostLayers,
-                                            Layout layout)
+                                            Layout layout,
+                                            uint_t alignment)
    {
       using namespace boost::python;
       uint_t xSize = extract<uint_t> ( size[0] );
@@ -898,7 +960,7 @@ namespace internal {
       }
 
       auto result = make_shared<boost::python::object>();
-      CreateFieldExporter exporter( xSize,ySize, zSize, fSize, ghostLayers, layout, type, result );
+      CreateFieldExporter exporter( xSize,ySize, zSize, fSize, ghostLayers, layout, type, alignment, result );
       python_coupling::for_each_noncopyable_type< FieldTypes >  ( exporter );
 
       if ( *result == object()  )
@@ -952,12 +1014,13 @@ namespace internal {
    class AddToStorageExporter
    {
    public:
-      AddToStorageExporter( const shared_ptr<StructuredBlockStorage> & blocks,
+      AddToStorageExporter(const shared_ptr<StructuredBlockStorage> & blocks,
                            const std::string & name, uint_t fs, uint_t gl, Layout layout,
                            const boost::python::object & type,
-                           const boost::python::object & initObj )
+                           const boost::python::object & initObj,
+                           uint_t alignment )
          : blocks_( blocks ), name_( name ), fs_( fs ),
-           gl_(gl),layout_( layout),  type_( type ), initObj_( initObj), found_(false)
+           gl_(gl),layout_( layout),  type_( type ), initObj_( initObj), alignment_(alignment), found_(false)
       {}
 
       template< typename FieldType>
@@ -972,11 +1035,15 @@ namespace internal {
 
          if( !found_ && python_coupling::isCppEqualToPythonType<T>( (PyTypeObject *)type_.ptr() )  )
          {
-            if ( initObj_ == object() )
-               field::addToStorage< GhostLayerField<T,F_SIZE> >( blocks_, name_, T(), layout_, gl_ );
-            else
-               field::addToStorage< GhostLayerField<T,F_SIZE> >( blocks_, name_, extract<T>(initObj_), layout_, gl_ );
-
+            typedef internal::GhostLayerFieldDataHandling< GhostLayerField<T,F_SIZE > > DataHandling;
+            if ( initObj_ == object() ) {
+               auto dataHandling = walberla::make_shared< DataHandling >( blocks_, gl_, T(), layout_, alignment_ );
+               blocks_->addBlockData( dataHandling, name_ );
+            }
+            else {
+               auto dataHandling = walberla::make_shared< DataHandling >( blocks_, gl_, extract<T>(initObj_), layout_, alignment_ );
+               blocks_->addBlockData( dataHandling, name_ );
+            }
             found_ = true;
          }
       }
@@ -990,12 +1057,14 @@ namespace internal {
       Layout layout_;
       boost::python::object type_;
       boost::python::object initObj_;
+      uint_t alignment_;
       bool found_;
    };
 
    template<typename FieldTypes>
    void addToStorage( const shared_ptr<StructuredBlockStorage> & blocks, const std::string & name,
-                      boost::python::object type, uint_t fs, uint_t gl, Layout layout, boost::python::object initValue )
+                      boost::python::object type, uint_t fs, uint_t gl, Layout layout, boost::python::object initValue,
+                      uint_t alignment)
    {
       using namespace boost::python;
 
@@ -1005,7 +1074,7 @@ namespace internal {
       }
 
       auto result = make_shared<boost::python::object>();
-      AddToStorageExporter exporter( blocks, name, fs, gl, layout, type, initValue );
+      AddToStorageExporter exporter( blocks, name, fs, gl, layout, type, initValue, alignment );
       python_coupling::for_each_noncopyable_type< FieldTypes >  ( std::ref(exporter) );
 
       if ( ! exporter.successful() ) {
@@ -1252,7 +1321,8 @@ void exportFields()
    def( "createField", &internal::createPythonField<FieldTypes>, ( ( arg("size")                    ),
                                                                    ( arg("type")                    ),
                                                                    ( arg("ghostLayers") = uint_t(1) ),
-                                                                   ( arg("layout")      = zyxf      )   ) );
+                                                                   ( arg("layout")      = zyxf      ),
+                                                                   ( arg("alignment")   = 0         )) );
 
    def( "createFlagField", &internal::createPythonFlagField, ( ( arg("size")                      ),
                                                                ( arg("nrOfBits")    = uint_t(32)  ),
@@ -1264,7 +1334,8 @@ void exportFields()
                                                                   ( arg("fSize")       = 1         ),
                                                                   ( arg("ghostLayers") = uint_t(1) ),
                                                                   ( arg("layout")      = zyxf      ),
-                                                                  ( arg("initValue")   = object()  ) ) );
+                                                                  ( arg("initValue")   = object()  ),
+                                                                  ( arg("alignment")   = 0         ) ) );
 
    def( "addFlagFieldToStorage",&internal::addFlagFieldToStorage, ( ( arg("blocks")                  ),
                                                                     ( arg("name")                    ),
diff --git a/src/pe/rigidbody/StorageDataHandling.h b/src/pe/rigidbody/StorageDataHandling.h
index 2d49a98c84b21d11d4c3bf51da230e62c648ebae..dd319398a9bba478d4dcdd92085ab9365041cb0e 100644
--- a/src/pe/rigidbody/StorageDataHandling.h
+++ b/src/pe/rigidbody/StorageDataHandling.h
@@ -29,6 +29,7 @@
 #include "pe/communication/DynamicMarshalling.h"
 
 #include "blockforest/BlockDataHandling.h"
+#include "blockforest/BlockForest.h"
 #include "domain_decomposition/BlockStorage.h"
 #include "core/Abort.h"
 
@@ -114,21 +115,8 @@ template<typename BodyTuple>
 void StorageDataHandling<BodyTuple>::serializeCoarseToFine( Block * const block, const BlockDataID & id, mpi::SendBuffer & buffer, const uint_t child )
 {
    // get child aabb
-   const math::AABB aabb = block->getAABB();
-   const real_t xMid = (aabb.xMax() + aabb.xMin()) * real_t(0.5);
-   const real_t yMid = (aabb.yMax() + aabb.yMin()) * real_t(0.5);
-   const real_t zMid = (aabb.zMax() + aabb.zMin()) * real_t(0.5);
-
-   const real_t xMin = (child & uint_t(1)) ? xMid : aabb.xMin();
-   const real_t xMax = (child & uint_t(1)) ? aabb.xMax() : xMid;
-
-   const real_t yMin = (child & uint_t(2)) ? yMid : aabb.yMin();
-   const real_t yMax = (child & uint_t(2)) ? aabb.yMax() : yMid;
-
-   const real_t zMin = (child & uint_t(4)) ? zMid : aabb.zMin();
-   const real_t zMax = (child & uint_t(4)) ? aabb.zMax() : zMid;
-
-   const math::AABB childAABB(xMin, yMin, zMin, xMax, yMax, zMax);
+   const auto childID   = BlockID(block->getId(), child);
+   const auto childAABB = block->getForest().getAABBFromBlockId(childID);
    //WALBERLA_LOG_DEVEL( (child & uint_t(1)) << (child & uint_t(2)) << (child & uint_t(4)) << "\naabb: " << aabb << "\nchild: " << childAABB );
 
    using namespace walberla::pe::communication;
diff --git a/src/pe_coupling/utility/BodiesForceTorqueContainer.h b/src/pe_coupling/utility/BodiesForceTorqueContainer.h
index a32dc589d99d0a1138b4e5d1cab974338dc0933e..d69b6791b23734526521e10a3b42a6acb8b5f6bf 100644
--- a/src/pe_coupling/utility/BodiesForceTorqueContainer.h
+++ b/src/pe_coupling/utility/BodiesForceTorqueContainer.h
@@ -21,13 +21,13 @@
 
 #pragma once
 
+#include "blockforest/StructuredBlockForest.h"
 #include "core/math/Vector3.h"
-#include "domain_decomposition/StructuredBlockStorage.h"
 #include "pe/rigidbody/BodyIterators.h"
 #include "pe/synchronization/SyncForces.h"
 
 #include <map>
-#include <vector>
+#include <array>
 
 namespace walberla {
 namespace pe_coupling {
@@ -36,9 +36,14 @@ class BodiesForceTorqueContainer
 {  
 public:
 
-   BodiesForceTorqueContainer( const shared_ptr<StructuredBlockStorage> & blockStorage, const BlockDataID & bodyStorageID )
-   : blockStorage_( blockStorage ), bodyStorageID_( bodyStorageID )
-     { }
+   typedef std::map< walberla::id_t, std::array<real_t,6> > ForceTorqueStorage_T;
+
+   BodiesForceTorqueContainer( const shared_ptr<StructuredBlockForest> & blockForest, const BlockDataID & bodyStorageID )
+   : blockForest_( blockForest ), bodyStorageID_( bodyStorageID )
+   {
+      // has to be added to the forest (not the storage) to register correctly
+      bodyForceTorqueStorageID_ = blockForest->addBlockData(make_shared<blockforest::AlwaysCreateBlockDataHandling<ForceTorqueStorage_T> >(), "BodiesForceTorqueContainer");
+   }
 
    void operator()()
    {
@@ -50,38 +55,18 @@ public:
       // clear map
       clear();
 
-      // sum up all forces/torques from shadow copies on local body (owner)
-      pe::reduceForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
-      // send total forces/torques to shadow owners
-      pe::distributeForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
-
       // (re-)build map
-      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      for( auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt )
       {
+         auto bodyForceTorqueStorage = blockIt->getData<ForceTorqueStorage_T>(bodyForceTorqueStorageID_);
+
          for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
          {
-            auto & f = bodyForceTorqueMap_[ bodyIt->getSystemID() ];
+            auto & f = (*bodyForceTorqueStorage)[ bodyIt->getSystemID() ];
 
-            // only add if body has not been added already before (from another block)
-            if( f.empty() )
-            {
-               const auto & force = bodyIt->getForce();
-               f.push_back( force[0] );
-               f.push_back( force[1] );
-               f.push_back( force[2] );
-
-               const auto & torque = bodyIt->getTorque();
-               f.push_back( torque[0] );
-               f.push_back( torque[1] );
-               f.push_back( torque[2] );
-            }
-
-            // reset of force/torque on remote bodies necessary to erase the multiple occurrences of forces/torques on bodies
-            // (due to call to distributeForces() before)
-            if ( bodyIt->isRemote() )
-            {
-               bodyIt->resetForceAndTorque();
-            }
+            const auto & force = bodyIt->getForce();
+            const auto & torque = bodyIt->getTorque();
+            f = {{force[0], force[1], force[2], torque[0], torque[1], torque[2] }};
          }
       }
 
@@ -89,42 +74,52 @@ public:
 
    void setOnBodies()
    {
-      // owning process sets the force/torque on the bodies
-      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      // set the force/torque stored in the block-local map onto all bodies
+      for( auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt )
       {
-         for( auto bodyIt = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         auto bodyForceTorqueStorage = blockIt->getData<ForceTorqueStorage_T>(bodyForceTorqueStorageID_);
+
+         for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
          {
-            const auto &f = bodyForceTorqueMap_[bodyIt->getSystemID()];
-            WALBERLA_ASSERT( !f.empty(), "When attempting to set force/torque on local body " << bodyIt->getSystemID() << " at position " << bodyIt->getPosition() << ", body was not found in map!");
-            bodyIt->addForce ( f[0], f[1], f[2] );
-            bodyIt->addTorque( f[3], f[4], f[5] );
+            const auto f = bodyForceTorqueStorage->find( bodyIt->getSystemID() );
+
+            if( f != bodyForceTorqueStorage->end() )
+            {
+               const auto & ftValues = f->second;
+               bodyIt->addForce ( ftValues[0], ftValues[1], ftValues[2] );
+               bodyIt->addTorque( ftValues[3], ftValues[4], ftValues[5] );
+            }
+            // else: new body has arrived that was not known before
          }
       }
    }
 
    void clear()
    {
-      bodyForceTorqueMap_.clear();
+      for( auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt )
+      {
+         auto bodyForceTorqueStorage = blockIt->getData<ForceTorqueStorage_T>(bodyForceTorqueStorageID_);
+         bodyForceTorqueStorage->clear();
+      }
    }
 
    void swap( BodiesForceTorqueContainer & other )
    {
-      std::swap( this->bodyForceTorqueMap_, other.bodyForceTorqueMap_ );
+      std::swap( bodyForceTorqueStorageID_, other.bodyForceTorqueStorageID_);
    }
 
 private:
 
-   shared_ptr<StructuredBlockStorage> blockStorage_;
+   shared_ptr<StructuredBlockStorage> blockForest_;
    const BlockDataID bodyStorageID_;
-   std::map< walberla::id_t, std::vector<real_t> > bodyForceTorqueMap_;
+   BlockDataID bodyForceTorqueStorageID_;
 };
 
 
 class BodyContainerSwapper
 {
 public:
-   BodyContainerSwapper( const shared_ptr<BodiesForceTorqueContainer> & cont1,
-                         const shared_ptr<BodiesForceTorqueContainer> & cont2 )
+   BodyContainerSwapper( const shared_ptr<BodiesForceTorqueContainer> & cont1, const shared_ptr<BodiesForceTorqueContainer> & cont2 )
    : cont1_( cont1 ), cont2_( cont2 )
    { }
 
diff --git a/src/pe_coupling/utility/BodySelectorFunctions.cpp b/src/pe_coupling/utility/BodySelectorFunctions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4c10f95028c47e88b3340aef74840337dc723992
--- /dev/null
+++ b/src/pe_coupling/utility/BodySelectorFunctions.cpp
@@ -0,0 +1,50 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file BodySelectorFunctions.cpp
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "BodySelectorFunctions.h"
+
+#include "pe/rigidbody/RigidBody.h"
+
+namespace walberla {
+namespace pe_coupling {
+
+bool selectAllBodies(pe::BodyID /*bodyID*/)
+{
+   return true;
+}
+
+bool selectRegularBodies(pe::BodyID bodyID)
+{
+   return !bodyID->hasInfiniteMass() && !bodyID->isGlobal();
+}
+
+bool selectFixedBodies(pe::BodyID bodyID)
+{
+   return bodyID->hasInfiniteMass() && !bodyID->isGlobal();
+}
+
+bool selectGlobalBodies(pe::BodyID bodyID)
+{
+   return bodyID->isGlobal();
+}
+
+} // namespace pe_coupling
+} // namespace walberla
diff --git a/src/pe_coupling/utility/BodySelectorFunctions.h b/src/pe_coupling/utility/BodySelectorFunctions.h
index 75bc5c93b1e12c5c970c8b0f11556f08e5980d06..a811d416b2fe440d2ff4a0c062b18b7f0823eae2 100644
--- a/src/pe_coupling/utility/BodySelectorFunctions.h
+++ b/src/pe_coupling/utility/BodySelectorFunctions.h
@@ -21,29 +21,18 @@
 
 #pragma once
 
-#include "pe/Types.h"
+#include "pe/rigidbody/RigidBody.h"
 
 namespace walberla {
 namespace pe_coupling {
 
-bool selectAllBodies(pe::BodyID /*bodyID*/)
-{
-   return true;
-}
-
-bool selectRegularBodies(pe::BodyID bodyID)
-{
-   return !bodyID->hasInfiniteMass() && !bodyID->isGlobal();
-}
-
-bool selectFixedBodies(pe::BodyID bodyID)
-{
-   return bodyID->hasInfiniteMass() && !bodyID->isGlobal();
-}
-
-bool selectGlobalBodies(pe::BodyID bodyID)
-{
-   return bodyID->isGlobal();
-}
+bool selectAllBodies(pe::BodyID /*bodyID*/);
+
+bool selectRegularBodies(pe::BodyID bodyID);
+
+bool selectFixedBodies(pe::BodyID bodyID);
+
+bool selectGlobalBodies(pe::BodyID bodyID);
+
 } // namespace pe_coupling
 } // namespace walberla
diff --git a/src/pe_coupling/utility/TimeStep.h b/src/pe_coupling/utility/TimeStep.h
index d89e0a6e7aff337ba570c252e605fe4d006c8e9c..f32c306142dbe01c904f7e081e8aa49dd2cd3455 100644
--- a/src/pe_coupling/utility/TimeStep.h
+++ b/src/pe_coupling/utility/TimeStep.h
@@ -35,7 +35,7 @@
 #include <functional>
 
 #include <map>
-
+#include <array>
 
 namespace walberla {
 namespace pe_coupling {
@@ -88,43 +88,25 @@ public:
          // Since they are reset after the call to collision response, they have to be stored explicitly before.
          // Then they are set again after each intermediate step.
 
-         // sum up all forces/torques from shadow copies on local body (owner)
-         pe::reduceForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
-         // send total forces/torques to shadow owners
-         pe::distributeForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
-
          // generate map from all known bodies (process local) to total forces/torques
-         std::map< walberla::id_t, std::vector< real_t > > forceMap;
+         // this has to be done on a block-local basis, since the same body could reside on several blocks from this process
+         typedef domain_decomposition::IBlockID::IDType BlockID_T;
+         std::map< BlockID_T, std::map< walberla::id_t, std::array< real_t, 6 > > > forceTorqueMap;
+
          for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
          {
+            BlockID_T blockID = blockIt->getId().getID();
+            auto& blockLocalForceTorqueMap = forceTorqueMap[blockID];
+
             // iterate over local and remote bodies and store force/torque in map
-            // Remote bodies are required since during the then following collision response time steps,
-            // bodies can change ownership (i.e. a now remote body could become a local body ).
-            // Since only the owning process re-sets the force/torque later, remote body values have to be stored as well.
             for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
             {
-               auto & f = forceMap[ bodyIt->getSystemID() ];
+               auto & f = blockLocalForceTorqueMap[ bodyIt->getSystemID() ];
 
-               // only add if body has not been added already before (from another block)
-               if( f.empty() )
-               {
-                  const auto & force = bodyIt->getForce();
-                  f.push_back( force[0] );
-                  f.push_back( force[1] );
-                  f.push_back( force[2] );
-
-                  const auto & torque = bodyIt->getTorque();
-                  f.push_back( torque[0] );
-                  f.push_back( torque[1] );
-                  f.push_back( torque[2] );
-               }
+               const auto & force = bodyIt->getForce();
+               const auto & torque = bodyIt->getTorque();
 
-               // reset of force/torque on remote bodies necessary to erase the multiple occurrences of forces/torques on bodies
-               // (due to call to distributeForces() before)
-               if ( bodyIt->isRemote() )
-               {
-                  bodyIt->resetForceAndTorque();
-               }
+               f = {{force[0], force[1], force[2], torque[0], torque[1], torque[2] }};
             }
          }
 
@@ -133,18 +115,26 @@ public:
          for( uint_t i = 0; i != numberOfSubIterations_; ++i )
          {
 
-            // in the first iteration, forces on local bodies are already set by force synchronization
+            // in the first iteration, forces are already set
             if( i != 0 )
             {
                for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
                {
-                  // only owning process sets force/torque on bodies
-                  for( auto bodyIt = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+                  BlockID_T blockID = blockIt->getId().getID();
+                  auto& blockLocalForceTorqueMap = forceTorqueMap[blockID];
+
+                  // re-set stored force/torque on bodies
+                  for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
                   {
-                     const auto & f = forceMap[ bodyIt->getSystemID() ];
-                     WALBERLA_ASSERT( !f.empty(), "When attempting to set force/torque on local body " << bodyIt->getSystemID() << " at position " << bodyIt->getPosition() << ", body was not found in map!");
-                     bodyIt->addForce ( f[0], f[1], f[2] );
-                     bodyIt->addTorque( f[3], f[4], f[5] );
+
+                     const auto f = blockLocalForceTorqueMap.find( bodyIt->getSystemID() );
+
+                     if( f != blockLocalForceTorqueMap.end() )
+                     {
+                        const auto & ftValues = f->second;
+                        bodyIt->addForce ( ftValues[0], ftValues[1], ftValues[2] );
+                        bodyIt->addTorque( ftValues[3], ftValues[4], ftValues[5] );
+                     }
                   }
                }
             }
diff --git a/tests/core/timing/TimingTreeTest.cpp b/tests/core/timing/TimingTreeTest.cpp
index 862620f326fc8b0eeadcfb55c3b64c6221fd13be..74fa27e046074df0d4eb5923e0e0705545046854 100644
--- a/tests/core/timing/TimingTreeTest.cpp
+++ b/tests/core/timing/TimingTreeTest.cpp
@@ -13,7 +13,7 @@
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file TimingPoolTest.cpp
+//! \file TimingTreeTest.cpp
 //! \ingroup core
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
@@ -67,12 +67,36 @@ int main( int argc, char ** argv )
    tt.stop("AC");
    tt.stop("A");
 
+   WALBERLA_ASSERT(tt.timerExists("A"));
+   WALBERLA_ASSERT(tt.timerExists("A.AA"));
+   WALBERLA_ASSERT(tt.timerExists("A.AB.ABA"));
+   WALBERLA_ASSERT(tt.timerExists("A.AB.ABB"));
+   WALBERLA_ASSERT(tt.timerExists("A.AC.ACA"));
+   WALBERLA_ASSERT(!tt.timerExists("AAC"));
+   WALBERLA_ASSERT(!tt.timerExists("A.AA.C"));
+
    // check copy constructor
    timing::TimingTree<timing::StaticPolicy> tt2(tt);
    // check assignment operator
    timing::TimingTree<timing::StaticPolicy> tt3;
    tt3 = tt;
 
+   WALBERLA_ASSERT(tt2.timerExists("A"));
+   WALBERLA_ASSERT(tt2.timerExists("A.AA"));
+   WALBERLA_ASSERT(tt2.timerExists("A.AB.ABA"));
+   WALBERLA_ASSERT(tt2.timerExists("A.AB.ABB"));
+   WALBERLA_ASSERT(tt2.timerExists("A.AC.ACA"));
+   WALBERLA_ASSERT(!tt2.timerExists("AAC"));
+   WALBERLA_ASSERT(!tt2.timerExists("A.AA.C"));
+
+   WALBERLA_ASSERT(tt3.timerExists("A"));
+   WALBERLA_ASSERT(tt3.timerExists("A.AA"));
+   WALBERLA_ASSERT(tt3.timerExists("A.AB.ABA"));
+   WALBERLA_ASSERT(tt3.timerExists("A.AB.ABB"));
+   WALBERLA_ASSERT(tt3.timerExists("A.AC.ACA"));
+   WALBERLA_ASSERT(!tt3.timerExists("AAC"));
+   WALBERLA_ASSERT(!tt3.timerExists("A.AA.C"));
+
    tt2 = tt.getReduced( timing::REDUCE_TOTAL, 0 );
    tt2 = tt.getReduced( timing::REDUCE_TOTAL, 1 );
 
diff --git a/tests/pe/ParMetis.cpp b/tests/pe/ParMetis.cpp
index e35a8cb2336015a2f6aa3f925a1c4a74c9ed263b..f43a92f57e36d2ea6acbb7fcf643d074a7aaff4f 100644
--- a/tests/pe/ParMetis.cpp
+++ b/tests/pe/ParMetis.cpp
@@ -129,7 +129,7 @@ int main( int argc, char ** argv )
    walberla::MPIManager::instance()->initializeMPI( &argc, &argv );
    walberla::MPIManager::instance()->useWorldComm();
 
-   std::vector<std::string> algs = {"PART_GEOM", "PART_GEOM_KWAY", "PART_KWAY", "PART_ADAPTIVE_REPART", "REFINE_KWAY"};
+   std::vector<std::string> algs = {"PART_GEOM", "PART_GEOM_KWAY", "PART_KWAY", "ADAPTIVE_REPART", "REFINE_KWAY"};
    std::vector<std::string> wtu  = {"NO_WEIGHTS", "EDGE_WEIGHTS", "VERTEX_WEIGHTS", "BOTH_WEIGHTS"};
    std::vector<std::string> es   = {"EDGES_FROM_FOREST", "EDGES_FROM_EDGE_WEIGHTS"};
 
diff --git a/tests/pe/RigidBody.cpp b/tests/pe/RigidBody.cpp
index ad3e0facb2e5fb3900fccbc4c042b5a341640c1c..1e73e0f8b073577370e36d19d510ba8c21521ef9 100644
--- a/tests/pe/RigidBody.cpp
+++ b/tests/pe/RigidBody.cpp
@@ -82,10 +82,10 @@ void move( BodyStorage& storage, real_t dt )
 void checkRotationFunctions()
 {
    MaterialID iron = Material::find("iron");
-   auto sp1 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), 1, iron, false, true, false );
-   auto sp2 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), 1, iron, false, true, false );
-   auto sp3 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), 1, iron, false, true, false );
-   auto sp4 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), 1, iron, false, true, false );
+   auto sp1 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), real_t(1), iron, false, true, false );
+   auto sp2 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), real_t(1), iron, false, true, false );
+   auto sp3 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), real_t(1), iron, false, true, false );
+   auto sp4 = std::make_shared<Sphere>( 0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), real_t(1), iron, false, true, false );
 
    sp1->rotate( 1, 0, 0, math::M_PI * real_t(0.5));
    sp1->rotate( 0, 1, 0, math::M_PI * real_t(0.5));
@@ -121,7 +121,7 @@ void checkRotationFunctions()
 void checkPointFunctions()
 {
    MaterialID iron = Material::find("iron");
-   auto sp1 = std::make_shared<Sphere>( 0, 0, Vec3(10,10,10), Vec3(0,0,0), Quat(), 1, iron, false, true, false );
+   auto sp1 = std::make_shared<Sphere>( 0, 0, Vec3(10,10,10), Vec3(0,0,0), Quat(), real_t(1), iron, false, true, false );
 
    WALBERLA_CHECK( sp1->containsPoint( 10, 10, 10 ) );
    WALBERLA_CHECK( sp1->containsPoint( real_c(10.9), 10, 10 ) );
@@ -148,7 +148,7 @@ int main( int argc, char** argv )
 
    MaterialID iron = Material::find("iron");
    BodyStorage storage;
-   SpherePtr spPtr = std::make_unique<Sphere>(0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), 1, iron, false, true, false);
+   SpherePtr spPtr = std::make_unique<Sphere>(0, 0, Vec3(0,0,0), Vec3(0,0,0), Quat(), real_t(1), iron, false, true, false);
    SphereID sphere = static_cast<SphereID>(&storage.add(std::move(spPtr)));
 
    Vec3 x0 = Vec3(-2,2,0);
diff --git a/tests/pe/SimpleCCD.cpp b/tests/pe/SimpleCCD.cpp
index 527a846383cb8931c3215e5a16352093918f4699..20c147c693a75246d4f4434c212b34131c19c3c7 100644
--- a/tests/pe/SimpleCCD.cpp
+++ b/tests/pe/SimpleCCD.cpp
@@ -60,7 +60,7 @@ int main( int argc, char** argv )
     math::seedRandomGenerator(1337);
 
     for (uint_t i = 0; i < 100; ++i)
-      storage[0].add( std::make_unique<Sphere>(UniqueID<Sphere>::createGlobal(), 0, Vec3( math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10))), Vec3(0,0,0), Quat(), 1, iron, false, false, false) );
+      storage[0].add( std::make_unique<Sphere>(UniqueID<Sphere>::createGlobal(), 0, Vec3( math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10))), Vec3(0,0,0), Quat(), real_t(1), iron, false, false, false) );
 
     sccd.generatePossibleContacts();
 
@@ -84,14 +84,14 @@ int main( int argc, char** argv )
 
     bs.clear();
 
-    bs.add( std::make_unique<Sphere>(UniqueID<Sphere>::createGlobal(), 0, Vec3( math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10))), Vec3(0,0,0), Quat(), 1, iron, false, false, false) );
+    bs.add( std::make_unique<Sphere>(UniqueID<Sphere>::createGlobal(), 0, Vec3( math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10))), Vec3(0,0,0), Quat(), real_t(1), iron, false, false, false) );
 
     WcTimingPool pool;
     for (int runs = 0; runs < 10; ++runs)
     {
        auto oldSize = bs.size();
        for (uint_t i = 0; i < oldSize; ++i)
-         bs.add( std::make_unique<Sphere>(UniqueID<Sphere>::createGlobal(), 0, Vec3( math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10))), Vec3(0,0,0), Quat(), 0.5, iron, false, false, false) );
+         bs.add( std::make_unique<Sphere>(UniqueID<Sphere>::createGlobal(), 0, Vec3( math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10)), math::realRandom(real_c(0), real_c(10))), Vec3(0,0,0), Quat(), real_t(0.5), iron, false, false, false) );
        pool["SCCD"].start();
        sccd.generatePossibleContacts();
        pool["SCCD"].end();
diff --git a/tests/pe_coupling/CMakeLists.txt b/tests/pe_coupling/CMakeLists.txt
index 1803b3090ccc1f744a4b1711a81ac3b8732d2828..b53b734b5b16b8c18841938c4839e88e0dd7885f 100644
--- a/tests/pe_coupling/CMakeLists.txt
+++ b/tests/pe_coupling/CMakeLists.txt
@@ -156,3 +156,15 @@ waLBerla_execute_test( NAME SphereWallCollisionBehaviorDPMFuncTest COMMAND $<TAR
 
 waLBerla_compile_test( FILES discrete_particle_methods/HinderedSettlingDynamicsDPM.cpp DEPENDS blockforest pe timeloop )
 waLBerla_execute_test( NAME HinderedSettlingDynamicsDPMFuncTest COMMAND $<TARGET_FILE:HinderedSettlingDynamicsDPM> --funcTest PROCESSES 4 LABELS longrun CONFIGURATIONS RelWithDbgInfo )
+
+###################################################################################################
+# Utility tests
+###################################################################################################
+
+waLBerla_compile_test( FILES utility/BodiesForceTorqueContainerTest.cpp DEPENDS blockforest pe timeloop )
+waLBerla_execute_test( NAME BodiesForceTorqueContainerTest COMMAND $<TARGET_FILE:BodiesForceTorqueContainerTest> PROCESSES 1 )
+waLBerla_execute_test( NAME BodiesForceTorqueContainerParallelTest COMMAND $<TARGET_FILE:BodiesForceTorqueContainerTest> PROCESSES 3 )
+
+waLBerla_compile_test( FILES utility/PeSubCyclingTest.cpp DEPENDS blockforest pe timeloop )
+waLBerla_execute_test( NAME PeSubCyclingTest COMMAND $<TARGET_FILE:PeSubCyclingTest> PROCESSES 1 )
+waLBerla_execute_test( NAME PeSubCyclingParallelTest COMMAND $<TARGET_FILE:PeSubCyclingTest> PROCESSES 3 )
\ No newline at end of file
diff --git a/tests/pe_coupling/utility/BodiesForceTorqueContainerTest.cpp b/tests/pe_coupling/utility/BodiesForceTorqueContainerTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed015b72f6c78f0ecf445ea4e7e8295a036f1d1e
--- /dev/null
+++ b/tests/pe_coupling/utility/BodiesForceTorqueContainerTest.cpp
@@ -0,0 +1,487 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file BodiesForceTorqueContainerTest.cpp
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/all.h"
+
+#include "pe/basic.h"
+#include "pe/utility/DestroyBody.h"
+
+#include <pe_coupling/utility/all.h>
+
+namespace force_torque_container_test
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+
+typedef boost::tuple<pe::Sphere> BodyTypeTuple ;
+
+
+/*!\brief Test cases for the force torque container provided by the coupling module
+ *
+ * Spheres at different positions are created and force(s) and torque(s) are applied.
+ * Then, they are store in the container and reset on the body.
+ * Then, in some cases, the sphere is moved to cross block borders, and the stored forces/torques are reapplied by the container again.
+ * The obtained force/torque values are compared against the originally set (and thus expected) values.
+ */
+//////////
+// MAIN //
+//////////
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+   // uncomment to have logging
+   //logging::Logging::instance()->setLogLevel(logging::Logging::LogLevel::DETAIL);
+
+   const real_t dx     = real_t(1);
+   const real_t radius = real_t(5);
+
+   ///////////////////////////
+   // DATA STRUCTURES SETUP //
+   ///////////////////////////
+
+   Vector3<uint_t> blocksPerDirection(uint_t(3), uint_t(1), uint_t(1));
+   Vector3<uint_t> cellsPerBlock(uint_t(20), uint_t(20), uint_t(20));
+   Vector3<bool> periodicity(true, false, false);
+
+   // create fully periodic domain with refined blocks
+   auto blocks = blockforest::createUniformBlockGrid( blocksPerDirection[0], blocksPerDirection[1], blocksPerDirection[2],
+                                                      cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
+                                                      dx,
+                                                      0, false, false,
+                                                      periodicity[0], periodicity[1], periodicity[2],
+                                                      false );
+
+
+   // pe body storage
+   pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
+   shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
+   auto bodyStorageID = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "Storage");
+   auto sphereMaterialID = pe::createMaterial( "sphereMat", real_t(1) , real_t(0.3), real_t(0.2), real_t(0.2), real_t(0.24), real_t(200), real_t(200), real_t(0), real_t(0) );
+
+   // pe coupling
+   const real_t overlap = real_t( 1.5 ) * dx;
+   std::function<void(void)> syncCall = std::bind( pe::syncNextNeighbors<BodyTypeTuple>, boost::ref(blocks->getBlockForest()), bodyStorageID, static_cast<WcTimingTree*>(NULL), overlap, false );
+
+   // sphere positions for test scenarios
+   Vector3<real_t> positionInsideBlock(real_t(10), real_t(10), real_t(10));
+   Vector3<real_t> positionAtBlockBorder(real_t(19.5), real_t(10), real_t(10));
+   Vector3<real_t> positionAtBlockBorderUpdated(real_t(20.5), real_t(10), real_t(10));
+
+   Vector3<real_t> positionAtBlockBorder2(real_t(20) + radius + overlap - real_t(0.5), real_t(10), real_t(10));
+   Vector3<real_t> positionAtBlockBorderUpdated2(real_t(20) + radius + overlap + real_t(0.5), real_t(10), real_t(10));
+
+
+   Vector3<real_t> testForce(real_t(2), real_t(1), real_t(0));
+   Vector3<real_t> torqueOffset = Vector3<real_t>(real_t(1), real_t(0), real_t(0));
+
+   pe_coupling::ForceTorqueOnBodiesResetter resetter(blocks, bodyStorageID);
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> container1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> container2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   pe_coupling::BodyContainerSwapper containerSwapper(container1, container2);
+
+   //////////////////
+   // Inside block //
+   //////////////////
+   {
+      std::string testIdentifier("Test: sphere inside block");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       positionInsideBlock, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      uint_t applicationCounter( 0 );
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            ++applicationCounter;
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+      mpi::allReduceInplace(applicationCounter, mpi::SUM);
+
+      container1->store();
+
+      resetter();
+
+      containerSwapper();
+
+      container2->setOnBodies();
+
+      Vector3<real_t> expectedForce = applicationCounter * testForce;
+      Vector3<real_t> expectedTorque = applicationCounter * ( torqueOffset % testForce );
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting force: " << expectedForce);
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting torque: " << expectedTorque);
+
+      Vector3<real_t> actingForce(real_t(0));
+      Vector3<real_t> actingTorque(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            actingForce += bodyIt->getForce();
+            actingTorque += bodyIt->getTorque();
+
+         }
+      }
+
+      mpi::allReduceInplace(actingForce[0], mpi::SUM);
+      mpi::allReduceInplace(actingForce[1], mpi::SUM);
+      mpi::allReduceInplace(actingForce[2], mpi::SUM);
+
+      mpi::allReduceInplace(actingTorque[0], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[1], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[2], mpi::SUM);
+
+      WALBERLA_ROOT_SECTION()
+      {
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[0], expectedForce[0], "Mismatch in force0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[1], expectedForce[1], "Mismatch in force1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[2], expectedForce[2], "Mismatch in force2");
+
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[0], expectedTorque[0], "Mismatch in torque0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[1], expectedTorque[1], "Mismatch in torque1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[2], expectedTorque[2], "Mismatch in torque2");
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      container1->clear();
+      container2->clear();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   /////////////////////
+   // At block border //
+   /////////////////////
+   {
+      std::string testIdentifier("Test: sphere at block border");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       positionAtBlockBorder, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      uint_t applicationCounter( 0 );
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            ++applicationCounter;
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+      mpi::allReduceInplace(applicationCounter, mpi::SUM);
+
+      container1->store();
+
+      resetter();
+
+      containerSwapper();
+
+      container2->setOnBodies();
+
+      Vector3<real_t> expectedForce = applicationCounter * testForce;
+      Vector3<real_t> expectedTorque = applicationCounter * ( torqueOffset % testForce );
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting force: " << expectedForce);
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting torque: " << expectedTorque);
+
+      Vector3<real_t> actingForce(real_t(0));
+      Vector3<real_t> actingTorque(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            actingForce += bodyIt->getForce();
+            actingTorque += bodyIt->getTorque();
+         }
+      }
+
+      mpi::allReduceInplace(actingForce[0], mpi::SUM);
+      mpi::allReduceInplace(actingForce[1], mpi::SUM);
+      mpi::allReduceInplace(actingForce[2], mpi::SUM);
+
+      mpi::allReduceInplace(actingTorque[0], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[1], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[2], mpi::SUM);
+
+      WALBERLA_ROOT_SECTION()
+      {
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[0], expectedForce[0], "Mismatch in force0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[1], expectedForce[1], "Mismatch in force1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[2], expectedForce[2], "Mismatch in force2");
+
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[0], expectedTorque[0], "Mismatch in torque0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[1], expectedTorque[1], "Mismatch in torque1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[2], expectedTorque[2], "Mismatch in torque2");
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      container1->clear();
+      container2->clear();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   ///////////////////////////////////////////
+   // At block border with updated position //
+   ///////////////////////////////////////////
+   {
+      std::string testIdentifier("Test: sphere at block border with updated position");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       positionAtBlockBorder, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      uint_t applicationCounter( 0 );
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            ++applicationCounter;
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+      mpi::allReduceInplace(applicationCounter, mpi::SUM);
+
+      container1->store();
+
+      resetter();
+
+      containerSwapper();
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) {
+         for (auto bodyIt = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt) {
+            bodyIt->setPosition(positionAtBlockBorderUpdated);
+         }
+      }
+      syncCall();
+
+      container2->setOnBodies();
+
+      Vector3<real_t> expectedForce = applicationCounter * testForce;
+      Vector3<real_t> expectedTorque = applicationCounter * ( torqueOffset % testForce );
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting force: " << expectedForce);
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting torque: " << expectedTorque);
+
+      Vector3<real_t> actingForce(real_t(0));
+      Vector3<real_t> actingTorque(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            actingForce += bodyIt->getForce();
+            actingTorque += bodyIt->getTorque();
+         }
+      }
+
+      mpi::allReduceInplace(actingForce[0], mpi::SUM);
+      mpi::allReduceInplace(actingForce[1], mpi::SUM);
+      mpi::allReduceInplace(actingForce[2], mpi::SUM);
+
+      mpi::allReduceInplace(actingTorque[0], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[1], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[2], mpi::SUM);
+
+      WALBERLA_ROOT_SECTION()
+      {
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[0], expectedForce[0], "Mismatch in force0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[1], expectedForce[1], "Mismatch in force1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[2], expectedForce[2], "Mismatch in force2");
+
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[0], expectedTorque[0], "Mismatch in torque0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[1], expectedTorque[1], "Mismatch in torque1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[2], expectedTorque[2], "Mismatch in torque2");
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      container1->clear();
+      container2->clear();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   /////////////////////////////////////////////
+   // At block border with updated position 2 //
+   /////////////////////////////////////////////
+   {
+      std::string testIdentifier("Test: sphere at block border with updated position 2");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       positionAtBlockBorder2, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      uint_t applicationCounter( 0 );
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            ++applicationCounter;
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+      mpi::allReduceInplace(applicationCounter, mpi::SUM);
+
+      container1->store();
+
+      resetter();
+
+      containerSwapper();
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) {
+         for (auto bodyIt = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt) {
+            bodyIt->setPosition(positionAtBlockBorderUpdated2);
+         }
+      }
+      syncCall();
+
+      container2->setOnBodies();
+
+      --applicationCounter; // sphere has vanished from one block
+
+      // in this case, the complete force can no longer be applied onto the body since the part from the block
+      // from which the body vanished is lost
+
+      // HOWEVER: this case will never appear in a coupled simulation since NO FORCE will be acting a body
+      // that is about to vanish from a block since it will not be mapped to the block from which it will vanish
+
+      // If you are expecting a different behavior, you need to change the container mechanism by first all-reducing the
+      // forces/torques to all processes/blocks that know this body such that every process/block has
+      // the full information and then the process/block that owns this body sets the complete force
+
+      Vector3<real_t> expectedForce = applicationCounter * testForce;
+      Vector3<real_t> expectedTorque = applicationCounter * ( torqueOffset % testForce );
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting force: " << expectedForce);
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting torque: " << expectedTorque);
+
+      Vector3<real_t> actingForce(real_t(0));
+      Vector3<real_t> actingTorque(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            actingForce += bodyIt->getForce();
+            actingTorque += bodyIt->getTorque();
+         }
+      }
+
+      mpi::allReduceInplace(actingForce[0], mpi::SUM);
+      mpi::allReduceInplace(actingForce[1], mpi::SUM);
+      mpi::allReduceInplace(actingForce[2], mpi::SUM);
+
+      mpi::allReduceInplace(actingTorque[0], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[1], mpi::SUM);
+      mpi::allReduceInplace(actingTorque[2], mpi::SUM);
+
+      WALBERLA_ROOT_SECTION()
+      {
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[0], expectedForce[0], "Mismatch in force0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[1], expectedForce[1], "Mismatch in force1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingForce[2], expectedForce[2], "Mismatch in force2");
+
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[0], expectedTorque[0], "Mismatch in torque0");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[1], expectedTorque[1], "Mismatch in torque1");
+         WALBERLA_CHECK_FLOAT_EQUAL(actingTorque[2], expectedTorque[2], "Mismatch in torque2");
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      container1->clear();
+      container2->clear();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   return 0;
+
+}
+
+} //namespace force_torque_container_test
+
+int main( int argc, char **argv ){
+   force_torque_container_test::main(argc, argv);
+}
diff --git a/tests/pe_coupling/utility/PeSubCyclingTest.cpp b/tests/pe_coupling/utility/PeSubCyclingTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c21cdff12ee606166f60c42cc53e9a42435c2146
--- /dev/null
+++ b/tests/pe_coupling/utility/PeSubCyclingTest.cpp
@@ -0,0 +1,455 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PeSubCyclingTest.cpp
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/all.h"
+
+#include "pe/basic.h"
+#include "pe/utility/DestroyBody.h"
+#include "pe/cr/DEM.h"
+
+#include <pe_coupling/utility/all.h>
+
+namespace pe_sub_cycling_test
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+
+typedef boost::tuple<pe::Sphere> BodyTypeTuple ;
+
+/*!\brief test case to check functionality of sub cycling in the pe time step provided by the coupling module
+ *
+ * During this time step, currently acting forces/torques on a body are kept constant.
+ *
+ * This test first computes the resulting position offset as well as the linear and rotational velocity after a
+ * certain force has been applied to a sphere when using several 'real' pe time steps and re-applying the force 10 times.
+ *
+ * It then checks the pe time step sub-cycling functionality of the coupling module by creating same-sized spheres
+ * at different locations and applying the same force. But this time, 10 sub-cycles are carried out.
+ * As a result, the same position offset and velocities must be obtained as in the regular case.
+ *
+ */
+//////////
+// MAIN //
+//////////
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+   // uncomment to have logging
+   //logging::Logging::instance()->setLogLevel(logging::Logging::LogLevel::DETAIL);
+
+   const real_t dx     = real_t(1);
+   const real_t radius = real_t(5);
+
+   ///////////////////////////
+   // DATA STRUCTURES SETUP //
+   ///////////////////////////
+
+   Vector3<uint_t> blocksPerDirection(uint_t(3), uint_t(1), uint_t(1));
+   Vector3<uint_t> cellsPerBlock(uint_t(20), uint_t(20), uint_t(20));
+   Vector3<bool> periodicity(true, false, false);
+
+   // create fully periodic domain with refined blocks
+   auto blocks = blockforest::createUniformBlockGrid( blocksPerDirection[0], blocksPerDirection[1], blocksPerDirection[2],
+                                                      cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2],
+                                                      dx,
+                                                      0, false, false,
+                                                      periodicity[0], periodicity[1], periodicity[2],
+                                                      false );
+
+
+   // pe body storage
+   pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
+   shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
+   auto bodyStorageID = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "Storage");
+   auto sphereMaterialID = pe::createMaterial( "sphereMat", real_t(1) , real_t(0.3), real_t(0.2), real_t(0.2), real_t(0.24), real_t(200), real_t(200), real_t(0), real_t(0) );
+
+   auto ccdID = blocks->addBlockData(pe::ccd::createHashGridsDataHandling( globalBodyStorage, bodyStorageID ), "CCD");
+   auto fcdID = blocks->addBlockData(pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::AnalyticCollideFunctor>(), "FCD");
+   pe::cr::DEM cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, NULL);
+
+   // set up synchronization procedure
+   const real_t overlap = real_t( 1.5 ) * dx;
+   std::function<void(void)> syncCall = std::bind( pe::syncNextNeighbors<BodyTypeTuple>, boost::ref(blocks->getBlockForest()), bodyStorageID, static_cast<WcTimingTree*>(NULL), overlap, false );
+
+
+   // sphere positions for test scenarios
+   Vector3<real_t> positionInsideBlock(real_t(10), real_t(10), real_t(10));
+   Vector3<real_t> positionAtBlockBorder(real_t(19.9), real_t(10), real_t(10));
+   Vector3<real_t> positionAtBlockBorder2(real_t(20) + radius + overlap - real_t(0.1), real_t(10), real_t(10));
+
+   Vector3<real_t> testForce(real_t(2), real_t(1), real_t(0));
+   Vector3<real_t> torqueOffset = Vector3<real_t>(real_t(1), real_t(0), real_t(0));
+
+   uint_t peSubCycles( 10 );
+   real_t dtPe( real_t(10) );
+   real_t dtPeSubCycle = dtPe / real_c(peSubCycles);
+
+   pe_coupling::TimeStep timestep(blocks, bodyStorageID, cr, syncCall, dtPe, peSubCycles);
+
+   // evaluate how far the sphere will travel with a specific force applied which is the reference distance for later
+   // (depends on the chosen time integrator in the DEM and thus can not generally be computed a priori here)
+
+   Vector3<real_t> expectedPosOffset(real_t(0));
+   Vector3<real_t> expectedLinearVel(real_t(0));
+   Vector3<real_t> expectedAngularVel(real_t(0));
+   {
+
+      Vector3<real_t> startPos = positionInsideBlock;
+
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       startPos, radius, sphereMaterialID, false, true, false);
+
+      for( uint_t t = 0; t < peSubCycles; ++t )
+      {
+         for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+         {
+            for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+            {
+               bodyIt->addForceAtPos(testForce, bodyIt->getPosition() + torqueOffset);
+            }
+         }
+         cr.timestep(dtPeSubCycle);
+         syncCall();
+      }
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            expectedPosOffset = bodyIt->getPosition() - startPos;
+            expectedLinearVel = bodyIt->getLinearVel();
+            expectedAngularVel = bodyIt->getAngularVel();
+         }
+      }
+
+      mpi::allReduceInplace(expectedPosOffset[0], mpi::SUM);
+      mpi::allReduceInplace(expectedPosOffset[1], mpi::SUM);
+      mpi::allReduceInplace(expectedPosOffset[2], mpi::SUM);
+      mpi::allReduceInplace(expectedLinearVel[0], mpi::SUM);
+      mpi::allReduceInplace(expectedLinearVel[1], mpi::SUM);
+      mpi::allReduceInplace(expectedLinearVel[2], mpi::SUM);
+      mpi::allReduceInplace(expectedAngularVel[0], mpi::SUM);
+      mpi::allReduceInplace(expectedAngularVel[1], mpi::SUM);
+      mpi::allReduceInplace(expectedAngularVel[2], mpi::SUM);
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting position offset: " << expectedPosOffset);
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting linear vel: " << expectedLinearVel);
+      WALBERLA_LOG_DEVEL_ON_ROOT(" - expecting angular vel: " << expectedAngularVel);
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+   }
+
+   //////////////////
+   // Inside block //
+   //////////////////
+   {
+      std::string testIdentifier("Test: sphere inside block");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+
+      Vector3<real_t> startPos = positionInsideBlock;
+
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       startPos, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+
+      timestep();
+
+
+      Vector3<real_t> curPosOffset(real_t(0));
+      Vector3<real_t> curLinearVel(real_t(0));
+      Vector3<real_t> curAngularVel(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            curPosOffset = bodyIt->getPosition() - startPos;
+            curLinearVel = bodyIt->getLinearVel();
+            curAngularVel = bodyIt->getAngularVel();
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[0], expectedPosOffset[0], "Mismatch in posOffset0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[1], expectedPosOffset[1], "Mismatch in posOffset1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[2], expectedPosOffset[2], "Mismatch in posOffset2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[0], expectedLinearVel[0], "Mismatch in linearVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[1], expectedLinearVel[1], "Mismatch in linearVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[2], expectedLinearVel[2], "Mismatch in linearVel2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[0], expectedAngularVel[0], "Mismatch in angularVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[1], expectedAngularVel[1], "Mismatch in angularVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[2], expectedAngularVel[2], "Mismatch in angularVel2");
+
+
+         }
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   ///////////////////////
+   // At block border 1 //
+   ///////////////////////
+   {
+      std::string testIdentifier("Test: sphere at block border 1");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+
+      Vector3<real_t> startPos = positionAtBlockBorder;
+
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       startPos, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+
+      timestep();
+
+
+      Vector3<real_t> curPosOffset(real_t(0));
+      Vector3<real_t> curLinearVel(real_t(0));
+      Vector3<real_t> curAngularVel(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            curPosOffset = bodyIt->getPosition() - startPos;
+            curLinearVel = bodyIt->getLinearVel();
+            curAngularVel = bodyIt->getAngularVel();
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[0], expectedPosOffset[0], "Mismatch in posOffset0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[1], expectedPosOffset[1], "Mismatch in posOffset1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[2], expectedPosOffset[2], "Mismatch in posOffset2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[0], expectedLinearVel[0], "Mismatch in linearVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[1], expectedLinearVel[1], "Mismatch in linearVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[2], expectedLinearVel[2], "Mismatch in linearVel2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[0], expectedAngularVel[0], "Mismatch in angularVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[1], expectedAngularVel[1], "Mismatch in angularVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[2], expectedAngularVel[2], "Mismatch in angularVel2");
+
+         }
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   ////////////////////////////
+   // At block border 1, mod //
+   ////////////////////////////
+   {
+      std::string testIdentifier("Test: sphere at block border 1 mod");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+
+      Vector3<real_t> startPos = positionAtBlockBorder;
+
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       startPos, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      // also add on shadow copy, but only half of it to have same total force/torque
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(real_t(0.5)*testForce, pos+torqueOffset);
+         }
+      }
+
+      timestep();
+
+
+      Vector3<real_t> curPosOffset(real_t(0));
+      Vector3<real_t> curLinearVel(real_t(0));
+      Vector3<real_t> curAngularVel(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            curPosOffset = bodyIt->getPosition() - startPos;
+            curLinearVel = bodyIt->getLinearVel();
+            curAngularVel = bodyIt->getAngularVel();
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[0], expectedPosOffset[0], "Mismatch in posOffset0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[1], expectedPosOffset[1], "Mismatch in posOffset1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[2], expectedPosOffset[2], "Mismatch in posOffset2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[0], expectedLinearVel[0], "Mismatch in linearVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[1], expectedLinearVel[1], "Mismatch in linearVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[2], expectedLinearVel[2], "Mismatch in linearVel2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[0], expectedAngularVel[0], "Mismatch in angularVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[1], expectedAngularVel[1], "Mismatch in angularVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[2], expectedAngularVel[2], "Mismatch in angularVel2");
+
+         }
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   ///////////////////////
+   // At block border 2 //
+   ///////////////////////
+   {
+      std::string testIdentifier("Test: sphere at block border 2");
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - started");
+
+      Vector3<real_t> startPos = positionAtBlockBorder2;
+
+      pe::createSphere(*globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0,
+                       startPos, radius, sphereMaterialID, false, true, false);
+
+      syncCall();
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            auto pos = bodyIt->getPosition();
+            bodyIt->addForceAtPos(testForce, pos+torqueOffset);
+         }
+      }
+
+      timestep();
+
+
+      Vector3<real_t> curPosOffset(real_t(0));
+      Vector3<real_t> curLinearVel(real_t(0));
+      Vector3<real_t> curAngularVel(real_t(0));
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            curPosOffset = bodyIt->getPosition() - startPos;
+            curLinearVel = bodyIt->getLinearVel();
+            curAngularVel = bodyIt->getAngularVel();
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[0], expectedPosOffset[0], "Mismatch in posOffset0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[1], expectedPosOffset[1], "Mismatch in posOffset1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curPosOffset[2], expectedPosOffset[2], "Mismatch in posOffset2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[0], expectedLinearVel[0], "Mismatch in linearVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[1], expectedLinearVel[1], "Mismatch in linearVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curLinearVel[2], expectedLinearVel[2], "Mismatch in linearVel2");
+
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[0], expectedAngularVel[0], "Mismatch in angularVel0");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[1], expectedAngularVel[1], "Mismatch in angularVel1");
+            WALBERLA_CHECK_FLOAT_EQUAL(curAngularVel[2], expectedAngularVel[2], "Mismatch in angularVel2");
+
+         }
+      }
+
+      // clean up
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->markForDeletion();
+         }
+      }
+      syncCall();
+
+      WALBERLA_LOG_DEVEL_ON_ROOT(testIdentifier << " - ended");
+
+   }
+
+   return 0;
+
+}
+
+} //namespace pe_sub_cycling_test
+
+int main( int argc, char **argv ){
+   pe_sub_cycling_test::main(argc, argv);
+}