From 29e89b3ac443a44549e1ff1f1daba389ca5a84fd Mon Sep 17 00:00:00 2001
From: Christoph Rettinger <christoph.rettinger@fau.de>
Date: Tue, 19 Jun 2018 13:20:49 +0200
Subject: [PATCH] Added benchmarks for AMR and LB functionalities of coupling

---
 .../AMRSedimentSettling.cpp                   | 2249 +++++++++++++++++
 .../CMakeLists.txt                            |    5 +
 .../WorkloadEvaluation.cpp                    |  985 ++++++++
 apps/benchmarks/CMakeLists.txt                |    1 +
 4 files changed, 3240 insertions(+)
 create mode 100644 apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp
 create mode 100644 apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/CMakeLists.txt
 create mode 100644 apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp

diff --git a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp
new file mode 100644
index 000000000..988079c48
--- /dev/null
+++ b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/AMRSedimentSettling.cpp
@@ -0,0 +1,2249 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AMRSedimentSettling.cpp
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "blockforest/loadbalancing/all.h"
+#include "blockforest/AABBRefinementSelection.h"
+
+#include "boundary/all.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/SharedFunctor.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/mpi/Broadcast.h"
+
+#include "domain_decomposition/SharedSweep.h"
+#include "domain_decomposition/BlockSweepWrapper.h"
+
+#include "field/AddToStorage.h"
+#include "field/StabilityChecker.h"
+#include "field/communication/PackInfo.h"
+
+#include "lbm/boundary/all.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/field/VelocityFieldWriter.h"
+#include "lbm/lattice_model/D3Q19.h"
+#include "lbm/refinement/all.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+#include "lbm/sweeps/SweepWrappers.h"
+
+#include "pe/basic.h"
+#include "pe/Types.h"
+#include "pe/fcd/GJKEPACollideFunctor.h"
+#include "pe/vtk/BodyVtkOutput.h"
+#include "pe/vtk/SphereVtkOutput.h"
+#include "pe/vtk/EllipsoidVtkOutput.h"
+#include "pe/cr/ICR.h"
+#include "pe/synchronization/ClearSynchronization.h"
+#include "pe/amr/InfoCollection.h"
+#include "pe/amr/weight_assignment/WeightAssignmentFunctor.h"
+#include "pe/amr/weight_assignment/MetisAssignmentFunctor.h"
+
+#include "pe_coupling/amr/all.h"
+#include "pe_coupling/mapping/all.h"
+#include "pe_coupling/momentum_exchange_method/all.h"
+#include "pe_coupling/utility/all.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "vtk/all.h"
+#include "field/vtk/all.h"
+#include "lbm/vtk/all.h"
+
+namespace amr_sediment_settling
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+
+//////////////
+// TYPEDEFS //
+//////////////
+
+// PDF field, flag field & body field
+typedef lbm::D3Q19< lbm::collision_model::TRT, false>  LatticeModel_T;
+typedef LatticeModel_T::Stencil          Stencil_T;
+typedef lbm::PdfField< LatticeModel_T > PdfField_T;
+
+// per-cell flag storage used by the boundary handling
+typedef walberla::uint8_t                 flag_t;
+typedef FlagField< flag_t >               FlagField_T;
+// maps each cell to the rigid body overlapping it (NULL if none)
+typedef GhostLayerField< pe::BodyID, 1 >  BodyField_T;
+// fluid velocity field, used e.g. for the gradient-based refinement criterion
+typedef GhostLayerField< Vector3<real_t>, 1 >  VelocityField_T;
+
+// four ghost layers are required by the LBM grid refinement scheme
+const uint_t FieldGhostLayers = 4;
+
+// boundary handling
+typedef lbm::NoSlip< LatticeModel_T, flag_t > NoSlip_T;
+
+// momentum-exchange coupling boundary condition for moving obstacles (CLI scheme)
+typedef pe_coupling::CurvedLinear< LatticeModel_T, FlagField_T > MO_T;
+
+typedef boost::tuples::tuple< NoSlip_T, MO_T > BoundaryConditions_T;
+typedef BoundaryHandling< FlagField_T, Stencil_T, BoundaryConditions_T > BoundaryHandling_T;
+
+// body types the PE collision detection has to resolve
+typedef boost::tuple<pe::Sphere, pe::Ellipsoid, pe::Plane> BodyTypeTuple;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag( "fluid" );
+const FlagUID NoSlip_Flag( "no slip" );
+const FlagUID MO_Flag( "moving obstacle" );
+const FlagUID FormerMO_Flag( "former moving obstacle" );
+
+
+//////////////////////////////////////
+// DYNAMIC REFINEMENT FUNCTIONALITY //
+//////////////////////////////////////
+
+
+/*
+ * Refinement check based on gradient magnitude
+ * If gradient magnitude is below lowerLimit in all cells of a block, that block could be coarsened.
+ * If the gradient value is above the upperLimit for at least one cell, that block gets marked for refinement.
+ * Else, the block remains on the current level.
+ */
+template< typename LatticeModel_T, typename Filter_T >
+class VectorGradientRefinement
+{
+public:
+   typedef GhostLayerField< Vector3<real_t>, 1 >  VectorField_T;
+   typedef typename LatticeModel_T::Stencil       Stencil_T;
+
+   // fieldID:    block data ID of the vector (velocity) field to evaluate
+   // filter:     cell filter marking cells that need boundary treatment (e.g. non-fluid cells)
+   // upperLimit: gradient norm above which a block is marked for refinement
+   // lowerLimit: gradient norm below which (in ALL cells) a block may be coarsened
+   // maxLevel:   blocks on this level are never refined further
+   VectorGradientRefinement( const ConstBlockDataID & fieldID, const Filter_T & filter,
+                             const real_t upperLimit, const real_t lowerLimit, const uint_t maxLevel ) :
+         fieldID_( fieldID ), filter_( filter ),
+         upperLimit_( upperLimit ), lowerLimit_( lowerLimit ), maxLevel_( maxLevel )
+   {}
+
+   // block-level callback invoked by the BlockForest refinement machinery;
+   // writes the desired target level of each block into minTargetLevels
+   void operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
+                    std::vector< const Block * > & blocksAlreadyMarkedForRefinement,
+                    const BlockForest & forest );
+
+private:
+
+   ConstBlockDataID fieldID_;
+
+   Filter_T filter_;
+
+   real_t upperLimit_;
+   real_t lowerLimit_;
+
+   uint_t maxLevel_;
+
+}; // class VectorGradientRefinement
+
+// Evaluates the velocity-gradient criterion on every block:
+// computes grad(u) per cell via a lattice difference formula, takes the maximum norm
+// over the block, and requests refinement/coarsening accordingly.
+template< typename LatticeModel_T, typename Filter_T >
+void VectorGradientRefinement< LatticeModel_T, Filter_T >::operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
+                                                                       std::vector< const Block * > &, const BlockForest & )
+{
+   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   {
+      const Block * const block = it->first;
+
+      const uint_t currentLevelOfBlock = block->getLevel();
+
+      const VectorField_T * uField = block->template getData< VectorField_T >( fieldID_ );
+
+      // block without a velocity field (e.g. newly created): request coarsest level
+      if( uField == NULL )
+      {
+         it->second = uint_t(0);
+         continue;
+      }
+
+      Matrix3<real_t> uGradient( real_t(0) );
+
+      // coarsen stays true only if the gradient norm is below lowerLimit_ in EVERY cell
+      bool refine( false );
+      bool coarsen( true );
+
+      filter_( *block );
+
+      WALBERLA_FOR_ALL_CELLS_XYZ( uField,
+
+          std::vector< Vector3<real_t> > uValues( Stencil_T::Size, Vector3<real_t>(real_t(0)) );
+
+          Vector3<real_t> uInCenterCell = uField->get( x,y,z );
+
+          // gather neighbor velocities in all stencil directions
+          for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+          {
+             // check if boundary treatment is necessary
+             if( filter_( x+dir.cx(),y+dir.cy(),z+dir.cz() ) )
+             {
+                // copy from center cell
+                uValues[ *dir ] = uInCenterCell;
+             } else {
+                uValues[ *dir ] = uField->get( x+dir.cx(),y+dir.cy(),z+dir.cz() );
+             }
+          }
+
+          // obtain the matrix grad(u) with the help of the gradient formula from
+          // See: Ramadugu et al - Lattice differential operators for computational physics (2013)
+          // with T = c_s**2
+          const real_t inv_c_s_sqr = real_t(3);
+          uGradient = real_t(0);
+          for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
+          {
+             real_t cx = real_c(dir.cx());
+             real_t cy = real_c(dir.cy());
+             real_t cz = real_c(dir.cz());
+
+             // grad(ux)
+             real_t ux = uValues[ *dir ][0];
+             uGradient[ 0 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * ux;
+             uGradient[ 3 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * ux;
+             uGradient[ 6 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * ux;
+
+             // grad(uy)
+             real_t uy = uValues[ *dir ][1];
+             uGradient[ 1 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * uy;
+             uGradient[ 4 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * uy;
+             uGradient[ 7 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * uy;
+
+             // grad(uz)
+             real_t uz = uValues[ *dir ][2];
+             uGradient[ 2 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * uz;
+             uGradient[ 5 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * uz;
+             uGradient[ 8 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * uz;
+
+          }
+          uGradient *= inv_c_s_sqr;
+
+          real_t norm( real_t(0) );
+          //compute maximums norm of 3x3 matrix
+          for( uint_t i = uint_t(0); i < uint_t(3*3); ++i )
+             norm = std::max(norm, std::fabs(uGradient[i]));
+
+          if( norm > lowerLimit_ )
+          {
+             coarsen = false;
+             if( norm > upperLimit_ )
+                refine = true;
+          }
+
+      )
+
+      // refine and coarsen are mutually exclusive since upperLimit_ >= lowerLimit_
+      if( refine && currentLevelOfBlock < maxLevel_ )
+      {
+         WALBERLA_ASSERT( !coarsen );
+         it->second = currentLevelOfBlock + uint_t(1);
+      }
+      if( coarsen && currentLevelOfBlock > uint_t(0) )
+      {
+         WALBERLA_ASSERT( !refine );
+         it->second = currentLevelOfBlock - uint_t(1);
+      }
+   }
+}
+
+
+// Load estimators for spheres and ellipsoids, obtained at SuperMUC Phase 2
+// See Sec. 3 in the paper for more infos
+
+/////////////
+// Spheres //
+/////////////
+// Load estimate of the pure LBM sweep on a block (sphere benchmark).
+// Linear fit in the total cell count and the fluid cell count.
+real_t fittedLBMWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
+{
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cCe = real_t(9.990957667738165e-06);
+   const real_t cF  = real_t(0.00015749920523711047);
+   const real_t c   = real_t(-0.08232498905584973);
+   return cCe * real_c(blockInfo.numberOfCells) + cF * real_c(blockInfo.numberOfFluidCells) + c;
+}
+
+// Load estimate of the boundary-handling sweep on a block (sphere benchmark).
+// Linear fit in the total cell count and the near-boundary cell count.
+real_t fittedBHWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
+{
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cCe = real_t(6.654810986939097e-06);
+   const real_t cNB = real_t(0.0007061414693533274);
+   const real_t c   = real_t(-0.1094292992294259);
+   return cCe * real_c(blockInfo.numberOfCells) + cNB * real_c(blockInfo.numberOfNearBoundaryCells) + c;
+}
+
+// Load estimate of the first coupling step (body mapping) on a block (sphere benchmark).
+// Linear fit in cell counts and local/shadow body counts.
+real_t fittedCoupling1WeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
+{
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cCe = real_t(3.07542641675429e-06);
+   const real_t cF  = real_t(2.419364600880769e-07);
+   const real_t cPl = real_t(0.01413718259604757);
+   const real_t cPs = real_t(0.027761707343462727);
+   const real_t c   = real_t(-0.13991481483939272);
+   return cCe * real_c(blockInfo.numberOfCells) + cF * real_c(blockInfo.numberOfFluidCells)
+        + cPl * real_c(blockInfo.numberOfLocalBodies) + cPs * real_c(blockInfo.numberOfShadowBodies) + c;
+}
+
+// Load estimate of the second coupling step (PDF restoration / force evaluation)
+// on a block (sphere benchmark). Linear fit in cell and body counts.
+real_t fittedCoupling2WeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
+{
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cCe = real_t(5.988401232749505e-06);
+   const real_t cF  = real_t(3.903532223977357e-06);
+   const real_t cPl = real_t(-0.008802674250816316);
+   const real_t cPs = real_t(0.02505020738346139);
+   const real_t c   = real_t(-0.12970723676003335);
+   return cCe * real_c(blockInfo.numberOfCells) + cF * real_c(blockInfo.numberOfFluidCells)
+        + cPl * real_c(blockInfo.numberOfLocalBodies) + cPs * real_c(blockInfo.numberOfShadowBodies) + c;
+}
+
+// Load estimate of the PE (rigid body dynamics) step on a block (sphere benchmark).
+// Quadratic in the total body count, linear in the remaining quantities,
+// scaled by the number of PE sub cycles per LBM step.
+real_t fittedPEWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
+{
+   const uint_t numLocal     = blockInfo.numberOfLocalBodies;
+   const uint_t numShadow    = blockInfo.numberOfShadowBodies;
+   const uint_t numContacts  = blockInfo.numberOfContacts;
+   const uint_t numSubCycles = blockInfo.numberOfPeSubCycles;
+
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cPlPs2 = real_t(1.1562854544700417e-06);
+   const real_t cPl    = real_t(0.0009620525068318354);
+   const real_t cPs    = real_t(0.00027549401081063894);
+   const real_t cCt    = real_t(0.0014801932788115464);
+   const real_t c      = real_t(0.01883682418448259);
+
+   const real_t numBodies = real_c(numLocal+numShadow);
+   return real_c(numSubCycles) * ( cPlPs2 * numBodies * numBodies + cPl * real_c(numLocal) + cPs * real_c(numShadow) + cCt * real_c(numContacts) + c );
+}
+
+// Total per-block load estimate for the sphere benchmark:
+// LBM + boundary handling + both coupling steps + PE.
+real_t fittedTotalWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
+{
+   real_t totalWeight = fittedLBMWeightEvaluationFunctionSpheres(blockInfo);
+   totalWeight += fittedBHWeightEvaluationFunctionSpheres(blockInfo);
+   totalWeight += fittedCoupling1WeightEvaluationFunctionSpheres(blockInfo);
+   totalWeight += fittedCoupling2WeightEvaluationFunctionSpheres(blockInfo);
+   totalWeight += fittedPEWeightEvaluationFunctionSpheres(blockInfo);
+   return totalWeight;
+}
+
+////////////////
+// Ellipsoids //
+////////////////
+// Load estimate of the LBM (+ boundary handling) part on a block (ellipsoid benchmark).
+// The boundary-handling term is only added if near-boundary cells exist on the block.
+real_t fittedLBMWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
+{
+   const uint_t numCells        = blockInfo.numberOfCells;
+   const uint_t numFluidCells   = blockInfo.numberOfFluidCells;
+   const uint_t numNearBoundary = blockInfo.numberOfNearBoundaryCells;
+
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   real_t weight = real_t(4.69973868717e-05) * real_c(numCells) + real_t(0.000110568537442) * real_c(numFluidCells);
+   if( numNearBoundary > uint_t(0) )
+      weight += real_t(5.96551488486e-05) * real_c(numCells) + real_t(-5.75351782026e-05) * real_c(numFluidCells) + real_t(0.000695800745231) * real_c(numNearBoundary);
+   return weight;
+}
+
+// Load estimate of the coupling steps on a block (ellipsoid benchmark).
+// Zero if no (local or shadow) bodies overlap the block.
+real_t fittedCouplingWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
+{
+   const uint_t numBodies = blockInfo.numberOfLocalBodies + blockInfo.numberOfShadowBodies;
+   if( numBodies == uint_t(0) )
+      return real_t(0);
+
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cCe = real_t(0.000176674935526);
+   const real_t cF  = real_t(-0.000170513513027);
+   const real_t cPl = real_t(0.0252031634776);
+   const real_t cPs = real_t(0.0356835220918);
+   return cCe * real_c(blockInfo.numberOfCells) + cF * real_c(blockInfo.numberOfFluidCells)
+        + cPl * real_c(blockInfo.numberOfLocalBodies) + cPs * real_c(blockInfo.numberOfShadowBodies);
+}
+
+// Load estimate of the PE step on a block (ellipsoid benchmark).
+// Quadratic in the total body count, scaled by the number of PE sub cycles.
+real_t fittedPEWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
+{
+   const uint_t numLocal     = blockInfo.numberOfLocalBodies;
+   const uint_t numShadow    = blockInfo.numberOfShadowBodies;
+   const uint_t numContacts  = blockInfo.numberOfContacts;
+   const uint_t numSubCycles = blockInfo.numberOfPeSubCycles;
+
+   // from fits with optimized D3Q19 LBM kernel (no forces)
+   const real_t cPlPs2 = real_t(8.24153555785e-06);
+   const real_t cPl    = real_t(0.00135966650494);
+   const real_t cPs    = real_t(0.00440464092538);
+   const real_t cCt    = real_t(0.0216278259881);
+
+   const real_t numBodies = real_c(numLocal+numShadow);
+   return real_c(numSubCycles) * ( cPlPs2 * numBodies * numBodies + cPl * real_c(numLocal) + cPs * real_c(numShadow) + cCt * real_c(numContacts) );
+}
+
+// Total per-block load estimate for the ellipsoid benchmark:
+// LBM (incl. boundary handling) + coupling + PE.
+real_t fittedTotalWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
+{
+   real_t totalWeight = fittedLBMWeightEvaluationFunctionEllipsoids(blockInfo);
+   totalWeight += fittedCouplingWeightEvaluationFunctionEllipsoids(blockInfo);
+   totalWeight += fittedPEWeightEvaluationFunctionEllipsoids(blockInfo);
+   return totalWeight;
+}
+
+
+// Functor that periodically prints the attached timing pool on the root process,
+// every 'interval' time steps of the given time loop (interval == 0 disables logging).
+struct TimingPoolLogger
+{
+   TimingPoolLogger( WcTimingPool & timingPool, const SweepTimeloop & timeloop, const uint_t interval )
+         : timingPool_( timingPool ), timeloop_( timeloop ), interval_( interval )
+   {
+   }
+
+   void operator()()
+   {
+      if( interval_ == uint_t(0) )
+         return; // logging disabled
+      if( timeloop_.getCurrentTimeStep() % interval_ == uint_t(0) )
+         timingPool_.logResultOnRoot();
+   }
+
+private:
+   WcTimingPool & timingPool_;
+   const SweepTimeloop & timeloop_;
+   uint_t interval_;
+};
+
+// Functor that periodically reduces the attached timing tree over all processes and
+// prints it on root, every 'interval' time steps (interval == 0 disables logging).
+struct TimingTreeLogger
+{
+   TimingTreeLogger( WcTimingTree & timingTree, const SweepTimeloop & timeloop, const uint_t interval )
+         : timingTree_( timingTree ), timeloop_( timeloop ), interval_( interval )
+   {
+   }
+
+   void operator()()
+   {
+      if( interval_ == uint_t(0) )
+         return; // logging disabled
+      if( timeloop_.getCurrentTimeStep() % interval_ != uint_t(0) )
+         return;
+      // reduce over all processes, then print on root
+      timingTree_.synchronize();
+      auto reducedTimingTree = timingTree_.getReduced();
+      WALBERLA_LOG_INFO_ON_ROOT( reducedTimingTree );
+   }
+
+private:
+   WcTimingTree & timingTree_;
+   const SweepTimeloop & timeloop_;
+   uint_t interval_;
+};
+
+/////////////////////
+// BLOCK STRUCTURE //
+/////////////////////
+
+// Marks every block (below the finest level) for refinement whose ghost-layer-extended
+// AABB intersects the refinement region.
+static void refinementSelection( SetupBlockForest& forest, uint_t levels, const AABB & refinementBox )
+{
+   const real_t finestDx    = real_t(1); // dx on finest level
+   const uint_t finestLevel = levels - uint_t(1);
+
+   for( auto block = forest.begin(); block != forest.end(); ++block )
+   {
+      const uint_t blockLevel = block->getLevel();
+      // grid spacing on this block's level
+      const real_t dxOnLevel = finestDx * real_c( uint_t(1) << (finestLevel - blockLevel) );
+
+      // extend block AABB by ghost layers
+      const AABB extendedBlockAABB = block->getAABB().getExtended( dxOnLevel * real_c(FieldGhostLayers) );
+
+      if( blockLevel < finestLevel && extendedBlockAABB.intersects( refinementBox ) )
+         block->setMarker( true );
+   }
+}
+
+// Assigns a level-dependent workload (2^level, since finer blocks are updated more often)
+// and a uniform memory estimate to every block of the setup forest.
+static void workloadAndMemoryAssignment( SetupBlockForest& forest )
+{
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      const uint_t levelDependentWorkload = uint_t(1) << blockIt->getLevel();
+      blockIt->setWorkload( numeric_cast< workload_t >( levelDependentWorkload ) );
+      blockIt->setMemory( numeric_cast< memory_t >(1) );
+   }
+}
+
+// Creates the domain decomposition: a statically refined block forest (refined towards
+// 'refinementBox') distributed to all MPI processes with the chosen strategy.
+// Supported strategies: "Hilbert", "Morton", "ParMetis", "Diffusive" (no static diffusive
+// balancer exists, so the initial distribution also uses the Hilbert curve).
+// 'useBox' disables the x/y periodicity (closed box); z is never periodic.
+static shared_ptr< StructuredBlockForest > createBlockStructure( const AABB & domainAABB, Vector3<uint_t> blockSizeInCells,
+                                                                 uint_t numberOfLevels, const AABB & refinementBox,
+                                                                 bool useBox, std::string loadDistributionStrategy,
+                                                                 bool keepGlobalBlockInformation = false )
+{
+   SetupBlockForest sforest;
+
+   // number of blocks if the whole domain were resolved on the finest level
+   Vector3<uint_t> numberOfFineBlocksPerDirection( uint_c(domainAABB.size(0)) / blockSizeInCells[0],
+                                                   uint_c(domainAABB.size(1)) / blockSizeInCells[1],
+                                                   uint_c(domainAABB.size(2)) / blockSizeInCells[2] );
+
+   for(uint_t i = 0; i < 3; ++i )
+   {
+      WALBERLA_CHECK_EQUAL( numberOfFineBlocksPerDirection[i] * blockSizeInCells[i], uint_c(domainAABB.size(i)),
+                            "Domain can not be decomposed in direction " << i << " into fine blocks of size " << blockSizeInCells[i] );
+   }
+
+   uint_t levelScalingFactor = ( uint_t(1) << ( numberOfLevels - uint_t(1) ) );
+   Vector3<uint_t> numberOfCoarseBlocksPerDirection( numberOfFineBlocksPerDirection / levelScalingFactor );
+
+   for(uint_t i = 0; i < 3; ++i )
+   {
+      WALBERLA_CHECK_EQUAL(numberOfCoarseBlocksPerDirection[i] * levelScalingFactor, numberOfFineBlocksPerDirection[i],
+                            "Domain can not be refined in direction " << i << " according to the specified number of levels!" );
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - refinement box: " << refinementBox);
+
+   MPIManager::instance()->useWorldComm();
+
+   sforest.addRefinementSelectionFunction( std::bind( refinementSelection, std::placeholders::_1, numberOfLevels, refinementBox ) );
+   sforest.addWorkloadMemorySUIDAssignmentFunction( workloadAndMemoryAssignment );
+
+   // periodic in x and y unless a closed box is requested; never periodic in z
+   Vector3<bool> periodicity( true, true, false);
+   if( useBox )
+   {
+      periodicity[0] = false;
+      periodicity[1] = false;
+   }
+   sforest.init( domainAABB,
+                 numberOfCoarseBlocksPerDirection[0], numberOfCoarseBlocksPerDirection[1], numberOfCoarseBlocksPerDirection[2],
+                 periodicity[0], periodicity[1], periodicity[2]);
+
+   // calculate process distribution
+   const memory_t memoryLimit = math::Limits< memory_t >::inf();
+
+   if( loadDistributionStrategy == "Hilbert" || loadDistributionStrategy == "Morton" || loadDistributionStrategy == "Diffusive" )
+   {
+      // "Diffusive" has no static counterpart -> also use the Hilbert curve for the initial distribution
+      const bool useHilbert = ( loadDistributionStrategy != "Morton" );
+      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
+   } else if ( loadDistributionStrategy == "ParMetis" )
+   {
+      blockforest::StaticLevelwiseParMetis::Algorithm algorithm = blockforest::StaticLevelwiseParMetis::Algorithm::PARMETIS_PART_GEOM_KWAY;
+      blockforest::StaticLevelwiseParMetis staticParMetis(algorithm);
+      sforest.balanceLoad( staticParMetis, uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
+   } else
+   {
+      // fixed: the closing quote was previously a stray "\t" escape
+      WALBERLA_ABORT("Load distribution strategy \"" << loadDistributionStrategy << "\" not implemented! - Aborting" );
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT( sforest );
+
+   // create StructuredBlockForest (encapsulates a newly created BlockForest)
+   shared_ptr< StructuredBlockForest > sbf =
+         make_shared< StructuredBlockForest >( make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, keepGlobalBlockInformation ),
+                                               blockSizeInCells[0], blockSizeInCells[1], blockSizeInCells[2]);
+   sbf->createCellBoundingBoxes();
+
+   return sbf;
+}
+
+/////////////////////////////////////
+// BOUNDARY HANDLING CUSTOMIZATION //
+/////////////////////////////////////
+// Block data handling that (re-)creates the boundary handling on every block,
+// including blocks newly created by refinement/coarsening (AlwaysInitialize).
+class MyBoundaryHandling : public blockforest::AlwaysInitializeBlockDataHandling< BoundaryHandling_T >
+{
+public:
+   // blocks is held as weak_ptr to avoid a lifetime cycle with the block storage
+   MyBoundaryHandling( const weak_ptr< StructuredBlockStorage > & blocks,
+                       const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID, const BlockDataID & bodyFieldID ) :
+         blocks_( blocks ), flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), bodyFieldID_ ( bodyFieldID )
+   {}
+
+   // creates the boundary handling instance for the given block (ownership passes to the block data system)
+   BoundaryHandling_T * initialize( IBlock * const block );
+
+private:
+
+   weak_ptr< StructuredBlockStorage > blocks_;
+
+   const BlockDataID flagFieldID_;
+   const BlockDataID pdfFieldID_;
+   const BlockDataID bodyFieldID_;
+
+
+}; // class MyBoundaryHandling
+
+// Builds the boundary handling for one block: registers the fluid flag, wires the
+// no-slip and moving-obstacle (momentum exchange) boundary conditions to the block's
+// fields and initializes all cells as fluid domain.
+BoundaryHandling_T * MyBoundaryHandling::initialize( IBlock * const block )
+{
+   WALBERLA_ASSERT_NOT_NULLPTR( block );
+
+   FlagField_T * flagField       = block->getData< FlagField_T >( flagFieldID_ );
+   PdfField_T *  pdfField        = block->getData< PdfField_T > ( pdfFieldID_ );
+   BodyField_T * bodyField       = block->getData< BodyField_T >( bodyFieldID_ );
+
+   // reuse the fluid flag if it was already registered on this flag field
+   const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag );
+
+   // the block storage must still be alive when blocks are initialized
+   auto blocksPtr = blocks_.lock();
+   WALBERLA_CHECK_NOT_NULLPTR( blocksPtr );
+
+   BoundaryHandling_T * handling = new BoundaryHandling_T( "moving obstacle boundary handling", flagField, fluid,
+                                                           boost::tuples::make_tuple( NoSlip_T( "NoSlip", NoSlip_Flag, pdfField ),
+                                                                                      MO_T( "MO", MO_Flag, pdfField, flagField, bodyField, fluid, *blocksPtr, *block ) ),
+                                                           BoundaryHandling_T::Mode::ENTIRE_FIELD_TRAVERSAL);
+
+   // mark the whole block interior (and ghost layers) as fluid
+   handling->fillWithDomain( FieldGhostLayers );
+
+   return handling;
+}
+
+
+//*******************************************************************************************************************
+
+
+//*******************************************************************************************************************
+/*!\brief Evaluating the position and velocity of the sediments
+ *
+ * Computes the mean z-position, mean z-velocity and maximum absolute z-velocity of all
+ * sediments (PE bodies), reduced over all processes, and optionally appends them to a file.
+ */
+//*******************************************************************************************************************
+class PropertyLogger
+{
+public:
+   PropertyLogger( SweepTimeloop* timeloop, const shared_ptr< StructuredBlockStorage > & blocks,
+                   const BlockDataID & bodyStorageID, const std::string & fileName, bool fileIO) :
+      timeloop_( timeloop ), blocks_( blocks ), bodyStorageID_( bodyStorageID ), fileName_( fileName ), fileIO_(fileIO),
+      meanPos_( real_t(0) ), meanVel_( real_t(0) ), maxVel_( real_t(0) )
+   {
+      if ( fileIO_ )
+      {
+         // write the file header once (root only)
+         WALBERLA_ROOT_SECTION()
+         {
+            std::ofstream file;
+            file.open( fileName_.c_str() );
+            file << "#\t pos\t vel\t maxVel\n";
+            file.close();
+         }
+      }
+   }
+
+   void operator()()
+   {
+      const uint_t timestep (timeloop_->getCurrentTimeStep() );
+
+      uint_t numSediments( uint_t(0));
+      real_t meanPos(real_t(0));
+      real_t meanVel(real_t(0));
+      real_t maxVel(real_t(0));
+
+      // accumulate over all bodies on all local blocks
+      for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            meanPos += bodyIt->getPosition()[2];
+            meanVel += bodyIt->getLinearVel()[2];
+            maxVel = std::max(maxVel, std::fabs(bodyIt->getLinearVel()[2]));
+            ++numSediments;
+         }
+      }
+
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::allReduceInplace( numSediments, mpi::SUM );
+         mpi::allReduceInplace( meanPos, mpi::SUM );
+         mpi::allReduceInplace( meanVel, mpi::SUM );
+         mpi::allReduceInplace( maxVel, mpi::MAX );
+      }
+
+      // guard against division by zero (NaN means) when no sediments exist yet
+      if( numSediments > uint_t(0) )
+      {
+         meanPos /= real_c(numSediments);
+         meanVel /= real_c(numSediments);
+      }
+
+      meanPos_ = meanPos;
+      meanVel_ = meanVel;
+      maxVel_ = maxVel;
+
+      if( fileIO_ )
+         writeToFile( timestep );
+   }
+
+   real_t getMeanPosition() const
+   {
+      return meanPos_;
+   }
+
+   real_t getMaxVelocity() const
+   {
+      return maxVel_;
+   }
+
+   real_t getMeanVelocity() const
+   {
+      return meanVel_;
+   }
+
+
+private:
+   // appends one line "<timestep>\t<meanPos>\t<meanVel>\t<maxVel>" to the log file (root only)
+   void writeToFile( uint_t timestep )
+   {
+      WALBERLA_ROOT_SECTION()
+      {
+         std::ofstream file;
+         file.open( fileName_.c_str(), std::ofstream::app );
+
+         file << timestep << "\t" << meanPos_ << "\t" << meanVel_ << "\t" << maxVel_ << "\n";
+         file.close();
+      }
+   }
+
+   SweepTimeloop* timeloop_;
+   shared_ptr< StructuredBlockStorage > blocks_;
+   const BlockDataID bodyStorageID_;
+   std::string fileName_;
+   bool fileIO_;
+
+   real_t meanPos_;
+   real_t meanVel_;
+   real_t maxVel_;
+};
+
+// Resets the boundary handling (removes all boundary flags) on every local block,
+// e.g. before the block structure or body mapping changes.
+void clearBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID )
+{
+   for( auto it = forest.begin(); it != forest.end(); ++it )
+   {
+      it->getData< BoundaryHandling_T >( boundaryHandlingID )->clear( FieldGhostLayers );
+   }
+}
+
+// Invalidates all body references (including ghost layers) on every local block,
+// e.g. before bodies are re-mapped after refinement or load balancing.
+void clearBodyField( BlockForest & forest, const BlockDataID & bodyFieldID )
+{
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      BodyField_T * bodyField = blockIt->getData<BodyField_T>(bodyFieldID);
+      bodyField->setWithGhostLayer( nullptr ); // nullptr instead of NULL macro (pe::BodyID is a pointer type)
+   }
+}
+
+// Re-initializes the boundary handling on every local block as pure fluid domain;
+// counterpart to clearBoundaryHandling().
+void recreateBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID )
+{
+   for( auto it = forest.begin(); it != forest.end(); ++it )
+   {
+      it->getData< BoundaryHandling_T >( boundaryHandlingID )->fillWithDomain( FieldGhostLayers );
+   }
+}
+
+
+// Sums total run times of selected timers on a given grid level.
+// Level-wise timers are stored in the pool as "<name> (<level>)" or, if the name carries
+// a "[...]" suffix, as "<name>(<level>) [...]". PE timers are not level-wise and are
+// only added on the finest level.
+class TimingEvaluator
+{
+public:
+   TimingEvaluator( WcTimingPool & levelwiseTimingPool, WcTimingTree & peTimingTree, uint_t numberOfLevels)
+   : levelwiseTimingPool_( levelwiseTimingPool ), peTimingTree_( peTimingTree ), numberOfLevels_( numberOfLevels )
+   {}
+
+   // returns the accumulated total time of all given timers on 'level'
+   real_t getTimings(const std::vector<std::string> & timerNames, uint_t level )
+   {
+      real_t timing = real_t(0);
+      for( auto timerIt = timerNames.begin(); timerIt != timerNames.end(); ++timerIt )
+      {
+         std::string timerNameLvlWise = *timerIt;
+         // put level between timer string and possible suffix
+         auto suffixBegin = timerNameLvlWise.find('[');
+         if( suffixBegin != std::string::npos)
+         {
+            // suffix detected
+            auto suffixEnd = timerNameLvlWise.rfind(']');
+            if( suffixEnd != std::string::npos)
+            {
+               auto timerString = timerNameLvlWise.substr(0,suffixBegin);
+               auto suffixString = timerNameLvlWise.substr(suffixBegin,suffixEnd-suffixBegin+1);
+
+               timerNameLvlWise = timerString + "(" + std::to_string(level) + ") " + suffixString;
+            }
+            else
+            {
+               WALBERLA_ABORT("Invalid timer string");
+            }
+         }
+         else
+         {
+            timerNameLvlWise += " (" + std::to_string(level) + ")";
+         }
+
+         if( levelwiseTimingPool_.timerExists(timerNameLvlWise))
+            timing += levelwiseTimingPool_[timerNameLvlWise].total();
+
+         // PE timers exist only once (not per level) -> attribute them to the finest level
+         if( level == numberOfLevels_- 1)
+         {
+            const std::string & timerNamePE = *timerIt;
+            if( peTimingTree_.timerExists(timerNamePE))
+               timing += peTimingTree_[timerNamePE].total();
+         }
+      }
+
+      return timing;
+   }
+
+
+private:
+
+   WcTimingPool & levelwiseTimingPool_;
+   WcTimingTree & peTimingTree_;
+   uint_t numberOfLevels_;
+};
+
+
+/*
+ * Sums the load estimate of all blocks residing on the given level, using the chosen
+ * load evaluation strategy ("LBM", "PE", "Fit"/"FitMulti").
+ * NOTE(review): find() on the info collections is not checked against end(); presumably
+ * the collections are guaranteed to contain an entry for every local block - verify at call sites.
+ */
+real_t weightEvaluation(BlockForest & forest,
+                        const shared_ptr<pe_coupling::InfoCollection>& couplingInfoCollection,
+                        const shared_ptr<pe::InfoCollection> & peInfoCollection,
+                        real_t peBlockBaseWeight,
+                        std::string loadEvaluationStrategy,
+                        uint_t level,
+                        bool useEllipsoids )
+{
+   real_t weight = real_t(0);
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      // only accumulate blocks on the requested level
+      if( forest.getLevel(*blockIt) != level) continue;
+
+
+      blockforest::Block* block = static_cast<blockforest::Block*> (&(*blockIt));
+      auto blockID = block->getId();
+
+      if(loadEvaluationStrategy == "LBM")
+      {
+         // cell-count-based default estimate
+         auto infoIt = couplingInfoCollection->find( blockID );
+         weight += pe_coupling::amr::defaultWeightEvaluationFunction(infoIt->second);
+
+      }else if(loadEvaluationStrategy == "PE")
+      {
+         // body-count-based estimate plus a constant base weight per block
+         auto infoIt = peInfoCollection->find( blockID );
+         weight += real_c(infoIt->second.numberOfLocalBodies) + peBlockBaseWeight;
+      }else if(loadEvaluationStrategy == "Fit" || loadEvaluationStrategy == "FitMulti")
+      {
+         // measured-performance fits (see load estimators above)
+         auto infoIt = couplingInfoCollection->find( blockID );
+         if( useEllipsoids )
+         {
+            weight += fittedTotalWeightEvaluationFunctionEllipsoids(infoIt->second);
+         }
+         else
+         {
+            weight += fittedTotalWeightEvaluationFunctionSpheres(infoIt->second);
+         }
+      }else
+      {
+         WALBERLA_ABORT("Load balancing strategy not defined");
+      }
+   }
+   return weight;
+}
+
+
+uint_t evaluateEdgeCut(BlockForest & forest)
+{
+
+   //note: only works for edges in uniform grids
+
+   uint_t edgecut = uint_t(0); // = edge weights between processes
+
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      blockforest::Block* block = static_cast<blockforest::Block*> (&(*blockIt));
+
+      real_t blockVolume = block->getAABB().volume();
+      real_t approximateEdgeLength = std::cbrt( blockVolume );
+
+      uint_t faceNeighborWeight = uint_c(approximateEdgeLength * approximateEdgeLength ); //common face
+      uint_t edgeNeighborWeight = uint_c(approximateEdgeLength); //common edge
+      uint_t cornerNeighborWeight = uint_c( 1 ); //common corner
+
+
+      for( const uint_t idx : blockforest::getFaceNeighborhoodSectionIndices() )
+      {
+         for( uint_t nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb )
+         {
+            if( block->neighborExistsRemotely(idx,nb) ) edgecut += faceNeighborWeight;
+         }
+      }
+
+      for( const uint_t idx : blockforest::getEdgeNeighborhoodSectionIndices() )
+      {
+         for( uint_t nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb )
+         {
+            if( block->neighborExistsRemotely(idx,nb) ) edgecut += edgeNeighborWeight;
+         }
+      }
+
+      for( const uint_t idx : blockforest::getCornerNeighborhoodSectionIndices() )
+      {
+         for( uint_t nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb )
+         {
+            if( block->neighborExistsRemotely(idx,nb) ) edgecut += cornerNeighborWeight;
+         }
+      }
+   }
+   return edgecut;
+}
+
+
+void evaluateTotalSimulationTimePassed(WcTimingPool & timeloopTimingPool, real_t & totalSimTime, real_t & totalLBTime)
+{
+   shared_ptr< WcTimingPool> reduced = timeloopTimingPool.getReduced(WcTimingPool::REDUCE_TOTAL, 0);
+
+   std::string simulationString("LBM refinement time step");
+   real_t totalTime = real_t(0);
+   WALBERLA_ROOT_SECTION(){
+      totalTime = (*reduced)[simulationString].total();
+   }
+   totalSimTime = totalTime;
+
+   std::string lbString("refinement checking");
+   real_t lbTime = real_t(0);
+   WALBERLA_ROOT_SECTION(){
+      lbTime = (*reduced)[lbString].total();
+   }
+   totalLBTime = lbTime;
+
+}
+
//*******************************************************************************************************************
/*!\brief Creates a dense layer of sediment bodies (spheres or prolate ellipsoids) near the top of the domain.
 *
 * Procedure:
 *  1. Random body positions are drawn inside \a generationDomain on the root process and broadcast, so that
 *     all processes create the bodies at identical positions.
 *  2. 100 PE time steps are carried out to resolve initial overlaps; all velocities are reset after each step
 *     so the bodies only separate without gaining momentum.
 *  3. With a reduced, upward-pointing gravity the bodies are pushed towards the top plane until the lowest body
 *     is above \a heightBorder, or until the packing height has converged (checked every 500 steps).
 *  4. Gravity is inverted and a few relaxation steps are carried out, then all velocities and the global
 *     acceleration are reset to zero.
 *
 * \param numberOfSediments         number of bodies to create
 * \param generationDomain          AABB in which the random initial positions are drawn
 * \param diameter                  sediment diameter (for ellipsoids: basis for the semi-axes)
 * \param heightBorder              z-coordinate the lowest body has to exceed for the packing to be accepted
 * \param peMaterial                PE material of the created bodies
 * \param cr                        PE collision response used to advance the bodies
 * \param syncCall                  synchronization routine, called after body creation and after each PE step
 * \param blocks                    block storage the bodies are created in
 * \param globalBodyStorage         storage for global PE bodies
 * \param bodyStorageID             block data ID of the local PE body storage
 * \param gravitationalAcceleration magnitude of the physical gravitational acceleration
 * \param useEllipsoids             create prolate ellipsoids instead of spheres
 * \param shortRun                  drastically limits the number of packing steps (for testing)
 */
//*******************************************************************************************************************
void createSedimentLayer(uint_t numberOfSediments, const AABB & generationDomain, real_t diameter, real_t heightBorder,
                         pe::MaterialID peMaterial,
                         pe::cr::HCSITS & cr, const std::function<void(void)> & syncCall,
                         const shared_ptr< StructuredBlockForest > & blocks,
                         const shared_ptr<pe::BodyStorage> & globalBodyStorage, BlockDataID bodyStorageID,
                         real_t gravitationalAcceleration, bool useEllipsoids, bool shortRun)
{
   WALBERLA_LOG_INFO_ON_ROOT("Starting creation of sediments");

   real_t xParticle = real_t(0);
   real_t yParticle = real_t(0);
   real_t zParticle = real_t(0);

   for( uint_t nSed = 0; nSed < numberOfSediments; ++nSed )
   {

      // draw the random position only on root ...
      WALBERLA_ROOT_SECTION()
      {
         xParticle = math::realRandom<real_t>(generationDomain.xMin(), generationDomain.xMax());
         yParticle = math::realRandom<real_t>(generationDomain.yMin(), generationDomain.yMax());
         zParticle = math::realRandom<real_t>(generationDomain.zMin(), generationDomain.zMax());
      }

      // ... and broadcast it, so every process creates the body at the same position
      WALBERLA_MPI_SECTION()
      {
         mpi::broadcastObject( xParticle );
         mpi::broadcastObject( yParticle );
         mpi::broadcastObject( zParticle );
      }

      if( useEllipsoids )
      {
         // prolate ellipsoids
         real_t axisFactor = real_t(1.5);
         // chosen such that the ellipsoid volume equals the volume of a sphere with the given diameter
         real_t axisFactor2 = std::sqrt(real_t(1)/axisFactor);
         real_t radius = diameter * real_t(0.5);
         pe::createEllipsoid( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), Vector3<real_t>(axisFactor*radius, axisFactor2*radius, axisFactor2*radius), peMaterial );
      }
      else
      {
         pe::createSphere( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), diameter * real_t(0.5), peMaterial );
      }

   }

   syncCall();

   // carry out 100 simulations to resolve all overlaps
   for( uint_t pet = uint_t(1); pet <= uint_t(100); ++pet )
   {
      cr.timestep( real_t(1) );
      syncCall();

      // reset all velocities to zero
      // (overlap resolution should only separate the bodies, not accelerate them)
      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
      {
         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
         {
            bodyIt->setLinearVel(Vector3<real_t>(real_t(0)));
            bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
         }
      }
   }


   const uint_t maxInitialPeSteps = (shortRun) ? uint_t(10) : uint_t(200000);
   const real_t dt_PE_init = real_t(1);

   // apply a reduced gravity in +z direction to push the bodies towards the top plane
   real_t gravityGeneration = real_t(0.1) * gravitationalAcceleration;
   cr.setGlobalLinearAcceleration(Vector3<real_t>(real_t(0), real_t(0), gravityGeneration));

   real_t oldMinBodyPosition = real_t(0);
   real_t convergenceLimit = std::fabs(gravityGeneration);
   for( uint_t pet = uint_t(1); pet <= maxInitialPeSteps; ++pet )
   {
      cr.timestep( dt_PE_init );
      syncCall();

      // determine the z-position of the lowest body over all processes
      real_t minBodyPosition = generationDomain.zMax();
      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
      {
         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
         {
            minBodyPosition = std::min(bodyIt->getPosition()[2], minBodyPosition);
         }
      }

      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace(minBodyPosition, mpi::MIN);
      }

      // packing accepted once even the lowest body has risen above the target height
      if( minBodyPosition > heightBorder ) break;

      // every 500 steps: stop early if the packing height has (relatively) converged
      if( pet % 500 == 0)
      {
         if( std::fabs(minBodyPosition - oldMinBodyPosition) / minBodyPosition  < convergenceLimit ) break;
         oldMinBodyPosition = minBodyPosition;
      }

      WALBERLA_ROOT_SECTION()
      {
         if( pet % 100 == 0)
         {
            WALBERLA_LOG_INFO("[" << pet << "] Min position of all bodies = " << minBodyPosition << " with goal height " << heightBorder);
         }
      }

   }

   // revert gravitational acceleration to 'real' direction
   cr.setGlobalLinearAcceleration(Vector3<real_t>(real_t(0), real_t(0), -gravityGeneration));

   // carry out a few time steps to relax the system towards the real condition
   const uint_t relaxationTimeSteps = uint_t(std::sqrt(real_t(2)/std::fabs(gravitationalAcceleration)));
   WALBERLA_LOG_INFO_ON_ROOT("Carrying out " << relaxationTimeSteps << " more time steps with correct gravity");
   for( uint_t pet = uint_t(1); pet <= relaxationTimeSteps; ++pet )
   {
      cr.timestep(dt_PE_init);
      syncCall();
   }

   WALBERLA_LOG_INFO_ON_ROOT("Sediment layer creation done!");

   // reset all velocities to zero
   Vector3<real_t> initialBodyVelocity(real_t(0));
   WALBERLA_LOG_INFO_ON_ROOT("Setting initial velocity " << initialBodyVelocity << " of all bodies");
   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
   {
      for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
      {
         bodyIt->setLinearVel(initialBodyVelocity);
         bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
      }
   }

   // the coupled simulation applies its own forces, so remove the artificial global acceleration again
   cr.setGlobalLinearAcceleration(Vector3<real_t>(real_t(0)));
}
+
+
+//*******************************************************************************************************************
+/*!\brief Simulation of settling particles inside a rectangular column filled with viscous fluid
+ *
+ * This application is used in the paper
+ *  Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", submitted to Computation
+ * in Section 4 to apply the load estimator and to evaluate different load distribution strategies.
+ *
+ * It, however, features several different command line arguments that can be used to tweak the simulation.
 * The setup can be horizontally periodic, a box or a hopper geometry (configurable, as in the paper).
+ * The size, resolution and used blocks for the domain partitioning can be changed.
+ * It even features adaptive mesh refinement, with different refinement criteria:
+ *  - particle based (always on, also for global bodies like bounding planes)
+ *  - optionally: vorticity- or gradient-based (with lower and upper limits)
+ * Since the paper, however, uses a uniform grid, many evaluation functionalities might not work properly for this case.
+ * Initially, all particles are pushed upwards to obtain a dense packing at the top plane.
+ *
+ * Most importantly, the load balancing can be modified:
+ *  - load estimation strategies:
+ *    - pure LBM = number of cells per block = constant workload per block
+ *    - pure PE = number of local particles + baseweight
+ *    - coupling based load estimator = use fitted function from Sec. 3 of paper
+ *  - load distribution strategies:
+ *    - space-filling curves: Hilbert and Morton
+ *    - ParMETIS (and several algorithms and parameters, also multiple constraints possible)
+ *    - diffusive (and options)
+ *  - load balancing (/refinement check ) frequency
+ */
+//*******************************************************************************************************************
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+   ///////////////////
+   // Customization //
+   ///////////////////
+
+   // simulation control
+   bool shortRun = false;
+   bool funcTest = false;
+   bool fileIO = true;
+   bool logging = false; // logging of physical components
+   uint_t vtkWriteFreqDD = 0; //domain decomposition
+   uint_t vtkWriteFreqBo = 0; //bodies
+   uint_t vtkWriteFreqFl = 0; //fluid
+   uint_t vtkWriteFreq = 0; //general
+   std::string baseFolder = "vtk_out_AMRSedimentSettling"; // folder for vtk and file output
+
+   // physical setup
+   real_t GalileoNumber = real_t(50);
+   real_t densityRatio = real_t(1.5);
+   real_t diameter = real_t(15);
+   real_t solidVolumeFraction = real_t(0.1);
+   uint_t blockSize = uint_t(32);
+   uint_t XBlocks = uint_t(12);
+   uint_t YBlocks = uint_t(12);
+   uint_t ZBlocks = uint_t(16);
+   bool useBox = false;
+   bool useHopper = false;
+   bool useEllipsoids = false;
+   real_t hopperRelHeight = real_t(0.5); // for hopper setup
+   real_t hopperRelOpening = real_t(0.3); // for hopper setup
+
+   uint_t timestepsOnFinestLevel = uint_t(80000);
+
+   //numerical parameters
+   bool averageForceTorqueOverTwoTimSteps = true;
+   uint_t numberOfLevels = uint_t(1);
+   uint_t refinementCheckFrequency = uint_t(100);
+   uint_t numPeSubCycles = uint_t(10);
+
+   // refinement criteria
+   real_t lowerFluidRefinementLimit = real_t(0);
+   real_t upperFluidRefinementLimit = std::numeric_limits<real_t>::infinity();
+   bool useVorticityCriterion = false;
+   bool useGradientCriterion = false;
+
+   // load balancing
+   std::string loadEvaluationStrategy = "LBM"; //LBM, PE, Fit
+   std::string loadDistributionStrategy = "Hilbert"; //Morton, Hilbert, ParMetis, Diffusive
+
+   real_t parMetis_ipc2redist = real_t(1000);
+   real_t parMetisTolerance = real_t(-1);
+   std::string parMetisAlgorithmString = "ADAPTIVE_REPART";
+
+   uint_t diffusionFlowIterations = uint_t(15);
+   uint_t diffusionMaxIterations = uint_t(20);
+
+
+   for( int i = 1; i < argc; ++i )
+   {
+      if( std::strcmp( argv[i], "--shortRun" )                 == 0 ) { shortRun = true; continue; }
+      if( std::strcmp( argv[i], "--funcTest" )                 == 0 ) { funcTest = true; continue; }
+      if( std::strcmp( argv[i], "--fileIO" )                   == 0 ) { fileIO = true; continue; }
+      if( std::strcmp( argv[i], "--logging" )                  == 0 ) { logging = true; continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqDD" )           == 0 ) { vtkWriteFreqDD = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqBo" )           == 0 ) { vtkWriteFreqBo = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqFl" )           == 0 ) { vtkWriteFreqFl = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreq" )             == 0 ) { vtkWriteFreq = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--baseFolder" )               == 0 ) { baseFolder = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--densityRatio" )             == 0 ) { densityRatio = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--Ga" )                       == 0 ) { GalileoNumber = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--diameter" )                 == 0 ) { diameter = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--blockSize" )                == 0 ) { blockSize = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--XBlocks" )                  == 0 ) { XBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--YBlocks" )                  == 0 ) { YBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--ZBlocks" )                  == 0 ) { ZBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--useBox" )                   == 0 ) { useBox = true; continue; }
+      if( std::strcmp( argv[i], "--useHopper" )                == 0 ) { useHopper = true; continue; }
+      if( std::strcmp( argv[i], "--hopperHeight" )             == 0 ) { hopperRelHeight = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--hopperOpening" )            == 0 ) { hopperRelOpening = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--timesteps" )                == 0 ) { timestepsOnFinestLevel = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--noForceAveraging" )         == 0 ) { averageForceTorqueOverTwoTimSteps = false; continue; }
+      if( std::strcmp( argv[i], "--numPeSubCycles" )           == 0 ) { numPeSubCycles = uint_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--numLevels" )                == 0 ) { numberOfLevels = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--refinementCheckFrequency" ) == 0 ) { refinementCheckFrequency = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--lowerLimit" )               == 0 ) { lowerFluidRefinementLimit = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--upperLimit" )               == 0 ) { upperFluidRefinementLimit = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--useVorticityCriterion" )    == 0 ) { useVorticityCriterion = true; continue; }
+      if( std::strcmp( argv[i], "--useGradientCriterion" )     == 0 ) { useGradientCriterion = true; continue; }
+      if( std::strcmp( argv[i], "--loadEvaluationStrategy" )   == 0 ) { loadEvaluationStrategy = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--loadDistributionStrategy" ) == 0 ) { loadDistributionStrategy = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--ipc2redist" )               == 0 ) { parMetis_ipc2redist = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--parMetisTolerance" )        == 0 ) { parMetisTolerance = std::atof( argv[++i] ); continue; }
+      if( std::strcmp( argv[i], "--parMetisAlgorithm" )        == 0 ) { parMetisAlgorithmString = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--diffusionFlowIterations" )  == 0 ) { diffusionFlowIterations = uint_c(std::atof(argv[++i])); continue; }
+      if( std::strcmp( argv[i], "--diffusionMaxIterations" )   == 0 ) { diffusionMaxIterations = uint_c(std::atof(argv[++i])); continue; }
+      if( std::strcmp( argv[i], "--useEllipsoids" )            == 0 ) { useEllipsoids = true; continue; }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   if( funcTest )
+   {
+      walberla::logging::Logging::instance()->setLogLevel(logging::Logging::LogLevel::WARNING);
+   }
+
+   if( fileIO || logging )
+   {
+      WALBERLA_ROOT_SECTION(){
+         // create base directory if it does not yet exist
+         filesystem::path tpath( baseFolder );
+         if( !filesystem::exists( tpath ) )
+            filesystem::create_directory( tpath );
+      }
+   }
+
+   if( useVorticityCriterion && useGradientCriterion )
+   {
+      WALBERLA_ABORT("Use either vorticity or gradient criterion for refinement!");
+   }
+
+   if( loadEvaluationStrategy != "LBM" && loadEvaluationStrategy != "PE" && loadEvaluationStrategy != "Fit" && loadEvaluationStrategy != "FitMulti")
+   {
+      WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+   }
+
+   if( vtkWriteFreq != 0 )
+   {
+      vtkWriteFreqDD = vtkWriteFreq;
+      vtkWriteFreqBo = vtkWriteFreq;
+      vtkWriteFreqFl = vtkWriteFreq;
+   }
+
+   if( diameter > real_c(blockSize) )
+   {
+      WALBERLA_LOG_WARNING("PE Body Synchronization might not work since bodies are large compared to block size!");
+   }
+
+   if( useHopper )
+   {
+      WALBERLA_CHECK(hopperRelHeight >= real_t(0) && hopperRelHeight <= real_t(1), "Invalid relative hopper height of " << hopperRelHeight);
+      WALBERLA_CHECK(hopperRelOpening >= real_t(0) && hopperRelOpening <= real_t(1), "Invalid relative hopper opening of " << hopperRelOpening);
+   }
+
+
+   //////////////////////////
+   // NUMERICAL PARAMETERS //
+   //////////////////////////
+
+   const Vector3<uint_t> domainSize( XBlocks * blockSize, YBlocks * blockSize, ZBlocks * blockSize );
+   const real_t domainVolume = real_t(domainSize[0] * domainSize[1] * domainSize[2]);
+   const real_t sphereVolume = math::M_PI / real_t(6) * diameter * diameter * diameter;
+   const uint_t numberOfSediments = uint_c(std::ceil(solidVolumeFraction * domainVolume / sphereVolume));
+
+   real_t expectedSedimentVolumeFraction = (useBox||useHopper) ? real_t(0.45) : real_t(0.52);
+   const real_t expectedSedimentedVolume = real_t(1)/expectedSedimentVolumeFraction * real_c(numberOfSediments) * sphereVolume;
+   const real_t expectedSedimentedHeight = std::max(diameter, expectedSedimentedVolume / real_c(domainSize[0] * domainSize[1]));
+
+   const real_t uRef = real_t(0.02);
+   const real_t xRef = diameter;
+   const real_t tRef = xRef / uRef;
+
+   const real_t gravitationalAcceleration = uRef * uRef / ( (densityRatio-real_t(1)) * diameter );
+   const real_t viscosity = uRef * diameter / GalileoNumber;
+   const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
+   const real_t tau = real_t(1) / omega;
+
+   const uint_t loggingDisplayFrequency = uint_t(100);
+
+   const real_t dx = real_t(1);
+   const real_t overlap = real_t( 1.5 ) * dx;
+
+   if( useVorticityCriterion && floatIsEqual(lowerFluidRefinementLimit, real_t(0)) && std::isinf(upperFluidRefinementLimit) )
+   {
+      // use computed criterion instead of user input
+      lowerFluidRefinementLimit = real_t(0.05) * uRef;
+      upperFluidRefinementLimit = real_t(0.1) * uRef;
+   }
+
+   const uint_t finestLevel = numberOfLevels - uint_t(1);
+   std::stringstream omega_msg;
+   for( uint_t i = 0; i < numberOfLevels; ++i )
+   {
+      real_t omegaLvl = lbm::collision_model::levelDependentRelaxationParameter( i, omega, finestLevel );
+      omega_msg << omegaLvl << " ( on level " << i << ", tau = " << real_t(1)/omega << " ), ";
+   }
+
+   const uint_t levelScalingFactor = ( uint_t(1) << finestLevel );
+   const uint_t lbmTimeStepsPerTimeLoopIteration = levelScalingFactor;
+
+   const uint_t timesteps = funcTest ? 1 : ( shortRun ? uint_t(100) : uint_t( timestepsOnFinestLevel / lbmTimeStepsPerTimeLoopIteration ) );
+
+   WALBERLA_LOG_INFO_ON_ROOT("Setup (in simulation, i.e. lattice, units):");
+   WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
+   WALBERLA_LOG_INFO_ON_ROOT(" - sediment diameter = " << diameter );
+   WALBERLA_LOG_INFO_ON_ROOT(" - Galileo number = " << GalileoNumber );
+   WALBERLA_LOG_INFO_ON_ROOT(" - number of sediments: " << numberOfSediments);
+   WALBERLA_LOG_INFO_ON_ROOT(" - densityRatio = " << densityRatio );
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid: relaxation time (tau) = " << tau << ", kin. visc = " << viscosity );
+   WALBERLA_LOG_INFO_ON_ROOT(" - gravitational acceleration = " << gravitationalAcceleration );
+   WALBERLA_LOG_INFO_ON_ROOT(" - reference values: x = " << xRef << ", t = " << tRef << ", vel = " << uRef);
+   WALBERLA_LOG_INFO_ON_ROOT(" - omega: " << omega_msg.str());
+   WALBERLA_LOG_INFO_ON_ROOT(" - number of levels: " << numberOfLevels);
+   WALBERLA_LOG_INFO_ON_ROOT(" - number of pe sub cycles: " << numPeSubCycles);
+   if( useVorticityCriterion )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using vorticity criterion with lower limit = " << lowerFluidRefinementLimit << " and upper limit = " << upperFluidRefinementLimit );
+   }
+   if( useGradientCriterion )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using gradient criterion with lower limit = " << lowerFluidRefinementLimit << " and upper limit = " << upperFluidRefinementLimit );
+   }
+   if( vtkWriteFreqDD > 0 )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of domain decomposition to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqDD);
+   }
+   if( vtkWriteFreqBo > 0 )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of bodies data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqBo);
+   }
+   if( vtkWriteFreqFl > 0 )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of fluid data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqFl);
+   }
+   if( useEllipsoids )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using (prolate) ellipsoids as sediments");
+   }
+   if( useBox )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using box setup");
+   }
+   else if ( useHopper )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using hopper setup");
+   }
+   else
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using horizontally periodic domain");
+   }
+
+
+
+   if( refinementCheckFrequency == 0 && numberOfLevels != 1 )
+   {
+      // determine check frequency automatically based on maximum admissable velocity and block sizes
+      real_t uMax = real_t(0.1);
+      refinementCheckFrequency = uint_c(( overlap + real_c(blockSize) - real_t(2) * real_t(FieldGhostLayers) * dx) / uMax) / lbmTimeStepsPerTimeLoopIteration;
+   }
+   WALBERLA_LOG_INFO_ON_ROOT(" - refinement / load balancing check frequency (coarse time steps): " << refinementCheckFrequency);
+   WALBERLA_LOG_INFO_ON_ROOT(" - load evaluation strategy: " << loadEvaluationStrategy);
+   WALBERLA_LOG_INFO_ON_ROOT(" - load distribution strategy: " << loadDistributionStrategy);
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   Vector3<uint_t> blockSizeInCells( blockSize );
+
+   AABB simulationDomain( real_t(0), real_t(0), real_t(0), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
+   AABB sedimentDomain( real_t(0), real_t(0), real_c(domainSize[2]) - expectedSedimentedHeight, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
+
+   AABB initialRefinementDomain = sedimentDomain;
+   if( useBox || useHopper )
+   {
+      // require finest levels also along bounding planes -> initially refine everywhere
+      initialRefinementDomain = simulationDomain;
+   }
+
+   auto blocks = createBlockStructure( simulationDomain, blockSizeInCells, numberOfLevels, initialRefinementDomain, (useBox||useHopper), loadDistributionStrategy );
+
+   //write initial domain decomposition to file
+   if( vtkWriteFreqDD > 0 )
+   {
+      vtk::writeDomainDecomposition( blocks, "initial_domain_decomposition", baseFolder );
+   }
+
+
+   /////////////////
+   // PE COUPLING //
+   /////////////////
+
+   // set up pe functionality
+   shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
+   pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
+
+   auto bodyStorageID  = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "pe Body Storage");
+   auto ccdID          = blocks->addBlockData(pe::ccd::createHashGridsDataHandling( globalBodyStorage, bodyStorageID ), "CCD");
+   BlockDataID fcdID   = (useEllipsoids) ? blocks->addBlockData( pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::GJKEPACollideFunctor>(), "FCD" )
+                                         : blocks->addBlockData(pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::AnalyticCollideFunctor>(), "FCD");
+
+   WcTimingTree timingTreePE;
+
+   // set up collision response
+   pe::cr::HCSITS cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, &timingTreePE );
+   cr.setMaxIterations(10);
+   cr.setRelaxationModel( pe::cr::HardContactSemiImplicitTimesteppingSolvers::ApproximateInelasticCoulombContactByDecoupling );
+
+   // set up synchronization procedure
+   std::function<void(void)> syncCall = std::bind( pe::syncNextNeighbors<BodyTypeTuple>, std::ref(blocks->getBlockForest()), bodyStorageID, &timingTreePE, overlap, false );
+
+   // create pe bodies
+
+   // add the sediments
+   auto peMaterial = pe::createMaterial( "mat", densityRatio, real_t(1), real_t(0.25), real_t(0.25), real_t(0), real_t(200), real_t(100), real_t(100), real_t(100) );
+
+   // create two planes at bottom and top of domain for a horizontally periodic box
+   pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,1), Vector3<real_t>(0,0,0), peMaterial );
+   pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,-1), Vector3<real_t>(0,0,simulationDomain.zMax()), peMaterial );
+   if( useBox )
+   {
+      // add four more planes to obtain a closed box
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(1,0,0), Vector3<real_t>(0,0,0), peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(-1,0,0), Vector3<real_t>(simulationDomain.xMax(),0,0), peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,1,0), Vector3<real_t>(0,0,0), peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,-1,0), Vector3<real_t>(0,simulationDomain.yMax(),0), peMaterial );
+   }
+   else if ( useHopper )
+   {
+      // box bounding planes
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(1,0,0), Vector3<real_t>(0,0,0), peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(-1,0,0), Vector3<real_t>(simulationDomain.xMax(),0,0), peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,1,0), Vector3<real_t>(0,0,0), peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,-1,0), Vector3<real_t>(0,simulationDomain.yMax(),0), peMaterial );
+
+      //hopper planes
+      real_t xMax = simulationDomain.xMax();
+      real_t yMax = simulationDomain.yMax();
+      real_t zMax = simulationDomain.zMax();
+      Vector3<real_t> p1(0,0,hopperRelHeight*zMax);
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(p1[2],0,hopperRelOpening*xMax-p1[0]), p1, peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,p1[2],hopperRelOpening*yMax-p1[0]), p1, peMaterial );
+
+      Vector3<real_t> p2(xMax,yMax,hopperRelHeight*zMax);
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(-p2[2],0,-((real_t(1)-hopperRelOpening)*xMax-p2[0])), p2, peMaterial );
+      pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,-p2[2],-((real_t(1)-hopperRelOpening)*yMax-p2[1])), p2, peMaterial );
+   }
+
+   AABB sedimentGenerationDomain( real_t(0), real_t(0), real_t(0.5)*real_c(domainSize[2]), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
+   createSedimentLayer(numberOfSediments, sedimentGenerationDomain, diameter, sedimentDomain.zMin(), peMaterial, cr, syncCall, blocks, globalBodyStorage, bodyStorageID, gravitationalAcceleration, useEllipsoids, shortRun );
+
+   // reset timer to not cover init stats
+   timingTreePE.reset();
+
+   // now we can use the information about the body positions to adapt the refinement
+
+   ///////////////////////////
+   // DYNAMIC REFINEMENT, 1 //
+   ///////////////////////////
+
+   auto & blockforest = blocks->getBlockForest();
+   blockforest.recalculateBlockLevelsInRefresh( true );
+   blockforest.alwaysRebalanceInRefresh( true ); //load balancing every time refresh is triggered
+   blockforest.reevaluateMinTargetLevelsAfterForcedRefinement( false );
+   blockforest.allowRefreshChangingDepth( false );
+   blockforest.allowMultipleRefreshCycles( false ); // otherwise info collections are invalid
+
+   {
+      blockforest::CombinedMinTargetLevelDeterminationFunctions initialMinTargetLevelDeterminationFunctions;
+
+      blockforest::AABBRefinementSelection aabbRefinementSelection;
+      aabbRefinementSelection.addAABB(sedimentDomain,finestLevel );
+      initialMinTargetLevelDeterminationFunctions.add( aabbRefinementSelection );
+
+      // refinement along global bodies (bounding planes) to have consistent mapping (required for CLI always, or SimpleBB with non-AABB planes)
+      real_t blockExtension = real_c(FieldGhostLayers);
+      pe_coupling::amr::GlobalBodyPresenceLevelDetermination globalBodyPresenceRefinement( globalBodyStorage, finestLevel, blockExtension, pe_coupling::selectGlobalBodies );
+      initialMinTargetLevelDeterminationFunctions.add(globalBodyPresenceRefinement);
+
+      blockforest.setRefreshMinTargetLevelDeterminationFunction( initialMinTargetLevelDeterminationFunctions );
+
+      for( uint_t refreshCycle = uint_t(0); refreshCycle < finestLevel; ++refreshCycle)
+      {
+
+         WALBERLA_LOG_INFO_ON_ROOT("Refreshing blockforest...")
+
+         // check refinement criterions and refine/coarsen if necessary
+         uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
+         blocks->refresh();
+         uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
+
+         if( stampBefore == stampAfter )
+         {
+            break;
+         }
+
+         WALBERLA_LOG_INFO_ON_ROOT("Recreating data structures..");
+
+         // rebuild PE data structures
+         pe::clearSynchronization( blockforest, bodyStorageID);
+
+         syncCall();
+
+         for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+         {
+            pe::ccd::ICCD* ccd = blockIt->getData< pe::ccd::ICCD >( ccdID );
+            ccd->reloadBodies();
+         }
+      }
+   }
+
+   uint_t numberOfInitialFineBlocks = blockforest.getNumberOfBlocks(finestLevel);
+   mpi::allReduceInplace(numberOfInitialFineBlocks, mpi::SUM);
+   WALBERLA_LOG_INFO_ON_ROOT("Total number of initial fine blocks in simulation: " << numberOfInitialFineBlocks);
+
+   uint_t numberOfProcesses = uint_c(MPIManager::instance()->numProcesses());
+
+
+   ////////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // create the lattice model
+   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth, finestLevel ) );
+
+   // add PDF field
+   BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel,
+                                                                         Vector3< real_t >( real_t(0) ), real_t(1),
+                                                                         FieldGhostLayers, field::zyxf );
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers );
+
+   // add body field
+   BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", NULL, field::zyxf, FieldGhostLayers );
+
+   // add velocity field and utility
+   BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::zyxf, uint_t(2) );
+
+   typedef lbm::VelocityFieldWriter< PdfField_T, VelocityField_T > VelocityFieldWriter_T;
+   BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldID, velocityFieldID ) );
+
+
+   shared_ptr<blockforest::communication::NonUniformBufferedScheme<stencil::D3Q27> > velocityCommunicationScheme = make_shared<blockforest::communication::NonUniformBufferedScheme<stencil::D3Q27> >( blocks );
+   velocityCommunicationScheme->addPackInfo( make_shared< field::refinement::PackInfo<VelocityField_T, stencil::D3Q27> >( velocityFieldID ) );
+
+   // add boundary handling & initialize outer domain boundaries
+   BlockDataID boundaryHandlingID = blocks->addBlockData( make_shared< MyBoundaryHandling >( blocks, flagFieldID, pdfFieldID, bodyFieldID ),
+                                                          "boundary handling" );
+
+   // map planes into the LBM simulation -> act as no-slip boundaries
+   //pe_coupling::mapBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, NoSlip_Flag, pe_coupling::selectGlobalBodies );
+   pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_Flag, pe_coupling::selectGlobalBodies );
+
+   // map pe bodies into the LBM simulation
+   pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_Flag, pe_coupling::selectRegularBodies );
+
+
+   // force averaging functionality
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   std::function<void(void)> storeForceTorqueInCont1 = std::bind(&pe_coupling::BodiesForceTorqueContainer::store, bodiesFTContainer1);
+
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   std::function<void(void)> setForceTorqueOnBodiesFromCont2 = std::bind(&pe_coupling::BodiesForceTorqueContainer::setOnBodies, bodiesFTContainer2);
+
+   shared_ptr<pe_coupling::ForceTorqueOnBodiesScaler> forceScaler = make_shared<pe_coupling::ForceTorqueOnBodiesScaler>(blocks, bodyStorageID, real_t(0.5));
+   std::function<void(void)> setForceScalingFactorToOne = std::bind(&pe_coupling::ForceTorqueOnBodiesScaler::resetScalingFactor,forceScaler,real_t(1));
+   std::function<void(void)> setForceScalingFactorToHalf = std::bind(&pe_coupling::ForceTorqueOnBodiesScaler::resetScalingFactor,forceScaler,real_t(0.5));
+
+   if( averageForceTorqueOverTwoTimSteps ) {
+      bodiesFTContainer2->store();
+
+      setForceScalingFactorToOne();
+   }
+
+   ///////////////////////////
+   // DYNAMIC REFINEMENT, 2 //
+   ///////////////////////////
+
+   blockforest::CombinedMinTargetLevelDeterminationFunctions minTargetLevelDeterminationFunctions;
+
+   // add refinement criterion based on particle presence
+   shared_ptr<pe_coupling::InfoCollection> couplingInfoCollection = walberla::make_shared<pe_coupling::InfoCollection>();
+   pe_coupling::amr::BodyPresenceLevelDetermination particlePresenceRefinement( couplingInfoCollection, finestLevel );
+
+   minTargetLevelDeterminationFunctions.add( particlePresenceRefinement );
+
+   // also add (possible) refinement criteria based on fluid quantities
+
+   if( useVorticityCriterion )
+   {
+      // add refinement criterion based on vorticity magnitude
+      field::FlagFieldEvaluationFilter<FlagField_T> flagFieldFilter( flagFieldID, Fluid_Flag );
+      lbm::refinement::VorticityBasedLevelDetermination< field::FlagFieldEvaluationFilter<FlagField_T> > vorticityRefinement(
+            velocityFieldID, flagFieldFilter, upperFluidRefinementLimit, lowerFluidRefinementLimit, finestLevel );
+
+      minTargetLevelDeterminationFunctions.add( vorticityRefinement );
+   }
+
+   if( useGradientCriterion )
+   {
+      // add refinement criterion based on velocity gradient magnitude
+      field::FlagFieldEvaluationFilter<FlagField_T> flagFieldFilter( flagFieldID, Fluid_Flag );
+      VectorGradientRefinement< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> > gradientRefinement(
+            velocityFieldID, flagFieldFilter, upperFluidRefinementLimit, lowerFluidRefinementLimit, finestLevel );
+
+      minTargetLevelDeterminationFunctions.add( gradientRefinement );
+   }
+
+   // refinement along global bodies (bounding planes) to have consistent mapping (required for CLI always, or SimpleBB with non-AABB planes)
+   real_t blockExtension = real_c(FieldGhostLayers);
+   pe_coupling::amr::GlobalBodyPresenceLevelDetermination globalBodyPresenceRefinement( globalBodyStorage, finestLevel, blockExtension, pe_coupling::selectGlobalBodies );
+   minTargetLevelDeterminationFunctions.add(globalBodyPresenceRefinement);
+
+   blockforest.setRefreshMinTargetLevelDeterminationFunction( minTargetLevelDeterminationFunctions );
+
+   bool curveAllGather = true;
+   bool balanceLevelwise = true;
+
+   real_t peBlockBaseWeight = real_t(1); //default value, might not be the best
+   shared_ptr<pe::InfoCollection> peInfoCollection = walberla::make_shared<pe::InfoCollection>();
+
+   if( loadDistributionStrategy == "Hilbert" || loadDistributionStrategy == "Morton")
+   {
+      if( loadDistributionStrategy == "Hilbert")
+      {
+         bool useHilbert = true;
+         blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( useHilbert, curveAllGather, balanceLevelwise ) );
+      }
+      else if (loadDistributionStrategy == "Morton" )
+      {
+         bool useHilbert = false;
+         blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( useHilbert, curveAllGather, balanceLevelwise ) );
+      }
+
+      blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+      blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+
+      if( loadEvaluationStrategy == "Fit" )
+      {
+         if( useEllipsoids )
+         {
+            pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionEllipsoids);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         } else{
+            pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionSpheres);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         }
+      }
+      else if( loadEvaluationStrategy == "PE" )
+      {
+         pe::amr::WeightAssignmentFunctor weightAssignmentFunctor(peInfoCollection, peBlockBaseWeight );
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else if( loadEvaluationStrategy == "LBM" )
+      {
+         pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else
+      {
+         WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+      }
+
+   }
+   else if( loadDistributionStrategy == "ParMetis")
+   {
+      uint_t ncon = 1;
+      if( loadEvaluationStrategy == "FitMulti")
+      {
+         ncon = 2;
+      }
+
+      blockforest::DynamicParMetis::Algorithm parMetisAlgorithm = blockforest::DynamicParMetis::stringToAlgorithm(parMetisAlgorithmString);
+      blockforest::DynamicParMetis::WeightsToUse parMetisWeightsToUse = blockforest::DynamicParMetis::WeightsToUse::PARMETIS_BOTH_WEIGHTS;
+      blockforest::DynamicParMetis::EdgeSource parMetisEdgeSource = blockforest::DynamicParMetis::EdgeSource::PARMETIS_EDGES_FROM_EDGE_WEIGHTS;
+
+      blockforest::DynamicParMetis dynamicParMetis(parMetisAlgorithm, parMetisWeightsToUse, parMetisEdgeSource, ncon);
+      dynamicParMetis.setipc2redist(parMetis_ipc2redist);
+
+      real_t loadImbalanceTolerance = (parMetisTolerance < real_t(1)) ? std::max(real_t(1.05), real_t(1) + real_t(1) / ( real_c(numberOfInitialFineBlocks) / real_c(numberOfProcesses) ) ) : parMetisTolerance;
+      std::vector<double> parMetisLoadImbalanceTolerance(ncon, double(loadImbalanceTolerance));
+      dynamicParMetis.setImbalanceTolerance(parMetisLoadImbalanceTolerance[0], 0);
+
+      WALBERLA_LOG_INFO_ON_ROOT(" - ParMetis configuration: ");
+      WALBERLA_LOG_INFO_ON_ROOT("   - algorithm = " << dynamicParMetis.algorithmToString() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - weights to use = " << dynamicParMetis.weightsToUseToString() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - edge source = " << dynamicParMetis.edgeSourceToString() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - ncon = " << ncon );
+      WALBERLA_LOG_INFO_ON_ROOT("   - ipc2redist parameter = " << dynamicParMetis.getipc2redist() );
+
+      blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::DynamicParMetisBlockInfoPackUnpack());
+      blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::DynamicParMetisBlockInfoPackUnpack());
+
+      if( loadEvaluationStrategy == "Fit" )
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("   - load imbalance tolerance = <" << parMetisLoadImbalanceTolerance[0] << ">" );
+         if( useEllipsoids )
+         {
+            pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionEllipsoids);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         } else{
+            pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionSpheres);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         }
+      }
+      else if( loadEvaluationStrategy == "FitMulti" )
+      {
+         double imbalanceTolerancePE = 10.;
+         parMetisLoadImbalanceTolerance[1] = std::min(imbalanceTolerancePE, real_c(MPIManager::instance()->numProcesses()));
+         WALBERLA_LOG_INFO_ON_ROOT("   - load imbalance tolerances = <" << parMetisLoadImbalanceTolerance[0] << ", " << parMetisLoadImbalanceTolerance[1] << ">" );
+         dynamicParMetis.setImbalanceTolerance(parMetisLoadImbalanceTolerance[1], 1);
+
+         if( useEllipsoids )
+         {
+            std::vector< std::function<real_t(const pe_coupling::BlockInfo&)> > weightEvaluationFunctions(ncon);
+            weightEvaluationFunctions[0] = fittedLBMWeightEvaluationFunctionEllipsoids;
+            weightEvaluationFunctions[1] = fittedPEWeightEvaluationFunctionEllipsoids;
+            pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, weightEvaluationFunctions);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         } else{
+            std::vector< std::function<real_t(const pe_coupling::BlockInfo&)> > weightEvaluationFunctions(ncon);
+            weightEvaluationFunctions[0] = fittedLBMWeightEvaluationFunctionSpheres;
+            weightEvaluationFunctions[1] = fittedPEWeightEvaluationFunctionSpheres;
+            pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, weightEvaluationFunctions);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         }
+      }
+      else if( loadEvaluationStrategy == "PE" )
+      {
+         pe::amr::MetisAssignmentFunctor weightAssignmentFunctor(peInfoCollection, peBlockBaseWeight );
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else if( loadEvaluationStrategy == "LBM" )
+      {
+         pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else
+      {
+         WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+      }
+
+      blockforest.setRefreshPhantomBlockMigrationPreparationFunction( dynamicParMetis );
+
+   }
+   else if( loadDistributionStrategy == "Diffusive")
+   {
+      using DB_T = blockforest::DynamicDiffusionBalance< blockforest::PODPhantomWeight<real_t> >;
+      DB_T dynamicDiffusion(diffusionMaxIterations, diffusionFlowIterations );
+      dynamicDiffusion.setMode(DB_T::Mode::DIFFUSION_PUSH);
+
+      WALBERLA_LOG_INFO_ON_ROOT(" - Dynamic diffusion configuration: ");
+      WALBERLA_LOG_INFO_ON_ROOT("   - max iterations = " << dynamicDiffusion.getMaxIterations() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - flow iterations = " << dynamicDiffusion.getFlowIterations());
+
+      blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+      blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+      blockforest.setRefreshPhantomBlockMigrationPreparationFunction( dynamicDiffusion );
+
+      if( loadEvaluationStrategy == "Fit" )
+      {
+         if( useEllipsoids )
+         {
+            pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionEllipsoids);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         } else{
+            pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionSpheres);
+            blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+         }
+      }
+      else if( loadEvaluationStrategy == "PE" )
+      {
+         pe::amr::WeightAssignmentFunctor weightAssignmentFunctor(peInfoCollection, peBlockBaseWeight );
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else if( loadEvaluationStrategy == "LBM" )
+      {
+         pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else
+      {
+         WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+      }
+
+   } else
+   {
+      WALBERLA_ABORT("Load distribution strategy \"" << loadDistributionStrategy << "\t not implemented! - Aborting" );
+   }
+
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // create the timeloop
+   SweepTimeloop timeloop( blocks->getBlockStorage(), timesteps );
+
+   if( vtkWriteFreqBo != uint_t(0) ) {
+
+      // pe bodies
+      if (useEllipsoids) {
+         auto bodyVtkOutput = make_shared<pe::EllipsoidVtkOutput>(bodyStorageID, blocks->getBlockStorage());
+         auto bodyVTK = vtk::createVTKOutput_PointData(bodyVtkOutput, "bodies", vtkWriteFreqBo, baseFolder);
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(bodyVTK), "VTK (sediment data)");
+
+      } else {
+         auto bodyVtkOutput = make_shared<pe::SphereVtkOutput>(bodyStorageID, blocks->getBlockStorage());
+         auto bodyVTK = vtk::createVTKOutput_PointData(bodyVtkOutput, "bodies", vtkWriteFreqBo, baseFolder);
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(bodyVTK), "VTK (sediment data)");
+      }
+   }
+
+   if( vtkWriteFreqFl != uint_t(0) ) {
+
+      // pdf field
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid_field", vtkWriteFreqFl, 0, false, baseFolder);
+
+      field::FlagFieldCellFilter<FlagField_T> fluidFilter(flagFieldID);
+      fluidFilter.addFlag(Fluid_Flag);
+      pdfFieldVTK->addCellInclusionFilter(fluidFilter);
+
+      pdfFieldVTK->addCellDataWriter(
+            make_shared<lbm::VelocityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "VelocityFromPDF"));
+      pdfFieldVTK->addCellDataWriter(
+            make_shared<lbm::DensityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "DensityFromPDF"));
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
+   }
+
+   if( vtkWriteFreqDD != uint_t(0) ) {
+      auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkWriteFreqDD, baseFolder );
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
+   }
+
+   WcTimingPool timeloopTiming;
+   shared_ptr<WcTimingPool> timeloopRefinementTiming = make_shared<WcTimingPool>();
+   shared_ptr<WcTimingPool> timeloopRefinementTimingLevelwise = make_shared<WcTimingPool>();
+
+
+   auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
+   auto refinementTimestep = lbm::refinement::makeTimeStep< LatticeModel_T, BoundaryHandling_T >( blocks, sweep, pdfFieldID, boundaryHandlingID );
+
+   refinementTimestep->enableTiming( timeloopRefinementTiming, timeloopRefinementTimingLevelwise );
+
+   // Averaging the force/torque over two time steps is said to damp oscillations of the interaction force/torque.
+   // See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
+   if( averageForceTorqueOverTwoTimSteps ) {
+
+      // store force/torque from hydrodynamic interactions in container1
+      refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(storeForceTorqueInCont1), "Force Storing", finestLevel);
+
+      // set force/torque from previous time step (in container2)
+      refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(setForceTorqueOnBodiesFromCont2), "Force setting", finestLevel);
+
+      // average the force/torque by scaling it with factor 1/2 (except in first timestep and directly after refinement, there it is 1)
+      refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(SharedFunctor<pe_coupling::ForceTorqueOnBodiesScaler>(forceScaler)), "Force averaging", finestLevel);
+      refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(setForceScalingFactorToHalf), "Force scaling adjustment", finestLevel);
+
+      // swap containers
+      refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::BodyContainerSwapper(bodiesFTContainer1, bodiesFTContainer2)), "Swap FT container", finestLevel);
+
+   }
+
+   Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
+   refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, gravitationalForce )), "Gravitational force", finestLevel );
+
+   // add pe timesteps
+   refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_t(1), numPeSubCycles)),
+                                                  "pe Time Step", finestLevel );
+
+   // add sweep for updating the pe body mapping into the LBM simulation
+   refinementTimestep->addPostStreamVoidFunction(lbm::refinement::SweepAsFunctorWrapper( pe_coupling::BodyMapping< BoundaryHandling_T >( blocks, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID,  MO_Flag, FormerMO_Flag, pe_coupling::selectRegularBodies ), blocks ),
+                                                 "Body Mapping", finestLevel );
+
+   // add sweep for restoring PDFs in cells previously occupied by pe bodies
+   typedef pe_coupling::EquilibriumReconstructor< LatticeModel_T, BoundaryHandling_T > Reconstructor_T;
+   Reconstructor_T reconstructor( blocks, boundaryHandlingID, pdfFieldID, bodyFieldID );
+   refinementTimestep->addPostStreamVoidFunction(lbm::refinement::SweepAsFunctorWrapper( pe_coupling::PDFReconstruction< LatticeModel_T, BoundaryHandling_T, Reconstructor_T > ( blocks,
+                                                 boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, reconstructor, FormerMO_Flag, Fluid_Flag ), blocks ),
+                                                 "PDF Restore", finestLevel );
+
+
+   // add LBM sweep with refinement
+   timeloop.addFuncBeforeTimeStep( makeSharedFunctor( refinementTimestep ), "LBM refinement time step" );
+
+   std::string loggingFileName( baseFolder + "/Logging_Ga");
+   loggingFileName += std::to_string(uint_c(GalileoNumber));
+   loggingFileName += "_lvl";
+   loggingFileName += std::to_string(numberOfLevels);
+   loggingFileName += ".txt";
+   if( logging  )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing logging output to file \"" << loggingFileName << "\"");
+   }
+   shared_ptr< PropertyLogger > logger = walberla::make_shared< PropertyLogger >( &timeloop, blocks, bodyStorageID,
+                                                                                  loggingFileName, fileIO );
+   if(logging)
+   {
+      timeloop.addFuncAfterTimeStep( SharedFunctor< PropertyLogger >( logger ), "Property logger" );
+   }
+
+
+   timeloop.addFuncAfterTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );
+
+
+   // add top level timing pool output
+   timeloop.addFuncAfterTimeStep( TimingPoolLogger( timeloopTiming, timeloop, loggingDisplayFrequency ), "Regular Timing Logger" );
+
+   // add regular refinement timing pool output
+   timeloop.addFuncAfterTimeStep( TimingPoolLogger( *timeloopRefinementTiming, timeloop, loggingDisplayFrequency ), "Refinement Timing Logger" );
+
+   // add level wise timing pool output
+   //if( numberOfLevels != uint_t(1))
+      timeloop.addFuncAfterTimeStep( TimingPoolLogger( *timeloopRefinementTimingLevelwise, timeloop, loggingDisplayFrequency ), "Refinement Levelwise Timing Logger" );
+
+   // add PE timing tree output
+   timeloop.addFuncAfterTimeStep( TimingTreeLogger( timingTreePE, timeloop, loggingDisplayFrequency ), "PE Timing Tree Timing Logger" );
+
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   uint_t loadEvaluationFrequency = refinementCheckFrequency;
+   TimingEvaluator timingEvaluator( *timeloopRefinementTimingLevelwise, timingTreePE, numberOfLevels );
+
+   // file for simulation infos
+   std::string infoFileName( baseFolder + "/simulation_info.txt");
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream file;
+      file.open( infoFileName.c_str(), std::fstream::out | std::fstream::trunc );
+      file << "#i\t t\t tSim\t tLB\t numProcs\t levelwise blocks (min/max/sum)\n";
+      file.close();
+   }
+
+   // process local timing measurements and predicted loads
+   std::string processLocalFiles(baseFolder + "/processLocalFiles");
+   WALBERLA_ROOT_SECTION()
+   {
+      filesystem::path tpath( processLocalFiles );
+      if( !filesystem::exists( tpath ) )
+         filesystem::create_directory( tpath );
+   }
+   std::string measurementFileProcessName(processLocalFiles + "/measurements_" + std::to_string(MPIManager::instance()->rank()) + ".txt");
+   {
+      std::ofstream file;
+      file.open( measurementFileProcessName.c_str(), std::fstream::out | std::fstream::trunc );
+      file << "#i\t t\t mTotSim\t mLB\t mLBM\t mBH\t mCoup1\t mCoup2\t mRB\t cLBM\t cRB\t numBlocks\n";
+      file.close();
+   }
+
+   std::string predictionFileProcessName(processLocalFiles + "/predictions_" + std::to_string(MPIManager::instance()->rank()) + ".txt");
+   {
+      std::ofstream file;
+      file.open( predictionFileProcessName.c_str(), std::fstream::out | std::fstream::trunc );
+      file << "#i\t t\t wlLBM\t wlBH\t wlCoup1\t wlCoup2\t wlRB\t edgecut\t numBlocks\n";
+      file.close();
+   }
+
+   std::vector<std::string> LBMTimer;
+   LBMTimer.push_back("collide");
+   LBMTimer.push_back("stream");
+   LBMTimer.push_back("stream & collide");
+
+   std::vector<std::string> bhTimer;
+   bhTimer.push_back("boundary handling");
+
+   std::vector<std::string> couplingTimer1;
+   couplingTimer1.push_back("Body Mapping");
+   std::vector<std::string> couplingTimer2;
+   couplingTimer2.push_back("PDF Restore");
+
+   std::vector<std::string> peTimer;
+   peTimer.push_back("Simulation Step.Collision Detection");
+   peTimer.push_back("Simulation Step.Collision Response Integration");
+   peTimer.push_back("Simulation Step.Collision Response Resolution.Collision Response Solving");
+
+   std::vector<std::string> LBMCommTimer;
+   LBMCommTimer.push_back("communication equal level [pack & send]");
+   LBMCommTimer.push_back("communication equal level [wait & unpack]");
+
+   std::vector<std::string> peCommTimer;
+   //Adapt if using different collision response (like DEM!)
+   peCommTimer.push_back("Simulation Step.Collision Response Resolution.Velocity Sync");
+   peCommTimer.push_back("Sync");
+
+
+   real_t terminationPosition = expectedSedimentedHeight;
+   real_t terminationVelocity = real_t(0.05) * uRef;
+
+   real_t oldmTotSim = real_t(0);
+   real_t oldmLB = real_t(0);
+
+   uint_t measurementFileCounter = uint_t(0);
+   uint_t predictionFileCounter = uint_t(0);
+
+   std::string loadEvaluationStep("load evaluation");
+
+   // time loop
+   for (uint_t i = 0; i < timesteps; ++i )
+   {
+
+      // evaluate measurements (note: reflect simulation behavior BEFORE the evaluation)
+      if( loadEvaluationFrequency > 0 && i % loadEvaluationFrequency == 0 && i > 0 && fileIO)
+      {
+
+         timeloopTiming[loadEvaluationStep].start();
+
+         // write process local timing measurements to files (per process, per load balancing step)
+         {
+
+            // evaluate all required timers
+            uint_t evalLevel = finestLevel;
+
+            real_t mTotSim = ( timeloopTiming.timerExists("LBM refinement time step") ) ? timeloopTiming["LBM refinement time step"].total() : real_t(0);
+
+            real_t mLB = ( timeloopTiming.timerExists("refinement checking") ) ? timeloopTiming["refinement checking"].total() : real_t(0);
+
+            real_t mLBM = timingEvaluator.getTimings(LBMTimer, evalLevel);
+            real_t mBH  = timingEvaluator.getTimings(bhTimer, evalLevel);
+            real_t mCoup1 = timingEvaluator.getTimings(couplingTimer1, evalLevel);
+            real_t mCoup2 = timingEvaluator.getTimings(couplingTimer2, evalLevel);
+            real_t mPE = timingEvaluator.getTimings(peTimer, evalLevel);
+
+            real_t cLBM = timingEvaluator.getTimings(LBMCommTimer, evalLevel);
+            real_t cRB = timingEvaluator.getTimings(peCommTimer, evalLevel);
+
+            auto & forest = blocks->getBlockForest();
+            uint_t numBlocks = forest.getNumberOfBlocks(finestLevel);
+
+            // write to process local file
+            std::ofstream file;
+            file.open( measurementFileProcessName.c_str(), std::ofstream::app  );
+            file << measurementFileCounter << "\t " << real_c(i) / tRef << "\t"
+                 << mTotSim - oldmTotSim << "\t" << mLB - oldmLB << "\t" << mLBM << "\t" << mBH << "\t" << mCoup1 << "\t"
+                 << mCoup2 << "\t" << mPE << "\t" << cLBM << "\t" << cRB << "\t" << numBlocks << "\n";
+            file.close();
+
+            oldmTotSim = mTotSim;
+            oldmLB = mLB;
+            measurementFileCounter++;
+
+            // reset timers to have measurement from evaluation to evaluation point
+            timeloopRefinementTimingLevelwise->clear();
+            timingTreePE.reset();
+
+         }
+
+         // evaluate general simulation infos (on root)
+         {
+            real_t totalTimeToCurrentTimestep, totalLBTimeToCurrentTimestep;
+            evaluateTotalSimulationTimePassed(timeloopTiming, totalTimeToCurrentTimestep, totalLBTimeToCurrentTimestep);
+            std::vector<math::DistributedSample> numberOfBlocksPerLevel(numberOfLevels);
+
+            auto & forest = blocks->getBlockForest();
+            for( uint_t lvl = 0; lvl < numberOfLevels; ++lvl)
+            {
+               uint_t numBlocks = forest.getNumberOfBlocks(lvl);
+               numberOfBlocksPerLevel[lvl].castToRealAndInsert(numBlocks);
+            }
+
+            for( uint_t lvl = 0; lvl < numberOfLevels; ++lvl)
+            {
+               numberOfBlocksPerLevel[lvl].mpiGatherRoot();
+            }
+
+            WALBERLA_ROOT_SECTION()
+            {
+               std::ofstream file;
+               file.open( infoFileName.c_str(), std::ofstream::app  );
+               file << i << "\t " << real_c(i) / tRef << "\t"
+                    << totalTimeToCurrentTimestep << "\t " << totalLBTimeToCurrentTimestep << "\t " << numberOfProcesses << "\t ";
+
+               for( uint_t lvl = 0; lvl < numberOfLevels; ++lvl)
+               {
+                  file << uint_c(numberOfBlocksPerLevel[numberOfLevels-1-lvl].min()) << "\t ";
+                  file << uint_c(numberOfBlocksPerLevel[numberOfLevels-1-lvl].max()) << "\t ";
+                  file << uint_c(numberOfBlocksPerLevel[numberOfLevels-1-lvl].sum()) << "\t ";
+               }
+               file << "\n";
+
+               file.close();
+            }
+         }
+
+         timeloopTiming[loadEvaluationStep].end();
+
+      }
+
+
+      if( refinementCheckFrequency != 0 && i % refinementCheckFrequency == 0)
+      {
+
+         WALBERLA_LOG_INFO_ON_ROOT("Checking for refinement and load balancing...")
+
+         std::string refinementCheckStep("refinement checking");
+         timeloopTiming[refinementCheckStep].start();
+
+         if( loadEvaluationStrategy != "LBM" ) {
+
+            // first evaluate all data that is required for the refinement checks
+
+            // update info collections for the particle presence based check and the load balancing:
+            auto &forest = blocks->getBlockForest();
+            pe_coupling::createWithNeighborhood<BoundaryHandling_T>(forest, boundaryHandlingID, bodyStorageID, ccdID,
+                                                                    fcdID, numPeSubCycles, *couplingInfoCollection);
+            pe::createWithNeighborhood(forest, bodyStorageID, *peInfoCollection);
+
+            // for the fluid property based check:
+            if (useVorticityCriterion || useGradientCriterion) {
+               velocityFieldWriter();
+               (*velocityCommunicationScheme)();
+            }
+
+            WALBERLA_LOG_INFO_ON_ROOT("Refreshing blockforest...")
+
+            // check refinement criterions and refine/coarsen if necessary
+            uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
+            blocks->refresh();
+            uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
+
+            bool recreatingNecessary = false;
+
+            if (stampBefore != stampAfter) {
+               recreatingNecessary = true;
+            }
+
+            bool reducedRecreationFlag = mpi::allReduce(recreatingNecessary, mpi::LOGICAL_OR);
+
+            if (reducedRecreationFlag != recreatingNecessary) {
+               WALBERLA_LOG_INFO("Reduced recreation flag different from individual one");
+            }
+
+            recreatingNecessary = reducedRecreationFlag;
+
+            if (recreatingNecessary) {
+
+               WALBERLA_LOG_INFO_ON_ROOT("Recreating data structures..");
+
+               // rebuild PE data structures
+               pe::clearSynchronization(blockforest, bodyStorageID);
+
+               syncCall();
+
+               for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+                  pe::ccd::ICCD *ccd = blockIt->getData<pe::ccd::ICCD>(ccdID);
+                  ccd->reloadBodies();
+               }
+
+               clearBoundaryHandling(forest, boundaryHandlingID);
+               clearBodyField(forest, bodyFieldID);
+
+               if (averageForceTorqueOverTwoTimSteps) {
+
+                  // clear containers from old values
+                  bodiesFTContainer1->clear();
+                  bodiesFTContainer2->clear();
+
+                  // initialize FT container on all blocks anew, i.e. with the currently acting force/torque, which is zero after the refinement step
+                  bodiesFTContainer2->store();
+
+                  // set force scaling factor to one after refinement since force history is not present on blocks after refinement
+                  // thus the usual averaging of 1/2 (over two time steps) can not be carried out, i.e. it would lead to 1/2 of the acting force
+                  // the scaling factor is thus adapted for the next timestep to 1, and then changed back to 1/2 (in the timeloop)
+                  setForceScalingFactorToOne();
+               }
+
+               recreateBoundaryHandling(forest, boundaryHandlingID);
+
+               // re-set the no-slip flags along the walls
+               pe_coupling::mapMovingBodies<BoundaryHandling_T>(*blocks, boundaryHandlingID, bodyStorageID,
+                                                                *globalBodyStorage, bodyFieldID, MO_Flag,
+                                                                pe_coupling::selectGlobalBodies);
+
+               // re-map the body into the domain (initializing the bodyField as well)
+               pe_coupling::mapMovingBodies<BoundaryHandling_T>(*blocks, boundaryHandlingID, bodyStorageID,
+                                                                *globalBodyStorage, bodyFieldID, MO_Flag,
+                                                                pe_coupling::selectRegularBodies);
+            }
+
+         }
+
+         timeloopTiming[refinementCheckStep].end();
+      }
+
+      // evaluate predictions (note: reflect the predictions for all upcoming simulations, thus the corresponding measurements have to be taken afterwards)
+      if( loadEvaluationFrequency > 0 && i % loadEvaluationFrequency == 0 && fileIO)
+      {
+
+         timeloopTiming[loadEvaluationStep].start();
+
+         // write process local load predictions to files (per process, per load balancing step)
+         {
+
+            real_t wlLBM = real_t(0);
+            real_t wlBH = real_t(0);
+            real_t wlCoup1 = real_t(0);
+            real_t wlCoup2 = real_t(0);
+            real_t wlRB = real_t(0);
+
+            auto & forest = blocks->getBlockForest();
+            pe_coupling::createWithNeighborhood<BoundaryHandling_T>(forest, boundaryHandlingID, bodyStorageID, ccdID, fcdID, numPeSubCycles, *couplingInfoCollection);
+
+            for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt ) {
+               blockforest::Block *block = static_cast<blockforest::Block *> (&(*blockIt));
+               auto blockID = block->getId();
+               auto infoIt = couplingInfoCollection->find(blockID);
+               auto blockInfo = infoIt->second;
+
+               if( useEllipsoids )
+               {
+
+                  WALBERLA_ABORT("Not yet implemented!");
+
+               }
+               else
+               {
+                  wlLBM   += fittedLBMWeightEvaluationFunctionSpheres(blockInfo);
+                  wlBH    += fittedBHWeightEvaluationFunctionSpheres(blockInfo);
+                  wlCoup1 += fittedCoupling1WeightEvaluationFunctionSpheres(blockInfo);
+                  wlCoup2 += fittedCoupling2WeightEvaluationFunctionSpheres(blockInfo);
+                  wlRB    += fittedPEWeightEvaluationFunctionSpheres(blockInfo);
+               }
+
+            }
+
+            // note: we count the edge weight doubled here in total (to and from the other process). ParMetis only counts one direction.
+            uint_t edgecut = evaluateEdgeCut(forest);
+
+            uint_t numBlocks = forest.getNumberOfBlocks(finestLevel);
+
+            std::ofstream file;
+            file.open( predictionFileProcessName.c_str(), std::ofstream::app  );
+            file << predictionFileCounter << "\t " << real_c(i) / tRef << "\t"
+                 << wlLBM << "\t" << wlBH << "\t" << wlCoup1 << "\t" << wlCoup2 << "\t" << wlRB << "\t"
+                 << edgecut << "\t" << numBlocks << "\n";
+            file.close();
+
+            predictionFileCounter++;;
+         }
+
+         timeloopTiming[loadEvaluationStep].end();
+
+      }
+
+      // perform a single simulation step
+      timeloop.singleStep( timeloopTiming );
+
+
+      if( logging )
+      {
+         real_t curMeanPos = logger->getMeanPosition();
+         real_t curMeanVel = logger->getMeanVelocity();
+         if( curMeanPos <= terminationPosition && std::fabs(curMeanVel) < terminationVelocity )
+         {
+            WALBERLA_LOG_INFO_ON_ROOT("Sediments passed terminal mean position " << terminationPosition << " and reached velocity " << curMeanVel << " - terminating simulation!");
+            break;
+         }
+      }
+
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace amr_sediment_settling
+
+// Entry point: forwards to the benchmark's main routine and propagates its
+// exit status (EXIT_SUCCESS / failure) so that calling scripts and job
+// schedulers can detect failures via the process return code.
+int main( int argc, char **argv ){
+   return amr_sediment_settling::main(argc, argv);
+}
diff --git a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/CMakeLists.txt b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/CMakeLists.txt
new file mode 100644
index 000000000..5d3db5ef3
--- /dev/null
+++ b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Benchmark applications for the fluid-particle coupling (pe_coupling) module.
+waLBerla_add_executable( NAME WorkloadEvaluation FILES WorkloadEvaluation.cpp DEPENDS blockforest boundary core field lbm pe pe_coupling postprocessing stencil timeloop vtk )
+
+# AMRSedimentSettling is only built when the build is configured with ParMETIS
+# support (presumably required for its load balancing - confirm in the source).
+if( WALBERLA_BUILD_WITH_PARMETIS )
+   waLBerla_add_executable( NAME AMRSedimentSettling FILES AMRSedimentSettling.cpp DEPENDS blockforest boundary core field lbm pe pe_coupling postprocessing stencil timeloop vtk )
+endif()
\ No newline at end of file
diff --git a/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp
new file mode 100644
index 000000000..038b054a5
--- /dev/null
+++ b/apps/benchmarks/AdaptiveMeshRefinementFluidParticleCoupling/WorkloadEvaluation.cpp
@@ -0,0 +1,985 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WorkloadEvaluation.cpp
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "boundary/all.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+#include "core/math/all.h"
+#include "core/SharedFunctor.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Reduce.h"
+#include "core/mpi/Broadcast.h"
+
+#include "domain_decomposition/SharedSweep.h"
+
+#include "field/AddToStorage.h"
+#include "field/communication/PackInfo.h"
+
+#include "lbm/boundary/all.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/MacroscopicValueCalculation.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/lattice_model/D3Q19.h"
+#include "lbm/lattice_model/ForceModel.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+#include "lbm/sweeps/SweepWrappers.h"
+#include "lbm/BlockForestEvaluation.h"
+
+#include "stencil/D3Q27.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "pe/basic.h"
+#include "pe/fcd/GJKEPACollideFunctor.h"
+#include "pe/vtk/BodyVtkOutput.h"
+#include "pe/vtk/EllipsoidVtkOutput.h"
+#include "pe/vtk/SphereVtkOutput.h"
+
+#include "pe_coupling/mapping/all.h"
+#include "pe_coupling/momentum_exchange_method/all.h"
+#include "pe_coupling/utility/all.h"
+
+#include "vtk/all.h"
+#include "field/vtk/all.h"
+#include "lbm/vtk/all.h"
+
+#include <vector>
+#include <iomanip>
+#include <iostream>
+
+namespace workload_evaluation
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+
+// PDF field, flag field & body field
+typedef lbm::D3Q19< lbm::collision_model::TRT, false>  LatticeModel_T;
+
+typedef LatticeModel_T::Stencil Stencil_T;
+typedef lbm::PdfField< LatticeModel_T > PdfField_T;
+
+typedef walberla::uint8_t                 flag_t;
+typedef FlagField< flag_t >               FlagField_T;
+typedef GhostLayerField< pe::BodyID, 1 >  BodyField_T;
+
+const uint_t FieldGhostLayers = 1;
+
+// boundary handling
+typedef pe_coupling::CurvedLinear< LatticeModel_T, FlagField_T > MO_CLI_T;
+
+typedef boost::tuples::tuple<MO_CLI_T >               BoundaryConditions_T;
+typedef BoundaryHandling< FlagField_T, Stencil_T, BoundaryConditions_T > BoundaryHandling_T;
+
+typedef boost::tuple<pe::Sphere, pe::Ellipsoid, pe::Plane> BodyTypeTuple;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag   ( "fluid" );
+const FlagUID MO_CLI_Flag  ( "moving obstacle CLI" );
+const FlagUID FormerMO_Flag( "former moving obstacle" );
+
+
+/////////////////////////////////////
+// BOUNDARY HANDLING CUSTOMIZATION //
+/////////////////////////////////////
+
+// Factory functor that creates one BoundaryHandling_T instance per block;
+// operator() is defined below and is intended to be invoked once for every
+// block when the boundary handling is registered as block data.
+class MyBoundaryHandling
+{
+public:
+
+   // The IDs reference the flag, pdf, and body fields that must already be
+   // registered as block data. useEntireFieldTraversal selects the traversal
+   // mode of the created boundary handling (see operator() below).
+   MyBoundaryHandling( const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID, const BlockDataID & bodyFieldID, bool useEntireFieldTraversal ) :
+      flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), bodyFieldID_ ( bodyFieldID ), useEntireFieldTraversal_( useEntireFieldTraversal ) {}
+
+   // Creates a new BoundaryHandling_T for the given block; the caller takes
+   // ownership of the returned raw pointer.
+   BoundaryHandling_T * operator()( IBlock* const block, const StructuredBlockStorage* const storage ) const;
+
+private:
+
+   const BlockDataID flagFieldID_;
+   const BlockDataID pdfFieldID_;
+   const BlockDataID bodyFieldID_;
+   bool useEntireFieldTraversal_;
+
+}; // class MyBoundaryHandling
+
+// Builds the boundary handling for one block: fetches the block-local fields,
+// registers the fluid flag, sets up the MO_CLI (curved-linear momentum
+// exchange) boundary condition, and initializes the cells as domain/fluid.
+BoundaryHandling_T * MyBoundaryHandling::operator()( IBlock * const block, const StructuredBlockStorage * const storage ) const
+{
+   WALBERLA_ASSERT_NOT_NULLPTR( block );
+   WALBERLA_ASSERT_NOT_NULLPTR( storage );
+
+   FlagField_T * flagField       = block->getData< FlagField_T >( flagFieldID_ );
+   PdfField_T *  pdfField        = block->getData< PdfField_T > ( pdfFieldID_ );
+   BodyField_T * bodyField       = block->getData< BodyField_T >( bodyFieldID_ );
+
+   // register the fluid flag on this block's flag field if not yet present
+   const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag );
+
+   // traversal mode of the boundary handling, selectable via --useEntireFieldTraversal
+   BoundaryHandling_T::Mode mode = (useEntireFieldTraversal_) ? BoundaryHandling_T::Mode::ENTIRE_FIELD_TRAVERSAL : BoundaryHandling_T::Mode::OPTIMIZED_SPARSE_TRAVERSAL ;
+
+   // raw new: ownership of the returned handling passes to the caller
+   // (presumably the block data mechanism - confirm at the registration site)
+   BoundaryHandling_T * handling = new BoundaryHandling_T( "fixed obstacle boundary handling", flagField, fluid,
+                                                           boost::tuples::make_tuple( MO_CLI_T ( "MO_CLI", MO_CLI_Flag, pdfField, flagField, bodyField, fluid, *storage, *block ) ),
+                                                           mode);
+
+   // boundaries are set by mapping the planes into the domain
+
+   // mark cells as domain (fluid), passing FieldGhostLayers as ghost-layer count
+   handling->fillWithDomain( FieldGhostLayers );
+
+   return handling;
+}
+
+
+// Provides the maximum penetration depth recorded by the collision response
+// solver, reduced (MAX) over all MPI processes.
+class CollisionPropertiesEvaluator
+{
+public:
+   // Keeps a reference to the collision response; it must outlive this object.
+   // explicit: prevents accidental implicit conversion from a solver reference.
+   explicit CollisionPropertiesEvaluator( pe::cr::ICR & collisionResponse ) : collisionResponse_( collisionResponse )
+   {}
+
+   // Returns the globally maximal |penetration| (same value on all processes).
+   real_t get()
+   {
+      real_t maxPen = std::fabs(collisionResponse_.getMaximumPenetration());
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::allReduceInplace( maxPen, mpi::MAX );
+      }
+      return maxPen;
+   }
+
+private:
+   pe::cr::ICR & collisionResponse_;
+};
+
+// Computes the maximum |contact distance| among all current body-body contacts
+// by (re-)running the coarse (CCD) and fine (FCD) collision detection on every
+// local block, reduced (MAX) over all MPI processes.
+class ContactDistanceEvaluator
+{
+public:
+   ContactDistanceEvaluator( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID ccdID, const BlockDataID fcdID ) :
+   blocks_( blocks ), ccdID_(ccdID), fcdID_(fcdID)
+   {}
+
+   // Returns max(|getDistance()|) over all contacts on all processes.
+   // NOTE(review): generating contacts here mutates the CCD/FCD internal state
+   // as a side effect - presumably harmless between PE time steps, confirm.
+   real_t get()
+   {
+      real_t maximumPenetration = real_t(0);
+      for (auto it = blocks_->begin(); it != blocks_->end(); ++it) {
+         IBlock &currentBlock = *it;
+
+         // re-run contact detection on this block to obtain the current contacts
+         pe::ccd::ICCD *ccd = currentBlock.getData<pe::ccd::ICCD>(ccdID_);
+         pe::fcd::IFCD *fcd = currentBlock.getData<pe::fcd::IFCD>(fcdID_);
+         ccd->generatePossibleContacts();
+         pe::Contacts& contacts = fcd->generateContacts( ccd->getPossibleContacts() );
+         size_t numContacts( contacts.size() );
+
+         for( size_t i = 0; i < numContacts; ++i )
+         {
+            const pe::ContactID c( &contacts[i] );
+            maximumPenetration = std::max( maximumPenetration, std::fabs(c->getDistance()));
+         }
+      }
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::allReduceInplace( maximumPenetration, mpi::MAX );
+      }
+      return maximumPenetration;
+   }
+private:
+   shared_ptr< StructuredBlockStorage > blocks_;
+   const BlockDataID ccdID_;
+   const BlockDataID fcdID_;
+};
+
+// Evaluates the maximum translational velocity of all local rigid bodies,
+// either component-wise (get) or as the largest velocity magnitude
+// (getMagnitude), reduced (MAX) over all MPI processes.
+class MaxVelocityEvaluator
+{
+public:
+   MaxVelocityEvaluator( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID bodyStorageID ) :
+         blocks_( blocks ), bodyStorageID_(bodyStorageID)
+   {}
+
+   // Component-wise maximum of |linear velocity| over all bodies and processes.
+   Vector3<real_t> get()
+   {
+      Vector3<real_t> maxVel( real_t(0) );
+
+      for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_ ); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            const auto velocity = bodyIt->getLinearVel();
+            for( uint_t d = uint_t(0); d < uint_t(3); ++d )
+               maxVel[d] = std::max( maxVel[d], std::fabs( velocity[d] ) );
+         }
+      }
+
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::allReduceInplace( maxVel[0], mpi::MAX );
+         mpi::allReduceInplace( maxVel[1], mpi::MAX );
+         mpi::allReduceInplace( maxVel[2], mpi::MAX );
+      }
+
+      return maxVel;
+   }
+
+   // Maximum velocity magnitude over all bodies and processes.
+   real_t getMagnitude()
+   {
+      real_t maxMagnitude = real_t(0);
+
+      for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_ ); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+            maxMagnitude = std::max( maxMagnitude, bodyIt->getLinearVel().length() );
+      }
+
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::allReduceInplace( maxMagnitude, mpi::MAX );
+      }
+
+      return maxMagnitude;
+   }
+
+private:
+   shared_ptr< StructuredBlockStorage > blocks_;
+   const BlockDataID bodyStorageID_;
+};
+
+// Adds, over all local blocks, the total number of interior cells, the number
+// of fluid/domain cells, and the number of near-boundary cells to the given
+// counters. The counters are accumulated, i.e. NOT reset by this function.
+void evaluateFluidQuantities(const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID boundaryHandlingID,
+                             uint_t & numCells, uint_t & numFluidCells, uint_t & numNBCells )
+{
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+   {
+      BoundaryHandling_T * handling = blockIt->getData< BoundaryHandling_T >( boundaryHandlingID );
+      const auto cellBB = handling->getFlagField()->xyzSize();
+      numCells += cellBB.numCells();
+
+      // classify every interior cell of this block
+      for( cell_idx_t z = cell_idx_t(cellBB.zMin()); z <= cell_idx_t(cellBB.zMax()); ++z )
+      {
+         for( cell_idx_t y = cell_idx_t(cellBB.yMin()); y <= cell_idx_t(cellBB.yMax()); ++y )
+         {
+            for( cell_idx_t x = cell_idx_t(cellBB.xMin()); x <= cell_idx_t(cellBB.xMax()); ++x )
+            {
+               if( handling->isDomain( x, y, z ) )
+                  ++numFluidCells;
+               if( handling->isNearBoundary( x, y, z ) )
+                  ++numNBCells;
+            }
+         }
+      }
+   }
+}
+
+// Adds, over all local blocks, the number of local rigid bodies, shadow
+// copies, and contacts to the given counters (counters are NOT reset here).
+void evaluatePEQuantities( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID bodyStorageID,
+                           const BlockDataID ccdID, const BlockDataID fcdID,
+                           uint_t & numLocalParticles, uint_t & numShadowParticles, uint_t & numContacts)
+{
+
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
+      pe::Storage * bodyStorage = blockIt->getData<pe::Storage>(bodyStorageID);
+      pe::BodyStorage const & localStorage  = (*bodyStorage)[pe::StorageType::LOCAL];
+      pe::BodyStorage const & shadowStorage = (*bodyStorage)[pe::StorageType::SHADOW];
+      numLocalParticles += localStorage.size();
+      numShadowParticles += shadowStorage.size();
+
+      // NOTE(review): counting contacts re-runs contact detection, which
+      // mutates CCD/FCD state - presumably harmless between PE steps, confirm.
+      pe::ccd::ICCD * ccd = blockIt->getData<pe::ccd::ICCD>(ccdID);
+      pe::fcd::IFCD * fcd = blockIt->getData<pe::fcd::IFCD>(fcdID);
+      ccd->generatePossibleContacts();
+      pe::Contacts& contacts = fcd->generateContacts( ccd->getPossibleContacts() );
+      numContacts += contacts.size();
+   }
+}
+
+// Sums, for each group of timer names in timerKeys, the total run times found
+// in the timing pool and the PE timing tree, and writes the result into the
+// corresponding entry of timings (scaled by 1000, i.e. reported as ms).
+void evaluateTimers(WcTimingPool & timingPool, WcTimingTree & peTimingTree,
+                    const std::vector<std::vector<std::string> > & timerKeys,
+                    std::vector<real_t> & timings )
+{
+   // start from zero - the loops below accumulate into timings
+   for( auto timingsIt = timings.begin(); timingsIt != timings.end(); ++timingsIt )
+      *timingsIt = real_t(0);
+
+   // make all timers known on all processes before querying them
+   timingPool.unifyRegisteredTimersAcrossProcesses();
+   peTimingTree.synchronize();
+
+   const real_t scalingFactor = real_t(1000); // milliseconds
+
+   for( uint_t i = uint_t(0); i < timerKeys.size(); ++i )
+   {
+      for( auto keyIt = timerKeys[i].begin(); keyIt != timerKeys[i].end(); ++keyIt )
+      {
+         const std::string & timerName = *keyIt;
+         if( timingPool.timerExists( timerName ) )
+            timings[i] += real_c( timingPool[timerName].total() ) * scalingFactor;
+         if( peTimingTree.timerExists( timerName ) )
+            timings[i] += real_c( peTimingTree[timerName].total() ) * scalingFactor;
+      }
+   }
+}
+
+// Discards all collected measurements so that the next evaluation interval
+// starts from zero.
+void resetTimers(WcTimingPool & timingPool, WcTimingTree & peTimingTree)
+{
+   timingPool.clear();
+   peTimingTree.reset();
+}
+
+//*******************************************************************************************************************
+/*! Application to evaluate the workload (time measurements) for a fluid-particle simulation
+ *
+ * This application is used in the paper
+ *  Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", submitted to Computation
+ * in Section 3 to develop and calibrate the workload estimator.
+ * The setup features settling particle inside a horizontally periodic box.
+ * A comprehensive description is given in Sec. 3.3 of the paper.
+ * It uses 4 x 4 x 5 blocks for domain partitioning.
+ * For each block ( = each process), the block local quantities are evaluated as well as the timing infos of
+ * the fluid-particle interaction algorithm. Those infos are then written to files that can be used later on
+ * for function fitting to obtain a workload estimator.
+ *
+ * NOTE: Since this estimator relies on timing measurements, this evaluation procedure should be carried out every time
+ * a different implementation, hardware or algorithm is used.
+ *
+ */
+//*******************************************************************************************************************
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+
+   real_t solidVolumeFraction = real_t(0.2);
+
+   // LBM / numerical parameters
+   uint_t blockSize  = uint_t(32);
+   real_t uSettling = real_t(0.1); // characteristic settling velocity
+   real_t diameter = real_t(10);
+
+   real_t Ga = real_t(30); //Galileo number
+   uint_t numPeSubCycles = uint_t(10);
+
+   uint_t vtkIOFreq = 0;
+   real_t timestepsNonDim = real_t(2.5);
+   uint_t numSamples = uint_t(2000);
+   std::string baseFolder = "workload_files"; // folder for vtk and file output
+
+   bool useEllipsoids = false;
+   bool optimizeForSmallObstacleFraction = false;
+   bool noFileOutput = false;
+   bool fixBodies = false;
+   bool useEntireFieldTraversal = true;
+   bool averageForceTorqueOverTwoTimSteps = true;
+   bool useFusedStreamCollide = false;
+
+
+   for( int i = 1; i < argc; ++i )
+   {
+      if( std::strcmp( argv[i], "--vtkIOFreq"               ) == 0 ) { vtkIOFreq = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--noFileOutput"            ) == 0 ) { noFileOutput = true; continue; }
+      if( std::strcmp( argv[i], "--basefolder"              ) == 0 ) { baseFolder = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--solidVolumeFraction"     ) == 0 ) { solidVolumeFraction = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--diameter"                ) == 0 ) { diameter = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--blockSize"               ) == 0 ) { blockSize = uint_c(std::atof( argv[++i]) ); continue; }
+      if( std::strcmp( argv[i], "--uSettling"               ) == 0 ) { uSettling = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--Ga"                      ) == 0 ) { Ga = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--timestepsNonDim"         ) == 0 ) { timestepsNonDim = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--numPeSubCycles"          ) == 0 ) { numPeSubCycles = uint_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--useEllipsoids"           ) == 0 ) { useEllipsoids = true; continue; }
+      if( std::strcmp( argv[i], "--optSmallSVF"             ) == 0 ) { optimizeForSmallObstacleFraction = true; continue; }
+      if( std::strcmp( argv[i], "--fixBodies"               ) == 0 ) { fixBodies = true; continue; }
+      if( std::strcmp( argv[i], "--useEntireFieldTraversal" ) == 0 ) { useEntireFieldTraversal = true; continue; }
+      if( std::strcmp( argv[i], "--numSamples"              ) == 0 ) { numSamples = uint_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--noForceAveraging"        ) == 0 ) { averageForceTorqueOverTwoTimSteps = false; continue; }
+      if( std::strcmp( argv[i], "--useFusedStreamCollide"   ) == 0 ) { useFusedStreamCollide = true; continue; }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   WALBERLA_CHECK(diameter > real_t(1));
+   WALBERLA_CHECK(uSettling > real_t(0));
+   WALBERLA_CHECK(Ga > real_t(0));
+   WALBERLA_CHECK(solidVolumeFraction > real_t(0));
+   WALBERLA_CHECK(solidVolumeFraction < real_t(0.65));
+
+   ///////////////////////////
+   // SIMULATION PROPERTIES //
+   ///////////////////////////
+
+   const uint_t XBlocks = uint_t(4);
+   const uint_t YBlocks = uint_t(4);
+   const uint_t ZBlocks = uint_t(5);
+
+   if( MPIManager::instance()->numProcesses() != XBlocks * YBlocks * ZBlocks )
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("WARNING! You have specified less or more processes than number of blocks -> the time measurements are no longer blockwise!")
+   }
+
+   if( diameter > real_c(blockSize) )
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("The bodies might be too large to work with the currently used synchronization!");
+   }
+
+
+   WALBERLA_LOG_INFO_ON_ROOT("Using setup with sedimenting particles -> creating two planes and applying gravitational force")
+   if( useEllipsoids ){ WALBERLA_LOG_INFO_ON_ROOT("using ELLIPSOIDS"); }
+   else{ WALBERLA_LOG_INFO_ON_ROOT("using SPHERES"); }
+
+
+   const uint_t XCells = blockSize * XBlocks;
+   const uint_t YCells = blockSize * YBlocks;
+   const uint_t ZCells = blockSize * ZBlocks;
+
+   const real_t topWallOffset = real_t(1.05) * real_t(blockSize); // move the top wall downwards to take away a certain portion of the overall domain
+
+
+   // determine number of spheres to generate, if necessary scale diameter a bit to reach desired solid volume fraction
+   real_t domainHeight = real_c(ZCells) - topWallOffset;
+   real_t fluidVolume =  real_c( XCells * YCells ) * domainHeight;
+   real_t solidVolume = solidVolumeFraction * fluidVolume;
+   uint_t numberOfParticles = uint_c(std::ceil(solidVolume / ( math::M_PI / real_t(6) * diameter * diameter * diameter )));
+   diameter = std::cbrt( solidVolume / ( real_c(numberOfParticles) * math::M_PI / real_t(6) ) );
+
+   real_t densityRatio = real_t(2.5);
+
+   real_t viscosity = uSettling * diameter / Ga;
+   const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
+
+   const real_t gravitationalAcceleration = uSettling * uSettling / ( (densityRatio-real_t(1)) * diameter );
+
+   real_t tref = diameter / uSettling;
+   real_t Tref = domainHeight / uSettling;
+
+   uint_t timesteps = uint_c(timestepsNonDim * Tref);
+
+   const real_t dx = real_c(1);
+   WALBERLA_LOG_INFO_ON_ROOT("viscosity = " << viscosity);
+   WALBERLA_LOG_INFO_ON_ROOT("tau = " << real_t(1)/omega);
+   WALBERLA_LOG_INFO_ON_ROOT("diameter = " << diameter);
+   WALBERLA_LOG_INFO_ON_ROOT("solid volume fraction = " << solidVolumeFraction);
+   WALBERLA_LOG_INFO_ON_ROOT("domain size (in cells) = " << XCells << " x " << ZCells << " x " << ZCells);
+   WALBERLA_LOG_INFO_ON_ROOT("number of bodies = " << numberOfParticles);
+   WALBERLA_LOG_INFO_ON_ROOT("gravitational acceleration = " << gravitationalAcceleration);
+   WALBERLA_LOG_INFO_ON_ROOT("Ga = " << Ga);
+   WALBERLA_LOG_INFO_ON_ROOT("uSettling = " << uSettling);
+   WALBERLA_LOG_INFO_ON_ROOT("tref = " << tref);
+   WALBERLA_LOG_INFO_ON_ROOT("Tref = " << Tref);
+   WALBERLA_LOG_INFO_ON_ROOT("timesteps = " << timesteps);
+   WALBERLA_LOG_INFO_ON_ROOT("number of workload samples = " << numSamples);
+
+   // create folder to store logging files
+   WALBERLA_ROOT_SECTION()
+   {
+      walberla::filesystem::path path1( baseFolder );
+      if( !walberla::filesystem::exists( path1 ) )
+         walberla::filesystem::create_directory( path1 );
+   }
+
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   Vector3<bool> periodicity( true );
+   periodicity[2] = false;
+
+   // create domain
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid( XBlocks, YBlocks, ZBlocks, blockSize, blockSize, blockSize, dx,
+                                                    0, false, false, //one block per process!
+                                                    periodicity[0], periodicity[1], periodicity[2], //periodicity
+                                                    false );
+
+   ////////
+   // PE //
+   ////////
+
+   shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
+   pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
+   auto bodyStorageID = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "pe Body Storage");
+   auto ccdID = blocks->addBlockData(pe::ccd::createHashGridsDataHandling( globalBodyStorage, bodyStorageID ), "CCD");
+   BlockDataID fcdID   = (useEllipsoids) ? blocks->addBlockData( pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::GJKEPACollideFunctor>(), "FCD" )
+                                         : blocks->addBlockData(pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::AnalyticCollideFunctor>(), "FCD");
+
+   WcTimingTree timingTreePE;
+
+   pe::cr::HCSITS cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, &timingTreePE );
+   cr.setMaxIterations(10);
+   cr.setRelaxationModel( pe::cr::HardContactSemiImplicitTimesteppingSolvers::ApproximateInelasticCoulombContactByDecoupling );
+   cr.setErrorReductionParameter(real_t(0.8));
+
+   /////////////////
+   // PE COUPLING //
+   /////////////////
+
+   // connect to pe
+   const real_t overlap = real_c( 1.5 ) * dx;
+
+   std::function<void(void)> syncCall = std::bind( pe::syncNextNeighbors<BodyTypeTuple>, std::ref(blocks->getBlockForest()), bodyStorageID, &timingTreePE, overlap, false );
+
+   auto generationDomain = AABB( real_t(0), real_t(0), real_t(0), real_c(XCells), real_c(YCells), real_c(ZCells) - topWallOffset);
+   auto peMaterial = pe::createMaterial( "mat", densityRatio, real_t(1), real_t(0.25), real_t(0.25), real_t(0), real_t(200), real_t(100), real_t(100), real_t(100) );
+
+   // create two planes at bottom and top of domain
+   pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,1), Vector3<real_t>(0,0,0), peMaterial );
+   pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,-1), Vector3<real_t>(0,0,real_c(ZCells)-topWallOffset), peMaterial );
+
+   real_t xParticle = real_t(0);
+   real_t yParticle = real_t(0);
+   real_t zParticle = real_t(0);
+
+   for( uint_t nPart = 0; nPart < numberOfParticles; ++nPart )
+   {
+
+      WALBERLA_ROOT_SECTION()
+      {
+         xParticle = math::realRandom<real_t>(generationDomain.xMin(), generationDomain.xMax());
+         yParticle = math::realRandom<real_t>(generationDomain.yMin(), generationDomain.yMax());
+         zParticle = math::realRandom<real_t>(generationDomain.zMin(), generationDomain.zMax());
+      }
+
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::broadcastObject( xParticle );
+         mpi::broadcastObject( yParticle );
+         mpi::broadcastObject( zParticle );
+      }
+
+      if( useEllipsoids )
+      {
+         // prolate ellipsoids
+         real_t axisFactor = real_t(1.5);
+         real_t axisFactor2 = std::sqrt(real_t(1)/axisFactor);
+         real_t radius = diameter * real_t(0.5);
+         pe::createEllipsoid( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), Vector3<real_t>(axisFactor*radius, axisFactor2*radius, axisFactor2*radius), peMaterial );
+
+      } else{
+         pe::createSphere( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), diameter * real_t(0.5), peMaterial );
+      }
+
+   }
+
+   syncCall();
+
+   // resolve possible overlaps of the particles due to the random initialization
+
+   // 100 iterations of solver to resolve all major overlaps
+   {
+      for( uint_t pet = uint_t(1); pet <= uint_t(100); ++pet )
+      {
+         cr.timestep( real_t(1) );
+         syncCall();
+
+         // reset all velocities to zero
+         for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+         {
+            for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+            {
+               bodyIt->setLinearVel(Vector3<real_t>(real_t(0)));
+               bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
+            }
+         }
+      }
+   }
+
+
+   // resolve remaining overlaps via particle simulation
+   {
+      const uint_t initialPeSteps = uint_t(2000);
+      const real_t dt_PE_init = real_t(1);
+      const real_t overlapLimit = real_t(0.001) * diameter;
+
+      WALBERLA_LOG_INFO_ON_ROOT("Particle creation done --- resolving overlaps with goal all < " << overlapLimit / diameter * real_t(100) << "%");
+
+      CollisionPropertiesEvaluator collisionPropertiesEvaluator( cr );
+
+      ContactDistanceEvaluator contactDistanceEvaluator(blocks, ccdID, fcdID);
+      MaxVelocityEvaluator maxVelEvaluator(blocks, bodyStorageID);
+
+      for( uint_t pet = uint_t(1); pet <= initialPeSteps; ++pet )
+      {
+         cr.timestep( dt_PE_init );
+         syncCall();
+         real_t maxPen = collisionPropertiesEvaluator.get();
+
+         if( maxPen < overlapLimit )
+         {
+            WALBERLA_LOG_INFO_ON_ROOT("Carried out " << pet << " PE-only time steps to resolve initial overlaps");
+            WALBERLA_LOG_INFO_ON_ROOT("Final max penetration from cr is " << maxPen << " = " << maxPen / diameter * real_t(100) << "%");
+
+            break;
+         }
+
+         real_t maxMagnitude = maxVelEvaluator.getMagnitude();
+
+         if( maxMagnitude * dt_PE_init > overlapLimit)
+         {
+            // avoid too large response velocities by setting them to zero
+            for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+            {
+               for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+               {
+                  bodyIt->setLinearVel(Vector3<real_t>(real_t(0)));
+                  bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
+               }
+            }
+         }
+         else
+         {
+            cr.setErrorReductionParameter(real_t(0.8));
+         }
+
+         if( pet % uint_t(20) == uint_t(0) )
+         {
+            WALBERLA_LOG_INFO_ON_ROOT(pet << " - current max overlap = " << maxPen / diameter * real_t(100) << "%, max vel magnitude = " << maxMagnitude );
+         }
+      }
+   }
+
+   // reset all velocities to zero
+   Vector3<real_t> initialBodyVelocity(real_t(0));
+
+   WALBERLA_LOG_INFO_ON_ROOT("Setting initial velocity " << initialBodyVelocity << " of all bodies");
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+   {
+      for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+      {
+         bodyIt->setLinearVel(initialBodyVelocity);
+         bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
+      }
+   }
+
+   ////////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // create the lattice model
+   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) );
+
+   // add PDF field
+   BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel,
+                                                                         Vector3< real_t >( real_t(0) ), real_t(1),
+                                                                         uint_t(1), field::zyxf );
+
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );
+
+   // add body field
+   BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", NULL, field::zyxf );
+
+   // add boundary handling & initialize outer domain boundaries
+   BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >(
+                                    MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID, useEntireFieldTraversal ), "boundary handling" );
+
+
+   // initially map pe bodies into the LBM simulation
+   pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_CLI_Flag );
+
+   lbm::BlockForestEvaluation<FlagField_T> bfEval(blocks, flagFieldID, Fluid_Flag);
+
+   WALBERLA_LOG_INFO_ON_ROOT(bfEval.loggingString());
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // create the timeloop
+   SweepTimeloop timeloop( blocks->getBlockStorage(), timesteps );
+
+   if( vtkIOFreq != uint_t(0) )
+   {
+      // pe bodies
+      if(useEllipsoids)
+      {
+         auto bodyVtkOutput = make_shared<pe::EllipsoidVtkOutput>( bodyStorageID, blocks->getBlockStorage() );
+         auto bodyVTK = vtk::createVTKOutput_PointData( bodyVtkOutput, "bodies", vtkIOFreq, baseFolder );
+         timeloop.addFuncBeforeTimeStep( vtk::writeFiles( bodyVTK ), "VTK (body data)" );
+
+      }else
+      {
+         auto bodyVtkOutput = make_shared<pe::SphereVtkOutput>( bodyStorageID, blocks->getBlockStorage() );
+         auto bodyVTK = vtk::createVTKOutput_PointData( bodyVtkOutput, "bodies", vtkIOFreq, baseFolder );
+         timeloop.addFuncBeforeTimeStep( vtk::writeFiles( bodyVTK ), "VTK (body data)" );
+      }
+
+
+      // flag field
+      auto flagFieldVTK = vtk::createVTKOutput_BlockData( blocks, "flag_field", vtkIOFreq, 1, false, baseFolder );
+      flagFieldVTK->addCellDataWriter( make_shared< field::VTKWriter< FlagField_T > >( flagFieldID, "FlagField" ) );
+      timeloop.addFuncAfterTimeStep( vtk::writeFiles( flagFieldVTK ), "VTK (flag field data)" );
+
+      // pdf field
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData( blocks, "fluid_field", vtkIOFreq, 0, false, baseFolder );
+
+      field::FlagFieldCellFilter< FlagField_T > fluidFilter( flagFieldID );
+      fluidFilter.addFlag( Fluid_Flag );
+      pdfFieldVTK->addCellInclusionFilter( fluidFilter );
+
+      pdfFieldVTK->addCellDataWriter( make_shared< lbm::VelocityVTKWriter< LatticeModel_T, float > >( pdfFieldID, "VelocityFromPDF" ) );
+      pdfFieldVTK->addCellDataWriter( make_shared< lbm::DensityVTKWriter < LatticeModel_T, float > >( pdfFieldID, "DensityFromPDF" ) );
+
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( pdfFieldVTK ), "VTK (fluid field data)" );
+
+      auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkIOFreq, baseFolder );
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
+   }
+
+   // sweep for updating the pe body mapping into the LBM simulation
+   timeloop.add()
+         << Sweep( pe_coupling::BodyMapping< BoundaryHandling_T >( blocks, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, MO_CLI_Flag, FormerMO_Flag, pe_coupling::selectRegularBodies ), "Body Mapping" );
+
+   // sweep for restoring PDFs in cells previously occupied by pe bodies
+   typedef pe_coupling::EquilibriumReconstructor< LatticeModel_T, BoundaryHandling_T > Reconstructor_T;
+   Reconstructor_T reconstructor( blocks, boundaryHandlingID, pdfFieldID, bodyFieldID);
+   timeloop.add()
+         << Sweep( pe_coupling::PDFReconstruction< LatticeModel_T, BoundaryHandling_T, Reconstructor_T >
+                         ( blocks, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, reconstructor, FormerMO_Flag, Fluid_Flag, pe_coupling::selectRegularBodies, optimizeForSmallObstacleFraction ), "PDF Restore" );
+
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   std::function<void(void)> storeForceTorqueInCont1 = std::bind(&pe_coupling::BodiesForceTorqueContainer::store, bodiesFTContainer1);
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   std::function<void(void)> setForceTorqueOnBodiesFromCont2 = std::bind(&pe_coupling::BodiesForceTorqueContainer::setOnBodies, bodiesFTContainer2);
+   shared_ptr<pe_coupling::ForceTorqueOnBodiesScaler> forceScaler = make_shared<pe_coupling::ForceTorqueOnBodiesScaler>(blocks, bodyStorageID, real_t(1));
+   std::function<void(void)> setForceScalingFactorToHalf = std::bind(&pe_coupling::ForceTorqueOnBodiesScaler::resetScalingFactor,forceScaler,real_t(0.5));
+
+   if( averageForceTorqueOverTwoTimSteps ) {
+      bodiesFTContainer2->store();
+   }
+
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+   std::function< void () > commFunction;
+
+   blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > scheme( blocks );
+   scheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField_T > >( pdfFieldID ) );
+   commFunction = scheme;
+
+   auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
+
+   if( !useFusedStreamCollide )
+   {
+      // streaming & collide
+      timeloop.add() << Sweep( makeCollideSweep(sweep), "Collide" );
+   }
+
+   // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
+   timeloop.add() << BeforeFunction( commFunction, "LBM Communication" )
+                  << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
+
+   if( useFusedStreamCollide )
+   {
+      // streaming & collide
+      timeloop.add() << Sweep( makeSharedSweep(sweep), "Stream&Collide" );
+   } else
+   {
+      // streaming & collide
+      timeloop.add() << Sweep( makeStreamSweep(sweep), "Stream" );
+
+   }
+
+   // Averaging the force/torque over two time steps is said to damp oscillations of the interaction force/torque.
+   // See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
+   if( averageForceTorqueOverTwoTimSteps ) {
+
+      // store force/torque from hydrodynamic interactions in container1
+      timeloop.addFuncAfterTimeStep(storeForceTorqueInCont1, "Force Storing");
+
+      // set force/torque from previous time step (in container2)
+      timeloop.addFuncAfterTimeStep(setForceTorqueOnBodiesFromCont2, "Force setting");
+
+      // average the force/torque by scaling it with factor 1/2 (except in the first time step, where the scaling factor is still at its initial value of 1)
+      timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesScaler(blocks, bodyStorageID, real_t(0.5)),  "Force averaging");
+      timeloop.addFuncAfterTimeStep( setForceScalingFactorToHalf, "Force scaling adjustment" );
+
+      // swap containers
+      timeloop.addFuncAfterTimeStep( pe_coupling::BodyContainerSwapper( bodiesFTContainer1, bodiesFTContainer2 ), "Swap FT container" );
+
+   }
+
+   real_t sphereVolume = diameter * diameter * diameter * math::M_PI / real_t(6);
+   Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
+   timeloop.addFuncAfterTimeStep(pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, gravitationalForce ), "Gravitational force" );
+
+   if( fixBodies ) {
+      // reset all forces
+      timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesResetter(blocks, bodyStorageID), "Force Resetting");
+   } else{
+      // add pe timesteps
+      timeloop.addFuncAfterTimeStep( pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_t(1), numPeSubCycles ), "pe Time Step" );
+   }
+
+   timeloop.addFuncAfterTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );
+
+
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+
+   std::vector< std::vector<std::string> > timerKeys;
+   std::vector<std::string> LBMTimer;
+   LBMTimer.push_back("Stream&Collide");
+   LBMTimer.push_back("Stream");
+   LBMTimer.push_back("Collide");
+   timerKeys.push_back(LBMTimer);
+
+   std::vector<std::string> bhTimer;
+   bhTimer.push_back("Boundary Handling");
+   timerKeys.push_back(bhTimer);
+
+   std::vector<std::string> couplingTimer1;
+   couplingTimer1.push_back("Body Mapping");
+   std::vector<std::string> couplingTimer2;
+   couplingTimer2.push_back("PDF Restore");
+   timerKeys.push_back(couplingTimer1);
+   timerKeys.push_back(couplingTimer2);
+
+   std::vector<std::string> peTimer;
+   peTimer.push_back("Simulation Step.Collision Detection");
+   peTimer.push_back("Simulation Step.Collision Response Integration");
+   peTimer.push_back("Simulation Step.Collision Response Resolution.Collision Response Solving");
+   timerKeys.push_back(peTimer);
+
+   uint_t numCells, numFluidCells, numNBCells, numLocalParticles, numShadowParticles, numContacts;
+   numCells = uint_t(0);
+   numFluidCells = uint_t(0);
+   numNBCells = uint_t(0);
+   numLocalParticles = uint_t(0);
+   numShadowParticles = uint_t(0);
+   numContacts = uint_t(0);
+
+   std::vector<real_t> timings(timerKeys.size());
+
+   resetTimers(timeloopTiming, timingTreePE);
+
+   // every rank writes its own file -> numProcesses number of samples!
+   int myRank = MPIManager::instance()->rank();
+
+   std::string logFileName = baseFolder + "/load";
+   logFileName += "_settling";
+   if( useEllipsoids)
+   {
+      logFileName += "_ellipsoids";
+   }
+   else
+   {
+      logFileName += "_spheres";
+   }
+   logFileName += "_d" + std::to_string(int_c(std::ceil(diameter)));
+   logFileName += "_bs" + std::to_string(blockSize);
+   logFileName += "_" + std::to_string(myRank) + ".txt";
+
+
+   std::ofstream file;
+
+   if(!noFileOutput)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Writing load info to file " << logFileName);
+      file.open( logFileName.c_str(), std::ofstream::app );
+      file << "# svf = " << solidVolumeFraction << ", d = " << diameter << ", domain = " << XCells << "x" << YCells << "x" << ZCells << "\n";
+   }
+
+
+   uint_t timeStepOfFirstTiming = uint_t(50);
+
+   if( timesteps - timeStepOfFirstTiming < numSamples )
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("Less actual time steps than number of required samples!");
+   }
+
+   uint_t nSample( 0 ); // number of current sample
+   real_t samplingFrequency = real_c(timesteps - timeStepOfFirstTiming) / real_c(numSamples);
+
+   // time loop
+   for (uint_t i = 1; i <= timesteps; ++i )
+   {
+      // perform a single simulation step
+      timeloop.singleStep( timeloopTiming );
+
+      // check if current time step should be included in sample
+      if( i >= uint_c( samplingFrequency * real_c(nSample) ) + timeStepOfFirstTiming )
+      {
+         // include -> evaluate all timers and quantities
+
+         evaluateFluidQuantities(blocks, boundaryHandlingID, numCells, numFluidCells, numNBCells);
+         evaluatePEQuantities(blocks, bodyStorageID, ccdID, fcdID, numLocalParticles, numShadowParticles, numContacts);
+
+         evaluateTimers(timeloopTiming, timingTreePE, timerKeys, timings);
+
+         if(!noFileOutput)
+         {
+            real_t totalTime = std::accumulate(timings.begin(), timings.end(), real_t(0) );
+
+            file << timeloop.getCurrentTimeStep() << " " << real_c(timeloop.getCurrentTimeStep()) / Tref << " "
+                 << numCells << " " << numFluidCells << " " << numNBCells << " "
+                 << numLocalParticles << " " << numShadowParticles << " " << numContacts << " " << numPeSubCycles;
+            for( auto timeIt = timings.begin(); timeIt != timings.end(); ++timeIt )
+            {
+               file << " " << *timeIt;
+            }
+            file << " " << totalTime << "\n";
+         }
+
+         numCells = uint_t(0);
+         numFluidCells = uint_t(0);
+         numNBCells = uint_t(0);
+         numLocalParticles = uint_t(0);
+         numShadowParticles = uint_t(0);
+         numContacts = uint_t(0);
+
+         ++nSample;
+      }
+
+      // reset timers to always include only a single time step in them
+      resetTimers(timeloopTiming, timingTreePE);
+   }
+
+   if(!noFileOutput) {
+      file.close();
+   }
+
+   //timeloopTiming.logResultOnRoot();
+
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished!");
+
+   return 0;
+
+}
+
+} //namespace workload_evaluation
+
+int main( int argc, char **argv ){
+   return workload_evaluation::main(argc, argv); // propagate benchmark exit status to the OS
+}
diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index 31e6417ae..4fa0c4cd0 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory( AdaptiveMeshRefinementFluidParticleCoupling )
 add_subdirectory( ComplexGeometry )
 add_subdirectory( DEM )
 add_subdirectory( MeshDistance )
-- 
GitLab