diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index d16b4255d1c2d48cc25d1d9587f453a7c321ce28..7d5901c16064c4d3b103d5af59b40db3d9aa6403 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory( DEM )
 add_subdirectory( MeshDistance )
 add_subdirectory( CouetteFlow )
 add_subdirectory( FluidParticleCoupling )
+add_subdirectory( FluidParticleCouplingWithLoadBalancing )
 add_subdirectory( ForcesOnSphereNearPlaneInShearFlow )
 add_subdirectory( GranularGas )
 add_subdirectory( IntegratorAccuracy )
diff --git a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/CMakeLists.txt b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4742f47b6648f374935d7d48f35d8ec3983c39f3
--- /dev/null
+++ b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/CMakeLists.txt
@@ -0,0 +1,3 @@
+waLBerla_add_executable( NAME FluidParticleWorkloadEvaluation FILES FluidParticleWorkloadEvaluation.cpp DEPENDS blockforest boundary core field lbm lbm_mesapd_coupling mesa_pd postprocessing timeloop vtk )
+
+waLBerla_add_executable( NAME FluidParticleWorkloadDistribution FILES FluidParticleWorkloadDistribution.cpp DEPENDS blockforest boundary core field lbm lbm_mesapd_coupling mesa_pd postprocessing timeloop vtk )
diff --git a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9f93af67b37cfec23a3543948c70da55c388542b
--- /dev/null
+++ b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadDistribution.cpp
@@ -0,0 +1,1730 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FluidParticleWorkloadDistribution.cpp
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "blockforest/loadbalancing/all.h"
+#include "blockforest/AABBRefinementSelection.h"
+
+#include "boundary/all.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/mpi/Broadcast.h"
+
+#include "domain_decomposition/SharedSweep.h"
+#include "domain_decomposition/BlockSweepWrapper.h"
+
+#include "field/AddToStorage.h"
+#include "field/StabilityChecker.h"
+#include "field/communication/PackInfo.h"
+
+#include "lbm/boundary/all.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/field/VelocityFieldWriter.h"
+#include "lbm/lattice_model/D3Q19.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+
+#include "lbm_mesapd_coupling/amr/BlockInfo.h"
+#include "lbm_mesapd_coupling/amr/InfoCollection.h"
+#include "lbm_mesapd_coupling/amr/weight_assignment/WeightEvaluationFunctions.h"
+#include "lbm_mesapd_coupling/amr/weight_assignment/WeightAssignmentFunctor.h"
+#include "lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.h"
+#include "lbm_mesapd_coupling/mapping/ParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/Reconstructor.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/PdfReconstructionManager.h"
+#include "lbm_mesapd_coupling/utility/AddForceOnParticlesKernel.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+#include "lbm_mesapd_coupling/DataTypes.h"
+#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h"
+#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h"
+#include "lbm_mesapd_coupling/utility/InspectionProbe.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/shape/HalfSpace.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/domain/BlockForestDataHandling.h"
+#include "mesa_pd/kernel/AssocToBlock.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/ExplicitEuler.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/ClearNextNeighborSync.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/SyncNextNeighborsBlockForest.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/ReduceContactHistory.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "vtk/all.h"
+#include "field/vtk/all.h"
+#include "lbm/vtk/all.h"
+
+#include "Utility.h"
+
+namespace fluid_particle_workload_distribution
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+
+using LatticeModel_T = lbm::D3Q19< lbm::collision_model::TRT, false >;
+using Stencil_T = LatticeModel_T::Stencil;
+using PdfField_T = lbm::PdfField<LatticeModel_T>;
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+using ParticleField_T = lbm_mesapd_coupling::ParticleField_T;
+using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+
+const uint_t FieldGhostLayers = 1;
+
+using NoSlip_T = lbm::NoSlip<LatticeModel_T, flag_t> ;
+using MO_T = lbm_mesapd_coupling::CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >;
+using BoundaryHandling_T = BoundaryHandling< FlagField_T, Stencil_T, NoSlip_T, MO_T >;
+
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag( "fluid" );
+const FlagUID NoSlip_Flag( "no slip" );
+const FlagUID MO_Flag( "moving obstacle" );
+const FlagUID FormerMO_Flag( "former moving obstacle" );
+
+
+///////////////////////////////////
+// LOAD BALANCING FUNCTIONALITY //
+//////////////////////////////////
+
+
+
+/////////////////////
+// BLOCK STRUCTURE //
+/////////////////////
+
+static void workloadAndMemoryAssignment( SetupBlockForest& forest )
+{
+   for (auto &block : forest) {
+      block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+      block.setMemory( numeric_cast< memory_t >(1) );
+   }
+}
+
+static shared_ptr< StructuredBlockForest > createBlockStructure( const AABB & domainAABB, Vector3<uint_t> blockSizeInCells,
+                                                                 bool useBox, const std::string & loadDistributionStrategy,
+                                                                 bool keepGlobalBlockInformation = false )
+{
+   SetupBlockForest sforest;
+
+   Vector3<uint_t> numberOfBlocksPerDirection( uint_c(domainAABB.size(0)) / blockSizeInCells[0],
+                                               uint_c(domainAABB.size(1)) / blockSizeInCells[1],
+                                               uint_c(domainAABB.size(2)) / blockSizeInCells[2] );
+
+   for(uint_t i = 0; i < 3; ++i )
+   {
+      WALBERLA_CHECK_EQUAL( numberOfBlocksPerDirection[i] * blockSizeInCells[i], uint_c(domainAABB.size(i)),
+                            "Domain can not be decomposed in direction " << i << " into fine blocks of size " << blockSizeInCells[i] );
+   }
+
+   sforest.addWorkloadMemorySUIDAssignmentFunction( workloadAndMemoryAssignment );
+
+   Vector3<bool> periodicity( true, true, false);
+   if( useBox )
+   {
+      periodicity[0] = false;
+      periodicity[1] = false;
+   }
+   sforest.init( domainAABB,
+                 numberOfBlocksPerDirection[0], numberOfBlocksPerDirection[1], numberOfBlocksPerDirection[2],
+                 periodicity[0], periodicity[1], periodicity[2]);
+
+   // calculate process distribution
+   const memory_t memoryLimit = math::Limits< memory_t >::inf();
+
+   if( loadDistributionStrategy == "Hilbert" )
+   {
+      bool useHilbert = true;
+      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
+   } else if ( loadDistributionStrategy == "Morton" )
+   {
+      bool useHilbert = false;
+      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
+   } else if ( loadDistributionStrategy == "ParMetis" )
+   {
+      blockforest::StaticLevelwiseParMetis::Algorithm algorithm = blockforest::StaticLevelwiseParMetis::Algorithm::PARMETIS_PART_GEOM_KWAY;
+      blockforest::StaticLevelwiseParMetis staticParMetis(algorithm);
+      sforest.balanceLoad( staticParMetis, uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
+   } else if (loadDistributionStrategy == "Diffusive" )
+   {
+      // also use Hilbert curve here
+      bool useHilbert = true;
+      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
+   } else
+   {
+      WALBERLA_ABORT("Load distribution strategy \"" << loadDistributionStrategy << "\t not implemented! - Aborting" );
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT( sforest );
+
+
+   // create StructuredBlockForest (encapsulates a newly created BlockForest)
+   shared_ptr< StructuredBlockForest > sbf =
+         make_shared< StructuredBlockForest >( make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, keepGlobalBlockInformation ),
+                                               blockSizeInCells[0], blockSizeInCells[1], blockSizeInCells[2]);
+   sbf->createCellBoundingBoxes();
+
+   return sbf;
+}
+
+/////////////////////////////////////
+// BOUNDARY HANDLING CUSTOMIZATION //
+/////////////////////////////////////
+class MyBoundaryHandling : public blockforest::AlwaysInitializeBlockDataHandling< BoundaryHandling_T >
+{
+public:
+   MyBoundaryHandling( const weak_ptr< StructuredBlockStorage > & blocks,
+                       const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID,
+                       const BlockDataID & particleFieldID, const shared_ptr<ParticleAccessor_T>& ac ) :
+         blocks_( blocks ), flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), particleFieldID_( particleFieldID ), ac_( ac )
+   {}
+
+   BoundaryHandling_T * initialize( IBlock * const block ) override;
+
+private:
+
+   weak_ptr< StructuredBlockStorage > blocks_;
+
+   const BlockDataID flagFieldID_;
+   const BlockDataID pdfFieldID_;
+   const BlockDataID particleFieldID_;
+   shared_ptr<ParticleAccessor_T> ac_;
+
+}; // class MyBoundaryHandling
+
+BoundaryHandling_T * MyBoundaryHandling::initialize( IBlock * const block )
+{
+   WALBERLA_ASSERT_NOT_NULLPTR( block );
+
+   auto * flagField = block->getData< FlagField_T >( flagFieldID_ );
+   auto *  pdfField = block->getData< PdfField_T > ( pdfFieldID_ );
+   auto* particleField = block->getData<lbm_mesapd_coupling::ParticleField_T>(particleFieldID_);
+
+   const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag );
+
+   auto blocksPtr = blocks_.lock();
+   WALBERLA_CHECK_NOT_NULLPTR( blocksPtr );
+
+   BoundaryHandling_T * handling = new BoundaryHandling_T( "moving obstacle boundary handling", flagField, fluid,
+                                                           NoSlip_T( "NoSlip", NoSlip_Flag, pdfField ),
+                                                           MO_T( "MO", MO_Flag, pdfField, flagField, particleField, ac_, fluid, *blocksPtr, *block ),
+                                                           BoundaryHandling_T::Mode::ENTIRE_FIELD_TRAVERSAL);
+
+   handling->fillWithDomain( FieldGhostLayers );
+
+   return handling;
+}
+
+void clearBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID ) {
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      BoundaryHandling_T * boundaryHandling = blockIt->getData<BoundaryHandling_T>(boundaryHandlingID);
+      boundaryHandling->clear( FieldGhostLayers );
+   }
+}
+
+void recreateBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID ) {
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      BoundaryHandling_T * boundaryHandling = blockIt->getData<BoundaryHandling_T>(boundaryHandlingID);
+      boundaryHandling->fillWithDomain( FieldGhostLayers );
+   }
+}
+
+void clearParticleField( BlockForest & forest, const BlockDataID & particleFieldID, const ParticleAccessor_T & accessor ) {
+   for (auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt) {
+      ParticleField_T *particleField = blockIt->getData<ParticleField_T>(particleFieldID);
+      particleField->setWithGhostLayer(accessor.getInvalidUid());
+   }
+}
+//*******************************************************************************************************************
+
+void evaluateTimers(WcTimingPool & timingPool,
+                    const std::vector<std::vector<std::string> > & timerKeys,
+                    std::vector<real_t> & timings )
+{
+
+   for (auto & timingsIt : timings)
+   {
+      timingsIt = real_t(0);
+   }
+
+   timingPool.unifyRegisteredTimersAcrossProcesses();
+
+   for (auto i = uint_t(0); i < timerKeys.size(); ++i )
+   {
+      auto keys = timerKeys[i];
+      for (const auto &timerName : keys)
+      {
+         if(timingPool.timerExists(timerName))
+         {
+            timings[i] += real_c(timingPool[timerName].total());
+         }
+      }
+
+   }
+}
+
+uint_t evaluateEdgeCut(BlockForest & forest)
+{
+
+   //note: only works for edges in uniform grids
+
+   auto edgecut = uint_t(0); // = edge weights between processes
+
+   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
+   {
+      auto * block = static_cast<blockforest::Block*> (&(*blockIt));
+
+      real_t blockVolume = block->getAABB().volume();
+      real_t approximateEdgeLength = std::cbrt( blockVolume );
+
+      uint_t faceNeighborWeight = uint_c(approximateEdgeLength * approximateEdgeLength ); //common face
+      uint_t edgeNeighborWeight = uint_c(approximateEdgeLength); //common edge
+      uint_t cornerNeighborWeight = uint_c( 1 ); //common corner
+
+
+      for( const uint_t idx : blockforest::getFaceNeighborhoodSectionIndices() )
+      {
+         for (auto nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb)
+         {
+            if( block->neighborExistsRemotely(idx,nb) ) edgecut += faceNeighborWeight;
+         }
+      }
+
+      for( const uint_t idx : blockforest::getEdgeNeighborhoodSectionIndices() )
+      {
+         for (auto nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb)
+         {
+            if( block->neighborExistsRemotely(idx,nb) ) edgecut += edgeNeighborWeight;
+         }
+      }
+
+      for( const uint_t idx : blockforest::getCornerNeighborhoodSectionIndices() )
+      {
+         for (auto nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb)
+         {
+            if( block->neighborExistsRemotely(idx,nb) ) edgecut += cornerNeighborWeight;
+         }
+      }
+   }
+   return edgecut;
+}
+
+
+void evaluateTotalSimulationTimePassed(WcTimingPool & timeloopTimingPool, const std::string & simulationString, const std::string & loadbalancingString,
+                                       double & totalSimTime, double & totalLBTime)
+{
+   shared_ptr< WcTimingPool> reduced = timeloopTimingPool.getReduced(timing::REDUCE_TOTAL, 0);
+
+   double totalTime = 0.0;
+   WALBERLA_ROOT_SECTION(){
+      totalTime = (*reduced)[simulationString].total();
+   }
+   totalSimTime = totalTime;
+
+   double lbTime = 0.0;
+   WALBERLA_ROOT_SECTION(){
+      lbTime = (*reduced)[loadbalancingString].total();
+   }
+   totalLBTime = lbTime;
+}
+
+void createPlane( const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss,
+                  const Vector3<real_t> position, const Vector3<real_t> normal)
+{
+   mesa_pd::data::Particle&& p0 = *ps->create(true);
+   p0.setPosition(position);
+   p0.setShapeID(ss->create<mesa_pd::data::HalfSpace>( normal.getNormalized() ));
+   p0.setOwner(mpi::MPIManager::instance()->rank());
+   p0.setType(0);
+   mesa_pd::data::particle_flags::set(p0.getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+   mesa_pd::data::particle_flags::set(p0.getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+   WALBERLA_LOG_INFO_ON_ROOT("Created plane at position " << position << " with normal " << normal.getNormalized  ());
+}
+
+void createBasicPlaneSetup(const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss, const math::AABB & simulationDomain)
+{
+   createPlane(ps, ss, simulationDomain.minCorner(), Vector3<real_t>(0,0, 1));
+   //createPlane(ps, ss, Vector3<real_t>(simulationDomain.xMax() * 0.5, simulationDomain.yMax() * 0.5, simulationDomain.zMin() + std::max(simulationDomain.xMax(), simulationDomain.yMax()) * 0.5 ), Vector3<real_t>(0,1, 1));
+   createPlane(ps, ss, simulationDomain.maxCorner(), Vector3<real_t>(0,0,-1));
+}
+
+void addBoxPlaneSetup(const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss, const math::AABB & simulationDomain)
+{
+   // add bounding planes (four horizontal directions)
+   createPlane(ps, ss, simulationDomain.minCorner(), Vector3<real_t>( 1, 0,0));
+   createPlane(ps, ss, simulationDomain.maxCorner(), Vector3<real_t>(-1, 0,0));
+   createPlane(ps, ss, simulationDomain.minCorner(), Vector3<real_t>( 0, 1,0));
+   createPlane(ps, ss, simulationDomain.maxCorner(), Vector3<real_t>( 0,-1,0));
+}
+
+void addHopperPlaneSetup(const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss, const math::AABB & simulationDomain,
+                         real_t hopperRelHeight, real_t hopperRelOpening)
+{
+   //hopper planes
+   real_t xMax = simulationDomain.xMax();
+   real_t yMax = simulationDomain.yMax();
+   real_t zMax = simulationDomain.zMax();
+   Vector3<real_t> p1(0,0,hopperRelHeight*zMax);
+   Vector3<real_t> n1(p1[2],0,hopperRelOpening*xMax-p1[0]);
+   Vector3<real_t> n2(0,p1[2],hopperRelOpening*yMax-p1[0]);
+   createPlane(ps, ss, p1, n1);
+   createPlane(ps, ss, p1, n2);
+
+   Vector3<real_t> p2(xMax,yMax,hopperRelHeight*zMax);
+   Vector3<real_t> n3(-p2[2],0,-((real_t(1)-hopperRelOpening)*xMax-p2[0]));
+   Vector3<real_t> n4(0,-p2[2],-((real_t(1)-hopperRelOpening)*yMax-p2[1]));
+   createPlane(ps, ss, p2, n3);
+   createPlane(ps, ss, p2, n4);
+}
+
+void evaluateParticleSimulation(const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss,
+                                uint_t numParticles, uint_t numGlobalParticles)
+{
+   auto numShapes = ss->shapes.size();
+   uint_t numLocalParticles = 0;
+   uint_t numGhostParticles = 0;
+   uint_t numGlobalParticlesOfRank = 0;
+   for(auto p = ps->begin(); p != ps->end(); ++p)
+   {
+      using namespace walberla::mesa_pd::data::particle_flags;
+      if (isSet(p->getFlags(), GHOST))
+      {
+         ++numGhostParticles;
+      } else if (isSet(p->getFlags(), GLOBAL))
+      {
+         ++numGlobalParticlesOfRank;
+      } else
+      {
+         ++numLocalParticles;
+      }
+      if(p->getShapeID() >= numShapes)
+      {
+         WALBERLA_LOG_INFO("Found invalid shape id " << *p);
+      }
+   }
+   //WALBERLA_LOG_INFO(numShapes << " " << numLocalParticles << " " << numGhostParticles);
+
+   if(numGlobalParticlesOfRank != numGlobalParticles)
+   {
+      WALBERLA_LOG_INFO("Number of global particles has changed to " << numGlobalParticlesOfRank);
+   }
+
+   mpi::reduceInplace(numLocalParticles, mpi::SUM);
+   if(numLocalParticles != numParticles)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Number of particles has changed to " << numLocalParticles);
+   }
+}
+
+bool isnan(Vector3<real_t> vec)
+{
+   return std::isnan(vec[0]) || std::isnan(vec[1]) || std::isnan(vec[2]);
+}
+
+void checkParticleProperties(const shared_ptr<mesa_pd::data::ParticleStorage> & ps)
+{
+   for(auto p = ps->begin(); p != ps->end(); ++p)
+   {
+      if(isnan(p->getHydrodynamicForce())) WALBERLA_LOG_INFO("Found nan in hyd Force " << *p);
+      if(isnan(p->getOldHydrodynamicForce())) WALBERLA_LOG_INFO("Found nan in old hyd Force " << *p);
+      if(isnan(p->getForce())) WALBERLA_LOG_INFO("Found nan in Force " << *p);
+      if(isnan(p->getOldForce())) WALBERLA_LOG_INFO("Found nan in Force " << *p);
+   }
+}
+
+template< typename Probe_T>
+void checkMapping(const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID pdfFieldID,
+                  const BlockDataID boundaryHandlingID, Probe_T & probe )
+{
+
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto * pdfField = blockIt->getData< PdfField_T >( pdfFieldID );
+      auto * boundaryHandling = blockIt->getData< BoundaryHandling_T >( boundaryHandlingID );
+
+      WALBERLA_FOR_ALL_CELLS_XYZ(pdfField,
+          if (boundaryHandling->isDomain(x, y, z))
+          {
+             uint_t f = uint_t(0);
+             //for( uint_t f = uint_t(0); f < PdfField_T::F_SIZE; ++f )
+             //{
+                if( !walberla::field::internal::stabilityCheckerIsFinite( pdfField->get( x, y, z, cell_idx_c(f) ) ) )
+                {
+
+                   Vector3< real_t > center;
+                   blocks->getBlockLocalCellCenter( *blockIt, Cell(x,y,z), center );
+
+                   Cell gCell(x,y,z);
+                   blocks->transformBlockLocalToGlobalCell( gCell, *blockIt);
+
+                   WALBERLA_LOG_INFO("Instability found in block local cell( " << x << ", " << y << ", " << z << " ) at index " << f
+                                         << " = global cell ( " << gCell.x() << ", " << gCell.y() << ", " << gCell.z() << " ) with cell center ( " << center[0] << ", " << center[1] << ", " << center[2] << " )");
+
+                   probe.setPosition(center);
+                   real_t rho;
+                   Vector3<real_t> velocity;
+                   probe(rho, velocity);
+
+                }
+             //}
+          }
+          );
+
+   }
+}
+
+//*******************************************************************************************************************
+/*!\brief Simulation of settling particles inside a rectangular column filled with viscous fluid
+ *
+ * This application is used in the paper
+ *  Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", 2019, Computation
+ * in Section 4 to apply the load estimator and to evaluate different load distribution strategies.
+ * It has here been adapted to the new mesapd and its coupling.
+ * More infos can be found in the PhD thesis by Christoph Rettinger.
+ *
+ * It, however, features several different command line arguments that can be used to tweak the simulation.
+ * The setup can be horizontally period, a box or a hopper geometry (configurable, as in the paper).
+ * The size, resolution and used blocks for the domain partitioning can be changed.
+ * Initially, all particles are pushed upwards to obtain a dense packing at the top plane.
+ *
+ * Most importantly, the load balancing can be modified:
+ *  - load estimation strategies:
+ *    - pure LBM = number of cells per block = constant workload per block
+ *    - coupling based load estimator = use fitted function from Sec. 3 of paper
+ *  - load distribution strategies:
+ *    - space-filling curves: Hilbert and Morton
+ *    - ParMETIS (and several algorithms and parameters)
+ *    - diffusive (and options)
+ *  - load balancing frequency
+ */
+//*******************************************************************************************************************
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+   MPIManager::instance()->useWorldComm();
+
+   ///////////////////
+   // Customization //
+   ///////////////////
+
+   // simulation control
+   bool shortRun = false;
+   bool funcTest = false;
+   bool fileIO = true;
+   bool checkSimulation = false;
+   uint_t vtkWriteFreqDD = 0; //domain decomposition
+   uint_t vtkWriteFreqPa = 0; //particles
+   uint_t vtkWriteFreqFl = 0; //fluid
+   uint_t vtkWriteFreq = 0; //general
+   uint_t vtkWriteFreqInit = 0; //initial (particle-only) simulation
+   std::string baseFolder = "vtk_out_WorkloadDistribution"; // folder for vtk and file output
+   bool useProgressLogging = false;
+
+   // physical setup
+   auto GalileoNumber = real_t(50);
+   auto densityRatio = real_t(1.5);
+   auto diameter = real_t(15);
+   auto solidVolumeFraction = real_t(0.1);
+   auto blockSize = uint_t(32);
+   auto XBlocks = uint_t(12);
+   auto YBlocks = uint_t(12);
+   auto ZBlocks = uint_t(16);
+   bool useBox = false;
+   bool useHopper = false;
+   auto hopperRelHeight = real_t(0.5); // for hopper setup
+   auto hopperRelOpening = real_t(0.3); // for hopper setup
+
+   auto timesteps = uint_t(80000);
+
+   //numerical parameters
+   auto loadBalancingCheckFrequency = uint_t(100);
+   auto numRPDSubCycles = uint_t(10);
+   bool useBlockForestSync = false;
+
+   // load balancing
+   std::string loadEvaluationStrategy = "LBM"; //LBM, Fit
+   std::string loadDistributionStrategy = "Hilbert"; //Morton, Hilbert, ParMetis, Diffusive
+   real_t blockBaseWeight = real_t(1);
+
+   auto parMetis_ipc2redist = real_t(1000);
+   auto parMetisTolerance = real_t(-1);
+   std::string parMetisAlgorithmString = "ADAPTIVE_REPART";
+
+   auto diffusionFlowIterations = uint_t(15);
+   auto diffusionMaxIterations = uint_t(20);
+
+   bool useNoSlipForPlanes = false;
+
+
+   for( int i = 1; i < argc; ++i )
+   {
+      if( std::strcmp( argv[i], "--shortRun" )                 == 0 ) { shortRun = true; continue; }
+      if( std::strcmp( argv[i], "--funcTest" )                 == 0 ) { funcTest = true; continue; }
+      if( std::strcmp( argv[i], "--fileIO" )                   == 0 ) { fileIO = true; continue; }
+      if( std::strcmp( argv[i], "--useProgressLogging" )       == 0 ) { useProgressLogging = true; continue; }
+      if( std::strcmp( argv[i], "--checkSimulation" )          == 0 ) { checkSimulation = true; continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqDD" )           == 0 ) { vtkWriteFreqDD = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqPa" )           == 0 ) { vtkWriteFreqPa = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqFl" )           == 0 ) { vtkWriteFreqFl = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreq" )             == 0 ) { vtkWriteFreq = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--vtkWriteFreqInit" )         == 0 ) { vtkWriteFreqInit = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--baseFolder" )               == 0 ) { baseFolder = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--densityRatio" )             == 0 ) { densityRatio = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--Ga" )                       == 0 ) { GalileoNumber = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--diameter" )                 == 0 ) { diameter = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--blockSize" )                == 0 ) { blockSize = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--XBlocks" )                  == 0 ) { XBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--YBlocks" )                  == 0 ) { YBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--ZBlocks" )                  == 0 ) { ZBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--useBox" )                   == 0 ) { useBox = true; continue; }
+      if( std::strcmp( argv[i], "--useHopper" )                == 0 ) { useHopper = true; continue; }
+      if( std::strcmp( argv[i], "--hopperHeight" )             == 0 ) { hopperRelHeight = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--hopperOpening" )            == 0 ) { hopperRelOpening = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--timesteps" )                == 0 ) { timesteps = uint_c(std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--numRPDSubCycles" )          == 0 ) { numRPDSubCycles = uint_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--useBlockForestSync" )       == 0 ) { useBlockForestSync = true; continue; }
+      if( std::strcmp( argv[i], "--useNoSlipForPlanes" )       == 0 ) { useNoSlipForPlanes = true; continue; }
+      if( std::strcmp( argv[i], "--blockBaseWeight" )          == 0 ) { blockBaseWeight = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--loadBalancingCheckFrequency" ) == 0 ) { loadBalancingCheckFrequency = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--loadEvaluationStrategy" )   == 0 ) { loadEvaluationStrategy = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--loadDistributionStrategy" ) == 0 ) { loadDistributionStrategy = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--ipc2redist" )               == 0 ) { parMetis_ipc2redist = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--parMetisTolerance" )        == 0 ) { parMetisTolerance = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--parMetisAlgorithm" )        == 0 ) { parMetisAlgorithmString = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--diffusionFlowIterations" )  == 0 ) { diffusionFlowIterations = uint_c(std::atof(argv[++i])); continue; }
+      if( std::strcmp( argv[i], "--diffusionMaxIterations" )   == 0 ) { diffusionMaxIterations = uint_c(std::atof(argv[++i])); continue; }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   if( fileIO )
+   {
+      WALBERLA_ROOT_SECTION(){
+         // create base directory if it does not yet exist
+         filesystem::path tpath( baseFolder );
+         if( !filesystem::exists( tpath ) )
+            filesystem::create_directory( tpath );
+      }
+      WALBERLA_MPI_BARRIER();
+   }
+
+   if(useProgressLogging) logging::Logging::instance()->includeLoggingToFile(baseFolder + "/progress_logging.txt");
+
+   if( loadEvaluationStrategy != "LBM" && loadEvaluationStrategy != "Fit" && loadEvaluationStrategy != "FitMulti")
+   {
+      WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+   }
+
+   if( vtkWriteFreq != 0 )
+   {
+      vtkWriteFreqDD = vtkWriteFreq;
+      vtkWriteFreqPa = vtkWriteFreq;
+      vtkWriteFreqFl = vtkWriteFreq;
+   }
+
+   if( diameter > real_c(blockSize) )
+   {
+      WALBERLA_LOG_WARNING("Particle Synchronization might not work since bodies are large compared to block size!");
+   }
+
+   if( useHopper )
+   {
+      WALBERLA_CHECK(hopperRelHeight >= real_t(0) && hopperRelHeight <= real_t(1), "Invalid relative hopper height of " << hopperRelHeight);
+      WALBERLA_CHECK(hopperRelOpening >= real_t(0) && hopperRelOpening <= real_t(1), "Invalid relative hopper opening of " << hopperRelOpening);
+   }
+
+
+   //////////////////////////
+   // NUMERICAL PARAMETERS //
+   //////////////////////////
+
+   const Vector3<uint_t> domainSize( XBlocks * blockSize, YBlocks * blockSize, ZBlocks * blockSize );
+   const auto domainVolume = real_t(domainSize[0] * domainSize[1] * domainSize[2]);
+   const real_t sphereVolume = math::pi / real_t(6) * diameter * diameter * diameter;
+   const uint_t numberOfSediments = uint_c(std::ceil(solidVolumeFraction * domainVolume / sphereVolume));
+
+   real_t expectedSedimentVolumeFraction = (useBox||useHopper) ? real_t(0.45) : real_t(0.52);
+   const real_t expectedSedimentedVolume = real_t(1)/expectedSedimentVolumeFraction * real_c(numberOfSediments) * sphereVolume;
+   const real_t expectedSedimentedHeight = std::max(diameter, expectedSedimentedVolume / real_c(domainSize[0] * domainSize[1]));
+
+   const auto uRef = real_t(0.02);
+   const real_t xRef = diameter;
+   const real_t tRef = xRef / uRef;
+
+   const real_t gravitationalAcceleration = uRef * uRef / ( (densityRatio-real_t(1)) * diameter );
+   const real_t viscosity = uRef * diameter / GalileoNumber;
+   const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
+   const real_t tau = real_t(1) / omega;
+
+   const auto dx = real_t(1);
+   const real_t overlap = real_t( 1.5 ) * dx;
+
+   timesteps = funcTest ? 1 : ( shortRun ? uint_t(100) : timesteps );
+
+   WALBERLA_LOG_INFO_ON_ROOT("Setup (in simulation, i.e. lattice, units):");
+   WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
+   WALBERLA_LOG_INFO_ON_ROOT(" - sediment diameter = " << diameter );
+   WALBERLA_LOG_INFO_ON_ROOT(" - Galileo number = " << GalileoNumber );
+   WALBERLA_LOG_INFO_ON_ROOT(" - number of sediments: " << numberOfSediments);
+   WALBERLA_LOG_INFO_ON_ROOT(" - densityRatio = " << densityRatio );
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid: relaxation time (tau) = " << tau << ", kin. visc = " << viscosity );
+   WALBERLA_LOG_INFO_ON_ROOT(" - gravitational acceleration = " << gravitationalAcceleration );
+   WALBERLA_LOG_INFO_ON_ROOT(" - reference values: x = " << xRef << ", t = " << tRef << ", vel = " << uRef);
+   WALBERLA_LOG_INFO_ON_ROOT(" - omega: " << omega);
+   if( vtkWriteFreqDD > 0 )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of domain decomposition to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqDD);
+   }
+   if( vtkWriteFreqPa > 0 )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of bodies data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqPa);
+   }
+   if( vtkWriteFreqFl > 0 )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of fluid data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqFl);
+   }
+   if( useBox )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using box setup");
+   }
+   else if ( useHopper )
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using hopper setup");
+   }
+   else
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(" - using horizontally periodic domain");
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - refinement / load balancing check frequency: " << loadBalancingCheckFrequency);
+   WALBERLA_LOG_INFO_ON_ROOT(" - load evaluation strategy: " << loadEvaluationStrategy);
+   WALBERLA_LOG_INFO_ON_ROOT(" - load distribution strategy: " << loadDistributionStrategy);
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   Vector3<uint_t> blockSizeInCells( blockSize );
+
+   AABB simulationDomain( real_t(0), real_t(0), real_t(0), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
+   AABB sedimentDomain( real_t(0), real_t(0), real_c(domainSize[2]) - expectedSedimentedHeight, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
+
+   auto blocks = createBlockStructure( simulationDomain, blockSizeInCells, (useBox||useHopper), loadDistributionStrategy );
+
+   //write initial domain decomposition to file
+   if( vtkWriteFreqDD > 0 )
+   {
+      vtk::writeDomainDecomposition( blocks, "initial_domain_decomposition", baseFolder );
+   }
+
+   /////////
+   // RPD //
+   /////////
+
+   const real_t restitutionCoeff = real_t(0.97);
+   const real_t frictionCoeffStatic = real_t(0.8);
+   const real_t frictionCoeffDynamic = real_t(0.15);
+   const real_t collisionTime = real_t(4) * diameter; // from my paper
+   const real_t poissonsRatio = real_t(0.22);
+   const real_t kappa = real_t(2) * ( real_t(1) - poissonsRatio ) / ( real_t(2) - poissonsRatio ) ;
+
+   auto rpdDomain = std::make_shared<mesa_pd::domain::BlockForestDomain>(blocks->getBlockForestPointer());
+
+   //init data structures
+   auto ps = walberla::make_shared<mesa_pd::data::ParticleStorage>(1);
+   blocks->addBlockData(mesa_pd::domain::createBlockForestDataHandling(ps), "Particle Storage"); // returned ID is not used, but ps has to be known to blockforest
+   auto ss = walberla::make_shared<mesa_pd::data::ShapeStorage>();
+   auto accessor = walberla::make_shared<ParticleAccessor_T >(ps, ss);
+
+   real_t timeStepSizeRPD = real_t(1)/real_t(numRPDSubCycles);
+   mesa_pd::kernel::VelocityVerletPreForceUpdate  vvIntegratorPreForce(timeStepSizeRPD);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeRPD);
+
+   // types: 0 = wall, 1: sphere
+   mesa_pd::kernel::LinearSpringDashpot collisionResponse(2);
+   collisionResponse.setFrictionCoefficientDynamic(0,1,frictionCoeffDynamic);
+   collisionResponse.setFrictionCoefficientDynamic(1,1,frictionCoeffDynamic);
+   collisionResponse.setFrictionCoefficientStatic(0,1,frictionCoeffStatic);
+   collisionResponse.setFrictionCoefficientStatic(1,1,frictionCoeffStatic);
+
+   const real_t particleMass = densityRatio * sphereVolume;
+   const real_t effMass_sphereWall = particleMass;
+   const real_t effMass_sphereSphere = particleMass * particleMass / ( real_t(2) * particleMass );
+   collisionResponse.setStiffnessAndDamping(0,1,restitutionCoeff,collisionTime,kappa,effMass_sphereWall);
+   collisionResponse.setStiffnessAndDamping(1,1,restitutionCoeff,collisionTime,kappa,effMass_sphereSphere);
+
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+   mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+   mesa_pd::kernel::AssocToBlock associateToBlock(blocks->getBlockForestPointer());
+   mesa_pd::mpi::SyncNextNeighborsBlockForest syncNextNeighborBlockForestFunc;
+
+
+   // create bounding planes
+
+   createBasicPlaneSetup(ps, ss, simulationDomain);
+   if(useBox || useHopper) addBoxPlaneSetup(ps, ss, simulationDomain);
+   if(useHopper) addHopperPlaneSetup(ps, ss, simulationDomain, hopperRelHeight, hopperRelOpening);
+
+   auto numGlobalParticles = ps->size();
+   WALBERLA_LOG_INFO_ON_ROOT("Created " << numGlobalParticles << " global particles");
+
+   // add the sediments
+
+   WALBERLA_LOG_INFO_ON_ROOT("Starting creation of sediments");
+
+   AABB sedimentGenerationDomain( real_t(0), real_t(0), real_t(0.5)*real_c(domainSize[2]), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
+
+   auto xParticle = real_t(0);
+   auto yParticle = real_t(0);
+   auto zParticle = real_t(0);
+
+   auto rank = mpi::MPIManager::instance()->rank();
+
+   auto sphereShape = ss->create<mesa_pd::data::Sphere>( diameter * real_t(0.5) );
+   ss->shapes[sphereShape]->updateMassAndInertia(densityRatio);
+
+   std::mt19937 randomGenerator (static_cast<unsigned int>(2610)); // fixed seed: quasi-random and reproducable
+
+   for( uint_t nSed = 0; nSed < numberOfSediments; ++nSed )
+   {
+
+      WALBERLA_ROOT_SECTION()
+      {
+         xParticle = math::realRandom<real_t>(sedimentGenerationDomain.xMin(), sedimentGenerationDomain.xMax(),randomGenerator);
+         yParticle = math::realRandom<real_t>(sedimentGenerationDomain.yMin(), sedimentGenerationDomain.yMax(),randomGenerator);
+         zParticle = math::realRandom<real_t>(sedimentGenerationDomain.zMin(), sedimentGenerationDomain.zMax(),randomGenerator);
+      }
+
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::broadcastObject( xParticle );
+         mpi::broadcastObject( yParticle );
+         mpi::broadcastObject( zParticle );
+      }
+
+      auto position = Vector3<real_t>( xParticle, yParticle, zParticle );
+
+      if (!rpdDomain->isContainedInProcessSubdomain(uint_c(rank), position)) continue;
+      auto p = ps->create();
+      p->setPosition(position);
+      p->setInteractionRadius(diameter * real_t(0.5));
+      p->setShapeID(sphereShape);
+      p->setType(1);
+      p->setOwner(rank);
+
+   }
+
+   if(useBlockForestSync)
+   {
+      ps->forEachParticle(false, mesa_pd::kernel::SelectLocal(), *accessor, associateToBlock, *accessor);
+      syncNextNeighborBlockForestFunc(*ps, blocks->getBlockForestPointer(), rpdDomain, overlap);
+
+   } else
+   {
+      syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+   }
+
+
+   // Carry out particle-only simulation to obtain dense packing at top plane
+   // consists of three phases:
+   // 1: carefully resolve initial overlaps due to random generation, no gravity
+   // 2: apply low gravity in positive z-direction to create packing until convergence or targeted packing height reached
+   // 3: carry out a few time steps with gravity in negative direction to relay the system towards the real setup
+
+   const bool useOpenMP = false;
+   const real_t dt_RPD_Init = real_t(1);
+   const auto particleSimStepsPhase1 = uint_t(1000);
+   const auto maxParticleSimStepsPhase2 = (shortRun) ? uint_t(10) : uint_t(200000);
+   const auto particleSimStepsPhase3 = uint_t(std::sqrt(real_t(2)/std::fabs(gravitationalAcceleration)));
+
+   uint_t maxInitialParticleSimSteps = particleSimStepsPhase1 + maxParticleSimStepsPhase2 + particleSimStepsPhase3;
+
+   auto particleVtkOutput = make_shared<mesa_pd::vtk::ParticleVtkOutput>(ps);
+   particleVtkOutput->addOutput<mesa_pd::data::SelectParticleLinearVelocity>("velocity");
+   particleVtkOutput->addOutput<mesa_pd::data::SelectParticleOwner>("owner");
+   particleVtkOutput->setParticleSelector( [sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {return !mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST) && pIt->getShapeID() == sphereShape;} ); //limit output to local sphere
+   auto particleVtkWriterInit = vtk::createVTKOutput_PointData(particleVtkOutput, "Particles_init", 1, baseFolder, "simulation_step");
+
+   real_t gravitationalAccelerationGeneration = gravitationalAcceleration;
+   auto oldMinParticlePosition = real_t(0);
+   real_t phase2ConvergenceLimit = std::fabs(gravitationalAccelerationGeneration);
+   real_t heightConvergenceThreshold = sedimentDomain.zMin();
+
+   uint_t beginOfPhase3SimStep = uint_t(0);
+
+   uint_t currentPhase = 1;
+
+   for(auto pet = uint_t(0); pet <= maxInitialParticleSimSteps; ++pet )
+   {
+
+      real_t maxPenetrationDepth = 0;
+      ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                  [&collisionResponse, &rpdDomain, &maxPenetrationDepth, dt_RPD_Init]
+                                        (const size_t idx1, const size_t idx2, auto& ac)
+                                  {
+                                     mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                     mesa_pd::kernel::DoubleCast double_cast;
+                                     mesa_pd::mpi::ContactFilter contact_filter;
+                                     if (double_cast(idx1, idx2, ac, acd, ac ))
+                                     {
+                                        if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                        {
+                                           maxPenetrationDepth = std::max(maxPenetrationDepth, std::abs(acd.getPenetrationDepth()));
+                                           collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(),
+                                                             acd.getContactNormal(), acd.getPenetrationDepth(), dt_RPD_Init);
+                                        }
+                                     }
+                                  },
+                                  *accessor );
+
+      reduceAndSwapContactHistory(*ps);
+
+      mpi::allReduceInplace(maxPenetrationDepth, mpi::MAX);
+
+      reduceProperty.operator()<mesa_pd::ForceTorqueNotification>(*ps);
+
+      ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, mesa_pd::kernel::ExplicitEuler(dt_RPD_Init), *accessor);
+      if(useBlockForestSync)
+      {
+         syncNextNeighborBlockForestFunc(*ps, blocks->getBlockForestPointer(), rpdDomain, overlap);
+         ps->forEachParticle(false, mesa_pd::kernel::SelectLocal(), *accessor, associateToBlock, *accessor);
+
+      } else
+      {
+         syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+      }
+
+      if( vtkWriteFreqInit > uint_t(0) && pet % vtkWriteFreqInit == uint_t(0) )
+      {
+         particleVtkWriterInit->write();
+      }
+
+
+      if(currentPhase == 1)
+      {
+         // damp velocites to avoid too large ones
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                              [](const size_t idx, ParticleAccessor_T& ac){
+                                 ac.setLinearVelocity(idx, ac.getLinearVelocity(idx) * real_t(0.5));
+                                 ac.setAngularVelocity(idx, ac.getAngularVelocity(idx) * real_t(0.5));
+                              }, *accessor);
+
+         if(pet > particleSimStepsPhase1)
+         {
+            WALBERLA_LOG_INFO_ON_ROOT("Starting phase 2 of initial particle simulation, with height threshold = " << heightConvergenceThreshold);
+            currentPhase = 2;
+
+            ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                                 [](const size_t idx, ParticleAccessor_T& ac){
+                                    ac.setLinearVelocity(idx, Vector3<real_t>(0.0));
+                                    ac.setAngularVelocity(idx, Vector3<real_t>(0.0));
+                                 }, *accessor);
+         }
+      } else if(currentPhase == 2)
+      {
+
+         Vector3<real_t> gravitationalForce( real_t(0), real_t(0), (densityRatio - real_t(1)) * gravitationalAccelerationGeneration * sphereVolume );
+         lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor );
+
+
+         real_t minParticlePosition = sedimentGenerationDomain.zMax();
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                              [&minParticlePosition](const size_t idx, ParticleAccessor_T& ac){
+                                 minParticlePosition = std::min(ac.getPosition(idx)[2], minParticlePosition);
+                              }, *accessor);
+
+         WALBERLA_MPI_SECTION()
+         {
+            mpi::allReduceInplace(minParticlePosition, mpi::MIN);
+         }
+
+         WALBERLA_ROOT_SECTION()
+         {
+            if( pet % 100 == 0)
+            {
+               WALBERLA_LOG_INFO("[" << pet << "] Min position of all particles = " << minParticlePosition << " with goal height " << heightConvergenceThreshold);
+            }
+         }
+
+         if( minParticlePosition > heightConvergenceThreshold ) currentPhase = 3;
+
+         if( pet % 500 == 0)
+         {
+            if( std::fabs(minParticlePosition - oldMinParticlePosition) / minParticlePosition  < phase2ConvergenceLimit ) currentPhase = 3;
+            oldMinParticlePosition = minParticlePosition;
+         }
+
+         if( currentPhase == 3)
+         {
+            WALBERLA_LOG_INFO_ON_ROOT("Starting phase 3 of initial particle simulation");
+            beginOfPhase3SimStep = pet;
+         }
+
+      } else if(currentPhase == 3)
+      {
+         Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAccelerationGeneration * sphereVolume );
+         lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor );
+
+         if(pet - beginOfPhase3SimStep > particleSimStepsPhase3)
+         {
+            Vector3<real_t> initialParticleVelocity(real_t(0));
+            WALBERLA_LOG_INFO_ON_ROOT("Setting initial velocity " << initialParticleVelocity << " of all particles");
+            // reset velocities to avoid too large ones
+            ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                                 [initialParticleVelocity](const size_t idx, ParticleAccessor_T& ac){
+                                    ac.setLinearVelocity(idx, initialParticleVelocity);
+                                    ac.setAngularVelocity(idx, Vector3<real_t>(0));
+                                 }, *accessor);
+            break;
+         }
+      }
+   }
+   WALBERLA_LOG_INFO_ON_ROOT("Sediment layer creation done!");
+
+   ///////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // create the lattice model
+   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth ) );
+
+   // add PDF field
+   BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel,
+                                                                         Vector3< real_t >( real_t(0) ), real_t(1),
+                                                                         FieldGhostLayers, field::zyxf );
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers );
+
+   // add particle field
+   BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers );
+
+   // add boundary handling & initialize outer domain boundaries
+   BlockDataID boundaryHandlingID = blocks->addBlockData( make_shared< MyBoundaryHandling >( blocks, flagFieldID, pdfFieldID, particleFieldID, accessor ), "boundary handling" );
+
+   Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
+   lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+   lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque;
+   lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel(viscosity, [](real_t r){return (real_t(0.001 + real_t(0.00007)*r))*r;});
+   lbm_mesapd_coupling::ParticleMappingKernel<BoundaryHandling_T> particleMappingKernel(blocks, boundaryHandlingID);
+   lbm_mesapd_coupling::MovingParticleMappingKernel<BoundaryHandling_T> movingParticleMappingKernel(blocks, boundaryHandlingID, particleFieldID);
+   lbm_mesapd_coupling::ParticleMappingKernel<BoundaryHandling_T> fixedParticleMappingKernel(blocks, boundaryHandlingID);
+   lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel initializeHydrodynamicForceTorqueForAveragingKernel;
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - Lubrication correction:");
+   WALBERLA_LOG_INFO_ON_ROOT("   - normal cut off distance = " << lubricationCorrectionKernel.getNormalCutOffDistance());
+   WALBERLA_LOG_INFO_ON_ROOT("   - tangential translational cut off distance = " << lubricationCorrectionKernel.getTangentialTranslationalCutOffDistance());
+   WALBERLA_LOG_INFO_ON_ROOT("   - tangential rotational cut off distance = " << lubricationCorrectionKernel.getTangentialRotationalCutOffDistance());
+   const real_t maximumLubricationCutOffDistance = std::max(lubricationCorrectionKernel.getNormalCutOffDistance(), std::max(lubricationCorrectionKernel.getTangentialRotationalCutOffDistance(), lubricationCorrectionKernel.getTangentialTranslationalCutOffDistance()));
+
+   std::function<void(void)> particleMappingCall = [ps, accessor, &movingParticleMappingKernel, &fixedParticleMappingKernel, useNoSlipForPlanes]()
+   {
+      // map planes into the LBM simulation -> act as no-slip boundaries
+      if(useNoSlipForPlanes)
+      {
+         ps->forEachParticle(false, lbm_mesapd_coupling::GlobalParticlesSelector(), *accessor, fixedParticleMappingKernel, *accessor, NoSlip_Flag);
+      }
+      else {
+         ps->forEachParticle(false, lbm_mesapd_coupling::GlobalParticlesSelector(), *accessor, movingParticleMappingKernel, *accessor, MO_Flag);
+      }
+      // map particles into the LBM simulation
+      ps->forEachParticle(false, lbm_mesapd_coupling::RegularParticlesSelector(), *accessor, movingParticleMappingKernel, *accessor, MO_Flag);
+   };
+
+   particleMappingCall();
+
+
+   //////////////////////////
+   // LOAD BALANCING UTILs //
+   //////////////////////////
+
+   auto & blockforest = blocks->getBlockForest();
+   blockforest.recalculateBlockLevelsInRefresh( false ); // = only load balancing, no refinement checking
+   blockforest.alwaysRebalanceInRefresh( true ); //load balancing every time refresh is triggered
+   blockforest.reevaluateMinTargetLevelsAfterForcedRefinement( false );
+   blockforest.allowRefreshChangingDepth( false );
+   blockforest.allowMultipleRefreshCycles( false ); // otherwise info collections are invalid
+
+   shared_ptr<lbm_mesapd_coupling::amr::InfoCollection> couplingInfoCollection = walberla::make_shared<lbm_mesapd_coupling::amr::InfoCollection>();
+   uint_t numberOfProcesses = uint_c(MPIManager::instance()->numProcesses());
+
+   if( loadDistributionStrategy == "Hilbert" || loadDistributionStrategy == "Morton")
+   {
+
+      bool curveAllGather = true;
+      bool balanceLevelwise = true;
+
+      if( loadDistributionStrategy == "Hilbert")
+      {
+         bool useHilbert = true;
+         blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( useHilbert, curveAllGather, balanceLevelwise ) );
+      }
+      else if (loadDistributionStrategy == "Morton" )
+      {
+         bool useHilbert = false;
+         blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( useHilbert, curveAllGather, balanceLevelwise ) );
+      }
+
+      if( loadEvaluationStrategy == "Fit" )
+      {
+         lbm_mesapd_coupling::amr::WeightEvaluationFunctor weightEvaluationFunctor(couplingInfoCollection, lbm_mesapd_coupling::amr::fittedTotalWeightEvaluationFunction);
+         lbm_mesapd_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(weightEvaluationFunctor);
+         weightAssignmentFunctor.setBlockBaseWeight(blockBaseWeight);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else if( loadEvaluationStrategy == "LBM" )
+      {
+         lbm_mesapd_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(lbm_mesapd_coupling::amr::defaultWeightEvaluationFunction);
+         weightAssignmentFunctor.setBlockBaseWeight(blockBaseWeight);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else
+      {
+         WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+      }
+
+      blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+      blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+
+   }
+   else if( loadDistributionStrategy == "ParMetis")
+   {
+
+#ifndef WALBERLA_BUILD_WITH_PARMETIS
+      WALBERLA_ABORT( "You are trying to use ParMetis functionality but waLBerla is not configured to use it. Set 'WALBERLA_BUILD_WITH_PARMETIS' to 'ON' in your CMake cache to build against an installed version of ParMetis!" );
+#endif
+
+      blockforest::DynamicParMetis::Algorithm parMetisAlgorithm = blockforest::DynamicParMetis::stringToAlgorithm(parMetisAlgorithmString);
+      blockforest::DynamicParMetis::WeightsToUse parMetisWeightsToUse = blockforest::DynamicParMetis::WeightsToUse::PARMETIS_BOTH_WEIGHTS;
+      blockforest::DynamicParMetis::EdgeSource parMetisEdgeSource = blockforest::DynamicParMetis::EdgeSource::PARMETIS_EDGES_FROM_EDGE_WEIGHTS;
+
+      blockforest::DynamicParMetis dynamicParMetis(parMetisAlgorithm, parMetisWeightsToUse, parMetisEdgeSource);
+      dynamicParMetis.setipc2redist(parMetis_ipc2redist);
+
+      auto numberOfBlocks = XBlocks * YBlocks * ZBlocks;
+
+      real_t loadImbalanceTolerance = (parMetisTolerance < real_t(1)) ? std::max(real_t(1.05), real_t(1) + real_t(1) / ( real_c(numberOfBlocks) / real_c(numberOfProcesses) ) ) : parMetisTolerance;
+      dynamicParMetis.setImbalanceTolerance(double(loadImbalanceTolerance), 0);
+
+      WALBERLA_LOG_INFO_ON_ROOT(" - ParMetis configuration: ");
+      WALBERLA_LOG_INFO_ON_ROOT("   - algorithm = " << dynamicParMetis.algorithmToString() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - weights to use = " << dynamicParMetis.weightsToUseToString() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - edge source = " << dynamicParMetis.edgeSourceToString() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - ipc2redist parameter = " << dynamicParMetis.getipc2redist() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - imbalance tolerance = " << dynamicParMetis.getImbalanceTolerance() );
+
+      blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::DynamicParMetisBlockInfoPackUnpack());
+      blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::DynamicParMetisBlockInfoPackUnpack());
+      blockforest.setRefreshPhantomBlockMigrationPreparationFunction( dynamicParMetis );
+
+      if( loadEvaluationStrategy == "Fit" )
+      {
+         lbm_mesapd_coupling::amr::WeightEvaluationFunctor weightEvaluationFunctor(couplingInfoCollection, lbm_mesapd_coupling::amr::fittedTotalWeightEvaluationFunction);
+         lbm_mesapd_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(weightEvaluationFunctor); //attention: special METIS assignment functor!
+         weightAssignmentFunctor.setBlockBaseWeight(blockBaseWeight);
+         real_t weightMultiplicator = real_t(1000); // values from predictor are in range [0-5] which is too coarse when cast to int as done in parmetis
+         weightAssignmentFunctor.setWeightMultiplicator(weightMultiplicator);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else if( loadEvaluationStrategy == "LBM" )
+      {
+         lbm_mesapd_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(lbm_mesapd_coupling::amr::defaultWeightEvaluationFunction);
+         weightAssignmentFunctor.setBlockBaseWeight(blockBaseWeight);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else
+      {
+         WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+      }
+
+   }
+   else if( loadDistributionStrategy == "Diffusive")
+   {
+      using DB_T = blockforest::DynamicDiffusionBalance< blockforest::PODPhantomWeight<real_t> >;
+      DB_T dynamicDiffusion(diffusionMaxIterations, diffusionFlowIterations );
+      dynamicDiffusion.setMode(DB_T::Mode::DIFFUSION_PUSH);
+
+      WALBERLA_LOG_INFO_ON_ROOT(" - Dynamic diffusion configuration: ");
+      WALBERLA_LOG_INFO_ON_ROOT("   - max iterations = " << dynamicDiffusion.getMaxIterations() );
+      WALBERLA_LOG_INFO_ON_ROOT("   - flow iterations = " << dynamicDiffusion.getFlowIterations());
+
+      blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+      blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
+      blockforest.setRefreshPhantomBlockMigrationPreparationFunction( dynamicDiffusion );
+
+      if( loadEvaluationStrategy == "Fit" )
+      {
+         lbm_mesapd_coupling::amr::WeightEvaluationFunctor weightEvaluationFunctor(couplingInfoCollection, lbm_mesapd_coupling::amr::fittedTotalWeightEvaluationFunction);
+         lbm_mesapd_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(weightEvaluationFunctor);
+         weightAssignmentFunctor.setBlockBaseWeight(blockBaseWeight);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else if( loadEvaluationStrategy == "LBM" )
+      {
+         lbm_mesapd_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(lbm_mesapd_coupling::amr::defaultWeightEvaluationFunction);
+         weightAssignmentFunctor.setBlockBaseWeight(blockBaseWeight);
+         blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
+      }
+      else
+      {
+         WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
+      }
+
+   } else
+   {
+      WALBERLA_ABORT("Load distribution strategy \"" << loadDistributionStrategy << "\t not implemented! - Aborting" );
+   }
+
+   lbm_mesapd_coupling::amr::BlockInfo emptyExampleBlock(blockSize*blockSize*blockSize, 0, 0, 0, 0, numRPDSubCycles);
+   WALBERLA_LOG_INFO_ON_ROOT("An example empty block has the weight: " << lbm_mesapd_coupling::amr::fittedTotalWeightEvaluationFunction(emptyExampleBlock));
+
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // create the timeloop
+   SweepTimeloop timeloop( blocks->getBlockStorage(), timesteps );
+
+   timeloop.addFuncBeforeTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );
+
+   if( vtkWriteFreqPa != uint_t(0) ) {
+      auto particleVtkWriter = vtk::createVTKOutput_PointData(particleVtkOutput, "Particles", vtkWriteFreqPa, baseFolder, "simulation_step");
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( particleVtkWriter ), "VTK (sphere data)" );
+   }
+
+   if( vtkWriteFreqFl != uint_t(0) ) {
+
+      // pdf field
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid_field", vtkWriteFreqFl, 0, false, baseFolder);
+
+      field::FlagFieldCellFilter<FlagField_T> fluidFilter(flagFieldID);
+      fluidFilter.addFlag(Fluid_Flag);
+      pdfFieldVTK->addCellInclusionFilter(fluidFilter);
+
+      pdfFieldVTK->addCellDataWriter(
+            make_shared<lbm::VelocityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "VelocityFromPDF"));
+      pdfFieldVTK->addCellDataWriter(
+            make_shared<lbm::DensityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "DensityFromPDF"));
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
+   }
+
+   if( vtkWriteFreqDD != uint_t(0) ) {
+      auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkWriteFreqDD, baseFolder );
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
+   }
+
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+   blockforest::communication::UniformBufferedScheme< Stencil_T > optimizedPDFCommunicationScheme( blocks );
+   optimizedPDFCommunicationScheme.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldID ) ); // optimized sync
+
+   auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
+
+   // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
+   timeloop.add() << BeforeFunction( optimizedPDFCommunicationScheme, "LBM Communication" )
+                  << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
+
+   // streaming & collide
+   timeloop.add() << Sweep( makeSharedSweep(sweep), "Stream&Collide" );
+
+
+   SweepTimeloop timeloopAfterParticles( blocks->getBlockStorage(), timesteps );
+
+   // sweep for updating the particle mapping into the LBM simulation
+   bool conserveMomentum = false;
+   timeloopAfterParticles.add() << Sweep( lbm_mesapd_coupling::makeMovingParticleMapping<PdfField_T,BoundaryHandling_T>(blocks, pdfFieldID, boundaryHandlingID, particleFieldID, accessor, MO_Flag, FormerMO_Flag,
+                                                                                                                        lbm_mesapd_coupling::RegularParticlesSelector(), conserveMomentum), "Particle Mapping" );
+
+   bool recomputeTargetDensity = false;
+   auto gradReconstructor = lbm_mesapd_coupling::makeGradsMomentApproximationReconstructor<BoundaryHandling_T>(blocks, boundaryHandlingID, omega, recomputeTargetDensity,true);
+   blockforest::communication::UniformBufferedScheme< Stencil_T > fullPDFCommunicationScheme( blocks );
+   fullPDFCommunicationScheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField_T > >( pdfFieldID ) ); // full sync
+   timeloopAfterParticles.add() << BeforeFunction( fullPDFCommunicationScheme, "PDF Communication" )
+                                << Sweep( makeSharedSweep(lbm_mesapd_coupling::makePdfReconstructionManager<PdfField_T,BoundaryHandling_T>(blocks, pdfFieldID, boundaryHandlingID, particleFieldID, accessor, FormerMO_Flag, Fluid_Flag,
+                                                                                                                                           gradReconstructor, conserveMomentum) ), "PDF Restore" );
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+
+   uint_t loadEvaluationFrequency = loadBalancingCheckFrequency;
+
+   // file for simulation infos
+   std::string infoFileName( baseFolder + "/simulation_info.txt");
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream file;
+      file.open( infoFileName.c_str(), std::fstream::out | std::fstream::trunc );
+      file << "#i\t t\t tSim\t tLoadBal\t numProcs\t blocks (min/max/sum)\n";
+      file.close();
+   }
+
+   // process local timing measurements and predicted loads
+   std::string processLocalFiles(baseFolder + "/processLocalFiles");
+   WALBERLA_ROOT_SECTION()
+   {
+      filesystem::path tpath( processLocalFiles );
+      if( !filesystem::exists( tpath ) )
+         filesystem::create_directory( tpath );
+   }
+   std::string measurementFileProcessName(processLocalFiles + "/measurements_" + std::to_string(MPIManager::instance()->rank()) + ".txt");
+   {
+      std::ofstream file;
+      file.open( measurementFileProcessName.c_str(), std::fstream::out | std::fstream::trunc );
+      file << "#i\t t\t mTotSim\t mLB\t mLBM\t mBH\t mCoup1\t mCoup2\t mRPD\t cLBM\t cRB\t numBlocks\n";
+      file.close();
+   }
+
+   std::string predictionFileProcessName(processLocalFiles + "/predictions_" + std::to_string(MPIManager::instance()->rank()) + ".txt");
+   {
+      std::ofstream file;
+      file.open( predictionFileProcessName.c_str(), std::fstream::out | std::fstream::trunc );
+      file << "#i\t t\t wlLBM\t wlBH\t wlCoup1\t wlCoup2\t wlRPD\t edgecut\t numBlocks\n";
+      file.close();
+   }
+
+   std::vector< std::vector<std::string> > timerKeys;
+   std::vector<std::string> LBMTimer;
+   LBMTimer.emplace_back("Stream&Collide");
+   timerKeys.push_back(LBMTimer);
+
+   std::vector<std::string> bhTimer;
+   bhTimer.emplace_back("Boundary Handling");
+   timerKeys.push_back(bhTimer);
+
+   std::vector<std::string> couplingTimer1;
+   couplingTimer1.emplace_back("Particle Mapping");
+   std::vector<std::string> couplingTimer2;
+   couplingTimer2.emplace_back("PDF Restore");
+   timerKeys.push_back(couplingTimer1);
+   timerKeys.push_back(couplingTimer2);
+
+   std::vector<std::string> rpdTimer;
+   rpdTimer.emplace_back("RPD Force");
+   rpdTimer.emplace_back("RPD VV1");
+   rpdTimer.emplace_back("RPD VV2");
+   rpdTimer.emplace_back("RPD Lub");
+   rpdTimer.emplace_back("RPD Collision");
+   timerKeys.push_back(rpdTimer);
+
+   std::vector<std::string> LBMCommTimer;
+   LBMCommTimer.emplace_back("LBM Communication");
+   LBMCommTimer.emplace_back("PDF Communication");
+   timerKeys.push_back(LBMCommTimer);
+
+
+   std::vector<std::string> rpdCommTimer;
+   rpdCommTimer.emplace_back("Reduce Force Torque");
+   rpdCommTimer.emplace_back("Reduce Hyd Force Torque");
+   rpdCommTimer.emplace_back("Reduce Contact History");
+   rpdCommTimer.emplace_back("Sync");
+   timerKeys.push_back(rpdCommTimer);
+
+   std::vector<real_t> timings(timerKeys.size());
+
+   double oldmTotSim = 0.0;
+   double oldmLB = 0.0;
+
+   auto measurementFileCounter = uint_t(0);
+   auto predictionFileCounter = uint_t(0);
+
+   std::string loadEvaluationStep("load evaluation");
+   std::string loadBalancingStep("load balancing");
+   std::string simulationStep("simulation");
+
+   WcTimingPool timeloopTiming;
+   WcTimingPool simulationTiming;
+
+   lbm_mesapd_coupling::InspectionProbe<PdfField_T,BoundaryHandling_T,ParticleAccessor_T> probeForNaNs( Vector3<real_t>(0, 0, 0),
+                                                                                                  blocks, pdfFieldID, boundaryHandlingID, particleFieldID, accessor,
+                                                                                                  true, true, "" );
+
+   // time loop
+   for (uint_t i = 0; i < timesteps; ++i )
+   {
+
+      // evaluate measurements (note: reflect simulation behavior BEFORE the evaluation)
+      if( loadEvaluationFrequency > 0 && i % loadEvaluationFrequency == 0 && i > 0 && fileIO)
+      {
+
+         simulationTiming[loadEvaluationStep].start();
+
+         // write process local timing measurements to files (per process, per load balancing step)
+         {
+            auto mTotSim = simulationTiming[simulationStep].total();
+            auto mLB = simulationTiming[loadBalancingStep].total();
+
+            evaluateTimers(timeloopTiming, timerKeys, timings);
+
+            auto & forest = blocks->getBlockForest();
+            uint_t numBlocks = forest.getNumberOfBlocks();
+
+            // write to process local file
+            std::ofstream file;
+            file.open( measurementFileProcessName.c_str(), std::ofstream::app  );
+            file << measurementFileCounter << "\t " << real_c(i) / tRef << "\t"
+                 << mTotSim - oldmTotSim << "\t" << mLB - oldmLB << "\t";
+            for (real_t timing : timings) {
+               file << "\t" << timing;
+            }
+            file << "\t" << numBlocks << "\n";
+            file.close();
+
+            oldmTotSim = mTotSim;
+            oldmLB = mLB;
+            measurementFileCounter++;
+
+            // reset timer to have measurement from evaluation to evaluation point
+            timeloopTiming.clear();
+
+         }
+
+         // evaluate general simulation infos (on root)
+         {
+            double totalTimeToCurrentTimestep(0.0);
+            double totalLBTimeToCurrentTimestep(0.0);
+            evaluateTotalSimulationTimePassed(simulationTiming, simulationStep, loadBalancingStep,
+                                              totalTimeToCurrentTimestep, totalLBTimeToCurrentTimestep);
+            math::DistributedSample numberOfBlocks;
+
+            auto & forest = blocks->getBlockForest();
+            uint_t numBlocks = forest.getNumberOfBlocks();
+            numberOfBlocks.castToRealAndInsert(numBlocks);
+            numberOfBlocks.mpiGatherRoot();
+
+            WALBERLA_ROOT_SECTION()
+            {
+               std::ofstream file;
+               file.open( infoFileName.c_str(), std::ofstream::app  );
+               file << i << "\t " << real_c(i) / tRef << "\t"
+                    << totalTimeToCurrentTimestep << "\t " << totalLBTimeToCurrentTimestep << "\t "
+                    << numberOfProcesses << "\t ";
+               file << uint_c(numberOfBlocks.min()) << "\t ";
+               file << uint_c(numberOfBlocks.max()) << "\t ";
+               file << uint_c(numberOfBlocks.sum()) << "\n ";
+               file.close();
+            }
+         }
+
+         simulationTiming[loadEvaluationStep].end();
+
+      }
+
+      if( loadBalancingCheckFrequency != 0 && i % loadBalancingCheckFrequency == 0)
+      {
+
+         if(useProgressLogging) walberla::logging::Logging::instance()->setFileLogLevel(logging::Logging::LogLevel::PROGRESS);
+
+         simulationTiming[loadBalancingStep].start();
+
+         if( loadEvaluationStrategy != "LBM" ) {
+
+            WALBERLA_LOG_INFO_ON_ROOT("Checking for load balancing...");
+
+            // update info collections for the particle presence based check and the load balancing:
+            auto &forest = blocks->getBlockForest();
+            lbm_mesapd_coupling::amr::updateAndSyncInfoCollection<BoundaryHandling_T,ParticleAccessor_T >(forest, boundaryHandlingID, *accessor, numRPDSubCycles, *couplingInfoCollection);
+
+            if(useProgressLogging)  WALBERLA_LOG_INFO("Created info collection with size " << couplingInfoCollection->size());
+
+            auto numBlocksBefore = forest.getNumberOfBlocks();
+            if(useProgressLogging)  WALBERLA_LOG_INFO("Number of blocks before refresh " << numBlocksBefore);
+
+            mesa_pd::mpi::ClearNextNeighborSync CNNS;
+            CNNS(*accessor);
+
+            if(useProgressLogging) WALBERLA_LOG_INFO_ON_ROOT("Cleared particle sync and starting refresh");
+
+            blocks->refresh();
+
+            auto numBlocksAfter = forest.getNumberOfBlocks();
+            if(useProgressLogging)  WALBERLA_LOG_INFO("Number of blocks after refresh " << numBlocksAfter);
+
+            if(useProgressLogging) WALBERLA_LOG_INFO_ON_ROOT("Refresh finished, recreating all datastructures");
+
+            //WALBERLA_LOG_INFO(rank << ", " << numBlocksBefore << " -> " << numBlocksAfter);
+            rpdDomain->refresh();
+            if(useBlockForestSync)
+            {
+               ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, associateToBlock, *accessor);
+               syncNextNeighborBlockForestFunc(*ps, blocks->getBlockForestPointer(), rpdDomain, overlap);
+
+            } else
+            {
+               syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+            }
+
+            clearBoundaryHandling(forest, boundaryHandlingID);
+            clearParticleField(forest, particleFieldID, *accessor);
+
+            recreateBoundaryHandling(forest, boundaryHandlingID);
+
+            //NOTE: order in mapping is important: first all global/fixed particles,
+            // only then moving ones, which do not overwrite the mapping of the global/fixed ones
+            particleMappingCall();
+
+         }
+         simulationTiming[loadBalancingStep].end();
+
+         if(useProgressLogging) walberla::logging::Logging::instance()->setFileLogLevel(logging::Logging::LogLevel::INFO);
+
+      }
+
+      if(checkSimulation)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("Checking time step " << i );
+         evaluateParticleSimulation(ps, ss, numberOfSediments, numGlobalParticles);
+         checkMapping(blocks, pdfFieldID, boundaryHandlingID, probeForNaNs);
+      }
+
+      // evaluate predictions (note: reflect the predictions for all upcoming simulations, thus the corresponding measurements have to be taken afterwards)
+      if( loadEvaluationFrequency > 0 && i % loadEvaluationFrequency == 0 && fileIO)
+      {
+
+         simulationTiming[loadEvaluationStep].start();
+
+         // write process local load predictions to files (per process, per load balancing step)
+         {
+
+            auto wlLBM = real_t(0);
+            auto wlBH = real_t(0);
+            auto wlCoup1 = real_t(0);
+            auto wlCoup2 = real_t(0);
+            auto wlRPD = real_t(0);
+
+            auto & forest = blocks->getBlockForest();
+            lbm_mesapd_coupling::amr::updateAndSyncInfoCollection<BoundaryHandling_T,ParticleAccessor_T >(forest, boundaryHandlingID, *accessor, numRPDSubCycles, *couplingInfoCollection);
+
+            for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt ) {
+               auto * block = static_cast<blockforest::Block *> (&(*blockIt));
+               const auto &blockID = block->getId();
+               auto infoIt = couplingInfoCollection->find(blockID);
+               auto blockInfo = infoIt->second;
+
+               wlLBM   += lbm_mesapd_coupling::amr::fittedLBMWeightEvaluationFunction(blockInfo);
+               wlBH    += lbm_mesapd_coupling::amr::fittedBHWeightEvaluationFunction(blockInfo);
+               wlCoup1 += lbm_mesapd_coupling::amr::fittedCoup1WeightEvaluationFunction(blockInfo);
+               wlCoup2 += lbm_mesapd_coupling::amr::fittedCoup2WeightEvaluationFunction(blockInfo);
+               wlRPD   += lbm_mesapd_coupling::amr::fittedRPDWeightEvaluationFunction(blockInfo);
+
+            }
+
+            // note: we count the edge weight doubled here in total (to and from the other process). ParMetis only counts one direction.
+            uint_t edgecut = evaluateEdgeCut(forest);
+            uint_t numBlocks = forest.getNumberOfBlocks();
+
+            std::ofstream file;
+            file.open( predictionFileProcessName.c_str(), std::ofstream::app  );
+            file << predictionFileCounter << "\t " << real_c(i) / tRef << "\t"
+                 << wlLBM << "\t" << wlBH << "\t" << wlCoup1 << "\t" << wlCoup2 << "\t" << wlRPD << "\t"
+                 << edgecut << "\t" << numBlocks << "\n";
+            file.close();
+
+            predictionFileCounter++;;
+         }
+
+         simulationTiming[loadEvaluationStep].end();
+
+      }
+
+      simulationTiming[simulationStep].start();
+
+      // perform a single simulation step
+
+      timeloop.singleStep( timeloopTiming );
+
+      timeloopTiming["Reduce Hyd Force Torque"].start();
+      reduceProperty.operator()<mesa_pd::HydrodynamicForceTorqueNotification>(*ps);
+      timeloopTiming["Reduce Hyd Force Torque"].end();
+
+      timeloopTiming["RPD Force"].start();
+      if( i == 0 )
+      {
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, initializeHydrodynamicForceTorqueForAveragingKernel, *accessor );
+      }
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, averageHydrodynamicForceTorque, *accessor );
+      timeloopTiming["RPD Force"].end();
+
+      if(checkSimulation)
+      {
+         checkParticleProperties(ps);
+      }
+
+      for(auto subCycle = uint_t(0); subCycle < numRPDSubCycles; ++subCycle )
+      {
+
+         timeloopTiming["RPD VV1"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPreForce, *accessor);
+         timeloopTiming["RPD VV1"].end();
+
+         timeloopTiming["Sync"].start();
+         if(useBlockForestSync)
+         {
+            syncNextNeighborBlockForestFunc(*ps, blocks->getBlockForestPointer(), rpdDomain, overlap);
+            ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, associateToBlock, *accessor);
+         } else
+         {
+            syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+         }
+         timeloopTiming["Sync"].end();
+
+         // lubrication correction
+         timeloopTiming["RPD Lub"].start();
+         ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                     [&lubricationCorrectionKernel,maximumLubricationCutOffDistance, &rpdDomain]
+                                           (const size_t idx1, const size_t idx2, auto& ac)
+                                     {
+                                        mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                        acd.getContactThreshold() = maximumLubricationCutOffDistance;
+                                        mesa_pd::kernel::DoubleCast double_cast;
+                                        mesa_pd::mpi::ContactFilter contact_filter;
+                                        if (double_cast(idx1, idx2, ac, acd, ac ))
+                                        {
+                                           if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                           {
+                                              double_cast(acd.getIdx1(), acd.getIdx2(), ac, lubricationCorrectionKernel, ac, acd.getContactNormal(), acd.getPenetrationDepth());
+                                           }
+                                        }
+                                     },
+                                     *accessor );
+         timeloopTiming["RPD Lub"].end();
+
+         // collision response
+         timeloopTiming["RPD Collision"].start();
+         ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                     [&collisionResponse, &rpdDomain, timeStepSizeRPD]
+                                           (const size_t idx1, const size_t idx2, auto& ac)
+                                     {
+                                        mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                        mesa_pd::kernel::DoubleCast double_cast;
+                                        mesa_pd::mpi::ContactFilter contact_filter;
+                                        if (double_cast(idx1, idx2, ac, acd, ac ))
+                                        {
+                                           if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                           {
+                                              collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth(), timeStepSizeRPD);
+                                           }
+                                        }
+                                     },
+                                     *accessor );
+
+         timeloopTiming["RPD Collision"].end();
+
+         timeloopTiming["Reduce Contact History"].start();
+         reduceAndSwapContactHistory(*ps);
+         timeloopTiming["Reduce Contact History"].end();
+
+         timeloopTiming["RPD Force"].start();
+         lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction;
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addHydrodynamicInteraction, *accessor );
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor );
+         timeloopTiming["RPD Force"].end();
+
+         timeloopTiming["Reduce Force Torque"].start();
+         reduceProperty.operator()<mesa_pd::ForceTorqueNotification>(*ps);
+         timeloopTiming["Reduce Force Torque"].end();
+
+         // integration
+         timeloopTiming["RPD VV2"].start();
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPostForce, *accessor);
+         timeloopTiming["RPD VV2"].end();
+
+         timeloopTiming["Sync"].start();
+         if(useBlockForestSync)
+         {
+            syncNextNeighborBlockForestFunc(*ps, blocks->getBlockForestPointer(), rpdDomain, overlap);
+         } else
+         {
+            syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+         }
+         timeloopTiming["Sync"].end();
+
+
+      }
+
+      timeloopTiming["RPD Force"].start();
+      ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, resetHydrodynamicForceTorque, *accessor );
+      timeloopTiming["RPD Force"].end();
+
+      // update particle mapping
+      timeloopAfterParticles.singleStep(timeloopTiming);
+
+      simulationTiming[simulationStep].end();
+
+   }
+
+   simulationTiming.logResultOnRoot();
+
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace fluid_particle_workload_distribution
+
+int main( int argc, char **argv ){
+   fluid_particle_workload_distribution::main(argc, argv);
+}
diff --git a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..062e3f9c95f25baa5ddadd3b6511d9695cb5e457
--- /dev/null
+++ b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/FluidParticleWorkloadEvaluation.cpp
@@ -0,0 +1,956 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FluidParticleWorkLoadEvaluation.cpp
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "boundary/all.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+#include "core/math/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/mpi/MPIManager.h"
+#include "core/mpi/Reduce.h"
+#include "core/mpi/Broadcast.h"
+
+#include "domain_decomposition/SharedSweep.h"
+
+#include "field/AddToStorage.h"
+#include "field/communication/PackInfo.h"
+
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/lattice_model/D3Q19.h"
+#include "lbm/lattice_model/ForceModel.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+#include "lbm/sweeps/SweepWrappers.h"
+#include "lbm/BlockForestEvaluation.h"
+
+#include "lbm_mesapd_coupling/mapping/ParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/Reconstructor.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/PdfReconstructionManager.h"
+#include "lbm_mesapd_coupling/utility/AddForceOnParticlesKernel.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+#include "lbm_mesapd_coupling/DataTypes.h"
+#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h"
+#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/shape/HalfSpace.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/domain/BlockForestDataHandling.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/ExplicitEuler.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/ReduceContactHistory.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "vtk/all.h"
+#include "field/vtk/all.h"
+#include "lbm/vtk/all.h"
+
+#include <vector>
+#include <iostream>
+
+namespace fluid_particle_workload_evaluation
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+
+using LatticeModel_T = lbm::D3Q19< lbm::collision_model::TRT, false >;
+using Stencil_T = LatticeModel_T::Stencil;
+using PdfField_T = lbm::PdfField<LatticeModel_T>;
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+const uint_t FieldGhostLayers = 1;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag   ( "fluid" );
+const FlagUID MO_Flag  ( "moving obstacle" );
+const FlagUID FormerMO_Flag( "former moving obstacle" );
+
+
+/////////////////////////////////////
+// BOUNDARY HANDLING CUSTOMIZATION //
+/////////////////////////////////////
+template <typename ParticleAccessor_T>
+class MyBoundaryHandling
+{
+public:
+
+   using MO_T = lbm_mesapd_coupling::CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >;
+   using Type = BoundaryHandling< FlagField_T, Stencil_T, MO_T >;
+
+   MyBoundaryHandling( const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID,
+                       const BlockDataID & particleFieldID, const shared_ptr<ParticleAccessor_T>& ac,
+                       bool useEntireFieldTraversal) :
+         flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), particleFieldID_( particleFieldID ), ac_( ac ), useEntireFieldTraversal_(useEntireFieldTraversal) {}
+
+   Type * operator()( IBlock* const block, const StructuredBlockStorage* const storage ) const
+   {
+      WALBERLA_ASSERT_NOT_NULLPTR( block );
+      WALBERLA_ASSERT_NOT_NULLPTR( storage );
+
+      auto * flagField     = block->getData< FlagField_T >( flagFieldID_ );
+      auto *  pdfField     = block->getData< PdfField_T > ( pdfFieldID_ );
+      auto * particleField = block->getData< lbm_mesapd_coupling::ParticleField_T > ( particleFieldID_ );
+
+      const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag );
+
+      typename Type::Mode mode = (useEntireFieldTraversal_) ? Type::Mode::ENTIRE_FIELD_TRAVERSAL : Type::Mode::OPTIMIZED_SPARSE_TRAVERSAL ;
+
+      Type * handling = new Type( "moving obstacle boundary handling", flagField, fluid,
+                                  MO_T( "MO", MO_Flag, pdfField, flagField, particleField, ac_, fluid, *storage, *block ),
+                                  mode);
+
+      handling->fillWithDomain( FieldGhostLayers );
+
+      return handling;
+   }
+
+private:
+
+   const BlockDataID flagFieldID_;
+   const BlockDataID pdfFieldID_;
+   const BlockDataID particleFieldID_;
+
+   shared_ptr<ParticleAccessor_T> ac_;
+
+   bool useEntireFieldTraversal_;
+};
+
+template< typename BoundaryHandling_T >
+void evaluateFluidQuantities(const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID boundaryHandlingID,
+                             uint_t & numCells, uint_t & numFluidCells, uint_t & numNBCells )
+{
+
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto * boundaryHandling = blockIt->getData< BoundaryHandling_T >( boundaryHandlingID );
+      auto xyzSize = boundaryHandling->getFlagField()->xyzSize();
+      numCells += xyzSize.numCells();
+
+      for(auto z = cell_idx_t(xyzSize.zMin()); z <= cell_idx_t(xyzSize.zMax()); ++z ){
+         for(auto y = cell_idx_t(xyzSize.yMin()); y <= cell_idx_t(xyzSize.yMax()); ++y ){
+            for(auto x = cell_idx_t(xyzSize.xMin()); x <= cell_idx_t(xyzSize.xMax()); ++x ) {
+               if (boundaryHandling->isDomain(x, y, z)) {
+                  ++numFluidCells;
+               }
+               if (boundaryHandling->isNearBoundary(x, y, z)) {
+                  ++numNBCells;
+               }
+            }
+         }
+      }
+   }
+}
+
+void evaluateRPDQuantities( const shared_ptr< mesa_pd::data::ParticleStorage > & ps,
+                            uint_t & numLocalParticles, uint_t & numGhostParticles)
+{
+
+   for (auto pIt = ps->begin(); pIt != ps->end(); ++pIt)
+   {
+      using namespace walberla::mesa_pd::data::particle_flags;
+      if (isSet(pIt->getFlags(), GHOST))
+      {
+         ++numGhostParticles;
+      } else
+      {
+         //note: global particles are included here
+         // use if (!isSet(pIt->getFlags(), GLOBAL)) if should be excluded
+         ++numLocalParticles;
+      }
+   }
+}
+
+void evaluateTimers(WcTimingPool & timingPool,
+                    const std::vector<std::vector<std::string> > & timerKeys,
+                    std::vector<double> & timings )
+{
+
+   for (auto & timingsIt : timings)
+   {
+      timingsIt = 0.0;
+   }
+
+   timingPool.unifyRegisteredTimersAcrossProcesses();
+
+   double scalingFactor = 1000.0; // milliseconds
+
+   for (auto i = uint_t(0); i < timerKeys.size(); ++i )
+   {
+      auto keys = timerKeys[i];
+      for (const auto &timerName : keys)
+      {
+         if(timingPool.timerExists(timerName))
+         {
+            timings[i] += timingPool[timerName].total() * scalingFactor;
+         }
+      }
+
+   }
+}
+
+
+//*******************************************************************************************************************
+/*! Application to evaluate the workload (time measurements) for a fluid-particle simulation
+ *
+ * This application is used in the paper
+ *  Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", 2019, Computation
+ * in Section 3 to develop and calibrate the workload estimator.
+ * The setup features settling particle inside a horizontally periodic box.
+ * A comprehensive description is given in Sec. 3.3 of the paper.
+ * It uses 4 x 4 x 5 blocks for domain partitioning.
+ * For each block ( = each process), the block local quantities are evaluated as well as the timing infos of
+ * the fluid-particle interaction algorithm. Those infos are then written to files that can be used later on
+ * for function fitting to obtain a workload estimator.
+ *
+ * NOTE: Since this estimator relies on timing measurements, this evaluation procedure should be carried out everytime
+ * a different implementation, hardware or algorithm is used.
+ *
+ */
+//*******************************************************************************************************************
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+
+   mpi::Environment env( argc, argv );
+
+   auto solidVolumeFraction = real_t(0.2);
+
+   // LBM / numerical parameters
+   auto blockSize  = uint_t(32);
+   auto uSettling = real_t(0.1); // characteristic settling velocity
+   auto diameter = real_t(10);
+
+   auto Ga = real_t(30); //Galileo number
+   auto numRPDSubCycles = uint_t(10);
+
+   auto vtkIOFreq = uint_t(0);
+   auto timestepsNonDim = real_t(2.5);
+   auto numSamples = uint_t(2000);
+   std::string baseFolder = "workload_files"; // folder for vtk and file output
+
+   bool noFileOutput = false;
+   bool useEntireFieldTraversal = true;
+   bool useFusedStreamCollide = false;
+
+   auto XBlocks = uint_t(4);
+   auto YBlocks = uint_t(4);
+   auto ZBlocks = uint_t(5);
+
+   real_t topWallOffsetFactor = real_t(1.05);
+
+   bool vtkDuringInit = false;
+
+   for( int i = 1; i < argc; ++i )
+   {
+      if( std::strcmp( argv[i], "--vtkIOFreq"               ) == 0 ) { vtkIOFreq = uint_c( std::atof( argv[++i] ) ); continue; }
+      if( std::strcmp( argv[i], "--noFileOutput"            ) == 0 ) { noFileOutput = true; continue; }
+      if( std::strcmp( argv[i], "--vtkDuringInit"           ) == 0 ) { vtkDuringInit = true; continue; }
+      if( std::strcmp( argv[i], "--basefolder"              ) == 0 ) { baseFolder = argv[++i]; continue; }
+      if( std::strcmp( argv[i], "--solidVolumeFraction"     ) == 0 ) { solidVolumeFraction = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--diameter"                ) == 0 ) { diameter = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--blockSize"               ) == 0 ) { blockSize = uint_c(std::atof( argv[++i]) ); continue; }
+      if( std::strcmp( argv[i], "--XBlocks"                 ) == 0 ) { XBlocks = uint_c(std::atof( argv[++i]) ); continue; }
+      if( std::strcmp( argv[i], "--YBlocks"                 ) == 0 ) { YBlocks = uint_c(std::atof( argv[++i]) ); continue; }
+      if( std::strcmp( argv[i], "--ZBlocks"                 ) == 0 ) { ZBlocks = uint_c(std::atof( argv[++i]) ); continue; }
+      if( std::strcmp( argv[i], "--uSettling"               ) == 0 ) { uSettling = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--topWallOffsetFactor"     ) == 0 ) { topWallOffsetFactor = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--Ga"                      ) == 0 ) { Ga = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--timestepsNonDim"         ) == 0 ) { timestepsNonDim = real_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--numRPDSubCycles"         ) == 0 ) { numRPDSubCycles = uint_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--useEntireFieldTraversal" ) == 0 ) { useEntireFieldTraversal = true; continue; }
+      if( std::strcmp( argv[i], "--numSamples"              ) == 0 ) { numSamples = uint_c(std::atof( argv[++i] )); continue; }
+      if( std::strcmp( argv[i], "--useFusedStreamCollide"   ) == 0 ) { useFusedStreamCollide = true; continue; }
+      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
+   }
+
+   WALBERLA_CHECK(diameter > real_t(1));
+   WALBERLA_CHECK(uSettling > real_t(0));
+   WALBERLA_CHECK(Ga > real_t(0));
+   WALBERLA_CHECK(solidVolumeFraction > real_t(0));
+   WALBERLA_CHECK(solidVolumeFraction < real_t(0.65));
+
+   ///////////////////////////
+   // SIMULATION PROPERTIES //
+   ///////////////////////////
+
+
+   if( MPIManager::instance()->numProcesses() != int(XBlocks * YBlocks * ZBlocks) )
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("WARNING! You have specified less or more processes than number of blocks -> the time measurements are no longer blockwise!")
+   }
+
+   if( diameter > real_c(blockSize) )
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("The bodies might be too large to work with the currently used synchronization!");
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("Using setup with sedimenting particles -> creating two planes and applying gravitational force")
+
+   const uint_t XCells = blockSize * XBlocks;
+   const uint_t YCells = blockSize * YBlocks;
+   const uint_t ZCells = blockSize * ZBlocks;
+
+   const real_t topWallOffset =  topWallOffsetFactor * real_t(blockSize); // move the top wall downwards to take away a certain portion of the overall domain
+
+   // determine number of spheres to generate, if necessary scale diameter a bit to reach desired solid volume fraction
+   real_t domainHeight = real_c(ZCells) - topWallOffset;
+   real_t fluidVolume =  real_c( XCells * YCells ) * domainHeight;
+   real_t solidVolume = solidVolumeFraction * fluidVolume;
+   uint_t numberOfParticles = uint_c(std::ceil(solidVolume / ( math::pi / real_t(6) * diameter * diameter * diameter )));
+   diameter = std::cbrt( solidVolume / ( real_c(numberOfParticles) * math::pi / real_t(6) ) );
+
+   auto densityRatio = real_t(2.5);
+
+   real_t viscosity = uSettling * diameter / Ga;
+   const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
+
+   const real_t gravitationalAcceleration = uSettling * uSettling / ( (densityRatio-real_t(1)) * diameter );
+
+   real_t tref = diameter / uSettling;
+   real_t Tref = domainHeight / uSettling;
+
+   uint_t timesteps = uint_c(timestepsNonDim * Tref);
+
+   const real_t dx = real_c(1);
+   WALBERLA_LOG_INFO_ON_ROOT("viscosity = " << viscosity);
+   WALBERLA_LOG_INFO_ON_ROOT("tau = " << real_t(1)/omega);
+   WALBERLA_LOG_INFO_ON_ROOT("diameter = " << diameter);
+   WALBERLA_LOG_INFO_ON_ROOT("solid volume fraction = " << solidVolumeFraction);
+   WALBERLA_LOG_INFO_ON_ROOT("domain size (in cells) = " << XCells << " x " << YCells << " x " << ZCells);
+   WALBERLA_LOG_INFO_ON_ROOT("number of bodies = " << numberOfParticles);
+   WALBERLA_LOG_INFO_ON_ROOT("gravitational acceleration = " << gravitationalAcceleration);
+   WALBERLA_LOG_INFO_ON_ROOT("Ga = " << Ga);
+   WALBERLA_LOG_INFO_ON_ROOT("uSettling = " << uSettling);
+   WALBERLA_LOG_INFO_ON_ROOT("tref = " << tref);
+   WALBERLA_LOG_INFO_ON_ROOT("Tref = " << Tref);
+   WALBERLA_LOG_INFO_ON_ROOT("timesteps = " << timesteps);
+   WALBERLA_LOG_INFO_ON_ROOT("number of workload samples = " << numSamples);
+
+   // create folder to store logging files
+   WALBERLA_ROOT_SECTION()
+   {
+      walberla::filesystem::path path1( baseFolder );
+      if( !walberla::filesystem::exists( path1 ) )
+         walberla::filesystem::create_directory( path1 );
+   }
+
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   Vector3<bool> periodicity( true );
+   periodicity[2] = false;
+
+   // create domain
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid( XBlocks, YBlocks, ZBlocks, blockSize, blockSize, blockSize, dx,
+                                                    0, false, false, //one block per process!
+                                                    periodicity[0], periodicity[1], periodicity[2], //periodicity
+                                                    false );
+
+   /////////
+   // RPD //
+   /////////
+
+   const real_t restitutionCoeff = real_t(0.97);
+   const real_t frictionCoeffStatic = real_t(0.8);
+   const real_t frictionCoeffDynamic = real_t(0.15);
+   const real_t collisionTime = real_t(4) * diameter; // from my paper
+   const real_t poissonsRatio = real_t(0.22);
+   const real_t kappa = real_t(2) * ( real_t(1) - poissonsRatio ) / ( real_t(2) - poissonsRatio ) ;
+
+   auto rpdDomain = std::make_shared<mesa_pd::domain::BlockForestDomain>(blocks->getBlockForestPointer());
+
+   //init data structures
+   auto ps = walberla::make_shared<mesa_pd::data::ParticleStorage>(1);
+   auto ss = walberla::make_shared<mesa_pd::data::ShapeStorage>();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor = walberla::make_shared<ParticleAccessor_T >(ps, ss);
+
+   real_t timeStepSizeRPD = real_t(1)/real_t(numRPDSubCycles);
+   mesa_pd::kernel::VelocityVerletPreForceUpdate  vvIntegratorPreForce(timeStepSizeRPD);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeRPD);
+
+   // types: 0 = wall, 1: sphere
+   mesa_pd::kernel::LinearSpringDashpot collisionResponse(2);
+   collisionResponse.setFrictionCoefficientDynamic(0,1,frictionCoeffDynamic);
+   collisionResponse.setFrictionCoefficientDynamic(1,1,frictionCoeffDynamic);
+   collisionResponse.setFrictionCoefficientStatic(0,1,frictionCoeffStatic);
+   collisionResponse.setFrictionCoefficientStatic(1,1,frictionCoeffStatic);
+
+   const real_t sphereVolume = math::pi / real_t(6) * diameter * diameter * diameter;
+   const real_t particleMass = densityRatio * sphereVolume;
+   const real_t effMass_sphereWall = particleMass;
+   const real_t effMass_sphereSphere = particleMass * particleMass / ( real_t(2) * particleMass );
+   collisionResponse.setStiffnessAndDamping(0,1,restitutionCoeff,collisionTime,kappa,effMass_sphereWall);
+   collisionResponse.setStiffnessAndDamping(1,1,restitutionCoeff,collisionTime,kappa,effMass_sphereSphere);
+
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+
+   //////////////
+   // COUPLING //
+   //////////////
+
+   // connect to pe
+   const real_t overlap = real_c( 1.5 ) * dx;
+
+   std::function<void(void)> syncCall = [&ps,&rpdDomain,overlap](){
+      mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+      syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+   };
+
+   auto generationDomain = AABB( real_t(0), real_t(0), real_t(0), real_c(XCells), real_c(YCells), real_c(ZCells) - topWallOffset);
+
+   // create plane at top and bottom
+
+   mesa_pd::data::Particle&& p0 = *ps->create(true);
+   p0.setPosition(generationDomain.minCorner());
+   p0.setShapeID(ss->create<mesa_pd::data::HalfSpace>( Vector3<real_t>(0,0,1) ));
+   p0.setOwner(mpi::MPIManager::instance()->rank());
+   p0.setType(0);
+   mesa_pd::data::particle_flags::set(p0.getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+   mesa_pd::data::particle_flags::set(p0.getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+
+   mesa_pd::data::Particle&& p1 = *ps->create(true);
+   p1.setPosition(generationDomain.maxCorner());
+   p1.setShapeID(ss->create<mesa_pd::data::HalfSpace>( Vector3<real_t>(0,0,-1) ));
+   p1.setOwner(mpi::MPIManager::instance()->rank());
+   p1.setType(0);
+   mesa_pd::data::particle_flags::set(p1.getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+   mesa_pd::data::particle_flags::set(p1.getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+
+   auto sphereShape = ss->create<mesa_pd::data::Sphere>( diameter * real_t(0.5) );
+   ss->shapes[sphereShape]->updateMassAndInertia(densityRatio);
+
+   auto xParticle = real_t(0);
+   auto yParticle = real_t(0);
+   auto zParticle = real_t(0);
+
+   auto rank = mpi::MPIManager::instance()->rank();
+
+   for( uint_t nPart = 0; nPart < numberOfParticles; ++nPart )
+   {
+
+      WALBERLA_ROOT_SECTION()
+      {
+         xParticle = math::realRandom<real_t>(generationDomain.xMin(), generationDomain.xMax());
+         yParticle = math::realRandom<real_t>(generationDomain.yMin(), generationDomain.yMax());
+         zParticle = math::realRandom<real_t>(generationDomain.zMin(), generationDomain.zMax());
+
+      }
+
+      WALBERLA_MPI_SECTION()
+      {
+         mpi::broadcastObject( xParticle );
+         mpi::broadcastObject( yParticle );
+         mpi::broadcastObject( zParticle );
+      }
+
+      auto position = Vector3<real_t>( xParticle, yParticle, zParticle );
+      //WALBERLA_LOG_INFO_ON_ROOT(position);
+
+      if (!rpdDomain->isContainedInProcessSubdomain(uint_c(rank), position)) continue;
+      auto p                       = ps->create();
+      p->setPosition(position);
+      p->setInteractionRadius(diameter * real_t(0.5));
+      p->setShapeID(sphereShape);
+      p->setType(1);
+      p->setOwner(rank);
+   }
+
+   syncCall();
+
+   // resolve possible overlaps of the particles due to the random initialization via a particle-only simulation
+
+   const bool useOpenMP = false;
+   const real_t dt_RPD_Init = real_t(1);
+   const auto initialParticleSimSteps = uint_t(20000);
+   const real_t overlapLimit = real_t(0.001) * diameter;
+
+   auto particleVtkOutput = make_shared<mesa_pd::vtk::ParticleVtkOutput>(ps);
+   particleVtkOutput->addOutput<mesa_pd::data::SelectParticleLinearVelocity>("velocity");
+   particleVtkOutput->setParticleSelector( [sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {return !mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST) && pIt->getShapeID() == sphereShape;} ); //limit output to local sphere
+   auto particleVtkWriterInit = vtk::createVTKOutput_PointData(particleVtkOutput, "Particles_init", 1, baseFolder, "simulation_step");
+
+   for(auto pet = uint_t(0); pet <= initialParticleSimSteps; ++pet )
+   {
+
+
+      real_t maxPenetrationDepth = 0;
+      ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                  [&collisionResponse, &rpdDomain, &maxPenetrationDepth, dt_RPD_Init]
+                                        (const size_t idx1, const size_t idx2, auto& ac)
+                                  {
+                                     mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                     mesa_pd::kernel::DoubleCast double_cast;
+                                     mesa_pd::mpi::ContactFilter contact_filter;
+                                     if (double_cast(idx1, idx2, ac, acd, ac ))
+                                     {
+                                        if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                        {
+                                           maxPenetrationDepth = std::max(maxPenetrationDepth, std::abs(acd.getPenetrationDepth()));
+                                           collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(),
+                                                 acd.getContactNormal(), acd.getPenetrationDepth(), dt_RPD_Init);
+                                        }
+                                     }
+                                  },
+                                  *accessor );
+
+      reduceAndSwapContactHistory(*ps);
+
+      mpi::allReduceInplace(maxPenetrationDepth, mpi::MAX);
+
+      reduceProperty.operator()<mesa_pd::ForceTorqueNotification>(*ps);
+
+      ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, mesa_pd::kernel::ExplicitEuler(dt_RPD_Init), *accessor);
+      syncCall();
+
+      if( pet % uint_t(20) == uint_t(0) )
+      {
+         if(vtkDuringInit)
+         {
+            particleVtkWriterInit->write();
+         }
+         WALBERLA_LOG_INFO_ON_ROOT(pet << " - current max overlap = " << maxPenetrationDepth / diameter * real_t(100) << "%");
+      }
+
+      if(maxPenetrationDepth < overlapLimit) break;
+
+      // reset velocites to avoid too large ones
+
+      ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+            [](const size_t idx, ParticleAccessor_T& ac){
+               ac.setLinearVelocity(idx, ac.getLinearVelocity(idx) * real_t(0.5));
+               ac.setAngularVelocity(idx, ac.getAngularVelocity(idx) * real_t(0.5));
+               }, *accessor);
+
+
+   }
+
+
+   // reset all velocities to zero
+   Vector3<real_t> initialSphereVelocity(real_t(0));
+
+   ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, [](const size_t idx, ParticleAccessor_T& ac){
+      ac.getNewContactHistoryRef(idx).clear();
+      ac.getOldContactHistoryRef(idx).clear();
+   }, *accessor);
+
+   WALBERLA_LOG_INFO_ON_ROOT("Setting initial velocity " << initialSphereVelocity << " of all spheres");
+   ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
+                        [initialSphereVelocity](const size_t idx, ParticleAccessor_T& ac){
+                           ac.setLinearVelocity(idx, initialSphereVelocity);
+                           ac.setAngularVelocity(idx, Vector3<real_t>(real_t(0)));
+                        }, *accessor);
+
+   syncCall();
+
+   ///////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // create the lattice model
+   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) );
+
+   // add PDF field
+   BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel,
+                                                                         Vector3< real_t >( real_t(0) ), real_t(1),
+                                                                         uint_t(1), field::zyxf );
+
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );
+
+   // add particle field
+   BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers );
+
+   // add boundary handling & initialize outer domain boundaries
+   using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type;
+   BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >(MyBoundaryHandling<ParticleAccessor_T>( flagFieldID, pdfFieldID, particleFieldID, accessor, useEntireFieldTraversal), "boundary handling" );
+
+   Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
+   lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+   lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque;
+   lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel(viscosity, [](real_t r){return (real_t(0.001 + real_t(0.00007)*r))*r;});
+   lbm_mesapd_coupling::ParticleMappingKernel<BoundaryHandling_T> particleMappingKernel(blocks, boundaryHandlingID);
+   lbm_mesapd_coupling::MovingParticleMappingKernel<BoundaryHandling_T> movingParticleMappingKernel(blocks, boundaryHandlingID, particleFieldID);
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - Lubrication correction:");
+   WALBERLA_LOG_INFO_ON_ROOT("   - normal cut off distance = " << lubricationCorrectionKernel.getNormalCutOffDistance());
+   WALBERLA_LOG_INFO_ON_ROOT("   - tangential translational cut off distance = " << lubricationCorrectionKernel.getTangentialTranslationalCutOffDistance());
+   WALBERLA_LOG_INFO_ON_ROOT("   - tangential rotational cut off distance = " << lubricationCorrectionKernel.getTangentialRotationalCutOffDistance());
+   const real_t maximumLubricationCutOffDistance = std::max(lubricationCorrectionKernel.getNormalCutOffDistance(), std::max(lubricationCorrectionKernel.getTangentialRotationalCutOffDistance(), lubricationCorrectionKernel.getTangentialTranslationalCutOffDistance()));
+
+
+   // map planes into the LBM simulation -> act as no-slip boundaries
+   ps->forEachParticle(false, lbm_mesapd_coupling::GlobalParticlesSelector(), *accessor, movingParticleMappingKernel, *accessor, MO_Flag);
+
+   // map particles into the LBM simulation
+   ps->forEachParticle(false, lbm_mesapd_coupling::RegularParticlesSelector(), *accessor, movingParticleMappingKernel, *accessor, MO_Flag);
+
+   lbm::BlockForestEvaluation<FlagField_T> bfEval(blocks, flagFieldID, Fluid_Flag);
+
+   WALBERLA_LOG_INFO_ON_ROOT(bfEval.loggingString());
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // create the timeloop
+   SweepTimeloop timeloop( blocks->getBlockStorage(), timesteps );
+
+   timeloop.addFuncBeforeTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );
+
+   if( vtkIOFreq != uint_t(0) )
+   {
+      auto particleVtkWriter = vtk::createVTKOutput_PointData(particleVtkOutput, "Particles", vtkIOFreq, baseFolder, "simulation_step");
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( particleVtkWriter ), "VTK (sphere data)" );
+
+      // flag field
+      auto flagFieldVTK = vtk::createVTKOutput_BlockData( blocks, "flag_field", vtkIOFreq, 1, false, baseFolder );
+      flagFieldVTK->addCellDataWriter( make_shared< field::VTKWriter< FlagField_T > >( flagFieldID, "FlagField" ) );
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( flagFieldVTK ), "VTK (flag field data)" );
+
+      // pdf field
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData( blocks, "fluid_field", vtkIOFreq, 0, false, baseFolder );
+
+      field::FlagFieldCellFilter< FlagField_T > fluidFilter( flagFieldID );
+      fluidFilter.addFlag( Fluid_Flag );
+      pdfFieldVTK->addCellInclusionFilter( fluidFilter );
+
+      pdfFieldVTK->addCellDataWriter( make_shared< lbm::VelocityVTKWriter< LatticeModel_T, float > >( pdfFieldID, "VelocityFromPDF" ) );
+      pdfFieldVTK->addCellDataWriter( make_shared< lbm::DensityVTKWriter < LatticeModel_T, float > >( pdfFieldID, "DensityFromPDF" ) );
+
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( pdfFieldVTK ), "VTK (fluid field data)" );
+
+      auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkIOFreq, baseFolder );
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
+   }
+
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+   blockforest::communication::UniformBufferedScheme< Stencil_T > optimizedPDFCommunicationScheme( blocks );
+   optimizedPDFCommunicationScheme.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldID ) ); // optimized sync
+
+   auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
+
+   if( !useFusedStreamCollide )
+   {
+      // Collide
+      timeloop.add() << Sweep( makeCollideSweep(sweep), "Collide" );
+   }
+
+   // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
+   timeloop.add() << BeforeFunction( optimizedPDFCommunicationScheme, "LBM Communication" )
+                  << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
+
+   if( useFusedStreamCollide )
+   {
+      // streaming & collide
+      timeloop.add() << Sweep( makeSharedSweep(sweep), "Stream&Collide" );
+   } else
+   {
+      // streaming
+      timeloop.add() << Sweep( makeStreamSweep(sweep), "Stream" );
+
+   }
+
+   SweepTimeloop timeloopAfterParticles( blocks->getBlockStorage(), timesteps );
+
+   bool conserveMomentum = false;
+   // sweep for updating the particle mapping into the LBM simulation
+   timeloopAfterParticles.add() << Sweep( lbm_mesapd_coupling::makeMovingParticleMapping<PdfField_T,BoundaryHandling_T>(blocks, pdfFieldID,boundaryHandlingID, particleFieldID, accessor, MO_Flag, FormerMO_Flag,
+         lbm_mesapd_coupling::RegularParticlesSelector(), conserveMomentum), "Particle Mapping" );
+
+   bool recomputeTargetDensity = false;
+   auto gradReconstructor = lbm_mesapd_coupling::makeGradsMomentApproximationReconstructor<BoundaryHandling_T>(blocks, boundaryHandlingID, omega, recomputeTargetDensity,true);
+
+
+   blockforest::communication::UniformBufferedScheme< Stencil_T > fullPDFCommunicationScheme( blocks );
+   fullPDFCommunicationScheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField_T > >( pdfFieldID ) ); // full sync
+
+   timeloopAfterParticles.add() << BeforeFunction( fullPDFCommunicationScheme, "PDF Communication" )
+                                << Sweep( makeSharedSweep(lbm_mesapd_coupling::makePdfReconstructionManager<PdfField_T,BoundaryHandling_T>(blocks, pdfFieldID, boundaryHandlingID, particleFieldID, accessor, FormerMO_Flag, Fluid_Flag,
+                                      gradReconstructor, conserveMomentum) ), "PDF Restore" );
+
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+
+   std::vector< std::vector<std::string> > timerKeys;
+   std::vector<std::string> LBMTimer;
+   LBMTimer.emplace_back("Stream&Collide");
+   LBMTimer.emplace_back("Stream");
+   LBMTimer.emplace_back("Collide");
+   timerKeys.push_back(LBMTimer);
+
+   std::vector<std::string> bhTimer;
+   bhTimer.emplace_back("Boundary Handling");
+   timerKeys.push_back(bhTimer);
+
+   std::vector<std::string> couplingTimer1;
+   couplingTimer1.emplace_back("Particle Mapping");
+   std::vector<std::string> couplingTimer2;
+   couplingTimer2.emplace_back("PDF Restore");
+   timerKeys.push_back(couplingTimer1);
+   timerKeys.push_back(couplingTimer2);
+
+   std::vector<std::string> rpdTimer;
+   rpdTimer.emplace_back("RPD Force");
+   rpdTimer.emplace_back("RPD VV1");
+   rpdTimer.emplace_back("RPD VV2");
+   rpdTimer.emplace_back("RPD Lub");
+   rpdTimer.emplace_back("RPD Collision");
+   timerKeys.push_back(rpdTimer);
+
+   uint_t numCells = uint_t(0);
+   uint_t numFluidCells = uint_t(0);
+   uint_t numNBCells = uint_t(0);
+   uint_t numLocalParticles = uint_t(0);
+   uint_t numGhostParticles = uint_t(0);
+   uint_t numContacts = uint_t(0);
+
+   std::vector<double> timings(timerKeys.size());
+
+   // every rank writes its own file -> numProcesses number of samples!
+   int myRank = MPIManager::instance()->rank();
+
+   std::string logFileName = baseFolder + "/load";
+   logFileName += "_settling";
+   logFileName += "_spheres";
+   logFileName += "_d" + std::to_string(int_c(std::ceil(diameter)));
+   logFileName += "_bs" + std::to_string(blockSize);
+   logFileName += "_" + std::to_string(myRank) + ".txt";
+
+
+   std::ofstream file;
+
+   if(!noFileOutput)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Writing load info to file " << logFileName);
+      file.open( logFileName.c_str());
+      file << "# svf = " << solidVolumeFraction << ", d = " << diameter << ", domain = " << XCells << "x" << YCells << "x" << ZCells << "\n";
+   }
+
+
+   auto timeStepOfFirstTiming = uint_t(50);
+
+   if( timesteps - timeStepOfFirstTiming < numSamples )
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("Less actual time steps than number of required samples!");
+   }
+
+   uint_t nSample( 0 ); // number of current sample
+   real_t samplingFrequency = real_c(timesteps - timeStepOfFirstTiming) / real_c(numSamples);
+
+   // time loop
+   for (uint_t i = 0; i < timesteps; ++i )
+   {
+      // perform a single simulation step
+
+      timeloop.singleStep( timeloopTiming );
+
+      reduceProperty.operator()<mesa_pd::HydrodynamicForceTorqueNotification>(*ps);
+
+      timeloopTiming["RPD Force"].start();
+      if( i == 0 )
+      {
+         lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel initializeHydrodynamicForceTorqueForAveragingKernel;
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, initializeHydrodynamicForceTorqueForAveragingKernel, *accessor );
+      }
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, averageHydrodynamicForceTorque, *accessor );
+      timeloopTiming["RPD Force"].end();
+
+      for(auto subCycle = uint_t(0); subCycle < numRPDSubCycles; ++subCycle )
+      {
+
+         timeloopTiming["RPD VV1"].start();
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPreForce, *accessor);
+         timeloopTiming["RPD VV1"].end();
+
+         syncCall();
+
+         // lubrication correction
+         timeloopTiming["RPD Lub"].start();
+         ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                     [&lubricationCorrectionKernel,maximumLubricationCutOffDistance, &rpdDomain,&numContacts]
+                                     (const size_t idx1, const size_t idx2, auto& ac)
+                                     {
+                                        mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                        acd.getContactThreshold() = maximumLubricationCutOffDistance;
+                                        mesa_pd::kernel::DoubleCast double_cast;
+                                        mesa_pd::mpi::ContactFilter contact_filter;
+                                        if (double_cast(idx1, idx2, ac, acd, ac ))
+                                        {
+                                           if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                           {
+                                              double_cast(acd.getIdx1(), acd.getIdx2(), ac, lubricationCorrectionKernel, ac, acd.getContactNormal(), acd.getPenetrationDepth());
+                                              ++numContacts; // this then also includes all actual contacts since there the contact threshold is smaller
+                                           }
+                                        }
+                                     },
+                                     *accessor );
+         timeloopTiming["RPD Lub"].end();
+
+         // collision response
+         timeloopTiming["RPD Collision"].start();
+         ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                     [&collisionResponse, &rpdDomain, timeStepSizeRPD]
+                                           (const size_t idx1, const size_t idx2, auto& ac)
+                                     {
+                                        mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                        mesa_pd::kernel::DoubleCast double_cast;
+                                        mesa_pd::mpi::ContactFilter contact_filter;
+                                        if (double_cast(idx1, idx2, ac, acd, ac ))
+                                        {
+                                           if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                           {
+                                              collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth(), timeStepSizeRPD);
+                                           }
+                                        }
+                                     },
+                                     *accessor );
+
+         timeloopTiming["RPD Collision"].end();
+
+         reduceAndSwapContactHistory(*ps);
+
+         timeloopTiming["RPD Force"].start();
+         lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction;
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addHydrodynamicInteraction, *accessor );
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor );
+         timeloopTiming["RPD Force"].end();
+
+         reduceProperty.operator()<mesa_pd::ForceTorqueNotification>(*ps);
+
+         // integration
+         timeloopTiming["RPD VV2"].start();
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPostForce, *accessor);
+         timeloopTiming["RPD VV2"].end();
+
+         syncCall();
+
+
+      }
+
+      timeloopTiming["RPD Force"].start();
+      ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, resetHydrodynamicForceTorque, *accessor );
+      timeloopTiming["RPD Force"].end();
+
+
+      // update particle mapping
+      timeloopAfterParticles.singleStep(timeloopTiming);
+
+      //WALBERLA_LOG_INFO_ON_ROOT(timeloopTiming);
+
+      // check if current time step should be included in sample
+      if( i >= uint_c( samplingFrequency * real_c(nSample) ) + timeStepOfFirstTiming )
+      {
+         // include -> evaluate all timers and quantities
+
+         evaluateFluidQuantities<BoundaryHandling_T>(blocks, boundaryHandlingID, numCells, numFluidCells, numNBCells);
+         evaluateRPDQuantities(ps, numLocalParticles, numGhostParticles);
+
+         evaluateTimers(timeloopTiming, timerKeys, timings);
+
+         if(!noFileOutput)
+         {
+            auto totalTime = std::accumulate(timings.begin(), timings.end(), 0.0 );
+
+            file << timeloop.getCurrentTimeStep() << " " << real_c(timeloop.getCurrentTimeStep()) / Tref << " "
+                 << numCells << " " << numFluidCells << " " << numNBCells << " "
+                 << numLocalParticles << " " << numGhostParticles << " "
+                 << real_c(numContacts) / real_c(numRPDSubCycles) << " " << numRPDSubCycles;
+            for (auto timing : timings) {
+               file << " " << timing;
+            }
+            file << " " << totalTime << "\n";
+         }
+
+         ++nSample;
+      }
+
+      numCells = uint_t(0);
+      numFluidCells = uint_t(0);
+      numNBCells = uint_t(0);
+      numLocalParticles = uint_t(0);
+      numGhostParticles = uint_t(0);
+      numContacts = uint_t(0);
+
+      // reset timers to always include only a single time step in them
+      timeloopTiming.clear();
+   }
+
+   if(!noFileOutput) {
+      file.close();
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished!");
+
+   return 0;
+
+}
+
+}
+
+int main( int argc, char **argv ){
+   fluid_particle_workload_evaluation::main(argc, argv);
+}
diff --git a/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/Utility.h b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/Utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9e4c05f1273a463486ef7b18647d31020d5bfda
--- /dev/null
+++ b/apps/benchmarks/FluidParticleCouplingWithLoadBalancing/Utility.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include "lbm_mesapd_coupling/amr/BlockInfo.h"
+
+namespace walberla {
+namespace lbm_mesapd_coupling  {
+namespace amr {
+
+/*
+ * Result from the workload evaluation as described in
+ *  Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", 2019, Computation
+ */
+real_t fittedLBMWeightEvaluationFunction(const BlockInfo& blockInfo)
+{
+   uint_t Ce = blockInfo.numberOfCells;
+   uint_t F  = blockInfo.numberOfFluidCells;
+   real_t weight = real_t(7.597476065046571e-06) * real_c(Ce) + real_t(8.95723566283202e-05) * real_c(F) + real_t(-0.1526111388616016);
+   return std::max(weight,real_t(0));
+}
+real_t fittedBHWeightEvaluationFunction(const BlockInfo& blockInfo)
+{
+   uint_t Ce = blockInfo.numberOfCells;
+   uint_t NB = blockInfo.numberOfNearBoundaryCells;
+   real_t weight = real_t(1.3067711379655123e-07) * real_c(Ce) + real_t(0.0007289549127205142) * real_c(NB) + real_t(-0.1575698071795788);
+   return std::max(weight,real_t(0));
+}
+real_t fittedRPDWeightEvaluationFunction(const BlockInfo& blockInfo)
+{
+   uint_t Pl = blockInfo.numberOfLocalParticles;
+   uint_t Pg = blockInfo.numberOfGhostParticles;
+   uint_t Sc = blockInfo.numberOfRPDSubCycles;
+   real_t cPlPg2 = real_t(2.402288635599054e-05);
+   real_t cPl    = real_t(0.00040932622363097144);
+   real_t cPg    = real_t(0.0007268941363125683);
+   real_t c      = real_t(2.01883028312316e-19);
+   real_t weight = real_c(Sc) * ( cPlPg2 * real_c(Pl+Pg) * real_c(Pl+Pg) + cPl * real_c(Pl) + cPg * real_c(Pg) + c );
+   return std::max(weight,real_t(0));
+}
+real_t fittedCoup1WeightEvaluationFunction(const BlockInfo& blockInfo)
+{
+   uint_t Ce = blockInfo.numberOfCells;
+   uint_t F  = blockInfo.numberOfFluidCells;
+   uint_t Pl = blockInfo.numberOfLocalParticles;
+   uint_t Pg = blockInfo.numberOfGhostParticles;
+   real_t weight = real_t(5.610203409278647e-06) * real_c(Ce) + real_t(-7.257635845636656e-07) * real_c(F) + real_t(0.02049703546054693) * real_c(Pl) + real_t(0.04248208493809902) * real_c(Pg) + real_t(-0.26609470510074784);
+   return std::max(weight,real_t(0));
+}
+real_t fittedCoup2WeightEvaluationFunction(const BlockInfo& blockInfo)
+{
+   uint_t Ce = blockInfo.numberOfCells;
+   uint_t F  = blockInfo.numberOfFluidCells;
+   uint_t Pl = blockInfo.numberOfLocalParticles;
+   uint_t Pg = blockInfo.numberOfGhostParticles;
+   real_t weight = real_t(7.198479654682179e-06) * real_c(Ce) + real_t(1.178247475854302e-06) * real_c(F) + real_t(-0.0026401549115124628) * real_c(Pl) + real_t(0.008459646786179298) * real_c(Pg) + real_t(-0.001077320113275954);
+   return std::max(weight,real_t(0));
+}
+real_t fittedTotalWeightEvaluationFunction(const BlockInfo& blockInfo)
+{
+   return fittedLBMWeightEvaluationFunction(blockInfo) + fittedBHWeightEvaluationFunction(blockInfo) +
+          fittedRPDWeightEvaluationFunction(blockInfo) + fittedCoup1WeightEvaluationFunction(blockInfo) +
+          fittedCoup2WeightEvaluationFunction(blockInfo);
+}
+
+} //namespace amr
+} //namespace lbm_mesapd_coupling
+} //namespace walberla
+
diff --git a/src/lbm_mesapd_coupling/amr/BlockInfo.h b/src/lbm_mesapd_coupling/amr/BlockInfo.h
index 6b3e0706a9389c5e87be511175112584453ec81d..f36b491fb7ed60c93c9350fe14ab91e2769282a3 100644
--- a/src/lbm_mesapd_coupling/amr/BlockInfo.h
+++ b/src/lbm_mesapd_coupling/amr/BlockInfo.h
@@ -28,28 +28,27 @@
 
 namespace walberla {
 namespace lbm_mesapd_coupling {
+namespace amr {
 
 struct BlockInfo {
    // lbm quantities
    uint_t numberOfCells;
    uint_t numberOfFluidCells;
    uint_t numberOfNearBoundaryCells;
-   // pe quantities
+   // rpd quantities
    uint_t numberOfLocalParticles;
-   uint_t numberOfShadowParticles;
-   uint_t numberOfContacts;
+   uint_t numberOfGhostParticles;
    // coupling quantities
-   uint_t numberOfMESAPDSubCycles;
+   uint_t numberOfRPDSubCycles;
 
    BlockInfo()
          : numberOfCells(0), numberOfFluidCells(0), numberOfNearBoundaryCells(0),
-           numberOfLocalParticles(0), numberOfShadowParticles(0), numberOfContacts(0), numberOfMESAPDSubCycles(0) {}
+           numberOfLocalParticles(0), numberOfGhostParticles(0), numberOfRPDSubCycles(0) {}
 
    BlockInfo(const uint_t numCells, const uint_t numFluidCells, const uint_t numNearBoundaryCells,
-             const uint_t numLocalBodies, const uint_t numShadowParticles, const uint_t numContacts,
-             const uint_t numPeSubCycles)
+             const uint_t numLocalParticles, const uint_t numGhostParticles, const uint_t numRPDSubCycles)
          : numberOfCells(numCells), numberOfFluidCells(numFluidCells), numberOfNearBoundaryCells(numNearBoundaryCells),
-           numberOfLocalParticles(numLocalBodies), numberOfShadowParticles(numShadowParticles), numberOfContacts(numContacts), numberOfMESAPDSubCycles(numPeSubCycles) {}
+           numberOfLocalParticles(numLocalParticles), numberOfGhostParticles(numGhostParticles), numberOfRPDSubCycles(numRPDSubCycles) {}
 };
 
 
@@ -57,19 +56,17 @@ inline
 std::ostream& operator<<( std::ostream& os, const BlockInfo& bi )
 {
    os << bi.numberOfCells << " / " << bi.numberOfFluidCells << " / " << bi.numberOfNearBoundaryCells << " / "
-      << bi.numberOfLocalParticles << " / "<< bi.numberOfShadowParticles << " / " << bi.numberOfContacts << " / "
-      << bi.numberOfMESAPDSubCycles;
+      << bi.numberOfLocalParticles << " / "<< bi.numberOfGhostParticles << " / " << bi.numberOfRPDSubCycles;
    return os;
 }
 
 template< typename T,    // Element type of SendBuffer
-          typename G>    // Growth policy of SendBuffer
+      typename G>    // Growth policy of SendBuffer
 mpi::GenericSendBuffer<T,G>& operator<<( mpi::GenericSendBuffer<T,G> & buf, const BlockInfo& info )
 {
    buf.addDebugMarker( "pca" );
    buf << info.numberOfCells << info.numberOfFluidCells << info.numberOfNearBoundaryCells
-       << info.numberOfLocalParticles << info.numberOfShadowParticles << info.numberOfContacts
-       << info.numberOfMESAPDSubCycles;
+       << info.numberOfLocalParticles << info.numberOfGhostParticles << info.numberOfRPDSubCycles;
    return buf;
 }
 
@@ -78,10 +75,10 @@ mpi::GenericRecvBuffer<T>& operator>>( mpi::GenericRecvBuffer<T> & buf, BlockInf
 {
    buf.readDebugMarker( "pca" );
    buf >> info.numberOfCells >> info.numberOfFluidCells >> info.numberOfNearBoundaryCells
-       >> info.numberOfLocalParticles >> info.numberOfShadowParticles >> info.numberOfContacts
-       >> info.numberOfMESAPDSubCycles;
+       >> info.numberOfLocalParticles >> info.numberOfGhostParticles >> info.numberOfRPDSubCycles;
    return buf;
 }
 
+} // namespace amr
 } // namespace lbm_mesapd_coupling
 } // namespace walberla
diff --git a/src/lbm_mesapd_coupling/amr/InfoCollection.h b/src/lbm_mesapd_coupling/amr/InfoCollection.h
index 13f8682d5a36a1d6ce0d4f9cbd7c939741c23527..f3e596eb74e2bdf07baa5b3dd4e97da5441d5994 100644
--- a/src/lbm_mesapd_coupling/amr/InfoCollection.h
+++ b/src/lbm_mesapd_coupling/amr/InfoCollection.h
@@ -26,20 +26,21 @@
 #include "blockforest/BlockID.h"
 #include "core/mpi/BufferSystem.h"
 
-#include <mesa_pd/data/ParticleStorage.h>
+#include "mesa_pd/data/Flags.h"
 
 #include <map>
 
 namespace walberla {
 namespace lbm_mesapd_coupling {
+namespace amr {
 
-typedef std::map<blockforest::BlockID, BlockInfo>  InfoCollection;
-typedef std::pair<blockforest::BlockID, BlockInfo> InfoCollectionPair;
+using InfoCollection = std::map<blockforest::BlockID, BlockInfo>  ;
+using InfoCollectionPair = std::pair<blockforest::BlockID, BlockInfo>;
 
 template <typename BoundaryHandling_T, typename ParticleAccessor_T>
-void createWithNeighborhood(BlockForest& bf, const BlockDataID boundaryHandlingID,
-                            const ParticleAccessor_T& accessor, const uint_t numberOfMESAPDSubCycles,
-                            InfoCollection& ic ) {
+void updateAndSyncInfoCollection(BlockForest& bf, const BlockDataID boundaryHandlingID,
+                                 const ParticleAccessor_T& accessor, const uint_t numberOfRPDSubCycles,
+                                 InfoCollection& ic ) {
    ic.clear();
 
    mpi::BufferSystem bs( MPIManager::instance()->comm(), 856 );
@@ -66,22 +67,21 @@ void createWithNeighborhood(BlockForest& bf, const BlockDataID boundaryHandlingI
       }
 
       // evaluate MESAPD quantities
-      // count block local and (possible) shadow particles here
-      uint_t numLocalParticles = 0, numShadowParticles = 0;
+      // count block local and (possible) ghost particles here
+      uint_t numLocalParticles = 0, numGhostParticles = 0;
       for (size_t idx = 0; idx < accessor.size(); ++idx) {
-         if (block->getAABB().contains(accessor.getPosition(idx))) {
-            // a particle within a block on the current process cannot be a ghost particle
-            WALBERLA_ASSERT(!mesa_pd::data::particle_flags::isSet(accessor.getFlags(idx), mesa_pd::data::particle_flags::GHOST));
+
+         using namespace walberla::mesa_pd::data::particle_flags;
+         if( isSet(accessor.getFlags(idx), GLOBAL) ) continue;
+
+         if ( block->getAABB().contains(accessor.getPosition(idx)) ) {
             numLocalParticles++;
          } else if (block->getAABB().sqDistance(accessor.getPosition(idx)) < accessor.getInteractionRadius(idx)*accessor.getInteractionRadius(idx)) {
-            numShadowParticles++;
+            numGhostParticles++;
          }
       }
 
-      // count contacts here
-      const uint_t numContacts = 0;
-
-      BlockInfo blockInfo(numCells, numFluidCells, numNearBoundaryCells, numLocalParticles, numShadowParticles, numContacts, numberOfMESAPDSubCycles);
+      BlockInfo blockInfo(numCells, numFluidCells, numNearBoundaryCells, numLocalParticles, numGhostParticles, numberOfRPDSubCycles);
       InfoCollectionPair infoCollectionEntry(block->getId(), blockInfo);
 
       ic.insert( infoCollectionEntry );
@@ -102,6 +102,8 @@ void createWithNeighborhood(BlockForest& bf, const BlockDataID boundaryHandlingI
    bs.setReceiverInfoFromSendBufferState(false, true);
    bs.sendAll();
 
+   // info collection has to be distirbuted to neighboring processes such that later on when coarsening was applied,
+   // the weight of the coarsened block can be computed
    for( auto recvIt = bs.begin(); recvIt != bs.end(); ++recvIt )
    {
       while( !recvIt.buffer().isEmpty() )
@@ -132,7 +134,6 @@ void getBlockInfoFromInfoCollection( const PhantomBlock * block, const shared_pt
 
       // check the above mentioned assumptions
       WALBERLA_ASSERT_EQUAL(infoIt->second.numberOfLocalParticles, uint_t(0));
-      WALBERLA_ASSERT_EQUAL(infoIt->second.numberOfContacts, uint_t(0));
 
       blockInfo = infoIt->second;
    }
@@ -168,18 +169,40 @@ void getBlockInfoFromInfoCollection( const PhantomBlock * block, const shared_pt
 
          // check above mentioned assumptions
          WALBERLA_ASSERT_EQUAL(childIt->second.numberOfLocalParticles, uint_t(0));
-         WALBERLA_ASSERT_EQUAL(childIt->second.numberOfContacts, uint_t(0));
       }
       // total number of cells remains unchanged
       combinedInfo.numberOfFluidCells = uint_c(numFluidCells / uint_t(8)); //average
       combinedInfo.numberOfNearBoundaryCells = uint_c( numNearBoundaryCells / uint_t(8) ); //average
       combinedInfo.numberOfLocalParticles = uint_t(0);
-      combinedInfo.numberOfContacts = uint_t(0); //sum
-      // number of pe sub cycles stays the same
+      // number of rpd sub cycles stays the same
 
       blockInfo = combinedInfo;
    }
 }
 
+/*
+ * Provides mapping from phantom block to weight evaluation via info collection
+ */
+class WeightEvaluationFunctor
+{
+public:
+   WeightEvaluationFunctor(const shared_ptr<InfoCollection>& ic,
+                           const std::function<real_t(const BlockInfo&)> & weightEvaluationFct) :
+         ic_(ic), weightEvaluationFct_(weightEvaluationFct) {}
+
+   real_t operator()(const PhantomBlock * block)
+   {
+      BlockInfo blockInfo;
+      getBlockInfoFromInfoCollection(block, ic_, blockInfo);
+      return weightEvaluationFct_(blockInfo);
+   }
+
+private:
+   shared_ptr<InfoCollection> ic_;
+   std::function<real_t(const BlockInfo&)> weightEvaluationFct_;
+};
+
+
+} // namepace amr
 } // namespace lbm_mesapd_coupling
 } // namespace walberla
diff --git a/src/lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.cpp b/src/lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c33cb777237d24594f627892cd5d74467637e5a
--- /dev/null
+++ b/src/lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.cpp
@@ -0,0 +1,97 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MetisAssignmentFunctor.cpp
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "MetisAssignmentFunctor.h"
+
+namespace walberla {
+namespace lbm_mesapd_coupling {
+namespace amr {
+
+void MetisAssignmentFunctor::operator()( std::vector< std::pair< const PhantomBlock *, walberla::any > > & blockData,
+      const PhantomBlockForest & /*phantomBlockForest*/)
+{
+   for (auto &it : blockData) {
+      const PhantomBlock * block = it.first;
+      //only change of one level is supported!
+      WALBERLA_ASSERT_LESS( std::abs(int_c(block->getLevel()) - int_c(block->getSourceLevel())), 2 );
+
+      std::vector<int64_t> metisVertexWeights(weightEvaluationFctVector_.size());
+
+      for( auto con = uint_t(0); con < weightEvaluationFctVector_.size(); ++con )
+      {
+         real_t vertexWeight = std::max(weightEvaluationFctVector_[con](block), blockBaseWeight_);
+
+         int64_t metisVertexWeight = int64_c( weightMultiplicator_ * vertexWeight );
+
+         WALBERLA_ASSERT_GREATER(metisVertexWeight, int64_t(0));
+         metisVertexWeights[con] = metisVertexWeight;
+      }
+
+      blockforest::DynamicParMetisBlockInfo info( metisVertexWeights );
+
+      info.setVertexCoords(it.first->getAABB().center() );
+
+      real_t blockVolume = it.first->getAABB().volume();
+      real_t approximateEdgeLength = std::cbrt( blockVolume );
+
+      int64_t faceNeighborWeight = int64_c(approximateEdgeLength * approximateEdgeLength ); //common face
+      int64_t edgeNeighborWeight = int64_c( approximateEdgeLength ); //common edge
+      int64_t cornerNeighborWeight = int64_c( 1 ); //common corner
+
+      int64_t vertexSize = int64_c(blockVolume);
+      WALBERLA_ASSERT_GREATER(vertexSize, int64_t(0));
+      info.setVertexSize( vertexSize );
+
+      for( const uint_t idx : blockforest::getFaceNeighborhoodSectionIndices() )
+      {
+         for( auto nb = uint_t(0); nb < it.first->getNeighborhoodSectionSize(idx); ++nb )
+         {
+            auto neighborBlockID = it.first->getNeighborId(idx,nb);
+            info.setEdgeWeight(neighborBlockID, faceNeighborWeight );
+         }
+      }
+
+      for( const uint_t idx : blockforest::getEdgeNeighborhoodSectionIndices() )
+      {
+         for( auto nb = uint_t(0); nb < it.first->getNeighborhoodSectionSize(idx); ++nb )
+         {
+            auto neighborBlockID = it.first->getNeighborId(idx,nb);
+            info.setEdgeWeight(neighborBlockID, edgeNeighborWeight );
+         }
+      }
+
+      for( const uint_t idx : blockforest::getCornerNeighborhoodSectionIndices() )
+      {
+         for( auto nb = uint_t(0); nb < it.first->getNeighborhoodSectionSize(idx); ++nb )
+         {
+            auto neighborBlockID = it.first->getNeighborId(idx,nb);
+            info.setEdgeWeight(neighborBlockID, cornerNeighborWeight );
+         }
+      }
+
+      it.second = info;
+
+   }
+}
+
+
+} // namespace amr
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.h b/src/lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..962a6c975cbbe7ab0aa2fa3c569cae798dc90054
--- /dev/null
+++ b/src/lbm_mesapd_coupling/amr/weight_assignment/MetisAssignmentFunctor.h
@@ -0,0 +1,66 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MetisAssignmentFunctor.h
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "blockforest/PhantomBlockForest.h"
+#include "blockforest/PhantomBlock.h"
+#include "blockforest/loadbalancing/DynamicParMetis.h"
+
+#include <functional>
+#include <vector>
+
+namespace walberla {
+namespace lbm_mesapd_coupling {
+namespace amr {
+
+class MetisAssignmentFunctor
+{
+public:
+
+   using WeightEvaluationFct = std::function<real_t(const PhantomBlock *)>;
+
+   explicit MetisAssignmentFunctor( const WeightEvaluationFct& weightEvaluationFct)
+         : weightEvaluationFctVector_(1, weightEvaluationFct) {}
+
+   explicit MetisAssignmentFunctor( const std::vector< WeightEvaluationFct > & weightEvaluationFctVector )
+         : weightEvaluationFctVector_(weightEvaluationFctVector) {}
+
+   void operator()( std::vector< std::pair< const PhantomBlock *, walberla::any > > & blockData, const PhantomBlockForest & phantomBlockForest);
+
+   uint_t getNumberOfConstrains() const { return weightEvaluationFctVector_.size(); }
+
+   inline void   setBlockBaseWeight( const real_t blockBaseWeight ){ blockBaseWeight_ = blockBaseWeight; }
+   inline real_t getBlockBaseWeight() const { return blockBaseWeight_; }
+
+   inline void setWeightMultiplicator( const real_t weightMultiplicator ){ weightMultiplicator_ = weightMultiplicator; }
+
+private:
+   std::vector< WeightEvaluationFct > weightEvaluationFctVector_;
+   real_t blockBaseWeight_ = real_t(1);
+   real_t weightMultiplicator_ = real_t(1);
+};
+
+
+
+} // namespace amr
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
+
diff --git a/src/lbm_mesapd_coupling/amr/weight_assignment/WeightAssignmentFunctor.h b/src/lbm_mesapd_coupling/amr/weight_assignment/WeightAssignmentFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..0113d045a1d7a6b305f134d4045b9c49fd85457f
--- /dev/null
+++ b/src/lbm_mesapd_coupling/amr/weight_assignment/WeightAssignmentFunctor.h
@@ -0,0 +1,67 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WeightAssignmentFunctor.h
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "blockforest/loadbalancing/PODPhantomData.h"
+#include "blockforest/PhantomBlockForest.h"
+#include "blockforest/PhantomBlock.h"
+
+#include <functional>
+
+namespace walberla {
+namespace lbm_mesapd_coupling  {
+namespace amr {
+
+
+
+class WeightAssignmentFunctor
+{
+public:
+   using PhantomBlockWeight = walberla::blockforest::PODPhantomWeight<double>;
+   using WeightEvaluationFct = std::function<real_t(const PhantomBlock *)>;
+
+   explicit WeightAssignmentFunctor( const WeightEvaluationFct & weightEvaluationFct ) :
+         weightEvaluationFct_(weightEvaluationFct) {}
+
+   void operator()( std::vector< std::pair< const PhantomBlock *, walberla::any > > & blockData, const PhantomBlockForest & )
+   {
+      for (auto &it : blockData) {
+         const PhantomBlock * block = it.first;
+         //only change of one level is supported!
+         WALBERLA_ASSERT_LESS( std::abs(int_c(block->getLevel()) - int_c(block->getSourceLevel())), 2 );
+
+         real_t blockWeight = std::max(weightEvaluationFct_(block), blockBaseWeight_);
+         it.second = PhantomBlockWeight( double_c( blockWeight ) );
+      }
+   }
+
+   inline void   setBlockBaseWeight( const real_t blockBaseWeight ) { blockBaseWeight_ = blockBaseWeight; }
+   inline real_t getBlockBaseWeight() const { return blockBaseWeight_; }
+
+private:
+   WeightEvaluationFct weightEvaluationFct_;
+   real_t blockBaseWeight_ = real_t(1);
+};
+
+
+} // namespace amr
+} // namespace lbm_mesapd_coupling
+} // namespace walberla
diff --git a/src/lbm_mesapd_coupling/amr/weight_assignment/WeightEvaluationFunctions.h b/src/lbm_mesapd_coupling/amr/weight_assignment/WeightEvaluationFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6219f4f43f6c2212231b21afe68ce03c4251abc5
--- /dev/null
+++ b/src/lbm_mesapd_coupling/amr/weight_assignment/WeightEvaluationFunctions.h
@@ -0,0 +1,36 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WeightEvaluationFunctions.h
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "blockforest/PhantomBlock.h"
+
+namespace walberla {
+namespace lbm_mesapd_coupling  {
+namespace amr {
+
+real_t defaultWeightEvaluationFunction(const PhantomBlock * /*block*/)
+{
+   return real_t(1);
+}
+
+} // namespace amr
+} // namespace lbm_mesapd_coupling
+} // namespace walberla