diff --git a/apps/benchmarks/GranularGas/CMakeLists.txt b/apps/benchmarks/GranularGas/CMakeLists.txt
index f3f1531440983a0171594a6411517bd9933164aa..7a4ef8f3e7d19535eb16b7bcfcfa3b5f610ebb72 100644
--- a/apps/benchmarks/GranularGas/CMakeLists.txt
+++ b/apps/benchmarks/GranularGas/CMakeLists.txt
@@ -5,6 +5,10 @@ waLBerla_add_executable ( NAME PE_GranularGas
                           FILES PE_GranularGas.cpp SQLProperties.cpp Parameters.cpp NodeTimings.cpp
                           DEPENDS blockforest core pe postprocessing )
 
+waLBerla_add_executable ( NAME PE_LoadBalancing
+                          FILES PE_LoadBalancing.cpp SQLProperties.cpp Parameters.cpp NodeTimings.cpp
+                          DEPENDS blockforest core pe postprocessing )
+
 waLBerla_add_executable ( NAME MESA_PD_LoadBalancing
                           FILES MESA_PD_LoadBalancing.cpp SQLProperties.cpp Parameters.cpp NodeTimings.cpp sortParticleStorage.cpp CreateParticles.cpp
                           DEPENDS blockforest core pe mesa_pd postprocessing vtk )
@@ -16,3 +20,7 @@ waLBerla_add_executable ( NAME MESA_PD_GranularGas
 waLBerla_add_executable ( NAME MESA_PD_KernelBenchmark
                           FILES MESA_PD_KernelBenchmark.cpp SQLProperties.cpp Parameters.cpp NodeTimings.cpp sortParticleStorage.cpp CreateParticles.cpp
                           DEPENDS blockforest core pe mesa_pd postprocessing vtk )
+
+waLBerla_add_executable ( NAME MESA_PD_KernelLoadBalancing
+                          FILES MESA_PD_KernelLoadBalancing.cpp SQLProperties.cpp Parameters.cpp NodeTimings.cpp sortParticleStorage.cpp CreateParticles.cpp
+                          DEPENDS blockforest core pe mesa_pd postprocessing vtk )
diff --git a/apps/benchmarks/GranularGas/GranularGas.cfg b/apps/benchmarks/GranularGas/GranularGas.cfg
index 685b7dfc0f49a87241bc445637025d3d45ffa55d..0733512f4b293c8b8e0fd2d1f8b53f71ada8e98d 100644
--- a/apps/benchmarks/GranularGas/GranularGas.cfg
+++ b/apps/benchmarks/GranularGas/GranularGas.cfg
@@ -4,7 +4,7 @@ GranularGas
    simulationDomain < 80, 80, 80 >;
    blocks < 2,2,2 >;
    isPeriodic < 0, 0, 0 >;
-   initialRefinementLevel 2;
+   initialRefinementLevel 1;
    sorting linear;
 
    normal  <1,1,1>;
diff --git a/apps/benchmarks/GranularGas/MESA_PD_KernelLoadBalancing.cpp b/apps/benchmarks/GranularGas/MESA_PD_KernelLoadBalancing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69b78a5a8f3983cf1c743120f0b841a881b15912
--- /dev/null
+++ b/apps/benchmarks/GranularGas/MESA_PD_KernelLoadBalancing.cpp
@@ -0,0 +1,559 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file   MESA_PD_KernelLoadBalancing.cpp
+//! \author Sebastian Eibl <sebastian.eibl@fau.de>
+//
+//======================================================================================================================
+
+#include "Accessor.h"
+#include "check.h"
+#include "Contact.h"
+#include "CreateParticles.h"
+#include "NodeTimings.h"
+#include "Parameters.h"
+#include "SelectProperty.h"
+#include "sortParticleStorage.h"
+#include "SQLProperties.h"
+
+#include <mesa_pd/vtk/ParticleVtkOutput.h>
+
+#include <mesa_pd/collision_detection/AnalyticContactDetection.h>
+#include <mesa_pd/data/LinkedCells.h>
+#include <mesa_pd/data/ParticleAccessor.h>
+#include <mesa_pd/data/ParticleStorage.h>
+#include <mesa_pd/data/ShapeStorage.h>
+#include <mesa_pd/domain/BlockForestDataHandling.h>
+#include <mesa_pd/domain/BlockForestDomain.h>
+#include <mesa_pd/domain/InfoCollection.h>
+#include <mesa_pd/kernel/AssocToBlock.h>
+#include <mesa_pd/kernel/DoubleCast.h>
+#include <mesa_pd/kernel/ExplicitEulerWithShape.h>
+#include <mesa_pd/kernel/InsertParticleIntoLinkedCells.h>
+#include <mesa_pd/kernel/ParticleSelector.h>
+#include <mesa_pd/kernel/SpringDashpot.h>
+#include <mesa_pd/mpi/ContactFilter.h>
+#include <mesa_pd/mpi/ReduceProperty.h>
+#include <mesa_pd/mpi/SyncNextNeighbors.h>
+#include <mesa_pd/mpi/SyncNextNeighborsBlockForest.h>
+#include <mesa_pd/mpi/notifications/ForceTorqueNotification.h>
+#include <mesa_pd/sorting/HilbertCompareFunctor.h>
+#include <mesa_pd/sorting/LinearizedCompareFunctor.h>
+
+#include <blockforest/BlockForest.h>
+#include <blockforest/Initialization.h>
+#include <blockforest/loadbalancing/DynamicCurve.h>
+#include <blockforest/loadbalancing/DynamicParMetis.h>
+#include <blockforest/loadbalancing/PODPhantomData.h>
+#include <core/Abort.h>
+#include <core/Environment.h>
+#include <core/Hostname.h>
+#include <core/math/Random.h>
+#include <core/mpi/Gatherv.h>
+#include <core/mpi/RecvBuffer.h>
+#include <core/mpi/Reduce.h>
+#include <core/mpi/SendBuffer.h>
+#include <core/grid_generator/SCIterator.h>
+#include <core/logging/Logging.h>
+#include <core/OpenMP.h>
+#include <core/timing/Timer.h>
+#include <core/timing/TimingPool.h>
+#include <core/waLBerlaBuildInfo.h>
+#include <pe/amr/level_determination/MinMaxLevelDetermination.h>
+#include <pe/amr/weight_assignment/MetisAssignmentFunctor.h>
+#include <pe/amr/weight_assignment/WeightAssignmentFunctor.h>
+#include <postprocessing/sqlite/SQLite.h>
+#include <vtk/VTKOutput.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+namespace walberla {
+namespace mesa_pd {
+
+int main( int argc, char ** argv )
+{
+   using namespace walberla::timing;
+
+   Environment env(argc, argv);
+   auto mpiManager = walberla::mpi::MPIManager::instance();
+   mpiManager->useWorldComm();
+
+   logging::Logging::instance()->setStreamLogLevel(logging::Logging::INFO);
+   logging::Logging::instance()->setFileLogLevel(logging::Logging::INFO);
+
+   WALBERLA_LOG_INFO_ON_ROOT( "config file: " << argv[1] );
+   WALBERLA_LOG_INFO_ON_ROOT( "waLBerla Revision: " << WALBERLA_GIT_SHA1 );
+
+   math::seedRandomGenerator( static_cast<unsigned int>(1337 * mpiManager->worldRank()) );
+
+   std::map< std::string, walberla::int64_t > integerProperties;
+   std::map< std::string, double >            realProperties;
+   std::map< std::string, std::string >       stringProperties;
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** READING CONFIG FILE ***");
+   auto cfg = env.config();
+   if (cfg == nullptr) WALBERLA_ABORT("No config specified!");
+   const Config::BlockHandle mainConf  = cfg->getBlock( "GranularGas" );
+   Parameters params;
+   loadFromConfig(params, mainConf);
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** BLOCKFOREST ***");
+   // create forest
+   auto forest = blockforest::createBlockForestFromConfig( mainConf );
+   if (!forest)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT( "No BlockForest created ... exiting!");
+      return EXIT_SUCCESS;
+   }
+
+   forest->recalculateBlockLevelsInRefresh( params.recalculateBlockLevelsInRefresh );
+   forest->alwaysRebalanceInRefresh( params.alwaysRebalanceInRefresh );
+   forest->reevaluateMinTargetLevelsAfterForcedRefinement( params.reevaluateMinTargetLevelsAfterForcedRefinement );
+   forest->allowRefreshChangingDepth( params.allowRefreshChangingDepth );
+
+   forest->allowMultipleRefreshCycles( params.allowMultipleRefreshCycles );
+   forest->checkForEarlyOutInRefresh( params.checkForEarlyOutInRefresh );
+   forest->checkForLateOutInRefresh( params.checkForLateOutInRefresh );
+
+   auto ic = make_shared<pe::InfoCollection>();
+
+   //   pe::amr::MinMaxLevelDetermination regrid(ic, regridMin, regridMax);
+   //   forest->setRefreshMinTargetLevelDeterminationFunction( regrid );
+
+   bool bRebalance = true;
+   if (params.LBAlgorithm == "None")
+   {
+      bRebalance = false;
+   } else if (params.LBAlgorithm == "Morton")
+   {
+      forest->setRefreshPhantomBlockDataAssignmentFunction( pe::amr::WeightAssignmentFunctor( ic, params.baseWeight ) );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+
+      auto prepFunc = blockforest::DynamicCurveBalance< pe::amr::WeightAssignmentFunctor::PhantomBlockWeight >( false, true, false );
+      prepFunc.setMaxBlocksPerProcess( params.maxBlocksPerProcess );
+      forest->setRefreshPhantomBlockMigrationPreparationFunction( prepFunc );
+   } else if (params.LBAlgorithm == "Hilbert")
+   {
+      forest->setRefreshPhantomBlockDataAssignmentFunction( pe::amr::WeightAssignmentFunctor( ic, params.baseWeight ) );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+
+      auto prepFunc = blockforest::DynamicCurveBalance< pe::amr::WeightAssignmentFunctor::PhantomBlockWeight >( true, true, false );
+      prepFunc.setMaxBlocksPerProcess( params.maxBlocksPerProcess );
+      forest->setRefreshPhantomBlockMigrationPreparationFunction( prepFunc );
+   } else if (params.LBAlgorithm == "Metis")
+   {
+      auto assFunc = pe::amr::MetisAssignmentFunctor( ic, params.baseWeight );
+      forest->setRefreshPhantomBlockDataAssignmentFunction( assFunc );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::MetisAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::MetisAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+
+      auto alg     = blockforest::DynamicParMetis::stringToAlgorithm(    params.metisAlgorithm );
+      auto vWeight = blockforest::DynamicParMetis::stringToWeightsToUse( params.metisWeightsToUse );
+      auto eWeight = blockforest::DynamicParMetis::stringToEdgeSource(   params.metisEdgeSource );
+
+      auto prepFunc = blockforest::DynamicParMetis( alg, vWeight, eWeight );
+      prepFunc.setipc2redist(params.metisipc2redist);
+      addParMetisPropertiesToSQL(prepFunc, integerProperties, realProperties, stringProperties);
+      forest->setRefreshPhantomBlockMigrationPreparationFunction( prepFunc );
+   } else if (params.LBAlgorithm == "Diffusive")
+   {
+      forest->setRefreshPhantomBlockDataAssignmentFunction( pe::amr::WeightAssignmentFunctor( ic, params.baseWeight ) );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      auto prepFunc = blockforest::DynamicDiffusionBalance< pe::amr::WeightAssignmentFunctor::PhantomBlockWeight >( 1, 1, false );
+      //configure(cfg, prepFunc);
+      //addDynamicDiffusivePropertiesToSQL(prepFunc, integerProperties, realProperties, stringProperties);
+      forest->setRefreshPhantomBlockMigrationPreparationFunction(prepFunc);
+   } else
+   {
+      WALBERLA_ABORT("Unknown LBAlgorithm: " << params.LBAlgorithm);
+   }
+
+   auto domain = std::make_shared<domain::BlockForestDomain>(forest);
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SETUP - START ***");
+
+   //init data structures
+   auto ps = std::make_shared<data::ParticleStorage>(100);
+   auto ss = std::make_shared<data::ShapeStorage>();
+   ParticleAccessorWithShape accessor(ps, ss);
+   auto lc = std::make_shared<data::LinkedCells>(domain->getUnionOfLocalAABBs().getExtended(params.spacing), params.spacing );
+   forest->addBlockData(domain::createBlockForestDataHandling(ps), "Storage");
+
+   auto center = forest->getDomain().center();
+   auto  smallSphere = ss->create<data::Sphere>( params.radius );
+   ss->shapes[smallSphere]->updateMassAndInertia(real_t(2707));
+   for (auto& iBlk : *forest)
+   {
+      for (auto pt : grid_generator::SCGrid(iBlk.getAABB(),
+                                            Vector3<real_t>(params.spacing) * real_c(0.5),
+                                            params.spacing))
+      {
+         WALBERLA_CHECK(iBlk.getAABB().contains(pt));
+         auto tmp = dot( (pt - center), params.normal );
+         if (tmp < 0)
+         {
+            createSphere(*ps, pt, params.radius, smallSphere);
+         }
+      }
+   }
+   int64_t numParticles = int64_c(ps->size());
+   walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
+   WALBERLA_LOG_INFO_ON_ROOT("#particles created: " << numParticles);
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SETUP - END ***");
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** VTK ***");
+   auto vtkDomainOutput = walberla::vtk::createVTKOutput_DomainDecomposition( forest, "domain_decomposition", 1, "vtk_out", "simulation_step" );
+   auto vtkOutput       = make_shared<mesa_pd::vtk::ParticleVtkOutput>(ps) ;
+   auto vtkWriter       = walberla::vtk::createVTKOutput_PointData(vtkOutput, "Bodies", 1, "vtk", "simulation_step", false, false);
+   vtkOutput->addOutput<SelectRank>("rank");
+   vtkOutput->addOutput<data::SelectParticleOwner>("owner");
+   vtkOutput->addOutput<SelectIdx>("idx");
+   //   vtkDomainOutput->write();
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SIMULATION - START ***");
+   // Init kernels
+   kernel::ExplicitEulerWithShape        explicitEulerWithShape( params.dt );
+   kernel::InsertParticleIntoLinkedCells ipilc;
+   kernel::SpringDashpot                 dem(1);
+   dem.setStiffness(0, 0, real_t(0));
+   dem.setDampingN (0, 0, real_t(0));
+   dem.setDampingT (0, 0, real_t(0));
+   dem.setFriction (0, 0, real_t(0));
+   collision_detection::AnalyticContactDetection              acd;
+   kernel::AssocToBlock                  assoc(forest);
+   kernel::DoubleCast                    double_cast;
+   mpi::ContactFilter                    contact_filter;
+   mpi::ReduceProperty                   RP;
+   mpi::SyncNextNeighborsBlockForest     SNN;
+   std::vector<Contact>                  contacts;
+   contacts.reserve(4000000);
+
+   // initial sync
+   ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
+   SNN(*ps, forest, domain);
+   sortParticleStorage(*ps, params.sorting, lc->domain_, uint_c(lc->numCellsPerDim_[0]));
+   //   vtkWriter->write();
+
+   WcTimingPool tpImbalanced;
+   WcTimingPool tpBalanced;
+
+   WALBERLA_MPI_BARRIER();
+   tpImbalanced["GenerateLinkedCells"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      lc->clear();
+      ps->forEachParticle(true, kernel::SelectAll(), accessor, ipilc, accessor, *lc);
+   }
+   tpImbalanced["GenerateLinkedCells"].end();
+
+   int64_t contactsChecked  = 0;
+   int64_t contactsDetected = 0;
+   int64_t contactsTreated  = 0;
+   WALBERLA_MPI_BARRIER();
+   tpImbalanced["ContactDetection"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      contacts.clear();
+      contactsChecked  = 0;
+      contactsDetected = 0;
+      contactsTreated  = 0;
+      lc->forEachParticlePairHalf(true,
+                                  kernel::SelectAll(),
+                                  accessor,
+                                  [&](const size_t idx1, const size_t idx2, auto& ac)
+      {
+         ++contactsChecked;
+         if (double_cast(idx1, idx2, ac, acd, ac ))
+         {
+            ++contactsDetected;
+            if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain))
+            {
+               ++contactsTreated;
+               contacts.emplace_back(acd.getIdx1(), acd.getIdx2(), acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth());
+            }
+         }
+      },
+      accessor );
+   }
+   tpImbalanced["ContactDetection"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpImbalanced["DEM"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      for (auto& c : contacts)
+      {
+         dem(c.idx1_, c.idx2_, accessor, c.contactPoint_, c.contactNormal_, c.penetrationDepth_);
+      }
+   }
+   tpImbalanced["DEM"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpImbalanced["ReduceForce"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      RP.operator()<ForceTorqueNotification>(*ps);
+   }
+   tpImbalanced["ReduceForce"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpImbalanced["Euler"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, explicitEulerWithShape, accessor);
+   }
+   tpImbalanced["Euler"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpImbalanced["SNN"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      SNN(*ps, forest, domain);
+   }
+   tpImbalanced["SNN"].end();
+
+   WALBERLA_MPI_BARRIER();
+   if (bRebalance)
+   {
+      WALBERLA_LOG_DEVEL_ON_ROOT("running load balancing");
+      domain::createWithNeighborhood( accessor, *forest, *ic );
+      for (auto pIt = ps->begin(); pIt != ps->end(); )
+      {
+         using namespace walberla::mesa_pd::data::particle_flags;
+         if (isSet(pIt->getFlags(), GHOST))
+         {
+            pIt = ps->erase(pIt);
+         } else
+         {
+            pIt->getGhostOwnersRef().clear();
+            ++pIt;
+         }
+      }
+      forest->refresh();
+      domain->refresh();
+      lc = std::make_shared<data::LinkedCells>(domain->getUnionOfLocalAABBs().getExtended(params.spacing), params.spacing );
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
+      SNN(*ps, forest, domain);
+      sortParticleStorage(*ps, params.sorting, lc->domain_, uint_c(lc->numCellsPerDim_[0]));
+   }
+
+   WALBERLA_MPI_BARRIER();
+   tpBalanced["GenerateLinkedCells"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      lc->clear();
+      ps->forEachParticle(true, kernel::SelectAll(), accessor, ipilc, accessor, *lc);
+   }
+   tpBalanced["GenerateLinkedCells"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpBalanced["ContactDetection"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      contacts.clear();
+      contactsChecked  = 0;
+      contactsDetected = 0;
+      contactsTreated  = 0;
+      lc->forEachParticlePairHalf(true,
+                                  kernel::SelectAll(),
+                                  accessor,
+                                  [&](const size_t idx1, const size_t idx2, auto& ac)
+      {
+         ++contactsChecked;
+         if (double_cast(idx1, idx2, ac, acd, ac ))
+         {
+            ++contactsDetected;
+            if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain))
+            {
+               ++contactsTreated;
+               contacts.emplace_back(acd.getIdx1(), acd.getIdx2(), acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth());
+            }
+         }
+      },
+      accessor );
+   }
+   tpBalanced["ContactDetection"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpBalanced["DEM"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      for (auto& c : contacts)
+      {
+         dem(c.idx1_, c.idx2_, accessor, c.contactPoint_, c.contactNormal_, c.penetrationDepth_);
+      }
+   }
+   tpBalanced["DEM"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpBalanced["ReduceForce"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      RP.operator()<ForceTorqueNotification>(*ps);
+   }
+   tpBalanced["ReduceForce"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpBalanced["Euler"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, explicitEulerWithShape, accessor);
+   }
+   tpBalanced["Euler"].end();
+
+   WALBERLA_MPI_BARRIER();
+   tpBalanced["SNN"].start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      SNN(*ps, forest, domain);
+   }
+   tpBalanced["SNN"].end();
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SIMULATION - END ***");
+
+   if (params.checkSimulation)
+   {
+      check(*ps, *forest, params.spacing);
+   }
+
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - START ***");
+   auto SNNBytesSent     = SNN.getBytesSent();
+   auto SNNBytesReceived = SNN.getBytesReceived();
+   auto SNNSends         = SNN.getNumberOfSends();
+   auto SNNReceives      = SNN.getNumberOfReceives();
+   auto RPBytesSent      = RP.getBytesSent();
+   auto RPBytesReceived  = RP.getBytesReceived();
+   auto RPSends          = RP.getNumberOfSends();
+   auto RPReceives       = RP.getNumberOfReceives();
+   walberla::mpi::reduceInplace(SNNBytesSent, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(SNNBytesReceived, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(SNNSends, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(SNNReceives, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPBytesSent, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPBytesReceived, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPSends, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPReceives, walberla::mpi::SUM);
+   auto cC = walberla::mpi::reduce(contactsChecked, walberla::mpi::SUM);
+   auto cD = walberla::mpi::reduce(contactsDetected, walberla::mpi::SUM);
+   auto cT = walberla::mpi::reduce(contactsTreated, walberla::mpi::SUM);
+   WALBERLA_LOG_DEVEL_ON_ROOT( "SNN bytes communicated:   " << SNNBytesSent << " / " << SNNBytesReceived );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "SNN communication partners: " << SNNSends << " / " << SNNReceives );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "RP bytes communicated:  " << RPBytesSent << " / " << RPBytesReceived );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "RP communication partners: " << RPSends << " / " << RPReceives );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "contacts checked/detected/treated: " << cC << " / " << cD << " / " << cT );
+
+   auto tpImbalancedReduced = tpImbalanced.getReduced();
+   WALBERLA_LOG_INFO_ON_ROOT(*tpImbalancedReduced);
+
+   auto tpBalancedReduced = tpBalanced.getReduced();
+   WALBERLA_LOG_INFO_ON_ROOT(*tpBalancedReduced);
+
+   numParticles = 0;
+   int64_t numGhostParticles = 0;
+   ps->forEachParticle(false,
+                       kernel::SelectAll(),
+                       accessor,
+                       [&numParticles, &numGhostParticles](const size_t idx, auto& ac)
+   {
+      if (data::particle_flags::isSet( ac.getFlagsRef(idx), data::particle_flags::GHOST))
+      {
+         ++numGhostParticles;
+      } else
+      {
+         ++numParticles;
+      }
+   },
+   accessor);
+   auto minParticles = walberla::mpi::reduce(numParticles, walberla::mpi::MIN);
+   auto maxParticles = walberla::mpi::reduce(numParticles, walberla::mpi::MAX);
+   WALBERLA_LOG_DEVEL_ON_ROOT("particle ratio: " << minParticles << " / " << maxParticles);
+   walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(numGhostParticles, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(contactsChecked, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(contactsDetected, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(contactsTreated, walberla::mpi::SUM);
+   double linkedCellsVolume = lc->domain_.volume();
+   walberla::mpi::reduceInplace(linkedCellsVolume, walberla::mpi::SUM);
+   size_t numLinkedCells = lc->cells_.size();
+   walberla::mpi::reduceInplace(numLinkedCells, walberla::mpi::SUM);
+   size_t local_aabbs         = domain->getNumLocalAABBs();
+   size_t neighbor_subdomains = domain->getNumNeighborSubdomains();
+   size_t neighbor_processes  = domain->getNumNeighborProcesses();
+   walberla::mpi::reduceInplace(local_aabbs, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(neighbor_subdomains, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(neighbor_processes, walberla::mpi::SUM);
+
+   uint_t runId = uint_c(-1);
+   WALBERLA_ROOT_SECTION()
+   {
+      stringProperties["walberla_git"]         = WALBERLA_GIT_SHA1;
+      stringProperties["tag"]                  = "mesa_pd";
+      integerProperties["mpi_num_processes"]   = mpiManager->numProcesses();
+      integerProperties["omp_max_threads"]     = omp_get_max_threads();
+      integerProperties["num_particles"]       = numParticles;
+      integerProperties["num_ghost_particles"] = numGhostParticles;
+      integerProperties["minParticles"]        = minParticles;
+      integerProperties["maxParticles"]        = maxParticles;
+      integerProperties["contacts_checked"]    = contactsChecked;
+      integerProperties["contacts_detected"]   = contactsDetected;
+      integerProperties["contacts_treated"]    = contactsTreated;
+      integerProperties["local_aabbs"]         = int64_c(local_aabbs);
+      integerProperties["neighbor_subdomains"] = int64_c(neighbor_subdomains);
+      integerProperties["neighbor_processes"]  = int64_c(neighbor_processes);
+      integerProperties["SNNBytesSent"]        = SNNBytesSent;
+      integerProperties["SNNBytesReceived"]    = SNNBytesReceived;
+      integerProperties["SNNSends"]            = SNNSends;
+      integerProperties["SNNReceives"]         = SNNReceives;
+      integerProperties["RPBytesSent"]         = RPBytesSent;
+      integerProperties["RPBytesReceived"]     = RPBytesReceived;
+      integerProperties["RPSends"]             = RPSends;
+      integerProperties["RPReceives"]          = RPReceives;
+      realProperties["linkedCellsVolume"]      = linkedCellsVolume;
+      integerProperties["numLinkedCells"]      = int64_c(numLinkedCells);
+
+      addBuildInfoToSQL( integerProperties, realProperties, stringProperties );
+      saveToSQL(params, integerProperties, realProperties, stringProperties );
+      addDomainPropertiesToSQL(*forest, integerProperties, realProperties, stringProperties);
+      addSlurmPropertiesToSQL(integerProperties, realProperties, stringProperties);
+
+      runId = postprocessing::storeRunInSqliteDB( params.sqlFile, integerProperties, stringProperties, realProperties );
+      postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tpImbalancedReduced, "imbalanced" );
+      postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tpBalancedReduced, "balanced" );
+   }
+   if (params.storeNodeTimings)
+   {
+      storeNodeTimings(runId, params.sqlFile, "NodeTimingImbalanced", tpImbalanced);
+      storeNodeTimings(runId, params.sqlFile, "NodeTimingBalanced", tpBalanced);
+   }
+   WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - END ***");
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace mesa_pd
+} // namespace walberla
+
+int main( int argc, char* argv[] )
+{
+   return walberla::mesa_pd::main( argc, argv );
+}
diff --git a/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp b/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp
index b3f5a838122637f0405376bb8ba84f13fac8e12e..7dec78ffea0e0d9fd0c460ff382f253b8bf19061 100644
--- a/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp
+++ b/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp
@@ -119,14 +119,14 @@ int main( int argc, char ** argv )
       return EXIT_SUCCESS;
    }
 
-   forest->recalculateBlockLevelsInRefresh( false );
-   forest->alwaysRebalanceInRefresh( true );
-   forest->reevaluateMinTargetLevelsAfterForcedRefinement( false );
-   forest->allowRefreshChangingDepth( false );
+   forest->recalculateBlockLevelsInRefresh( params.recalculateBlockLevelsInRefresh );
+   forest->alwaysRebalanceInRefresh( params.alwaysRebalanceInRefresh );
+   forest->reevaluateMinTargetLevelsAfterForcedRefinement( params.reevaluateMinTargetLevelsAfterForcedRefinement );
+   forest->allowRefreshChangingDepth( params.allowRefreshChangingDepth );
 
-   forest->allowMultipleRefreshCycles( false );
-   forest->checkForEarlyOutInRefresh( true );
-   forest->checkForLateOutInRefresh( true );
+   forest->allowMultipleRefreshCycles( params.allowMultipleRefreshCycles );
+   forest->checkForEarlyOutInRefresh( params.checkForEarlyOutInRefresh );
+   forest->checkForLateOutInRefresh( params.checkForLateOutInRefresh );
 
    auto ic = make_shared<pe::InfoCollection>();
 
@@ -252,6 +252,7 @@ int main( int argc, char ** argv )
    WcTimer      timerBalanced;
    WcTimingPool tpImbalanced;
    WcTimingPool tpBalanced;
+
    auto    SNNBytesSent     = SNN.getBytesSent();
    auto    SNNBytesReceived = SNN.getBytesReceived();
    auto    SNNSends         = SNN.getNumberOfSends();
@@ -493,6 +494,9 @@ int main( int argc, char ** argv )
       }
    },
    accessor);
+   auto minParticles = walberla::mpi::reduce(numParticles, walberla::mpi::MIN);
+   auto maxParticles = walberla::mpi::reduce(numParticles, walberla::mpi::MAX);
+   WALBERLA_LOG_DEVEL_ON_ROOT("particle ratio: " << minParticles << " / " << maxParticles);
    walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
    walberla::mpi::reduceInplace(numGhostParticles, walberla::mpi::SUM);
    walberla::mpi::reduceInplace(contactsChecked, walberla::mpi::SUM);
@@ -532,6 +536,8 @@ int main( int argc, char ** argv )
       realProperties["balanced_timer_total"]   = timerBalancedReduced->total();
       integerProperties["num_particles"]       = numParticles;
       integerProperties["num_ghost_particles"] = numGhostParticles;
+      integerProperties["minParticles"]        = minParticles;
+      integerProperties["maxParticles"]        = maxParticles;
       integerProperties["contacts_checked"]    = contactsChecked;
       integerProperties["contacts_detected"]   = contactsDetected;
       integerProperties["contacts_treated"]    = contactsTreated;
diff --git a/apps/benchmarks/GranularGas/PE_LoadBalancing.cpp b/apps/benchmarks/GranularGas/PE_LoadBalancing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fa1ad68eb3d866b2d6e1f8076d4a1a2acc943c8
--- /dev/null
+++ b/apps/benchmarks/GranularGas/PE_LoadBalancing.cpp
@@ -0,0 +1,466 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file   PE_LoadBalancing.cpp
+//! \author Sebastian Eibl <sebastian.eibl@fau.de>
+//
+//======================================================================================================================
+
+#include "NodeTimings.h"
+#include "Parameters.h"
+#include "SQLProperties.h"
+
+#include <pe/amr/InfoCollection.h>
+#include <pe/amr/level_determination/MinMaxLevelDetermination.h>
+#include <pe/amr/weight_assignment/MetisAssignmentFunctor.h>
+#include <pe/amr/weight_assignment/WeightAssignmentFunctor.h>
+#include <pe/basic.h>
+#include <pe/synchronization/ClearSynchronization.h>
+#include <pe/vtk/SphereVtkOutput.h>
+
+#include <blockforest/Initialization.h>
+#include <blockforest/loadbalancing/DynamicCurve.h>
+#include <blockforest/loadbalancing/DynamicParMetis.h>
+#include <blockforest/loadbalancing/PODPhantomData.h>
+#include <core/Abort.h>
+#include <core/Environment.h>
+#include <core/math/Random.h>
+#include <core/grid_generator/SCIterator.h>
+#include <core/logging/Logging.h>
+#include <core/OpenMP.h>
+#include <core/timing/TimingTree.h>
+#include <core/waLBerlaBuildInfo.h>
+#include <postprocessing/sqlite/SQLite.h>
+#include <vtk/VTKOutput.h>
+
+#include <functional>
+#include <memory>
+#include <tuple>
+
+namespace walberla {
+using namespace walberla::pe;
+using namespace walberla::timing;
+
+using BodyTuple = std::tuple<Sphere, Plane> ;
+
+int main( int argc, char ** argv )
+{
+   WcTimingTree tt;
+   Environment env(argc, argv);
+
+   logging::Logging::instance()->setStreamLogLevel(logging::Logging::INFO);
+   logging::Logging::instance()->setFileLogLevel(logging::Logging::INFO);
+
+   WALBERLA_LOG_INFO_ON_ROOT( "config file: " << argv[1] )
+         WALBERLA_LOG_INFO_ON_ROOT( "waLBerla Revision: " << WALBERLA_GIT_SHA1 );
+
+   math::seedRandomGenerator( static_cast<unsigned int>(1337 * mpi::MPIManager::instance()->worldRank()) );
+
+   std::map< std::string, walberla::int64_t > integerProperties;
+   std::map< std::string, double >            realProperties;
+   std::map< std::string, std::string >       stringProperties;
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** READING COMMANDLINE ARGUMENTS ***");
+   bool bDEM = false;
+   bool bHCSITS = false;
+
+   bool bNN = false;
+   bool bSO = false;
+
+   bool bInelasticFrictionlessContact = false;
+   bool bApproximateInelasticCoulombContactByDecoupling = false;
+   bool bInelasticCoulombContactByDecoupling = false;
+   bool bInelasticGeneralizedMaximumDissipationContact = false;
+
+   for( int i = 1; i < argc; ++i )
+   {
+      if( std::strcmp( argv[i], "--DEM" ) == 0 ) bDEM = true;
+      if( std::strcmp( argv[i], "--HCSITS" ) == 0 ) bHCSITS = true;
+
+      if( std::strcmp( argv[i], "--syncNextNeighbor" ) == 0 ) bNN = true;
+      if( std::strcmp( argv[i], "--syncShadowOwners" ) == 0 ) bSO = true;
+
+      if( std::strcmp( argv[i], "--InelasticFrictionlessContact" ) == 0 ) bInelasticFrictionlessContact = true;
+      if( std::strcmp( argv[i], "--ApproximateInelasticCoulombContactByDecoupling" ) == 0 ) bApproximateInelasticCoulombContactByDecoupling = true;
+      if( std::strcmp( argv[i], "--InelasticCoulombContactByDecoupling" ) == 0 ) bInelasticCoulombContactByDecoupling = true;
+      if( std::strcmp( argv[i], "--InelasticGeneralizedMaximumDissipationContact" ) == 0 ) bInelasticGeneralizedMaximumDissipationContact = true;
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** READING CONFIG FILE ***");
+   auto cfg = env.config();
+   if (cfg == nullptr) WALBERLA_ABORT("No config specified!");
+   const Config::BlockHandle mainConf  = cfg->getBlock( "GranularGas" );
+   mesa_pd::Parameters params;
+   loadFromConfig(params, mainConf);
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** GLOBALBODYSTORAGE ***");
+   shared_ptr<BodyStorage> globalBodyStorage = make_shared<BodyStorage>();
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** BLOCKFOREST ***");
+   // create forest
+   shared_ptr< BlockForest > forest = blockforest::createBlockForestFromConfig( mainConf );
+   if (!forest)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT( "No BlockForest created ... exiting!");
+      return EXIT_SUCCESS;
+   }
+
+   forest->recalculateBlockLevelsInRefresh( params.recalculateBlockLevelsInRefresh );
+   forest->alwaysRebalanceInRefresh( params.alwaysRebalanceInRefresh );
+   forest->reevaluateMinTargetLevelsAfterForcedRefinement( params.reevaluateMinTargetLevelsAfterForcedRefinement );
+   forest->allowRefreshChangingDepth( params.allowRefreshChangingDepth );
+
+   forest->allowMultipleRefreshCycles( params.allowMultipleRefreshCycles );
+   forest->checkForEarlyOutInRefresh( params.checkForEarlyOutInRefresh );
+   forest->checkForLateOutInRefresh( params.checkForLateOutInRefresh );
+
+   WALBERLA_LOG_INFO_ON_ROOT("simulationDomain: " << forest->getDomain());
+
+   WALBERLA_LOG_INFO_ON_ROOT("blocks: " << Vector3<uint_t>(forest->getXSize(), forest->getYSize(), forest->getZSize()) );
+
+   auto ic = make_shared<pe::InfoCollection>();
+
+   //   pe::amr::MinMaxLevelDetermination regrid(ic, regridMin, regridMax);
+   //   forest->setRefreshMinTargetLevelDeterminationFunction( regrid );
+
+   bool bRebalance = true;
+   if (params.LBAlgorithm == "None")
+   {
+      bRebalance = false;
+   } else if (params.LBAlgorithm == "Morton")
+   {
+      forest->setRefreshPhantomBlockDataAssignmentFunction( pe::amr::WeightAssignmentFunctor( ic, params.baseWeight ) );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+
+      auto prepFunc = blockforest::DynamicCurveBalance< pe::amr::WeightAssignmentFunctor::PhantomBlockWeight >( false, true, false );
+      prepFunc.setMaxBlocksPerProcess( params.maxBlocksPerProcess );
+      forest->setRefreshPhantomBlockMigrationPreparationFunction( prepFunc );
+   } else if (params.LBAlgorithm == "Hilbert")
+   {
+      forest->setRefreshPhantomBlockDataAssignmentFunction( pe::amr::WeightAssignmentFunctor( ic, params.baseWeight ) );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+
+      auto prepFunc = blockforest::DynamicCurveBalance< pe::amr::WeightAssignmentFunctor::PhantomBlockWeight >( true, true, false );
+      prepFunc.setMaxBlocksPerProcess( params.maxBlocksPerProcess );
+      forest->setRefreshPhantomBlockMigrationPreparationFunction( prepFunc );
+   } else if (params.LBAlgorithm == "Metis")
+   {
+      auto assFunc = pe::amr::MetisAssignmentFunctor( ic, params.baseWeight );
+      forest->setRefreshPhantomBlockDataAssignmentFunction( assFunc );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::MetisAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::MetisAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+
+      auto alg     = blockforest::DynamicParMetis::stringToAlgorithm(    params.metisAlgorithm );
+      auto vWeight = blockforest::DynamicParMetis::stringToWeightsToUse( params.metisWeightsToUse );
+      auto eWeight = blockforest::DynamicParMetis::stringToEdgeSource(   params.metisEdgeSource );
+
+      auto prepFunc = blockforest::DynamicParMetis( alg, vWeight, eWeight );
+      prepFunc.setipc2redist(params.metisipc2redist);
+      mesa_pd::addParMetisPropertiesToSQL(prepFunc, integerProperties, realProperties, stringProperties);
+      forest->setRefreshPhantomBlockMigrationPreparationFunction( prepFunc );
+   } else if (params.LBAlgorithm == "Diffusive")
+   {
+      forest->setRefreshPhantomBlockDataAssignmentFunction( pe::amr::WeightAssignmentFunctor( ic, params.baseWeight ) );
+      forest->setRefreshPhantomBlockDataPackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      forest->setRefreshPhantomBlockDataUnpackFunction( pe::amr::WeightAssignmentFunctor::PhantomBlockWeightPackUnpackFunctor() );
+      auto prepFunc = blockforest::DynamicDiffusionBalance< pe::amr::WeightAssignmentFunctor::PhantomBlockWeight >( 1, 1, false );
+      //configure(cfg, prepFunc);
+      //addDynamicDiffusivePropertiesToSQL(prepFunc, integerProperties, realProperties, stringProperties);
+      forest->setRefreshPhantomBlockMigrationPreparationFunction(prepFunc);
+   } else
+   {
+      WALBERLA_ABORT("Unknown LBAlgorithm: " << params.LBAlgorithm);
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** BODYTUPLE ***");
+   // initialize body type ids
+   SetBodyTypeIDs<BodyTuple>::execute();
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** STORAGEDATAHANDLING ***");
+   // add block data
+   auto storageID           = forest->addBlockData(createStorageDataHandling<BodyTuple>(), "Storage");
+   auto ccdID               = forest->addBlockData(ccd::createHashGridsDataHandling( globalBodyStorage, storageID ), "CCD");
+   auto fcdID               = forest->addBlockData(fcd::createGenericFCDDataHandling<BodyTuple, fcd::AnalyticCollideFunctor>(), "FCD");
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** INTEGRATOR ***");
+   std::unique_ptr<cr::ICR> cr;
+   if (bDEM)
+   {
+      cr = std::make_unique<cr::DEM>(globalBodyStorage, forest, storageID, ccdID, fcdID, &tt);
+      WALBERLA_LOG_INFO_ON_ROOT("Using DEM!");
+   } else if (bHCSITS)
+   {
+      cr = std::make_unique<cr::HCSITS>(globalBodyStorage, forest, storageID, ccdID, fcdID, &tt);
+      configure(mainConf, *static_cast<cr::HCSITS*>(cr.get()));
+      WALBERLA_LOG_INFO_ON_ROOT("Using HCSITS!");
+
+      cr::HCSITS* hcsits = static_cast<cr::HCSITS*>(cr.get());
+
+      if (bInelasticFrictionlessContact)
+      {
+         hcsits->setRelaxationModel(cr::HCSITS::InelasticFrictionlessContact);
+         WALBERLA_LOG_INFO_ON_ROOT("Using InelasticFrictionlessContact!");
+      } else if (bApproximateInelasticCoulombContactByDecoupling)
+      {
+         hcsits->setRelaxationModel(cr::HCSITS::ApproximateInelasticCoulombContactByDecoupling);
+         WALBERLA_LOG_INFO_ON_ROOT("Using ApproximateInelasticCoulombContactByDecoupling!");
+      } else if (bInelasticCoulombContactByDecoupling)
+      {
+         hcsits->setRelaxationModel(cr::HCSITS::InelasticCoulombContactByDecoupling);
+         WALBERLA_LOG_INFO_ON_ROOT("Using InelasticCoulombContactByDecoupling!");
+      } else if (bInelasticGeneralizedMaximumDissipationContact)
+      {
+         hcsits->setRelaxationModel(cr::HCSITS::InelasticGeneralizedMaximumDissipationContact);
+         WALBERLA_LOG_INFO_ON_ROOT("Using InelasticGeneralizedMaximumDissipationContact!");
+      } else
+      {
+         WALBERLA_ABORT("Friction model could not be determined!");
+      }
+   } else
+   {
+      WALBERLA_ABORT("Model could not be determined!");
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SYNCCALL ***");
+   std::function<void(void)> syncCallWithoutTT;
+   if (bNN)
+   {
+      syncCallWithoutTT = std::bind( pe::syncNextNeighbors<BodyTuple>, std::ref(*forest), storageID, &tt, real_c(0.1), false );
+      WALBERLA_LOG_INFO_ON_ROOT("Using NextNeighbor sync!");
+   } else if (bSO)
+   {
+      syncCallWithoutTT = std::bind( pe::syncShadowOwners<BodyTuple>, std::ref(*forest), storageID, &tt, real_c(0.1), false );
+      WALBERLA_LOG_INFO_ON_ROOT("Using ShadowOwner sync!");
+   } else
+   {
+      WALBERLA_ABORT("Synchronization method could not be determined!");
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** VTK ***");
+   auto vtkDomainOutput = vtk::createVTKOutput_DomainDecomposition( forest, "domain_decomposition", 1, "vtk_out", "simulation_step" );
+   auto vtkSphereHelper = make_shared<SphereVtkOutput>(storageID, *forest) ;
+   auto vtkSphereOutput = vtk::createVTKOutput_PointData(vtkSphereHelper, "Bodies", 1, "vtk_out", "simulation_step", false, false);
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SETUP - START ***");
+   //const real_t   static_cof  ( real_c(0.1) / 2 );   // Coefficient of static friction. Note: pe doubles the input coefficient of friction for material-material contacts.
+   //const real_t   dynamic_cof ( static_cof ); // Coefficient of dynamic friction. Similar to static friction for low speed friction.
+   MaterialID     material = createMaterial( "granular", real_t( 1.0 ), 0, 0, 0, real_t( 0.5 ), 1, real_t(1e-6), 0, 0 );
+
+   auto simulationDomain = forest->getDomain();
+   const auto& generationDomain = simulationDomain; // simulationDomain.getExtended(-real_c(0.5) * spacing);
+   int64_t numParticles = 0;
+
+   auto center = forest->getDomain().center();
+   for (auto& currentBlock : *forest)
+   {
+      for (auto it = grid_generator::SCIterator(currentBlock.getAABB().getIntersection(generationDomain),
+                                                Vector3<real_t>(params.spacing) * real_c(0.5),
+                                                params.spacing);
+           it != grid_generator::SCIterator();
+           ++it)
+      {
+         auto tmp = dot( (*it - center), params.normal );
+         if (tmp < 0)
+         {
+            SphereID sp = pe::createSphere( *globalBodyStorage, *forest, storageID, 0, *it, params.radius, material);
+            if (sp != nullptr) ++numParticles;
+         }
+      }
+   }
+   mpi::reduceInplace(numParticles, mpi::SUM);
+   WALBERLA_LOG_INFO_ON_ROOT("#particles created: " << numParticles);
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SETUP - END ***");
+
+   // synchronize particles
+   syncCallWithoutTT();
+   syncCallWithoutTT();
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** SIMULATION - START ***");
+
+   WcTimer      timerImbalanced;
+   WcTimer      timerLoadBalancing;
+   WcTimer      timerBalanced;
+   WcTimingPool tpImbalanced;
+   WcTimingPool tpBalanced;
+   WALBERLA_MPI_BARRIER();
+   timerImbalanced.start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      if( i % 200 == 0 )
+      {
+         WALBERLA_LOG_DEVEL_ON_ROOT( "Timestep " << i << " / " << params.simulationSteps );
+      }
+
+      tpImbalanced["CR"].start();
+      cr->timestep( real_c(params.dt) );
+      tpImbalanced["CR"].end();
+      tpImbalanced["Sync"].start();
+      syncCallWithoutTT();
+      tpImbalanced["Sync"].end();
+
+      //if( i % visSpacing == 0 )
+      //{
+      //   vtkDomainOutput->write( );
+      //   vtkSphereOutput->write( );
+      //}
+   }
+   timerImbalanced.end();
+
+   WALBERLA_MPI_BARRIER();
+   timerLoadBalancing.start();
+   WALBERLA_LOG_INFO_ON_ROOT("*** Rebalance ***");
+   createWithNeighborhoodLocalShadow( *forest, storageID, *ic );
+   clearSynchronization( *forest, storageID );
+   forest->refresh();
+   integerProperties["MigrationIterations1"] = int64_c(forest->phantomBlockMigrationIterations());
+   syncNextNeighbors<BodyTuple>(*forest, storageID);
+   for (auto blockIt = forest->begin(); blockIt != forest->end(); ++blockIt)
+   {
+      ccd::ICCD* ccd = blockIt->getData< ccd::ICCD >( ccdID );
+      ccd->reloadBodies();
+   }
+   timerLoadBalancing.end();
+
+   WALBERLA_MPI_BARRIER();
+   timerBalanced.start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      if( i % 200 == 0 )
+      {
+         WALBERLA_LOG_DEVEL_ON_ROOT( "Timestep " << i << " / " << params.simulationSteps );
+      }
+
+      tpBalanced["CR"].start();
+      cr->timestep( real_c(params.dt) );
+      tpBalanced["CR"].end();
+      tpBalanced["Sync"].start();
+      syncCallWithoutTT();
+      tpBalanced["Sync"].end();
+
+      //if( i % visSpacing == 0 )
+      //{
+      //   vtkDomainOutput->write( );
+      //   vtkSphereOutput->write( );
+      //}
+   }
+   timerBalanced.end();
+
+   auto timerImbalancedReduced = walberla::timing::getReduced(timerImbalanced, REDUCE_TOTAL, 0);
+   double PUpSImbalanced = 0.0;
+   WALBERLA_ROOT_SECTION()
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("IMBALANCED " << *timerImbalancedReduced);
+      PUpSImbalanced = double_c(numParticles) * double_c(params.simulationSteps) / double_c(timerImbalancedReduced->max());
+      WALBERLA_LOG_INFO_ON_ROOT("PUpS: " << PUpSImbalanced);
+   }
+
+   auto timerBalancedReduced = walberla::timing::getReduced(timerBalanced, REDUCE_TOTAL, 0);
+   double PUpSBalanced = 0.0;
+   WALBERLA_ROOT_SECTION()
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("BALANCED " << *timerBalancedReduced);
+      PUpSBalanced = double_c(numParticles) * double_c(params.simulationSteps) / double_c(timerBalancedReduced->max());
+      WALBERLA_LOG_INFO_ON_ROOT("PUpS: " << PUpSBalanced);
+   }
+
+   auto timerLoadBalancingReduced = walberla::timing::getReduced(timerLoadBalancing, REDUCE_TOTAL, 0);
+
+   auto tpImbalancedReduced = tpImbalanced.getReduced();
+   WALBERLA_LOG_INFO_ON_ROOT(*tpImbalancedReduced);
+
+   auto tpBalancedReduced = tpBalanced.getReduced();
+   WALBERLA_LOG_INFO_ON_ROOT(*tpBalancedReduced);
+   WALBERLA_LOG_INFO_ON_ROOT("*** SIMULATION - END ***");
+
+   auto temp = tt.getReduced( );
+   WALBERLA_ROOT_SECTION()
+   {
+      std::cout << temp;
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT("*** CHECKING RESULT - START ***");
+   numParticles = 0;
+   int64_t numGhostParticles = 0;
+   for (auto& currentBlock : *forest)
+   {
+      Storage * storage = currentBlock.getData< Storage >( storageID );
+      BodyStorage& localStorage = (*storage)[0];
+      BodyStorage& shadowStorage = (*storage)[1];
+      numParticles += localStorage.size();
+      numGhostParticles += shadowStorage.size();
+   }
+   auto minParticles = mpi::reduce(numParticles, mpi::MIN);
+   auto maxParticles = mpi::reduce(numParticles, mpi::MAX);
+   WALBERLA_LOG_DEVEL_ON_ROOT("particle ratio: " << minParticles << " / " << maxParticles);
+
+   mpi::reduceInplace(numParticles, mpi::SUM);
+   mpi::reduceInplace(numGhostParticles, mpi::SUM);
+   WALBERLA_LOG_INFO_ON_ROOT("*** CHECKING RESULT - END ***");
+
+   uint_t runId = uint_c(-1);
+   WALBERLA_ROOT_SECTION()
+   {
+      stringProperties["walberla_git"]         = WALBERLA_GIT_SHA1;
+      stringProperties["tag"]                  = "pe";
+      integerProperties["bDEM"]                = bDEM;
+      integerProperties["bNN"]                 = bNN;
+      integerProperties["mpi_num_processes"]   = mpi::MPIManager::instance()->numProcesses();
+      integerProperties["omp_max_threads"]     = omp_get_max_threads();
+      realProperties["imbalanced_PUpS"]          = double_c(PUpSImbalanced);
+      realProperties["imbalanced_timer_min"]     = timerImbalancedReduced->min();
+      realProperties["imbalanced_timer_max"]     = timerImbalancedReduced->max();
+      realProperties["imbalanced_timer_average"] = timerImbalancedReduced->average();
+      realProperties["imbalanced_timer_total"]   = timerImbalancedReduced->total();
+      realProperties["loadbalancing_timer_min"]     = timerLoadBalancingReduced->min();
+      realProperties["loadbalancing_timer_max"]     = timerLoadBalancingReduced->max();
+      realProperties["loadbalancing_timer_average"] = timerLoadBalancingReduced->average();
+      realProperties["loadbalancing_timer_total"]   = timerLoadBalancingReduced->total();
+      realProperties["balanced_PUpS"]          = double_c(PUpSBalanced);
+      realProperties["balanced_timer_min"]     = timerBalancedReduced->min();
+      realProperties["balanced_timer_max"]     = timerBalancedReduced->max();
+      realProperties["balanced_timer_average"] = timerBalancedReduced->average();
+      realProperties["balanced_timer_total"]   = timerBalancedReduced->total();
+      integerProperties["num_particles"]       = numParticles;
+      integerProperties["num_ghost_particles"] = numGhostParticles;
+      integerProperties["minParticles"]        = minParticles;
+      integerProperties["maxParticles"]        = maxParticles;
+
+      mesa_pd::addBuildInfoToSQL( integerProperties, realProperties, stringProperties );
+      saveToSQL(params, integerProperties, realProperties, stringProperties );
+      mesa_pd::addDomainPropertiesToSQL(*forest, integerProperties, realProperties, stringProperties);
+      mesa_pd::addSlurmPropertiesToSQL(integerProperties, realProperties, stringProperties);
+
+      runId = postprocessing::storeRunInSqliteDB( params.sqlFile, integerProperties, stringProperties, realProperties );
+      postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tpImbalancedReduced, "imbalanced" );
+      postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tpImbalancedReduced, "balanced" );
+   }
+   if (params.storeNodeTimings)
+   {
+      mesa_pd::storeNodeTimings(runId, params.sqlFile, "NodeTimingImbalanced", tpImbalanced);
+      mesa_pd::storeNodeTimings(runId, params.sqlFile, "NodeTimingBalanced", tpBalanced);
+   }
+   WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - END ***");
+
+   return EXIT_SUCCESS;
+}
+} // namespace walberla
+
+int main( int argc, char* argv[] )
+{
+   return walberla::main( argc, argv );
+}
diff --git a/apps/benchmarks/GranularGas/Parameters.cpp b/apps/benchmarks/GranularGas/Parameters.cpp
index 228f9e33e38e366af2cddb7a9dcfcc8371655d91..a6b937da3dcc9fdd6e0aeb80429a395a86dc3294 100644
--- a/apps/benchmarks/GranularGas/Parameters.cpp
+++ b/apps/benchmarks/GranularGas/Parameters.cpp
@@ -75,6 +75,27 @@ void loadFromConfig(Parameters& params, const Config::BlockHandle& cfg)
    params.sqlFile = cfg.getParameter<std::string>("sqlFile", "benchmark.sqlite" );
    WALBERLA_LOG_INFO_ON_ROOT("sqlFile: " << params.sqlFile);
    
+   params.recalculateBlockLevelsInRefresh = cfg.getParameter<bool>("recalculateBlockLevelsInRefresh", false );
+   WALBERLA_LOG_INFO_ON_ROOT("recalculateBlockLevelsInRefresh: " << params.recalculateBlockLevelsInRefresh);
+   
+   params.alwaysRebalanceInRefresh = cfg.getParameter<bool>("alwaysRebalanceInRefresh", true );
+   WALBERLA_LOG_INFO_ON_ROOT("alwaysRebalanceInRefresh: " << params.alwaysRebalanceInRefresh);
+   
+   params.reevaluateMinTargetLevelsAfterForcedRefinement = cfg.getParameter<bool>("reevaluateMinTargetLevelsAfterForcedRefinement", false );
+   WALBERLA_LOG_INFO_ON_ROOT("reevaluateMinTargetLevelsAfterForcedRefinement: " << params.reevaluateMinTargetLevelsAfterForcedRefinement);
+   
+   params.allowRefreshChangingDepth = cfg.getParameter<bool>("allowRefreshChangingDepth", false );
+   WALBERLA_LOG_INFO_ON_ROOT("allowRefreshChangingDepth: " << params.allowRefreshChangingDepth);
+   
+   params.allowMultipleRefreshCycles = cfg.getParameter<bool>("allowMultipleRefreshCycles", false );
+   WALBERLA_LOG_INFO_ON_ROOT("allowMultipleRefreshCycles: " << params.allowMultipleRefreshCycles);
+   
+   params.checkForEarlyOutInRefresh = cfg.getParameter<bool>("checkForEarlyOutInRefresh", true );
+   WALBERLA_LOG_INFO_ON_ROOT("checkForEarlyOutInRefresh: " << params.checkForEarlyOutInRefresh);
+   
+   params.checkForLateOutInRefresh = cfg.getParameter<bool>("checkForLateOutInRefresh", true );
+   WALBERLA_LOG_INFO_ON_ROOT("checkForLateOutInRefresh: " << params.checkForLateOutInRefresh);
+   
    params.regridMin = cfg.getParameter<uint_t>("regridMin", uint_c(100) );
    WALBERLA_LOG_INFO_ON_ROOT("regridMin: " << params.regridMin);
    
@@ -136,6 +157,13 @@ void saveToSQL(const Parameters& params,
    
    
    
+   
+   
+   
+   
+   
+   
+   
    realProperties["baseWeight"] = double_c(params.baseWeight);
    
    realProperties["metisipc2redist"] = double_c(params.metisipc2redist);
diff --git a/apps/benchmarks/GranularGas/Parameters.h b/apps/benchmarks/GranularGas/Parameters.h
index c765ab2021730aca04dd58bd7741e6f7a16fab67..c216aabfffe36ba2150147133e6d9068be54620b 100644
--- a/apps/benchmarks/GranularGas/Parameters.h
+++ b/apps/benchmarks/GranularGas/Parameters.h
@@ -50,6 +50,13 @@ struct Parameters
    int64_t visSpacing = 1000;
    std::string path = "vtk_out";
    std::string sqlFile = "benchmark.sqlite";
+   bool recalculateBlockLevelsInRefresh = false;
+   bool alwaysRebalanceInRefresh = true;
+   bool reevaluateMinTargetLevelsAfterForcedRefinement = false;
+   bool allowRefreshChangingDepth = false;
+   bool allowMultipleRefreshCycles = false;
+   bool checkForEarlyOutInRefresh = true;
+   bool checkForLateOutInRefresh = true;
    uint_t regridMin = uint_c(100);
    uint_t regridMax = uint_c(1000);
    int maxBlocksPerProcess = int_c(1000);
diff --git a/apps/benchmarks/GranularGas/generateConfig.py b/apps/benchmarks/GranularGas/generateConfig.py
index c5e777106d01125fb73a380ff0598653b4c45668..3f5637107912aebdd828ddedd93fa7f983b55474 100755
--- a/apps/benchmarks/GranularGas/generateConfig.py
+++ b/apps/benchmarks/GranularGas/generateConfig.py
@@ -19,6 +19,15 @@ cfg.addParameter("visSpacing",             "int64_t",     "1000")
 cfg.addParameter("path",                   "std::string", '"vtk_out"')
 cfg.addParameter("sqlFile",                "std::string", '"benchmark.sqlite"')
 
+cfg.addParameter("recalculateBlockLevelsInRefresh",                "bool", "false");
+cfg.addParameter("alwaysRebalanceInRefresh",                       "bool", "true");
+cfg.addParameter("reevaluateMinTargetLevelsAfterForcedRefinement", "bool", "false");
+cfg.addParameter("allowRefreshChangingDepth",                      "bool", "false");
+
+cfg.addParameter("allowMultipleRefreshCycles",                     "bool", "false");
+cfg.addParameter("checkForEarlyOutInRefresh",                      "bool", "true");
+cfg.addParameter("checkForLateOutInRefresh",                       "bool", "true");
+
 cfg.addParameter("regridMin",              "uint_t",      'uint_c(100)')
 cfg.addParameter("regridMax",              "uint_t",      'uint_c(1000)')
 cfg.addParameter("maxBlocksPerProcess",    "int",         'int_c(1000)')