diff --git a/apps/benchmarks/GranularGas/GranularGas.cfg b/apps/benchmarks/GranularGas/GranularGas.cfg
index 2c321312ff72d4174d09d9c0d8a5f058f02d39a7..685b7dfc0f49a87241bc445637025d3d45ffa55d 100644
--- a/apps/benchmarks/GranularGas/GranularGas.cfg
+++ b/apps/benchmarks/GranularGas/GranularGas.cfg
@@ -1,12 +1,13 @@
 GranularGas
 {
    simulationCorner < 0, 0, 0 >;
-   simulationDomain < 40, 40, 40 >;
+   simulationDomain < 80, 80, 80 >;
    blocks < 2,2,2 >;
-   isPeriodic < 1, 1, 1 >;
-   initialRefinementLevel 0;
-   sorting none;
+   isPeriodic < 0, 0, 0 >;
+   initialRefinementLevel 2;
+   sorting linear;
 
+   normal  <1,1,1>;
    radius  0.6;
    spacing 1.0;
    vMax    0.0;
diff --git a/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp b/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp
index 597eb3f7b52e82159eeb8d9aca5eb8d4fe54ead9..dd25a6bede0bf18c1ff694d18688f1ef6814b6a4 100644
--- a/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp
+++ b/apps/benchmarks/GranularGas/MESA_PD_LoadBalancing.cpp
@@ -89,8 +89,9 @@ int main( int argc, char ** argv )
    auto mpiManager = walberla::mpi::MPIManager::instance();
    mpiManager->useWorldComm();
 
-   logging::Logging::instance()->setStreamLogLevel(logging::Logging::INFO);
-   logging::Logging::instance()->setFileLogLevel(logging::Logging::INFO);
+   //   logging::Logging::instance()->setStreamLogLevel(logging::Logging::INFO);
+   //   logging::Logging::instance()->includeLoggingToFile("LoadBalancing");
+   //   logging::Logging::instance()->setFileLogLevel(logging::Logging::DETAIL);
 
    WALBERLA_LOG_INFO_ON_ROOT( "config file: " << argv[1] );
    WALBERLA_LOG_INFO_ON_ROOT( "waLBerla Revision: " << WALBERLA_GIT_SHA1 );
@@ -184,7 +185,6 @@ int main( int argc, char ** argv )
    }
 
    auto domain = std::make_shared<domain::BlockForestDomain>(forest);
-   auto simulationDomain = forest->getDomain();
 
    WALBERLA_LOG_INFO_ON_ROOT("*** SETUP - START ***");
 
@@ -205,7 +205,7 @@ int main( int argc, char ** argv )
                                             params.spacing))
       {
          WALBERLA_CHECK(iBlk.getAABB().contains(pt));
-         auto tmp = dot( (pt - center), Vec3(real_t(1), real_t(1), real_t(1)) );
+         auto tmp = dot( (pt - center), params.normal );
          if (tmp < 0)
          {
             createSphere(*ps, pt, params.radius, smallSphere);
@@ -216,27 +216,6 @@ int main( int argc, char ** argv )
    walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
    WALBERLA_LOG_INFO_ON_ROOT("#particles created: " << numParticles);
 
-   auto shift = (params.spacing - params.radius - params.radius) * real_t(0.5);
-   auto confiningDomain = simulationDomain.getExtended(shift);
-
-   //   if (!forest->isPeriodic(0))
-   //   {
-   //      createPlane(*ps, *ss, confiningDomain.minCorner(), Vec3(+1,0,0));
-   //      createPlane(*ps, *ss, confiningDomain.maxCorner(), Vec3(-1,0,0));
-   //   }
-
-   //   if (!forest->isPeriodic(1))
-   //   {
-   //      createPlane(*ps, *ss, confiningDomain.minCorner(), Vec3(0,+1,0));
-   //      createPlane(*ps, *ss, confiningDomain.maxCorner(), Vec3(0,-1,0));
-   //   }
-
-   //   if (!forest->isPeriodic(2))
-   //   {
-   //      createPlane(*ps, *ss, confiningDomain.minCorner(), Vec3(0,0,+1));
-   //      createPlane(*ps, *ss, confiningDomain.maxCorner(), Vec3(0,0,-1));
-   //   }
-
    WALBERLA_LOG_INFO_ON_ROOT("*** SETUP - END ***");
 
    WALBERLA_LOG_INFO_ON_ROOT("*** VTK ***");
@@ -264,224 +243,328 @@ int main( int argc, char ** argv )
    mpi::SyncNextNeighborsBlockForest     SNN;
 
    // initial sync
-   domain::createWithNeighborhood( accessor, *forest, *ic );
-   forest->refresh();
-   domain->refresh();
-   lc = std::make_shared<data::LinkedCells>(domain->getUnionOfLocalAABBs().getExtended(params.spacing), params.spacing );
    ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
-   WALBERLA_LOG_DEVEL_VAR(ps->size());
    SNN(*ps, forest, domain);
    sortParticleStorage(*ps, params.sorting, lc->domain_, uint_c(lc->numCellsPerDim_[0]));
 
-   for (int64_t outerIteration = 0; outerIteration < params.numOuterIterations; ++outerIteration)
+   WcTimer      timerImbalanced;
+   WcTimer      timerLoadBalancing;
+   WcTimer      timerBalanced;
+   WcTimingPool tpImbalanced;
+   WcTimingPool tpBalanced;
+   auto    SNNBytesSent     = SNN.getBytesSent();
+   auto    SNNBytesReceived = SNN.getBytesReceived();
+   auto    SNNSends         = SNN.getNumberOfSends();
+   auto    SNNReceives      = SNN.getNumberOfReceives();
+   auto    RPBytesSent      = RP.getBytesSent();
+   auto    RPBytesReceived  = RP.getBytesReceived();
+   auto    RPSends          = RP.getNumberOfSends();
+   auto    RPReceives       = RP.getNumberOfReceives();
+   int64_t contactsChecked  = 0;
+   int64_t contactsDetected = 0;
+   int64_t contactsTreated  = 0;
+
+   WALBERLA_LOG_DEVEL_ON_ROOT("running imbalanced simulation");
+   timerImbalanced.start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
    {
-      WALBERLA_LOG_DEVEL_ON_ROOT("*** RUNNING OUTER ITERATION " << outerIteration << " ***");
-
-      WcTimer      timer;
-      WcTimingPool tp;
-      auto    SNNBytesSent     = SNN.getBytesSent();
-      auto    SNNBytesReceived = SNN.getBytesReceived();
-      auto    SNNSends         = SNN.getNumberOfSends();
-      auto    SNNReceives      = SNN.getNumberOfReceives();
-      auto    RPBytesSent      = RP.getBytesSent();
-      auto    RPBytesReceived  = RP.getBytesReceived();
-      auto    RPSends          = RP.getNumberOfSends();
-      auto    RPReceives       = RP.getNumberOfReceives();
-      int64_t contactsChecked  = 0;
-      int64_t contactsDetected = 0;
-      int64_t contactsTreated  = 0;
+      //WALBERLA_LOG_DEVEL_ON_ROOT("timestep: " << i << " / " << params.simulationSteps );
+      //         if (i % params.visSpacing == 0)
+      //         {
+      //            vtkWriter->write();
+      //         }
+
+      tpImbalanced["AssocToBlock"].start();
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
       if (params.bBarrier) WALBERLA_MPI_BARRIER();
-      std::stringstream fout;
-      timer.start();
-      for (int64_t i=0; i < params.simulationSteps; ++i)
+      tpImbalanced["AssocToBlock"].end();
+
+      tpImbalanced["GenerateLinkedCells"].start();
+      lc->clear();
+      ps->forEachParticle(true, kernel::SelectAll(), accessor, ipilc, accessor, *lc);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpImbalanced["GenerateLinkedCells"].end();
+
+      tpImbalanced["DEM"].start();
+      contactsChecked  = 0;
+      contactsDetected = 0;
+      contactsTreated  = 0;
+      lc->forEachParticlePairHalf(true,
+                                  kernel::SelectAll(),
+                                  accessor,
+                                  [&](const size_t idx1, const size_t idx2, auto& ac)
       {
-         fout.clear();
-         //WALBERLA_LOG_DEVEL_ON_ROOT("timestep: " << i << " / " << params.simulationSteps );
-         //         if (i % params.visSpacing == 0)
-         //         {
-         //            vtkWriter->write();
-         //         }
-
-         tp["AssocToBlock"].start();
-         ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
-         if (params.bBarrier) WALBERLA_MPI_BARRIER();
-         tp["AssocToBlock"].end();
-
-         tp["GenerateLinkedCells"].start();
-         lc->clear();
-         ps->forEachParticle(true, kernel::SelectAll(), accessor, ipilc, accessor, *lc);
-         if (params.bBarrier) WALBERLA_MPI_BARRIER();
-         tp["GenerateLinkedCells"].end();
-
-         tp["DEM"].start();
-         contactsChecked  = 0;
-         contactsDetected = 0;
-         contactsTreated  = 0;
-         lc->forEachParticlePairHalf(true,
-                                     kernel::SelectAll(),
-                                     accessor,
-                                     [&](const size_t idx1, const size_t idx2, auto& ac)
+         ++contactsChecked;
+         if (double_cast(idx1, idx2, ac, acd, ac ))
          {
-            ++contactsChecked;
-            if (double_cast(idx1, idx2, ac, acd, ac ))
+            ++contactsDetected;
+            if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain))
             {
-               ++contactsDetected;
-               if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain))
-               {
-                  ++contactsTreated;
-                  dem(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth());
-               }
+               ++contactsTreated;
+               dem(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth());
             }
-         },
-         accessor );
-         if (params.bBarrier) WALBERLA_MPI_BARRIER();
-         tp["DEM"].end();
-
-         tp["ReduceForce"].start();
-         RP.operator()<ForceTorqueNotification>(*ps);
-         if (params.bBarrier) WALBERLA_MPI_BARRIER();
-         tp["ReduceForce"].end();
-
-         tp["Euler"].start();
-         //ps->forEachParticle(false, [&](const size_t idx){WALBERLA_CHECK_EQUAL(ps->getForce(idx), Vec3(0,0,0), *(*ps)[idx] << "\n" << idx);});
-         ps->forEachParticle(true, kernel::SelectLocal(), accessor, explicitEulerWithShape, accessor);
-         if (params.bBarrier) WALBERLA_MPI_BARRIER();
-         tp["Euler"].end();
-
-         tp["SNN"].start();
-         SNN(*ps, forest, domain);
-         if (params.bBarrier) WALBERLA_MPI_BARRIER();
-         tp["SNN"].end();
-      }
-      timer.end();
-
-      SNNBytesSent     = SNN.getBytesSent();
-      SNNBytesReceived = SNN.getBytesReceived();
-      SNNSends         = SNN.getNumberOfSends();
-      SNNReceives      = SNN.getNumberOfReceives();
-      RPBytesSent      = RP.getBytesSent();
-      RPBytesReceived  = RP.getBytesReceived();
-      RPSends          = RP.getNumberOfSends();
-      RPReceives       = RP.getNumberOfReceives();
-      walberla::mpi::reduceInplace(SNNBytesSent, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(SNNBytesReceived, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(SNNSends, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(SNNReceives, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(RPBytesSent, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(RPBytesReceived, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(RPSends, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(RPReceives, walberla::mpi::SUM);
-      auto cC = walberla::mpi::reduce(contactsChecked, walberla::mpi::SUM);
-      auto cD = walberla::mpi::reduce(contactsDetected, walberla::mpi::SUM);
-      auto cT = walberla::mpi::reduce(contactsTreated, walberla::mpi::SUM);
-      WALBERLA_LOG_DEVEL_ON_ROOT( "SNN bytes communicated:   " << SNNBytesSent << " / " << SNNBytesReceived );
-      WALBERLA_LOG_DEVEL_ON_ROOT( "SNN communication partners: " << SNNSends << " / " << SNNReceives );
-      WALBERLA_LOG_DEVEL_ON_ROOT( "RP bytes communicated:  " << RPBytesSent << " / " << RPBytesReceived );
-      WALBERLA_LOG_DEVEL_ON_ROOT( "RP communication partners: " << RPSends << " / " << RPReceives );
-      WALBERLA_LOG_DEVEL_ON_ROOT( "contacts checked/detected/treated: " << cC << " / " << cD << " / " << cT );
-
-      auto timer_reduced = walberla::timing::getReduced(timer, REDUCE_TOTAL, 0);
-      double PUpS = 0.0;
-      WALBERLA_ROOT_SECTION()
-      {
-         WALBERLA_LOG_INFO_ON_ROOT(*timer_reduced);
-         WALBERLA_LOG_INFO_ON_ROOT("runtime: " << timer_reduced->max());
-         PUpS = double_c(numParticles) * double_c(params.simulationSteps) / double_c(timer_reduced->max());
-         WALBERLA_LOG_INFO_ON_ROOT("PUpS: " << PUpS);
-      }
+         }
+      },
+      accessor );
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpImbalanced["DEM"].end();
 
-      auto tp_reduced = tp.getReduced();
-      WALBERLA_LOG_INFO_ON_ROOT(*tp_reduced);
-      WALBERLA_LOG_INFO_ON_ROOT("*** SIMULATION - END ***");
+      tpImbalanced["ReduceForce"].start();
+      RP.operator()<ForceTorqueNotification>(*ps);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpImbalanced["ReduceForce"].end();
 
-      if (params.checkSimulation)
-      {
-         check(*ps, *forest, params.spacing);
-      }
+      tpImbalanced["Euler"].start();
+      //ps->forEachParticle(false, [&](const size_t idx){WALBERLA_CHECK_EQUAL(ps->getForce(idx), Vec3(0,0,0), *(*ps)[idx] << "\n" << idx);});
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, explicitEulerWithShape, accessor);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpImbalanced["Euler"].end();
+
+      tpImbalanced["SNN"].start();
+      SNN(*ps, forest, domain);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpImbalanced["SNN"].end();
+   }
+   timerImbalanced.end();
+   WALBERLA_MPI_BARRIER();
 
-      WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - START ***");
-      numParticles = 0;
-      int64_t numGhostParticles = 0;
-      ps->forEachParticle(false,
-                          kernel::SelectAll(),
-                          accessor,
-                          [&numParticles, &numGhostParticles](const size_t idx, auto& ac)
+   timerLoadBalancing.start();
+   if (bRebalance)
+   {
+      WALBERLA_LOG_DEVEL_ON_ROOT("running load balancing");
+      domain::createWithNeighborhood( accessor, *forest, *ic );
+      for (auto pIt = ps->begin(); pIt != ps->end(); )
       {
-         if (data::particle_flags::isSet( ac.getFlagsRef(idx), data::particle_flags::GHOST))
+         using namespace walberla::mesa_pd::data::particle_flags;
+         if (isSet(pIt->getFlags(), GHOST))
          {
-            ++numGhostParticles;
+            pIt = ps->erase(pIt);
          } else
          {
-            ++numParticles;
+            pIt->getGhostOwnersRef().clear();
+            ++pIt;
          }
-      },
-      accessor);
-      walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(numGhostParticles, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(contactsChecked, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(contactsDetected, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(contactsTreated, walberla::mpi::SUM);
-      double linkedCellsVolume = lc->domain_.volume();
-      walberla::mpi::reduceInplace(linkedCellsVolume, walberla::mpi::SUM);
-      size_t numLinkedCells = lc->cells_.size();
-      walberla::mpi::reduceInplace(numLinkedCells, walberla::mpi::SUM);
-      size_t local_aabbs         = domain->getNumLocalAABBs();
-      size_t neighbor_subdomains = domain->getNumNeighborSubdomains();
-      size_t neighbor_processes  = domain->getNumNeighborProcesses();
-      walberla::mpi::reduceInplace(local_aabbs, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(neighbor_subdomains, walberla::mpi::SUM);
-      walberla::mpi::reduceInplace(neighbor_processes, walberla::mpi::SUM);
-
-      WALBERLA_LOG_DEVEL_VAR_ON_ROOT(numParticles);
-      WALBERLA_LOG_DEVEL_VAR_ON_ROOT(numGhostParticles);
-
-      uint_t runId = uint_c(-1);
-      WALBERLA_ROOT_SECTION()
-      {
-         stringProperties["walberla_git"]         = WALBERLA_GIT_SHA1;
-         stringProperties["tag"]                  = "mesa_pd";
-         integerProperties["mpi_num_processes"]   = mpiManager->numProcesses();
-         integerProperties["omp_max_threads"]     = omp_get_max_threads();
-         realProperties["PUpS"]                   = double_c(PUpS);
-         realProperties["timer_min"]              = timer_reduced->min();
-         realProperties["timer_max"]              = timer_reduced->max();
-         realProperties["timer_average"]          = timer_reduced->average();
-         realProperties["timer_total"]            = timer_reduced->total();
-         integerProperties["outerIteration"]      = int64_c(outerIteration);
-         integerProperties["num_particles"]       = numParticles;
-         integerProperties["num_ghost_particles"] = numGhostParticles;
-         integerProperties["contacts_checked"]    = contactsChecked;
-         integerProperties["contacts_detected"]   = contactsDetected;
-         integerProperties["contacts_treated"]    = contactsTreated;
-         integerProperties["local_aabbs"]         = int64_c(local_aabbs);
-         integerProperties["neighbor_subdomains"] = int64_c(neighbor_subdomains);
-         integerProperties["neighbor_processes"]  = int64_c(neighbor_processes);
-         integerProperties["SNNBytesSent"]        = SNNBytesSent;
-         integerProperties["SNNBytesReceived"]    = SNNBytesReceived;
-         integerProperties["SNNSends"]            = SNNSends;
-         integerProperties["SNNReceives"]         = SNNReceives;
-         integerProperties["RPBytesSent"]         = RPBytesSent;
-         integerProperties["RPBytesReceived"]     = RPBytesReceived;
-         integerProperties["RPSends"]             = RPSends;
-         integerProperties["RPReceives"]          = RPReceives;
-         realProperties["linkedCellsVolume"]      = linkedCellsVolume;
-         integerProperties["numLinkedCells"]      = int64_c(numLinkedCells);
-
-         addBuildInfoToSQL( integerProperties, realProperties, stringProperties );
-         saveToSQL(params, integerProperties, realProperties, stringProperties );
-         addDomainPropertiesToSQL(*forest, integerProperties, realProperties, stringProperties);
-         addSlurmPropertiesToSQL(integerProperties, realProperties, stringProperties);
-
-         runId = postprocessing::storeRunInSqliteDB( params.sqlFile, integerProperties, stringProperties, realProperties );
-         postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tp_reduced, "Timeloop" );
       }
+      forest->refresh();
+      domain->refresh();
+      lc = std::make_shared<data::LinkedCells>(domain->getUnionOfLocalAABBs().getExtended(params.spacing), params.spacing );
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
+      WALBERLA_LOG_DEVEL_VAR(ps->size());
+      SNN(*ps, forest, domain);
+      sortParticleStorage(*ps, params.sorting, lc->domain_, uint_c(lc->numCellsPerDim_[0]));
+   }
+   timerLoadBalancing.end();
+
+   WALBERLA_MPI_BARRIER();
+   WALBERLA_LOG_DEVEL_ON_ROOT("running balanced simulation");
+   timerBalanced.start();
+   for (int64_t i=0; i < params.simulationSteps; ++i)
+   {
+      //WALBERLA_LOG_DEVEL_ON_ROOT("timestep: " << i << " / " << params.simulationSteps );
+      //         if (i % params.visSpacing == 0)
+      //         {
+      //            vtkWriter->write();
+      //         }
+
+      tpBalanced["AssocToBlock"].start();
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, assoc, accessor);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpBalanced["AssocToBlock"].end();
+
+      tpBalanced["GenerateLinkedCells"].start();
+      lc->clear();
+      ps->forEachParticle(true, kernel::SelectAll(), accessor, ipilc, accessor, *lc);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpBalanced["GenerateLinkedCells"].end();
+
+      tpBalanced["DEM"].start();
+      contactsChecked  = 0;
+      contactsDetected = 0;
+      contactsTreated  = 0;
+      lc->forEachParticlePairHalf(true,
+                                  kernel::SelectAll(),
+                                  accessor,
+                                  [&](const size_t idx1, const size_t idx2, auto& ac)
+      {
+         ++contactsChecked;
+         if (double_cast(idx1, idx2, ac, acd, ac ))
+         {
+            ++contactsDetected;
+            if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *domain))
+            {
+               ++contactsTreated;
+               dem(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth());
+            }
+         }
+      },
+      accessor );
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpBalanced["DEM"].end();
+
+      tpBalanced["ReduceForce"].start();
+      RP.operator()<ForceTorqueNotification>(*ps);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpBalanced["ReduceForce"].end();
+
+      tpBalanced["Euler"].start();
+      //ps->forEachParticle(false, [&](const size_t idx){WALBERLA_CHECK_EQUAL(ps->getForce(idx), Vec3(0,0,0), *(*ps)[idx] << "\n" << idx);});
+      ps->forEachParticle(true, kernel::SelectLocal(), accessor, explicitEulerWithShape, accessor);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpBalanced["Euler"].end();
+
+      tpBalanced["SNN"].start();
+      SNN(*ps, forest, domain);
+      if (params.bBarrier) WALBERLA_MPI_BARRIER();
+      tpBalanced["SNN"].end();
+   }
+   timerBalanced.end();
+
+   SNNBytesSent     = SNN.getBytesSent();
+   SNNBytesReceived = SNN.getBytesReceived();
+   SNNSends         = SNN.getNumberOfSends();
+   SNNReceives      = SNN.getNumberOfReceives();
+   RPBytesSent      = RP.getBytesSent();
+   RPBytesReceived  = RP.getBytesReceived();
+   RPSends          = RP.getNumberOfSends();
+   RPReceives       = RP.getNumberOfReceives();
+   walberla::mpi::reduceInplace(SNNBytesSent, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(SNNBytesReceived, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(SNNSends, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(SNNReceives, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPBytesSent, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPBytesReceived, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPSends, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(RPReceives, walberla::mpi::SUM);
+   auto cC = walberla::mpi::reduce(contactsChecked, walberla::mpi::SUM);
+   auto cD = walberla::mpi::reduce(contactsDetected, walberla::mpi::SUM);
+   auto cT = walberla::mpi::reduce(contactsTreated, walberla::mpi::SUM);
+   WALBERLA_LOG_DEVEL_ON_ROOT( "SNN bytes communicated:   " << SNNBytesSent << " / " << SNNBytesReceived );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "SNN communication partners: " << SNNSends << " / " << SNNReceives );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "RP bytes communicated:  " << RPBytesSent << " / " << RPBytesReceived );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "RP communication partners: " << RPSends << " / " << RPReceives );
+   WALBERLA_LOG_DEVEL_ON_ROOT( "contacts checked/detected/treated: " << cC << " / " << cD << " / " << cT );
+
+   auto timerImbalancedReduced = walberla::timing::getReduced(timerImbalanced, REDUCE_TOTAL, 0);
+   double PUpSImbalanced = 0.0;
+   WALBERLA_ROOT_SECTION()
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(*timerImbalancedReduced);
+      PUpSImbalanced = double_c(numParticles) * double_c(params.simulationSteps) / double_c(timerImbalancedReduced->max());
+      WALBERLA_LOG_INFO_ON_ROOT("PUpS: " << PUpSImbalanced);
+   }
+
+   auto timerBalancedReduced = walberla::timing::getReduced(timerBalanced, REDUCE_TOTAL, 0);
+   double PUpSBalanced = 0.0;
+   WALBERLA_ROOT_SECTION()
+   {
+      WALBERLA_LOG_INFO_ON_ROOT(*timerBalancedReduced);
+      PUpSBalanced = double_c(numParticles) * double_c(params.simulationSteps) / double_c(timerBalancedReduced->max());
+      WALBERLA_LOG_INFO_ON_ROOT("PUpS: " << PUpSBalanced);
+   }
+
+   auto timerLoadBalancingReduced = walberla::timing::getReduced(timerLoadBalancing, REDUCE_TOTAL, 0);
+
+   auto tpImbalancedReduced = tpImbalanced.getReduced();
+   WALBERLA_LOG_INFO_ON_ROOT(*tpImbalancedReduced);
+
+   auto tpBalancedReduced = tpBalanced.getReduced();
+   WALBERLA_LOG_INFO_ON_ROOT(*tpBalancedReduced);
+   WALBERLA_LOG_INFO_ON_ROOT("*** SIMULATION - END ***");
+
+   if (params.checkSimulation)
+   {
+      check(*ps, *forest, params.spacing);
+   }
 
-      if (params.storeNodeTimings)
+   WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - START ***");
+   numParticles = 0;
+   int64_t numGhostParticles = 0;
+   ps->forEachParticle(false,
+                       kernel::SelectAll(),
+                       accessor,
+                       [&numParticles, &numGhostParticles](const size_t idx, auto& ac)
+   {
+      if (data::particle_flags::isSet( ac.getFlagsRef(idx), data::particle_flags::GHOST))
+      {
+         ++numGhostParticles;
+      } else
       {
-         storeNodeTimings(runId, params.sqlFile, "NodeTiming", tp);
+         ++numParticles;
       }
-      WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - END ***");
+   },
+   accessor);
+   walberla::mpi::reduceInplace(numParticles, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(numGhostParticles, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(contactsChecked, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(contactsDetected, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(contactsTreated, walberla::mpi::SUM);
+   double linkedCellsVolume = lc->domain_.volume();
+   walberla::mpi::reduceInplace(linkedCellsVolume, walberla::mpi::SUM);
+   size_t numLinkedCells = lc->cells_.size();
+   walberla::mpi::reduceInplace(numLinkedCells, walberla::mpi::SUM);
+   size_t local_aabbs         = domain->getNumLocalAABBs();
+   size_t neighbor_subdomains = domain->getNumNeighborSubdomains();
+   size_t neighbor_processes  = domain->getNumNeighborProcesses();
+   walberla::mpi::reduceInplace(local_aabbs, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(neighbor_subdomains, walberla::mpi::SUM);
+   walberla::mpi::reduceInplace(neighbor_processes, walberla::mpi::SUM);
+
+   uint_t runId = uint_c(-1);
+   WALBERLA_ROOT_SECTION()
+   {
+      stringProperties["walberla_git"]         = WALBERLA_GIT_SHA1;
+      stringProperties["tag"]                  = "mesa_pd";
+      integerProperties["mpi_num_processes"]   = mpiManager->numProcesses();
+      integerProperties["omp_max_threads"]     = omp_get_max_threads();
+      realProperties["imbalanced_PUpS"]          = double_c(PUpSImbalanced);
+      realProperties["imbalanced_timer_min"]     = timerImbalancedReduced->min();
+      realProperties["imbalanced_timer_max"]     = timerImbalancedReduced->max();
+      realProperties["imbalanced_timer_average"] = timerImbalancedReduced->average();
+      realProperties["imbalanced_timer_total"]   = timerImbalancedReduced->total();
+      realProperties["loadbalancing_timer_min"]     = timerLoadBalancingReduced->min();
+      realProperties["loadbalancing_timer_max"]     = timerLoadBalancingReduced->max();
+      realProperties["loadbalancing_timer_average"] = timerLoadBalancingReduced->average();
+      realProperties["loadbalancing_timer_total"]   = timerLoadBalancingReduced->total();
+      realProperties["balanced_PUpS"]          = double_c(PUpSBalanced);
+      realProperties["balanced_timer_min"]     = timerBalancedReduced->min();
+      realProperties["balanced_timer_max"]     = timerBalancedReduced->max();
+      realProperties["balanced_timer_average"] = timerBalancedReduced->average();
+      realProperties["balanced_timer_total"]   = timerBalancedReduced->total();
+      integerProperties["num_particles"]       = numParticles;
+      integerProperties["num_ghost_particles"] = numGhostParticles;
+      integerProperties["contacts_checked"]    = contactsChecked;
+      integerProperties["contacts_detected"]   = contactsDetected;
+      integerProperties["contacts_treated"]    = contactsTreated;
+      integerProperties["local_aabbs"]         = int64_c(local_aabbs);
+      integerProperties["neighbor_subdomains"] = int64_c(neighbor_subdomains);
+      integerProperties["neighbor_processes"]  = int64_c(neighbor_processes);
+      integerProperties["SNNBytesSent"]        = SNNBytesSent;
+      integerProperties["SNNBytesReceived"]    = SNNBytesReceived;
+      integerProperties["SNNSends"]            = SNNSends;
+      integerProperties["SNNReceives"]         = SNNReceives;
+      integerProperties["RPBytesSent"]         = RPBytesSent;
+      integerProperties["RPBytesReceived"]     = RPBytesReceived;
+      integerProperties["RPSends"]             = RPSends;
+      integerProperties["RPReceives"]          = RPReceives;
+      realProperties["linkedCellsVolume"]      = linkedCellsVolume;
+      integerProperties["numLinkedCells"]      = int64_c(numLinkedCells);
+
+      addBuildInfoToSQL( integerProperties, realProperties, stringProperties );
+      saveToSQL(params, integerProperties, realProperties, stringProperties );
+      addDomainPropertiesToSQL(*forest, integerProperties, realProperties, stringProperties);
+      addSlurmPropertiesToSQL(integerProperties, realProperties, stringProperties);
+
+      runId = postprocessing::storeRunInSqliteDB( params.sqlFile, integerProperties, stringProperties, realProperties );
+      postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tpImbalancedReduced, "imbalanced" );
+      postprocessing::storeTimingPoolInSqliteDB( params.sqlFile, runId, *tpImbalancedReduced, "balanced" );
+   }
+
+   if (params.storeNodeTimings)
+   {
+      storeNodeTimings(runId, params.sqlFile, "NodeTimingImbalanced", tpImbalanced);
+      storeNodeTimings(runId, params.sqlFile, "NodeTimingBalanced", tpBalanced);
    }
+   WALBERLA_LOG_INFO_ON_ROOT("*** SQL OUTPUT - END ***");
 
    return EXIT_SUCCESS;
 }
diff --git a/apps/benchmarks/GranularGas/PE_GranularGas.cpp b/apps/benchmarks/GranularGas/PE_GranularGas.cpp
index e235f348f7ae80200091c61c506856d1dd209adf..96ded8e71b85cffe7d76ad7c986b1b860f97556c 100644
--- a/apps/benchmarks/GranularGas/PE_GranularGas.cpp
+++ b/apps/benchmarks/GranularGas/PE_GranularGas.cpp
@@ -90,7 +90,7 @@ int main( int argc, char ** argv )
    auto cfg = env.config();
    if (cfg == nullptr) WALBERLA_ABORT("No config specified!");
    const Config::BlockHandle mainConf  = cfg->getBlock( "GranularGas" );
-   Parameters params;
+   mesa_pd::Parameters params;
    loadFromConfig(params, mainConf);
 
    WALBERLA_LOG_INFO_ON_ROOT("*** GLOBALBODYSTORAGE ***");
diff --git a/apps/benchmarks/GranularGas/Parameters.cpp b/apps/benchmarks/GranularGas/Parameters.cpp
index 2215898ba99aef5fb6c4abc595ca073f5791a321..228f9e33e38e366af2cddb7a9dcfcc8371655d91 100644
--- a/apps/benchmarks/GranularGas/Parameters.cpp
+++ b/apps/benchmarks/GranularGas/Parameters.cpp
@@ -29,12 +29,16 @@
 #include <core/logging/Logging.h>
 
 namespace walberla {
+namespace mesa_pd {
 
 void loadFromConfig(Parameters& params, const Config::BlockHandle& cfg)
 {
    params.sorting = cfg.getParameter<std::string>("sorting", "none" );
    WALBERLA_LOG_INFO_ON_ROOT("sorting: " << params.sorting);
    
+   params.normal = cfg.getParameter<Vec3>("normal", Vec3(real_t(1.0), real_t(1.0), real_t(1.0)) );
+   WALBERLA_LOG_INFO_ON_ROOT("normal: " << params.normal);
+   
    params.spacing = cfg.getParameter<real_t>("spacing", real_t(1.0) );
    WALBERLA_LOG_INFO_ON_ROOT("spacing: " << params.spacing);
    
@@ -107,6 +111,7 @@ void saveToSQL(const Parameters& params,
 {
    stringProperties["sorting"] = params.sorting;
    
+   
    realProperties["spacing"] = double_c(params.spacing);
    
    realProperties["radius"] = double_c(params.radius);
@@ -145,4 +150,5 @@ void saveToSQL(const Parameters& params,
    
 }
 
+} //namespace mesa_pd
 } //namespace walberla
\ No newline at end of file
diff --git a/apps/benchmarks/GranularGas/Parameters.h b/apps/benchmarks/GranularGas/Parameters.h
index 31ce0cb1238da1ab7dab3c80ed6770bed7b92a09..c765ab2021730aca04dd58bd7741e6f7a16fab67 100644
--- a/apps/benchmarks/GranularGas/Parameters.h
+++ b/apps/benchmarks/GranularGas/Parameters.h
@@ -27,15 +27,17 @@
 #pragma once
 
 #include <core/config/Config.h>
-#include <core/DataTypes.h>
+#include <mesa_pd/data/DataTypes.h>
 
 #include <string>
 
 namespace walberla {
+namespace mesa_pd {
 
 struct Parameters
 {
    std::string sorting = "none";
+   Vec3 normal = Vec3(real_t(1.0), real_t(1.0), real_t(1.0));
    real_t spacing = real_t(1.0);
    real_t radius = real_t(0.5);
    bool bBarrier = false;
@@ -67,4 +69,5 @@ void saveToSQL(const Parameters& params,
                std::map< std::string, double >&            realProperties,
                std::map< std::string, std::string >&       stringProperties );
 
+} //namespace mesa_pd
 } //namespace walberla
\ No newline at end of file
diff --git a/apps/benchmarks/GranularGas/Parameters.templ.cpp b/apps/benchmarks/GranularGas/Parameters.templ.cpp
index 8f9be98116d57a52f6d9b9ea200e6ee0882d36f6..c8aae96b39b6587ca89328685ae2d81250ac0095 100644
--- a/apps/benchmarks/GranularGas/Parameters.templ.cpp
+++ b/apps/benchmarks/GranularGas/Parameters.templ.cpp
@@ -29,6 +29,7 @@
 #include <core/logging/Logging.h>
 
 namespace walberla {
+namespace mesa_pd {
 
 void loadFromConfig(Parameters& params, const Config::BlockHandle& cfg)
 {
@@ -56,4 +57,5 @@ void saveToSQL(const Parameters& params,
    {% endfor %}
 }
 
+} //namespace mesa_pd
 } //namespace walberla
diff --git a/apps/benchmarks/GranularGas/Parameters.templ.h b/apps/benchmarks/GranularGas/Parameters.templ.h
index 29b18935ca1b3ffe708826b141c2cc170ab4c321..6f02004127219d02cf5a9105e23a9986b7711494 100644
--- a/apps/benchmarks/GranularGas/Parameters.templ.h
+++ b/apps/benchmarks/GranularGas/Parameters.templ.h
@@ -27,11 +27,12 @@
 #pragma once
 
 #include <core/config/Config.h>
-#include <core/DataTypes.h>
+#include <mesa_pd/data/DataTypes.h>
 
 #include <string>
 
 namespace walberla {
+namespace mesa_pd {
 
 struct Parameters
 {
@@ -48,4 +49,5 @@ void saveToSQL(const Parameters& params,
                std::map< std::string, double >&            realProperties,
                std::map< std::string, std::string >&       stringProperties );
 
+} //namespace mesa_pd
 } //namespace walberla
diff --git a/apps/benchmarks/GranularGas/generateConfig.py b/apps/benchmarks/GranularGas/generateConfig.py
index 04fdf21220a74cf59e2b61bd2a612f41999024e6..c5e777106d01125fb73a380ff0598653b4c45668 100755
--- a/apps/benchmarks/GranularGas/generateConfig.py
+++ b/apps/benchmarks/GranularGas/generateConfig.py
@@ -5,6 +5,7 @@ from ConfigGenerator import Config
 
 cfg = Config()
 cfg.addParameter("sorting",                "std::string", '"none"')
+cfg.addParameter("normal",                 "Vec3",        "Vec3(real_t(1.0), real_t(1.0), real_t(1.0))")
 cfg.addParameter("spacing",                "real_t",      "real_t(1.0)")
 cfg.addParameter("radius",                 "real_t",      "real_t(0.5)")
 cfg.addParameter("bBarrier",               "bool",        "false")
@@ -28,5 +29,4 @@ cfg.addParameter("metisAlgorithm",         "std::string", '"PART_GEOM_KWAY"' );
 cfg.addParameter("metisWeightsToUse",      "std::string", '"BOTH_WEIGHTS"' );
 cfg.addParameter("metisEdgeSource",        "std::string", '"EDGES_FROM_EDGE_WEIGHTS"' );
 
-
 cfg.generate()