diff --git a/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp b/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp
index 29aba5b387cfa4d521bf2d1712e89589d2764091..b037a5cbf43b6431ca24d74a2b0bf6a8f8192a33 100644
--- a/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp
+++ b/apps/benchmarks/MotionSingleHeavySphere/MotionSingleHeavySphere.cpp
@@ -237,13 +237,14 @@ BoundaryHandling_T * MyBoundaryHandling::operator()( IBlock * const block, const
    return handling;
 }
 
-
-class ForceEval
+/*
+ * Functionality for continuous logging of sphere properties during initial (resting) simulation
+ */
+class RestingSphereForceEvaluator
 {
 public:
-   ForceEval( SweepTimeloop* timeloop,
-              const shared_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & bodyStorageID,
-              uint_t averageFrequency, const std::string & basefolder ) :
+   RestingSphereForceEvaluator( SweepTimeloop* timeloop, const shared_ptr< StructuredBlockStorage > & blocks,
+                                const ConstBlockDataID & bodyStorageID, uint_t averageFrequency, const std::string & basefolder ) :
       timeloop_( timeloop ), blocks_( blocks ), bodyStorageID_( bodyStorageID ), averageFrequency_( averageFrequency )
    {
       // initially write header in file
@@ -325,15 +326,17 @@ private:
 
 };
 
-class SphereEval
+/*
+ * Functionality for continuous logging of sphere properties during moving simulation
+ */
+class MovingSpherePropertyEvaluator
 {
 public:
-   SphereEval( SweepTimeloop* timeloop,
-               const shared_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & bodyStorageID,
-               const std::string & basefolder,
-               const Vector3<real_t> & u_infty, const real_t Galileo, const real_t GalileoSim,
-               const real_t gravity, const real_t viscosity, const real_t diameter, const real_t densityRatio,
-               const uint_t numLBMSubCycles ) :
+   MovingSpherePropertyEvaluator( SweepTimeloop* timeloop, const shared_ptr< StructuredBlockStorage > & blocks,
+                                  const ConstBlockDataID & bodyStorageID, const std::string & basefolder,
+                                  const Vector3<real_t> & u_infty, const real_t Galileo, const real_t GalileoSim,
+                                  const real_t gravity, const real_t viscosity, const real_t diameter,
+                                  const real_t densityRatio, const uint_t numLBMSubCycles ) :
       timeloop_( timeloop ), blocks_( blocks ), bodyStorageID_( bodyStorageID ),
       u_infty_( u_infty ), gravity_( gravity ), viscosity_( viscosity ),
       diameter_( diameter ), densityRatio_( densityRatio ), numLBMSubCycles_( numLBMSubCycles )
@@ -494,95 +497,9 @@ private:
 
 };
 
-class ResetForce
-{
-   public:
-   ResetForce( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & bodyStorageID )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID ) { }
-
-      void operator()()
-      {
-
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::BodyIterator::begin< pe::Sphere >( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end<pe::Sphere>(); ++bodyIt )
-            {
-               bodyIt->resetForceAndTorque();
-            }
-         }
-      }
-   private:
-      const shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-};
-
-class GravitationalForce
-{
-   public:
-   GravitationalForce( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & bodyStorageID,
-                       real_t force )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID ), force_( force ) { }
-
-      void operator()()
-      {
-
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::LocalBodyIterator::begin< pe::Sphere >( *blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end<pe::Sphere>(); ++bodyIt )
-            {
-               bodyIt->addForce ( real_t(0), real_t(0), force_ );
-            }
-         }
-      }
-   private:
-      const shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-      const real_t force_;
-};
-
-
-
 /*
- * When using N LBM steps and one (larger) pe step in a single simulation step, the force applied on the pe bodies in each LBM sep has to be scaled
- * by a factor of 1/N before running the pe simulation. This corresponds to an averaging of the force and torque over the N LBM steps and is said to
- * damp oscillations. Usually, N = 2.
- * See Ladd - "Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
- */
-class AverageForce
-{
-   public:
-      AverageForce( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & bodyStorageID,
-                    const uint_t lbmSubCycles )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID ), invLbmSubCycles_( real_t(1) / real_c( lbmSubCycles ) ) { }
-
-      void operator()()
-      {
-         Vector3<real_t> force( real_t(0) );
-         Vector3<real_t> torque( real_t(0) );
-
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
-            {
-               force  = invLbmSubCycles_ * bodyIt->getForce();
-               torque = invLbmSubCycles_ * bodyIt->getTorque();
-
-               bodyIt->resetForceAndTorque();
-
-               bodyIt->setForce ( force );
-               bodyIt->setTorque( torque );
-            }
-         }
-      }
-   private:
-      const shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-      real_t invLbmSubCycles_;
-};
-
-
-/*
- * Result evaluation
+ * Result evaluation for the written VTK files
+ *
  * input: vel (fluid velocity), u_p (particle velocity), u_infty (inflow velocity)
  *
  * needed data:
@@ -598,45 +515,37 @@ class AverageForce
  *                             vel_r_perp     = vel_r * e_p_perp
  *                             vel_r_Hz_perp  = vel_r * e_p_Hz_perp
  */
-class LogVTKInfo
+class VTKInfoLogger
 {
 public:
 
-   LogVTKInfo( SweepTimeloop* timeloop,
-               const shared_ptr< StructuredBlockStorage > & blocks, const ConstBlockDataID & bodyStorageID,
-               const std::string & baseFolder,
-               const Vector3<real_t> u_infty, const real_t gravity,
-               const real_t diameter, const real_t densityRatio ) :
+   VTKInfoLogger( SweepTimeloop* timeloop, const shared_ptr< StructuredBlockStorage > & blocks,
+                  const ConstBlockDataID & bodyStorageID, const std::string & baseFolder,
+                  const Vector3<real_t> u_infty ) :
    timeloop_( timeloop ), blocks_( blocks ), bodyStorageID_( bodyStorageID ), baseFolder_( baseFolder ), u_infty_( u_infty )
-   {
-      u_p_r_sqr_mag_ = real_t(0);
-      u_ref_ = std::sqrt( std::fabs(densityRatio - real_t(1)) * gravity * diameter );
-   }
+   { }
 
    void operator()()
    {
-      Vector3<real_t> transVel( real_t(0) );
+      Vector3<real_t> u_p( real_t(0) );
       for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
       {
          for( auto bodyIt = pe::LocalBodyIterator::begin<pe::Sphere>(*blockIt, bodyStorageID_ ); bodyIt != pe::LocalBodyIterator::end<pe::Sphere>(); ++bodyIt )
          {
-            transVel = bodyIt->getLinearVel();
+            u_p = bodyIt->getLinearVel();
          }
       }
 
       WALBERLA_MPI_SECTION()
       {
-         mpi::allReduceInplace( transVel[0], mpi::SUM );
-         mpi::allReduceInplace( transVel[1], mpi::SUM );
-         mpi::allReduceInplace( transVel[2], mpi::SUM );
+         mpi::allReduceInplace( u_p[0], mpi::SUM );
+         mpi::allReduceInplace( u_p[1], mpi::SUM );
+         mpi::allReduceInplace( u_p[2], mpi::SUM );
 
       }
 
-      // store particle velocity
-      u_p_ = transVel;
-
-      Vector3<real_t> u_p_r = u_p_ - u_infty_;
-      u_p_r_sqr_mag_ = u_p_r.sqrLength();
+      Vector3<real_t> u_p_r = u_p - u_infty_;
+      real_t u_p_r_sqr_mag = u_p_r.sqrLength();
 
       real_t u_p_H = std::sqrt( u_p_r[0] * u_p_r[0] + u_p_r[1] * u_p_r[1]);
 
@@ -660,8 +569,8 @@ public:
          file.open( filename.c_str() );
          file.precision(8);
 
-         file << "u_p = "           << u_p_ << "\n";
-         file << "u_p_r_2 = "       << u_p_r_sqr_mag_ << "\n\n";
+         file << "u_p = "           << u_p << "\n";
+         file << "u_p_r_2 = "       << u_p_r_sqr_mag << "\n\n";
 
          file << "e_{pH} = "        << e_p_H        << "\n";
          file << "e_{pparallel} = " << e_p_parallel << "\n";
@@ -670,22 +579,6 @@ public:
 
          file.close();
       }
-
-   }
-
-   Vector3<real_t> getParticleVel() const
-   {
-      return u_p_;
-   }
-
-   real_t getURef() const
-   {
-      return u_ref_;
-   }
-
-   real_t getUPRSqrMag() const
-   {
-      return u_p_r_sqr_mag_;
    }
 
 private:
@@ -698,10 +591,6 @@ private:
    std::string baseFolder_;
    Vector3<real_t> u_infty_;
 
-   Vector3< real_t > u_p_;
-   real_t u_p_r_sqr_mag_;
-   real_t u_ref_;
-
 };
 
 class PDFFieldCopy
@@ -869,7 +758,7 @@ int main( int argc, char **argv )
    ///////////////////////////
    const int numProcs = MPIManager::instance()->numProcesses();
 
-   if( numProcs < 16 ) WALBERLA_ABORT("At least 16 MPI ranks have to be used due to horizontal periodicity requirements!");
+   WALBERLA_CHECK(numProcs % 16 != 0, "An integer multiple of 16 MPI ranks has to be used due to horizontal periodicity and domain decomposition requirements!");
 
    uint_t XBlocks, YBlocks, ZBlocks;
    if( numProcs >= 192 )
@@ -1130,11 +1019,11 @@ int main( int argc, char **argv )
    }
 
    // evaluate the drag force acting on the sphere...
-   shared_ptr< ForceEval > forceEval = walberla::make_shared< ForceEval >( &timeloopInit, blocks, bodyStorageID, averageFrequency, basefolder );
-   timeloopInit.addFuncAfterTimeStep( SharedFunctor< ForceEval >( forceEval ), "Evaluating drag force" );
+   shared_ptr< RestingSphereForceEvaluator > forceEval = walberla::make_shared< RestingSphereForceEvaluator >( &timeloopInit, blocks, bodyStorageID, averageFrequency, basefolder );
+   timeloopInit.addFuncAfterTimeStep( SharedFunctor< RestingSphereForceEvaluator >( forceEval ), "Evaluating drag force" );
 
    // ...then reset the force
-   timeloopInit.addFuncAfterTimeStep( ResetForce( blocks, bodyStorageID ), "Resetting force on body");
+   timeloopInit.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesResetter( blocks, bodyStorageID ), "Resetting force on body");
 
    if( vtkIOInit )
    {
@@ -1300,8 +1189,7 @@ int main( int argc, char **argv )
       }
 
       if( numLBMSubCycles != uint_t(1) )
-         timeloop.addFuncAfterTimeStep( AverageForce( blocks, bodyStorageID, numLBMSubCycles ), "Force averaging for several LBM steps" );
-
+         timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesScaler( blocks, bodyStorageID, real_t(1) / real_c(numLBMSubCycles) ), "Force averaging for several LBM steps" );
 
    }else{
 
@@ -1341,15 +1229,16 @@ int main( int argc, char **argv )
       }
 
       if( numLBMSubCycles != uint_t(1) )
-         timeloop.addFuncAfterTimeStep( AverageForce( blocks, bodyStorageID, numLBMSubCycles ), "Force averaging for several LBM steps" );
+         timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesScaler( blocks, bodyStorageID, real_t(1) / real_c(numLBMSubCycles) ), "Force averaging for several LBM steps" );
 
    }
 
    // add gravity
-   timeloop.addFuncAfterTimeStep( GravitationalForce( blocks, bodyStorageID, - gravity * ( densityRatio - real_t(1) ) * diameter * diameter * diameter * math::PI / real_t(6) ), "Add gravitational force" );
+   Vector3<real_t> extForcesOnSphere( real_t(0), real_t(0), - gravity * ( densityRatio - real_t(1) ) * diameter * diameter * diameter * math::PI / real_t(6));
+   timeloop.addFuncAfterTimeStep( pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, extForcesOnSphere ), "Add external forces (gravity and buoyancy)" );
 
    // evaluate the sphere properties
-   timeloop.addFuncAfterTimeStep( SphereEval( &timeloop, blocks, bodyStorageID, basefolder, uInfty, Galileo, GalileoSim, gravity, viscosity, diameter, densityRatio, numLBMSubCycles ), "Evaluate sphere");
+   timeloop.addFuncAfterTimeStep( MovingSpherePropertyEvaluator( &timeloop, blocks, bodyStorageID, basefolder, uInfty, Galileo, GalileoSim, gravity, viscosity, diameter, densityRatio, numLBMSubCycles ), "Evaluate sphere");
 
    // advance pe rigid body simulation
    timeloop.addFuncAfterTimeStep( pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_c(numLBMSubCycles), numPeSubCycles ), "pe Time Step" );
@@ -1363,8 +1252,7 @@ int main( int argc, char **argv )
    pdfFieldVTK->addBeforeFunction( pdfGhostLayerSync );
 
    // function to output plane infos for vtk output
-   shared_ptr< LogVTKInfo > logVTKInfo = walberla::make_shared< LogVTKInfo >( &timeloop, blocks, bodyStorageID, basefolder, uInfty, gravity, diameter, densityRatio );
-   pdfFieldVTK->addBeforeFunction( SharedFunctor< LogVTKInfo >( logVTKInfo ) );
+   pdfFieldVTK->addBeforeFunction(VTKInfoLogger( &timeloop, blocks, bodyStorageID, basefolder, uInfty ));
 
    // create folder for log_vtk files to not pollute the basefolder
    boost::filesystem::path tpath2( basefolder+"/log_vtk" );
diff --git a/src/pe_coupling/utility/BodiesForceTorqueContainer.h b/src/pe_coupling/utility/BodiesForceTorqueContainer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a32dc589d99d0a1138b4e5d1cab974338dc0933e
--- /dev/null
+++ b/src/pe_coupling/utility/BodiesForceTorqueContainer.h
@@ -0,0 +1,142 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file BodiesForceTorqueContainer.h
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/math/Vector3.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "pe/rigidbody/BodyIterators.h"
+#include "pe/synchronization/SyncForces.h"
+
+#include <map>
+#include <vector>
+
+namespace walberla {
+namespace pe_coupling {
+
+class BodiesForceTorqueContainer
+{  
+public:
+
+   BodiesForceTorqueContainer( const shared_ptr<StructuredBlockStorage> & blockStorage, const BlockDataID & bodyStorageID )
+   : blockStorage_( blockStorage ), bodyStorageID_( bodyStorageID )
+     { }
+
+   void operator()()
+   {
+      store();
+   }
+
+   void store()
+   {
+      // clear map
+      clear();
+
+      // sum up all forces/torques from shadow copies on local body (owner)
+      pe::reduceForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
+      // send total forces/torques to shadow owners
+      pe::distributeForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
+
+      // (re-)build map
+      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            auto & f = bodyForceTorqueMap_[ bodyIt->getSystemID() ];
+
+            // only add if body has not been added already before (from another block)
+            if( f.empty() )
+            {
+               const auto & force = bodyIt->getForce();
+               f.push_back( force[0] );
+               f.push_back( force[1] );
+               f.push_back( force[2] );
+
+               const auto & torque = bodyIt->getTorque();
+               f.push_back( torque[0] );
+               f.push_back( torque[1] );
+               f.push_back( torque[2] );
+            }
+
+            // reset of force/torque on remote bodies necessary to erase the multiple occurrences of forces/torques on bodies
+            // (due to call to distributeForces() before)
+            if ( bodyIt->isRemote() )
+            {
+               bodyIt->resetForceAndTorque();
+            }
+         }
+      }
+
+   }
+
+   void setOnBodies()
+   {
+      // owning process sets the force/torque on the bodies
+      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            const auto &f = bodyForceTorqueMap_[bodyIt->getSystemID()];
+            WALBERLA_ASSERT( !f.empty(), "When attempting to set force/torque on local body " << bodyIt->getSystemID() << " at position " << bodyIt->getPosition() << ", body was not found in map!");
+            bodyIt->addForce ( f[0], f[1], f[2] );
+            bodyIt->addTorque( f[3], f[4], f[5] );
+         }
+      }
+   }
+
+   void clear()
+   {
+      bodyForceTorqueMap_.clear();
+   }
+
+   void swap( BodiesForceTorqueContainer & other )
+   {
+      std::swap( this->bodyForceTorqueMap_, other.bodyForceTorqueMap_ );
+   }
+
+private:
+
+   shared_ptr<StructuredBlockStorage> blockStorage_;
+   const BlockDataID bodyStorageID_;
+   std::map< walberla::id_t, std::vector<real_t> > bodyForceTorqueMap_;
+};
+
+
+class BodyContainerSwapper
+{
+public:
+   BodyContainerSwapper( const shared_ptr<BodiesForceTorqueContainer> & cont1,
+                         const shared_ptr<BodiesForceTorqueContainer> & cont2 )
+   : cont1_( cont1 ), cont2_( cont2 )
+   { }
+
+   void operator()()
+   {
+      cont1_->swap( *cont2_ );
+   }
+
+private:
+   shared_ptr<BodiesForceTorqueContainer> cont1_;
+   shared_ptr<BodiesForceTorqueContainer> cont2_;
+};
+
+} // namespace pe_coupling
+} // namespace walberla
diff --git a/src/pe_coupling/utility/ForceOnBodiesAdder.h b/src/pe_coupling/utility/ForceOnBodiesAdder.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d6e3f4ead588cc83b1fbc147e76fed8d4a1d38c
--- /dev/null
+++ b/src/pe_coupling/utility/ForceOnBodiesAdder.h
@@ -0,0 +1,66 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ForceOnBodiesAdder.h
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/math/Vector3.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "pe/rigidbody/BodyIterators.h"
+
+
+namespace walberla {
+namespace pe_coupling {
+
+class ForceOnBodiesAdder
+{  
+public:
+
+   ForceOnBodiesAdder( const shared_ptr<StructuredBlockStorage> & blockStorage, const BlockDataID & bodyStorageID,
+                       const Vector3<real_t> & force )
+   : blockStorage_( blockStorage ), bodyStorageID_( bodyStorageID ), force_( force )
+     { }
+
+   // set a constant force on all (only local, to avoid force duplication) bodies
+   void operator()()
+   {
+      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->addForce ( force_ );
+         }
+      }
+   }
+
+   void updateForce( const Vector3<real_t> & newForce )
+   {
+      force_ = newForce;
+   }
+
+private:
+
+   shared_ptr<StructuredBlockStorage> blockStorage_;
+   const BlockDataID bodyStorageID_;
+   Vector3<real_t> force_;
+};
+
+} // namespace pe_coupling
+} // namespace walberla
diff --git a/src/pe_coupling/utility/ForceTorqueOnBodiesResetter.h b/src/pe_coupling/utility/ForceTorqueOnBodiesResetter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c42e96b0c9cd6bdbac95688654aa9d4b1b979e17
--- /dev/null
+++ b/src/pe_coupling/utility/ForceTorqueOnBodiesResetter.h
@@ -0,0 +1,60 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ForceTorqueOnBodiesResetter.h
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/math/Vector3.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "pe/rigidbody/BodyIterators.h"
+
+
+namespace walberla {
+namespace pe_coupling {
+
+class ForceTorqueOnBodiesResetter
+{  
+public:
+
+   ForceTorqueOnBodiesResetter( const shared_ptr<StructuredBlockStorage> & blockStorage, const BlockDataID & bodyStorageID )
+   : blockStorage_( blockStorage ), bodyStorageID_( bodyStorageID )
+     { }
+
+   // resets forces and torques on all (local and remote) bodies
+   void operator()()
+   {
+      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            bodyIt->resetForceAndTorque();
+         }
+      }
+   }
+
+
+private:
+
+   shared_ptr<StructuredBlockStorage> blockStorage_;
+   const BlockDataID bodyStorageID_;
+};
+
+} // namespace pe_coupling
+} // namespace walberla
diff --git a/src/pe_coupling/utility/ForceTorqueOnBodiesScaler.h b/src/pe_coupling/utility/ForceTorqueOnBodiesScaler.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f07300f3c68c43fb1ded26723cbddb69b225b60
--- /dev/null
+++ b/src/pe_coupling/utility/ForceTorqueOnBodiesScaler.h
@@ -0,0 +1,72 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ForceTorqueOnBodiesScaler.h
+//! \ingroup pe_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "core/math/Vector3.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include "pe/rigidbody/BodyIterators.h"
+
+
+namespace walberla {
+namespace pe_coupling {
+
+// scales force/torquew on all bodies (local and remote) by a constant scalar value
+// can e.g. be used to average the force/torque over two time steps
+class ForceTorqueOnBodiesScaler
+{  
+public:
+
+   ForceTorqueOnBodiesScaler( const shared_ptr<StructuredBlockStorage> & blockStorage, const BlockDataID & bodyStorageID,
+                              const real_t & scalingFactor )
+   : blockStorage_( blockStorage ), bodyStorageID_( bodyStorageID ), scalingFactor_( scalingFactor )
+     { }
+
+   // resets forces and torques on all (local and remote) bodies
+   void operator()()
+   {
+      Vector3<real_t> force(real_t(0));
+      Vector3<real_t> torque(real_t(0));
+      for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+      {
+         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
+         {
+            force  = scalingFactor_ * bodyIt->getForce();
+            torque = scalingFactor_ * bodyIt->getTorque();
+
+            bodyIt->resetForceAndTorque();
+
+            bodyIt->setForce ( force );
+            bodyIt->setTorque( torque );
+         }
+      }
+   }
+
+
+private:
+
+   shared_ptr<StructuredBlockStorage> blockStorage_;
+   const BlockDataID bodyStorageID_;
+   const real_t scalingFactor_;
+};
+
+} // namespace pe_coupling
+} // namespace walberla
diff --git a/src/pe_coupling/utility/TimeStep.h b/src/pe_coupling/utility/TimeStep.h
index b282766b3e69810d8f7e56fd94b60ec3778ebbce..791274c233566f3b81c7b5e43d1d2414367e72cb 100644
--- a/src/pe_coupling/utility/TimeStep.h
+++ b/src/pe_coupling/utility/TimeStep.h
@@ -1,15 +1,15 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -44,92 +44,110 @@ class TimeStep
 {
 public:
 
-    explicit TimeStep( const shared_ptr<StructuredBlockStorage> & blockStorage,
-                       const BlockDataID & bodyStorageID,
-                       pe::cr::ICR & collisionResponse,
-                       boost::function<void (void)> synchronizeFunc,
-                       const real_t timestep = real_t(1), const uint_t intermediateSteps = uint_c(1) )
-        : timestep_( timestep )
-        , intermediateSteps_( ( intermediateSteps == 0 ) ? uint_c(1) : intermediateSteps )
-        , blockStorage_( blockStorage )
-        , bodyStorageID_(bodyStorageID)
-        , collisionResponse_(collisionResponse)
-        , synchronizeFunc_(synchronizeFunc)
-    {}
-
-    void operator()()
-    {
-        if( intermediateSteps_ == 1 )
-        {
-            collisionResponse_.timestep( timestep_ );
-            synchronizeFunc_( );
-        }
-        else
-        {
-            // sum up all forces from shadow copies on local body (owner)
-            pe::reduceForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
-            // send total forces to shadow owners
-            pe::distributeForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
-
-            // generate map from all known bodies (process local) to total forces
-            std::map< walberla::id_t, std::vector< real_t > > forceMap;
-            for (auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt)
+   explicit TimeStep( const shared_ptr<StructuredBlockStorage> & blockStorage,
+                      const BlockDataID & bodyStorageID,
+                      pe::cr::ICR & collisionResponse,
+                      boost::function<void (void)> synchronizeFunc,
+                      const real_t timeStep = real_t(1), const uint_t intermediateSteps = uint_c(1) )
+         : timeStep_( timeStep )
+         , intermediateSteps_( ( intermediateSteps == 0 ) ? uint_c(1) : intermediateSteps )
+         , blockStorage_( blockStorage )
+         , bodyStorageID_(bodyStorageID)
+         , collisionResponse_(collisionResponse)
+         , synchronizeFunc_(synchronizeFunc)
+   {}
+
+   void operator()()
+   {
+      if( intermediateSteps_ == 1 )
+      {
+         collisionResponse_.timestep( timeStep_ );
+         synchronizeFunc_( );
+      }
+      else
+      {
+         // during the intermediate time steps of the collision response, the currently acting forces
+         // (interaction forces, gravitational force, ...) have to remain constant.
+         // Since they are reset after the call to collision response, they have to be stored explicitly before.
+         // Then they are set again after each intermediate step.
+
+         // sum up all forces/torques from shadow copies on local body (owner)
+         pe::reduceForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
+         // send total forces/torques to shadow owners
+         pe::distributeForces( blockStorage_->getBlockStorage(), bodyStorageID_ );
+
+         // generate map from all known bodies (process local) to total forces
+         std::map< walberla::id_t, std::vector< real_t > > forceMap;
+         for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+         {
+            // iterate over local and remote bodies and store force/torque in map
+            // Remote bodies are required since during the then following collision response time steps,
+            // bodies can change owner ship (i.e. a now remote body could become a local body ).
+            // Since only the owning process re-sets the force/torque later, remote body values have to be stores as well.
+            for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
             {
-                for( auto bodyIt = pe::BodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
-                {
-                    auto & f = forceMap[ bodyIt->getSystemID() ];
-
-                    const auto & force = bodyIt->getForce();
-                    f.push_back( force[0] );
-                    f.push_back( force[1] );
-                    f.push_back( force[2] );
-
-                    const auto & torque = bodyIt->getTorque();
-                    f.push_back( torque[0] );
-                    f.push_back( torque[1] );
-                    f.push_back( torque[2] );
-
-                    if ( bodyIt->isRemote() )
-                    {
-                        bodyIt->resetForceAndTorque();
-                    }
-                }
+               auto & f = forceMap[ bodyIt->getSystemID() ];
+
+               // only add if body has not been added already before (from another block)
+               if( f.empty() )
+               {
+                  const auto & force = bodyIt->getForce();
+                  f.push_back( force[0] );
+                  f.push_back( force[1] );
+                  f.push_back( force[2] );
+
+                  const auto & torque = bodyIt->getTorque();
+                  f.push_back( torque[0] );
+                  f.push_back( torque[1] );
+                  f.push_back( torque[2] );
+               }
+
+               // reset of force/torque on remote bodies necessary to erase the multiple occurrences of forces/torques on bodies
+               // (due to call to distributeForces() before)
+               if ( bodyIt->isRemote() )
+               {
+                  bodyIt->resetForceAndTorque();
+               }
             }
-
-            // perform pe time steps
-            const real_t subTimestep = timestep_ / real_c( intermediateSteps_ );
-            for( uint_t i = 0; i != intermediateSteps_; ++i )
+         }
+
+         // perform pe time steps
+         const real_t subTimestep = timeStep_ / real_c( intermediateSteps_ );
+         for( uint_t i = 0; i != intermediateSteps_; ++i )
+         {
+            // in the first set forces on local bodies are already set by force synchronization
+            if( i != 0 )
             {
-                // in the first set forces on local bodies are already set by force synchronization
-                if( i != 0 ) {
-                    for (auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt)
-                    {
-                        for( auto body = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID_); body != pe::LocalBodyIterator::end(); ++body )
-                        {
-                            const auto & f = forceMap[ body->getSystemID() ];
-                            body->addForce ( f[0], f[1], f[2] );
-                            body->addTorque( f[3], f[4], f[5] );
-                        }
-                    }
-                }
-
-                collisionResponse_.timestep( subTimestep );
-                synchronizeFunc_( );
+               for( auto blockIt = blockStorage_->begin(); blockIt != blockStorage_->end(); ++blockIt )
+               {
+                  // only owning process sets force/torque on bodies
+                  for( auto bodyIt = pe::LocalBodyIterator::begin(*blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
+                  {
+                     const auto & f = forceMap[ bodyIt->getSystemID() ];
+                     WALBERLA_ASSERT( !f.empty(), "When attempting to set force/torque on local body " << bodyIt->getSystemID() << " at position " << bodyIt->getPosition() << ", body was not found in map!");
+                     bodyIt->addForce ( f[0], f[1], f[2] );
+                     bodyIt->addTorque( f[3], f[4], f[5] );
+                  }
+               }
             }
-        }
-    }
+
+            collisionResponse_.timestep( subTimestep );
+            synchronizeFunc_( );
+         }
+      }
+   }
 
 protected:
 
-    const real_t timestep_;
+   const real_t timeStep_;
 
-    const uint_t intermediateSteps_;
+   const uint_t intermediateSteps_;
 
-    shared_ptr<StructuredBlockStorage> blockStorage_;
-    const BlockDataID &  bodyStorageID_;
+   shared_ptr<StructuredBlockStorage> blockStorage_;
+   const BlockDataID &  bodyStorageID_;
 
-    pe::cr::ICR & collisionResponse_;
-    boost::function<void (void)> synchronizeFunc_;
+   pe::cr::ICR & collisionResponse_;
+   boost::function<void (void)> synchronizeFunc_;
 
 }; // class TimeStep
 
diff --git a/src/pe_coupling/utility/all.h b/src/pe_coupling/utility/all.h
index 8713b748a4842e332b79826d11a991969ed9f936..9bb040405c7171d88c01eb64a78c19fb1feafaf2 100644
--- a/src/pe_coupling/utility/all.h
+++ b/src/pe_coupling/utility/all.h
@@ -22,5 +22,9 @@
 
 #pragma once
 
+#include "BodiesForceTorqueContainer.h"
+#include "ForceOnBodiesAdder.h"
+#include "ForceTorqueOnBodiesResetter.h"
+#include "ForceTorqueOnBodiesScaler.h"
 #include "LubricationCorrection.h"
 #include "TimeStep.h"
diff --git a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp
index faa683155b2dd42de1be1587643d836c1d8fc8dc..095fed5f11f8dcc5d0ac7f1b40a88f6827d31821 100644
--- a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp
+++ b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEM.cpp
@@ -58,6 +58,7 @@
 
 #include "pe_coupling/mapping/all.h"
 #include "pe_coupling/momentum_exchange_method/all.h"
+#include "pe_coupling/utility/all.h"
 
 #include <vector>
 #include <iomanip>
@@ -331,30 +332,6 @@ private:
 
 };
 
-
-class ResetForce
-{
-public:
-   ResetForce( const shared_ptr< StructuredBlockStorage > & blocks,
-               const BlockDataID & bodyStorageID )
-   : blocks_( blocks ), bodyStorageID_( bodyStorageID )
-   { }
-
-   void operator()()
-   {
-      for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-      {
-         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
-         {
-            bodyIt->resetForceAndTorque();
-         }
-      }
-   }
-private:
-   shared_ptr< StructuredBlockStorage > blocks_;
-   const BlockDataID bodyStorageID_;
-};
-
 class PDFFieldCopy
 {
 public:
@@ -583,7 +560,7 @@ int main( int argc, char **argv )
                   << AfterFunction( SharedFunctor< DragForceEvaluator >( forceEval ), "drag force evaluation" );
 
    // resetting force
-   timeloop.addFuncAfterTimeStep( ResetForce( blocks, bodyStorageID ), "reset force on sphere");
+   timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesResetter( blocks, bodyStorageID ), "reset force on sphere");
 
    timeloop.addFuncAfterTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );
 
diff --git a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp
index 2772c305e8ab51b7446ecbd63af9392938bf5927..40fb413f62730015652120d39ecb2c747c261a7c 100644
--- a/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp
+++ b/tests/pe_coupling/momentum_exchange_method/DragForceSphereMEMRefinement.cpp
@@ -61,6 +61,7 @@
 
 #include "pe_coupling/mapping/all.h"
 #include "pe_coupling/momentum_exchange_method/all.h"
+#include "pe_coupling/utility/all.h"
 
 #include "vtk/all.h"
 #include "field/vtk/all.h"
@@ -405,30 +406,6 @@ class ForceEval
 };
 
 
-class ResetForce
-{
-   public:
-      ResetForce( const shared_ptr< StructuredBlockStorage > & blocks,
-                  const BlockDataID & bodyStorageID )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID )
-      { }
-
-      void operator()()
-      {
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
-            {
-                bodyIt->resetForceAndTorque();
-            }
-         }
-      }
-private:
-      shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-};
-
-
 //////////
 // MAIN //
 //////////
@@ -596,7 +573,7 @@ int main( int argc, char **argv )
    timeloop.addFuncAfterTimeStep( makeSharedFunctor( forceEval ), "drag force evaluation" );
 
    // resetting force
-   timeloop.addFuncAfterTimeStep( ResetForce( blocks, bodyStorageID ), "reset force on sphere");
+   timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesResetter( blocks, bodyStorageID ), "reset force on sphere");
 
    if( vtkIO )
    {
diff --git a/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp b/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp
index 752d57d302bea9ca299ad22171f38254154cf704..3bff481c7b2f27b128219a193d0874c63d30b895 100644
--- a/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp
+++ b/tests/pe_coupling/momentum_exchange_method/SegreSilberbergMEM.cpp
@@ -241,11 +241,11 @@ void initPDF(const shared_ptr< StructuredBlockStorage > & blocks, const BlockDat
          // obtain the physical coordinate of the center of the current cell
          const Vector3< real_t > p = blocks->getBlockLocalCellCenter( *block, *cell );
 
-         const real_t rho = real_c(1);
-         const real_t height = real_c(setup.zlength) * real_c(0.5);
+         const real_t rho = real_t(1);
+         const real_t height = real_c(setup.zlength) * real_t(0.5);
 
          // parabolic Poiseuille profile for a given external force
-         Vector3< real_t > velocity ( setup.forcing / ( real_c(2) * setup.nu) * (height * height - ( p[2] - height ) * ( p[2] - height ) ), real_c(0), real_c(0) );
+         Vector3< real_t > velocity ( setup.forcing / ( real_t(2) * setup.nu) * (height * height - ( p[2] - height ) * ( p[2] - height ) ), real_t(0), real_t(0) );
          pdf->setToEquilibrium( *cell, velocity, rho );
       }
    }
@@ -385,72 +385,8 @@ class SteadyStateCheck
       real_t posNew_;
 };
 
-// When using N LBM steps and one (larger) pe step in a single simulation step, the force applied on the pe bodies in each LBM sep has to be scaled
-// by a factor of 1/N before running the pe simulation. This corresponds to an averaging of the force and torque over the N LBM steps and is said to
-// damp oscillations. Usually, N = 2.
-// See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
-class AverageForce
-{
-   public:
-      AverageForce( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & bodyStorageID,
-                    const uint_t lbmSubCycles )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID ), invLbmSubCycles_( real_c(1) / real_c( lbmSubCycles ) ) { }
-
-      void operator()()
-      {
-         pe::Vec3 force  = pe::Vec3(0.0);
-         pe::Vec3 torque = pe::Vec3(0.0);
-
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
-            {
-               force  = invLbmSubCycles_ * bodyIt->getForce();
-               torque = invLbmSubCycles_ * bodyIt->getTorque();
-
-               bodyIt->resetForceAndTorque();
-
-               bodyIt->setForce ( force );
-               bodyIt->setTorque( torque );
-            }
-         }
-      }
-   private:
-      shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-      real_t invLbmSubCycles_;
-};
 
 
-// The flow in the channel is here driven by a body force and is periodic
-// Thus, the pressure gradient, usually encountered in a channel flow, is not present here
-// The buoyancy force on the body due to this pressure gradient has to be added 'artificially'
-// F_{buoyancy} = - V_{body} * grad ( p ) = V_{body} * \rho_{fluid} *  a
-// ( V_{body} := volume of body,  a := acceleration driving the flow )
-class BuoyancyForce
-{
-   public:
-   BuoyancyForce( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & bodyStorageID,
-                  const Vector3<real_t> pressureForce )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID ), pressureForce_( pressureForce ) { }
-
-      void operator()()
-      {
-
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
-            {
-               bodyIt->addForce ( pressureForce_ );
-            }
-         }
-      }
-   private:
-      shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-      const Vector3<real_t> pressureForce_;
-};
-
 class PDFFieldCopy
 {
 public:
@@ -523,6 +459,7 @@ int main( int argc, char **argv )
    bool MO_MR            = false; //use multireflection boundary condition
    bool eanReconstructor = false; //use equilibrium and non-equilibrium reconstructor
    bool extReconstructor = false; //use extrapolation reconstructor
+   bool averageForceTorqueOverTwoTimSteps = true;
 
    for( int i = 1; i < argc; ++i )
    {
@@ -535,6 +472,7 @@ int main( int argc, char **argv )
       if( std::strcmp( argv[i], "--MO_MR"  )           == 0 ) { MO_MR            = true; continue; }
       if( std::strcmp( argv[i], "--eanReconstructor" ) == 0 ) { eanReconstructor = true; continue; }
       if( std::strcmp( argv[i], "--extReconstructor" ) == 0 ) { extReconstructor = true; continue; }
+      if( std::strcmp( argv[i], "--noForceAveraging" ) == 0 ) { averageForceTorqueOverTwoTimSteps = false; continue; }
       WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
    }
 
@@ -544,28 +482,26 @@ int main( int argc, char **argv )
 
    Setup setup;
 
-   setup.particleDiam =  real_c( 12 );                          // particle diameter
-   setup.rho_p = real_c( 1 );                                   // particle density
-   setup.tau = real_c( 1.4 );                                   // relaxation time
-   setup.nu = (real_c(2) * setup.tau - real_c(1) ) / real_c(6); // viscosity in lattice units
-   setup.timesteps = functest ? 1 : ( shortrun ? uint_c(200) : uint_c( 250000 ) ); // maximum number of time steps
-   setup.zlength = uint_c( 4.0 * setup.particleDiam );          // zlength in lattice cells of the channel
+   setup.particleDiam =  real_t( 12 );                          // particle diameter
+   setup.rho_p = real_t( 1 );                                   // particle density
+   setup.tau = real_t( 1.4 );                                   // relaxation time
+   setup.nu = (real_t(2) * setup.tau - real_t(1) ) / real_t(6); // viscosity in lattice units
+   setup.timesteps = functest ? 1 : ( shortrun ? uint_t(200) : uint_t( 250000 ) ); // maximum number of time steps
+   setup.zlength = uint_c( real_t(4) * setup.particleDiam );          // zlength in lattice cells of the channel
    setup.xlength = setup.zlength;                               // xlength in lattice cells of the channel
    setup.ylength = setup.zlength;                               // ylength in lattice cells of the channel
-   setup.Re = real_c( 13 );                                     // bulk Reynoldsnumber = uMean * width / nu
+   setup.Re = real_t( 13 );                                     // bulk Reynoldsnumber = uMean * width / nu
 
-   const real_t particleRadius = real_c(0.5) * setup.particleDiam; // particle radius
-   const real_t dx = real_c(1);                                    // lattice grid spacing
-   const uint_t numLbmSubCycles = uint_c(2);                       // number of LBM subcycles, i.e. number of LBM steps until one pe step is carried out
-   const real_t dt_pe       = real_c(numLbmSubCycles);             // time step size for PE (LU): here 2 times as large as for LBM
-   const uint_t pe_interval = uint_c(1);                           // subcycling interval for PE: only 1 PE step
-   const real_t omega = real_c(1) / setup.tau;                     // relaxation rate
-   const real_t convergenceLimit = real_c( 1e-5 );                 // tolerance for relative change in position
-   setup.checkFrequency = uint_c( real_c(100) / dt_pe );           // evaluate the position only every checkFrequency LBM time steps
+   const real_t particleRadius = real_t(0.5) * setup.particleDiam; // particle radius
+   const real_t dx = real_t(1);                                    // lattice grid spacing
+   const uint_t numPeSubcycles = uint_c(1);                        // number of pe subcycles per LBM step
+   const real_t omega = real_t(1) / setup.tau;                     // relaxation rate
+   const real_t convergenceLimit = real_t( 1e-5 );                 // tolerance for relative change in position
+   setup.checkFrequency = uint_t( 100 );           // evaluate the position only every checkFrequency LBM time steps
 
    const real_t uMean = setup.Re * setup.nu / real_c( setup.zlength );
-   const real_t uMax = real_c(2) * uMean;
-   setup.forcing = uMax * ( real_c(2) * setup.nu ) / real_c( real_c(3./4.) * real_c(setup.zlength) * real_c(setup.zlength) ); // forcing to drive the flow
+   const real_t uMax = real_t(2) * uMean;
+   setup.forcing = uMax * ( real_t(2) * setup.nu ) / ( real_t(3./4.) * real_c(setup.zlength) * real_c(setup.zlength) ); // forcing to drive the flow
    setup.Re_p = uMean * particleRadius * particleRadius / ( setup.nu * real_c( setup.zlength ) );  // particle Reynolds number = uMean * r * r / ( nu * width )
 
    if( fileIO || vtkIO ){
@@ -581,9 +517,9 @@ int main( int argc, char **argv )
    // BLOCK STRUCTURE SETUP //
    ///////////////////////////
 
-   setup.nBlocks[0] = uint_c(3);
-   setup.nBlocks[1] = uint_c(3);
-   setup.nBlocks[2] = (processes == 18) ? uint_c(2) : uint_c(1);
+   setup.nBlocks[0] = uint_t(3);
+   setup.nBlocks[1] = uint_t(3);
+   setup.nBlocks[2] = (processes == 18) ? uint_t(2) : uint_t(1);
    setup.cellsPerBlock[0] = setup.xlength / setup.nBlocks[0];
    setup.cellsPerBlock[1] = setup.ylength / setup.nBlocks[1];
    setup.cellsPerBlock[2] = setup.zlength / setup.nBlocks[2];
@@ -609,7 +545,7 @@ int main( int argc, char **argv )
    pe::cr::DEM cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, NULL);
 
    // set up synchronization procedure
-   const real_t overlap = real_c( 1.5 ) * dx;
+   const real_t overlap = real_t( 1.5 ) * dx;
    boost::function<void(void)> syncCall;
    if (!syncShadowOwners)
    {
@@ -622,19 +558,19 @@ int main( int argc, char **argv )
    // create pe bodies
 
    // bounding planes (global)
-   const auto planeMaterial = pe::createMaterial( "myPlaneMat", real_c(8920), real_c(0), real_c(1), real_c(1), real_c(0), real_c(1), real_c(1), real_c(0), real_c(0) );
+   const auto planeMaterial = pe::createMaterial( "myPlaneMat", real_t(8920), real_t(0), real_t(1), real_t(1), real_t(0), real_t(1), real_t(1), real_t(0), real_t(0) );
    pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,1), Vector3<real_t>(0,0,0), planeMaterial );
    pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,-1), Vector3<real_t>(0,0,real_c(setup.zlength)), planeMaterial );
 
    // add the sphere
-   const auto sphereMaterial = pe::createMaterial( "mySphereMat", setup.rho_p , real_c(0.5), real_c(0.1), real_c(0.1), real_c(0.24), real_c(200), real_c(200), real_c(0), real_c(0) );
-   Vector3<real_t> position( real_c(setup.xlength) * real_c(0.5), real_c(setup.ylength) * real_c(0.5), real_c(setup.zlength) * real_c(0.5) - real_c(1) );
+   const auto sphereMaterial = pe::createMaterial( "mySphereMat", setup.rho_p , real_t(0.5), real_t(0.1), real_t(0.1), real_t(0.24), real_t(200), real_t(200), real_t(0), real_t(0) );
+   Vector3<real_t> position( real_c(setup.xlength) * real_t(0.5), real_c(setup.ylength) * real_t(0.5), real_c(setup.zlength) * real_t(0.5) - real_t(1) );
    auto sphere = pe::createSphere( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, position, particleRadius, sphereMaterial );
    if( sphere != NULL )
    {
-      real_t height = real_c( 0.5 ) * real_c( setup.zlength );
+      real_t height = real_t( 0.5 ) * real_c( setup.zlength );
       // set sphere velocity to undisturbed fluid velocity at sphere center
-      sphere->setLinearVel( setup.forcing/( real_c(2) * setup.nu) * ( height * height - ( position[2] - height ) * ( position[2] - height ) ), real_c(0), real_c(0) );
+      sphere->setLinearVel( setup.forcing/( real_t(2) * setup.nu) * ( height * height - ( position[2] - height ) * ( position[2] - height ) ), real_t(0), real_t(0) );
    }
 
    syncCall();
@@ -644,16 +580,16 @@ int main( int argc, char **argv )
    ////////////////////////
 
    // create the lattice model
-   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ), SimpleConstant( Vector3<real_t> ( setup.forcing, real_c(0), real_c(0) ) ) );
+   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ), SimpleConstant( Vector3<real_t> ( setup.forcing, real_t(0), real_t(0) ) ) );
 
    // add PDF field
    BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (zyxf)", latticeModel,
-                                                                         Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1),
+                                                                         Vector3< real_t >( real_t(0) ), real_t(1),
                                                                          uint_t(1), field::zyxf );
 
    // add PDF field, for MR boundary condition only
    BlockDataID pdfFieldPreColID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, " pre collision pdf field", latticeModel,
-                                                                               Vector3< real_t >( real_c(0), real_c(0), real_c(0) ), real_c(1),
+                                                                               Vector3< real_t >( real_t(0) ), real_t(1),
                                                                                uint_t(1), field::zyxf );
 
    // initialize already with the Poiseuille flow profile
@@ -733,48 +669,71 @@ int main( int argc, char **argv )
                       ( blocks, boundaryHandlingID, bodyStorageID, bodyFieldID, reconstructor, FormerMO_Flag, Fluid_Flag  ), "PDF Restore" );
    }
 
-   for( uint_t lbmSubCycle = uint_c(0); lbmSubCycle < numLbmSubCycles; ++lbmSubCycle )
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   boost::function<void(void)> storeForceTorqueInCont1 = boost::bind(&pe_coupling::BodiesForceTorqueContainer::store, bodiesFTContainer1);
+   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
+   boost::function<void(void)> setForceTorqueOnBodiesFromCont2 = boost::bind(&pe_coupling::BodiesForceTorqueContainer::setOnBodies, bodiesFTContainer2);
+
+   bodiesFTContainer2->store();
+
+   if( MO_MR )
    {
-      if( MO_MR )
-      {
-         // copy pdfs to have the pre collision PDFs available, needed for MR boundary treatment
-         timeloop.add() << Sweep( PDFFieldCopy( pdfFieldID, pdfFieldPreColID ), "pdf field copy" );
+      // copy pdfs to have the pre collision PDFs available, needed for MR boundary treatment
+      timeloop.add() << Sweep( PDFFieldCopy( pdfFieldID, pdfFieldPreColID ), "pdf field copy" );
 
-         auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
+      auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
 
-         // collision sweep
-         timeloop.add() << Sweep( lbm::makeCollideSweep( sweep ), "cell-wise LB sweep (collide)" );
+      // collision sweep
+      timeloop.add() << Sweep( lbm::makeCollideSweep( sweep ), "cell-wise LB sweep (collide)" );
 
-         // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
-         timeloop.add() << BeforeFunction( commFunction, "LBM Communication" )
-                        << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
+      // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
+      timeloop.add() << BeforeFunction( commFunction, "LBM Communication" )
+                     << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
 
-         // streaming
-         timeloop.add() << Sweep( lbm::makeStreamSweep( sweep ), "cell-wise LB sweep (stream)" );
-      }
-      else
-      {
-         // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
-         timeloop.add() << BeforeFunction( commFunction, "LBM Communication" )
-                        << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
+      // streaming
+      timeloop.add() << Sweep( lbm::makeStreamSweep( sweep ), "cell-wise LB sweep (stream)" );
+   }
+   else
+   {
+      // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
+      timeloop.add() << BeforeFunction( commFunction, "LBM Communication" )
+                     << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
 
-         // stream + collide LBM step
-         timeloop.add()
-            << Sweep( makeSharedSweep( lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag ) ), "cell-wise LB sweep" );
-      }
+      // stream + collide LBM step
+      timeloop.add()
+         << Sweep( makeSharedSweep( lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag ) ), "cell-wise LB sweep" );
    }
 
-   // average the forces acting on the bodies over the number of LBM steps
-   timeloop.addFuncAfterTimeStep( AverageForce( blocks, bodyStorageID, numLbmSubCycles ), "Force averaging for several LBM steps" );
+   // Averaging the force/torque over two time steps is said to damp oscillations of the interaction force/torque.
+   // See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
+   if( averageForceTorqueOverTwoTimSteps ) {
+
+      // store force/torque from hydrodynamic interactions in container1
+      timeloop.addFuncAfterTimeStep(storeForceTorqueInCont1, "Force Storing");
+
+      // set force/torque from previous time step (in container2)
+      timeloop.addFuncAfterTimeStep(setForceTorqueOnBodiesFromCont2, "Force setting");
+
+      // average the force/torque by scaling it with factor 1/2
+      timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesScaler(blocks, bodyStorageID, real_t(0.5)),  "Force averaging");
+
+      // swap containers
+      timeloop.addFuncAfterTimeStep( pe_coupling::BodyContainerSwapper( bodiesFTContainer1, bodiesFTContainer2 ), "Swap FT container" );
+
+   }
 
    // add pressure force contribution
-   timeloop.addFuncAfterTimeStep( BuoyancyForce( blocks, bodyStorageID,
-                                                 Vector3<real_t>( math::M_PI / real_c(6) * setup.forcing * setup.particleDiam * setup.particleDiam * setup.particleDiam ,
-                                                                  real_c(0), real_c(0) ) ),
-                                  "Buoyancy force" );
+   // The flow in the channel is here driven by a body force and is periodic
+   // Thus, the pressure gradient, usually encountered in a channel flow, is not present here
+   // The buoyancy force on the body due to this pressure gradient has to be added 'artificially'
+   // F_{buoyancy} = - V_{body} * grad ( p ) = V_{body} * \rho_{fluid} *  a
+   // ( V_{body} := volume of body,  a := acceleration driving the flow )
+   Vector3<real_t> buoyancyForce(math::M_PI / real_t(6) * setup.forcing * setup.particleDiam * setup.particleDiam * setup.particleDiam ,
+                                 real_t(0), real_t(0));
+   timeloop.addFuncAfterTimeStep( pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, buoyancyForce ), "Buoyancy force" );
 
    // add pe timesteps
-   timeloop.addFuncAfterTimeStep( pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, dt_pe, pe_interval ), "pe Time Step" );
+   timeloop.addFuncAfterTimeStep( pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_t(1), numPeSubcycles ), "pe Time Step" );
 
    // check for convergence of the particle position
    shared_ptr< SteadyStateCheck > check = make_shared< SteadyStateCheck >( &timeloop, &setup, blocks, bodyStorageID,
@@ -797,7 +756,7 @@ int main( int argc, char **argv )
 
       // pdf field (ghost layers cannot be written because re-sampling/coarsening is applied)
       auto pdfFieldVTK = vtk::createVTKOutput_BlockData( blocks, "fluid_field", writeFrequency );
-      pdfFieldVTK->setSamplingResolution( real_c(1) );
+      pdfFieldVTK->setSamplingResolution( real_t(1) );
 
       blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > pdfGhostLayerSync( blocks );
       pdfGhostLayerSync.addPackInfo( make_shared< field::communication::PackInfo< PdfField_T > >( pdfFieldID ) );
@@ -843,7 +802,7 @@ int main( int argc, char **argv )
    {
       // reference value from Inamuro et al. - "Flow between parallel walls containing the lines of neutrally buoyant circular cylinders" (2000), Table 2
       // note that this reference value provides only a rough estimate
-      real_t referencePos = real_c( 0.2745 ) * real_c( setup.zlength );
+      real_t referencePos = real_t( 0.2745 ) * real_c( setup.zlength );
       real_t relErr = std::fabs( referencePos - check->getPosition() ) / referencePos;
       if ( fileIO )
       {
@@ -855,7 +814,7 @@ int main( int argc, char **argv )
          }
       }
       // the relative error has to be below 10%
-      WALBERLA_CHECK_LESS( relErr, real_c(0.1) );
+      WALBERLA_CHECK_LESS( relErr, real_t(0.1) );
    }
 
    return EXIT_SUCCESS;
diff --git a/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp b/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp
index 70f2d41507b697020279be13ac744633391c872c..4e41079db7b9a0b8e18993d776340a36de2cd535 100644
--- a/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp
+++ b/tests/pe_coupling/momentum_exchange_method/TorqueSphereMEM.cpp
@@ -58,6 +58,7 @@
 
 #include "pe_coupling/mapping/all.h"
 #include "pe_coupling/momentum_exchange_method/all.h"
+#include "pe_coupling/utility/all.h"
 
 #include <vector>
 #include <iomanip>
@@ -276,30 +277,6 @@ class TorqueEval
 
 };
 
-
-class ResetForce
-{
-   public:
-      ResetForce( const shared_ptr< StructuredBlockStorage > & blocks,
-                  const BlockDataID & bodyStorageID )
-      : blocks_( blocks ), bodyStorageID_( bodyStorageID )
-      { }
-
-      void operator()()
-      {
-         for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
-         {
-            for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::BodyIterator::end(); ++bodyIt )
-            {
-                bodyIt->resetForceAndTorque();
-            }
-         }
-      }
-private:
-      shared_ptr< StructuredBlockStorage > blocks_;
-      const BlockDataID bodyStorageID_;
-};
-
 class PDFFieldCopy
 {
 public:
@@ -539,7 +516,7 @@ int main( int argc, char **argv )
                      << AfterFunction( SharedFunctor< TorqueEval >( torqueEval ), "torque evaluation" );
    }
    // resetting force
-   timeloop.addFuncAfterTimeStep( ResetForce( blocks, bodyStorageID ), "reset force on sphere");
+   timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesResetter( blocks, bodyStorageID ), "reset force on sphere");
 
    timeloop.addFuncAfterTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );