diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index 2e835d671e25837aad63e89e6534f9e44ade2384..d5612a35b72b52f7c2bf1d5e0cb98da6dfbd1934 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory( NonUniformGrid )
 add_subdirectory( MotionSingleHeavySphere )
 add_subdirectory( PeriodicGranularGas )
 add_subdirectory( PoiseuilleChannel )
+add_subdirectory( ProbeVsExtraMessage )
 add_subdirectory( SchaeferTurek )
 add_subdirectory( UniformGrid )
 add_subdirectory( UniformGridGPU )
diff --git a/apps/benchmarks/ProbeVsExtraMessage/CMakeLists.txt b/apps/benchmarks/ProbeVsExtraMessage/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7330aa0ddb83f973d47071a4dba1a4c03baed242
--- /dev/null
+++ b/apps/benchmarks/ProbeVsExtraMessage/CMakeLists.txt
@@ -0,0 +1,2 @@
+waLBerla_add_executable ( NAME ProbeVsExtraMessage 
+                          DEPENDS core postprocessing stencil )
diff --git a/apps/benchmarks/ProbeVsExtraMessage/ProbeVsExtraMessage.cpp b/apps/benchmarks/ProbeVsExtraMessage/ProbeVsExtraMessage.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6fec0642ef3a5f476955403eaa3bd77b22fe96ea
--- /dev/null
+++ b/apps/benchmarks/ProbeVsExtraMessage/ProbeVsExtraMessage.cpp
@@ -0,0 +1,211 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ProbeVsExtraMessage.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//! \author Sebastian Eibl <sebastian.eibl@fau.de>
+//! \brief Micro Benchmark, measuring time for different variable sized communications
+//
+//======================================================================================================================
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/BufferSystem.h"
+#include "core/mpi/MPIManager.h"
+#include "core/timing/TimingPool.h"
+#include "postprocessing/sqlite/SQLite.h"
+#include "stencil/D3Q27.h"
+#include "stencil/D3Q19.h"
+#include "stencil/D3Q7.h"
+
+#include <array>
+#include <iostream>
+#include <sstream>
+
+namespace walberla {
+
+class MPIInfo
+{
+public:
+   MPIInfo( const Vector3<uint_t>& procs, const Vector3<bool>& periodicity );
+
+   int getNeighborRank(const stencil::Direction& dir);
+private:
+   Vector3<uint_t> procs_;
+   Vector3<bool>   periodicity_;
+   Vector3<int>    pos_;
+};
+
+MPIInfo::MPIInfo( const Vector3<uint_t>& procs, const Vector3<bool>& periodicity )
+   : procs_(procs)
+   , periodicity_(periodicity)
+{
+   mpi::MPIManager::instance()->createCartesianComm(procs[0], procs[1], procs[2], periodicity[0], periodicity[1], periodicity[2]);
+   mpi::MPIManager::instance()->cartesianCoord(pos_.data());
+}
+
+int MPIInfo::getNeighborRank( const stencil::Direction& dir )
+{
+   auto neighborCoord = pos_;
+   neighborCoord[0] += stencil::cx[dir];
+   neighborCoord[1] += stencil::cy[dir];
+   neighborCoord[2] += stencil::cz[dir];
+   if (neighborCoord[0] < 0) return -1;
+   if (neighborCoord[1] < 0) return -1;
+   if (neighborCoord[2] < 0) return -1;
+   if (neighborCoord[0] >= int_c(procs_[0])) return -1;
+   if (neighborCoord[1] >= int_c(procs_[1])) return -1;
+   if (neighborCoord[2] >= int_c(procs_[2])) return -1;
+   return mpi::MPIManager::instance()->cartesianRank(uint_c(neighborCoord[0]), uint_c(neighborCoord[1]), uint_c(neighborCoord[2]));
+}
+
+template <typename Stencil>
+void communicate( MPIInfo& mpiInfo,
+                  const uint_t iterations,
+                  const uint_t messageSize,
+                  const bool iProbe,
+                  WcTimingPool& tp)
+{
+   std::vector<char> sendBuf(messageSize);
+   std::vector<char> recvBuf(messageSize);
+
+   WcTimer& timer = tp[iProbe ? "IProbe" : "twoMessage"];
+
+   mpi::BufferSystem bs( mpi::MPIManager::instance()->comm() );
+   bs.useIProbe(iProbe);
+
+   for( uint_t i =0; i < iterations; ++i )
+   {
+      timer.start();
+
+      for (auto dirIt = Stencil::beginNoCenter(); dirIt != Stencil::end(); ++dirIt)
+      {
+         auto recvRank = mpiInfo.getNeighborRank( *dirIt );
+         if (recvRank == -1) continue;
+         bs.sendBuffer(recvRank) << sendBuf;
+         WALBERLA_ASSERT_EQUAL(bs.sendBuffer(recvRank).size(), messageSize + sizeof(size_t));
+      }
+
+      bs.setReceiverInfoFromSendBufferState(false, true);
+      bs.sendAll();
+
+      for( auto it = bs.begin(); it != bs.end(); ++it )
+      {
+         WALBERLA_ASSERT_EQUAL(it.buffer().size(), messageSize + sizeof(size_t));
+         it.buffer() >> recvBuf;
+         WALBERLA_ASSERT_EQUAL(recvBuf.size(), messageSize);
+         WALBERLA_ASSERT(it.buffer().isEmpty());
+      }
+
+      timer.end();
+   }
+}
+
+int main( int argc, char ** argv )
+{
+   mpi::Environment mpiEnv( argc, argv );
+
+   if ( argc != 10 )
+   {
+      WALBERLA_ROOT_SECTION()
+      {
+         std::cout << "Usage ./probeVsExtraMessage x y z px py pz iterations messageSize stencil " << std::endl;
+         std::cout << std::endl;
+         std::cout << "x: number of processes in x direction" << std::endl;
+         std::cout << "y: number of processes in y direction" << std::endl;
+         std::cout << "z: number of processes in z direction" << std::endl;
+         std::cout << "px: periodic in x direction?" << std::endl;
+         std::cout << "py: periodic in y direction?" << std::endl;
+         std::cout << "pz: periodic in z direction?" << std::endl;
+         std::cout << "iterations: number of communications per case" << std::endl;
+         std::cout << "messageSize: size of the SendBuffer in bytes" << std::endl;
+         std::cout << "stencil: communication stencil (D3Q27, D3Q19, D3Q7)" << std::endl;
+
+      }
+      return EXIT_FAILURE;
+   }
+
+   Vector3<uint_t> procs;
+   procs[0] = std::stoul(argv[1]);
+   procs[1] = std::stoul(argv[2]);
+   procs[2] = std::stoul(argv[3]);
+   WALBERLA_CHECK_EQUAL(procs[0]*procs[1]*procs[2], mpi::MPIManager::instance()->numProcesses());
+   Vector3<bool> periodicity;
+   periodicity[0] = std::stoul(argv[4]);
+   periodicity[1] = std::stoul(argv[5]);
+   periodicity[2] = std::stoul(argv[6]);
+
+   uint_t iterations  = std::stoul(argv[7]);
+   uint_t messageSize = std::stoul(argv[8]);
+   std::string stencil = argv[9];
+   WALBERLA_CHECK_EQUAL(stencil, "D3Q27", "only D3Q27 is supported!");
+
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(procs);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(periodicity);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(iterations);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(messageSize);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(stencil);
+
+   MPIInfo mpiInfo(procs, periodicity);
+
+   WcTimingPool tp;
+   WALBERLA_MPI_BARRIER();
+   if (stencil == "D3Q27")
+   {
+      communicate<stencil::D3Q27>(mpiInfo, iterations, messageSize, false, tp);
+      communicate<stencil::D3Q27>(mpiInfo, iterations, messageSize, true, tp);
+   } else if (stencil == "D3Q19")
+   {
+      communicate<stencil::D3Q19>(mpiInfo, iterations, messageSize, false, tp);
+      communicate<stencil::D3Q19>(mpiInfo, iterations, messageSize, true, tp);
+   } else if (stencil == "D3Q7")
+   {
+      communicate<stencil::D3Q7>(mpiInfo, iterations, messageSize, false, tp);
+      communicate<stencil::D3Q7>(mpiInfo, iterations, messageSize, true, tp);
+   } else
+   {
+      WALBERLA_ABORT("stencil not supported: " << stencil);
+   }
+   WALBERLA_LOG_INFO_ON_ROOT(tp);
+
+   WALBERLA_ROOT_SECTION()
+   {
+      std::map< std::string, walberla::int64_t > integerProperties;
+      std::map< std::string, double >            realProperties;
+      std::map< std::string, std::string >       stringProperties;
+
+      integerProperties["procs_x"]      = int64_c(procs[0]);
+      integerProperties["procs_y"]      = int64_c(procs[1]);
+      integerProperties["procs_z"]      = int64_c(procs[2]);
+      integerProperties["priodicity_x"] = int64_c(periodicity[0]);
+      integerProperties["priodicity_y"] = int64_c(periodicity[1]);
+      integerProperties["priodicity_z"] = int64_c(periodicity[2]);
+      integerProperties["iterations"]   = int64_c(iterations);
+      integerProperties["messageSize"]  = int64_c(messageSize);
+      stringProperties["stencil"]       = stencil;
+
+      auto runId = postprocessing::storeRunInSqliteDB( "ProbeVsTwoMessages.sqlite", integerProperties, stringProperties, realProperties );
+      postprocessing::storeTimingPoolInSqliteDB( "ProbeVsTwoMessages.sqlite", runId, tp, "Timings" );
+   }
+
+   return 0;
+}
+} // namespace walberla
+
+int main( int argc, char* argv[] )
+{
+   return walberla::main( argc, argv );
+}
diff --git a/src/core/math/Matrix2.h b/src/core/math/Matrix2.h
index 3dae71c58038bb2e4a58c3ad0e8d73c3247de23d..8e0504359bb722f912d159557034c5bdab9773e6 100644
--- a/src/core/math/Matrix2.h
+++ b/src/core/math/Matrix2.h
@@ -151,6 +151,7 @@ public:
    inline const Matrix2       getInverse()                               const;
    inline bool                isSingular()                               const;
    inline bool                isSymmetric()                              const;
+   inline Type*               data()                                     {return v_;}
    //@}
    //*******************************************************************************************************************
 
diff --git a/src/core/math/Matrix3.h b/src/core/math/Matrix3.h
index 85efdb583907f405e0fe39ba546533444f4d03cc..311a041200f49c08bb21c4f456c382dc2d0fb75b 100644
--- a/src/core/math/Matrix3.h
+++ b/src/core/math/Matrix3.h
@@ -173,6 +173,7 @@ public:
                               inline const Matrix3       getCholesky()                              const;
    template< typename Other > inline const Vector3<HIGH> solve( const Vector3<Other> &rhs )         const;
                               inline Type                trace()                                    const;
+                              inline Type*               data()                                     {return v_;}
    //@}
    //*******************************************************************************************************************
 
diff --git a/src/core/math/Vector2.h b/src/core/math/Vector2.h
index b09038329a709ec04af02e0246cd9529b2ca9639..c9b9dba4be0d8c5e38fc9296ed037e47ced1559c 100644
--- a/src/core/math/Vector2.h
+++ b/src/core/math/Vector2.h
@@ -159,6 +159,7 @@ public:
    inline Length          length()                       const;
    inline Type            sqrLength()                    const;
    inline Vector2<Length> getNormalized()                const;
+   inline Type*           data()                         {return v_;}
    //@}
    //*******************************************************************************************************************
 
diff --git a/src/core/math/Vector3.h b/src/core/math/Vector3.h
index eb0e848168ce0bd9aa31f11691756bc838e3a42b..c322c1380bd5ed9e8e794ec3c255d0be03ffc8b8 100644
--- a/src/core/math/Vector3.h
+++ b/src/core/math/Vector3.h
@@ -165,6 +165,7 @@ public:
    inline Vector3<Length> getNormalized()                const;
    inline Vector3<Length> getNormalizedOrZero()          const;
    inline void            reset();
+   inline Type*           data()                         {return v_;}
    //@}
    //*******************************************************************************************************************
 
diff --git a/src/core/mpi/BufferSystem.h b/src/core/mpi/BufferSystem.h
index 6a0048b2c1055992c413fd82f6045fd7edcf1b41..e4ba192e59876f68b3cf4d26d5da3fe6a10e9574 100644
--- a/src/core/mpi/BufferSystem.h
+++ b/src/core/mpi/BufferSystem.h
@@ -194,6 +194,9 @@ public:
    //@}
    //*******************************************************************************************************************
 
+   void useIProbe(const bool use) { useIProbe_ = use; }
+   bool isIProbedUsed() const { return useIProbe_; }
+
    ///Bytes sent during the current or last communication
    int64_t getBytesSent() const { return bytesSent_; }
    ///Bytes received during the current or last communication
@@ -226,10 +229,11 @@ protected:
    RecvBuffer_T * waitForNext( MPIRank & fromRank );
    void setCommunicationType( const bool knownSize );
 
-   internal::KnownSizeCommunication<RecvBuffer_T, SendBuffer_T>   knownSizeComm_;
-   internal::UnknownSizeCommunication<RecvBuffer_T, SendBuffer_T> unknownSizeComm_;
-   internal::NoMPICommunication<RecvBuffer_T, SendBuffer_T>       noMPIComm_;
-   internal::AbstractCommunication<RecvBuffer_T, SendBuffer_T> *  currentComm_;  //< after receiver setup, this points to unknown- or knownSizeComm_
+   internal::KnownSizeCommunication<RecvBuffer_T, SendBuffer_T>         knownSizeComm_;
+   internal::UnknownSizeCommunication<RecvBuffer_T, SendBuffer_T>       unknownSizeComm_;
+   internal::UnknownSizeCommunicationIProbe<RecvBuffer_T, SendBuffer_T> unknownSizeCommIProbe_;
+   internal::NoMPICommunication<RecvBuffer_T, SendBuffer_T>             noMPIComm_;
+   internal::AbstractCommunication<RecvBuffer_T, SendBuffer_T> *        currentComm_;  //< after receiver setup, this points to unknown- or knownSizeComm_
 
    bool sizeChangesEverytime_; //< if set to true, the receiveSizeUnknown_ is set to true before communicating
    bool communicationRunning_; //< indicates if a communication step is currently running
@@ -251,6 +255,8 @@ protected:
    //each concurrently running communication uses different tags
    static std::set<int> activeTags_;
 
+   bool useIProbe_ = false; ///< switch betwenn IProbe and two message communication for varying size communication
+
    int64_t bytesSent_        = 0; ///< number of bytes sent during last communication
    int64_t bytesReceived_    = 0; ///< number of bytes received during last communication
 
diff --git a/src/core/mpi/BufferSystem.impl.h b/src/core/mpi/BufferSystem.impl.h
index 240f49a0c9431bb8fac7a4c0738daffdc8ce957c..8a4185dd5de991eb85137ed5855ea53104773c1c 100644
--- a/src/core/mpi/BufferSystem.impl.h
+++ b/src/core/mpi/BufferSystem.impl.h
@@ -127,6 +127,7 @@ template< typename Rb, typename Sb>
 GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const MPI_Comm & communicator, int tag )
    : knownSizeComm_  ( communicator, tag ),
      unknownSizeComm_( communicator, tag ),
+     unknownSizeCommIProbe_( communicator, tag ),
      noMPIComm_( communicator, tag ),
      currentComm_    ( nullptr ),
      sizeChangesEverytime_( true ),
@@ -138,6 +139,7 @@ template< typename Rb, typename Sb>
 GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const GenericBufferSystem &other )
    : knownSizeComm_  ( other.knownSizeComm_.getCommunicator(), other.knownSizeComm_.getTag() ),
      unknownSizeComm_( other.knownSizeComm_.getCommunicator(), other.knownSizeComm_.getTag() ),
+     unknownSizeCommIProbe_( other.knownSizeComm_.getCommunicator(), other.knownSizeComm_.getTag() ),
      noMPIComm_      ( other.knownSizeComm_.getCommunicator(), other.knownSizeComm_.getTag() ),
      currentComm_ ( nullptr ),
      sizeChangesEverytime_( other.sizeChangesEverytime_ ),
@@ -150,6 +152,8 @@ GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const GenericBufferSystem &oth
       currentComm_ = &knownSizeComm_;
    else if ( other.currentComm_ == &other.unknownSizeComm_ )
       currentComm_ = &unknownSizeComm_;
+   else if ( other.currentComm_ == &other.unknownSizeCommIProbe_ )
+      currentComm_ = &unknownSizeCommIProbe_;
    else if ( other.currentComm_ == &other.noMPIComm_ )
       currentComm_ = &noMPIComm_;
    else
@@ -170,6 +174,8 @@ GenericBufferSystem<Rb, Sb> & GenericBufferSystem<Rb, Sb>::operator=( const Gene
       currentComm_ = &knownSizeComm_;
    else if ( other.currentComm_ == &other.unknownSizeComm_ )
       currentComm_ = &unknownSizeComm_;
+   else if ( other.currentComm_ == &other.unknownSizeCommIProbe_ )
+      currentComm_ = &unknownSizeCommIProbe_;
    else if ( other.currentComm_ == &other.noMPIComm_ )
       currentComm_ = &noMPIComm_;
    else
@@ -508,6 +514,8 @@ void GenericBufferSystem<Rb, Sb>::setCommunicationType( const bool knownSize )
    {
       if( knownSize )
          currentComm_ = &knownSizeComm_;
+      else if ( useIProbe_ )
+         currentComm_ = &unknownSizeCommIProbe_;
       else
          currentComm_ = &unknownSizeComm_;
    }
diff --git a/src/core/mpi/BufferSystemHelper.h b/src/core/mpi/BufferSystemHelper.h
index e143f6b10fba7ad99d4d687750a14b40f2123ce1..661e463587263ec62d8aadbc1ba93d15660ca2ed 100644
--- a/src/core/mpi/BufferSystemHelper.h
+++ b/src/core/mpi/BufferSystemHelper.h
@@ -161,6 +161,34 @@ namespace internal {
    };
 
 
+   template< typename RecvBuffer_T, typename SendBuffer_T>
+   class UnknownSizeCommunicationIProbe : public AbstractCommunication<RecvBuffer_T, SendBuffer_T>
+   {
+   public:
+      using typename AbstractCommunication<RecvBuffer_T, SendBuffer_T>::ReceiveInfo;
+
+      UnknownSizeCommunicationIProbe( const MPI_Comm & communicator, int tag = 0 )
+           :  AbstractCommunication<RecvBuffer_T, SendBuffer_T>( communicator, tag ), sending_(false), receiving_(false) {}
+
+      virtual ~UnknownSizeCommunicationIProbe() {}
+
+      virtual void send( MPIRank receiver, const SendBuffer_T & sendBuffer );
+      virtual void waitForSends();
+
+      virtual void scheduleReceives( std::map<MPIRank, ReceiveInfo> & recvInfos );
+
+      /// size field of recvInfos can be invalid, is filled in with the actual message size
+      virtual MPIRank waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos );
+
+   private:
+      bool sending_;
+      bool receiving_;
+      int  pendingReceives_;
+
+      std::vector<MPI_Request> sendRequests_;
+   };
+
+
    template< typename RecvBuffer_T, typename SendBuffer_T>
    class NoMPICommunication : public AbstractCommunication<RecvBuffer_T, SendBuffer_T>
    {
diff --git a/src/core/mpi/BufferSystemHelper.impl.h b/src/core/mpi/BufferSystemHelper.impl.h
index f982aab6894e9aff58579f8875598caa11180cee..fc2c6a5ae5a2623de6469eb3c6b2c54128927379 100644
--- a/src/core/mpi/BufferSystemHelper.impl.h
+++ b/src/core/mpi/BufferSystemHelper.impl.h
@@ -349,6 +349,133 @@ MPIRank UnknownSizeCommunication<Rb, Sb>::waitForNextReceive( std::map<MPIRank,
 
 
 
+//======================================================================================================================
+//
+//  Unknown Size Communication (IProbe method)
+//
+//======================================================================================================================
+
+template< typename Rb, typename Sb>
+void UnknownSizeCommunicationIProbe<Rb, Sb>::send( MPIRank receiver, const Sb & sendBuffer )
+{
+   WALBERLA_NON_MPI_SECTION() { WALBERLA_ASSERT( false ); }
+
+   if ( ! sending_ )
+      sending_ = true;
+
+
+   // Send content message
+
+   sendRequests_.push_back( MPI_REQUEST_NULL );
+   MPI_Request & contentMsgReq = sendRequests_.back();
+
+   //WALBERLA_LOG_DEVEL("sending " << sendBuffer.size() << " to " << receiver);
+
+   MPI_Isend(sendBuffer.ptr(),           // pointer to size buffer
+             int_c( sendBuffer.size() ), // send one size
+             MPI_BYTE,                   // type
+             receiver,                   // receiver rank
+             this->tag_,                 // message tag
+             this->communicator_,        // communicator
+             & contentMsgReq             // request needed for wait
+             );
+}
+
+template< typename Rb, typename Sb>
+void UnknownSizeCommunicationIProbe<Rb, Sb>::waitForSends()
+{
+   WALBERLA_NON_MPI_SECTION() { WALBERLA_ASSERT( false ); }
+
+   sending_ = false;
+
+   if ( sendRequests_.empty() )
+      return;
+
+   MPI_Waitall( int_c( sendRequests_.size() ),
+                &sendRequests_[0],
+                MPI_STATUSES_IGNORE );
+
+   sendRequests_.clear();
+}
+
+template< typename Rb, typename Sb>
+void UnknownSizeCommunicationIProbe<Rb, Sb>::scheduleReceives( std::map<MPIRank, ReceiveInfo> & recvInfos )
+{
+   WALBERLA_NON_MPI_SECTION() { WALBERLA_ASSERT( false ); }
+
+   receiving_ = true;
+
+   pendingReceives_ = int_c(recvInfos.size());
+   for( auto it = recvInfos.begin(); it != recvInfos.end(); ++it )
+   {
+      ReceiveInfo & recvInfo = it->second;
+      recvInfo.size = INVALID_SIZE;
+   }
+}
+
+
+template< typename Rb, typename Sb>
+MPIRank UnknownSizeCommunicationIProbe<Rb, Sb>::waitForNextReceive( std::map<MPIRank, ReceiveInfo> & recvInfos )
+{
+   WALBERLA_NON_MPI_SECTION() { WALBERLA_ASSERT( false ); }
+
+   WALBERLA_ASSERT( receiving_ );
+
+   if( pendingReceives_ < 1 ) {
+      receiving_ = false;
+      return INVALID_RANK;
+   }
+
+   // On first entry in this function:
+   //  - receive request vector contains requests for size messages
+   //  - all ReceiveInfo's have sizeReceived=false
+   while( true ) // this loops as long as size messages are received
+   {
+      for( auto it = recvInfos.begin(); it != recvInfos.end(); ++it )
+      {
+         const MPIRank sender   = it->first;
+         ReceiveInfo & recvInfo = it->second;
+
+         if (recvInfo.size != INVALID_SIZE) continue;
+
+         int probeFlag;
+         MPI_Status probeStatus;
+         MPI_Iprobe( sender,
+                     this->tag_,
+                     this->communicator_,
+                     &probeFlag,
+                     &probeStatus);
+         if (probeFlag)
+         {
+            int count = 0;
+            MPI_Get_count( &probeStatus, MPI_BYTE, &count );
+            //WALBERLA_LOG_DEVEL("received " << count << " from " << sender);
+            recvInfo.size = count;
+            recvInfo.buffer.resize( uint_c( recvInfo.size ) );
+
+            MPI_Status recvStatus;
+            MPI_Recv( recvInfo.buffer.ptr(),      // where to store received size
+                      count,                      // size of expected message
+                      MPI_BYTE,                   // type
+                      sender,                     // rank of sender process
+                      this->tag_,                 // message tag
+                      this->communicator_,        // communicator
+                      &recvStatus                 // request, needed for wait
+                      );
+
+            --pendingReceives_;
+            return sender;
+         }
+      }
+   }
+
+   WALBERLA_ASSERT( false );
+   return INVALID_RANK; //cannot happen - only to prevent compiler warnings
+}
+
+
+
+
 //======================================================================================================================
 //
 //  NoMPI Communication
diff --git a/src/core/timing/Timer.h b/src/core/timing/Timer.h
index 86814b043ec8ae7b05d9c8d3027db477a1fb1336..d3c72dfe05fc095791f0ee4b8b1a65c4126c9a60 100644
--- a/src/core/timing/Timer.h
+++ b/src/core/timing/Timer.h
@@ -28,6 +28,8 @@
 #include "WcPolicy.h"
 #include "core/DataTypes.h"
 
+#include <iomanip>
+#include <iostream>
 #include <limits>
 
 
@@ -442,6 +444,24 @@ inline void Timer<TP>::merge( const Timer<TP> & other )
 //**********************************************************************************************************************
 
 
+//======================================================================================================================
+//
+//  OSTREAM OVERLOAD
+//
+//======================================================================================================================
+
+template< typename TP >  // Timing policy
+std::ostream & operator<< ( std::ostream & os, const Timer<TP> & timer )
+{
+   os << std::fixed << std::setprecision(3) <<
+         "average: " << timer.average() <<
+         " | min: " << timer.min() <<
+         " | max: " << timer.max() <<
+         " | variance: " << timer.variance();
+   return os;
+}
+
+
 } // namespace timing
 
 typedef timing::Timer<timing::CpuPolicy>  CpuTimer;
diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt
index 0a2f5a3851424b89295222d15ff981c329f45723..5f7d18a88ee93d20245bfb9129c929834bd0bd49 100644
--- a/tests/core/CMakeLists.txt
+++ b/tests/core/CMakeLists.txt
@@ -148,9 +148,6 @@ waLBerla_execute_test( NAME SetReductionTest27 COMMAND $<TARGET_FILE:SetReductio
 
 
 
-waLBerla_compile_test( FILES mpi/ProbeVsExtraMessage.cpp DEPENDS postprocessing)
-
-
 ##############
 # selectable #
 ##############
diff --git a/tests/core/mpi/BufferSystemTest.cpp b/tests/core/mpi/BufferSystemTest.cpp
index d84d12da34e0ee84143d5ae1d7f59442f2bec2a6..6de0e0ce4f9de043fae0a5dcfd5884ea06bb4d04 100644
--- a/tests/core/mpi/BufferSystemTest.cpp
+++ b/tests/core/mpi/BufferSystemTest.cpp
@@ -119,7 +119,7 @@ void symmetricCommunication()
  * Every process sends a message as big as his rank number
  * to the neighboring processes (1D , periodic boundary)
  */
-void asymmetricCommunication()
+void asymmetricCommunication(const bool useIProbe)
 {
    auto mpiManager = MPIManager::instance();
 
@@ -132,6 +132,7 @@ void asymmetricCommunication()
 
 
    BufferSystem bs ( MPI_COMM_WORLD );
+   bs.useIProbe(useIProbe);
 
    // Set receiver information
    std::set<int> receiveFrom;
@@ -186,7 +187,7 @@ void asymmetricCommunication()
 
 // like asymmetricCommunication, but the message size is a random value
 // that changes every communication step
-void timeVaryingCommunication()
+void timeVaryingCommunication(const bool useIProbe)
 {
    auto mpiManager = MPIManager::instance();
 
@@ -198,6 +199,7 @@ void timeVaryingCommunication()
    WALBERLA_CHECK_GREATER_EQUAL( numProcesses, 3 );
 
    BufferSystem bs ( MPI_COMM_WORLD );
+   bs.useIProbe(useIProbe);
 
    // artificial special case: no message from root
    bs.sendBuffer( rightNeighbor );
@@ -257,7 +259,7 @@ void timeVaryingCommunication()
  *    i.e. rank 1 sends a "1" once, rank 2 sends a message containing two "2"'s ...
  */
 
-void gatherUsingAsymmetricCommunication()
+void gatherUsingAsymmetricCommunication(const bool useIProbe)
 {
    int rank          = MPIManager::instance()->worldRank();
    int numProcesses  = MPIManager::instance()->numProcesses();
@@ -267,6 +269,7 @@ void gatherUsingAsymmetricCommunication()
    const int TAG=42;
 
    BufferSystem bs (MPI_COMM_WORLD, TAG );
+   bs.useIProbe(useIProbe);
 
 
    if ( rank ==0 )
@@ -381,13 +384,16 @@ int main(int argc, char**argv)
    symmetricCommunication();
 
    WALBERLA_LOG_INFO_ON_ROOT("Testing Asymmetric Communication...");
-   asymmetricCommunication();
+   asymmetricCommunication(false);
+   asymmetricCommunication(true);
 
    WALBERLA_LOG_INFO_ON_ROOT("Testing time-varying Communication...");
-   timeVaryingCommunication();
+   timeVaryingCommunication(false);
+   timeVaryingCommunication(true);
 
    WALBERLA_LOG_INFO_ON_ROOT("Testing Gather Operation...");
-   gatherUsingAsymmetricCommunication();
+   gatherUsingAsymmetricCommunication(false);
+   gatherUsingAsymmetricCommunication(true);
 
    WALBERLA_LOG_INFO_ON_ROOT("Testing self-send...");
    selfSend();
diff --git a/tests/core/mpi/ProbeVsExtraMessage.cpp b/tests/core/mpi/ProbeVsExtraMessage.cpp
deleted file mode 100644
index a6d3377437c4df849bac98d4a3631065dd355d12..0000000000000000000000000000000000000000
--- a/tests/core/mpi/ProbeVsExtraMessage.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file ProbeVsExtraMessage.h
-//! \author Martin Bauer <martin.bauer@fau.de>
-//! \brief Micro Benchmark, measuring time for different variable sized communications
-//
-//======================================================================================================================
-
-
-#include "core/debug/TestSubsystem.h"
-#include "core/Environment.h"
-#include "core/mpi/MPIManager.h"
-#include "core/math/Random.h"
-#include "core/timing/TimingPool.h"
-#include "postprocessing/sqlite/SQLite.h"
-
-#include <iostream>
-#include <sstream>
-
-namespace walberla {
-
-int getProcToReceiveFrom()
-{
-   auto mpi = MPIManager::instance();
-   int res = mpi->worldRank() -1;
-   if ( res < 0 )
-      res = mpi->numProcesses() -1;
-   return res;
-}
-
-int getProcToSendTo()
-{
-   auto mpi = MPIManager::instance();
-   int res = mpi->worldRank() +1;
-   if ( res == mpi->numProcesses() )
-      res = 0;
-   return res;
-}
-
-int getRandomMessageSize( uint_t maxMessageSize )
-{
-   return int_c( math::intRandom( maxMessageSize / 3, maxMessageSize-1 ) );
-}
-
-void iprobeVersion( uint_t iterations, uint_t maxMessageSize, WcTimingPool & timingPool )
-{
-   char * sendBuf = new char[ maxMessageSize ];
-   char * recvBuf = new char[ maxMessageSize ];
-
-   auto & timer = timingPool["iprobe"];
-
-   for( uint_t i =0; i < iterations; ++i )
-   {
-      int messageSize = getRandomMessageSize( maxMessageSize );
-
-      timer.start();
-      MPI_Request sendRequest;
-      MPI_Isend( sendBuf, messageSize, MPI_BYTE, getProcToSendTo(), 0, MPI_COMM_WORLD, &sendRequest );
-
-      MPI_Status probeStatus;
-      MPI_Probe( getProcToReceiveFrom(), 0, MPI_COMM_WORLD, &probeStatus );
-
-      int count = 0;
-      MPI_Get_count( &probeStatus, MPI_BYTE, &count );
-      MPI_Recv( recvBuf, count, MPI_BYTE, getProcToReceiveFrom(), 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
-      MPI_Waitall( 1, &sendRequest, MPI_STATUSES_IGNORE );
-      timer.end();
-   }
-
-   delete [] sendBuf;
-   delete [] recvBuf;
-}
-
-
-void twoMessageVersion( uint_t iterations, uint_t maxMessageSize, WcTimingPool & timingPool )
-{
-   char * sendBuf = new char[ maxMessageSize ];
-   char * recvBuf = new char[ maxMessageSize ];
-
-   auto & timer = timingPool["twoMessages"];
-
-   int recvSize;
-   MPI_Request sendRequests[2];
-
-   const int TAG_SIZE_MSG    = 1;
-   const int TAG_CONTENT_MSG = 0;
-
-   for( uint_t i =0; i < iterations; ++i )
-   {
-      int sendSize = getRandomMessageSize( maxMessageSize );
-
-      timer.start();
-
-      MPI_Request recvSizeMsgRequest;
-      MPI_Irecv( &recvSize, 1      , MPI_INT,  getProcToReceiveFrom(), TAG_SIZE_MSG,    MPI_COMM_WORLD, &recvSizeMsgRequest );
-      MPI_Isend( &sendSize, 1      , MPI_INT,  getProcToSendTo(),      TAG_SIZE_MSG,    MPI_COMM_WORLD, &sendRequests[0]    );
-      MPI_Isend( sendBuf , sendSize, MPI_BYTE, getProcToSendTo(),      TAG_CONTENT_MSG, MPI_COMM_WORLD, &sendRequests[1]    );
-
-      // wait for size message to arrive
-      MPI_Waitall( 1, &recvSizeMsgRequest, MPI_STATUSES_IGNORE );
-      // receive content
-      MPI_Recv( recvBuf, recvSize, MPI_BYTE, getProcToReceiveFrom(), TAG_CONTENT_MSG, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
-
-      // wait for sends to finish
-      MPI_Waitall( 2, sendRequests, MPI_STATUSES_IGNORE );
-
-      timer.end();
-   }
-
-   delete [] sendBuf;
-   delete [] recvBuf;
-
-}
-
-void maxMessageSizeVersion( uint_t iterations, uint_t maxMessageSize, WcTimingPool & timingPool )
-{
-   char * sendBuf = new char[ maxMessageSize ];
-   char * recvBuf = new char[ maxMessageSize ];
-
-   auto & timer = timingPool["maxMessageSizeKnown"];
-
-   for( uint_t i =0; i < iterations; ++i )
-   {
-      int messageSize = getRandomMessageSize( maxMessageSize );
-      timer.start();
-      MPI_Request sendRequest;
-      MPI_Request recvRequest;
-      MPI_Irecv( recvBuf, int_c(maxMessageSize), MPI_BYTE,  getProcToReceiveFrom(), 0, MPI_COMM_WORLD, &recvRequest );
-      MPI_Isend( sendBuf, messageSize,           MPI_BYTE,  getProcToSendTo(),      0, MPI_COMM_WORLD, &sendRequest );
-
-      MPI_Status status;
-      MPI_Waitall( 1, &recvRequest, &status );
-
-      int count = 0;
-      MPI_Get_count( &status, MPI_BYTE, &count );
-      MPI_Waitall( 1, &sendRequest, MPI_STATUSES_IGNORE );
-      timer.end();
-   }
-
-   delete [] sendBuf;
-   delete [] recvBuf;
-}
-
-
-
-
-
-int main( int argc, char ** argv )
-{
-   debug::enterTestMode();
-   mpi::Environment mpiEnv( argc, argv );
-   MPIManager::instance()->useWorldComm();
-
-   using namespace std;
-
-   WALBERLA_ROOT_SECTION() {
-      if ( argc != 3 && argc != 4 ) {
-         cerr << "Wrong number of arguments " << endl;
-         cerr << "Usage ./probeVsExtraMessage <iterations> <messageSize> " << endl;
-      }
-   }
-   uint_t iterations;
-   uint_t messageSize;
-   std::string arg1( argv[1] );
-   std::string arg2( argv[2] );
-   std::stringstream streamIterations  ( arg1 );
-   std::stringstream streamMessageSize ( arg2 );
-   streamIterations >> iterations;
-   streamMessageSize >> messageSize;
-
-   WcTimingPool tp;
-   iprobeVersion         ( iterations, messageSize, tp );
-   twoMessageVersion     ( iterations, messageSize, tp );
-   maxMessageSizeVersion ( iterations, messageSize, tp );
-
-
-
-   WALBERLA_ROOT_SECTION() {
-      cout << tp << endl;
-   }
-   if( argc == 4) {
-
-      const auto reducedTimeloopTiming = tp.getReduced();
-      WALBERLA_ROOT_SECTION() {
-         std::string dbFile( argv[3] );
-         std::map<std::string, walberla::int64_t> integerProperties;
-         integerProperties["iterations"] = int64_c(iterations);
-         integerProperties["messageSize"] = int64_c(messageSize);
-         integerProperties["processes"] = int64_c(mpi::MPIManager::instance()->numProcesses());
-         postprocessing::SQLiteDB db( dbFile );
-         auto runid = db.storeRun( integerProperties, std::map<std::string, std::string>(), std::map<string, double>());
-         db.storeTimingPool( runid, *reducedTimeloopTiming, "timings" );
-      }
-   }
-
-   return 0;
-}
-} // namespace walberla
-
-int main( int argc, char* argv[] )
-{
-  return walberla::main( argc, argv );
-}
\ No newline at end of file