diff --git a/src/cuda/GPUCopy.cpp b/src/cuda/GPUCopy.cpp
index e1d6c86e723befc0622631cda612359dbd429040..3c1c8f12649ab4f2de483e4b6a3ebf6877790910 100644
--- a/src/cuda/GPUCopy.cpp
+++ b/src/cuda/GPUCopy.cpp
@@ -1,21 +1,22 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \file GPUCopy.cpp
 //! \ingroup cuda
 //! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
 //! \brief Copy routines of 4D intervals involving GPU buffers.
 //
 //======================================================================================================================
@@ -29,116 +30,365 @@
 namespace walberla {
 namespace cuda {
 
+void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream )
+{
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
+      WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) );
+
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
 
-void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
-                                 uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                 uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                 uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                 uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
+      p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
+      p.kind = cudaMemcpyDeviceToDevice;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
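+   // If the interval spans the full allocated z-range of both buffers (or only
+   // a single f slice is requested), all f slices are contiguous in memory and
+   // one cudaMemcpy3D suffices; otherwise the slices are copied one by one.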
+   if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
+   {
+      copyFunctor( dstF, srcF, Nf );
+   }
+   else
+   {
+      for( uint_t f = 0; f < Nf; ++f )
+      {
+         copyFunctor( dstF + f, srcF + f, uint_c(1) );
+      }
+   }
+}
+
+
+void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream )
 {
-   WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) );
-
-   cudaMemcpy3DParms p;
-   memset( &p, 0, sizeof(p) );
-
-   p.srcPos.x = srcX * typeSz;
-   p.srcPos.y = srcY;
-   p.srcPos.z = srcF * srcAllocZ + srcZ;
-   p.srcPtr.ptr = src.ptr;
-   p.srcPtr.pitch = src.pitch;
-   p.srcPtr.xsize = src.xsize;
-   p.srcPtr.ysize = src.ysize;
-
-   p.dstPos.x = dstX * typeSz;
-   p.dstPos.y = dstY;
-   p.dstPos.z = dstF * dstAllocZ + dstZ;
-   p.dstPtr.ptr = dst.ptr;
-   p.dstPtr.pitch = dst.pitch;
-   p.dstPtr.xsize = dst.xsize;
-   p.dstPtr.ysize = dst.ysize;
-
-   p.extent.width = Nx * typeSz;
-   p.extent.height = Ny;
-   p.extent.depth = Nz * Nf;
-
-   p.kind = cudaMemcpyDeviceToDevice;
-
-   WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
+
+      p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
+      p.kind = cudaMemcpyDeviceToDevice;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
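+   // Analogous to copyDevToDevFZYX: one copy suffices when the y-range is fully
+   // allocated on both sides (or only a single z slice is requested).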
+   if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
+   {
+      copyFunctor( dstZ, srcZ, Nz );
+   }
+   else
+   {
+      for( uint_t z = 0; z < Nz; ++z )
+      {
+         copyFunctor( dstZ + z, srcZ + z, 1 );
+      }
+   }
 }
 
 
-void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
+void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
 {
-   WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) );
-
-   cudaMemcpy3DParms p;
-   memset( &p, 0, sizeof(p) );
-
-   p.srcPos.x = srcX * typeSz;
-   p.srcPos.y = srcY;
-   p.srcPos.z = srcF * srcAllocZ + srcZ;
-   p.srcPtr.ptr = src;
-   p.srcPtr.pitch = Nx * typeSz;
-   p.srcPtr.xsize = Nx * typeSz;
-   p.srcPtr.ysize = Ny;
-
-   p.dstPos.x = dstX * typeSz;
-   p.dstPos.y = dstY;
-   p.dstPos.z = dstF * dstAllocZ + dstZ;
-   p.dstPtr.ptr = dst.ptr;
-   p.dstPtr.pitch = dst.pitch;
-   p.dstPtr.xsize = dst.xsize;
-   p.dstPtr.ysize = dst.ysize;
-
-   p.extent.width = Nx * typeSz;
-   p.extent.height = Ny;
-   p.extent.depth = Nz * Nf;
-
-   p.kind = cudaMemcpyHostToDevice;
-
-   WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
+      p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny );
+
+      p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
+      p.kind = cudaMemcpyHostToDevice;
+
+      if (copyStream == 0)
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
+   {
+      copyFunctor( dstF, srcF, Nf );
+   }
+   else
+   {
+      for( uint_t f = 0; f < Nf; ++f )
+      {
+         copyFunctor( dstF + f, srcF + f, uint_c(1) );
+      }
+   }
 }
 
+void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
+{
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
 
-void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
+      p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx );
+
+      p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
+      p.kind = cudaMemcpyHostToDevice;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
+   {
+      copyFunctor( dstZ, srcZ, Nz );
+   }
+   else
+   {
+      for( uint_t z = 0; z < Nz; ++z )
+      {
+         copyFunctor( dstZ + z, srcZ + z, 1 );
+      }
+   }
+}
+
+
+void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
 {
-   cudaMemcpy3DParms p;
-   memset( &p, 0, sizeof(p) );
-
-   p.srcPos.x = srcX * typeSz;
-   p.srcPos.y = srcY;
-   p.srcPos.z = srcF * srcAllocZ + srcZ;
-   p.srcPtr.ptr = src.ptr;
-   p.srcPtr.pitch = src.pitch;
-   p.srcPtr.xsize = src.xsize;
-   p.srcPtr.ysize = src.ysize;
-
-   p.dstPos.x = dstX * typeSz;
-   p.dstPos.y = dstY;
-   p.dstPos.z = dstF * dstAllocZ + dstZ;
-   p.dstPtr.ptr = dst;
-   p.dstPtr.pitch = Nx * typeSz;
-   p.dstPtr.xsize = Nx * typeSz;
-   p.dstPtr.ysize = Ny;
-
-   p.extent.width = Nx * typeSz;
-   p.extent.height = Ny;
-   p.extent.depth = Nz * Nf;
-
-   p.kind = cudaMemcpyDeviceToHost;
-
-   WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
+
+      p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
+      p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny );
+
+      p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
+      p.kind = cudaMemcpyDeviceToHost;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
+   {
+      copyFunctor( dstF, srcF, Nf );
+   }
+   else
+   {
+      for( uint_t f = 0; f < Nf; ++f )
+      {
+         copyFunctor( dstF + f, srcF + f, 1 );
+      }
+   }
 }
 
 
+void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
+{
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
+
+      p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
+      p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx );
+
+      p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
+
+      p.kind = cudaMemcpyDeviceToHost;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
+   {
+      copyFunctor( dstZ, srcZ, Nz );
+   }
+   else
+   {
+      for( uint_t z = 0; z < Nz; ++z )
+      {
+         copyFunctor( dstZ + z, srcZ + z, 1 );
+      }
+   }
+}
 
 } // namespace cuda
 } // namespace walberla
diff --git a/src/cuda/GPUCopy.h b/src/cuda/GPUCopy.h
index 7c6c69a7963cd8ecc3bdd1289065c3e070890a32..a854068cdc67fe5c4f9d202ecd7cc9559e9661f7 100644
--- a/src/cuda/GPUCopy.h
+++ b/src/cuda/GPUCopy.h
@@ -1,21 +1,22 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \file GPUCopy.h
 //! \ingroup cuda
 //! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
 //! \brief Copy routines of 4D intervals involving GPU buffers.
 //
 //======================================================================================================================
@@ -24,6 +25,7 @@
 
 #include "core/DataTypes.h"
 
+#include <tuple>
 #include <cuda_runtime.h>
 
 
@@ -31,152 +33,87 @@ namespace walberla {
 namespace cuda {
 
 
-//*******************************************************************************************************************
-/*! Restricted version of copyDevToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ).
- * See copyDevToDevFZYX() for more details.
- *******************************************************************************************************************/
-void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
-                                 uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                 uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                 uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                 uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf );
-
-
-//*******************************************************************************************************************
+//****************************************************************************************************************************
 /*! Copy a 4D interval of a device buffer to another device buffer with fzyx memory layout.
  *
- * \param dst        destination buffer
- * \param src        source buffer
- * \param typeSz     size of an f element
- * \param dstAllocZ  allocation size in z direction of the destination buffer
- * \param srcAllocZ  allocation size in z direction of the source buffer
- * \param dstX       x coordinate of the interval start point in the destination buffer
- * \param dstY       y coordinate of the interval start point in the destination buffer
- * \param dstZ       z coordinate of the interval start point in the destination buffer
- * \param dstF       f coordinate of the interval start point in the destination buffer
- * \param srcX       x coordinate of the interval start point in the source buffer
- * \param srcY       y coordinate of the interval start point in the source buffer
- * \param srcZ       z coordinate of the interval start point in the source buffer
- * \param srcF       f coordinate of the interval start point in the source buffer
- * \param Nx         interval size in x direction
- * \param Ny         interval size in y direction
- * \param Nz         interval size in z direction
- * \param Nf         interval size in f direction
- *******************************************************************************************************************/
-inline void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
-                              uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                              uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                              uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                              uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
-{
-   if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) )
-   {
-      copyDevToDevFZYXRestricted( dst, src,
-                                  typeSz, dstAllocZ, srcAllocZ,
-                                  dstX, dstY, dstZ, dstF,
-                                  srcX, srcY, srcZ, srcF,
-                                  Nx, Ny, Nz, Nf );
-   }
-   else
-   {
-      for( uint_t f = 0; f < Nf; ++f )
-      {
-         copyDevToDevFZYXRestricted( dst, src,
-                                     typeSz, dstAllocZ, srcAllocZ,
-                                     dstX, dstY, dstZ, dstF + f,
-                                     srcX, srcY, srcZ, srcF + f,
-                                     Nx, Ny, Nz, 1 );
-      }
-   }
-}
-
-
-//*******************************************************************************************************************
-/*! Restricted version of copyHostToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ).
- * See copyHostToDevFZYX() for more details.
- *******************************************************************************************************************/
-void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf );
-
+ * \param dst           destination buffer
+ * \param src           source buffer
+ * \param dstOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer
+ * \param srcOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer
+ * \param dstAllocSizeZ allocation size in z direction of the destination buffer
+ * \param srcAllocSizeZ allocation size in z direction of the source buffer
+ * \param typeSize      size of an f element
+ * \param intervalSize  (x, y, z, f)-tuple containing the size of the interval to copy
+ * \param copyStream    CUDA stream; if not NULL, copy operations are performed asynchronously
+ *****************************************************************************************************************************/
+void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream );
+
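+// A minimal usage sketch (illustrative only; dstField, srcField and copyStream
+// are assumed to exist, with fzyx layout and double as the element type):
+//
+//    auto dstOff   = std::make_tuple( uint_c(1), uint_c(1), uint_c(1), uint_c(0) );
+//    auto srcOff   = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
+//    auto interval = std::make_tuple( uint_c(2), uint_c(2), uint_c(2), uint_c(19) );
+//    copyDevToDevFZYX( dstField->pitchedPtr(), srcField->pitchedPtr(), dstOff, srcOff,
+//                      dstField->zAllocSize(), srcField->zAllocSize(), sizeof(double),
+//                      interval, copyStream );
+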
+//****************************************************************************************************************************
+/*! Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout.
+ *
+ * \param dst           destination buffer
+ * \param src           source buffer
+ * \param dstOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer
+ * \param srcOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer
+ * \param dstAllocSizeY allocation size in y direction of the destination buffer
+ * \param srcAllocSizeY allocation size in y direction of the source buffer
+ * \param typeSize      size of an f element
+ * \param intervalSize  (x, y, z, f)-tuple containing the size of the interval to copy
+ * \param copyStream    CUDA stream; if not NULL, copy operations are performed asynchronously
+ *****************************************************************************************************************************/
+void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream );
 
 //*******************************************************************************************************************
 /*! Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. See copyDevToDevFZYX() for
  * parameter information.
  *******************************************************************************************************************/
-inline void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
-                               uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                               uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                               uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                               uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
-{
-   if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) )
-   {
-      copyHostToDevFZYXRestricted( dst, src,
-                                   typeSz, dstAllocZ, srcAllocZ,
-                                   dstX, dstY, dstZ, dstF,
-                                   srcX, srcY, srcZ, srcF,
-                                   Nx, Ny, Nz, Nf );
-   }
-   else
-   {
-      for( uint_t f = 0; f < Nf; ++f )
-      {
-         copyHostToDevFZYXRestricted( dst, src,
-                                      typeSz, dstAllocZ, srcAllocZ,
-                                      dstX, dstY, dstZ, dstF + f,
-                                      srcX, srcY, srcZ, srcF + f,
-                                      Nx, Ny, Nz, 1 );
-      }
-   }
-}
-
+void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
 //*******************************************************************************************************************
-/*! Restricted version of copyDevToHostFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ).
- * See copyDevToHostFZYX() for more details.
+/*! Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout. See copyDevToDevZYXF() for
+ * parameter information.
  *******************************************************************************************************************/
-void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf );
-
+void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
 //*******************************************************************************************************************
 /*! Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. See copyDevToDevFZYX() for
  * parameter information.
  *******************************************************************************************************************/
-inline void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
-                               uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                               uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                               uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                               uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
-{
-   if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) )
-   {
-      copyDevToHostFZYXRestricted( dst, src,
-                                   typeSz, dstAllocZ, srcAllocZ,
-                                   dstX, dstY, dstZ, dstF,
-                                   srcX, srcY, srcZ, srcF,
-                                   Nx, Ny, Nz, Nf );
-   }
-   else
-   {
-      for( uint_t f = 0; f < Nf; ++f )
-      {
-         copyDevToHostFZYXRestricted( dst, src,
-                                      typeSz, dstAllocZ, srcAllocZ,
-                                      dstX, dstY, dstZ, dstF + f,
-                                      srcX, srcY, srcZ, srcF + f,
-                                      Nx, Ny, Nz, 1 );
-      }
-   }
-}
-
+void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
+//*******************************************************************************************************************
+/*! Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout. See copyDevToDevZYXF() for
+ * parameter information.
+ *******************************************************************************************************************/
+void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
 } // namespace cuda
 } // namespace walberla
diff --git a/src/cuda/Kernel.h b/src/cuda/Kernel.h
index 3b6acf0a1fe2bb7956110236e2b68d07dcb5e4b3..59e588399da12888fe102685797ff6c67e9dde61 100644
--- a/src/cuda/Kernel.h
+++ b/src/cuda/Kernel.h
@@ -102,8 +102,8 @@ namespace cuda {
       template<typename T>  void addFieldIndexingParam( const T & indexing );
 
 
-      void configure( dim3 gridDim, dim3 blockDim );
-      void operator() () const;
+      void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 );
+      void operator() ( cudaStream_t stream = 0 ) const;
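+
+      // Usage sketch (myKernel, gridDim, blockDim and myStream are illustrative;
+      // shared memory size and stream both default to 0, i.e. the default stream):
+      //    auto kernel = make_kernel( &myKernel );
+      //    kernel.addParam( someDevicePointer );
+      //    kernel.configure( gridDim, blockDim, sharedMemBytes );
+      //    kernel( myStream ); // launch is enqueued asynchronously on myStream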
 
 
    protected:
@@ -118,6 +118,7 @@ namespace cuda {
       bool configured_;
       dim3 gridDim_;
       dim3 blockDim_;
+      std::size_t sharedMemSize_;
 
       struct ParamInfo {
          std::vector<char> data;
@@ -178,7 +179,8 @@ namespace cuda {
    template<typename FP>
    Kernel<FP>::Kernel( FP funcPtr )
       : funcPtr_ ( funcPtr ),
-        configured_( false )
+        configured_( false ),
+        sharedMemSize_( 0 )
    {}
 
    template<typename FP>
@@ -206,12 +208,13 @@ namespace cuda {
    }
 
    template<typename FP>
-   void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim )
+   void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize )
    {
       if ( ! configured_ )
       {
          gridDim_ = gridDim;
          blockDim_ = blockDim;
+         sharedMemSize_ = sharedMemSize;
          configured_ = true;
       }
       else
@@ -225,7 +228,7 @@ namespace cuda {
    }
 
    template<typename FP>
-   void Kernel<FP>::operator() () const
+   void Kernel<FP>::operator() ( cudaStream_t stream ) const
    {
       // check for correct number of parameter calls
 
@@ -235,7 +238,7 @@ namespace cuda {
       }
 
       // set the number of blocks and  threads,
-      WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_ ) ); //TODO extend class to support streams
+      WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_, sharedMemSize_, stream ) );
 
       // register all parameters
       for( auto paramIt = params_.begin(); paramIt != params_.end(); ++paramIt )  {
diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h
index b3124dd392ebf5f27a7a72734afa349aa480ec45..61c698d82e729add8ffe639cfb31184d7d2bf5f4 100644
--- a/src/cuda/communication/GPUPackInfo.h
+++ b/src/cuda/communication/GPUPackInfo.h
@@ -16,16 +16,32 @@
 //! \file GPUPackInfo.h
 //! \ingroup cuda
 //! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
 //======================================================================================================================
 
 #pragma once
 
-#include "cuda/GPUCopy.h"
-#include "communication/UniformPackInfo.h"
 #include "core/debug/Debug.h"
-#include "field/GhostRegions.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/BufferSizeTrait.h"
+
 #include "stencil/Directions.h"
 
+#include "field/GhostRegions.h"
+
+#include "communication/UniformPackInfo.h"
+
+#include "blockforest/Block.h"
+
+#include "cuda/ErrorChecking.h"
+#include "cuda/GPUCopy.h"
+#include "cuda/communication/PinnedMemoryBuffer.h"
+
+#include <cuda_runtime.h>
+#include <map>
+#include <vector>
+#include <tuple>
+
 
 namespace walberla {
 namespace cuda {
@@ -42,17 +58,49 @@ template<typename GPUField_T>
 class GPUPackInfo : public walberla::communication::UniformPackInfo
 {
 public:
-   typedef typename GPUField_T::value_type T;
+   typedef typename GPUField_T::value_type FieldType;
+
+   GPUPackInfo( const BlockDataID & bdId, cudaStream_t stream = 0 )
+   : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 )
+   {
+      streams_.push_back( stream );
+      copyAsync_ = (stream != 0);
+   }
+
+   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGhostLayers, cudaStream_t stream = 0 )
+   : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGhostLayers )
+   {
+      streams_.push_back( stream );
+      copyAsync_ = (stream != 0);
+   }
 
-   GPUPackInfo( const BlockDataID & bdId ) : bdId_( bdId ), communicateAllGhostLayers_( true ),
-            numberOfGhostLayers_( 0 ) {}
+   GPUPackInfo( const BlockDataID & bdId, std::vector< cudaStream_t > & streams )
+   : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 ), streams_( streams )
+   {
+      copyAsync_ = true;
+      for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt )
+         if ( *streamIt == 0 )
+         {
+            copyAsync_ = false;
+            break;
+         }
+   }
 
-   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers ) : bdId_( bdId ),
-            communicateAllGhostLayers_( false ), numberOfGhostLayers_(  numberOfGHostLayers ) {}
+   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGhostLayers, std::vector< cudaStream_t > & streams )
+   : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGhostLayers ), streams_( streams )
+   {
+      copyAsync_ = true;
+      for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt )
+         if ( *streamIt == 0 )
+         {
+            copyAsync_ = false;
+            break;
+         }
+   }
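+
+   // Construction sketch (fieldId and the stencil are illustrative; a stream
+   // value of 0 anywhere in the vector disables asynchronous copies):
+   //    std::vector< cudaStream_t > streams( stencil::D3Q27::Size );
+   //    for( auto & s : streams )
+   //       WALBERLA_CUDA_CHECK( cudaStreamCreate( &s ) );
+   //    GPUPackInfo< GPUFieldType > packInfo( fieldId, streams );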
 
    virtual ~GPUPackInfo() {}
 
-   bool constantDataExchange() const { return mpi::BufferSizeTrait<T>::constantSize; }
+   bool constantDataExchange() const { return mpi::BufferSizeTrait<FieldType>::constantSize; }
    bool threadsafeReceiving()  const { return true; }
 
    void unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer);
@@ -64,38 +112,73 @@ protected:
 
    uint_t numberOfGhostLayersToCommunicate( const GPUField_T * const field ) const;
 
+   inline cudaStream_t & getStream( stencil::Direction dir ) { return streams_[ dir % streams_.size() ]; }
+
    const BlockDataID bdId_;
    bool   communicateAllGhostLayers_;
    uint_t numberOfGhostLayers_;
+   std::vector< cudaStream_t > streams_;
+   bool copyAsync_;
+   std::map< stencil::Direction, PinnedMemoryBuffer > pinnedRecvBuffers_;
+   mutable std::map< stencil::Direction, PinnedMemoryBuffer > pinnedSendBuffers_;
 };
 
 
 template<typename GPUField_T>
 void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer)
 {
-   GPUField_T * f = receiver->getData< GPUField_T >( bdId_ );
-   WALBERLA_ASSERT_NOT_NULLPTR(f);
+   GPUField_T * fieldPtr = receiver->getData< GPUField_T >( bdId_ );
+   WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr);
+
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
 
-   if ( f->layout() != fzyx )
+   CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false );
+
+   uint_t nrOfBytesToRead = fieldCi.numCells() * fieldPtr->fSize() * sizeof(FieldType);
+
+   unsigned char * bufPtr = buffer.skip(nrOfBytesToRead);
+
+   unsigned char * copyBufferPtr = bufPtr;
+   if ( copyAsync_ )
    {
-      WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" );
+      PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir];
+      copyBufferPtr = pinnedBuffer.resize( nrOfBytesToRead );
+      // Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU
+      std::copy( bufPtr, bufPtr + nrOfBytesToRead, copyBufferPtr );
    }
 
-   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) );
+   cudaStream_t & unpackStream = getStream(dir);
 
-   CellInterval ci = field::getGhostRegion( *f, dir, nrOfGhostLayers, false );
+   auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.yMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+   auto srcOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
 
-   uint_t nrOfBytesToRead = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T);
+   auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
+                                        fieldPtr->fSize() );
 
-   unsigned char * buf = buffer.skip(nrOfBytesToRead);
+   if ( fieldPtr->layout() == fzyx )
+   {
+      const uint_t dstAllocSizeZ = fieldPtr->zAllocSize();
+      const uint_t srcAllocSizeZ = fieldCi.zSize();
+      copyHostToDevFZYX( fieldPtr->pitchedPtr(), copyBufferPtr, dstOffset, srcOffset,
+                         dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType),
+                         intervalSize, unpackStream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = fieldPtr->yAllocSize();
+      const uint_t srcAllocSizeY = fieldCi.ySize();
+      copyHostToDevZYXF( fieldPtr->pitchedPtr(), copyBufferPtr, dstOffset, srcOffset,
+                         dstAllocSizeY, srcAllocSizeY, sizeof(FieldType),
+                         intervalSize, unpackStream );
+   }
 
-   copyHostToDevFZYX( f->pitchedPtr(), buf,
-                      sizeof(T), f->zAllocSize(), ci.zSize(),
-                      uint_c(ci.xMin() + nrOfGhostLayers),
-                      uint_c(ci.yMin() + nrOfGhostLayers),
-                      uint_c(ci.zMin() + nrOfGhostLayers), 0,
-                      0, 0, 0, 0,
-                      ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() );
+   if ( copyAsync_ )
+   {
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( unpackStream ) );
+   }
 }
 
 
@@ -105,55 +188,116 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r
    const GPUField_T * sf = sender  ->getData< GPUField_T >( bdId_ );
          GPUField_T * rf = receiver->getData< GPUField_T >( bdId_ );
 
-   if ( sf->layout() != fzyx || rf->layout() != fzyx )
-   {
-      WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" );
-   }
+   WALBERLA_ASSERT_NOT_NULLPTR( sf );
+   WALBERLA_ASSERT_NOT_NULLPTR( rf );
 
    WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize());
    WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize());
    WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize());
    WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize());
 
+   WALBERLA_CHECK( sf->layout() == rf->layout(), "GPUPackInfo::communicateLocal: fields must have the same layout!" );
+
    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( sf ) );
 
    CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false );
    CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false );
 
-   copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(),
-                     sizeof(T), rf->zAllocSize(), sf->zAllocSize(),
-                     uint_c(rCi.xMin() + nrOfGhostLayers), uint_c(rCi.yMin() + nrOfGhostLayers), uint_c(rCi.zMin() + nrOfGhostLayers), 0,
-                     uint_c(sCi.xMin() + nrOfGhostLayers), uint_c(sCi.yMin() + nrOfGhostLayers), uint_c(sCi.zMin() + nrOfGhostLayers), 0,
-                     rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() );
+   cudaStream_t & commStream = getStream(dir);
+
+   auto dstOffset = std::make_tuple( uint_c(rCi.xMin() + nrOfGhostLayers),
+                                     uint_c(rCi.yMin() + nrOfGhostLayers),
+                                     uint_c(rCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+
+   auto srcOffset = std::make_tuple( uint_c(sCi.xMin() + nrOfGhostLayers),
+                                     uint_c(sCi.yMin() + nrOfGhostLayers),
+                                     uint_c(sCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+
+   auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() );
+
+   if ( sf->layout() == fzyx )
+   {
+      const uint_t dstAllocSizeZ = rf->zAllocSize();
+      const uint_t srcAllocSizeZ = sf->zAllocSize();
+
+      copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(), dstOffset, srcOffset,
+                        dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType),
+                        intervalSize, commStream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = rf->yAllocSize();
+      const uint_t srcAllocSizeY = sf->yAllocSize();
+
+      copyDevToDevZYXF( rf->pitchedPtr(), sf->pitchedPtr(), dstOffset, srcOffset,
+                        dstAllocSizeY, srcAllocSizeY, sizeof(FieldType),
+                        intervalSize, commStream );
+   }
+
+   if ( copyAsync_ )
+   {
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( commStream ) );
+   }
 }
 
 
 template<typename GPUField_T>
 void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const
 {
-   const GPUField_T * f = sender->getData< GPUField_T >( bdId_ );
-   WALBERLA_ASSERT_NOT_NULLPTR(f);
+   const GPUField_T * fieldPtr = sender->getData< GPUField_T >( bdId_ );
+   WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr);
+
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
+
+   CellInterval fieldCi = field::getSliceBeforeGhostLayer( *fieldPtr, dir, nrOfGhostLayers, false );
+
+   const std::size_t nrOfBytesToPack = fieldCi.numCells() * fieldPtr->fSize() * sizeof(FieldType);
+
+   unsigned char * outBufferPtr = outBuffer.forward( nrOfBytesToPack );
+
+   const cudaStream_t & packStream = streams_[dir % streams_.size()];
 
-   if ( f->layout() != fzyx )
+   unsigned char * copyBufferPtr = outBufferPtr;
+   if ( copyAsync_ )
    {
-      WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" );
+      PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir];
+      copyBufferPtr = pinnedBuffer.resize( nrOfBytesToPack );
    }
 
-   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) );
+   auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
+   auto srcOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.yMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
 
-   CellInterval ci = field::getSliceBeforeGhostLayer( *f, dir, nrOfGhostLayers, false );
+   auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
+                                        fieldPtr->fSize() );
 
-   uint_t nrOfBytesToPack = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T);
+   if ( fieldPtr->layout() == fzyx )
+   {
+      const uint_t dstAllocSizeZ = fieldCi.zSize();
+      const uint_t srcAllocSizeZ = fieldPtr->zAllocSize();
+      copyDevToHostFZYX( copyBufferPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset,
+                         dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType),
+                         intervalSize, packStream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = fieldCi.ySize();
+      const uint_t srcAllocSizeY = fieldPtr->yAllocSize();
+      copyDevToHostZYXF( copyBufferPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset,
+                         dstAllocSizeY, srcAllocSizeY, sizeof(FieldType),
+                         intervalSize, packStream );
+   }
 
-   unsigned char * buf = outBuffer.forward( nrOfBytesToPack );
+   if ( copyAsync_ )
+   {
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( packStream ) );
 
-   copyDevToHostFZYX( buf, f->pitchedPtr(),
-                      sizeof(T), ci.zSize(), f->zAllocSize(),
-                      0, 0, 0, 0,
-                      uint_c(ci.xMin() + nrOfGhostLayers),
-                      uint_c(ci.yMin() + nrOfGhostLayers),
-                      uint_c(ci.zMin() + nrOfGhostLayers), 0,
-                      ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() );
+      std::copy( copyBufferPtr, copyBufferPtr + nrOfBytesToPack, outBufferPtr );
+   }
 }
 
 
diff --git a/src/cuda/communication/PinnedMemoryBuffer.h b/src/cuda/communication/PinnedMemoryBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..702e71bd69aa9bfa7858e3595330b092edfe65dc
--- /dev/null
+++ b/src/cuda/communication/PinnedMemoryBuffer.h
@@ -0,0 +1,123 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PinnedMemoryBuffer.h
+//! \ingroup cuda
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
+//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies.
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "cuda/ErrorChecking.h"
+
+#include <algorithm>
+#include <cuda_runtime.h>
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+template< typename T = unsigned char >
+class GenericPinnedMemoryBuffer
+{
+public:
+   typedef T ElementType;
+
+   GenericPinnedMemoryBuffer();
+
+   GenericPinnedMemoryBuffer( std::size_t initSize );
+
+   GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb );
+
+   ~GenericPinnedMemoryBuffer();
+
+   inline T* ptr() const { return data_; }
+
+   inline T* resize( std::size_t newSize );
+
+   inline std::size_t size() const { return size_; }
+
+   GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete;
+private:
+   T * data_;
+   std::size_t size_;
+};
+
+typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer;
+
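+// Usage sketch (sizes are illustrative): stage host data for asynchronous
+// transfers, e.g. with cudaMemcpyAsync or cudaMemcpy3DAsync.
+//
+//    PinnedMemoryBuffer staging;
+//    unsigned char * p = staging.resize( 4096 ); // pinned pointer to >= 4096 bytes
+//    // fill p, then pass it to an asynchronous copy on a stream
+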
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer()
+   : data_(nullptr), size_(0)
+{
+}
+
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize )
+   : data_(nullptr), size_(initSize)
+{
+   if (initSize > 0)
+   {
+      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, size_ * sizeof(T) ) );
+   }
+}
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb )
+   : data_(nullptr), size_(pb.size_)
+{
+   if ( pb.size_ > 0 )
+   {
+      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) );
+
+      std::copy( pb.data_, pb.data_ + pb.size_, data_ );
+   }
+}
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer()
+{
+   if ( data_ != nullptr )
+   {
+      WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) );
+   }
+}
+
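+// Grows the buffer to at least newSize elements. Previous contents are not
+// preserved: the buffer serves purely as staging storage for transfers.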
+template< typename T >  // Element type
+T * GenericPinnedMemoryBuffer<T>::resize(std::size_t newSize)
+{
+   if ( newSize > size_ )
+   {
+      T * newBegin;
+      WALBERLA_CUDA_CHECK( cudaMallocHost( &newBegin, newSize * sizeof(T) ) );
+
+      std::swap( data_, newBegin );
+      if ( newBegin != nullptr )
+      {
+         WALBERLA_CUDA_CHECK( cudaFreeHost( newBegin ) );
+      }
+
+      size_ = newSize;
+   }
+
+   return data_;
+}
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 7cde4254bbfbe376d04d666d23fdfb81b73f11eb..aa82a445f73db26c8cb7312ab0445a8eefe6b5bc 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -7,6 +7,9 @@
 waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest )
 waLBerla_execute_test( NAME  GPUPackInfoTest )
 
+waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
+waLBerla_execute_test( NAME  GPUPackInfoCommunicationTest )
+
 waLBerla_compile_test( FILES FieldTransferTest )
 waLBerla_execute_test( NAME  FieldTransferTest )
 
diff --git a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3f19d085012b02e8947d2d5100a7c2e3ad1dfb6
--- /dev/null
+++ b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp
@@ -0,0 +1,187 @@
+//========================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GPUPackInfoCommunicationTest.cpp
+//! \ingroup cuda
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
+//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple streams.
+//
+//========================================================================================================================
+
+#include "core/DataTypes.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/Random.h"
+#include "core/mpi/Environment.h"
+
+#include "stencil/Directions.h"
+#include "stencil/Iterator.h"
+#include "stencil/D3Q27.h"
+
+#include "domain_decomposition/BlockDataID.h"
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "field/GhostLayerField.h"
+
+#include "cuda/ErrorChecking.h"
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/GPUField.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/communication/GPUPackInfo.h"
+
+#include <cuda_runtime.h>
+#include <vector>
+
+using namespace walberla;
+
+using DataType = walberla::uint_t;
+using StencilType = stencil::D3Q27;
+using FieldType = field::GhostLayerField< DataType, StencilType::Size >;
+using GPUFieldType = cuda::GPUField< DataType >;
+using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>;
+using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >;
+
+static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::zyxf };
+static uint_t fieldLayoutIndex = 0;
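+// Global, so that the block data creation callbacks below pick up the layout currently under test.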
+
+
+FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage )
+{
+   return new FieldType(
+            storage->getNumberOfXCells( *block ),   // number of cells in x direction per block
+            storage->getNumberOfYCells( *block ),   // number of cells in y direction per block
+            storage->getNumberOfZCells( *block ),   // number of cells in z direction per block
+            1,                                      // one ghost layer
+            DataType(0),                            // initial value
+            fieldLayouts[fieldLayoutIndex],         // layout
+            make_shared<cuda::HostFieldAllocator< DataType > >() // allocator for host pinned memory
+            );
+}
+
+
+GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
+{
+   return new GPUFieldType(
+            storage->getNumberOfXCells( *block ), // number of cells in x direction
+            storage->getNumberOfYCells( *block ), // number of cells in y direction
+            storage->getNumberOfZCells( *block ), // number of cells in z direction
+            StencilType::Size,                    // number of cells for pdfs
+            1,                                    // one ghost layer
+            fieldLayouts[fieldLayoutIndex] );
+}
+
+
+void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID )
+{
+   for( auto block = blocks->begin(); block != blocks->end(); ++block )
+   {
+      auto fieldPtr = block->getData< FieldType >( fieldID );
+
+      for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt )
+         *fieldIt = math::intRandom< DataType >();
+   }
+}
+
+
+int main( int argc, char ** argv )
+{
+   debug::enterTestMode();
+   mpi::Environment mpiEnv( argc, argv );
+
+   std::vector< cudaStream_t > streams;
+
+   for( uint_t i = 0; i < StencilType::Size; ++i )
+   {
+      cudaStream_t stream(nullptr);
+      WALBERLA_CUDA_CHECK( cudaStreamCreate(&stream) );
+      streams.push_back( stream );
+   }
+
+   const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 );
+
+   uint_t nProc = uint_c( MPIManager::instance()->numProcesses() );
+
+   for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
+   {
+      auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1,                  // blocks
+                                                        cells[0], cells[1], cells[2], // cells
+                                                        1,                            // unit cell spacing
+                                                        true,                         // one block per process
+                                                        true, true, true);            // periodic in all directions
+
+      BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField,
+                                                                               "ScalarField" );
+
+      BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
+                                                                                   "syncGPUField" );
+
+      BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
+                                                                                    "asyncGPUField" );
+
+      math::seedRandomGenerator( numeric_cast<boost::mt19937::result_type>( MPIManager::instance()->rank() ) );
+      // Initialize CPU field with random values
+      initFields( blocks, sourceFieldId );
+
+      // Copy same CPU field to both GPU fields
+      for( auto block = blocks->begin(); block != blocks->end(); ++block )
+      {
+         auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId );
+
+         auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
+         cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr );
+
+         auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
+         cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr );
+      }
+
+      // Setup communication schemes for synchronous GPUPackInfo
+      CommSchemeType syncCommScheme(blocks);
+      syncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( syncGPUFieldId ) );
+
+      // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams
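+      // (pack and unpack copies are issued on the streams passed in, rather than on the default stream)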
+      CommSchemeType asyncCommScheme(blocks);
+      asyncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( asyncGPUFieldId, streams ) );
+
+      // Perform one communication step for each scheme
+      syncCommScheme();
+      asyncCommScheme();
+
+      // Check results
+      FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
+                              make_shared< cuda::HostFieldAllocator< DataType > >() );
+      FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
+                               make_shared< cuda::HostFieldAllocator< DataType > >() );
+
+      for( auto block = blocks->begin(); block != blocks->end(); ++block )
+      {
+         auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
+         cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr );
+
+         auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
+         cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr );
+
+         for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ();
+                  syncIt != syncFieldCpu.end();
+                  ++syncIt, ++asyncIt )
+            WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt );
+      }
+   }
+
+   for( uint_t i = 0; i < StencilType::Size; ++i )
+      WALBERLA_CUDA_CHECK( cudaStreamDestroy(streams[i]) );
+
+   return EXIT_SUCCESS;
+}
diff --git a/tests/cuda/communication/GPUPackInfoTest.cpp b/tests/cuda/communication/GPUPackInfoTest.cpp
index 28d3def70cd23dd3a69c6b3758a3e58f260684cf..29fc84f259cb66a1928337fe85c467a83ae2c6b6 100644
--- a/tests/cuda/communication/GPUPackInfoTest.cpp
+++ b/tests/cuda/communication/GPUPackInfoTest.cpp
@@ -34,12 +34,15 @@
 #include "stencil/D3Q27.h"
 
 #include <cstring>
+#include <vector>
+#include <cuda_runtime.h>
 
 #define F_SIZE    19
 
-
 using namespace walberla;
 
+static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf };
+static uint_t fieldLayoutIndex = 0;
 
 cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
 {
@@ -49,7 +52,7 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag
             storage->getNumberOfZCells( *block ), // number of cells in z direction
             F_SIZE,                               // fSize
             1,                                    // number of ghost layers
-            field::fzyx );
+            fieldLayouts[fieldLayoutIndex] );
 }
 
 // Tester base class. The communicate() template method allows testing different communication methods.
@@ -59,7 +62,9 @@ public:
 
    typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType;
 
-   GPUPackInfoTester( IBlock* block, BlockDataID fieldId ): block_( block ), fieldId_( fieldId ) {}
+   GPUPackInfoTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) :
+      block_( block ), fieldId_( fieldId ), streams_(streams) {}
+
    virtual ~GPUPackInfoTester() {}
 
    void test( stencil::Direction dir )
@@ -72,7 +77,7 @@ public:
                gpuField.zSize(),       // number of cells in z direction
                1,                      // number of ghost layers
                0,                      // initial value
-               field::fzyx);
+               fieldLayouts[fieldLayoutIndex]);
       cpuField.setWithGhostLayer( 0 );
 
       int val = 0;
@@ -82,7 +87,7 @@ public:
       }
       cuda::fieldCpy( gpuField, cpuField );
 
-      GPUPackInfoType gpuPackInfo( fieldId_ );
+      GPUPackInfoType gpuPackInfo( fieldId_, streams_ );
 
       communicate( gpuPackInfo, dir );
       cuda::fieldCpy( cpuField, gpuField );
@@ -101,6 +106,7 @@ protected:
 
    IBlock* block_;
    BlockDataID fieldId_;
+   std::vector< cudaStream_t > streams_;
 };
 
 
@@ -108,7 +114,7 @@ protected:
 class GPUPackInfoBufferTester: public GPUPackInfoTester
 {
 public:
-   GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
+   GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams): GPUPackInfoTester( block, fieldId, streams ) {}
 
 protected:
    void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
@@ -134,7 +140,7 @@ protected:
 class GPUPackInfoLocalTester: public GPUPackInfoTester
 {
 public:
-   GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
+   GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ): GPUPackInfoTester( block, fieldId, streams ) {}
 
 protected:
    void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
@@ -151,27 +157,42 @@ int main(int argc, char **argv)
    debug::enterTestMode();
    MPIManager::instance()->initializeMPI(&argc,&argv);
 
-   // Create BlockForest
-   uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
-   auto blocks = createUniformBlockGrid(processes,1,1,  //blocks
-                                        2,2,2,          //cells
-                                        1,              //dx
-                                        false,          //one block per process
-                                        true,true,true);//periodicity
-
-
-   BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
-           &createGPUField, "ScalarGPUField" );
-
-   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+   for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
    {
-      GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId );
-      GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId );
+      std::vector< cudaStream_t > streams;
+      for( uint_t s = 0; s < stencil::D3Q27::Size; ++s )
+      {
+         cudaStream_t stream(nullptr);
+         WALBERLA_CUDA_CHECK( cudaStreamCreate( &stream ) );
+         streams.push_back( stream );
+      }
+      // Create BlockForest
+      uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
+      auto blocks = createUniformBlockGrid(processes,1,1,  //blocks
+                                           2,2,2,          //cells
+                                           1,              //dx
+                                           false,          //one block per process
+                                           true,true,true);//periodicity
+
+      BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
+              &createGPUField, "ScalarGPUField" );
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId, streams );
+         GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId, streams );
+
+         for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
+         {
+            localTester.test( *dir );
+            bufferTester.test( *dir );
+         }
+      }
 
-      for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
+      for( auto streamIt = streams.begin(); streamIt != streams.end(); ++streamIt )
       {
-         localTester.test( *dir );
-         bufferTester.test( *dir );
+         cudaStream_t & stream = *streamIt;
+         WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream ) );
       }
    }