From 6bfe8c59fc5fc53101476d4acdd865ee54da3d73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Victor=20Tozatti=20Risso?=
 <joaovictortr@protonmail.com>
Date: Tue, 5 Dec 2017 13:52:54 +0100
Subject: [PATCH] GPUPackInfo: add asynchronous (un)packing capabilities
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces the following changes:

- CUDA streams: Add support for asynchronous (un)packing operations using CUDA
  streams in cuda::communication::GPUPackInfo. Asynchronous operations make it
  possible to overlap GPU computation and MPI communication, e.g. in LBM
  simulations. Since asynchronous copies in CUDA require page-locked (pinned)
  memory on the host, a staging buffer (cuda::communication::PinnedMemoryBuffer)
  is introduced in the cuda module to stage data between the GPU and the MPI
  buffers. A usage sketch is given below, after this list.

- zyxf layout: Add zyxf field layout support in GPUPackInfo through extensions
  of the functions in cuda::GPUCopy.

- Extended GPUPackInfo test: Add stream and zyxf layout cases to the
  GPUPackInfoTest to exercise the proposed implementation.

- Extended Kernel: Add CUDA stream and shared memory configuration support to
  the cuda::Kernel class. A usage sketch is given in the note below the "---"
  separator.

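The following sketch illustrates the intended asynchronous usage. It reuses
the type aliases of the new GPUPackInfoCommunicationTest (StencilType,
CommSchemeType, GPUPackInfoType); the block storage "blocks" and the field ID
"gpuFieldID" are placeholders, and stream setup is shortened:

   std::vector< cudaStream_t > streams( StencilType::Size );
   for( auto & stream : streams )
      WALBERLA_CUDA_CHECK( cudaStreamCreate( &stream ) );

   // GPUPackInfo cycles through the given streams per communication
   // direction, so (un)packing copies can overlap with MPI transfers.
   CommSchemeType scheme( blocks );
   scheme.addPackInfo( make_shared< GPUPackInfoType >( gpuFieldID, streams ) );
   scheme();  // communicate; each copy is synchronized on its stream before
              // the corresponding MPI buffer is handed over or reused

   for( auto & stream : streams )
      WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream ) );
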
Signed-off-by: João Victor Tozatti Risso <joaovictortr@protonmail.com>
---
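Note (below the "---" marker and thus not part of the commit message): a
minimal sketch of the extended cuda::Kernel interface. It assumes an existing
__global__ function "myKernel", a prepared field indexing object "indexing",
grid/block dimensions, and an already created "stream"; only the
configure()/operator() extensions come from this patch:

   auto kernel = cuda::make_kernel( &myKernel );
   kernel.addFieldIndexingParam( indexing );
   // New: optional dynamic shared memory size (defaults to 0) ...
   kernel.configure( gridDim, blockDim, sharedMemSize );
   // ... and an optional stream; kernel() keeps the old synchronous default.
   kernel( stream );
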
 src/cuda/GPUCopy.cpp                          | 458 ++++++++++++++----
 src/cuda/GPUCopy.h                            | 211 +++-----
 src/cuda/Kernel.h                             |  15 +-
 src/cuda/communication/GPUPackInfo.h          | 240 +++++++--
 src/cuda/communication/PinnedMemoryBuffer.h   | 123 +++++
 tests/cuda/CMakeLists.txt                     |   3 +
 .../GPUPackInfoCommunicationTest.cpp          | 187 +++++++
 tests/cuda/communication/GPUPackInfoTest.cpp  |  71 ++-
 8 files changed, 988 insertions(+), 320 deletions(-)
 create mode 100644 src/cuda/communication/PinnedMemoryBuffer.h
 create mode 100644 tests/cuda/communication/GPUPackInfoCommunicationTest.cpp

diff --git a/src/cuda/GPUCopy.cpp b/src/cuda/GPUCopy.cpp
index e1d6c86e7..3c1c8f126 100644
--- a/src/cuda/GPUCopy.cpp
+++ b/src/cuda/GPUCopy.cpp
@@ -1,21 +1,22 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \file GPUCopy.cpp
 //! \ingroup cuda
 //! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
 //! \brief Copy routines of 4D intervals involving GPU buffers.
 //
 //======================================================================================================================
@@ -29,116 +30,365 @@
 namespace walberla {
 namespace cuda {
 
+void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream )
+{
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
+      WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) );
+
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
 
-void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
-                                 uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                 uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                 uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                 uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
+      p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
+      p.kind = cudaMemcpyDeviceToDevice;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
+   {
+      copyFunctor( dstF, srcF, Nf );
+   }
+   else
+   {
+      for( uint_t f = 0; f < Nf; ++f )
+      {
+         copyFunctor( dstF + f, srcF + f, uint_c(1) );
+      }
+   }
+}
+
+
+void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream )
 {
-   WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) );
-
-   cudaMemcpy3DParms p;
-   memset( &p, 0, sizeof(p) );
-
-   p.srcPos.x = srcX * typeSz;
-   p.srcPos.y = srcY;
-   p.srcPos.z = srcF * srcAllocZ + srcZ;
-   p.srcPtr.ptr = src.ptr;
-   p.srcPtr.pitch = src.pitch;
-   p.srcPtr.xsize = src.xsize;
-   p.srcPtr.ysize = src.ysize;
-
-   p.dstPos.x = dstX * typeSz;
-   p.dstPos.y = dstY;
-   p.dstPos.z = dstF * dstAllocZ + dstZ;
-   p.dstPtr.ptr = dst.ptr;
-   p.dstPtr.pitch = dst.pitch;
-   p.dstPtr.xsize = dst.xsize;
-   p.dstPtr.ysize = dst.ysize;
-
-   p.extent.width = Nx * typeSz;
-   p.extent.height = Ny;
-   p.extent.depth = Nz * Nf;
-
-   p.kind = cudaMemcpyDeviceToDevice;
-
-   WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
+
+      p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
+      p.kind = cudaMemcpyDeviceToDevice;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
+   {
+      copyFunctor( dstZ, srcZ, Nz );
+   }
+   else
+   {
+      for( uint_t z = 0; z < Nz; ++z )
+      {
+         copyFunctor( dstZ + z, srcZ + z, 1 );
+      }
+   }
 }
 
 
-void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
+void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
 {
-   WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) );
-
-   cudaMemcpy3DParms p;
-   memset( &p, 0, sizeof(p) );
-
-   p.srcPos.x = srcX * typeSz;
-   p.srcPos.y = srcY;
-   p.srcPos.z = srcF * srcAllocZ + srcZ;
-   p.srcPtr.ptr = src;
-   p.srcPtr.pitch = Nx * typeSz;
-   p.srcPtr.xsize = Nx * typeSz;
-   p.srcPtr.ysize = Ny;
-
-   p.dstPos.x = dstX * typeSz;
-   p.dstPos.y = dstY;
-   p.dstPos.z = dstF * dstAllocZ + dstZ;
-   p.dstPtr.ptr = dst.ptr;
-   p.dstPtr.pitch = dst.pitch;
-   p.dstPtr.xsize = dst.xsize;
-   p.dstPtr.ysize = dst.ysize;
-
-   p.extent.width = Nx * typeSz;
-   p.extent.height = Ny;
-   p.extent.depth = Nz * Nf;
-
-   p.kind = cudaMemcpyHostToDevice;
-
-   WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
+      p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny );
+
+      p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
+      p.kind = cudaMemcpyHostToDevice;
+
+      if (copyStream == 0)
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if ( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
+   {
+      copyFunctor( dstF, srcF, Nf );
+   }
+   else
+   {
+      for( uint_t f = 0; f < Nf; ++f )
+      {
+         copyFunctor( dstF + f, srcF + f, uint_c(1) );
+      }
+   }
 }
 
+void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
+{
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
 
-void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
+      p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx );
+
+      p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
+      p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
+
+      p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
+      p.kind = cudaMemcpyHostToDevice;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
+   {
+      copyFunctor( dstZ, srcZ, Nz );
+   }
+   else
+   {
+      for( uint_t z = 0; z < Nz; ++z )
+      {
+         copyFunctor( dstZ + z, srcZ + z, 1 );
+      }
+   }
+}
+
+
+void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
 {
-   cudaMemcpy3DParms p;
-   memset( &p, 0, sizeof(p) );
-
-   p.srcPos.x = srcX * typeSz;
-   p.srcPos.y = srcY;
-   p.srcPos.z = srcF * srcAllocZ + srcZ;
-   p.srcPtr.ptr = src.ptr;
-   p.srcPtr.pitch = src.pitch;
-   p.srcPtr.xsize = src.xsize;
-   p.srcPtr.ysize = src.ysize;
-
-   p.dstPos.x = dstX * typeSz;
-   p.dstPos.y = dstY;
-   p.dstPos.z = dstF * dstAllocZ + dstZ;
-   p.dstPtr.ptr = dst;
-   p.dstPtr.pitch = Nx * typeSz;
-   p.dstPtr.xsize = Nx * typeSz;
-   p.dstPtr.ysize = Ny;
-
-   p.extent.width = Nx * typeSz;
-   p.extent.height = Ny;
-   p.extent.depth = Nz * Nf;
-
-   p.kind = cudaMemcpyDeviceToHost;
-
-   WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
+
+      p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
+      p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny );
+
+      p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
+      p.kind = cudaMemcpyDeviceToHost;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
+   {
+      copyFunctor( dstF, srcF, Nf );
+   }
+   else
+   {
+      for( uint_t f = 0; f < Nf; ++f )
+      {
+         copyFunctor( dstF + f, srcF + f, 1 );
+      }
+   }
 }
 
 
+void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream )
+{
+   const uint_t & Nx = std::get<0>(intervalSize),
+                & Ny = std::get<1>(intervalSize),
+                & Nz = std::get<2>(intervalSize),
+                & Nf = std::get<3>(intervalSize);
+
+   const uint_t & srcX = std::get<0>(srcOffset),
+                & srcY = std::get<1>(srcOffset),
+                & srcZ = std::get<2>(srcOffset),
+                & srcF = std::get<3>(srcOffset);
+
+   const uint_t & dstX = std::get<0>(dstOffset),
+                & dstY = std::get<1>(dstOffset),
+                & dstZ = std::get<2>(dstOffset),
+                & dstF = std::get<3>(dstOffset);
+
+   auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
+      cudaMemcpy3DParms p;
+      memset( &p, 0, sizeof(p) );
+
+      p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
+      p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
+
+      p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
+      p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx );
+
+      p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
+      p.kind = cudaMemcpyDeviceToHost;
+
+      if ( copyStream == 0 )
+      {
+         WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
+      }
+      else
+      {
+         // Using cudaMemcpy3DAsync requires page-locked memory on the host!
+         WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
+      }
+   };
+
+   if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
+   {
+      copyFunctor( dstZ, srcZ, Nz );
+   }
+   else
+   {
+      for( uint_t z = 0; z < Nz; ++z )
+      {
+         copyFunctor( dstZ + z, srcZ + z, 1 );
+      }
+   }
+}
 
 } // namespace cuda
 } // namespace walberla
diff --git a/src/cuda/GPUCopy.h b/src/cuda/GPUCopy.h
index 7c6c69a79..a854068cd 100644
--- a/src/cuda/GPUCopy.h
+++ b/src/cuda/GPUCopy.h
@@ -1,21 +1,22 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \file GPUCopy.h
 //! \ingroup cuda
 //! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
 //! \brief Copy routines of 4D intervals involving GPU buffers.
 //
 //======================================================================================================================
@@ -24,6 +25,7 @@
 
 #include "core/DataTypes.h"
 
+#include <tuple>
 #include <cuda_runtime.h>
 
 
@@ -31,152 +33,87 @@ namespace walberla {
 namespace cuda {
 
 
-//*******************************************************************************************************************
-/*! Restricted version of copyDevToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ).
- * See copyDevToDevFZYX() for more details.
- *******************************************************************************************************************/
-void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
-                                 uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                 uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                 uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                 uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf );
-
-
-//*******************************************************************************************************************
+//****************************************************************************************************************************
 /*! Copy a 4D interval of a device buffer to another device buffer with fzyx memory layout.
  *
- * \param dst        destination buffer
- * \param src        source buffer
- * \param typeSz     size of an f element
- * \param dstAllocZ  allocation size in z direction of the destination buffer
- * \param srcAllocZ  allocation size in z direction of the source buffer
- * \param dstX       x coordinate of the interval start point in the destination buffer
- * \param dstY       y coordinate of the interval start point in the destination buffer
- * \param dstZ       z coordinate of the interval start point in the destination buffer
- * \param dstF       f coordinate of the interval start point in the destination buffer
- * \param srcX       x coordinate of the interval start point in the source buffer
- * \param srcY       y coordinate of the interval start point in the source buffer
- * \param srcZ       z coordinate of the interval start point in the source buffer
- * \param srcF       f coordinate of the interval start point in the source buffer
- * \param Nx         interval size in x direction
- * \param Ny         interval size in y direction
- * \param Nz         interval size in z direction
- * \param Nf         interval size in f direction
- *******************************************************************************************************************/
-inline void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
-                              uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                              uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                              uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                              uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
-{
-   if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) )
-   {
-      copyDevToDevFZYXRestricted( dst, src,
-                                  typeSz, dstAllocZ, srcAllocZ,
-                                  dstX, dstY, dstZ, dstF,
-                                  srcX, srcY, srcZ, srcF,
-                                  Nx, Ny, Nz, Nf );
-   }
-   else
-   {
-      for( uint_t f = 0; f < Nf; ++f )
-      {
-         copyDevToDevFZYXRestricted( dst, src,
-                                     typeSz, dstAllocZ, srcAllocZ,
-                                     dstX, dstY, dstZ, dstF + f,
-                                     srcX, srcY, srcZ, srcF + f,
-                                     Nx, Ny, Nz, 1 );
-      }
-   }
-}
-
-
-//*******************************************************************************************************************
-/*! Restricted version of copyHostToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ).
- * See copyHostToDevFZYX() for more details.
- *******************************************************************************************************************/
-void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf );
-
+ * \param dst           destination buffer
+ * \param src           source buffer
+ * \param dstOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer
+ * \param srcOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer
+ * \param dstAllocSizeZ allocation size in z direction of the destination buffer
+ * \param srcAllocSizeZ allocation size in z direction of the source buffer
+ * \param typeSize      size of an f element
+ * \param intervalSize  (x, y, z, f)-tuple containing the size of the interval to copy
+ * \param copyStream    CUDA stream; if not NULL, copy operations are performed asynchronously
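+ *
+ * A call sketch (illustrative only, not taken from the sources below), copying
+ * a 4x4x4xNf interval between two GPUField<double> instances synchronously:
+ * \code
+ * auto zero   = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
+ * auto extent = std::make_tuple( uint_c(4), uint_c(4), uint_c(4), nf );
+ * copyDevToDevFZYX( dstField.pitchedPtr(), srcField.pitchedPtr(), zero, zero,
+ *                   dstField.zAllocSize(), srcField.zAllocSize(), sizeof(double),
+ *                   extent, 0 );  // stream 0 selects the synchronous cudaMemcpy3D path
+ * \endcode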
+ *****************************************************************************************************************************/
+void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream );
+
+//****************************************************************************************************************************
+/*! Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout.
+ *
+ * \param dst           destination buffer
+ * \param src           source buffer
+ * \param dstOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer
+ * \param srcOffset     (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer
+ * \param dstAllocSizeY allocation size in y direction of the destination buffer
+ * \param srcAllocSizeY allocation size in y direction of the source buffer
+ * \param typeSize      size of an f element
+ * \param intervalSize  (x, y, z, f)-tuple containing the size of the interval to copy
+ * \param copyStream    CUDA stream; if not NULL, copy operations are performed asynchronously
+ *****************************************************************************************************************************/
+void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                       uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                       std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                       cudaStream_t copyStream );
 
 //*******************************************************************************************************************
 /*! Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. See copyDevToDevFZYX() for
  * parameter information.
  *******************************************************************************************************************/
-inline void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
-                               uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                               uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                               uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                               uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
-{
-   if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) )
-   {
-      copyHostToDevFZYXRestricted( dst, src,
-                                   typeSz, dstAllocZ, srcAllocZ,
-                                   dstX, dstY, dstZ, dstF,
-                                   srcX, srcY, srcZ, srcF,
-                                   Nx, Ny, Nz, Nf );
-   }
-   else
-   {
-      for( uint_t f = 0; f < Nf; ++f )
-      {
-         copyHostToDevFZYXRestricted( dst, src,
-                                      typeSz, dstAllocZ, srcAllocZ,
-                                      dstX, dstY, dstZ, dstF + f,
-                                      srcX, srcY, srcZ, srcF + f,
-                                      Nx, Ny, Nz, 1 );
-      }
-   }
-}
-
+void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
 //*******************************************************************************************************************
-/*! Restricted version of copyDevToHostFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ).
- * See copyDevToHostFZYX() for more details.
+/*! Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout. See copyDevToDevZYXF() for
+ * parameter information.
  *******************************************************************************************************************/
-void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src,
-                                  uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                                  uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                                  uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                                  uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf );
-
+void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
 //*******************************************************************************************************************
 /*! Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. See copyDevToDevFZYX() for
  * parameter information.
  *******************************************************************************************************************/
-inline void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
-                               uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ,
-                               uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF,
-                               uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF,
-                               uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf )
-{
-   if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) )
-   {
-      copyDevToHostFZYXRestricted( dst, src,
-                                   typeSz, dstAllocZ, srcAllocZ,
-                                   dstX, dstY, dstZ, dstF,
-                                   srcX, srcY, srcZ, srcF,
-                                   Nx, Ny, Nz, Nf );
-   }
-   else
-   {
-      for( uint_t f = 0; f < Nf; ++f )
-      {
-         copyDevToHostFZYXRestricted( dst, src,
-                                      typeSz, dstAllocZ, srcAllocZ,
-                                      dstX, dstY, dstZ, dstF + f,
-                                      srcX, srcY, srcZ, srcF + f,
-                                      Nx, Ny, Nz, 1 );
-      }
-   }
-}
-
+void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
+//*******************************************************************************************************************
+/*! Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout. See copyDevToDevZYXF() for
+ * parameter information.
+ *******************************************************************************************************************/
+void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
+                        uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
+                        std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
+                        cudaStream_t copyStream );
 
 } // namespace cuda
 } // namespace walberla
diff --git a/src/cuda/Kernel.h b/src/cuda/Kernel.h
index 3b6acf0a1..59e588399 100644
--- a/src/cuda/Kernel.h
+++ b/src/cuda/Kernel.h
@@ -102,8 +102,8 @@ namespace cuda {
       template<typename T>  void addFieldIndexingParam( const T & indexing );
 
 
-      void configure( dim3 gridDim, dim3 blockDim );
-      void operator() () const;
+      void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 );
+      void operator() ( cudaStream_t stream = 0 ) const;
 
 
    protected:
@@ -118,6 +118,7 @@ namespace cuda {
       bool configured_;
       dim3 gridDim_;
       dim3 blockDim_;
+      std::size_t sharedMemSize_;
 
       struct ParamInfo {
          std::vector<char> data;
@@ -178,7 +179,8 @@ namespace cuda {
    template<typename FP>
    Kernel<FP>::Kernel( FP funcPtr )
       : funcPtr_ ( funcPtr ),
-        configured_( false )
+        configured_( false ),
+        sharedMemSize_( 0 )
    {}
 
    template<typename FP>
@@ -206,12 +208,13 @@ namespace cuda {
    }
 
    template<typename FP>
-   void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim )
+   void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize )
    {
       if ( ! configured_ )
       {
          gridDim_ = gridDim;
          blockDim_ = blockDim;
+         sharedMemSize_ = sharedMemSize;
          configured_ = true;
       }
       else
@@ -225,7 +228,7 @@ namespace cuda {
    }
 
    template<typename FP>
-   void Kernel<FP>::operator() () const
+   void Kernel<FP>::operator() ( cudaStream_t stream ) const
    {
       // check for correct number of parameter calls
 
@@ -235,7 +238,7 @@ namespace cuda {
       }
 
       // set the number of blocks and  threads,
-      WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_ ) ); //TODO extend class to support streams
+      WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_, sharedMemSize_, stream ) );
 
       // register all parameters
       for( auto paramIt = params_.begin(); paramIt != params_.end(); ++paramIt )  {
diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h
index b3124dd39..61c698d82 100644
--- a/src/cuda/communication/GPUPackInfo.h
+++ b/src/cuda/communication/GPUPackInfo.h
@@ -16,16 +16,32 @@
 //! \file GPUPackInfo.h
 //! \ingroup cuda
 //! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
 //======================================================================================================================
 
 #pragma once
 
-#include "cuda/GPUCopy.h"
-#include "communication/UniformPackInfo.h"
 #include "core/debug/Debug.h"
-#include "field/GhostRegions.h"
+#include "core/math/Vector3.h"
+#include "core/mpi/BufferSizeTrait.h"
+
 #include "stencil/Directions.h"
 
+#include "field/GhostRegions.h"
+
+#include "communication/UniformPackInfo.h"
+
+#include "blockforest/Block.h"
+
+#include "cuda/ErrorChecking.h"
+#include "cuda/GPUCopy.h"
+#include "cuda/communication/PinnedMemoryBuffer.h"
+
+#include <cuda_runtime.h>
+#include <map>
+#include <vector>
+#include <tuple>
+
 
 namespace walberla {
 namespace cuda {
@@ -42,17 +58,49 @@ template<typename GPUField_T>
 class GPUPackInfo : public walberla::communication::UniformPackInfo
 {
 public:
-   typedef typename GPUField_T::value_type T;
+   typedef typename GPUField_T::value_type FieldType;
+
+   GPUPackInfo( const BlockDataID & bdId, cudaStream_t stream = 0 )
+   : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 )
+   {
+      streams_.push_back( stream );
+      copyAsync_ = (stream != 0);
+   }
+
+   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGhostLayers, cudaStream_t stream = 0 )
+   : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGhostLayers )
+   {
+      streams_.push_back( stream );
+      copyAsync_ = (stream != 0);
+   }
 
-   GPUPackInfo( const BlockDataID & bdId ) : bdId_( bdId ), communicateAllGhostLayers_( true ),
-            numberOfGhostLayers_( 0 ) {}
+   GPUPackInfo( const BlockDataID & bdId, std::vector< cudaStream_t > & streams )
+   : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 ), streams_( streams )
+   {
+      copyAsync_ = true;
+      for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt )
+         if ( *streamIt == 0 )
+         {
+            copyAsync_ = false;
+            break;
+         }
+   }
 
-   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers ) : bdId_( bdId ),
-            communicateAllGhostLayers_( false ), numberOfGhostLayers_(  numberOfGHostLayers ) {}
+   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGhostLayers, std::vector< cudaStream_t > & streams )
+   : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGhostLayers ), streams_( streams )
+   {
+      copyAsync_ = true;
+      for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt )
+         if ( *streamIt == 0 )
+         {
+            copyAsync_ = false;
+            break;
+         }
+   }
 
    virtual ~GPUPackInfo() {}
 
-   bool constantDataExchange() const { return mpi::BufferSizeTrait<T>::constantSize; }
+   bool constantDataExchange() const { return mpi::BufferSizeTrait<FieldType>::constantSize; }
    bool threadsafeReceiving()  const { return true; }
 
    void unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer);
@@ -64,38 +112,73 @@ protected:
 
    uint_t numberOfGhostLayersToCommunicate( const GPUField_T * const field ) const;
 
+   inline cudaStream_t & getStream( stencil::Direction dir ) { return streams_[dir % streams_.size()]; }
+
    const BlockDataID bdId_;
    bool   communicateAllGhostLayers_;
    uint_t numberOfGhostLayers_;
+   std::vector< cudaStream_t > streams_;
+   bool copyAsync_;
+   std::map< stencil::Direction, PinnedMemoryBuffer > pinnedRecvBuffers_;
+   mutable std::map< stencil::Direction, PinnedMemoryBuffer > pinnedSendBuffers_;
 };
 
 
 template<typename GPUField_T>
 void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer)
 {
-   GPUField_T * f = receiver->getData< GPUField_T >( bdId_ );
-   WALBERLA_ASSERT_NOT_NULLPTR(f);
+   GPUField_T * fieldPtr = receiver->getData< GPUField_T >( bdId_ );
+   WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr);
+
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
 
-   if ( f->layout() != fzyx )
+   CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false );
+
+   uint_t nrOfBytesToRead = fieldCi.numCells() * fieldPtr->fSize() * sizeof(FieldType);
+
+   unsigned char * bufPtr = buffer.skip(nrOfBytesToRead);
+
+   unsigned char * copyBufferPtr = bufPtr;
+   if ( copyAsync_ )
    {
-      WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" );
+      PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir];
+      copyBufferPtr = pinnedBuffer.resize( nrOfBytesToRead );
+      // Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU
+      std::copy( bufPtr, bufPtr + nrOfBytesToRead, copyBufferPtr );
    }
 
-   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) );
+   cudaStream_t & unpackStream = getStream(dir);
 
-   CellInterval ci = field::getGhostRegion( *f, dir, nrOfGhostLayers, false );
+   auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.yMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+   auto srcOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
 
-   uint_t nrOfBytesToRead = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T);
+   auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
+                                        fieldPtr->fSize() );
 
-   unsigned char * buf = buffer.skip(nrOfBytesToRead);
+   if ( fieldPtr->layout() == fzyx )
+   {
+      const uint_t dstAllocSizeZ = fieldPtr->zAllocSize();
+      const uint_t srcAllocSizeZ = fieldCi.zSize();
+      copyHostToDevFZYX( fieldPtr->pitchedPtr(), copyBufferPtr, dstOffset, srcOffset,
+                         dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType),
+                         intervalSize, unpackStream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = fieldPtr->yAllocSize();
+      const uint_t srcAllocSizeY = fieldCi.ySize();
+      copyHostToDevZYXF( fieldPtr->pitchedPtr(), copyBufferPtr, dstOffset, srcOffset,
+                         dstAllocSizeY, srcAllocSizeY, sizeof(FieldType),
+                         intervalSize, unpackStream );
+   }
 
-   copyHostToDevFZYX( f->pitchedPtr(), buf,
-                      sizeof(T), f->zAllocSize(), ci.zSize(),
-                      uint_c(ci.xMin() + nrOfGhostLayers),
-                      uint_c(ci.yMin() + nrOfGhostLayers),
-                      uint_c(ci.zMin() + nrOfGhostLayers), 0,
-                      0, 0, 0, 0,
-                      ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() );
+   if ( copyAsync_ )
+   {
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( unpackStream ) );
+   }
 }
 
 
@@ -105,55 +188,116 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r
    const GPUField_T * sf = sender  ->getData< GPUField_T >( bdId_ );
          GPUField_T * rf = receiver->getData< GPUField_T >( bdId_ );
 
-   if ( sf->layout() != fzyx || rf->layout() != fzyx )
-   {
-      WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" );
-   }
+   WALBERLA_ASSERT_NOT_NULLPTR( sf );
+   WALBERLA_ASSERT_NOT_NULLPTR( rf );
 
    WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize());
    WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize());
    WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize());
    WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize());
 
+   WALBERLA_CHECK( sf->layout() == rf->layout(), "GPUPackInfo::communicateLocal: fields must have the same layout!" );
+
    cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( sf ) );
 
    CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false );
    CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false );
 
-   copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(),
-                     sizeof(T), rf->zAllocSize(), sf->zAllocSize(),
-                     uint_c(rCi.xMin() + nrOfGhostLayers), uint_c(rCi.yMin() + nrOfGhostLayers), uint_c(rCi.zMin() + nrOfGhostLayers), 0,
-                     uint_c(sCi.xMin() + nrOfGhostLayers), uint_c(sCi.yMin() + nrOfGhostLayers), uint_c(sCi.zMin() + nrOfGhostLayers), 0,
-                     rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() );
+   cudaStream_t & commStream = getStream(dir);
+
+   auto dstOffset = std::make_tuple( uint_c(rCi.xMin() + nrOfGhostLayers),
+                                     uint_c(rCi.yMin() + nrOfGhostLayers),
+                                     uint_c(rCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+
+   auto srcOffset = std::make_tuple( uint_c(sCi.xMin() + nrOfGhostLayers),
+                                     uint_c(sCi.yMin() + nrOfGhostLayers),
+                                     uint_c(sCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+
+   auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() );
+
+   if ( sf->layout() == fzyx )
+   {
+      const uint_t dstAllocSizeZ = rf->zAllocSize();
+      const uint_t srcAllocSizeZ = sf->zAllocSize();
+
+      copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(), dstOffset, srcOffset,
+                        dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType),
+                        intervalSize, commStream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = rf->yAllocSize();
+      const uint_t srcAllocSizeY = sf->yAllocSize();
+
+      copyDevToDevZYXF( rf->pitchedPtr(), sf->pitchedPtr(), dstOffset, srcOffset,
+                        dstAllocSizeY, srcAllocSizeY, sizeof(FieldType),
+                        intervalSize, commStream );
+   }
+
+   if ( copyAsync_ )
+   {
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( commStream ) );
+   }
 }
 
 
 template<typename GPUField_T>
 void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const
 {
-   const GPUField_T * f = sender->getData< GPUField_T >( bdId_ );
-   WALBERLA_ASSERT_NOT_NULLPTR(f);
+   const GPUField_T * fieldPtr = sender->getData< GPUField_T >( bdId_ );
+   WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr);
+
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
+
+   CellInterval fieldCi = field::getSliceBeforeGhostLayer( *fieldPtr, dir, nrOfGhostLayers, false );
+
+   const std::size_t nrOfBytesToPack = fieldCi.numCells() * fieldPtr->fSize() * sizeof(FieldType);
+
+   unsigned char * outBufferPtr = outBuffer.forward( nrOfBytesToPack );
+
+   const cudaStream_t & packStream = streams_[dir % streams_.size()];
 
-   if ( f->layout() != fzyx )
+   unsigned char * copyBufferPtr = outBufferPtr;
+   if ( copyAsync_ )
    {
-      WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" );
+      PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir];
+      copyBufferPtr = pinnedBuffer.resize( nrOfBytesToPack );
    }
 
-   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) );
+   auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
+   auto srcOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.yMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
 
-   CellInterval ci = field::getSliceBeforeGhostLayer( *f, dir, nrOfGhostLayers, false );
+   auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
+                                        fieldPtr->fSize() );
 
-   uint_t nrOfBytesToPack = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T);
+   if ( fieldPtr->layout() == fzyx )
+   {
+      const uint_t dstAllocSizeZ = fieldCi.zSize();
+      const uint_t srcAllocSizeZ = fieldPtr->zAllocSize();
+      copyDevToHostFZYX( copyBufferPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset,
+                         dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType),
+                         intervalSize, packStream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = fieldCi.ySize();
+      const uint_t srcAllocSizeY = fieldPtr->yAllocSize();
+      copyDevToHostZYXF( copyBufferPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset,
+                         dstAllocSizeY, srcAllocSizeY, sizeof(FieldType),
+                         intervalSize, packStream );
+   }
 
-   unsigned char * buf = outBuffer.forward( nrOfBytesToPack );
+   if ( copyAsync_ )
+   {
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( packStream ) );
 
-   copyDevToHostFZYX( buf, f->pitchedPtr(),
-                      sizeof(T), ci.zSize(), f->zAllocSize(),
-                      0, 0, 0, 0,
-                      uint_c(ci.xMin() + nrOfGhostLayers),
-                      uint_c(ci.yMin() + nrOfGhostLayers),
-                      uint_c(ci.zMin() + nrOfGhostLayers), 0,
-                      ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() );
+      std::copy( copyBufferPtr, copyBufferPtr + nrOfBytesToPack, outBufferPtr );
+   }
 }
 
 
diff --git a/src/cuda/communication/PinnedMemoryBuffer.h b/src/cuda/communication/PinnedMemoryBuffer.h
new file mode 100644
index 000000000..702e71bd6
--- /dev/null
+++ b/src/cuda/communication/PinnedMemoryBuffer.h
@@ -0,0 +1,123 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PinnedMemoryBuffer.h
+//! \ingroup cuda
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
+//! \brief Pinned memory buffer for staging data when using asynchronous CUDA memory copies.
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "cuda/ErrorChecking.h"
+
+#include <algorithm>
+#include <cuda_runtime.h>
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+template< typename T = unsigned char >
+class GenericPinnedMemoryBuffer
+{
+public:
+   typedef T ElementType;
+
+   GenericPinnedMemoryBuffer();
+
+   GenericPinnedMemoryBuffer( std::size_t initSize );
+
+   GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb );
+
+   ~GenericPinnedMemoryBuffer();
+
+   inline T* ptr() const { return data_; }
+
+   inline T* resize( std::size_t newSize );
+
+   inline std::size_t size() const { return size_; }
+
+   GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete;
+private:
+   T * data_;
+   std::size_t size_;
+};
+
+typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer;
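+
+// Usage sketch (illustrative; "mpiBufferPtr" and "nrOfBytes" are placeholders):
+// stage a packed MPI buffer before an asynchronous host-to-device copy.
+// resize() only ever grows the allocation, so a buffer can be reused across
+// communication steps.
+//
+//    PinnedMemoryBuffer stagingBuffer;
+//    unsigned char * pinnedPtr = stagingBuffer.resize( nrOfBytes );
+//    std::copy( mpiBufferPtr, mpiBufferPtr + nrOfBytes, pinnedPtr );
+//    // pinnedPtr may now be passed to cudaMemcpy3DAsync-based copy routines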
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer()
+   : data_(nullptr), size_(0)
+{
+}
+
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize )
+   : data_(nullptr), size_(initSize)
+{
+   if (initSize > 0)
+   {
+      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, size_ * sizeof(T) ) );
+   }
+}
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb )
+   : data_(nullptr), size_(pb.size_)
+{
+   if ( pb.size_ > 0 )
+   {
+      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) );
+
+      std::copy( pb.data_, pb.data_ + pb.size_, data_ );
+   }
+}
+
+template< typename T >  // Element type
+GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer()
+{
+   if ( data_ != nullptr )
+   {
+      WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) );
+   }
+}
+
+template< typename T >  // Element type
+T * GenericPinnedMemoryBuffer<T>::resize(std::size_t newSize)
+{
+   if ( newSize > size_ )
+   {
+      T * newBegin;
+      WALBERLA_CUDA_CHECK( cudaMallocHost( &newBegin, newSize * sizeof(T) ) );
+
+      std::swap( data_, newBegin );
+      if ( newBegin != nullptr )
+      {
+         WALBERLA_CUDA_CHECK( cudaFreeHost( newBegin ) );
+      }
+
+      size_ = newSize;
+   }
+
+   return data_;
+}
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 364b8bbea..c7727b0dd 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -7,6 +7,9 @@
 waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest )
 waLBerla_execute_test( NAME  GPUPackInfoTest )
 
+waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
+waLBerla_execute_test( NAME  GPUPackInfoCommunicationTest )
+
 waLBerla_compile_test( FILES FieldTransferTest )
 waLBerla_execute_test( NAME  FieldTransferTest )
 
diff --git a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp
new file mode 100644
index 000000000..f3f19d085
--- /dev/null
+++ b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp
@@ -0,0 +1,187 @@
+//========================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GPUPackInfoCommunicationTest.cpp
+//! \ingroup cuda
+//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
+//! \brief Communication test verifying that GPUPackInfo produces identical results with the default stream and with multiple CUDA streams.
+//
+//========================================================================================================================
+
+#include "core/DataTypes.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/math/Random.h"
+#include "core/mpi/Environment.h"
+
+#include "stencil/Directions.h"
+#include "stencil/Iterator.h"
+#include "stencil/D3Q27.h"
+
+#include "domain_decomposition/BlockDataID.h"
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "field/GhostLayerField.h"
+
+#include "cuda/ErrorChecking.h"
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/GPUField.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/communication/GPUPackInfo.h"
+
+#include <cuda_runtime.h>
+#include <cstdlib>
+#include <vector>
+
+using namespace walberla;
+
+using DataType = walberla::uint_t;
+using StencilType = stencil::D3Q27;
+using FieldType = field::GhostLayerField< DataType, StencilType::Size >;
+using GPUFieldType = cuda::GPUField< DataType >;
+using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>;
+using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >;
+
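+// The current field layout is kept in file-scope variables because createField
+// and createGPUField below are registered as block data initialization
+// functions and cannot receive extra arguments.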
+static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::zyxf };
+static uint_t fieldLayoutIndex = 0;
+
+
+FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage )
+{
+   return new FieldType(
+            storage->getNumberOfXCells( *block ),   // number of cells in x direction per block
+            storage->getNumberOfYCells( *block ),   // number of cells in y direction per block
+            storage->getNumberOfZCells( *block ),   // number of cells in z direction per block
+            1,                                      // one ghost layer
+            DataType(0),                            // initial value
+            fieldLayouts[fieldLayoutIndex],         // layout
+            make_shared<cuda::HostFieldAllocator< DataType > >() // allocator for host pinned memory
+            );
+}
+
+
+GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
+{
+   return new GPUFieldType(
+            storage->getNumberOfXCells( *block ), // number of cells in x direction
+            storage->getNumberOfYCells( *block ), // number of cells in y direction
+            storage->getNumberOfZCells( *block ), // number of cells in z direction
+            StencilType::Size,                    // number of cells for pdfs
+            1,                                    // one ghost layer
+            fieldLayouts[fieldLayoutIndex] );
+}
+
+
+void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID )
+{
+   for( auto block = blocks->begin(); block != blocks->end(); ++block )
+   {
+      auto fieldPtr = block->getData< FieldType >( fieldID );
+
+      for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt )
+         *fieldIt = math::intRandom< DataType >();
+   }
+}
+
+
+int main( int argc, char ** argv )
+{
+   debug::enterTestMode();
+   mpi::Environment mpiEnv( argc, argv );
+
+   std::vector< cudaStream_t > streams;
+
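+   // Create one CUDA stream per stencil direction, so that the (un)packing of
+   // different communication directions can be issued independently.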
+   for( uint_t i = 0; i < StencilType::Size; ++i )
+   {
+      cudaStream_t stream(nullptr);
+      WALBERLA_CUDA_CHECK( cudaStreamCreate(&stream) );
+      streams.push_back( stream );
+   }
+
+   const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 );
+
+   uint_t nProc = uint_c( MPIManager::instance()->numProcesses() );
+
+   for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
+   {
+      auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1,                  // blocks
+                                                        cells[0], cells[1], cells[2], // cells
+                                                        1,                            // unit cell spacing
+                                                        true,                         // one block per process
+                                                        true, true, true);            // periodic in all directions
+
+      BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField,
+                                                                               "ScalarField" );
+
+      BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
+                                                                                   "syncGPUField" );
+
+      BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
+                                                                                    "asyncGPUField" );
+
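+      // Seed the random generator with the MPI rank, so that every process
+      // initializes its source field with a different sequence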
+      math::seedRandomGenerator( numeric_cast<boost::mt19937::result_type>( MPIManager::instance()->rank() ) );
+      // Initialize CPU field with random values
+      initFields( blocks, sourceFieldId );
+
+      // Copy same CPU field to both GPU fields
+      for( auto block = blocks->begin(); block != blocks->end(); ++block )
+      {
+         auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId );
+
+         auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
+         cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr );
+
+         auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
+         cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr );
+      }
+
+      // Setup communication schemes for synchronous GPUPackInfo
+      CommSchemeType syncCommScheme(blocks);
+      syncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( syncGPUFieldId ) );
+
+      // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams
+      CommSchemeType asyncCommScheme(blocks);
+      asyncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( asyncGPUFieldId, streams ) );
+
+      // Perform one communication step for each scheme
+      syncCommScheme();
+      asyncCommScheme();
+
+      // Check results
+      FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
+                              make_shared< cuda::HostFieldAllocator< DataType > >() );
+      FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
+                               make_shared< cuda::HostFieldAllocator< DataType > >() );
+
+      for( auto block = blocks->begin(); block != blocks->end(); ++block )
+      {
+         auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
+         cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr );
+
+         auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
+         cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr );
+
+         for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ();
+                  syncIt != syncFieldCpu.end();
+                  ++syncIt, ++asyncIt )
+            WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt );
+      }
+   }
+
+   for( uint_t i = 0; i < StencilType::Size; ++i )
+      WALBERLA_CUDA_CHECK( cudaStreamDestroy(streams[i]) );
+
+   return EXIT_SUCCESS;
+}
diff --git a/tests/cuda/communication/GPUPackInfoTest.cpp b/tests/cuda/communication/GPUPackInfoTest.cpp
index 28d3def70..29fc84f25 100644
--- a/tests/cuda/communication/GPUPackInfoTest.cpp
+++ b/tests/cuda/communication/GPUPackInfoTest.cpp
@@ -34,12 +34,15 @@
 #include "stencil/D3Q27.h"
 
 #include <cstring>
+#include <vector>
+#include <cuda_runtime.h>
 
 #define F_SIZE    19
 
-
 using namespace walberla;
 
+static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf };
+static uint_t fieldLayoutIndex = 0;
 
 cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
 {
@@ -49,7 +52,7 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag
             storage->getNumberOfZCells( *block ), // number of cells in z direction
             F_SIZE,                               // fSize
             1,                                    // number of ghost layers
-            field::fzyx );
+            fieldLayouts[fieldLayoutIndex] );
 }
 
 // Tester base class. The communicate() template method allows testing different communication methods.
@@ -59,7 +62,9 @@ public:
 
    typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType;
 
-   GPUPackInfoTester( IBlock* block, BlockDataID fieldId ): block_( block ), fieldId_( fieldId ) {}
+   GPUPackInfoTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) :
+      block_( block ), fieldId_( fieldId ), streams_(streams) {}
+
    virtual ~GPUPackInfoTester() {}
 
    void test( stencil::Direction dir )
@@ -72,7 +77,7 @@ public:
                gpuField.zSize(),       // number of cells in z direction
                1,                      // number of ghost layers
                0,                      // initial value
-               field::fzyx);
+               fieldLayouts[fieldLayoutIndex]);
       cpuField.setWithGhostLayer( 0 );
 
       int val = 0;
@@ -82,7 +87,7 @@ public:
       }
       cuda::fieldCpy( gpuField, cpuField );
 
-      GPUPackInfoType gpuPackInfo( fieldId_ );
+      GPUPackInfoType gpuPackInfo( fieldId_, streams_ );
 
       communicate( gpuPackInfo, dir );
       cuda::fieldCpy( cpuField, gpuField );
@@ -101,6 +106,7 @@ protected:
 
    IBlock* block_;
    BlockDataID fieldId_;
+   std::vector< cudaStream_t > streams_;
 };
 
 
@@ -108,7 +114,7 @@ protected:
 class GPUPackInfoBufferTester: public GPUPackInfoTester
 {
 public:
-   GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
+   GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) :
+      GPUPackInfoTester( block, fieldId, streams ) {}
 
 protected:
    void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
@@ -134,7 +140,7 @@ protected:
 class GPUPackInfoLocalTester: public GPUPackInfoTester
 {
 public:
-   GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
+   GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) :
+      GPUPackInfoTester( block, fieldId, streams ) {}
 
 protected:
    void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
@@ -151,27 +157,42 @@ int main(int argc, char **argv)
    debug::enterTestMode();
    MPIManager::instance()->initializeMPI(&argc,&argv);
 
-   // Create BlockForest
-   uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
-   auto blocks = createUniformBlockGrid(processes,1,1,  //blocks
-                                        2,2,2,          //cells
-                                        1,              //dx
-                                        false,          //one block per process
-                                        true,true,true);//periodicity
-
-
-   BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
-           &createGPUField, "ScalarGPUField" );
-
-   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+   for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
    {
-      GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId );
-      GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId );
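+      // Create one CUDA stream per D3Q27 direction for the stream-aware GPUPackInfo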
+      std::vector< cudaStream_t > streams;
+      for( uint_t s = 0; s < stencil::D3Q27::Size; ++s )
+      {
+         cudaStream_t stream(nullptr);
+         WALBERLA_CUDA_CHECK( cudaStreamCreate( &stream ) );
+         streams.push_back( stream );
+      }
+      // Create BlockForest
+      uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
+      auto blocks = createUniformBlockGrid(processes,1,1,  //blocks
+                                           2,2,2,          //cells
+                                           1,              //dx
+                                           false,          //one block per process
+                                           true,true,true);//periodicity
+
+      BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
+              &createGPUField, "ScalarGPUField" );
+
+      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+      {
+         GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId, streams );
+         GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId, streams );
+
+         for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
+         {
+            localTester.test( *dir );
+            bufferTester.test( *dir );
+         }
+      }
 
-      for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
+      for( auto streamIt = streams.begin(); streamIt != streams.end(); ++streamIt )
       {
-         localTester.test( *dir );
-         bufferTester.test( *dir );
+         cudaStream_t & stream = *streamIt;
+         WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream ) );
       }
    }
 
-- 
GitLab