From 319909f0bdf9d38296d194eaa52ccd92c825a29f Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 23 Oct 2018 13:27:24 +0200
Subject: [PATCH] New GPU communication scheme with GPU kernels for packing

Features:
   - uses generated pack infos for packing & unpacking directly on GPU
   - can directly send GPU buffers if cuda-enabled MPI is available,
     otherwise the packed buffers are transferred to CPU first
   - communication hiding with cuda streams: communication can be run
     asynchronously - especially useful when compute kernel is also
     split up into inner and outer part

- added RAII classes for CUDA streams and events
- equivalence test that checks if generated CPU and GPU (overlapped)
  versions are computing same result as normal waLBerla LBM kernel
---
 src/cuda/CudaRAII.h                           |  85 ++++++
 src/cuda/ErrorChecking.h                      |   2 +-
 src/cuda/GPUField.h                           |   3 +
 src/cuda/GPUField.impl.h                      |  20 ++
 src/cuda/communication/CustomMemoryBuffer.h   | 143 ++++++++++
 .../communication/CustomMemoryBuffer.impl.h   | 120 +++++++++
 src/cuda/communication/GPUPackInfo.h          |  25 +-
 src/cuda/communication/GeneratedGPUPackInfo.h |  44 ++++
 src/cuda/communication/PinnedMemoryBuffer.h   | 123 ---------
 src/cuda/communication/UniformGPUScheme.h     |  94 +++++++
 .../communication/UniformGPUScheme.impl.h     | 248 ++++++++++++++++++
 tests/cuda/CMakeLists.txt                     |   4 +
 tests/cuda/codegen/CudaJacobiKernel.py        |  11 +-
 tests/cuda/codegen/EquivalenceTest.cpp        | 178 +++++++++++++
 tests/cuda/codegen/EquivalenceTest.gen.py     |  40 +++
 15 files changed, 999 insertions(+), 141 deletions(-)
 create mode 100644 src/cuda/CudaRAII.h
 create mode 100644 src/cuda/communication/CustomMemoryBuffer.h
 create mode 100644 src/cuda/communication/CustomMemoryBuffer.impl.h
 create mode 100644 src/cuda/communication/GeneratedGPUPackInfo.h
 delete mode 100644 src/cuda/communication/PinnedMemoryBuffer.h
 create mode 100644 src/cuda/communication/UniformGPUScheme.h
 create mode 100644 src/cuda/communication/UniformGPUScheme.impl.h
 create mode 100644 tests/cuda/codegen/EquivalenceTest.cpp
 create mode 100644 tests/cuda/codegen/EquivalenceTest.gen.py

diff --git a/src/cuda/CudaRAII.h b/src/cuda/CudaRAII.h
new file mode 100644
index 000000000..f179348f7
--- /dev/null
+++ b/src/cuda/CudaRAII.h
@@ -0,0 +1,85 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CudaRAII.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "ErrorChecking.h"
+
+
+namespace walberla {
+namespace cuda {
+
+
+class StreamRAII
+{
+public:
+   ~StreamRAII() {
+       if( stream_ != 0 ) {
+           WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ ));
+       }
+   }
+
+   StreamRAII( StreamRAII && other) {
+      stream_ = other.stream_;
+      other.stream_ = 0;
+   }
+
+   StreamRAII(const StreamRAII&) = delete;
+   void operator=( const StreamRAII &) = delete;
+   operator cudaStream_t() const { return stream_; }
+
+
+   static StreamRAII defaultStream() {
+      StreamRAII result;
+      result.stream_ = 0;
+      return result;
+   }
+
+   static StreamRAII newPriorityStream(int priority) {
+      StreamRAII result;
+      WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority ));
+      return result;
+   }
+
+   static StreamRAII newStream() {
+      StreamRAII result;
+      WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_));
+      return result;
+   }
+
+private:
+   StreamRAII() {}
+   cudaStream_t stream_;
+};
+
+
+struct EventRAII
+{
+    explicit EventRAII() { WALBERLA_CUDA_CHECK( cudaEventCreate(&event) ); }
+    ~EventRAII()         { WALBERLA_CUDA_CHECK( cudaEventDestroy(event) ); }
+    EventRAII(const EventRAII &)   = delete;
+    void operator=( const EventRAII &) = delete;
+    operator cudaEvent_t() const { return event; }
+
+    cudaEvent_t event;
+};
+
+
+} // namespace cuda
+} // namespace walberla
\ No newline at end of file
diff --git a/src/cuda/ErrorChecking.h b/src/cuda/ErrorChecking.h
index 49b216744..82dc0b4a9 100644
--- a/src/cuda/ErrorChecking.h
+++ b/src/cuda/ErrorChecking.h
@@ -40,7 +40,7 @@ inline void checkForError( cudaError_t code, const std::string & callerPath, con
   if(code != cudaSuccess)
   {
     std::stringstream ss;
-    ss << "CUDA Error: " << cudaGetErrorString( code );
+    ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code );
     Abort::instance()->abort( ss.str(), callerPath, line );
   }
 }
diff --git a/src/cuda/GPUField.h b/src/cuda/GPUField.h
index 116071a65..bc2447072 100755
--- a/src/cuda/GPUField.h
+++ b/src/cuda/GPUField.h
@@ -118,6 +118,9 @@ namespace cuda {
 
       inline uint_t  nrOfGhostLayers() const { return nrOfGhostLayers_; }
 
+      inline CellInterval xyzSize()               const;
+      inline CellInterval xyzSizeWithGhostLayer() const;
+
       bool operator==( const GPUField & other ) const;
 
       void getGhostRegion( stencil::Direction d, CellInterval & ci,
diff --git a/src/cuda/GPUField.impl.h b/src/cuda/GPUField.impl.h
index 946afd7ae..840ebd6a5 100644
--- a/src/cuda/GPUField.impl.h
+++ b/src/cuda/GPUField.impl.h
@@ -112,6 +112,26 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci,
 }
 
 
+template<typename T>
+inline CellInterval GPUField<T>::xyzSize() const
+{
+   return CellInterval (0,0,0,
+                        cell_idx_c( xSize() )-1,
+                        cell_idx_c( ySize() )-1,
+                        cell_idx_c( zSize() )-1 );
+}
+
+template<typename T>
+inline CellInterval GPUField<T>::xyzSizeWithGhostLayer() const
+{
+   CellInterval ci = GPUField<T>::xyzSize();
+   for( uint_t i=0; i < 3; ++i ) {
+      ci.min()[i] -= cell_idx_c( nrOfGhostLayers_ );
+      ci.max()[i] += cell_idx_c( nrOfGhostLayers_ );
+   }
+   return ci;
+}
+
 template<typename T>
 void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci,
                            cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const
diff --git a/src/cuda/communication/CustomMemoryBuffer.h b/src/cuda/communication/CustomMemoryBuffer.h
new file mode 100644
index 000000000..8ad176077
--- /dev/null
+++ b/src/cuda/communication/CustomMemoryBuffer.h
@@ -0,0 +1,143 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CustomMemoryBuffer.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//! \brief Basic Buffer supporting different memory spaces
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "cuda/ErrorChecking.h"
+
+#include <algorithm>
+#include <cstring>
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+
+   struct HostMemoryAllocator;
+   struct DeviceMemoryAllocator;
+
+   //*******************************************************************************************************************
+   /*!
+    * Simple buffer class that supports memory allocators, e.g. for pinned host memory or GPU memory
+    *
+    * \ingroup cuda
+    *
+    * In contrast to core::mpi::Buffer this class does not support stream operators "<<" and ">>" because these
+    * operators imply serial (un)packing which is not feasible on the GPU.
+    * The allocator template has to provide:
+    *   - static void *allocate( size_t size )
+    *   - void deallocate( void *ptr )
+    *   - void memcpy( void *dst, void *src, size_t count )
+    *
+    * The buffer has a beginning, a current position and an end position. Here is an overview of the most important
+    * operations:
+    *   - clear: reset current position to begin, does not change size
+    *   - advance: moves current position number of bytes forward and returns pointer to the old current position
+    *              two versions are available, one that automatically resizes and reallocates the buffer, and one that
+    *              fails if not enough space is available
+    */
+   //*******************************************************************************************************************
+   template<typename Allocator>
+   class CustomMemoryBuffer
+   {
+   public:
+      typedef uint8_t ElementType;
+
+      explicit CustomMemoryBuffer();
+      explicit CustomMemoryBuffer( std::size_t initSize );
+      explicit CustomMemoryBuffer( const CustomMemoryBuffer &pb );
+      ~CustomMemoryBuffer();
+      CustomMemoryBuffer &operator=( const CustomMemoryBuffer &pb );
+
+      void resize( std::size_t newSize );
+      inline std::size_t allocSize() const { return std::size_t(end_ - begin_); }
+      inline std::size_t size() const { return std::size_t(cur_ - begin_); }
+      ElementType *ptr() const { return begin_; }
+
+      inline void clear() { cur_ = begin_; }
+
+      ElementType *advance( std::size_t bytes );
+      ElementType *advanceNoResize( std::size_t bytes );
+
+      template<typename T>
+      T *advance( std::size_t bytes ) { return reinterpret_cast<T *>( advance( bytes * sizeof( T ))); }
+      template<typename T>
+      T *advanceNoResize( std::size_t bytes ) { return reinterpret_cast<T *>( advanceNoResize( bytes * sizeof( T ))); }
+
+   private:
+      ElementType *begin_;
+      ElementType *cur_;
+      ElementType *end_;
+   };
+
+
+   using PinnedMemoryBuffer = CustomMemoryBuffer<HostMemoryAllocator>;
+   using GPUMemoryBuffer    = CustomMemoryBuffer<DeviceMemoryAllocator>;
+
+
+   struct HostMemoryAllocator
+   {
+      static void *allocate( size_t size )
+      {
+         void *p;
+         WALBERLA_CUDA_CHECK( cudaMallocHost( &p, size ));
+         return p;
+      }
+
+      static void deallocate( void *ptr )
+      {
+         WALBERLA_CUDA_CHECK( cudaFreeHost( ptr ));
+      }
+
+      static void memcpy( void *dst, void *src, size_t count )
+      {
+         std::memcpy( dst, src, count );
+      }
+   };
+
+   struct DeviceMemoryAllocator
+   {
+      static void *allocate( size_t size )
+      {
+         void *p;
+         WALBERLA_CUDA_CHECK( cudaMalloc( &p, size ));
+         return p;
+      }
+
+      static void deallocate( void *ptr )
+      {
+         WALBERLA_CUDA_CHECK( cudaFree( ptr ));
+      }
+
+      static void memcpy( void *dst, void *src, size_t count )
+      {
+         cudaMemcpy( dst, src, count, cudaMemcpyDeviceToDevice );
+      }
+   };
+
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
+
+#include "CustomMemoryBuffer.impl.h"
diff --git a/src/cuda/communication/CustomMemoryBuffer.impl.h b/src/cuda/communication/CustomMemoryBuffer.impl.h
new file mode 100644
index 000000000..377ba4bd3
--- /dev/null
+++ b/src/cuda/communication/CustomMemoryBuffer.impl.h
@@ -0,0 +1,120 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CustomMemoryBuffer.impl.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::CustomMemoryBuffer()
+           : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
+   {
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::CustomMemoryBuffer( std::size_t initSize )
+           : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
+   {
+      if( initSize > 0 )
+      {
+         begin_ = Allocator::allocate( initSize );
+         end_ = begin_ + initSize;
+         cur_ = begin_;
+      }
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::CustomMemoryBuffer( const CustomMemoryBuffer &pb )
+   {
+      if( pb.begin_ != nullptr )
+      {
+         begin_ = reinterpret_cast<ElementType *>(Allocator::allocate( pb.allocSize()));
+         end_ = begin_ + pb.allocSize();
+         Allocator::memcpy( begin_, pb.begin_, pb.allocSize());
+         cur_ = begin_ + pb.size();
+      }
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator> &CustomMemoryBuffer<Allocator>::operator=( const CustomMemoryBuffer<Allocator> &pb )
+   {
+      auto copy( pb );
+      std::swap( cur_, copy.cur_ );
+      std::swap( begin_, copy.begin_ );
+      std::swap( end_, copy.end_ );
+      return *this;
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::~CustomMemoryBuffer()
+   {
+      if( begin_ != nullptr )
+         Allocator::deallocate( begin_ );
+   }
+
+   template<typename Allocator>
+   void CustomMemoryBuffer<Allocator>::resize( std::size_t newSize )
+   {
+      if( newSize > allocSize())
+      {
+         auto offset = cur_ - begin_;
+
+         ElementType *newBegin;
+
+         newBegin = reinterpret_cast<ElementType *>(Allocator::allocate( newSize ));
+
+         Allocator::memcpy( newBegin, begin_, size_t(end_ - begin_) );
+
+         std::swap( begin_, newBegin );
+         if( newBegin != nullptr )
+            Allocator::deallocate( newBegin );
+
+         end_ = begin_ + newSize;
+         cur_ = begin_ + offset;
+      }
+
+   }
+
+   template<typename Allocator>
+   typename CustomMemoryBuffer<Allocator>::ElementType *CustomMemoryBuffer<Allocator>::advance( std::size_t bytes )
+   {
+      resize( size() + bytes );
+      auto result = cur_;
+      cur_ += bytes;
+      WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ );
+      return result;
+   }
+
+   template<typename Allocator>
+   typename CustomMemoryBuffer<Allocator>::ElementType *CustomMemoryBuffer<Allocator>::advanceNoResize( std::size_t bytes )
+   {
+      auto newSize = size() + bytes;
+      if( newSize <= allocSize())
+         return advance( bytes );
+      else
+         return nullptr;
+   }
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h
index 61c698d82..7e1bad001 100644
--- a/src/cuda/communication/GPUPackInfo.h
+++ b/src/cuda/communication/GPUPackInfo.h
@@ -21,21 +21,18 @@
 
 #pragma once
 
+#include "blockforest/Block.h"
+#include "communication/UniformPackInfo.h"
 #include "core/debug/Debug.h"
 #include "core/math/Vector3.h"
 #include "core/mpi/BufferSizeTrait.h"
-
-#include "stencil/Directions.h"
-
 #include "field/GhostRegions.h"
-
-#include "communication/UniformPackInfo.h"
-
-#include "blockforest/Block.h"
+#include "field/Layout.h"
+#include "stencil/Directions.h"
 
 #include "cuda/ErrorChecking.h"
 #include "cuda/GPUCopy.h"
-#include "cuda/communication/PinnedMemoryBuffer.h"
+#include "cuda/communication/CustomMemoryBuffer.h"
 
 #include <cuda_runtime.h>
 #include <map>
@@ -142,7 +139,8 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d
    if ( copyAsync_ )
    {
       PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir];
-      copyBufferPtr = pinnedBuffer.resize( nrOfBytesToRead );
+      pinnedBuffer.clear();
+      copyBufferPtr = pinnedBuffer.advance( nrOfBytesToRead );
       // Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU
       std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr );
    }
@@ -158,7 +156,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d
    auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
                                         fieldPtr->fSize() );
 
-   if ( fieldPtr->layout() == fzyx )
+   if ( fieldPtr->layout() == field::fzyx )
    {
       const uint_t dstAllocSizeZ = fieldPtr->zAllocSize();
       const uint_t srcAllocSizeZ = fieldCi.zSize();
@@ -217,7 +215,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r
 
    auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() );
 
-   if ( sf->layout() == fzyx )
+   if ( sf->layout() == field::fzyx )
    {
       const uint_t dstAllocSizeZ = rf->zAllocSize();
       const uint_t srcAllocSizeZ = sf->zAllocSize();
@@ -263,7 +261,8 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc
    if ( copyAsync_ )
    {
       PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir];
-      copyBufferPtr = pinnedBuffer.resize( nrOfBytesToPack );
+      pinnedBuffer.clear();
+      copyBufferPtr = pinnedBuffer.advance( nrOfBytesToPack );
    }
 
    auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
@@ -275,7 +274,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc
    auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
                                         fieldPtr->fSize() );
 
-   if ( fieldPtr->layout() == fzyx )
+   if ( fieldPtr->layout() == field::fzyx )
    {
       const uint_t dstAllocSizeZ = fieldCi.zSize();
       const uint_t srcAllocSizeZ = fieldPtr->zAllocSize();
diff --git a/src/cuda/communication/GeneratedGPUPackInfo.h b/src/cuda/communication/GeneratedGPUPackInfo.h
new file mode 100644
index 000000000..752f2907c
--- /dev/null
+++ b/src/cuda/communication/GeneratedGPUPackInfo.h
@@ -0,0 +1,44 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GeneratedGPUPackInfo.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+#pragma once
+#include "stencil/Directions.h"
+#include "domain_decomposition/IBlock.h"
+#include <cuda_runtime.h>
+
+
+namespace walberla {
+namespace cuda {
+
+
+class GeneratedGPUPackInfo
+{
+public:
+   virtual void pack  ( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0;
+   virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0;
+   virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0;
+};
+
+
+
+} //namespace cuda
+} //namespace walberla
\ No newline at end of file
diff --git a/src/cuda/communication/PinnedMemoryBuffer.h b/src/cuda/communication/PinnedMemoryBuffer.h
deleted file mode 100644
index 702e71bd6..000000000
--- a/src/cuda/communication/PinnedMemoryBuffer.h
+++ /dev/null
@@ -1,123 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file PinnedMemoryBuffer.h
-//! \ingroup cuda
-//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
-//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies.
-//
-//======================================================================================================================
-
-#pragma once
-
-#include "cuda/ErrorChecking.h"
-
-#include <algorithm>
-#include <cuda_runtime.h>
-
-
-namespace walberla {
-namespace cuda {
-namespace communication {
-
-template< typename T = unsigned char >
-class GenericPinnedMemoryBuffer
-{
-public:
-   typedef T ElementType;
-
-   GenericPinnedMemoryBuffer();
-
-   GenericPinnedMemoryBuffer( std::size_t initSize );
-
-   GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb );
-
-   ~GenericPinnedMemoryBuffer();
-
-   inline T* ptr() const { return data_; }
-
-   inline T* resize( std::size_t newSize );
-
-   inline std::size_t size() const { return size_; }
-
-   GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete;
-private:
-   T * data_;
-   std::size_t size_;
-};
-
-typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer;
-
-template< typename T >  // Element type
-GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer()
-   : data_(nullptr), size_(0)
-{
-}
-
-
-template< typename T >  // Element type
-GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize )
-   : data_(nullptr), size_(initSize)
-{
-   if (initSize > 0)
-   {
-      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, size_ * sizeof(T) ) );
-   }
-}
-
-template< typename T >  // Element type
-GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb )
-   : size_(pb.size_)
-{
-   if ( pb.size_ > 0 )
-   {
-      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) );
-
-      std::copy( pb.data_, static_cast<T *>(pb.data_ + pb.size_), data_ );
-   }
-}
-
-template< typename T >  // Element type
-GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer()
-{
-   if ( data_ != nullptr )
-   {
-      WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) );
-   }
-}
-
-template< typename T >  // Element type
-T * GenericPinnedMemoryBuffer<T>::resize(std::size_t newSize)
-{
-   if ( newSize > size_ )
-   {
-      T * newBegin;
-      WALBERLA_CUDA_CHECK( cudaMallocHost( &newBegin, newSize * sizeof(T) ) );
-
-      std::swap( data_, newBegin );
-      if ( newBegin != nullptr )
-      {
-         WALBERLA_CUDA_CHECK( cudaFreeHost( newBegin ) );
-      }
-
-      size_ = newSize;
-   }
-
-   return data_;
-}
-
-} // namespace communication
-} // namespace cuda
-} // namespace walberla
diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/cuda/communication/UniformGPUScheme.h
new file mode 100644
index 000000000..b7e0bf06c
--- /dev/null
+++ b/src/cuda/communication/UniformGPUScheme.h
@@ -0,0 +1,94 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file UniformGPUScheme.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "blockforest/StructuredBlockForest.h"
+#include "core/mpi/MPIWrapper.h"
+#include "core/mpi/BufferSystem.h"
+#include "core/WeakPtrWrapper.h"
+#include "domain_decomposition/IBlock.h"
+#include "stencil/Directions.h"
+
+#include "cuda/CudaRAII.h"
+#include "cuda/communication/GeneratedGPUPackInfo.h"
+#include "cuda/communication/CustomMemoryBuffer.h"
+
+#include <chrono>
+#include <thread>
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+
+
+   template<typename Stencil>
+   class UniformGPUScheme
+   {
+   public:
+       explicit UniformGPUScheme( weak_ptr_wrapper<StructuredBlockForest> bf,
+                                  const shared_ptr<cuda::EventRAII> & startWaitEvent,
+                                  bool sendDirectlyFromGPU = false,
+                                  const int tag = 5432 );
+
+       void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
+
+       void startCommunication();
+       void wait();
+
+      void operator()()         { communicate(); }
+      inline void communicate() { startCommunication(); wait(); }
+
+   private:
+       void setupCommunication();
+
+       weak_ptr_wrapper<StructuredBlockForest> blockForest_;
+       shared_ptr<cuda::EventRAII> startWaitEvent_;
+       uint_t forestModificationStamp_;
+
+       bool setupBeforeNextCommunication_;
+       bool communicationInProgress_;
+       bool sendFromGPU_;
+
+       using CpuBuffer_T = cuda::communication::PinnedMemoryBuffer;
+       using GpuBuffer_T = cuda::communication::GPUMemoryBuffer;
+
+       mpi::GenericBufferSystem<CpuBuffer_T, CpuBuffer_T> bufferSystemCPU_;
+       mpi::GenericBufferSystem<GpuBuffer_T, GpuBuffer_T> bufferSystemGPU_;
+
+       std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;
+       std::map<stencil::Direction, cuda::StreamRAII> streams_;
+
+       struct Header
+       {
+           BlockID blockId;
+           stencil::Direction dir;
+       };
+       std::map<mpi::MPIRank, std::vector<Header> > headers_;
+   };
+
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
+
+#include "UniformGPUScheme.impl.h"
diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/cuda/communication/UniformGPUScheme.impl.h
new file mode 100644
index 000000000..2b9ebd32a
--- /dev/null
+++ b/src/cuda/communication/UniformGPUScheme.impl.h
@@ -0,0 +1,248 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file UniformGPUScheme.impl.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+
+//*******************************************************************************************************************
+/*! Constructs the scheme; the actual communication setup (header exchange, buffer sizing)
+ *  is deferred to the first communication or to a change of the block forest.
+ */
+//*******************************************************************************************************************
+template<typename Stencil>
+UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockForest> bf,
+                                             const shared_ptr <cuda::EventRAII> &startWaitEvent,
+                                             bool sendDirectlyFromGPU,
+                                             const int tag )
+        : blockForest_( bf ),
+          startWaitEvent_( startWaitEvent ),
+          forestModificationStamp_( uint_t(0) ), // fix: was uninitialized, but compared in startCommunication()
+          setupBeforeNextCommunication_( true ),
+          communicationInProgress_( false ),
+          sendFromGPU_( sendDirectlyFromGPU ),
+          bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
+          bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ) {}
+
+
+   //****************************************************************************************************************
+   /*! Packs all send buffers on the GPU and posts the MPI sends/receives (non-blocking).
+    *
+    *  Re-runs setupCommunication() when pack infos were added or the block forest changed.
+    *  Each pack kernel is enqueued on its direction's stream and first waits on
+    *  startWaitEvent_, which allows overlap with a previously launched compute kernel.
+    */
+   //****************************************************************************************************************
+   template<typename Stencil>
+   void UniformGPUScheme<Stencil>::startCommunication()
+   {
+      WALBERLA_ASSERT( !communicationInProgress_ );
+      auto forest = blockForest_.lock();
+
+      if( setupBeforeNextCommunication_ ||
+          forest->getBlockForest().getModificationStamp() != forestModificationStamp_ )
+         setupCommunication();
+
+      // Schedule Receives
+      if( sendFromGPU_ )
+         bufferSystemGPU_.scheduleReceives();
+      else
+         bufferSystemCPU_.scheduleReceives();
+
+
+      // In the staged (non CUDA-aware) path the GPU send buffers serve only as packing
+      // scratch space - reset them so advanceNoResize() starts from the beginning again
+      if( !sendFromGPU_ )
+         for( auto it : headers_ )
+            bufferSystemGPU_.sendBuffer( it.first ).clear();
+
+      // Start filling send buffers
+      for( auto &iBlock : *forest )
+      {
+         auto block = dynamic_cast< Block * >( &iBlock );
+         for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
+         {
+            // skip directions without a neighbor block
+            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
+            if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 ))
+               continue;
+            auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
+
+            // lazily create one high-priority stream per direction so packing of
+            // different directions can proceed concurrently
+            if( streams_.find( *dir ) == streams_.end() )
+            {
+               streams_.emplace( *dir, StreamRAII::newPriorityStream( -1 ));
+            }
+
+            auto &ci = streams_.at( *dir );
+
+            for( auto &pi : packInfos_ )
+            {
+               auto size = pi->size( *dir, block );
+               auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
+               WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+               // do not start packing before the caller's event (e.g. inner-kernel launch) fired
+               WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( ci, *startWaitEvent_, 0 ));
+               pi->pack( *dir, gpuDataPtr, block, ci );
+
+               if( !sendFromGPU_ )
+               {
+                  // stage the packed data in pinned host memory, ordered on the same stream
+                  auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
+                  WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
+                  WALBERLA_CUDA_CHECK(
+                          cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, ci ));
+               }
+            }
+         }
+      }
+
+      // Busy waiting for packing to finish - then send
+      for( auto &ci : streams_ ) WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second ));
+
+      if( sendFromGPU_ )
+         bufferSystemGPU_.sendAll();
+      else
+         bufferSystemCPU_.sendAll();
+
+      communicationInProgress_ = true;
+   }
+
+
+   //****************************************************************************************************************
+   /*! Waits until all receives have arrived, unpacks the data on the per-direction CUDA
+    *  streams and synchronizes them. Must be paired with a preceding startCommunication().
+    *
+    *  Fix: removed a leftover 1 ms debug sleep per received message in the staged
+    *  (non CUDA-aware) receive loop; it served no functional purpose and only added latency.
+    */
+   //****************************************************************************************************************
+   template<typename Stencil>
+   void UniformGPUScheme<Stencil>::wait()
+   {
+      WALBERLA_ASSERT( communicationInProgress_ );
+
+      auto forest = blockForest_.lock();
+
+      if( sendFromGPU_ )
+      {
+         // CUDA-aware MPI path: received buffers already reside in GPU memory - unpack directly
+         for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
+         {
+            // headers_ lists the (block, direction) entries in exactly the order the sender packed them
+            for( auto &header : headers_[recvInfo.rank()] )
+            {
+               auto &ci = streams_.at( header.dir );
+               auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId ));
+
+               for( auto &pi : packInfos_ )
+               {
+                  auto size = pi->size( header.dir, block );
+                  auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
+                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+                  // packed for direction 'dir' on the sender -> unpacked into the inverse direction here
+                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci );
+               }
+            }
+         }
+      }
+      else
+      {
+         // staged path: copy each received host buffer back to the GPU, then unpack from there
+         for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
+         {
+            // the GPU *send* buffer of this rank is re-used as staging area for received data;
+            // all sends of this round have completed at this point, so this is safe
+            auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());
+
+            gpuBuffer.clear();
+            for( auto &header : headers_[recvInfo.rank()] ) {
+               auto &ci = streams_.at( header.dir );
+               auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId ));
+
+               for( auto &pi : packInfos_ )
+               {
+                  auto size = pi->size( header.dir, block );
+                  auto cpuDataPtr = recvInfo.buffer().advanceNoResize( size );
+                  auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
+                  WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
+                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+
+                  // H2D copy and unpack are ordered on the same stream - no extra sync needed
+                  WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
+                                                        cudaMemcpyHostToDevice, ci ));
+                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci );
+               }
+            }
+         }
+      }
+
+      // wait until all unpack kernels (and copies) have finished
+      for( auto &ci : streams_ )
+         WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second ));
+
+      communicationInProgress_ = false;
+   }
+
+
+   //****************************************************************************************************************
+   /*! (Re-)creates the communication headers and sizes the send/receive buffers.
+    *
+    *  For every local block and every stencil direction with a neighbor, the required byte
+    *  count is accumulated per neighbor process and the (blockId, direction) pair is sent to
+    *  that process, so the receiver knows in which order to unpack. Works for uniform block
+    *  forests only (exactly one, equally sized neighbor per section). Called lazily from
+    *  startCommunication().
+    */
+   //****************************************************************************************************************
+   template<typename Stencil>
+   void UniformGPUScheme<Stencil>::setupCommunication()
+   {
+      auto forest = blockForest_.lock();
+
+      headers_.clear();
+
+      std::map<mpi::MPIRank, mpi::MPISize> receiverInfo; // how many bytes to send to each neighbor
+
+      // separate, one-off buffer system (tag 123) used only for this header exchange
+      mpi::BufferSystem headerExchangeBs( mpi::MPIManager::instance()->comm(), 123 );
+
+      for( auto &iBlock : *forest ) {
+         auto block = dynamic_cast< Block * >( &iBlock );
+
+         for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) {
+            // skip if block has no neighbors in this direction
+            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
+            if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 ))
+               continue;
+
+            WALBERLA_ASSERT( block->neighborhoodSectionHasEquallySizedBlock( neighborIdx ),
+                             "Works for uniform setups only" );
+            WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize( neighborIdx ), uint_t( 1 ),
+                                   "Works for uniform setups only" );
+
+            const BlockID &nBlockId = block->getNeighborId( neighborIdx, uint_t( 0 ));
+            auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
+
+            // all registered pack infos contribute to one contiguous buffer per process
+            for( auto &pi : packInfos_ )
+               receiverInfo[nProcess] += mpi::MPISize( pi->size( *dir, block ));
+
+            // announce to the receiver: "data for block nBlockId, direction dir"
+            auto &headerBuffer = headerExchangeBs.sendBuffer( nProcess );
+            nBlockId.toBuffer( headerBuffer );
+            headerBuffer << *dir;
+         }
+      }
+
+      headerExchangeBs.setReceiverInfoFromSendBufferState( false, true );
+      headerExchangeBs.sendAll();
+      // store received headers per sender rank - this fixes the unpack order in wait()
+      for( auto recvIter = headerExchangeBs.begin(); recvIter != headerExchangeBs.end(); ++recvIter ) {
+         auto &headerVector = headers_[recvIter.rank()];
+         auto &buffer = recvIter.buffer();
+         while ( buffer.size()) {
+            Header header;
+            header.blockId.fromBuffer( buffer );
+            buffer >> header.dir;
+            headerVector.push_back( header );
+         }
+      }
+
+      // both buffer systems are sized identically - which one is used depends on sendFromGPU_
+      bufferSystemCPU_.setReceiverInfo( receiverInfo );
+      bufferSystemGPU_.setReceiverInfo( receiverInfo );
+
+      for( auto it : receiverInfo ) {
+         bufferSystemCPU_.sendBuffer( it.first ).resize( it.second );
+         bufferSystemGPU_.sendBuffer( it.first ).resize( it.second );
+      }
+
+      // remember the forest state so topology changes trigger a new setup
+      forestModificationStamp_ = forest->getBlockForest().getModificationStamp();
+      setupBeforeNextCommunication_ = false;
+   }
+
+
+   //****************************************************************************************************************
+   /*! Registers an additional pack info. Buffer sizes change as a result, so the
+    *  communication is set up again before the next exchange.
+    */
+   //****************************************************************************************************************
+   template<typename Stencil>
+   void UniformGPUScheme<Stencil>::addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi )
+   {
+      WALBERLA_ASSERT( !communicationInProgress_, "Cannot add pack info while communication is in progress" );
+      setupBeforeNextCommunication_ = true;
+      packInfos_.emplace_back( pi );
+   }
+
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index aa82a445f..39222ebde 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -31,3 +31,7 @@ waLBerla_compile_test( FILES communication/CommTest )
 
 waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
 #waLBerla_execute_test( NAME  CudaMPI )
+
+waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest
+                          FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py
+                          DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui )
diff --git a/tests/cuda/codegen/CudaJacobiKernel.py b/tests/cuda/codegen/CudaJacobiKernel.py
index d30da2edc..225593dcd 100644
--- a/tests/cuda/codegen/CudaJacobiKernel.py
+++ b/tests/cuda/codegen/CudaJacobiKernel.py
@@ -1,17 +1,20 @@
 from pystencils_walberla.sweep import Sweep
-from pystencils_walberla.cmake_integration import codegen
+
 
 def jacobi2D(sweep):
+    # 2D stencil update: dst <- sum of the four direct neighbors, divided by (4 * h**2)
     src = sweep.field("f1")
     dst = sweep.temporaryField(src)
 
-    dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * S.h ** 2)
+    dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * sweep.constant.h ** 2)
+
 
 def jacobi3D(sweep):
+    # 3D variant: six direct neighbors, divided by (6 * h**2)
     src = sweep.field("f1")
     dst = sweep.temporaryField(src)
 
-    dst[0,0,0] @= (src[1,0,0] + src[-1,0,0] + src[0,1,0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0 , -1] ) / (6 * S.h**2)
+    dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] + src[0, 1, 0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0, -1]) / \
+                    (6 * sweep.constant.h ** 2)
+
 
 Sweep.generate('CudaJacobiKernel2D', jacobi2D, dim=2, target='gpu')
-Sweep.generate('CudaJacobiKernel3D', jacobi3D, dim=3, target='gpu')
\ No newline at end of file
+Sweep.generate('CudaJacobiKernel3D', jacobi3D, dim=3, target='gpu')
diff --git a/tests/cuda/codegen/EquivalenceTest.cpp b/tests/cuda/codegen/EquivalenceTest.cpp
new file mode 100644
index 000000000..bbdc079d7
--- /dev/null
+++ b/tests/cuda/codegen/EquivalenceTest.cpp
@@ -0,0 +1,178 @@
+#include "core/Environment.h"
+#include "python_coupling/CreateConfig.h"
+#include "blockforest/Initialization.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/AddToStorage.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/vtk/VTKOutput.h"
+#include "lbm/PerformanceLogger.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "timeloop/all.h"
+#include "core/math/Random.h"
+#include "geometry/all.h"
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "core/timing/TimingPool.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+#include "domain_decomposition/SharedSweep.h"
+
+#include "EquivalenceTest_LatticeModel.h"
+#include "EquivalenceTest_GPUKernel.h"
+#include "EquivalenceTest_GPUPackInfo.h"
+
+using namespace walberla;
+
+// hand-written waLBerla D3Q19 SRT model vs. the code-generated equivalent
+using NativeLatticeModel_T = lbm::D3Q19<lbm::collision_model::SRT, false>;
+using GeneratedLatticeModel_T = lbm::EquivalenceTest_LatticeModel;
+
+using Stencil_T = GeneratedLatticeModel_T::Stencil;
+using CommunicationStencil_T = GeneratedLatticeModel_T::CommunicationStencil;
+using NativePdfField_T = lbm::PdfField<NativeLatticeModel_T>;
+using GeneratedPdfField_T = lbm::PdfField<GeneratedLatticeModel_T>;
+
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+// CPU runs use the classic buffered scheme; the GPU run uses the new device-side scheme
+using CpuCommScheme_T = blockforest::communication::UniformBufferedScheme<CommunicationStencil_T>;
+using GpuCommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
+
+
+//*******************************************************************************************************************
+/*! Initializes a pdf field to equilibrium with a smooth, position-dependent velocity
+ *  (x component varies with global x, z component with global z), so that all three
+ *  simulation variants start from exactly the same state.
+ */
+//*******************************************************************************************************************
+template<typename PdfField_T>
+void initPdfField( const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfFieldId )
+{
+   auto domainBB = blocks->getDomainCellBB();
+
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+   {
+      auto pdfField = blockIt->getData<PdfField_T>( pdfFieldId );
+      // offset of this block's local cell (0,0,0) in global cell coordinates
+      Cell offset( 0, 0, 0 );
+      blocks->transformBlockLocalToGlobalCell( offset, *blockIt );
+
+      WALBERLA_FOR_ALL_CELLS_XYZ( pdfField,
+         auto globalX = real_c( offset[0] + x );
+         auto globalZ = real_c( offset[2] + z );
+         // NOTE(review): xArg/zArg are already sine values and are passed through sin/cos
+         // again below - apparently just "some smooth function" for the test; confirm intent
+         auto xArg = real_c(std::sin(real_c(globalX) / real_t(4) * real_c(domainBB.size(0)) ));
+         auto zArg = real_c(std::sin(real_c(globalZ) / real_t(4) * real_c(domainBB.size(2)) ));
+         pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( 0.05 * std::sin(xArg), 0, 0.05 * std::cos(zArg)));
+      );
+   }
+}
+
+
+//*******************************************************************************************************************
+/*! Equivalence test driver: runs the identical LBM scenario three times - native waLBerla
+ *  kernel (CPU), generated kernel (CPU), and generated kernel (GPU, optionally with
+ *  overlapped communication) - then checks that all three produce the same pdf values.
+ */
+//*******************************************************************************************************************
+int main( int argc, char **argv )
+{
+   mpi::Environment env( argc, argv );
+
+   // one run per configuration passed via the python coupling / command line
+   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      auto config = *cfg;
+      auto parameters = config->getOneBlock( "Parameters" );
+
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
+      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+
+      // Boundary
+      BlockDataID flagFieldId = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );
+      const FlagUID fluidFlagUID( "Fluid" );
+      geometry::setNonBoundaryCellsToDomain<FlagField_T>( *blocks, flagFieldId, fluidFlagUID );
+
+
+      // Part 1 : Native walberla
+      NativeLatticeModel_T nativeLatticeModel = NativeLatticeModel_T( lbm::collision_model::SRT( omega ));
+      BlockDataID pdfFieldNativeId = lbm::addPdfFieldToStorage( blocks, "pdfNative", nativeLatticeModel, field::fzyx );
+      initPdfField<NativePdfField_T >( blocks, pdfFieldNativeId );
+      CpuCommScheme_T nativeComm( blocks );
+      nativeComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< NativeLatticeModel_T > >( pdfFieldNativeId ) );
+      auto nativeSweep = lbm::makeCellwiseSweep< NativeLatticeModel_T , FlagField_T >( pdfFieldNativeId, flagFieldId, fluidFlagUID );
+
+      SweepTimeloop nativeTimeLoop( blocks->getBlockStorage(), timesteps );
+      nativeTimeLoop.add() << BeforeFunction( nativeComm, "communication" )
+                           << Sweep(makeSharedSweep(nativeSweep), "native stream collide" );
+      nativeTimeLoop.run();
+
+
+      // Part 2: Generated CPU Version
+      GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega );
+      BlockDataID pdfFieldGeneratedId = lbm::addPdfFieldToStorage( blocks, "pdfGenerated", generatedLatticeModel, field::fzyx );
+      initPdfField<GeneratedPdfField_T >( blocks, pdfFieldGeneratedId );
+      CpuCommScheme_T cpuComm( blocks );
+      cpuComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< GeneratedLatticeModel_T > >( pdfFieldGeneratedId ) );
+      SweepTimeloop cpuTimeLoop( blocks->getBlockStorage(), timesteps );
+      cpuTimeLoop.add() << BeforeFunction( cpuComm, "communication" )
+                        << Sweep(GeneratedLatticeModel_T::Sweep( pdfFieldGeneratedId ), "generated stream collide on cpu" );
+      cpuTimeLoop.run();
+
+
+      // Part 3: Generated GPU Version
+      bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
+      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
+
+      // the GPU field is created from (and later copied back into) this CPU shadow field
+      BlockDataID pdfShadowCPU = lbm::addPdfFieldToStorage( blocks, "cpu shadow field", generatedLatticeModel, field::fzyx );
+      initPdfField<GeneratedPdfField_T >( blocks, pdfShadowCPU );
+
+      BlockDataID pdfGpuFieldId = cuda::addGPUFieldToStorage<GeneratedPdfField_T >( blocks, pdfShadowCPU, "pdfs on gpu", true );
+      // a separate kernel stream is only needed when communication overlaps computation
+      auto defaultKernelStream = overlapCommunication ? cuda::StreamRAII::newStream() : cuda::StreamRAII::defaultStream();
+      auto innerKernelStartedEvent = make_shared<cuda::EventRAII>();
+
+      pystencils::EquivalenceTest_GPUKernel cudaLbKernel( pdfGpuFieldId, omega, defaultKernelStream );
+      GpuCommScheme_T gpuComm( blocks, innerKernelStartedEvent, cudaEnabledMPI );
+      gpuComm.addPackInfo( make_shared<pystencils::EquivalenceTest_GPUPackInfo>( pdfGpuFieldId ));
+      auto runCommunication = [&]() { gpuComm(); };
+
+      SweepTimeloop gpuTimeLoop( blocks->getBlockStorage(), timesteps );
+      if( !overlapCommunication )
+      {
+         gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" )
+                           << Sweep( cudaLbKernel, "LB stream & collide gpu" );
+      }
+      else
+      {
+         // overlap mode: record the event the communication scheme waits on, launch the
+         // inner update, let communication run concurrently, then compute the outer part
+         // NOTE(review): cudaEventRecord's return value is not error-checked here
+         gpuTimeLoop.add() << Sweep( [&]( IBlock *b )
+                                  {
+                                     cudaEventRecord( *innerKernelStartedEvent, defaultKernelStream );
+                                     cudaLbKernel.inner( b );
+                                  }, "LBM @ inner" );
+         gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" )
+                           << Sweep( [&]( IBlock *b ) { cudaLbKernel.outer( b ); }, "LBM @ outer" );
+      }
+      gpuTimeLoop.run();
+      // copy the GPU result back into the CPU shadow field for comparison
+      cuda::fieldCpy<GeneratedPdfField_T, cuda::GPUField<real_t>> (blocks, pdfShadowCPU, pdfGpuFieldId);
+
+      // Compare all three versions
+      auto errorCPU = real_t(0);
+      auto errorGPU = real_t(0);
+
+      for( auto & block : *blocks )
+      {
+         auto native = block.getData<NativePdfField_T>( pdfFieldNativeId );
+         auto cpu = block.getData<GeneratedPdfField_T >( pdfFieldGeneratedId );
+         auto gpu = block.getData<GeneratedPdfField_T>( pdfShadowCPU );
+
+         // accumulate absolute differences over all cells and pdf directions
+         WALBERLA_FOR_ALL_CELLS_XYZ(native,
+            for(cell_idx_t f = 0; f < cell_idx_c(NativeLatticeModel_T::Stencil::Q); ++f )
+            {
+               errorCPU += std::abs( native->get( x, y, z, f ) - cpu->get( x, y, z, f ));
+               errorGPU += std::abs( native->get( x, y, z, f ) - gpu->get( x, y, z, f ));
+            }
+         )
+      }
+      mpi::reduceInplace(errorCPU, mpi::SUM);
+      mpi::reduceInplace(errorGPU, mpi::SUM);
+      auto domainBB = blocks->getDomainCellBB();
+      // normalize to a mean absolute error per cell, then require (float-)equality to zero
+      errorCPU /= real_c(domainBB.numCells());
+      errorGPU /= real_c(domainBB.numCells());
+      WALBERLA_LOG_RESULT_ON_ROOT("CPU Error " << errorCPU );
+      WALBERLA_LOG_RESULT_ON_ROOT("GPU Error " << errorGPU );
+      WALBERLA_CHECK_FLOAT_EQUAL(errorCPU, real_c(0.0));
+      WALBERLA_CHECK_FLOAT_EQUAL(errorGPU, real_c(0.0));
+   }
+
+   return 0;
+}
\ No newline at end of file
diff --git a/tests/cuda/codegen/EquivalenceTest.gen.py b/tests/cuda/codegen/EquivalenceTest.gen.py
new file mode 100644
index 000000000..af4d3a8d8
--- /dev/null
+++ b/tests/cuda/codegen/EquivalenceTest.gen.py
@@ -0,0 +1,40 @@
+import sympy as sp
+from lbmpy_walberla import generate_lattice_model_files
+from lbmpy.creationfunctions import create_lb_update_rule
+from pystencils_walberla.sweep import Sweep
+
+# LB options
+# D3Q19 SRT model with a symbolic relaxation rate "omega" (stays a runtime parameter of the
+# generated code), incompressible, double precision; 'pdfs'/'pdfs_tmp' are the src/dst fields
+options = {
+    'method': 'srt',
+    'stencil': 'D3Q19',
+    'relaxation_rate': sp.Symbol("omega"),
+    'field_name': 'pdfs',
+    'compressible': False,
+    'maxwellian_moments': False,
+    'temporary_field_name': 'pdfs_tmp',
+    'optimization': {'cse_global': False,
+                     'cse_pdfs': False,
+                     'double_precision': True}
+}
+
+# GPU optimization options
+# NOTE(review): (32, 32, 32) would be 32768 threads per block, far above CUDA's limit of
+# 1024 threads/block - presumably clamped or re-mapped by pystencils' GPU indexing; confirm
+opt =       {'gpu_indexing_params': {'block_size': (128, 2, 1)},  'data_type': 'float64'}
+outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': 'float64'}
+
+
+def lb_assignments():
+    ur = create_lb_update_rule(**options)
+    return ur.all_assignments
+
+
+# C++ lattice model, used by both the native comparison run and the generated CPU run
+generate_lattice_model_files(class_name='EquivalenceTest_LatticeModel', **options)
+
+# GPU kernel split into inner/outer parts so communication can overlap the inner update;
+# pdfs/pdfs_tmp are swapped after each sweep instead of copied
+Sweep.generate_inner_outer_kernel('EquivalenceTest_GPUKernel',
+                                  lambda: create_lb_update_rule(**options).all_assignments,
+                                  target='gpu',
+                                  temporary_fields=['pdfs_tmp'],
+                                  field_swaps=[('pdfs', 'pdfs_tmp')],
+                                  optimization=opt,
+                                  outer_optimization=outer_opt)
+
+# pack info that packs/unpacks ghost-layer data directly on the GPU
+Sweep.generate_pack_info('EquivalenceTest_GPUPackInfo', lb_assignments, target='gpu')
-- 
GitLab