From ba5733cc15a297aa142739c550cc961c7f5bd9a1 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 30 May 2017 09:54:08 +0200
Subject: [PATCH] Python export for GPUFields and interface to pycuda

---
 apps/pythonmodule/CMakeLists.txt              |  13 +-
 apps/pythonmodule/PythonModule.cpp            |  13 +
 python/waLBerla/__init__.py                   |   5 +-
 python/waLBerla/cuda_extension.py             |  15 +
 src/cuda/AddGPUFieldToStorage.impl.h          |   1 +
 src/cuda/CMakeLists.txt                       |   2 +-
 src/cuda/FieldIndexing.h                      |   1 +
 ...FieldIndexing.cpp => FieldIndexing.impl.h} |   5 -
 src/cuda/FieldIndexing3D.h                    |   1 +
 ...dIndexing3D.cpp => FieldIndexing3D.impl.h} |   5 -
 src/cuda/FieldIndexingXYZ.h                   |   1 +
 ...ndexingXYZ.cpp => FieldIndexingXYZ.impl.h} |   3 -
 src/cuda/GPUField.h                           |  21 +-
 src/cuda/{GPUField.cpp => GPUField.impl.h}    |  19 +-
 src/cuda/GPUTypesExplicitInstantiation.h      |   8 -
 src/cuda/python/Exports.h                     |  43 +++
 src/cuda/python/Exports.impl.h                | 360 ++++++++++++++++++
 src/waLBerlaDefinitions.in.h                  |   2 +
 18 files changed, 483 insertions(+), 35 deletions(-)
 create mode 100644 python/waLBerla/cuda_extension.py
 rename src/cuda/{FieldIndexing.cpp => FieldIndexing.impl.h} (98%)
 rename src/cuda/{FieldIndexing3D.cpp => FieldIndexing3D.impl.h} (98%)
 rename src/cuda/{FieldIndexingXYZ.cpp => FieldIndexingXYZ.impl.h} (97%)
 rename src/cuda/{GPUField.cpp => GPUField.impl.h} (95%)
 delete mode 100644 src/cuda/GPUTypesExplicitInstantiation.h
 create mode 100644 src/cuda/python/Exports.h
 create mode 100644 src/cuda/python/Exports.impl.h

diff --git a/apps/pythonmodule/CMakeLists.txt b/apps/pythonmodule/CMakeLists.txt
index b52d000da..b4d92d772 100644
--- a/apps/pythonmodule/CMakeLists.txt
+++ b/apps/pythonmodule/CMakeLists.txt
@@ -2,13 +2,18 @@
 
 
 if ( WALBERLA_BUILD_WITH_PYTHON_MODULE )
-    
+
+    set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field geometry lbm postprocessing python_coupling timeloop vtk)
+    if (WALBERLA_BUILD_WITH_CUDA)
+        set(PYTHON_MODULE_DEPENDENCIES ${PYTHON_MODULE_DEPENDENCIES} cuda)
+    endif()
+
     if( WALBERLA_CXX_COMPILER_IS_MSVC )
-       set ( pythonModules blockforest boundary domain_decomposition core field geometry lbm postprocessing python_coupling timeloop vtk)
+       set ( pythonModules ${PYTHON_MODULE_DEPENDENCIES})
     elseif( APPLE )
-       set ( pythonModules "-Wl,-force_load" blockforest boundary domain_decomposition core field geometry lbm postprocessing python_coupling timeloop vtk)
+       set ( pythonModules "-Wl,-force_load" ${PYTHON_MODULE_DEPENDENCIES})
     else()
-       set ( pythonModules "-Wl,-whole-archive" blockforest boundary domain_decomposition core field geometry lbm postprocessing python_coupling timeloop vtk  "-Wl,-no-whole-archive" )
+       set ( pythonModules "-Wl,-whole-archive" ${PYTHON_MODULE_DEPENDENCIES}  "-Wl,-no-whole-archive" )
     endif()
 
     if( WALBERLA_BUILD_WITH_PYTHON_LBM )
diff --git a/apps/pythonmodule/PythonModule.cpp b/apps/pythonmodule/PythonModule.cpp
index 9d6791c0a..82fab2e04 100644
--- a/apps/pythonmodule/PythonModule.cpp
+++ b/apps/pythonmodule/PythonModule.cpp
@@ -30,10 +30,15 @@
 #include "timeloop/python/Exports.h"
 #include "vtk/python/Exports.h"
 
+#ifdef WALBERLA_BUILD_WITH_CUDA
+#include "cuda/python/Exports.h"
+#endif
+
 #include <boost/mpl/vector.hpp>
 #include <boost/mpl/insert_range.hpp>
 
 
+
 namespace bmpl = boost::mpl;
 using namespace walberla;
 
@@ -111,6 +116,14 @@ struct InitObject
       // Timeloop
       pythonManager->addExporterFunction( timeloop::exportModuleToPython );
 
+#ifdef WALBERLA_BUILD_WITH_CUDA
+      using walberla::cuda::GPUField;
+      typedef bmpl::vector<GPUField<double>, GPUField<float>, GPUField<int>, GPUField<uint8_t>, GPUField<uint16_t> > GPUFields;
+
+      pythonManager->addExporterFunction( cuda::exportModuleToPython<GPUFields> );
+      pythonManager->addBlockDataConversion<GPUFields>();
+#endif
+
       python_coupling::initWalberlaForPythonModule();
    }
 };
diff --git a/python/waLBerla/__init__.py b/python/waLBerla/__init__.py
index 622f88ab7..93244d0aa 100644
--- a/python/waLBerla/__init__.py
+++ b/python/waLBerla/__init__.py
@@ -28,7 +28,10 @@ if cpp_available:
         # extend the C++ module with some python functions
         from .field_extension import extend as extend_field
         extend_field( field     )
-
+    if 'cuda' in globals():
+        sys.modules[__name__ + '.cuda'] = cuda
+        from .cuda_extension import extend as extend_cuda
+        extend_cuda( cuda )
     if 'geometry' in globals():
         sys.modules[__name__ + '.geometry'] = geometry
     if 'lbm' in globals():
diff --git a/python/waLBerla/cuda_extension.py b/python/waLBerla/cuda_extension.py
new file mode 100644
index 000000000..be218d116
--- /dev/null
+++ b/python/waLBerla/cuda_extension.py
@@ -0,0 +1,15 @@
+from pycuda.gpuarray import GPUArray
+import numpy as np
+
+def toGpuArray(f):
+    """Wraps a waLBerla GPUField as a pycuda GPUArray built on the field's own pointer (f.ptr); returns None if f is falsy."""
+    if not f:
+        return None
+    dtype = np.dtype(f.dtypeStr)
+    strides = [dtype.itemsize*a for a in f.strides]  # scale by item size; presumably f.strides counts elements, GPUArray wants bytes
+    return GPUArray(f.sizeWithGhostLayers, dtype, gpudata=f.ptr, strides=strides)
+
+
+def extend(cppCudaModule):
+    cppCudaModule.toGpuArray = toGpuArray  # attach the helper as an attribute of the module object passed in
+
diff --git a/src/cuda/AddGPUFieldToStorage.impl.h b/src/cuda/AddGPUFieldToStorage.impl.h
index f007181a6..03b90c728 100644
--- a/src/cuda/AddGPUFieldToStorage.impl.h
+++ b/src/cuda/AddGPUFieldToStorage.impl.h
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include "cuda/FieldCopy.h"
 
 namespace walberla {
 namespace cuda {
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index a4c149c36..83db21519 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -4,6 +4,6 @@
 #
 ###################################################################################################
 
-waLBerla_add_module( DEPENDS core communication domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA ) 
+waLBerla_add_module( DEPENDS core communication domain_decomposition python_coupling field stencil BUILD_ONLY_IF_FOUND CUDA )
 
 ###################################################################################################                        
\ No newline at end of file
diff --git a/src/cuda/FieldIndexing.h b/src/cuda/FieldIndexing.h
index 653a5de27..7ed089f4b 100644
--- a/src/cuda/FieldIndexing.h
+++ b/src/cuda/FieldIndexing.h
@@ -91,4 +91,5 @@ namespace cuda {
 } // namespace cuda
 } // namespace walberla
 
+#include "FieldIndexing.impl.h"
 
diff --git a/src/cuda/FieldIndexing.cpp b/src/cuda/FieldIndexing.impl.h
similarity index 98%
rename from src/cuda/FieldIndexing.cpp
rename to src/cuda/FieldIndexing.impl.h
index 413bbe1aa..c4837d3c1 100644
--- a/src/cuda/FieldIndexing.cpp
+++ b/src/cuda/FieldIndexing.impl.h
@@ -20,7 +20,6 @@
 //======================================================================================================================
 
 #include "FieldIndexing.h"
-#include "GPUTypesExplicitInstantiation.h"
 #include "GPUField.h"
 
 #include "core/cell/CellInterval.h"
@@ -224,10 +223,6 @@ FieldIndexing<T> FieldIndexing<T>::all ( const GPUField<T> & f, const cell::Cell
 
 
 
-
-GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing )
-
-
 } // namespace cuda
 } // namespace walberla
 
diff --git a/src/cuda/FieldIndexing3D.h b/src/cuda/FieldIndexing3D.h
index 0dbe97566..c6637ec6c 100644
--- a/src/cuda/FieldIndexing3D.h
+++ b/src/cuda/FieldIndexing3D.h
@@ -103,3 +103,4 @@ namespace cuda {
 } // namespace walberla
 
 
+#include "FieldIndexing3D.impl.h"
\ No newline at end of file
diff --git a/src/cuda/FieldIndexing3D.cpp b/src/cuda/FieldIndexing3D.impl.h
similarity index 98%
rename from src/cuda/FieldIndexing3D.cpp
rename to src/cuda/FieldIndexing3D.impl.h
index 5a797a23d..896f7e1d2 100644
--- a/src/cuda/FieldIndexing3D.cpp
+++ b/src/cuda/FieldIndexing3D.impl.h
@@ -20,7 +20,6 @@
 //======================================================================================================================
 
 #include "FieldIndexing3D.h"
-#include "GPUTypesExplicitInstantiation.h"
 #include "GPUField.h"
 
 #include "core/cell/CellInterval.h"
@@ -165,10 +164,6 @@ FieldIndexing3D<T> FieldIndexing3D<T>::intervalXYZ( const GPUField<T> & f, const
 
 
 
-
-GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing3D )
-
-
 } // namespace cuda
 } // namespace walberla
 
diff --git a/src/cuda/FieldIndexingXYZ.h b/src/cuda/FieldIndexingXYZ.h
index 2c25975ea..18a6e2645 100644
--- a/src/cuda/FieldIndexingXYZ.h
+++ b/src/cuda/FieldIndexingXYZ.h
@@ -77,3 +77,4 @@ template< typename T> class GPUField;
 } // namespace walberla
 
 
+#include "FieldIndexingXYZ.impl.h"
\ No newline at end of file
diff --git a/src/cuda/FieldIndexingXYZ.cpp b/src/cuda/FieldIndexingXYZ.impl.h
similarity index 97%
rename from src/cuda/FieldIndexingXYZ.cpp
rename to src/cuda/FieldIndexingXYZ.impl.h
index 8cc0bd638..c8ec561f9 100644
--- a/src/cuda/FieldIndexingXYZ.cpp
+++ b/src/cuda/FieldIndexingXYZ.impl.h
@@ -20,7 +20,6 @@
 //======================================================================================================================
 
 #include "FieldIndexingXYZ.h"
-#include "GPUTypesExplicitInstantiation.h"
 #include "GPUField.h"
 
 #include "core/cell/CellInterval.h"
@@ -114,8 +113,6 @@ FieldIndexingXYZ<T> FieldIndexingXYZ<T>::withGhostLayerXYZ( const GPUField<T> &
 }
 
 
-GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexingXYZ )
-
 
 } // namespace cuda
 } // namespace walberla
diff --git a/src/cuda/GPUField.h b/src/cuda/GPUField.h
index 3153aba60..437fe1c95 100755
--- a/src/cuda/GPUField.h
+++ b/src/cuda/GPUField.h
@@ -79,11 +79,27 @@ namespace cuda {
       inline uint_t  zSize() const  { return zSize_; }
       inline uint_t  fSize() const  { return fSize_; }
       inline uint_t  size()  const  { return fSize() * xSize() * ySize() * zSize(); }
+      inline uint_t  size( uint_t coord )  const;
+
+      inline uint_t       xSizeWithGhostLayer()        const  { return xSize() + uint_c(2)*nrOfGhostLayers_; }
+      inline uint_t       ySizeWithGhostLayer()        const  { return ySize() + uint_c(2)*nrOfGhostLayers_; }
+      inline uint_t       zSizeWithGhostLayer()        const  { return zSize() + uint_c(2)*nrOfGhostLayers_; }
+      inline uint_t       sizeWithGhostLayer(uint_t i) const  { return i==3 ? fSize_ :
+                                                                              size(i) + uint_c(2)*nrOfGhostLayers_; }
 
       cell_idx_t xOff() const { return cell_idx_c( nrOfGhostLayers_ ); }
       cell_idx_t yOff() const { return cell_idx_c( nrOfGhostLayers_ ); }
       cell_idx_t zOff() const { return cell_idx_c( nrOfGhostLayers_ ); }
 
+      cell_idx_t xStride() const { return (layout_ == fzyx) ? cell_idx_t(1) :
+                                                              cell_idx_c(fAllocSize()); }
+      cell_idx_t yStride() const { return (layout_ == fzyx) ? cell_idx_t(xAllocSize()) :
+                                                              cell_idx_c(fAllocSize() * xAllocSize()); }
+      cell_idx_t zStride() const { return (layout_ == fzyx) ? cell_idx_t(xAllocSize() * yAllocSize()) :
+                                                              cell_idx_c(fAllocSize() * xAllocSize() * yAllocSize()); }
+      cell_idx_t fStride() const { return (layout_ == fzyx) ? cell_idx_t(xAllocSize() * yAllocSize() * zAllocSize()) :
+                                                              cell_idx_c(1); }
+
 
       uint_t xAllocSize() const;
       uint_t yAllocSize() const;
@@ -91,8 +107,8 @@ namespace cuda {
       uint_t fAllocSize() const;
       inline uint_t allocSize() const { return fAllocSize() * xAllocSize() * yAllocSize() * zAllocSize(); }
 
-      inline bool hasSameAllocSize( const GPUField<T> & other ) const;
-      inline bool hasSameSize( const GPUField<T> & other ) const;
+      bool hasSameAllocSize( const GPUField<T> & other ) const;
+      bool hasSameSize( const GPUField<T> & other ) const;
 
       GPUField<T> * cloneUninitialized() const;
 
@@ -133,3 +149,4 @@ namespace cuda {
 } // namespace walberla
 
 
+#include "GPUField.impl.h"
\ No newline at end of file
diff --git a/src/cuda/GPUField.cpp b/src/cuda/GPUField.impl.h
similarity index 95%
rename from src/cuda/GPUField.cpp
rename to src/cuda/GPUField.impl.h
index 8d2b51ed4..b6fe3f8a8 100644
--- a/src/cuda/GPUField.cpp
+++ b/src/cuda/GPUField.impl.h
@@ -21,7 +21,6 @@
 
 #include "GPUField.h"
 #include "ErrorChecking.h"
-#include "GPUTypesExplicitInstantiation.h"
 
 #include "core/logging/Logging.h"
 
@@ -124,12 +123,23 @@ void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci,
    }
 }
 
+template<typename T>
+inline uint_t GPUField<T>::size( uint_t coord ) const
+{
+   switch (coord) {   // coord: 0 = x, 1 = y, 2 = z, 3 = f
+      case 0: return this->xSize();
+      case 1: return this->ySize();
+      case 2: return this->zSize();
+      case 3: return this->fSize();
+      default: WALBERLA_ASSERT(false); return 0;   // any other coord: assertion, 0 is only the fallback return value
+   }
+}
 
 //*******************************************************************************************************************
 /*! True if sizes of all dimensions match
  *******************************************************************************************************************/
 template<typename T>
-inline bool GPUField<T>::hasSameSize( const GPUField<T> & other ) const
+bool GPUField<T>::hasSameSize( const GPUField<T> & other ) const
 {
    return xSize() == other.xSize() &&
           ySize() == other.ySize() &&
@@ -140,7 +150,7 @@ inline bool GPUField<T>::hasSameSize( const GPUField<T> & other ) const
 /*! True if allocation sizes of all dimensions match
  *******************************************************************************************************************/
 template<typename T>
-inline bool GPUField<T>::hasSameAllocSize( const GPUField<T> & other ) const
+bool GPUField<T>::hasSameAllocSize( const GPUField<T> & other ) const
 {
    return xAllocSize() == other.xAllocSize() &&
           yAllocSize() == other.yAllocSize() &&
@@ -238,9 +248,6 @@ void GPUField<T>::swapDataPointers( GPUField<T> & other )
 
 
 
-GPU_CLASS_TEMPLATE_INSTANTIATION( GPUField )
-
-
 } // namespace cuda
 } // namespace walberla
 
diff --git a/src/cuda/GPUTypesExplicitInstantiation.h b/src/cuda/GPUTypesExplicitInstantiation.h
deleted file mode 100644
index bdc4b5846..000000000
--- a/src/cuda/GPUTypesExplicitInstantiation.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#define GPU_CLASS_TEMPLATE_INSTANTIATION(ClassName)\
-   template class ClassName< double   >;\
-   template class ClassName< float    >;\
-   template class ClassName< int      >;\
-   template class ClassName< uint8_t  >;\
-   template class ClassName< uint16_t >;
-
-
diff --git a/src/cuda/python/Exports.h b/src/cuda/python/Exports.h
new file mode 100644
index 000000000..a2990cfc2
--- /dev/null
+++ b/src/cuda/python/Exports.h
@@ -0,0 +1,43 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Exports.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#ifdef WALBERLA_BUILD_WITH_PYTHON
+
+
+#include <string>
+
+namespace walberla {
+namespace cuda {
+
+
+   template<typename GpuFields >
+   void exportModuleToPython();
+
+
+} // namespace cuda
+} // namespace walberla
+
+#include "Exports.impl.h"
+
+
+#endif //WALBERLA_BUILD_WITH_PYTHON
diff --git a/src/cuda/python/Exports.impl.h b/src/cuda/python/Exports.impl.h
new file mode 100644
index 000000000..1d2e2dc23
--- /dev/null
+++ b/src/cuda/python/Exports.impl.h
@@ -0,0 +1,360 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Exports.impl.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+// Do not reorder includes - the include order is important
+#include "python_coupling/PythonWrapper.h"
+
+#include "core/logging/Logging.h"
+#include "cuda/GPUField.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/AddGPUFieldToStorage.h"
+
+#include "field/communication/UniformMPIDatatypeInfo.h"
+
+#include "field/AddToStorage.h"
+#include "field/python/FieldExport.h"
+
+#include "python_coupling/helper/MplHelpers.h"
+#include "python_coupling/helper/BoostPythonHelpers.h"
+
+#include <boost/type_traits/is_unsigned.hpp>
+
+#include <iostream>
+#include <cuda/communication/GPUPackInfo.h>
+
+namespace walberla {
+namespace cuda {
+
+
+
+namespace internal {
+
+   //===================================================================================================================
+   //
+   //  Field export
+   //
+   //===================================================================================================================
+
+   template<typename GpuField_T>
+   uint64_t gpufield_ptr(const GpuField_T & gpuField)
+   {
+      return reinterpret_cast<uint64_t>(gpuField.pitchedPtr().ptr);   // raw address as integer, exported as the Python 'ptr' property
+   }
+
+   template<typename GpuField_T>
+   std::string gpufield_dtypeStr(const GpuField_T & )   // the argument is only used to deduce GpuField_T
+   {
+      return std::string(field::internal::PythonFormatString<typename GpuField_T::value_type>::get());
+   }
+
+   struct GpuFieldExporter   // functor for for_each_noncopyable_type: registers the Python classes of one GPU field type
+   {
+      template< typename GpuField_T>
+      void operator() ( python_coupling::NonCopyableWrap<GpuField_T> )
+      {
+         using namespace boost::python;
+
+         class_<GpuField_T, shared_ptr<GpuField_T>, boost::noncopyable>( "GpuField", no_init )
+            .add_property("layout",              &field::internal::field_layout            < GpuField_T > )
+            .add_property("size",                &field::internal::field_size              < GpuField_T > )
+            .add_property("sizeWithGhostLayers", &field::internal::field_sizeWithGhostLayer< GpuField_T > )
+            .add_property("allocSize",           &field::internal::field_allocSize         < GpuField_T > )
+            .add_property("strides",             &field::internal::field_strides           < GpuField_T > )
+            .add_property("offsets",             &field::internal::field_offsets           < GpuField_T > )
+            .add_property("ptr",                 &gpufield_ptr                             < GpuField_T > )
+            .add_property("dtypeStr",            &gpufield_dtypeStr                        < GpuField_T > )
+            .def("swapDataPointers",             &field::internal::field_swapDataPointers  < GpuField_T > )
+            .add_property("nrOfGhostLayers",     &GpuField_T::nrOfGhostLayers )
+            .def("cloneUninitialized", &GpuField_T::cloneUninitialized, return_value_policy<manage_new_object>())
+            ;
+
+
+         // pack info of this field type; no_init: no constructor is exposed, objects are built in C++ (createPackInfo)
+         using communication::GPUPackInfo;
+         class_< GPUPackInfo<GpuField_T>,
+                 shared_ptr< GPUPackInfo<GpuField_T> >,
+                 bases<walberla::communication::UniformPackInfo>,
+                 boost::noncopyable >( "GpuFieldPackInfo", no_init );
+
+
+         using field::communication::UniformMPIDatatypeInfo;
+         class_< UniformMPIDatatypeInfo<GpuField_T>,
+                 shared_ptr< UniformMPIDatatypeInfo<GpuField_T> >,
+                 bases<walberla::communication::UniformMPIDatatypeInfo>,
+                 boost::noncopyable >( "GpuFieldMPIDataTypeInfo", no_init );
+
+      }
+   };
+
+
+   //===================================================================================================================
+   //
+   //  createField
+   //
+   //===================================================================================================================
+
+   class CreateFieldExporter   // functor for for_each_noncopyable_type: creates the GPUField whose value type matches 'type'
+   {
+   public:
+      CreateFieldExporter( uint_t xs, uint_t ys, uint_t zs, uint_t fs, uint_t gl,
+                           Layout layout, const boost::python::object & type, bool usePitchedMem,
+                           const shared_ptr<boost::python::object> & resultPointer )
+         : xs_( xs ), ys_(ys), zs_(zs), fs_(fs), gl_(gl),
+           layout_( layout),  type_( type ), usePitchedMem_( usePitchedMem ) , resultPointer_( resultPointer )
+      {}
+
+      template< typename GpuField_T>
+      void operator() ( python_coupling::NonCopyableWrap<GpuField_T> )
+      {
+         using namespace boost::python;
+         typedef typename GpuField_T::value_type T;
+         if( python_coupling::isCppEqualToPythonType<T>( (PyTypeObject *)type_.ptr() )  )   // caller must pass a type object
+         {
+            *resultPointer_ = object( make_shared< GPUField<T> >( xs_,ys_,zs_, fs_,  gl_, layout_, usePitchedMem_ )  );
+         }
+      }
+
+   private:
+      uint_t xs_;
+      uint_t ys_;
+      uint_t zs_;
+      uint_t fs_;
+      uint_t gl_;
+      Layout layout_;
+      boost::python::object type_;
+      bool usePitchedMem_;
+      shared_ptr<boost::python::object> resultPointer_;   // shared with the caller, which reads it after the dispatch
+   };
+
+   template<typename GpuFields>
+   boost::python::object createPythonGpuField( boost::python::list size,   // size: entries 0..2 are x, y, z; an optional 4th is f
+                                               boost::python::object type,
+                                               uint_t ghostLayers,
+                                               Layout layout,
+                                               bool usePitchedMem)
+   {
+      using namespace boost::python;
+      uint_t xSize = extract<uint_t> ( size[0] );
+      uint_t ySize = extract<uint_t> ( size[1] );
+      uint_t zSize = extract<uint_t> ( size[2] );
+      uint_t sizeLen = uint_c( len( size ) );
+      uint_t fSize = 1;   // stays 1 unless 'size' has exactly four entries
+      if ( sizeLen == 4 )
+         fSize = extract<uint_t> ( size[3] );
+
+      if ( ! PyType_Check( type.ptr() ) ) {   // the exporter casts 'type' to PyTypeObject* without a check of its own
+         PyErr_SetString( PyExc_RuntimeError, "Invalid 'type' parameter");
+         throw error_already_set();
+      }
+
+      auto result = make_shared<boost::python::object>();   // shared so that the by-value copy of the exporter can fill it
+      CreateFieldExporter exporter( xSize,ySize, zSize, fSize, ghostLayers, layout, type, usePitchedMem, result );
+      python_coupling::for_each_noncopyable_type< GpuFields >( exporter );
+
+      if ( *result == object()  )   // still equal to a default-constructed object: no entry of GpuFields matched 'type'
+      {
+         PyErr_SetString( PyExc_ValueError, "Cannot create field of this type");
+         throw error_already_set();
+      }
+      else {
+         return *result;
+      }
+   }
+
+
+   //===================================================================================================================
+   //
+   //  addToStorage
+   //
+   //===================================================================================================================
+
+   class AddToStorageExporter   // functor for for_each_noncopyable_type: adds a GPUField of the value type matching 'type'
+   {
+   public:
+      AddToStorageExporter( const shared_ptr<StructuredBlockStorage> & blocks,
+                           const std::string & name, uint_t fs, uint_t gl, Layout layout,
+                           const boost::python::object & type,
+                           bool usePitchedMem )
+         : blocks_( blocks ), name_( name ), fs_( fs ),
+           gl_(gl),layout_( layout),  type_( type ), usePitchedMem_(usePitchedMem), found_(false)
+      {}
+
+      template< typename GpuField_T>
+      void operator() ( python_coupling::NonCopyableWrap<GpuField_T> )
+      {
+         typedef typename GpuField_T::value_type T;
+         if( python_coupling::isCppEqualToPythonType<T>( (PyTypeObject *)type_.ptr() )  )   // caller must pass a type object
+         {
+            WALBERLA_ASSERT(!found_);   // at most one entry of the type list is expected to match
+            addGPUFieldToStorage<GPUField<T> >(blocks_, name_, fs_, layout_, gl_, usePitchedMem_);
+            found_ = true;
+         }
+      }
+
+      bool successful() const { return found_; }   // true once a matching field type has been added
+   private:
+      shared_ptr< StructuredBlockStorage > blocks_;
+      std::string name_;
+      uint_t fs_;
+      uint_t gl_;
+      Layout layout_;
+      boost::python::object type_;
+      bool usePitchedMem_;
+      bool found_;
+   };
+
+   template<typename GpuFields>
+   void addToStorage( const shared_ptr<StructuredBlockStorage> & blocks, const std::string & name,
+                      boost::python::object type, uint_t fs, uint_t gl, Layout layout, bool usePitchedMem )
+   {
+      using namespace boost::python;
+      // 'type' has to be a Python type object: the exporter casts it to PyTypeObject* without a check of its own
+      if ( ! PyType_Check( type.ptr() ) ) {
+         PyErr_SetString( PyExc_RuntimeError, "Invalid 'type' parameter");
+         throw error_already_set();
+      }
+
+      // the exporter is passed via boost::ref so that the found_ flag set during dispatch is visible in successful()
+      AddToStorageExporter exporter( blocks, name, fs, gl, layout, type, usePitchedMem );
+      python_coupling::for_each_noncopyable_type<GpuFields>( boost::ref(exporter) );
+
+      if ( ! exporter.successful() ) {   // no entry of GpuFields has a value type matching 'type'
+         PyErr_SetString( PyExc_ValueError, "Adding Field failed.");
+         throw error_already_set();
+      }
+   }
+
+
+   //===================================================================================================================
+   //
+   //  createPackInfo Export
+   //
+   //===================================================================================================================
+
+   template< typename GPUField_T >
+   boost::python::object createGPUPackInfoToObject( BlockDataID bdId, uint_t numberOfGhostLayers )
+   {
+      using cuda::communication::GPUPackInfo;
+      if ( numberOfGhostLayers > 0  )   // 0 selects the constructor that takes no ghost layer count
+         return boost::python::object( make_shared< GPUPackInfo<GPUField_T> >( bdId, numberOfGhostLayers ) );
+      else
+         return boost::python::object( make_shared< GPUPackInfo<GPUField_T> >( bdId ) );
+   }
+
+   FunctionExporterClass( createGPUPackInfoToObject, boost::python::object( BlockDataID, uint_t  ) );
+
+   template< typename GpuFields>
+   boost::python::object createPackInfo( const shared_ptr<StructuredBlockStorage> & bs,
+                                         const std::string & blockDataName, uint_t numberOfGhostLayers )
+   {
+      // Resolves the block data by name and builds a GPUPackInfo via a dispatch on the first block of the storage.
+
+      auto bdId = python_coupling::blockDataIDFromString( *bs, blockDataName );
+      if ( bs->begin() == bs->end() ) {
+         // the storage holds no blocks (presumably: none on this process), so there is nothing to dispatch on - any PackInfo will do
+         return createGPUPackInfoToObject< GPUField<real_t> > ( bdId, numberOfGhostLayers );
+      }
+
+      IBlock * firstBlock =  & ( * bs->begin() );
+      python_coupling::Dispatcher<GpuFields, Exporter_createGPUPackInfoToObject > dispatcher( firstBlock );
+      return dispatcher( bdId )( bdId, numberOfGhostLayers ) ;
+   }
+
+
+   //===================================================================================================================
+   //
+   //  createMPIDatatypeInfo
+   //
+   //===================================================================================================================
+
+
+   template< typename GpuField_T >
+   boost::python::object createMPIDatatypeInfoToObject( BlockDataID bdId, uint_t numberOfGhostLayers )
+   {
+      using field::communication::UniformMPIDatatypeInfo;
+      if ( numberOfGhostLayers > 0 )   // 0 selects the constructor that takes no ghost layer count
+         return boost::python::object( make_shared< UniformMPIDatatypeInfo<GpuField_T> >( bdId, numberOfGhostLayers ) );
+      else
+         return boost::python::object( make_shared< UniformMPIDatatypeInfo<GpuField_T> >( bdId ) );
+   }
+
+   FunctionExporterClass( createMPIDatatypeInfoToObject, boost::python::object( BlockDataID, uint_t  ) );
+
+   template< typename GpuFields>
+   boost::python::object createMPIDatatypeInfo( const shared_ptr<StructuredBlockStorage> & bs,
+                                                const std::string & blockDataName,
+                                                uint_t numberOfGhostLayers)
+   {
+      auto bdId = python_coupling::blockDataIDFromString( *bs, blockDataName );
+      if ( bs->begin() == bs->end() ) {
+         // the storage holds no blocks (presumably: none on this process), so there is nothing to dispatch on - any MPIDatatypeInfo will do
+         return createMPIDatatypeInfoToObject< GPUField<real_t> > ( bdId, numberOfGhostLayers );
+      }
+
+      IBlock * firstBlock =  & ( * bs->begin() );   // the dispatcher below is constructed from this block
+      python_coupling::Dispatcher<GpuFields, Exporter_createMPIDatatypeInfoToObject > dispatcher( firstBlock );
+      return dispatcher( bdId )( bdId, numberOfGhostLayers );
+   }
+
+
+
+} // namespace internal
+
+
+
+
+template<typename GpuFields >
+void exportModuleToPython()   // registers the field classes of GpuFields and the free functions below in the "cuda" scope
+{
+   python_coupling::ModuleScope fieldModule( "cuda" );
+
+   using namespace boost::python;
+
+   python_coupling::for_each_noncopyable_type<GpuFields>( internal::GpuFieldExporter() );
+
+   def( "createGpuField", &internal::createPythonGpuField<GpuFields>, ( ( arg("size")                     ),
+                                                                         ( arg("type")                     ),
+                                                                         ( arg("ghostLayers") = uint_t(1)  ),
+                                                                         ( arg("layout")      = field::zyxf),
+                                                                         ( arg("usePitchedMem") = true     )  ) );
+
+   // NOTE(review): usePitchedMem defaults to false here (formerly spelled as None) but to true in createGpuField - confirm intent
+   def( "addGpuFieldToStorage",  &internal::addToStorage<GpuFields>, ( ( arg("blocks")                  ),
+                                                                        ( arg("name")                    ),
+                                                                        ( arg("type")                    ),
+                                                                        ( arg("fSize")       = 1         ),
+                                                                        ( arg("ghostLayers") = uint_t(1) ),
+                                                                        ( arg("layout")      = field::zyxf      ),
+                                                                        ( arg("usePitchedMem") = false     ) ) );
+
+   def( "createMPIDatatypeInfo",&internal::createMPIDatatypeInfo<GpuFields>, ( arg("blocks"), arg("blockDataName"), arg("numberOfGhostLayers" ) =0 ) );
+   def( "createPackInfo",       &internal::createPackInfo<GpuFields>,        ( arg("blocks"), arg("blockDataName"), arg("numberOfGhostLayers" ) =0 ) );
+
+}
+
+
+
+
+
+} // namespace cuda
+} // namespace walberla
+
+
diff --git a/src/waLBerlaDefinitions.in.h b/src/waLBerlaDefinitions.in.h
index ed587958e..ce7d276a1 100644
--- a/src/waLBerlaDefinitions.in.h
+++ b/src/waLBerlaDefinitions.in.h
@@ -29,6 +29,8 @@
 
 #cmakedefine WALBERLA_BUILD_WITH_OPENMESH
 
+#cmakedefine WALBERLA_BUILD_WITH_CUDA
+
 #cmakedefine WALBERLA_BUFFER_DEBUG
 
 #cmakedefine WALBERLA_THREAD_SAFE_LOGGING
-- 
GitLab