Compare revisions
Commits on Source (2)
Showing with 1628 additions and 4 deletions
@@ -81,6 +81,8 @@ option ( WALBERLA_BUILD_WITH_PYTHON_LBM "Include LBM module into python modu
option ( WALBERLA_BUILD_WITH_LIKWID_MARKERS "Compile in markers for likwid-perfctr" )
+option ( WALBERLA_BUILD_WITH_CUDA "Enable CUDA support" )
option ( WALBERLA_BUILD_WITH_FASTMATH "Fast math" )
@@ -993,6 +995,45 @@ endif()
############################################################################################################################
##
## CUDA
##
############################################################################################################################
if ( WALBERLA_BUILD_WITH_CUDA )
# set ( BUILD_SHARED_LIBS ON )
set ( CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON )
set ( CUDA_PROPAGATE_HOST_FLAGS OFF CACHE BOOL "" )
if ( (NOT DEFINED CUDA_HOST_COMPILER) AND (${CMAKE_C_COMPILER} MATCHES "ccache") )
string ( STRIP "${CMAKE_C_COMPILER_ARG1}" stripped_compiler_string )
find_program ( CUDA_HOST_COMPILER ${stripped_compiler_string} )
endif ()
find_package ( CUDA REQUIRED )
if ( CUDA_FOUND )
include_directories ( ${CUDA_INCLUDE_DIRS} )
list ( APPEND SERVICE_LIBS ${CUDA_LIBRARIES} )
if ( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=" )
list ( APPEND CUDA_NVCC_FLAGS "-std=c++11" )
endif ()
# Bug with gcc5 and cuda7.5:
#list( APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES -D__STRICT_ANSI__")
# NOTICE: existing cuda flags are overwritten
#set ( CUDA_NVCC_FLAGS "--compiler-bindir=/usr/bin/g++-4.3" )
#set ( CUDA_NVCC_FLAGS "-arch sm_20" )
else()
set ( WALBERLA_BUILD_WITH_CUDA FALSE )
endif ( )
endif ( )
############################################################################################################################
############################################################################################################################
##
## Testing Coverage
......
add_subdirectory(basics)
add_subdirectory(cuda)
add_subdirectory(lbm)
add_subdirectory(pde)
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file 01_GameOfLife_cuda.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "01_GameOfLife_kernels.h"
#include "cuda/HostFieldAllocator.h"
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "core/Environment.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/FieldCopy.h"
#include "cuda/GPUField.h"
#include "cuda/Kernel.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "field/AddToStorage.h"
#include "field/communication/UniformMPIDatatypeInfo.h"
#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h"
#include "geometry/structured/GrayScaleImage.h"
#include "gui/Gui.h"
#include "stencil/D2Q9.h"
#include "timeloop/SweepTimeloop.h"
using namespace walberla;
typedef GhostLayerField<double,1> ScalarField;
typedef cuda::GPUField<double> GPUField;
ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage )
{
return new ScalarField (
storage->getNumberOfXCells( *block ), // number of cells in x direction per block
storage->getNumberOfYCells( *block ), // number of cells in y direction per block
storage->getNumberOfZCells( *block ), // number of cells in z direction per block
1, // one ghost layer
real_t(0), // initial value
field::fzyx, // layout
make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory
);
}
class GameOfLifeSweepCUDA
{
public:
GameOfLifeSweepCUDA( BlockDataID gpuFieldSrcID, BlockDataID gpuFieldDstID )
: gpuFieldSrcID_( gpuFieldSrcID ), gpuFieldDstID_( gpuFieldDstID )
{
}
void operator() ( IBlock * block )
{
auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ );
auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ );
auto myKernel = cuda::make_kernel( &gameOfLifeKernel );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) );
myKernel();
srcCudaField->swapDataPointers( dstCudaField );
}
private:
BlockDataID gpuFieldSrcID_;
BlockDataID gpuFieldDstID_;
};
int main( int argc, char ** argv )
{
walberla::Environment env( argc, argv );
geometry::GrayScaleImage image ("GosperGliderGun.png");
// Create blocks
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid (
uint_t(1) , uint_t(2), uint_t(1), // number of blocks in x,y,z direction
image.size( uint_t(0) ), image.size( uint_t(1) ) / uint_t(2), uint_t(1), // how many cells per block (x,y,z)
real_t(1), // dx: length of one cell in physical coordinates
false, // one block per process - "false" means all blocks to one process
false, false, false ); // no periodicity
BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" );
// Initializing the field from an image
using geometry::initializer::ScalarFieldFromGrayScaleImage;
ScalarFieldFromGrayScaleImage fieldInitializer ( *blocks, cpuFieldID ) ;
fieldInitializer.init( image, uint_t(2), false );
BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" );
typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme;
CommScheme communication( blocks );
communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) );
// Create Timeloop
const uint_t numberOfTimesteps = uint_t(10); // number of timesteps for non-gui runs
SweepTimeloop timeloop ( blocks, numberOfTimesteps );
// Registering the sweep
timeloop.add() << BeforeFunction( communication, "Communication" )
<< Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" );
timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) );
GUI gui ( timeloop, blocks, argc, argv );
gui.run();
return 0;
}
namespace walberla{
/**
\page tutorial_cuda01 Tutorial - CUDA 1: Game of Life on GPU
\image html tutorial_cuda01_nvidia_titan.png
> _Note:_ This tutorial requires a CUDA-aware MPI library.
> If you get a SEGFAULT when executing this tutorial, make sure that your MPI library was built with
> CUDA support! For instructions on how to build OpenMPI with CUDA, see this [page](https://www.open-mpi.org/faq/?category=building#build-cuda).
\section cuda01_fields Creating Fields
To run a simulation on an NVIDIA graphics card, we have to allocate data on the GPU and
write a CUDA kernel that operates on this data. In this tutorial we first allocate a field on the GPU
and learn about functionality to transfer data between CPU and GPU fields.
Since initialization and output routines are usually not time critical, they are implemented
for CPU fields only. In waLBerla we set up the complete simulation using
CPU fields, copy the initialized fields over to the GPU, do the complete computation there, and, in the
end, copy everything back to do the output from the CPU field.
So only the time critical kernels have to be written in CUDA.
Thus the setup code of the GPU GameOfLife program is very similar to its CPU version, which was implemented
in a previous tutorial ( \ref tutorial_basics_03 ).
One difference is that fields which are frequently transferred from/to the GPU should be allocated with
a different field allocator: cuda::HostFieldAllocator. This allocator uses cudaHostAlloc() instead of "new",
so that the memory is marked "pinned": it is always held in RAM and cannot be swapped out to disk.
Data transfer from pinned memory is faster than from normal memory. Using this allocator is not
mandatory; the data transfer functions also work (slightly slower) with normally allocated fields.
\code
ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage )
{
return new ScalarField (
storage->getNumberOfXCells( *block ), // number of cells in x direction per block
storage->getNumberOfYCells( *block ), // number of cells in y direction per block
storage->getNumberOfZCells( *block ), // number of cells in z direction per block
1, // one ghost layer
real_t(0), // initial value
field::fzyx, // layout
make_shared<cuda::HostFieldAllocator<double> >() // allocator for host pinned memory
);
}
\endcode
Now we initialize the CPU field just like in the previous tutorial ( \ref tutorial_basics_03 ).
Then two GPU fields are created: a "source" and a "destination" field. The helper function
cuda::addGPUFieldToStorage() creates a cuda::GPUField of the same size and layout as the given
CPU field:
\code
BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" );
\endcode
The contents of the new GPU fields are initialized with the contents of the given CPU field.
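If the GPU fields need to be refreshed later, e.g. after the CPU field was re-initialized, the
cuda::fieldCpy() function introduced in this commit can be used directly; a minimal sketch, with the IDs from above:
\code
// copy the CPU field into the GPU source field on all blocks
cuda::fieldCpy< GPUField, ScalarField >( blocks, gpuFieldSrcID, cpuFieldID );
\endcode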
\section cuda01_kernels Writing and calling CUDA kernels
For a basic understanding of the CUDA support in waLBerla please read \ref cudaPage first.
After reading this page you should know what a FieldAccessor is and how to call CUDA kernels from
.cpp files. So now we can start writing
a CUDA kernel for the Game of Life algorithm. We place it in a separate file with the ".cu" extension.
The build system then automatically detects that this file should be compiled with the CUDA C++ compiler.
The kernel gets two field accessors as arguments, one for the source and one for the destination field.
Both accessors have to be configured using the CUDA variables blockIdx and threadIdx, such that afterwards
the get() and getNeighbor() functions of the accessor class can work correctly.
\code
__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst )
{
src.set( blockIdx, threadIdx );
dst.set( blockIdx, threadIdx );
int liveNeighbors = 0;
if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors;
// normal Game of Life algorithm ....
// ...
}
\endcode
To call this kernel we write a thin wrapper sweep which only gets the GPU fields out of the block storage
and passes them to the CUDA kernel. We use the cuda::Kernel class from waLBerla here, so that we can write this
sweep in a normal .cpp file.
Here are the contents of this sweep:
\code
auto srcCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldSrcID_ );
auto dstCudaField = block->getData< cuda::GPUField<real_t> > ( gpuFieldDstID_ );
auto myKernel = cuda::make_kernel( &gameOfLifeKernel );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *srcCudaField ) );
myKernel.addFieldIndexingParam( cuda::FieldIndexing<double>::xyz( *dstCudaField ) );
myKernel();
srcCudaField->swapDataPointers( dstCudaField );
\endcode
All the computations are done on the GPU. The CPU field is not updated automatically! It was only used
during setup.
To see if our kernel works, we copy the contents back to the CPU field after every timestep:
\code
timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) );
\endcode
Of course this makes no sense for real simulations, since the transfer time is much higher than the
time saved by doing the computation on the GPU. For production runs, one would usually transfer the
field back only every n-th timestep and e.g. write a VTK frame.
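A minimal sketch of such a setup, assuming the timeloop's singleStep() interface and omitting the
actual VTK output:
\code
const uint_t outputInterval = uint_t(100);
for( uint_t t = uint_t(0); t < numberOfTimesteps; ++t )
{
   timeloop.singleStep(); // advance one step, all data stays on the GPU
   if( (t+1) % outputInterval == uint_t(0) )
   {
      // a single transfer instead of one per timestep
      cuda::fieldCpy< ScalarField, GPUField >( blocks, cpuFieldID, gpuFieldDstID );
      // ... write a VTK frame from the CPU field here ...
   }
}
\endcode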
\section cuda01_comm Communication
In waLBerla there are two types of communication: _buffered_ and _direct_ communication.
While buffered communication first collects all data in a buffer and sends only one message per communication step and neighbor,
the direct communication strategy, which is based on MPI datatypes, uses no intermediate buffers and therefore has to send
more messages than buffered communication. For details see \ref walberla_communication .
In the tutorials up to now, only the buffered approach was used. In this tutorial, we switch to the direct communication strategy
because it allows us to use the CUDA support of the MPI library to communicate directly from/to GPU memory.
The usage of the two different communication schemes is very similar. Instead of creating a blockforest::communication::UniformBufferedScheme
we create a blockforest::communication::UniformDirectScheme.
Then we register a field::communication::UniformMPIDatatypeInfo instead of the field::communication::PackInfo.
\code
typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme;
CommScheme communication( blocks );
communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) );
\endcode
This scheme also supports heterogeneous simulations, i.e. using a CPU field on
some processes and a GPU field on other processes.
*/
}
#include "../cuda/01_GameOfLife_kernels.h"
#include <iostream>
namespace walberla {
__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst )
{
src.set( blockIdx, threadIdx );
dst.set( blockIdx, threadIdx );
// Count number of living neighbors
int liveNeighbors = 0;
if ( src.getNeighbor( 1, 0,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, 0,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( 0,+1,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( 0,-1,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, -1, 0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( -1, +1, 0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( +1, -1,0 ) > 0.5 ) ++liveNeighbors;
if ( src.getNeighbor( +1, +1,0 ) > 0.5 ) ++liveNeighbors;
// cell dies because of under- or over-population
if ( liveNeighbors < 2 || liveNeighbors > 3 )
dst.get() = 0.0;
else if ( liveNeighbors == 3 ) // cell comes alive
dst.get() = 1.0;
else
dst.get() = src.get();
}
} // namespace walberla
#include <iostream>
#include "cuda/FieldIndexing.h"
namespace walberla {
__global__ void gameOfLifeKernel( cuda::FieldAccessor<double> src, cuda::FieldAccessor<double> dst );
} // namespace walberla
waLBerla_link_files_to_builddir( *.prm )
waLBerla_link_files_to_builddir( *.png )
waLBerla_add_executable ( NAME 01_GameOfLife_cuda
FILES 01_GameOfLife_cuda.cpp 01_GameOfLife_kernels.cu
DEPENDS blockforest core cuda field lbm geometry timeloop gui )
\ No newline at end of file
apps/tutorials/cuda/GosperGliderGun.png (binary file, 228 B)
@@ -84,13 +84,17 @@ function ( waLBerla_add_module )
   set( hasSourceFiles FALSE )
   foreach ( sourceFile ${sourceFiles} )
-      if ( ${sourceFile} MATCHES "\\.(c|cpp)" )
+      if ( ${sourceFile} MATCHES "\\.(c|cpp|cu)" )
         set( hasSourceFiles TRUE )
      endif( )
   endforeach( )
   if ( hasSourceFiles )
-      add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} )
+      if ( CUDA_FOUND )
+         cuda_add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} )
+      else()
+         add_library( ${moduleLibraryName} STATIC ${sourceFiles} ${otherFiles} )
+      endif( CUDA_FOUND )
   else( )
      add_custom_target( ${moduleLibraryName} SOURCES ${sourceFiles} ${otherFiles} ) # dummy IDE target
   endif( )
@@ -194,7 +198,13 @@ function ( waLBerla_add_executable )
      endif ( )
   endif()
-   add_executable( ${ARG_NAME} ${sourceFiles} )
+   if ( CUDA_FOUND )
+      cuda_add_executable( ${ARG_NAME} ${sourceFiles} )
+   else()
+      add_executable( ${ARG_NAME} ${sourceFiles} )
+   endif()
+   #add_executable( ${ARG_NAME} ${sourceFiles} )
   target_link_modules ( ${ARG_NAME} ${ARG_DEPENDS} )
   target_link_libraries( ${ARG_NAME} ${SERVICE_LIBS} )
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AddGPUFieldToStorage.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "GPUField.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include <boost/bind.hpp>
namespace walberla {
namespace cuda {
//*******************************************************************************************************************
/*! Adds a cuda::GPUField to a StructuredBlockStorage
*
* - Similar to walberla::field::addToStorage() functions
* - created field is uninitialized
*/
//*******************************************************************************************************************
template< typename GPUField_T>
BlockDataID addGPUFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs,
const std::string & identifier,
uint_t fSize,
const Layout layout = fzyx,
uint_t nrOfGhostLayers = 1,
bool usePitchedMem = true );
//*******************************************************************************************************************
/*! Adds a cuda::GPUField to a StructuredBlockStorage using data from a CPU field
*
* - adds a GPU field to a StructuredBlockStorage using a CPU field
* - sizes, number of ghostlayers and layout are the same as the CPU field
* - GPU field is initialized with the data currently stored in the CPU field
* @tparam Field_T type of the CPU field, the created GPUField will be of type cuda::GPUField<Field_T::value_type>
*/
//*******************************************************************************************************************
template< typename Field_T>
BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs,
ConstBlockDataID cpuFieldID,
const std::string & identifier,
bool usePitchedMem = true );
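// Usage sketch (hypothetical identifiers, assuming double-valued fields):
//
//   // GPU-only field, created uninitialized with one ghost layer:
//   BlockDataID gpuOnly = addGPUFieldToStorage< GPUField<double> >( bs, "gpu field", uint_t(1) );
//
//   // GPU field mirroring an existing CPU field, including its current contents:
//   BlockDataID gpuCopy = addGPUFieldToStorage< GhostLayerField<double,1> >( bs, cpuFieldID, "gpu copy" );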
} // namespace cuda
} // namespace walberla
#include "AddGPUFieldToStorage.impl.h"
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AddGPUFieldToStorage.impl.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
namespace walberla {
namespace cuda {
namespace internal
{
template< typename GPUField_T>
GPUField_T * createGPUField( const IBlock * const block,
const StructuredBlockStorage * const bs,
uint_t ghostLayers,
uint_t fSize,
const field::Layout & layout,
bool usePitchedMem )
{
return new GPUField_T( bs->getNumberOfXCells( *block ),
bs->getNumberOfYCells( *block ),
bs->getNumberOfZCells( *block ),
fSize, ghostLayers, layout, usePitchedMem );
}
template< typename Field_T>
GPUField< typename Field_T::value_type> *
createGPUFieldFromCPUField( const IBlock * const block,
const StructuredBlockStorage * const,
ConstBlockDataID cpuFieldID,
bool usePitchedMem
)
{
typedef GPUField< typename Field_T::value_type> GPUField_T;
const Field_T * f = block->getData<Field_T>( cpuFieldID );
auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(),
f->nrOfGhostLayers(), f->layout(), usePitchedMem );
cuda::fieldCpy( *gpuField, *f );
return gpuField;
}
}
template< typename GPUField_T>
BlockDataID addGPUFieldToStorage(const shared_ptr< StructuredBlockStorage >& bs,
const std::string & identifier,
uint_t fSize,
const Layout layout,
uint_t nrOfGhostLayers,
bool usePitchedMem )
{
auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout, usePitchedMem );
return bs->addStructuredBlockData< GPUField_T >( func, identifier );
}
template< typename Field_T>
BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs,
ConstBlockDataID cpuFieldID,
const std::string & identifier,
bool usePitchedMem )
{
auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID, usePitchedMem );
return bs->addStructuredBlockData< GPUField<typename Field_T::value_type> >( func, identifier );
}
} // namespace cuda
} // namespace walberla
###################################################################################################
#
# Module cuda
#
###################################################################################################
waLBerla_add_module( DEPENDS core communication domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA )
###################################################################################################
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file ErrorChecking.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "core/Abort.h"
#include <sstream>
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
#define WALBERLA_CUDA_CHECK(ans) { ::walberla::cuda::checkForError((ans), __FILE__, __LINE__); }
inline void checkForError( cudaError_t code, const std::string & callerPath, const int line )
{
if(code != cudaSuccess)
{
std::stringstream ss;
ss << "CUDA Error: " << cudaGetErrorString( code );
Abort::instance()->abort( ss.str(), callerPath, line );
}
}
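// Usage sketch: wrap every CUDA runtime call in the macro so that a failure
// aborts with the CUDA error string plus file and line information, e.g.
//
//   void * devPtr = 0;
//   WALBERLA_CUDA_CHECK( cudaMalloc( &devPtr, 1024 ) );
//   WALBERLA_CUDA_CHECK( cudaFree( devPtr ) );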
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessor.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include <cuda_runtime.h>
#include "core/DataTypes.h"
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessor
{
public:
enum IndexingScheme { FZYX, FZY, FZ, F,
ZYXF, ZYX, ZY, Z
};
FieldAccessor( char * ptr,
uint_t xOffset,
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
IndexingScheme indexingScheme )
: ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset),
fOffset_(fOffset), indexingScheme_(indexingScheme )
{}
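   // set() positions the accessor for the current thread; it assumes the launch
   // configuration produced by FieldIndexing<T> (innermost coordinate in
   // threadIdx.x, the remaining coordinates in blockIdx.x/y/z), where leading
   // size-one coordinates may have been shifted out of the indexing scheme.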
__device__ void set( uint3 blockIdx, uint3 threadIdx )
{
switch ( indexingScheme_)
{
case FZYX: ptr_ += blockIdx.z * fOffset_ + blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
case FZY : ptr_ += blockIdx.y * fOffset_ + blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
case FZ : ptr_ += blockIdx.x * fOffset_ + threadIdx.x * zOffset_; break;
case F : ptr_ += threadIdx.x * fOffset_; break;
case ZYXF: ptr_ += blockIdx.z * zOffset_ + blockIdx.y * yOffset_ + blockIdx.x * xOffset_ + threadIdx.x * fOffset_; break;
case ZYX : ptr_ += blockIdx.y * zOffset_ + blockIdx.x * yOffset_ + threadIdx.x * xOffset_; break;
case ZY : ptr_ += blockIdx.x * zOffset_ + threadIdx.x * yOffset_; break;
case Z : ptr_ += threadIdx.x * zOffset_; break;
}
}
__device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
{
return threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x +
blockIdx.z * blockDim.x * gridDim.x * gridDim.y ;
}
// This is always true for this specific field indexing class.
__device__ __forceinline__ bool isValidPosition() { return true; }
__device__ T & get() { return * (T*)(ptr_); }
__device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); }
__device__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ );
}
__device__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ +
cf * fOffset_ );
}
protected:
char * ptr_;
uint_t xOffset_;
uint_t yOffset_;
uint_t zOffset_;
uint_t fOffset_;
IndexingScheme indexingScheme_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessor3D.h
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//
//======================================================================================================================
#pragma once
#include "core/DataTypes.h"
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessor3D
{
public:
FieldAccessor3D( char * ptr,
uint_t xOffset,
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
const dim3 & idxDim,
const dim3 & blockDim )
: ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ),
idxDim_( idxDim ), blockDim_( blockDim ), isValidPosition_( false )
{}
__device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx )
{
uint_t x = blockIdx.x * blockDim_.x + threadIdx.x;
uint_t y = blockIdx.y * blockDim_.y + threadIdx.y;
uint_t z = blockIdx.z * blockDim_.z + threadIdx.z;
if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z )
{
ptr_ += x * xOffset_ + y * yOffset_ + z * zOffset_;
isValidPosition_ = true;
}
}
__device__ __forceinline__ bool isValidPosition() { return isValidPosition_; }
__device__ __forceinline__ T & get() { return * (T*)(ptr_); }
__device__ __forceinline__ T & get( int f ) { return * (T*)(ptr_ + f * fOffset_); }
__device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ );
}
__device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ +
cf * fOffset_ );
}
protected:
char * ptr_;
uint_t xOffset_;
uint_t yOffset_;
uint_t zOffset_;
uint_t fOffset_;
dim3 idxDim_;
dim3 blockDim_;
bool isValidPosition_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessorXYZ.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include <cuda_runtime.h>
#include "core/DataTypes.h"
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessorXYZ
{
public:
FieldAccessorXYZ( char * ptr, size_t xOffset, size_t yOffset, size_t zOffset, size_t fOffset )
: ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset)
{}
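   // set() assumes a launch configuration with one thread per cell in x:
   // threadIdx.x iterates over x, blockIdx.x over y and blockIdx.y over z.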
__device__ void set( uint3 blockIdx, uint3 threadIdx )
{
ptr_ += threadIdx.x * xOffset_ +
blockIdx.x * yOffset_ +
blockIdx.y * zOffset_ ;
}
__device__ T & get() { return * (T*)(ptr_); }
__device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); }
__device__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * (int)(xOffset_) +
cy * (int)(yOffset_) +
cz * (int)(zOffset_) );
}
__device__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * (int)(xOffset_) +
cy * (int)(yOffset_) +
cz * (int)(zOffset_) +
cf * (int)(fOffset_) );
}
protected:
char * ptr_;
size_t xOffset_;
size_t yOffset_;
size_t zOffset_;
size_t fOffset_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldCopy.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "ErrorChecking.h"
#include "GPUField.h"
#include "domain_decomposition/StructuredBlockStorage.h"
#include "field/Field.h"
#include "field/GhostLayerField.h"
#include "core/Abort.h"
#include "core/logging/Logging.h"
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
template<typename DstType, typename SrcType>
void fieldCpy( const shared_ptr< StructuredBlockStorage > & blocks, BlockDataID dstID, ConstBlockDataID srcID )
{
for ( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
{
DstType * dst = blockIt->getData<DstType>( dstID );
const SrcType * src = blockIt->getData<SrcType>( srcID );
fieldCpy( *dst, *src );
}
}
template<typename DstType, typename SrcType>
boost::function<void()> fieldCpyFunctor( const shared_ptr< StructuredBlockStorage > & blocks,
BlockDataID dstID, ConstBlockDataID srcID )
{
return boost::bind( fieldCpy<DstType,SrcType>, blocks, dstID, srcID );
}
template<typename DstType, typename SrcType>
void fieldCpySweepFunction( BlockDataID dstID, ConstBlockDataID srcID, IBlock * block )
{
DstType * dst = block->getData<DstType>( dstID );
const SrcType * src = block->getData<SrcType>( srcID );
fieldCpy( *dst, *src );
}
template<typename DstType, typename SrcType>
boost::function<void(IBlock*)> fieldCpyFunctor( BlockDataID dstID, ConstBlockDataID srcID )
{
return boost::bind( fieldCpySweepFunction<DstType,SrcType>, dstID, srcID, _1 );
}
template<typename T, uint_t fs>
void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src );
template<typename T, uint_t fs>
void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src );
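// Usage sketch (hypothetical IDs): the functor variants are convenient for registering
// copy operations, e.g. as a timeloop sweep that copies a GPU field back to a CPU field:
//
//   timeloop.add() << Sweep( fieldCpyFunctor< field::Field<double,1>, GPUField<double> >( cpuID, gpuID ) );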
//===================================================================================================================
//
// Implementation
//
//===================================================================================================================
template<typename T, uint_t fs>
void fieldCpy( cuda::GPUField<T> & dst, const field::Field<T,fs> & src )
{
cudaMemcpy3DParms p;
memset( &p, 0, sizeof(p) );
if ( dst.layout() != src.layout() ) {
WALBERLA_ABORT( "Cannot copy fields with different layout" );
}
bool canCopy = ( src.layout() == fzyx &&
dst.fAllocSize() == src.fAllocSize() &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xSize() == src.xSize() )
||
( src.layout() == zyxf &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xAllocSize() == src.xAllocSize() &&
dst.fSize() == src.fSize() );
if ( !canCopy ) {
WALBERLA_ABORT("Field have to have the same size ");
}
if ( dst.layout() == fzyx )
{
p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer
sizeof(T) * src.xAllocSize(), // pitch
src.xAllocSize(), // inner dimension size
src.yAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
p.extent.height = dst.yAllocSize();
p.extent.depth = dst.zAllocSize() * dst.fAllocSize();
}
else
{
p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer
sizeof(T) * src.fAllocSize(), // pitch
src.fAllocSize(), // inner dimension size
src.xAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
p.extent.height = dst.xAllocSize();
p.extent.depth = dst.yAllocSize() * dst.zAllocSize();
}
p.dstPtr = dst.pitchedPtr();
p.kind = cudaMemcpyHostToDevice;
WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) );
}
template<typename T, uint_t fs>
void fieldCpy( field::Field<T,fs> & dst, const cuda::GPUField<T> & src )
{
cudaMemcpy3DParms p;
memset( &p, 0, sizeof(p) );
if ( dst.layout() != src.layout() ) {
WALBERLA_ABORT( "Cannot copy fields with different layout" );
}
bool canCopy = ( src.layout() == fzyx &&
dst.fAllocSize() == src.fAllocSize() &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xSize() == src.xSize() )
||
( src.layout() == zyxf &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xAllocSize() == src.xAllocSize() &&
dst.fSize() == src.fSize() );
if ( !canCopy ) {
WALBERLA_ABORT("Field have to have the same size ");
}
if ( dst.layout() == fzyx )
{
p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer
sizeof(T) * dst.xAllocSize(), // pitch
dst.xAllocSize(), // inner dimension size
dst.yAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
p.extent.height = dst.yAllocSize();
p.extent.depth = dst.zAllocSize() * dst.fAllocSize();
}
else
{
p.dstPtr = make_cudaPitchedPtr( (void*)(dst.data()), // pointer
sizeof(T) * dst.fAllocSize(), // pitch
dst.fAllocSize(), // inner dimension size
dst.xAllocSize() ); // next outer dimension size
p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
p.extent.height = dst.xAllocSize();
p.extent.depth = dst.yAllocSize() * dst.zAllocSize();
}
p.srcPtr = src.pitchedPtr();
p.kind = cudaMemcpyDeviceToHost;
WALBERLA_CUDA_CHECK( cudaMemcpy3D( &p ) );
}
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing.cpp
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "FieldIndexing.h"
#include "GPUTypesExplicitInstantiation.h"
#include "GPUField.h"
#include "core/cell/CellInterval.h"
#include "core/debug/Debug.h"
#include "core/logging/Logging.h"
#include "field/Layout.h"
#include <cuda_runtime.h>
#include <limits>
#include <cmath>
namespace walberla {
namespace cuda {
template< typename T>
FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field,
dim3 _blockDim, dim3 _gridDim,
const FieldAccessor<T> _gpuAccess )
: field_( field ),
blockDim_( _blockDim ),
gridDim_( _gridDim ),
gpuAccess_( _gpuAccess )
{
WALBERLA_DEBUG_SECTION()
{
cudaDeviceProp prop;
int count;
cudaGetDeviceCount(&count);
int threadsPerBlock = std::numeric_limits<int>::max();
for (int i = 0; i < count; i++) {
cudaGetDeviceProperties(&prop, i);
threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
}
WALBERLA_ASSERT_LESS_MSG( int_c( blockDim_.x ), threadsPerBlock,
"InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
"than the maximal thread count per CUDA block." );
}
}
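// If the innermost (fastest) coordinate has extent one, every thread block would
// contain only a single thread. The loop below therefore promotes the next-outer
// coordinate to the thread index and drops the leading coordinate from the
// indexing scheme, until the fastest coordinate is larger than one or only a
// single element remains.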
template< typename T>
void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::IndexingScheme & indexing, dim3 & gridDim, dim3 & blockDim )
{
bool runLoop = true;
while( blockDim.x == 1 && runLoop )
{
blockDim.x = gridDim.x;
gridDim.x = gridDim.y;
gridDim.y = gridDim.z;
gridDim.z = 1;
switch ( indexing ) {
case FieldAccessor<T>::FZYX: indexing = FieldAccessor<T>::FZY; break;
case FieldAccessor<T>::FZY : indexing = FieldAccessor<T>::FZ ; break;
case FieldAccessor<T>::FZ : indexing = FieldAccessor<T>::F ; break;
case FieldAccessor<T>::ZYXF: indexing = FieldAccessor<T>::ZYX ; break;
case FieldAccessor<T>::ZYX : indexing = FieldAccessor<T>::ZY ; break;
case FieldAccessor<T>::ZY : indexing = FieldAccessor<T>::Z ; break;
// iteration goes only over a single element - stop the loop
case FieldAccessor<T>::Z: runLoop = false; break;
case FieldAccessor<T>::F: runLoop = false; break;
}
}
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd )
{
uint_t xOffset, yOffset, zOffset, fOffset;
if ( f.layout() == field::zyxf )
{
fOffset = sizeof(T);
xOffset = f.pitchedPtr().pitch;
yOffset = xOffset * f.xAllocSize();
zOffset = yOffset * f.yAllocSize();
}
else
{
xOffset = sizeof(T);
yOffset = f.pitchedPtr().pitch;
zOffset = yOffset * f.yAllocSize();
fOffset = zOffset * f.zAllocSize();
}
char * data = (char*)f.pitchedPtr().ptr;
// Jump over ghost cells to first inner cell
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
data += ( ci.xMin() + gl )* xOffset +
( ci.yMin() + gl )* yOffset +
( ci.zMin() + gl )* zOffset;
dim3 gridDim;
dim3 blockDim;
typename FieldAccessor<T>::IndexingScheme firstCoord;
if ( f.layout() == fzyx )
{
firstCoord = FieldAccessor<T>::FZYX;
blockDim = dim3 ( (unsigned int)ci.xSize(), 1, 1 );
gridDim = dim3 ( (unsigned int)ci.ySize(), (unsigned int)ci.zSize(), (unsigned int)( fEnd - fBegin) );
}
else
{
firstCoord = FieldAccessor<T>::ZYXF;
blockDim = dim3 ( (unsigned int)(fEnd - fBegin), 1, 1 );
gridDim = dim3 ( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() );
}
shiftCoordinatesWhileFastestCoordHasSizeOne<T>( firstCoord, gridDim, blockDim );
return FieldIndexing<T> ( f, blockDim, gridDim,
FieldAccessor<T>( data, xOffset, yOffset, zOffset, fOffset, firstCoord ) );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::xyz ( const GPUField<T> & f )
{
CellInterval ci ( 0,0,0,
cell_idx_c( f.xSize() ) - 1,
cell_idx_c( f.ySize() ) - 1,
cell_idx_c( f.zSize() ) - 1 );
return interval( f, ci, 0,1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers )
{
cell_idx_t gl = std::min( cell_idx_c(numGhostLayers), cell_idx_c( f.nrOfGhostLayers() ) );
CellInterval ci ( -gl,-gl,-gl,
cell_idx_c( f.xSize() ) + gl - 1,
cell_idx_c( f.ySize() ) + gl - 1,
cell_idx_c( f.zSize() ) + gl - 1 );
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::ghostLayerOnlyXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getGhostRegion( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSliceBeforeGhostLayer( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::allInner ( const GPUField<T> & f )
{
CellInterval ci ( 0,0,0,
cell_idx_c( f.xSize() ) - 1,
cell_idx_c( f.ySize() ) - 1,
cell_idx_c( f.zSize() ) - 1 );
return interval( f, ci, 0, cell_idx_c( f.fSize() ) );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::allWithGhostLayer ( const GPUField<T> & f )
{
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
CellInterval ci ( -gl,-gl,-gl,
cell_idx_c( f.xSize() ) + gl - 1,
cell_idx_c( f.ySize() ) + gl - 1,
cell_idx_c( f.zSize() ) + gl - 1 );
return interval( f, ci, 0, cell_idx_c( f.fSize() ) );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::all ( const GPUField<T> & f, const cell::CellInterval & ci )
{
return interval( f, ci, 0, cell_idx_c( f.fSize() ) );
}
GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing )
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//! \brief Indexing scheme that executes all elements of the inner coordinate within one thread block
//
//======================================================================================================================
#pragma once
#include "FieldAccessor.h"
#include "stencil/Directions.h"
#include <cuda_runtime.h>
namespace walberla { namespace cell { class CellInterval; } }
namespace walberla {
namespace cuda {
// Forward Declarations
template< typename T> class GPUField;
template<typename T>
class FieldIndexing
{
public:
//** Kernel call ******************************************************************************************
/*! \name Kernel call */
//@{
dim3 blockDim() const { return blockDim_; }
dim3 gridDim () const { return gridDim_; }
const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; }
//@}
//****************************************************************************************************************
//** Creation *********************************************************************************************
/*! \name Creation */
//@{
static FieldIndexing<T> interval ( const GPUField<T> & f,
const cell::CellInterval & ci,
int fBegin=0, int fEnd=1 );
static FieldIndexing<T> xyz ( const GPUField<T> & f );
static FieldIndexing<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers );
static FieldIndexing<T> ghostLayerOnlyXYZ ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> allInner ( const GPUField<T> & f );
static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f );
static FieldIndexing<T> all ( const GPUField<T> & f, const cell::CellInterval & ci );
//@}
//****************************************************************************************************************
protected:
FieldIndexing ( const GPUField<T> & field,
dim3 _blockDim, dim3 _gridDim,
const FieldAccessor<T> _gpuAccess );
const GPUField<T> & field_;
dim3 blockDim_;
dim3 gridDim_;
FieldAccessor<T> gpuAccess_;
};
} // namespace cuda
} // namespace walberla
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing3D.cpp
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//
//======================================================================================================================
#include "FieldIndexing3D.h"
#include "GPUTypesExplicitInstantiation.h"
#include "GPUField.h"
#include "core/cell/CellInterval.h"
#include "core/debug/Debug.h"
#include "core/logging/Logging.h"
#include "field/Layout.h"
#include <cuda_runtime.h>
#include <limits>
#include <cmath>
#define BLOCK_SIZE_X uint_t( 32 )
#define BLOCK_SIZE_Y uint_t( 2 )
#define BLOCK_SIZE_Z uint_t( 2 )
namespace walberla {
namespace cuda {
// Returns ( a % b != 0 ) ? ( a / b + 1 ) : ( a / b )
inline unsigned int iDivUp( unsigned int a, unsigned int b ) { return ( a + b - 1 ) / b; }
dim3 FieldIndexing3DBase::preferredBlockDim_( BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z );
template< typename T>
FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field,
const dim3 & _blockDim, const dim3 & _gridDim,
const FieldAccessor3D<T> & _gpuAccess )
: field_( field ),
blockDim_( _blockDim ),
gridDim_( _gridDim ),
gpuAccess_( _gpuAccess )
{
WALBERLA_DEBUG_SECTION()
{
cudaDeviceProp prop;
int count;
cudaGetDeviceCount(&count);
int threadsPerBlock = std::numeric_limits<int>::max();
for (int i = 0; i < count; i++) {
cudaGetDeviceProperties(&prop, i);
threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
}
WALBERLA_ASSERT_LESS_MSG( int_c( blockDim_.x ), threadsPerBlock,
"InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
"than the maximal thread count per CUDA block." );
}
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const CellInterval & ci )
{
uint_t xOffset, yOffset, zOffset, fOffset;
WALBERLA_ASSERT( f.layout() == field::fzyx );
xOffset = sizeof(T);
yOffset = f.pitchedPtr().pitch;
zOffset = yOffset * f.yAllocSize();
fOffset = zOffset * f.zAllocSize();
char * data = (char*)f.pitchedPtr().ptr;
// position data according to ci
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
data += ( ci.xMin() + gl ) * xOffset +
( ci.yMin() + gl ) * yOffset +
( ci.zMin() + gl ) * zOffset;
dim3 idxDim( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() );
unsigned int bx = std::min( preferredBlockDim_.x, idxDim.x );
unsigned int by = std::min( preferredBlockDim_.y, idxDim.y );
unsigned int bz = std::min( preferredBlockDim_.z, idxDim.z );
dim3 gridDim( iDivUp( idxDim.x, bx ),
iDivUp( idxDim.y, by ),
iDivUp( idxDim.z, bz ) );
dim3 blockDim( bx, by, bz );
return FieldIndexing3D<T>( f, blockDim, gridDim,
FieldAccessor3D<T>( data, xOffset, yOffset, zOffset, fOffset,
idxDim, blockDim ) );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::xyz ( const GPUField<T> & f )
{
CellInterval ci( 0,0,0,
cell_idx_c( f.xSize() ) - 1,
cell_idx_c( f.ySize() ) - 1,
cell_idx_c( f.zSize() ) - 1 );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers )
{
cell_idx_t gl = std::min( cell_idx_c( numGhostLayers ), cell_idx_c( f.nrOfGhostLayers() ) );
CellInterval ci( -gl,-gl,-gl,
cell_idx_c( f.xSize() ) + gl - 1,
cell_idx_c( f.ySize() ) + gl - 1,
cell_idx_c( f.zSize() ) + gl - 1 );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::ghostLayerOnlyXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getGhostRegion( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSliceBeforeGhostLayer( dir, ci, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing3D<T> FieldIndexing3D<T>::intervalXYZ( const GPUField<T> & f, const cell::CellInterval & ci )
{
return interval( f, ci );
}
GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing3D )
} // namespace cuda
} // namespace walberla