diff --git a/cmake/waLBerlaHelperFunctions.cmake b/cmake/waLBerlaHelperFunctions.cmake
index df759b16241646ceb8b056866d31aa175982d0d4..02ef2aa007aace8de1aca86928cb819a21a905cf 100644
--- a/cmake/waLBerlaHelperFunctions.cmake
+++ b/cmake/waLBerlaHelperFunctions.cmake
@@ -33,20 +33,20 @@ function( handle_python_codegen sourceFilesOut codeGenRequiredOut )
     set(codeGenRequired NO)
     foreach( sourceFile ${ARGN} )
         if( ${sourceFile} MATCHES ".*\\.gen\\.py$" )
-            get_filename_component(sourceFile ${sourceFile} NAME)
-            if( ${sourceFile} MATCHES ".*\\.cuda\\.gen\\.py$" )
-                string(REPLACE ".cuda.gen.py" ".h"  genHeaderFile ${sourceFile})
-                string(REPLACE ".cuda.gen.py" ".cu" genSourceFile ${sourceFile})
+            get_filename_component(sourceFileName ${sourceFile} NAME)
+            if( ${sourceFileName} MATCHES ".*\\.cuda\\.gen\\.py$" )
+                string(REPLACE ".cuda.gen.py" ".h"  genHeaderFile ${sourceFileName})
+                string(REPLACE ".cuda.gen.py" ".cu" genSourceFile ${sourceFileName})
             else()
-                string(REPLACE ".gen.py" ".h"  genHeaderFile ${sourceFile})
-                string(REPLACE ".gen.py" ".cpp" genSourceFile ${sourceFile})
+                string(REPLACE ".gen.py" ".h"  genHeaderFile ${sourceFileName})
+                string(REPLACE ".gen.py" ".cpp" genSourceFile ${sourceFileName})
             endif()
             list(APPEND result ${CMAKE_CURRENT_BINARY_DIR}/${genSourceFile}
                                ${CMAKE_CURRENT_BINARY_DIR}/${genHeaderFile})
             add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${genSourceFile}
                                       ${CMAKE_CURRENT_BINARY_DIR}/${genHeaderFile}
-                               DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}
-                               COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${sourceFile}
+                               DEPENDS ${sourceFile}
+                               COMMAND ${PYTHON_EXECUTABLE} ${sourceFile}
                                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
             include_directories(${CMAKE_CURRENT_BINARY_DIR})
             set(codeGenRequired YES)
diff --git a/cmake/waLBerlaModuleDependencySystem.cmake b/cmake/waLBerlaModuleDependencySystem.cmake
index 07b547551eb2be4d1d4ede08bffaf3d7d2f3f2af..45f7dbdc7f18e72de73c17e544867bb16590b3ad 100644
--- a/cmake/waLBerlaModuleDependencySystem.cmake
+++ b/cmake/waLBerlaModuleDependencySystem.cmake
@@ -7,11 +7,11 @@
 # Here is an explanation of the waLBerla module mechanism:        
 #  - One folder with a CMakeLists.txt that is a subfolder of one of the directories listed in the variable
 #    WALBERLA_MODULE_DIRS can be a module
-#  - the name of the module is the path relative to an WALBERLA_MODULE_DIRS entry
+#  - the name of the module is the path relative to a WALBERLA_MODULE_DIRS entry
 #  - waLBerla modules are all placed in the src/ subdirectory, so WALBERLA_MODULE_DIRS contains ${waLBerla_SOURCE}/src/
 #  - to create a module call waLBerla_module() inside this folder
-#  - this creates a static library that has the same name as the module, but slashes are replaced by minuses
-#    in case the module contains only header files no static lib is generated, only a custom target is added
+#  - this creates a static library that has the same name as the module, but slashes are replaced by minuses.
+#    In case the module contains only header files no static lib is generated, only a custom target is added
 #    to display the module in Visual Studio.
 #  - waLBerla_module takes a list of dependent modules. A second list of dependencies is generated by parsing
 #    all files in the module for corresponding "#include" lines. This mechanism is not a complete preprocessor
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 62e1c45c501d58967725551e5c39bf7b8086fb9e..364b8bbea1b3980955955b22d3795ffc4048a2f9 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -16,6 +16,11 @@ waLBerla_execute_test( NAME  SimpleKernelTest )
 waLBerla_compile_test( FILES FieldIndexing3DTest.cpp FieldIndexing3DTest.cu )
 waLBerla_execute_test( NAME  FieldIndexing3DTest )
 
+waLBerla_compile_test( FILES codegen/CodegenJacobiGPU.cpp
+                             codegen/JacobiKernel2D.cuda.gen.py  # generator script, not a C++ source; presumably turned into JacobiKernel2D.h/.cu by handle_python_codegen
+                             codegen/JacobiKernel3D.cuda.gen.py  # same for the 3D kernel
+                       DEPENDS blockforest timeloop gui )
+waLBerla_execute_test( NAME CodegenJacobiGPU )
 
 
 # The following tests work only for CUDA enabled MPI
diff --git a/tests/cuda/codegen/CodegenJacobiGPU.cpp b/tests/cuda/codegen/CodegenJacobiGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f81ecf1a6c2a2c91481b5203b3ecad99489865b8
--- /dev/null
+++ b/tests/cuda/codegen/CodegenJacobiGPU.cpp
@@ -0,0 +1,189 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CodegenJacobiGPU.cpp
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "JacobiKernel2D.h"
+#include "JacobiKernel3D.h"
+
+#include "cuda/HostFieldAllocator.h"
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformDirectScheme.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/GPUField.h"
+#include "cuda/Kernel.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/FieldIndexing.h"
+
+#include "field/AddToStorage.h"
+#include "field/communication/UniformMPIDatatypeInfo.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/initializer/ScalarFieldFromGrayScaleImage.h"
+
+#include "gui/Gui.h"
+
+#include "stencil/D2Q9.h"
+#include "stencil/D3Q7.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+
+using namespace walberla;
+
+typedef GhostLayerField<double,1> ScalarField;  // host-side field type (double values) used by both tests
+typedef cuda::GPUField<double> GPUField;        // device-side field type the generated kernels operate on
+
+// Block data creation function: builds one ScalarField per block, sized to that block's cell counts.
+ScalarField * createField( IBlock* const block, StructuredBlockStorage* const storage )
+{
+   return new ScalarField (
+            storage->getNumberOfXCells( *block ),   // number of cells in x direction per block
+            storage->getNumberOfYCells( *block ),   // number of cells in y direction per block
+            storage->getNumberOfZCells( *block ),   // number of cells in z direction per block
+            1,                                      // one ghost layer
+            double(0),                              // initial value
+            field::fzyx,                            // layout
+            make_shared<cuda::HostFieldAllocator<double> >()  // allocator for host pinned memory
+            );
+}
+
+void testJacobi2D()  // runs the generated 2D Jacobi sweep on the GPU field and checks the value at cell (0,0,0)
+{
+   uint_t xSize = 20;
+   uint_t ySize = 20;
+
+   // Create blocks
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid (
+           uint_t(1) , uint_t(1),  uint_t(1),  // number of blocks in x,y,z direction
+           xSize, ySize, uint_t(1),  // how many cells per block (x,y,z)
+           real_t(1),                          // dx: length of one cell in physical coordinates
+           false,                              // one block per process - "false" means all blocks to one process
+           true, true, true );                 // periodic in x, y and z direction
+
+
+   BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" );
+   BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
+
+   // Initialize a quarter of the host field with ones, the rest remains 0 -> the average value is 0.25
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto f = blockIt->getData<ScalarField>( cpuFieldID );
+      for( cell_idx_t y = 0; y < cell_idx_c( f->ySize() / 2 ); ++y )
+         for( cell_idx_t x = 0; x < cell_idx_c( f->xSize() / 2 ); ++x )
+            f->get( x, y, 0 ) = 1.0;
+   }
+
+
+   // Communication scheme for the GPU field; it is executed before every sweep (see BeforeFunction below)
+   typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9> CommScheme;
+   typedef cuda::communication::GPUPackInfo<GPUField> Packing;
+
+   CommScheme commScheme(blocks);
+   commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) );
+
+   // Create Timeloop
+   const uint_t numberOfTimesteps = uint_t(800);
+   SweepTimeloop timeloop ( blocks, numberOfTimesteps );
+
+   // Registering the sweep
+   timeloop.add() << BeforeFunction(  commScheme, "Communication" )
+                  << Sweep( pystencils::JacobiKernel2D(gpuField, 1.0), "Jacobi Kernel" );
+
+   // Copy the initial values into the GPU field, run all timesteps, then copy the result back into the CPU field
+   cuda::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID );
+   timeloop.run();
+   cuda::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField );
+
+   auto firstBlock = blocks->begin();
+   auto f = firstBlock->getData<ScalarField>( cpuFieldID );
+   WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_t(1.0 / 4.0));
+}
+
+// Runs the generated 3D Jacobi sweep on the GPU field and checks the value at cell (0,0,0).
+void testJacobi3D()
+{
+   uint_t xSize = 12;
+   uint_t ySize = 12;
+   uint_t zSize = 12;
+
+   // Create blocks
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid (
+           uint_t(1) , uint_t(1),  uint_t(1),  // number of blocks in x,y,z direction
+           xSize, ySize, zSize,                // how many cells per block (x,y,z)
+           real_t(1),                          // dx: length of one cell in physical coordinates
+           false,                              // one block per process - "false" means all blocks to one process
+           true, true, true );                 // periodic in x, y and z direction
+
+
+   BlockDataID cpuFieldID = blocks->addStructuredBlockData<ScalarField>( &createField, "CPU Field" );
+   BlockDataID gpuField = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
+
+   // Initialize an eighth of the host field with ones, the rest remains 0 -> the average value is 0.125
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto f = blockIt->getData<ScalarField>( cpuFieldID );
+      for( cell_idx_t z = 0; z < cell_idx_c( f->zSize() / 2 ); ++z )
+         for( cell_idx_t y = 0; y < cell_idx_c( f->ySize() / 2 ); ++y )
+            for( cell_idx_t x = 0; x < cell_idx_c( f->xSize() / 2 ); ++x )
+               f->get( x, y, z ) = 1.0;
+   }
+
+
+   // Communication scheme for the GPU field; it is executed before every sweep (see BeforeFunction below)
+   typedef blockforest::communication::UniformBufferedScheme<stencil::D3Q7> CommScheme;
+   typedef cuda::communication::GPUPackInfo<GPUField> Packing;
+
+   CommScheme commScheme(blocks);
+   commScheme.addDataToCommunicate( make_shared<Packing>(gpuField) );
+
+   // Create Timeloop
+   const uint_t numberOfTimesteps = uint_t(800);
+   SweepTimeloop timeloop ( blocks, numberOfTimesteps );
+
+   // Registering the sweep
+   timeloop.add() << BeforeFunction(  commScheme, "Communication" )
+                  << Sweep( pystencils::JacobiKernel3D(gpuField, 1.0), "Jacobi Kernel" );
+
+   // Copy the initial values into the GPU field, run all timesteps, then copy the result back into the CPU field
+   cuda::fieldCpy<GPUField, ScalarField>( blocks, gpuField, cpuFieldID );
+   timeloop.run();
+   cuda::fieldCpy<ScalarField, GPUField>( blocks, cpuFieldID, gpuField );
+
+   auto firstBlock = blocks->begin();
+   auto f = firstBlock->getData<ScalarField>( cpuFieldID );
+   WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_t(1.0 / 8.0));
+}
+
+int main( int argc, char ** argv )  // test driver: runs the 2D and then the 3D GPU Jacobi test
+{
+   mpi::Environment env( argc, argv );  // environment object constructed from the command line; lives until main returns
+   debug::enterTestMode();              // NOTE(review): presumably makes failed WALBERLA_CHECK_* abort the test - confirm
+
+   testJacobi2D();
+   testJacobi3D();
+
+   return 0;
+}
diff --git a/tests/cuda/codegen/JacobiKernel2D.cuda.gen.py b/tests/cuda/codegen/JacobiKernel2D.cuda.gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca6fca69cd5702bb2ec856b312f4e52890f1ec64
--- /dev/null
+++ b/tests/cuda/codegen/JacobiKernel2D.cuda.gen.py
@@ -0,0 +1,12 @@
+from pystencils_walberla import Sweep
+
+k = Sweep(dim=2)
+# src reads the field "f1"; dst is a temporary field derived from src; h is the symbolic constant "h"
+src = k.field("f1")
+dst = k.temporaryField(src)
+h = k.constant("h")
+
+rhs = (src[1,0] + src[-1,0] + src[0,1] + src[0, -1] ) / (4 * h**2)  # sum of the four axis neighbours over 4*h^2
+k.addEq(dst[0,0], rhs)
+# NOTE(review): presumably writes JacobiKernel2D.h/.cu into the current working directory - confirm
+k.generate()
diff --git a/tests/cuda/codegen/JacobiKernel3D.cuda.gen.py b/tests/cuda/codegen/JacobiKernel3D.cuda.gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ac6d17eff7cd85a103c38383e000f678ca04d9
--- /dev/null
+++ b/tests/cuda/codegen/JacobiKernel3D.cuda.gen.py
@@ -0,0 +1,12 @@
+from pystencils_walberla import Sweep
+
+k = Sweep(dim=3)
+# src reads the field "f1"; dst is a temporary field derived from src; h is the symbolic constant "h"
+src = k.field("f1")
+dst = k.temporaryField(src)
+h = k.constant("h")
+
+rhs = (src[1,0,0] + src[-1,0,0] + src[0,1,0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0 , -1] ) / (6 * h**2)  # sum of the six axis neighbours over 6*h^2
+k.addEq(dst[0,0,0], rhs)
+# NOTE(review): presumably writes JacobiKernel3D.h/.cu into the current working directory - confirm
+k.generate()
diff --git a/tests/field/CMakeLists.txt b/tests/field/CMakeLists.txt
index 5426bc964844211226ba076b73768ec81379f690..4307644a0dcbbdd7d803f6a91d91b8e02e1c869f 100644
--- a/tests/field/CMakeLists.txt
+++ b/tests/field/CMakeLists.txt
@@ -50,3 +50,11 @@ if( WALBERLA_BUILD_WITH_MPI )
 endif( WALBERLA_BUILD_WITH_MPI )
 
 
+
+# CodeGen Tests
+# NOTE: the *.gen.py entries are generator scripts, not C++ sources; presumably handle_python_codegen maps each to a generated .h/.cpp pair.
+waLBerla_compile_test( FILES codegen/CodegenJacobiCPU.cpp codegen/JacobiKernel2D.gen.py codegen/JacobiKernel3D.gen.py
+                       DEPENDS gui timeloop )
+waLBerla_execute_test( NAME CodegenJacobiCPU )
+
+
diff --git a/tests/field/codegen/CodegenJacobiCPU.cpp b/tests/field/codegen/CodegenJacobiCPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2023baeb9598fa2e27150adabca8832b19ff75ff
--- /dev/null
+++ b/tests/field/codegen/CodegenJacobiCPU.cpp
@@ -0,0 +1,149 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CodegenJacobiCPU.cpp
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "JacobiKernel2D.h"
+#include "JacobiKernel3D.h"
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformDirectScheme.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/Environment.h"
+#include "core/debug/TestSubsystem.h"
+
+#include "field/AddToStorage.h"
+#include "field/communication/PackInfo.h"
+
+#include "gui/Gui.h"
+
+#include "stencil/D2Q9.h"
+#include "stencil/D3Q7.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+
+using namespace walberla;
+
+typedef GhostLayerField<double,1> ScalarField;  // field type (double values) the generated CPU kernels operate on
+
+// Runs the generated 2D Jacobi sweep on a CPU field and checks the value at cell (0,0,0).
+void testJacobi2D()
+{
+   uint_t xSize = 20;
+   uint_t ySize = 20;
+   // Create blocks
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid (
+           uint_t(1) , uint_t(1),  uint_t(1),  // number of blocks in x,y,z direction
+           xSize, ySize, uint_t(1),            // how many cells per block (x,y,z)
+           real_t(1),                          // dx: length of one cell in physical coordinates
+           false,                              // one block per process - "false" means all blocks to one process
+           true, true, true );                 // periodic in x, y and z direction
+
+
+   BlockDataID fieldID = field::addToStorage<ScalarField>(blocks, "Field", real_t(0.0));
+
+   // Initialize a quarter of the field with ones, the rest remains 0
+   // Jacobi averages the domain -> every cell should be at 0.25 at sufficiently many timesteps
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto f = blockIt->getData<ScalarField>( fieldID );
+      for( cell_idx_t y = 0; y < cell_idx_c( f->ySize() / 2 ); ++y )
+         for( cell_idx_t x = 0; x < cell_idx_c( f->xSize() / 2 ); ++x )
+            f->get( x, y, 0 ) = 1.0;
+   }
+
+   typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9> CommScheme;
+   typedef field::communication::PackInfo<ScalarField> Packing;
+   CommScheme commScheme(blocks);
+   commScheme.addDataToCommunicate( make_shared<Packing>(fieldID) );
+
+   // Create Timeloop
+   const uint_t numberOfTimesteps = uint_t(800);
+   SweepTimeloop timeloop ( blocks, numberOfTimesteps );
+
+   // Registering the sweep
+   timeloop.add() << BeforeFunction(  commScheme, "Communication" )
+                  << Sweep( pystencils::JacobiKernel2D(fieldID, 1.0), "Jacobi Kernel" );
+
+   timeloop.run();
+
+   auto firstBlock = blocks->begin();
+   auto f = firstBlock->getData<ScalarField>( fieldID );
+   WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_t(1.0 / 4.0));
+}
+
+// Runs the generated 3D Jacobi sweep on a CPU field and checks the value at cell (0,0,0).
+void testJacobi3D()
+{
+   uint_t xSize = 12;
+   uint_t ySize = 12;
+   uint_t zSize = 12;
+   // Create blocks
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid (
+           uint_t(1) , uint_t(1),  uint_t(1),  // number of blocks in x,y,z direction
+           xSize, ySize, zSize,                // how many cells per block (x,y,z)
+           real_t(1),                          // dx: length of one cell in physical coordinates
+           false,                              // one block per process - "false" means all blocks to one process
+           true, true, true );                 // periodic in x, y and z direction
+
+
+   BlockDataID fieldID = field::addToStorage<ScalarField>(blocks, "Field", real_t(0.0));
+
+   // Initialize an eighth of the field with ones, the rest remains 0
+   // Jacobi averages the domain -> every cell should be at 0.125 at sufficiently many timesteps
+   for(auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto f = blockIt->getData<ScalarField>( fieldID );
+      for( cell_idx_t z = 0; z < cell_idx_c( f->zSize() / 2); ++z )
+         for( cell_idx_t y = 0; y < cell_idx_c( f->ySize() / 2 ); ++y )
+            for( cell_idx_t x = 0; x < cell_idx_c( f->xSize() / 2 ); ++x )
+               f->get( x, y, z ) = 1.0;
+   }
+
+   typedef blockforest::communication::UniformBufferedScheme<stencil::D3Q7> CommScheme;
+   typedef field::communication::PackInfo<ScalarField> Packing;
+   CommScheme commScheme(blocks);
+   commScheme.addDataToCommunicate( make_shared<Packing>(fieldID) );
+
+   // Create Timeloop
+   const uint_t numberOfTimesteps = uint_t(800); // fixed number of Jacobi iterations
+   SweepTimeloop timeloop ( blocks, numberOfTimesteps );
+
+   // Registering the sweep
+   timeloop.add() << BeforeFunction(  commScheme, "Communication" )
+                  << Sweep( pystencils::JacobiKernel3D(fieldID, 1.0), "Jacobi Kernel" );
+
+   timeloop.run();
+
+   auto firstBlock = blocks->begin();
+   auto f = firstBlock->getData<ScalarField>( fieldID );
+   WALBERLA_CHECK_FLOAT_EQUAL(f->get(0,0,0), real_t(1.0 / 8.0));
+}
+
+// Test driver: runs the 2D and then the 3D CPU Jacobi test.
+int main( int argc, char ** argv )
+{
+   mpi::Environment env( argc, argv );  // environment object constructed from the command line; lives until main returns
+   debug::enterTestMode();              // NOTE(review): presumably makes failed WALBERLA_CHECK_* abort the test - confirm
+
+   testJacobi2D();
+   testJacobi3D();
+
+   return 0;
+}
diff --git a/tests/field/codegen/JacobiKernel2D.gen.py b/tests/field/codegen/JacobiKernel2D.gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca6fca69cd5702bb2ec856b312f4e52890f1ec64
--- /dev/null
+++ b/tests/field/codegen/JacobiKernel2D.gen.py
@@ -0,0 +1,12 @@
+from pystencils_walberla import Sweep
+
+k = Sweep(dim=2)
+# src reads the field "f1"; dst is a temporary field derived from src; h is the symbolic constant "h"
+src = k.field("f1")
+dst = k.temporaryField(src)
+h = k.constant("h")
+
+rhs = (src[1,0] + src[-1,0] + src[0,1] + src[0, -1] ) / (4 * h**2)  # sum of the four axis neighbours over 4*h^2
+k.addEq(dst[0,0], rhs)
+# NOTE(review): presumably writes JacobiKernel2D.h/.cpp into the current working directory - confirm
+k.generate()
diff --git a/tests/field/codegen/JacobiKernel3D.gen.py b/tests/field/codegen/JacobiKernel3D.gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ac6d17eff7cd85a103c38383e000f678ca04d9
--- /dev/null
+++ b/tests/field/codegen/JacobiKernel3D.gen.py
@@ -0,0 +1,12 @@
+from pystencils_walberla import Sweep
+
+k = Sweep(dim=3)
+# src reads the field "f1"; dst is a temporary field derived from src; h is the symbolic constant "h"
+src = k.field("f1")
+dst = k.temporaryField(src)
+h = k.constant("h")
+
+rhs = (src[1,0,0] + src[-1,0,0] + src[0,1,0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0 , -1] ) / (6 * h**2)  # sum of the six axis neighbours over 6*h^2
+k.addEq(dst[0,0,0], rhs)
+# NOTE(review): presumably writes JacobiKernel3D.h/.cpp into the current working directory - confirm
+k.generate()