From 69b7ff12958dfb97c3281282af7799eabf62d5d5 Mon Sep 17 00:00:00 2001
From: Dominik Thoennes <dominik.thoennes@fau.de>
Date: Thu, 27 Feb 2020 09:47:27 +0100
Subject: [PATCH] move lbmpy_walberla und pystencils_walberla into walberla
 repo

---
 .gitlab-ci.yml                                |  48 +-
 CMakeLists.txt                                |   2 +-
 cmake/waLBerlaHelperFunctions.cmake           |   3 +-
 python/lbmpy_walberla/__init__.py             |   4 +
 python/lbmpy_walberla/boundary.py             |  92 +++
 python/lbmpy_walberla/sparse.py               |  49 ++
 .../templates/Boundary.tmpl.cpp               | 104 ++++
 .../lbmpy_walberla/templates/Boundary.tmpl.h  | 188 ++++++
 .../templates/LatticeModel.tmpl.cpp           | 127 ++++
 .../templates/LatticeModel.tmpl.h             | 554 ++++++++++++++++++
 python/lbmpy_walberla/tests/__init__.py       |   0
 .../tests/test_walberla_codegen.py            | 107 ++++
 .../lbmpy_walberla/walberla_lbm_generation.py | 271 +++++++++
 python/pystencils_walberla/__init__.py        |   8 +
 .../pystencils_walberla/cmake_integration.py  | 111 ++++
 python/pystencils_walberla/codegen.py         | 370 ++++++++++++
 python/pystencils_walberla/jinja_filters.py   | 376 ++++++++++++
 .../templates/CpuPackInfo.tmpl.cpp            | 105 ++++
 .../templates/CpuPackInfo.tmpl.h              |  62 ++
 .../templates/GpuPackInfo.tmpl.cpp            | 111 ++++
 .../templates/GpuPackInfo.tmpl.h              |  46 ++
 .../templates/MpiDtypeInfo.tmpl.h             |  84 +++
 .../templates/Sweep.tmpl.cpp                  |  95 +++
 .../templates/Sweep.tmpl.h                    |  93 +++
 .../templates/SweepInnerOuter.tmpl.cpp        | 141 +++++
 .../templates/SweepInnerOuter.tmpl.h          | 113 ++++
 python/pystencils_walberla/tests/__init__.py  |   0
 .../tests/test_packinfo_generation.py         |  41 ++
 .../tests/test_walberla_gen.py                |  56 ++
 29 files changed, 3323 insertions(+), 38 deletions(-)
 create mode 100644 python/lbmpy_walberla/__init__.py
 create mode 100644 python/lbmpy_walberla/boundary.py
 create mode 100644 python/lbmpy_walberla/sparse.py
 create mode 100644 python/lbmpy_walberla/templates/Boundary.tmpl.cpp
 create mode 100644 python/lbmpy_walberla/templates/Boundary.tmpl.h
 create mode 100644 python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp
 create mode 100644 python/lbmpy_walberla/templates/LatticeModel.tmpl.h
 create mode 100644 python/lbmpy_walberla/tests/__init__.py
 create mode 100644 python/lbmpy_walberla/tests/test_walberla_codegen.py
 create mode 100644 python/lbmpy_walberla/walberla_lbm_generation.py
 create mode 100644 python/pystencils_walberla/__init__.py
 create mode 100644 python/pystencils_walberla/cmake_integration.py
 create mode 100644 python/pystencils_walberla/codegen.py
 create mode 100644 python/pystencils_walberla/jinja_filters.py
 create mode 100644 python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp
 create mode 100644 python/pystencils_walberla/templates/CpuPackInfo.tmpl.h
 create mode 100644 python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp
 create mode 100644 python/pystencils_walberla/templates/GpuPackInfo.tmpl.h
 create mode 100644 python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h
 create mode 100644 python/pystencils_walberla/templates/Sweep.tmpl.cpp
 create mode 100644 python/pystencils_walberla/templates/Sweep.tmpl.h
 create mode 100644 python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp
 create mode 100644 python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h
 create mode 100644 python/pystencils_walberla/tests/__init__.py
 create mode 100644 python/pystencils_walberla/tests/test_packinfo_generation.py
 create mode 100644 python/pystencils_walberla/tests/test_walberla_gen.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6b327d488..360b2d122 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -376,8 +376,7 @@ intel_19_hybrid:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:19
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_hybrid_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -394,8 +393,7 @@ intel_19_serial_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:19
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_serial_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -412,8 +410,7 @@ intel_19_mpionly_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:19
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_mpionly_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -430,8 +427,7 @@ intel_19_hybrid_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:19
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_hybrid_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -447,13 +443,9 @@ intel_19_hybrid_dbg:
 intel_19_hybrid_dbg_sp:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:19
-   before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
    variables:
       <<: *build_hybrid_dbg_sp_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
-      WALBERLA_BUILD_WITH_CODEGEN: "ON"
       WALBERLA_ENABLE_GUI: 0
    except:
       variables:
@@ -871,8 +863,7 @@ gcc_9_hybrid:
    image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9
    stage: pretest
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_hybrid_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -888,8 +879,7 @@ gcc_9_serial_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_serial_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -905,8 +895,7 @@ gcc_9_mpionly_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_mpionly_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -922,8 +911,7 @@ gcc_9_hybrid_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_hybrid_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -938,13 +926,9 @@ gcc_9_hybrid_dbg:
 gcc_9_hybrid_dbg_sp:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:9
-   before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
    variables:
       <<: *build_hybrid_dbg_sp_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
-      WALBERLA_BUILD_WITH_CODEGEN: "ON"
       WALBERLA_ENABLE_GUI: 0
    except:
       variables:
@@ -1444,8 +1428,7 @@ clang_9.0_hybrid:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:9.0
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_hybrid_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -1461,8 +1444,7 @@ clang_9.0_serial_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:9.0
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_serial_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -1478,8 +1460,7 @@ clang_9.0_mpionly_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:9.0
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_mpionly_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -1495,8 +1476,7 @@ clang_9.0_hybrid_dbg:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:9.0
    before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
+      - pip3 install lbmpy
    variables:
       <<: *build_hybrid_dbg_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
@@ -1512,13 +1492,9 @@ clang_9.0_hybrid_dbg_sp:
    <<: *build_definition
    image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:9.0
    stage: pretest
-   before_script:
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy.git
-      - pip3 install git+https://i10git.cs.fau.de/pycodegen/lbmpy_walberla.git
    variables:
       <<: *build_hybrid_dbg_sp_variables
       WALBERLA_BUILD_WITH_CUDA: "OFF"
-      WALBERLA_BUILD_WITH_CODEGEN: "ON"
       WALBERLA_ENABLE_GUI: 0
    except:
       variables:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e0d24515..639b3c0ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -560,7 +560,7 @@ endif ( )
 #############################################################################################################################
 if ( WALBERLA_BUILD_WITH_CODEGEN )
     find_package( PythonInterp 3 QUIET REQUIRED)
-    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import pystencils_walberla; from pystencils.include import get_pystencils_include_path; print(get_pystencils_include_path())"
+    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from pystencils.include import get_pystencils_include_path; print(get_pystencils_include_path())"
                     RESULT_VARIABLE PYTHON_RET_CODE
                     OUTPUT_VARIABLE PYSTENCILS_INCLUDE_PATH)
     if(NOT PYTHON_RET_CODE EQUAL 0)
diff --git a/cmake/waLBerlaHelperFunctions.cmake b/cmake/waLBerlaHelperFunctions.cmake
index 7a49ab385..21c9b01b5 100644
--- a/cmake/waLBerlaHelperFunctions.cmake
+++ b/cmake/waLBerlaHelperFunctions.cmake
@@ -64,10 +64,11 @@ function( handle_python_codegen sourceFilesOut generatedSourceFilesOut generator
                 string(REPLACE "\"" "\\\"" pythonParameters ${pythonParameters})   # even one more quoting level required
                 string(REPLACE "\n" "" pythonParameters ${pythonParameters})  # remove newline characters
 
+                set( WALBERLA_PYTHON_DIR ${walberla_SOURCE_DIR}/python)
                 file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${codegenCfg}")
                 add_custom_command(OUTPUT ${generatedWithAbsolutePath}
                                    DEPENDS ${sourceFile}
-                                   COMMAND ${PYTHON_EXECUTABLE} ${sourceFile} ${pythonParameters}
+                                   COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${WALBERLA_PYTHON_DIR} ${PYTHON_EXECUTABLE} ${sourceFile} ${pythonParameters}
                                    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${codegenCfg}")
             endif()
         else()
diff --git a/python/lbmpy_walberla/__init__.py b/python/lbmpy_walberla/__init__.py
new file mode 100644
index 000000000..f7b022ab0
--- /dev/null
+++ b/python/lbmpy_walberla/__init__.py
@@ -0,0 +1,4 @@
+from .boundary import generate_boundary
+from .walberla_lbm_generation import RefinementScaling, generate_lattice_model
+
+__all__ = ['generate_lattice_model', 'RefinementScaling', 'generate_boundary']
diff --git a/python/lbmpy_walberla/boundary.py b/python/lbmpy_walberla/boundary.py
new file mode 100644
index 000000000..32176989f
--- /dev/null
+++ b/python/lbmpy_walberla/boundary.py
@@ -0,0 +1,92 @@
+import numpy as np
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from lbmpy.boundaries.boundaryhandling import create_lattice_boltzmann_boundary_kernel
+from lbmpy_walberla.walberla_lbm_generation import KernelInfo
+from pystencils import Field, FieldType
+from pystencils.boundaries.createindexlist import (
+    boundary_index_array_coordinate_names, direction_member_name,
+    numpy_data_type_for_boundary_object)
+from pystencils.data_types import TypedSymbol, create_type
+from pystencils_walberla.codegen import default_create_kernel_parameters
+from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
+
+
+def generate_boundary(generation_context, class_name, boundary_object, lb_method, **create_kernel_params):
+    struct_name = "IndexInfo"
+    boundary_object.name = class_name
+
+    create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
+    target = create_kernel_params['target']
+
+    index_struct_dtype = numpy_data_type_for_boundary_object(boundary_object, lb_method.dim)
+
+    pdf_field = Field.create_generic('pdfs', lb_method.dim,
+                                     np.float64 if generation_context.double_accuracy else np.float32,
+                                     index_dimensions=1, layout='fzyx', index_shape=[len(lb_method.stencil)])
+
+    index_field = Field('indexVector', FieldType.INDEXED, index_struct_dtype, layout=[0],
+                        shape=(TypedSymbol("indexVectorSize", create_type(np.int64)), 1), strides=(1, 1))
+
+    kernel = create_lattice_boltzmann_boundary_kernel(pdf_field, index_field, lb_method, boundary_object, target=target,
+                                                      openmp=generation_context.openmp)
+    kernel.function_name = "boundary_" + boundary_object.name
+
+    # waLBerla is a 3D framework. Therefore, a zero for the z index has to be added if we work in 2D
+    if lb_method.dim == 2:
+        stencil = ()
+        for d in lb_method.stencil:
+            d = d + (0,)
+            stencil = stencil + (d,)
+    else:
+        stencil = lb_method.stencil
+
+    stencil_info = [(i, ", ".join([str(e) for e in d])) for i, d in enumerate(stencil)]
+
+    context = {
+        'class_name': boundary_object.name,
+        'StructName': struct_name,
+        'StructDeclaration': struct_from_numpy_dtype(struct_name, index_struct_dtype),
+        'kernel': KernelInfo(kernel),
+        'stencil_info': stencil_info,
+        'dim': lb_method.dim,
+        'target': target,
+        'namespace': 'lbm',
+    }
+
+    env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined)
+    add_pystencils_filters_to_jinja_env(env)
+
+    header = env.get_template('Boundary.tmpl.h').render(**context)
+    source = env.get_template('Boundary.tmpl.cpp').render(**context)
+
+    source_extension = "cpp" if create_kernel_params.get("target", "cpu") == "cpu" else "cu"
+    generation_context.write_file("{}.h".format(class_name), header)
+    generation_context.write_file("{}.{}".format(class_name, source_extension), source)
+
+
+def struct_from_numpy_dtype(struct_name, numpy_dtype):
+    result = "struct %s { \n" % (struct_name,)
+
+    equality_compare = []
+    constructor_params = []
+    constructor_initializer_list = []
+    for name, (sub_type, offset) in numpy_dtype.fields.items():
+        pystencils_type = create_type(sub_type)
+        result += "    %s %s;\n" % (pystencils_type, name)
+        if name in boundary_index_array_coordinate_names or name == direction_member_name:
+            constructor_params.append("%s %s_" % (pystencils_type, name))
+            constructor_initializer_list.append("%s(%s_)" % (name, name))
+        else:
+            constructor_initializer_list.append("%s()" % name)
+        if pystencils_type.is_float():
+            equality_compare.append("floatIsEqual(%s, o.%s)" % (name, name))
+        else:
+            equality_compare.append("%s == o.%s" % (name, name))
+
+    result += "    %s(%s) : %s {}\n" % \
+              (struct_name, ", ".join(constructor_params), ", ".join(constructor_initializer_list))
+    result += "    bool operator==(const %s & o) const {\n        return %s;\n    }\n" % \
+              (struct_name, " && ".join(equality_compare))
+    result += "};\n"
+    return result
diff --git a/python/lbmpy_walberla/sparse.py b/python/lbmpy_walberla/sparse.py
new file mode 100644
index 000000000..95e45f64e
--- /dev/null
+++ b/python/lbmpy_walberla/sparse.py
@@ -0,0 +1,49 @@
+import numpy as np
+
+from lbmpy.sparse import (
+    create_lb_update_rule_sparse, create_macroscopic_value_getter_sparse,
+    create_macroscopic_value_setter_sparse, create_symbolic_list)
+from pystencils import TypedSymbol, create_kernel
+
+
+class ListLbGenerator:
+    def __init__(self, collision_rule, pdf_type=np.float64, index_type=np.uint32, layout='SoA'):
+        self.collision_rule = collision_rule
+
+        num_cells = TypedSymbol('num_cells', np.uint64)
+        method = self.collision_rule.method
+        q = len(method.stencil)
+
+        self.num_cells = num_cells
+        self.src = create_symbolic_list('src', num_cells, q, pdf_type, layout)
+        self.dst = create_symbolic_list('dst', num_cells, q, pdf_type, layout)
+        self.idx = create_symbolic_list('idx', num_cells, q, index_type, layout)
+
+    def kernel(self, kernel_type='stream_pull_collide'):
+        ac = create_lb_update_rule_sparse(self.collision_rule, self.src, self.dst, self.idx, kernel_type)
+        return create_kernel(ac)
+
+    def getter_ast(self, density=True, velocity=True):
+        assert density or velocity
+
+        method = self.collision_rule.method
+        output_descriptor = {}
+        if density:
+            output_descriptor['density'] = create_symbolic_list('rho', self.src.spatial_shape[0], 1, self.src.dtype)
+        if velocity:
+            output_descriptor['velocity'] = create_symbolic_list('vel', self.src.spatial_shape[0],
+                                                                 method.dim, self.src.dtype)
+
+        ac = create_macroscopic_value_getter_sparse(method, self.src, output_descriptor)
+        return create_kernel(ac)
+
+    def setter_ast(self, density=True, velocity=True):
+        method = self.collision_rule.method
+        if density is True:
+            density = create_symbolic_list('rho', self.src.spatial_shape[0], 1, self.src.dtype).center
+
+        if velocity is True:
+            velocity = create_symbolic_list('vel', self.src.spatial_shape[0], method.dim, self.src.dtype).center_vector
+
+        ac = create_macroscopic_value_setter_sparse(method, self.src, density, velocity)
+        return create_kernel(ac)
diff --git a/python/lbmpy_walberla/templates/Boundary.tmpl.cpp b/python/lbmpy_walberla/templates/Boundary.tmpl.cpp
new file mode 100644
index 000000000..d4b3333b1
--- /dev/null
+++ b/python/lbmpy_walberla/templates/Boundary.tmpl.cpp
@@ -0,0 +1,104 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+
+#include <cmath>
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "{{class_name}}.h"
+{% if target == 'gpu' -%}
+#include "cuda/ErrorChecking.h"
+{%- endif %}
+
+
+{% if target == 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target == 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+using namespace std;
+
+namespace walberla {
+namespace {{namespace}} {
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wconversion"
+#endif
+
+#ifdef __CUDACC__
+#pragma push
+#pragma diag_suppress = declared_but_not_referenced
+#endif
+
+
+{{kernel|generate_definition}}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#ifdef __CUDACC__
+#pragma pop
+#endif
+
+
+void {{class_name}}::run( IBlock * block, IndexVectors::Type type {% if target == 'gpu'%}, cudaStream_t stream {%endif%})
+{
+    auto * indexVectors = block->getData<IndexVectors>(indexVectorID);
+    int64_t indexVectorSize = int64_c( indexVectors->indexVector(type).size() );
+    if( indexVectorSize == 0)
+        return;
+
+    {% if target == 'gpu' -%}
+    auto pointer = indexVectors->pointerGpu(type);
+    {% else %}
+    auto pointer = indexVectors->pointerCpu(type);
+    {% endif %}
+
+    uint8_t * _data_indexVector = reinterpret_cast<uint8_t*>(pointer);
+
+    {{kernel|generate_block_data_to_field_extraction(['indexVector', 'indexVectorSize'])|indent(4)}}
+    {{kernel|generate_call(spatial_shape_symbols=['indexVectorSize'], stream='stream')|indent(4)}}
+}
+
+void {{class_name}}::operator() ( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+{
+    run( block, IndexVectors::ALL{% if target == 'gpu'%}, stream {%endif%});
+}
+
+void {{class_name}}::inner( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+{
+    run( block, IndexVectors::INNER{% if target == 'gpu'%}, stream {%endif%} );
+}
+
+void {{class_name}}::outer( IBlock * block{% if target == 'gpu'%}, cudaStream_t stream {%endif%} )
+{
+    run( block, IndexVectors::OUTER{% if target == 'gpu'%}, stream {%endif%} );
+}
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
diff --git a/python/lbmpy_walberla/templates/Boundary.tmpl.h b/python/lbmpy_walberla/templates/Boundary.tmpl.h
new file mode 100644
index 000000000..97dabe87a
--- /dev/null
+++ b/python/lbmpy_walberla/templates/Boundary.tmpl.h
@@ -0,0 +1,188 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.h
+//! \\author pystencils
+//======================================================================================================================
+
+
+#include "core/DataTypes.h"
+
+{% if target is equalto 'cpu' -%}
+#include "field/GhostLayerField.h"
+{%- elif target is equalto 'gpu' -%}
+#include "cuda/GPUField.h"
+{%- endif %}
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "field/FlagField.h"
+
+#include <set>
+#include <vector>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+class {{class_name}}
+{
+public:
+    {{StructDeclaration|indent(4)}}
+
+
+    class IndexVectors
+    {
+    public:
+        using CpuIndexVector = std::vector<{{StructName}}>;
+
+        enum Type {
+            ALL = 0,
+            INNER = 1,
+            OUTER = 2,
+            NUM_TYPES = 3
+        };
+
+        IndexVectors() : cpuVectors_(NUM_TYPES)  {}
+        bool operator==(IndexVectors & other) { return other.cpuVectors_ == cpuVectors_; }
+
+        {% if target == 'gpu' -%}
+        ~IndexVectors() {
+            for( auto & gpuVec: gpuVectors_)
+                cudaFree( gpuVec );
+        }
+        {% endif %}
+
+        CpuIndexVector & indexVector(Type t) { return cpuVectors_[t]; }
+        {{StructName}} * pointerCpu(Type t)  { return &(cpuVectors_[t][0]); }
+
+        {% if target == 'gpu' -%}
+        {{StructName}} * pointerGpu(Type t)  { return gpuVectors_[t]; }
+        {% endif %}
+
+        void syncGPU()
+        {
+            {% if target == 'gpu' -%}
+            gpuVectors_.resize( cpuVectors_.size() );
+            for(size_t i=0; i < size_t(NUM_TYPES); ++i )
+            {
+                auto & gpuVec = gpuVectors_[i];
+                auto & cpuVec = cpuVectors_[i];
+                cudaFree( gpuVec );
+                cudaMalloc( &gpuVec, sizeof({{StructName}}) * cpuVec.size() );
+                cudaMemcpy( gpuVec, &cpuVec[0], sizeof({{StructName}}) * cpuVec.size(), cudaMemcpyHostToDevice );
+            }
+            {%- endif %}
+        }
+
+    private:
+        std::vector<CpuIndexVector> cpuVectors_;
+
+        {% if target == 'gpu' -%}
+        using GpuIndexVector = {{StructName}} *;
+        std::vector<GpuIndexVector> gpuVectors_;
+        {% endif %}
+    };
+
+
+    {{class_name}}( const shared_ptr<StructuredBlockForest> & blocks,
+                   {{kernel|generate_constructor_parameters(['indexVector', 'indexVectorSize'])}} )
+        : {{ kernel|generate_constructor_initializer_list(['indexVector', 'indexVectorSize']) }}
+    {
+        auto createIdxVector = []( IBlock * const , StructuredBlockStorage * const ) { return new IndexVectors(); };
+        indexVectorID = blocks->addStructuredBlockData< IndexVectors >( createIdxVector, "IndexField_{{class_name}}");
+    };
+
+    void operator() ( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
+    void inner( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
+    void outer( IBlock * block {% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
+
+
+    template<typename FlagField_T>
+    void fillFromFlagField( const shared_ptr<StructuredBlockForest> & blocks, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID)
+    {
+        for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+            fillFromFlagField<FlagField_T>( &*blockIt, flagFieldID, boundaryFlagUID, domainFlagUID );
+    }
+
+
+    template<typename FlagField_T>
+    void fillFromFlagField( IBlock * block, ConstBlockDataID flagFieldID,
+                            FlagUID boundaryFlagUID, FlagUID domainFlagUID )
+    {
+        auto * indexVectors = block->getData< IndexVectors > ( indexVectorID );
+        auto & indexVectorAll = indexVectors->indexVector(IndexVectors::ALL);
+        auto & indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
+        auto & indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
+
+
+        auto * flagField = block->getData< FlagField_T > ( flagFieldID );
+
+        if( !(flagField->flagExists(boundaryFlagUID) && flagField->flagExists(domainFlagUID) ))
+            return;
+
+        auto boundaryFlag = flagField->getFlag(boundaryFlagUID);
+        auto domainFlag = flagField->getFlag(domainFlagUID);
+
+        auto inner = flagField->xyzSize();
+        inner.expand( cell_idx_t(-1) );
+
+
+        indexVectorAll.clear();
+        indexVectorInner.clear();
+        indexVectorOuter.clear();
+
+        for( auto it = flagField->begin(); it != flagField->end(); ++it )
+        {
+            if( ! isFlagSet(it, domainFlag) )
+                continue;
+
+            {%- for dirIdx, offset in stencil_info %}
+            if ( isFlagSet( it.neighbor({{offset}} {%if dim == 3%}, 0 {%endif %}), boundaryFlag ) )
+            {
+                auto element = {{StructName}}(it.x(), it.y(), {%if dim == 3%} it.z(), {%endif %} {{dirIdx}} );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+            }
+            {% endfor %}
+        }
+
+        indexVectors->syncGPU();
+    }
+
+private:
+    void run( IBlock * block, IndexVectors::Type type{% if target == 'gpu'%}, cudaStream_t stream = 0 {%endif%});
+
+    BlockDataID indexVectorID;
+public:
+    {{kernel|generate_members(('indexVector', 'indexVectorSize'))|indent(4)}}
+};
+
+
+
+} // namespace {{namespace}}
+} // namespace walberla
diff --git a/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp b/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp
new file mode 100644
index 000000000..2b2065cc7
--- /dev/null
+++ b/python/lbmpy_walberla/templates/LatticeModel.tmpl.cpp
@@ -0,0 +1,127 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\author Martin Bauer <martin.bauer@fau.de>
+//======================================================================================================================
+
+#include <cmath>
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/sweeps/Streaming.h"
+#include "{{class_name}}.h"
+
+#ifdef _MSC_VER
+#  pragma warning( disable : 4458 )
+#endif
+
+{% if target is equalto 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target is equalto 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wfloat-equal"
+#   pragma GCC diagnostic ignored "-Wshadow"
+#   pragma GCC diagnostic ignored "-Wconversion"
+#   pragma GCC diagnostic ignored "-Wunused-variable"
+#   pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+
+using namespace std;
+
+namespace walberla {
+namespace {{namespace}} {
+
+{{stream_collide_kernel|generate_definition(target)}}
+{{collide_kernel|generate_definition(target)}}
+{{stream_kernel|generate_definition(target)}}
+
+
+const real_t {{class_name}}::w[{{Q}}] = { {{weights}} };
+const real_t {{class_name}}::wInv[{{Q}}] = { {{inverse_weights}} };
+
+void {{class_name}}::Sweep::streamCollide( IBlock * block, const uint_t numberOfGhostLayersToInclude )
+{
+    {{stream_collide_kernel|generate_block_data_to_field_extraction(parameters=['pdfs', 'pdfs_tmp'])|indent(4)}}
+
+    auto & lm = dynamic_cast< lbm::PdfField<{{class_name}}> * > (pdfs)->latticeModel();
+    WALBERLA_ASSERT_EQUAL( *(lm.blockId), block->getId() );
+
+    {{stream_collide_kernel|generate_refs_for_kernel_parameters(prefix='lm.', parameters_to_ignore=['pdfs', 'pdfs_tmp'])|indent(4) }}
+    {{stream_collide_kernel|generate_call('cell_idx_c(numberOfGhostLayersToInclude)')|indent(4)}}
+    {{stream_collide_kernel|generate_swaps|indent(4)}}
+}
+
+void {{class_name}}::Sweep::collide( IBlock * block, const uint_t numberOfGhostLayersToInclude )
+{
+   {{collide_kernel|generate_block_data_to_field_extraction(parameters=['pdfs'])|indent(4)}}
+
+    auto & lm = dynamic_cast< lbm::PdfField<{{class_name}}> * > (pdfs)->latticeModel();
+    WALBERLA_ASSERT_EQUAL( *(lm.blockId), block->getId() );
+
+    {{collide_kernel|generate_refs_for_kernel_parameters(prefix='lm.', parameters_to_ignore=['pdfs', 'pdfs_tmp'])|indent(4) }}
+    {{collide_kernel|generate_call('cell_idx_c(numberOfGhostLayersToInclude)')|indent(4)}}
+}
+
+
+void {{class_name}}::Sweep::stream( IBlock * block, const uint_t numberOfGhostLayersToInclude )
+{
+    {{stream_kernel|generate_block_data_to_field_extraction(parameters=['pdfs', 'pdfs_tmp'])|indent(4)}}
+
+    {{stream_kernel|generate_call('cell_idx_c(numberOfGhostLayersToInclude)')|indent(4)}}
+    
+    {{stream_kernel|generate_swaps|indent(4)}}
+}
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+
+
+// Buffer Packing
+
+namespace walberla {
+namespace mpi {
+
+mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::{{namespace}}::{{class_name}} & lm)
+{
+    buf << lm.currentLevel;
+    return buf;
+}
+
+mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf, ::walberla::{{namespace}}::{{class_name}} & lm)
+{
+    buf >> lm.currentLevel;
+    return buf;
+}
+
+
+} // namespace mpi
+} // namespace walberla
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
\ No newline at end of file
diff --git a/python/lbmpy_walberla/templates/LatticeModel.tmpl.h b/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
new file mode 100644
index 000000000..be160e99e
--- /dev/null
+++ b/python/lbmpy_walberla/templates/LatticeModel.tmpl.h
@@ -0,0 +1,554 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+#include "core/DataTypes.h"
+#include "core/logging/Logging.h"
+
+#include "field/GhostLayerField.h"
+#include "field/SwapableCompare.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "stencil/{{stencil_name}}.h"
+
+#include "lbm/lattice_model/EquilibriumDistribution.h"
+#include "lbm/field/Density.h"
+#include "lbm/field/DensityAndMomentumDensity.h"
+#include "lbm/field/DensityAndVelocity.h"
+#include "lbm/field/PressureTensor.h"
+#include "lbm/field/ShearRate.h"
+
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#ifdef WALBERLA_CXX_COMPILER_IS_GNU
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunused-variable"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#endif
+
+{% set lmIgnores = ('pdfs', 'pdfs_tmp') %}
+{% set lmOffsets = ('block_offset_0', 'block_offset_1', 'block_offset_2') %}
+
+
+// Forward declarations
+namespace walberla{
+namespace {{namespace}} {
+   class {{class_name}};
+}}
+namespace walberla {
+namespace mpi {
+    mpi::SendBuffer & operator<< (mpi::SendBuffer & buf, const ::walberla::{{namespace}}::{{class_name}} & lm);
+    mpi::RecvBuffer & operator>> (mpi::RecvBuffer & buf,       ::walberla::{{namespace}}::{{class_name}} & lm);
+}}
+
+
+
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+/**
+{{class_name}} was generated with lbmpy. Do not edit this file directly. Instead modify {{class_name}}.py.
+For details see documentation of lbmpy.
+
+Usage:
+    - Create an instance of this lattice model class: the constructor parameters vary depending on the configure
+      lattice model. A model with constant force needs a single force vector, while a model with variable forces needs
+      a force field. All constructor parameters are ordered alphabetically.
+    - Create a PDFField with the lattice model as template argument to store the particle distribution functions.
+      Use the PDFField to get and modify macroscopic values.
+    - The internal class {{class_name}}::Sweep is a functor to execute one LB time step.
+      Stream, collide steps can be executed separately, or together in an optimized stream-pull-collide scheme
+
+*/
+class {{class_name}}
+{
+
+public:
+    typedef stencil::{{stencil_name}} Stencil;
+    typedef stencil::{{stencil_name}} CommunicationStencil;
+    static const real_t w[{{Q}}];
+    static const real_t wInv[{{Q}}];
+
+    static const bool compressible = {% if compressible %}true{% else %}false{% endif %};
+
+    class Sweep
+    {
+    public:
+        Sweep( BlockDataID _pdfsID ) : pdfsID(_pdfsID) {};
+
+        //void stream       ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
+        void collide      ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
+        void streamCollide( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
+        void stream       ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) );
+
+        void operator() ( IBlock * const block, const uint_t numberOfGhostLayersToInclude = uint_t(0) )
+        {
+            streamCollide( block, numberOfGhostLayersToInclude );
+        }
+
+    private:
+        {{stream_collide_kernel|generate_members(only_fields=True)|indent(8)}}
+    };
+
+    {{class_name}}( {{stream_collide_kernel|generate_constructor_parameters(lmIgnores+lmOffsets) }} )
+        : {{ stream_collide_kernel|generate_constructor_initializer_list(lmIgnores+lmOffsets) }}, currentLevel(0)
+    {};
+
+    void configure( IBlock & block, StructuredBlockStorage & storage )  { configureBlock( &block, &storage ); }
+
+    // Parameters:
+    {{stream_collide_kernel|generate_members(lmIgnores)|indent(4)}}
+
+private:
+    void configureBlock(IBlock * block, StructuredBlockStorage * storage)
+    {
+        {{stream_collide_kernel|generate_block_data_to_field_extraction(lmIgnores+lmOffsets, no_declarations=True)|indent(8)}}
+        {% if need_block_offsets[0] -%}
+        block_offset_0 = uint32_c(storage->getBlockCellBB(*block).xMin());
+        {% endif -%}
+        {%- if need_block_offsets[1] -%}
+        block_offset_1 = uint32_c(storage->getBlockCellBB(*block).yMin());
+        {% endif -%}
+        {%- if need_block_offsets[2] -%}
+        block_offset_2 = uint32_c(storage->getBlockCellBB(*block).zMin());
+        {% endif %}
+        blockId = &block->getId();
+
+        {% if refinement_scaling_info -%}
+        const uint_t targetLevel = block->getBlockStorage().getLevel(*block);
+
+        if( targetLevel != currentLevel )
+        {
+            real_t level_scale_factor(1);
+            if( currentLevel < targetLevel )
+               level_scale_factor = real_c( uint_t(1) << ( targetLevel - currentLevel ) );
+            else // currentLevel > targetLevel
+               level_scale_factor = real_t(1) / real_c( uint_t(1) << ( currentLevel - targetLevel ) );
+
+            {% for scalingType, name, expression in refinement_scaling_info -%}
+            {% if scalingType == 'normal' %}
+            {{name}} = {{expression}};
+            {% elif scalingType in ('field_with_f', 'field_xyz') %}
+            auto it = {{name}}->{% if scalingType == 'field_with_f'%} beginWithGhostLayer(){% else %}beginWithGhostLayerXYZ(){% endif %};
+            for( ; it != {{name}}->end(); ++it )
+            {
+                 auto x = it.x();
+                 auto y = it.y();
+                 auto z = it.z();
+                 {% if scalingType == 'field_with_f' -%}
+                 auto f = it.f();
+                 {% endif -%}
+                 *it = {{expression}};
+            }
+            {% endif -%}
+            {% endfor -%}
+
+            currentLevel = targetLevel;
+        }
+        {% endif %}
+    }
+
+    // Updated by configureBlock:
+    {{stream_collide_kernel|generate_block_data_to_field_extraction(lmIgnores, declarations_only=True)|indent(4)}}
+    const IBlockID * blockId;
+    uint_t currentLevel;
+
+    // Backend classes can access private members:
+    friend class {{class_name}}::Sweep;
+    template<class LM, class Enable> friend class  EquilibriumDistribution;
+    template<class LM, class Enable> friend struct Equilibrium;
+    template<class LM, class Enable> friend struct internal::AdaptVelocityToForce;
+    template<class LM, class Enable> friend struct Density;
+    template<class LM>               friend struct DensityAndVelocity;
+    template<class LM, class Enable> friend struct DensityAndMomentumDensity;
+    template<class LM, class Enable> friend struct MomentumDensity;
+    template<class LM, class It, class Enable> friend struct DensityAndVelocityRange;
+
+    friend mpi::SendBuffer & ::walberla::mpi::operator<< (mpi::SendBuffer & , const {{class_name}} & );
+    friend mpi::RecvBuffer & ::walberla::mpi::operator>> (mpi::RecvBuffer & ,       {{class_name}} & );
+
+};
+
+
+
+
+//======================================================================================================================
+//
+//  Implementation of macroscopic value backend
+//
+//======================================================================================================================
+
+
+
+template<>
+class EquilibriumDistribution< {{class_name}}, void>
+{
+public:
+   typedef typename {{class_name}}::Stencil Stencil;
+
+   static real_t get( const stencil::Direction direction,
+                      const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ),
+                      real_t rho = real_t(1.0) )
+   {
+        {% if not compressible %}
+        rho -= real_t(1.0);
+        {% endif %}
+        {{equilibrium_from_direction}}
+   }
+
+   static real_t getSymmetricPart( const stencil::Direction direction,
+                                   const Vector3<real_t> & u = Vector3< real_t >(real_t(0.0)),
+                                   real_t rho = real_t(1.0) )
+   {
+        {% if not compressible %}
+        rho -= real_t(1.0);
+        {% endif %}
+        {{symmetric_equilibrium_from_direction}}
+   }
+
+   static real_t getAsymmetricPart( const stencil::Direction direction,
+                                    const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ),
+                                    real_t rho = real_t(1.0) )
+   {
+        {% if not compressible %}
+        rho -= real_t(1.0);
+        {% endif %}
+        {{asymmetric_equilibrium_from_direction}}
+   }
+
+   static std::vector< real_t > get( const Vector3< real_t > & u = Vector3<real_t>( real_t(0.0) ),
+                                     real_t rho = real_t(1.0) )
+   {
+      {% if not compressible %}
+      rho -= real_t(1.0);
+      {% endif %}
+
+      std::vector< real_t > equilibrium( Stencil::Size );
+      for( auto d = Stencil::begin(); d != Stencil::end(); ++d )
+      {
+         equilibrium[d.toIdx()] = get(*d, u, rho);
+      }
+      return equilibrium;
+   }
+};
+
+
+namespace internal {
+
+template<>
+struct AdaptVelocityToForce<{{class_name}}, void>
+{
+   template< typename FieldPtrOrIterator >
+   static Vector3<real_t> get( FieldPtrOrIterator & it, const {{class_name}} & lm,
+                               const Vector3< real_t > & velocity, const real_t rho )
+   {
+      auto x = it.x();
+      auto y = it.y();
+      auto z = it.z();
+      {% if macroscopic_velocity_shift %}
+      return velocity - Vector3<real_t>({{macroscopic_velocity_shift | join(",") }} {% if D == 2 %}, real_t(0.0) {%endif %} );
+      {% else %}
+      return velocity;
+      {% endif %}
+   }
+
+   static Vector3<real_t> get( const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const {{class_name}} & lm,
+                               const Vector3< real_t > & velocity, const real_t rho )
+   {
+      {% if macroscopic_velocity_shift %}
+
+      return velocity - Vector3<real_t>({{macroscopic_velocity_shift | join(",") }} {% if D == 2 %}, real_t(0.0) {%endif %} );
+      {% else %}
+      return velocity;
+      {% endif %}
+   }
+};
+} // namespace internal
+
+
+
+template<>
+struct Equilibrium< {{class_name}}, void >
+{
+
+   template< typename FieldPtrOrIterator >
+   static void set( FieldPtrOrIterator & it,
+                    const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) )
+   {
+        {%if not compressible %}
+        rho -= real_t(1.0);
+        {%endif %}
+
+       {% for eqTerm in equilibrium -%}
+       it[{{loop.index0 }}] = {{eqTerm}};
+       {% endfor -%}
+   }
+
+   template< typename PdfField_T >
+   static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z,
+                    const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), real_t rho = real_t(1.0) )
+   {
+      {%if not compressible %}
+      rho -= real_t(1.0);
+      {%endif %}
+
+      real_t & xyz0 = pdf(x,y,z,0);
+      {% for eqTerm in equilibrium -%}
+      pdf.getF( &xyz0, {{loop.index0 }})= {{eqTerm}};
+      {% endfor -%}
+   }
+};
+
+
+template<>
+struct Density<{{class_name}}, void>
+{
+   template< typename FieldPtrOrIterator >
+   static inline real_t get( const {{class_name}} & , const FieldPtrOrIterator & it )
+   {
+        {% for i in range(Q) -%}
+            const real_t f_{{i}} = it[{{i}}];
+        {% endfor -%}
+        {{density_getters | indent(8)}}
+        return rho;
+   }
+
+   template< typename PdfField_T >
+   static inline real_t get( const {{class_name}} & ,
+                             const PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z )
+   {
+        const real_t & xyz0 = pdf(x,y,z,0);
+        {% for i in range(Q) -%}
+            const real_t f_{{i}} = pdf.getF( &xyz0, {{i}});
+        {% endfor -%}
+        {{density_getters | indent(8)}}
+        return rho;
+   }
+};
+
+
+template<>
+struct DensityAndVelocity<{{class_name}}>
+{
+    template< typename FieldPtrOrIterator >
+    static void set( FieldPtrOrIterator & it, const {{class_name}} & lm,
+                     const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) )
+    {
+        auto x = it.x();
+        auto y = it.y();
+        auto z = it.z();
+
+        {{density_velocity_setter_macroscopic_values | indent(8)}}
+        {% if D == 2 -%}
+        const real_t u_2(0.0);
+        {% endif %}
+
+        Equilibrium<{{class_name}}>::set(it, Vector3<real_t>(u_0, u_1, u_2), rho{%if not compressible %} + real_t(1) {%endif%});
+    }
+
+    template< typename PdfField_T >
+    static void set( PdfField_T & pdf, const cell_idx_t x, const cell_idx_t y, const cell_idx_t z, const {{class_name}} & lm,
+                     const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) )
+    {
+        {{density_velocity_setter_macroscopic_values | indent(8)}}
+        {% if D == 2 -%}
+        const real_t u_2(0.0);
+        {% endif %}
+
+        Equilibrium<{{class_name}}>::set(pdf, x, y, z, Vector3<real_t>(u_0, u_1, u_2), rho {%if not compressible %} + real_t(1) {%endif%});
+    }
+};
+
+
+template<typename FieldIteratorXYZ >
+struct DensityAndVelocityRange<{{class_name}}, FieldIteratorXYZ>
+{
+
+   static void set( FieldIteratorXYZ & begin, const FieldIteratorXYZ & end, const {{class_name}} & lm,
+                    const Vector3< real_t > & u = Vector3< real_t >( real_t(0.0) ), const real_t rho_in = real_t(1.0) )
+   {
+        for( auto cellIt = begin; cellIt != end; ++cellIt )
+        {
+            const auto x = cellIt.x();
+            const auto y = cellIt.y();
+            const auto z = cellIt.z();
+            {{density_velocity_setter_macroscopic_values | indent(12)}}
+            {% if D == 2 -%}
+            const real_t u_2(0.0);
+            {% endif %}
+
+            Equilibrium<{{class_name}}>::set(cellIt, Vector3<real_t>(u_0, u_1, u_2), rho{%if not compressible %} + real_t(1) {%endif%});
+        }
+   }
+};
+
+
+
+template<>
+struct DensityAndMomentumDensity<{{class_name}}>
+{
+   template< typename FieldPtrOrIterator >
+   static real_t get( Vector3< real_t > & momentumDensity, const {{class_name}} & lm,
+                      const FieldPtrOrIterator & it )
+   {
+        const auto x = it.x();
+        const auto y = it.y();
+        const auto z = it.z();
+
+        {% for i in range(Q) -%}
+            const real_t f_{{i}} = it[{{i}}];
+        {% endfor -%}
+
+        {{momentum_density_getter | indent(8) }}
+        {% for i in range(D) -%}
+            momentumDensity[{{i}}] = md_{{i}};
+        {% endfor %}
+        return rho;
+   }
+
+   template< typename PdfField_T >
+   static real_t get( Vector3< real_t > & momentumDensity, const {{class_name}} & lm, const PdfField_T & pdf,
+                      const cell_idx_t x, const cell_idx_t y, const cell_idx_t z )
+   {
+        const real_t & xyz0 = pdf(x,y,z,0);
+        {% for i in range(Q) -%}
+            const real_t f_{{i}} = pdf.getF( &xyz0, {{i}});
+        {% endfor -%}
+
+        {{momentum_density_getter | indent(8) }}
+        {% for i in range(D) -%}
+            momentumDensity[{{i}}] = md_{{i}};
+        {% endfor %}
+       return rho;
+   }
+};
+
+
+template<>
+struct MomentumDensity< {{class_name}}>
+{
+   template< typename FieldPtrOrIterator >
+   static void get( Vector3< real_t > & momentumDensity, const {{class_name}} & lm, const FieldPtrOrIterator & it )
+   {
+        const auto x = it.x();
+        const auto y = it.y();
+        const auto z = it.z();
+
+        {% for i in range(Q) -%}
+            const real_t f_{{i}} = it[{{i}}];
+        {% endfor -%}
+
+        {{momentum_density_getter | indent(8) }}
+        {% for i in range(D) -%}
+            momentumDensity[{{i}}] = md_{{i}};
+        {% endfor %}
+   }
+
+   template< typename PdfField_T >
+   static void get( Vector3< real_t > & momentumDensity, const {{class_name}} & lm, const PdfField_T & pdf,
+                    const cell_idx_t x, const cell_idx_t y, const cell_idx_t z )
+   {
+        const real_t & xyz0 = pdf(x,y,z,0);
+        {% for i in range(Q) -%}
+            const real_t f_{{i}} = pdf.getF( &xyz0, {{i}});
+        {% endfor -%}
+
+        {{momentum_density_getter | indent(8) }}
+        {% for i in range(D) -%}
+            momentumDensity[{{i}}] = md_{{i}};
+        {% endfor %}
+   }
+};
+
+
+template<>
+struct PressureTensor<{{class_name}}>
+{
+   template< typename FieldPtrOrIterator >
+   static void get( Matrix3< real_t > & /* pressureTensor */, const {{class_name}} & /* latticeModel */, const FieldPtrOrIterator & /* it */ )
+   {
+       WALBERLA_ABORT("Not implemented");
+   }
+
+   template< typename PdfField_T >
+   static void get( Matrix3< real_t > & /* pressureTensor */, const {{class_name}} & /* latticeModel */, const PdfField_T & /* pdf */,
+                    const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */ )
+   {
+       WALBERLA_ABORT("Not implemented");
+   }
+};
+
+
+template<>
+struct ShearRate<{{class_name}}>
+{
+   template< typename FieldPtrOrIterator >
+   static inline real_t get( const {{class_name}} & /* latticeModel */, const FieldPtrOrIterator & /* it */,
+                             const Vector3< real_t > & /* velocity */, const real_t /* rho */)
+   {
+       WALBERLA_ABORT("Not implemented");
+       return real_t(0.0);
+   }
+
+   template< typename PdfField_T >
+   static inline real_t get( const {{class_name}} & latticeModel,
+                             const PdfField_T & /* pdf */, const cell_idx_t /* x */, const cell_idx_t /* y */, const cell_idx_t /* z */,
+                             const Vector3< real_t > & /* velocity */, const real_t /* rho */ )
+   {
+       WALBERLA_ABORT("Not implemented");
+       return real_t(0.0);
+   }
+
+   static inline real_t get( const std::vector< real_t > & /* nonEquilibrium */, const real_t /* relaxationParam */,
+                             const real_t /* rho */ = real_t(1) )
+   {
+       WALBERLA_ABORT("Not implemented");
+       return real_t(0.0);
+   }
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+
+#ifdef WALBERLA_CXX_COMPILER_IS_GNU
+#pragma GCC diagnostic pop
+#endif
+
+#ifdef WALBERLA_CXX_COMPILER_IS_CLANG
+#pragma clang diagnostic pop
+#endif
diff --git a/python/lbmpy_walberla/tests/__init__.py b/python/lbmpy_walberla/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/lbmpy_walberla/tests/test_walberla_codegen.py b/python/lbmpy_walberla/tests/test_walberla_codegen.py
new file mode 100644
index 000000000..60b86fa57
--- /dev/null
+++ b/python/lbmpy_walberla/tests/test_walberla_codegen.py
@@ -0,0 +1,107 @@
+import unittest
+
+import sympy as sp
+
+import pystencils as ps
+from lbmpy.boundaries import UBB, NoSlip
+from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
+from lbmpy_walberla import RefinementScaling, generate_boundary, generate_lattice_model
+from lbmpy_walberla.sparse import ListLbGenerator
+from pystencils_walberla import generate_pack_info_for_field, generate_pack_info_from_kernel
+from pystencils_walberla.cmake_integration import ManualCodeGenerationContext
+
+
+class WalberlaLbmpyCodegenTest(unittest.TestCase):
+
+    @staticmethod
+    def test_lattice_model():
+        with ManualCodeGenerationContext() as ctx:
+            force_field = ps.fields("force(3): [3D]", layout='fzyx')
+            omega = sp.Symbol("omega")
+
+            cr = create_lb_collision_rule(stencil='D3Q19', method='srt', relaxation_rates=[omega], compressible=True,
+                                          force_model='guo', force=force_field.center_vector)
+
+            scaling = RefinementScaling()
+            scaling.add_standard_relaxation_rate_scaling(omega)
+            scaling.add_force_scaling(force_field)
+
+            generate_lattice_model(ctx, 'SrtWithForceFieldModel', cr, refinement_scaling=scaling)
+            generate_boundary(ctx, 'MyUBB', UBB([0.05, 0, 0]), cr.method)
+            generate_boundary(ctx, 'MyNoSlip', NoSlip(), cr.method)
+            assert 'static const bool compressible = true;' in ctx.files['SrtWithForceFieldModel.h']
+
+    @staticmethod
+    def test_sparse():
+        from lbmpy.creationfunctions import create_lb_collision_rule
+        from pystencils import get_code_str
+        g = ListLbGenerator(create_lb_collision_rule())
+        kernel_code = get_code_str(g.kernel())
+        assert 'num_cells' in kernel_code
+        setter_code = get_code_str(g.setter_ast())
+        assert 'num_cells' in setter_code
+        getter_code = get_code_str(g.getter_ast())
+        assert 'num_cells' in getter_code
+
+    @staticmethod
+    def test_pack_info():
+        with ManualCodeGenerationContext() as ctx:
+            f = ps.fields("f(9): [3D]")
+            generate_pack_info_for_field(ctx, 'MyPackInfo1', f)
+
+            lb_assignments = create_lb_update_rule(stencil='D3Q19', method='srt').main_assignments
+            generate_pack_info_from_kernel(ctx, 'MyPackInfo2', lb_assignments)
+
+    @staticmethod
+    def test_incompressible():
+        with ManualCodeGenerationContext() as ctx:
+            omega = sp.Symbol("omega")
+
+            cr = create_lb_collision_rule(stencil='D3Q19', method='srt', relaxation_rates=[omega], compressible=False)
+            generate_lattice_model(ctx, 'Model', cr)
+            assert 'static const bool compressible = false;' in ctx.files['Model.h']
+
+    @staticmethod
+    def test_output_field():
+        with ManualCodeGenerationContext(openmp=True, double_accuracy=True) as ctx:
+            omega_field = ps.fields("omega_out: [3D]", layout='fzyx')
+            parameters = {
+                'stencil': 'D3Q27',
+                'method': 'trt-kbc-n1',
+                'compressible': True,
+                'entropic': True,
+                'omega_output_field': omega_field,
+            }
+            cr = create_lb_collision_rule(**parameters)
+            generate_lattice_model(ctx, 'Model', cr)
+
+    @staticmethod
+    def test_fluctuating():
+        with ManualCodeGenerationContext(openmp=True, double_accuracy=True) as ctx:
+            omega_shear = sp.symbols("omega")
+            force_field, vel_field = ps.fields("force(3), velocity(3): [3D]", layout='fzyx')
+
+            # the collision rule of the LB method where the some advanced features
+            collision_rule = create_lb_collision_rule(
+                stencil='D3Q19', compressible=True, fluctuating={'seed': 0, 'temperature': 1e-6},
+                method='mrt', relaxation_rates=[omega_shear]*19,
+                force_model='guo', force=force_field.center_vector,
+                optimization={'cse_global': False}
+            )
+            generate_lattice_model(ctx, 'FluctuatingMRT', collision_rule)
+
+    @staticmethod
+    def test_boundary_3D():
+        with ManualCodeGenerationContext(openmp=True, double_accuracy=True) as ctx:
+            lb_method = create_lb_method(stencil='D3Q19', method='srt')
+            generate_boundary(ctx, 'Boundary', NoSlip(), lb_method, target='gpu')
+
+    @staticmethod
+    def test_boundary_2D():
+        with ManualCodeGenerationContext(openmp=True, double_accuracy=True) as ctx:
+            lb_method = create_lb_method(stencil='D2Q9', method='srt')
+            generate_boundary(ctx, 'Boundary', NoSlip(), lb_method, target='gpu')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/lbmpy_walberla/walberla_lbm_generation.py b/python/lbmpy_walberla/walberla_lbm_generation.py
new file mode 100644
index 000000000..05e98a96f
--- /dev/null
+++ b/python/lbmpy_walberla/walberla_lbm_generation.py
@@ -0,0 +1,271 @@
+import warnings
+
+import numpy as np
+import sympy as sp
+from jinja2 import Environment, PackageLoader, StrictUndefined, Template
+from sympy.tensor import IndexedBase
+
+import pystencils as ps
+from lbmpy.creationfunctions import create_lb_update_rule, update_with_default_parameters
+from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor
+from lbmpy.relaxationrates import relaxation_rate_scaling
+from lbmpy.stencils import get_stencil
+from lbmpy.updatekernels import create_lbm_kernel, create_stream_pull_only_kernel
+from pystencils import AssignmentCollection, create_kernel
+from pystencils.astnodes import SympyAssignment
+from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers
+from pystencils.data_types import TypedSymbol, type_all_numbers
+from pystencils.field import Field
+from pystencils.stencil import have_same_entries, offset_to_direction_string
+from pystencils.sympyextensions import get_symmetric_part
+from pystencils.transformations import add_types
+from pystencils_walberla.codegen import KernelInfo, default_create_kernel_parameters
+from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
+
+cpp_printer = CustomSympyPrinter()
+REFINEMENT_SCALE_FACTOR = sp.Symbol("level_scale_factor")
+
+
+def __lattice_model(generation_context, class_name, lb_method, stream_collide_ast, collide_ast, stream_ast,
+                    refinement_scaling):
+    stencil_name = get_stencil_name(lb_method.stencil)
+    if not stencil_name:
+        raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
+    is_float = not generation_context.double_accuracy
+    dtype = np.float32 if is_float else np.float64
+
+    vel_symbols = lb_method.conserved_quantity_computation.first_order_moment_symbols
+    rho_sym = sp.Symbol("rho")
+    pdfs_sym = sp.symbols("f_:%d" % (len(lb_method.stencil),))
+    vel_arr_symbols = [IndexedBase(sp.Symbol('u'), shape=(1,))[i] for i in range(len(vel_symbols))]
+    momentum_density_symbols = [sp.Symbol("md_%d" % (i,)) for i in range(len(vel_symbols))]
+
+    equilibrium = lb_method.get_equilibrium()
+    equilibrium = equilibrium.new_with_substitutions({a: b for a, b in zip(vel_symbols, vel_arr_symbols)})
+    _, _, equilibrium = add_types(equilibrium.main_assignments, "float32" if is_float else "float64", False)
+    equilibrium = sp.Matrix([e.rhs for e in equilibrium])
+
+    symmetric_equilibrium = get_symmetric_part(equilibrium, vel_arr_symbols)
+    asymmetric_equilibrium = sp.expand(equilibrium - symmetric_equilibrium)
+
+    force_model = lb_method.force_model
+    macroscopic_velocity_shift = None
+    if force_model:
+        if hasattr(force_model, 'macroscopic_velocity_shift'):
+            macroscopic_velocity_shift = [expression_to_code(e, "lm.", ["rho"],dtype=dtype)
+                                          for e in force_model.macroscopic_velocity_shift(rho_sym)]
+
+    cqc = lb_method.conserved_quantity_computation
+
+    eq_input_from_input_eqs = cqc.equilibrium_input_equations_from_init_values(sp.Symbol("rho_in"), vel_arr_symbols)
+    density_velocity_setter_macroscopic_values = equations_to_code(eq_input_from_input_eqs, dtype=dtype,
+                                                                   variables_without_prefix=['rho_in', 'u'])
+    momentum_density_getter = cqc.output_equations_from_pdfs(pdfs_sym, {'density': rho_sym,
+                                                                        'momentum_density': momentum_density_symbols})
+    constant_suffix = "f" if is_float else ""
+
+    required_headers = get_headers(stream_collide_ast)
+
+    if refinement_scaling:
+        refinement_scaling_info = [ (e0,e1,expression_to_code(e2, '', dtype=dtype)) for e0,e1,e2 in refinement_scaling.scaling_info ]
+    else:
+        refinement_scaling_info = None
+
+    jinja_context = {
+        'class_name': class_name,
+        'stencil_name': stencil_name,
+        'D': lb_method.dim,
+        'Q': len(lb_method.stencil),
+        'compressible': lb_method.conserved_quantity_computation.compressible,
+        'weights': ",".join(str(w.evalf()) + constant_suffix for w in lb_method.weights),
+        'inverse_weights': ",".join(str((1/w).evalf()) + constant_suffix for w in lb_method.weights),
+
+        'equilibrium_from_direction': stencil_switch_statement(lb_method.stencil, equilibrium),
+        'symmetric_equilibrium_from_direction': stencil_switch_statement(lb_method.stencil, symmetric_equilibrium),
+        'asymmetric_equilibrium_from_direction': stencil_switch_statement(lb_method.stencil, asymmetric_equilibrium),
+        'equilibrium': [cpp_printer.doprint(e) for e in equilibrium],
+
+        'macroscopic_velocity_shift': macroscopic_velocity_shift,
+        'density_getters': equations_to_code(cqc.output_equations_from_pdfs(pdfs_sym, {"density": rho_sym}),
+                                             variables_without_prefix=[e.name for e in pdfs_sym], dtype=dtype),
+        'momentum_density_getter': equations_to_code(momentum_density_getter, variables_without_prefix=pdfs_sym,
+                                                     dtype=dtype),
+        'density_velocity_setter_macroscopic_values': density_velocity_setter_macroscopic_values,
+
+        'refinement_scaling_info': refinement_scaling_info,
+
+        'stream_collide_kernel': KernelInfo(stream_collide_ast, ['pdfs_tmp'], [('pdfs', 'pdfs_tmp')], []),
+        'collide_kernel': KernelInfo(collide_ast, [], [], []),
+        'stream_kernel': KernelInfo(stream_ast, ['pdfs_tmp'], [('pdfs', 'pdfs_tmp')], []),
+        'target': 'cpu',
+        'namespace': 'lbm',
+        'headers': required_headers,
+        'need_block_offsets': ['block_offset_{}'.format(i) in [param.symbol.name for param in stream_collide_ast.get_parameters()] for i in range(3)],
+    }
+
+    env = Environment(loader=PackageLoader('lbmpy_walberla'), undefined=StrictUndefined)
+    add_pystencils_filters_to_jinja_env(env)
+
+    header = env.get_template('LatticeModel.tmpl.h').render(**jinja_context)
+    source = env.get_template('LatticeModel.tmpl.cpp').render(**jinja_context)
+
+    generation_context.write_file("{}.h".format(class_name), header)
+    generation_context.write_file("{}.cpp".format(class_name), source)
+
+
+def generate_lattice_model(generation_context, class_name, collision_rule, refinement_scaling=None,
+                           **create_kernel_params):
+
+    # usually a numpy layout is chosen by default i.e. xyzf - which is bad for waLBerla where at least the spatial
+    # coordinates should be ordered in reverse direction i.e. zyx
+    is_float = not generation_context.double_accuracy
+    dtype = np.float32 if is_float else np.float64
+    lb_method = collision_rule.method
+
+    q = len(lb_method.stencil)
+    dim = lb_method.dim
+
+    create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
+    if create_kernel_params['target'] == 'gpu':
+        raise ValueError("Lattice Models can only be generated for CPUs. To generate LBM on GPUs use sweeps directly")
+
+    src_field = ps.Field.create_generic('pdfs', dim, dtype, index_dimensions=1, layout='fzyx', index_shape=(q,))
+    dst_field = ps.Field.create_generic('pdfs_tmp', dim, dtype, index_dimensions=1, layout='fzyx', index_shape=(q,))
+
+    stream_collide_update_rule = create_lbm_kernel(collision_rule, src_field, dst_field, StreamPullTwoFieldsAccessor())
+    stream_collide_ast = create_kernel(stream_collide_update_rule, **create_kernel_params)
+    stream_collide_ast.function_name = 'kernel_streamCollide'
+
+    collide_update_rule = create_lbm_kernel(collision_rule, src_field, dst_field, CollideOnlyInplaceAccessor())
+    collide_ast = create_kernel(collide_update_rule, **create_kernel_params)
+    collide_ast.function_name = 'kernel_collide'
+
+    stream_update_rule = create_stream_pull_only_kernel(lb_method.stencil, None, 'pdfs', 'pdfs_tmp', 'fzyx', dtype)
+    stream_ast = create_kernel(stream_update_rule, **create_kernel_params)
+    stream_ast.function_name = 'kernel_stream'
+    __lattice_model(generation_context, class_name, lb_method, stream_collide_ast, collide_ast, stream_ast,
+                    refinement_scaling)
+
+
+
+
+class RefinementScaling:
+    level_scale_factor = sp.Symbol("level_scale_factor")
+
+    def __init__(self):
+        self.scaling_info = []
+
+    def add_standard_relaxation_rate_scaling(self, viscosity_relaxation_rate):
+        self.add_scaling(viscosity_relaxation_rate, relaxation_rate_scaling)
+
+    def add_force_scaling(self, force_parameter):
+        self.add_scaling(force_parameter, lambda param, factor: param * factor)
+
+    def add_scaling(self, parameter, scaling_rule):
+        """
+        Adds a scaling rule, how parameters on refined blocks are modified
+
+        :param parameter: parameter to modify: may either be a Field, Field.Access or a Symbol
+        :param scaling_rule: function taking the parameter to be scaled as symbol and the scaling factor i.e.
+                            how much finer the current block is compared to coarsest resolution
+        """
+        if isinstance(parameter, Field):
+            field = parameter
+            name = field.name
+            if field.index_dimensions > 0:
+                scaling_type = 'field_with_f'
+                field_access = field(sp.Symbol("f"))
+            else:
+                scaling_type = 'field_xyz'
+                field_access = field.center
+            expr = scaling_rule(field_access, self.level_scale_factor)
+            self.scaling_info.append((scaling_type, name, expr))
+        elif isinstance(parameter, Field.Access):
+            field_access = parameter
+            expr = scaling_rule(field_access, self.level_scale_factor)
+            name = field_access.field.name
+            self.scaling_info.append(('field_xyz', name, expr))
+        elif isinstance(parameter, sp.Symbol):
+            expr = scaling_rule(parameter, self.level_scale_factor)
+            self.scaling_info.append(('normal', parameter.name, expr))
+        elif isinstance(parameter, list) or isinstance(parameter, tuple):
+            for p in parameter:
+                self.add_scaling(p, scaling_rule)
+        else:
+            raise ValueError("Invalid value for viscosity_relaxation_rate")
+
+
+# ------------------------------------------ Internal ------------------------------------------------------------------
+
+
+def stencil_switch_statement(stencil, values):
+    template = Template("""
+    using namespace stencil;
+    switch( direction ) {
+        {% for direction_name, value in dir_to_value_dict.items() -%}
+            case {{direction_name}}: return {{value}};
+        {% endfor -%}
+        default:
+            WALBERLA_ABORT("Invalid Direction");
+    }
+    """)
+
+    dir_to_value_dict = {offset_to_direction_string(d): cpp_printer.doprint(v) for d, v in zip(stencil, values)}
+    return template.render(dir_to_value_dict=dir_to_value_dict, undefined=StrictUndefined)
+
+
+def field_and_symbol_substitute(expr, variable_prefix="lm.", variables_without_prefix=[]):
+    variables_without_prefix = [v.name if isinstance(v, sp.Symbol) else v for v in variables_without_prefix]
+    substitutions = {}
+    for sym in expr.atoms(sp.Symbol):
+        if isinstance(sym, Field.Access):
+            fa = sym
+            prefix = "" if fa.field.name in variables_without_prefix else variable_prefix
+            if fa.field.index_dimensions == 0:
+                substitutions[fa] = sp.Symbol("%s%s->get(x,y,z)" % (prefix, fa.field.name))
+            else:
+                assert fa.field.index_dimensions == 1, "walberla supports only 0 or 1 index dimensions"
+                substitutions[fa] = sp.Symbol("%s%s->get(x,y,z,%s)" % (prefix, fa.field.name, fa.index[0]))
+        else:
+            if sym.name not in variables_without_prefix:
+                substitutions[sym] = sp.Symbol(variable_prefix + sym.name)
+    return expr.subs(substitutions)
+
+
+def expression_to_code(expr, variable_prefix="lm.", variables_without_prefix=[],dtype="double"):
+    """
+    Takes a sympy expression and creates a C code string from it. Replaces field accesses by
+    walberla field accesses i.e. field_W^1 -> field->get(-1, 0, 0, 1)
+    :param expr: sympy expression
+    :param variable_prefix: all variables (and field) are prefixed with this string
+                           this is used for member variables mostly
+    :param variables_without_prefix: this variables are not prefixed
+    :return: code string
+    """
+    return cpp_printer.doprint(type_expr(field_and_symbol_substitute(expr, variable_prefix, variables_without_prefix),dtype=dtype))
+
+def type_expr(eq, dtype):
+    eq=type_all_numbers(eq,dtype=dtype)
+    return eq.subs({s: TypedSymbol(s.name, dtype) for s in eq.atoms(sp.Symbol)})
+
+def equations_to_code(equations, variable_prefix="lm.", variables_without_prefix=[], dtype="double"):
+
+    if isinstance(equations, AssignmentCollection):
+        equations = equations.all_assignments
+
+    variables_without_prefix = list(variables_without_prefix)
+    c_backend = CBackend()
+    result = []
+    left_hand_side_names = [e.lhs.name for e in equations]
+    for eq in equations:
+        assignment = SympyAssignment(type_expr(eq.lhs,dtype=dtype),
+                                     type_expr(field_and_symbol_substitute(eq.rhs, variable_prefix,
+                                                                 variables_without_prefix + left_hand_side_names),dtype=dtype))
+        result.append(c_backend(assignment))
+    return "\n".join(result)
+
+
+def get_stencil_name(stencil):
+    for name in ('D2Q9', 'D3Q15', 'D3Q19', 'D3Q27'):
+        if have_same_entries(stencil, get_stencil(name, 'walberla')):
+            return name
diff --git a/python/pystencils_walberla/__init__.py b/python/pystencils_walberla/__init__.py
new file mode 100644
index 000000000..767581a75
--- /dev/null
+++ b/python/pystencils_walberla/__init__.py
@@ -0,0 +1,8 @@
+from .cmake_integration import CodeGeneration
+from .codegen import (
+    generate_pack_info, generate_pack_info_for_field, generate_pack_info_from_kernel,
+    generate_mpidtype_info_from_kernel, generate_sweep)
+
+__all__ = ['CodeGeneration',
+           'generate_sweep', 'generate_pack_info_from_kernel', 'generate_pack_info_for_field', 'generate_pack_info',
+           'generate_mpidtype_info_from_kernel']
diff --git a/python/pystencils_walberla/cmake_integration.py b/python/pystencils_walberla/cmake_integration.py
new file mode 100644
index 000000000..4bc5586af
--- /dev/null
+++ b/python/pystencils_walberla/cmake_integration.py
@@ -0,0 +1,111 @@
+"""
+Mechanism for integrating code generation into walberla's CMake system.
+CMake needs to determine which C++ source files are generated by which Python script.
+The list of files should be available fast, without running the generation process itself.
+Thus all code generation function are registered at a single class that manages this process.
+
+Usage example:
+    from pystencils_walberla.cmake_integration import codegen
+    codegen.register(['MyClass.h', 'MyClass.cpp'], functionReturningTwoStringsForHeaderAndCpp)
+
+"""
+import json
+import os
+import sys
+import warnings
+
+__all__ = ['CodeGeneration', 'ManualCodeGenerationContext']
+
+
+class CodeGeneration:
+    def __init__(self):
+        expected_files, cmake_vars = parse_json_args()
+        self.context = CodeGenerationContext(cmake_vars)
+        self.expected_files = expected_files
+
+    def __enter__(self):
+        return self.context
+
+    def __exit__(self, *args):
+        if self.expected_files and (set(self.context.files_written) != set(self.expected_files)):
+            expected = set(os.path.realpath(f) for f in self.expected_files)
+            written = set(os.path.realpath(f) for f in self.context.files_written)
+            only_in_cmake = expected - written
+            only_generated = written - expected
+            error_message = "Generated files specified not correctly in cmake with 'waLBerla_python_file_generates'\n"
+            if only_in_cmake:
+                error_message += "Files only specified in CMake {}\n".format(
+                    [os.path.basename(p) for p in only_in_cmake])
+            if only_generated:
+                error_message += "Unexpected generated files {}\n".format([os.path.basename(p) for p in only_generated])
+            raise ValueError(error_message)
+
+
+def parse_json_args():
+    default = {'EXPECTED_FILES': [],
+               'CMAKE_VARS': {'WALBERLA_BUILD_WITH_OPENMP': False,
+                              'WALBERLA_OPTIMIZE_FOR_LOCALHOST': False,
+                              'WALBERLA_DOUBLE_ACCURACY': True,
+                              'WALBERLA_BUILD_WITH_MPI': True,
+                              'WALBERLA_BUILD_WITH_CUDA': False,
+                              "CODEGEN_CFG": ""}
+               }
+
+    if len(sys.argv) == 2:
+        try:
+            parsed = json.loads(sys.argv[1])
+        except json.JSONDecodeError:
+            warnings.warn("Could not parse JSON arguments: " + sys.argv[1])
+            parsed = default
+    else:
+        parsed = default
+    expected_files = parsed['EXPECTED_FILES']
+    cmake_vars = {}
+    for key, value in parsed['CMAKE_VARS'].items():
+        if value in ("ON", "1", "YES"):
+            value = True
+        elif value in ("OFF", "0", "NO"):
+            value = False
+        cmake_vars[key] = value
+    return expected_files, cmake_vars
+
+
+class CodeGenerationContext:
+    def __init__(self, cmake_vars):
+        self.files_written = []
+        self.openmp = cmake_vars['WALBERLA_BUILD_WITH_OPENMP']
+        self.optimize_for_localhost = cmake_vars['WALBERLA_OPTIMIZE_FOR_LOCALHOST']
+        self.mpi = cmake_vars['WALBERLA_BUILD_WITH_MPI']
+        self.double_accuracy = cmake_vars['WALBERLA_DOUBLE_ACCURACY']
+        self.cuda = cmake_vars['WALBERLA_BUILD_WITH_CUDA']
+        self.config = cmake_vars['CODEGEN_CFG'].strip()
+
+    def write_file(self, name, content):
+        self.files_written.append(os.path.abspath(name))
+        with open(name, 'w') as f:
+            f.write(content)
+
+
+class ManualCodeGenerationContext:
+    """Context for testing - does not actually write files but puts them into a public dict
+    Environment parameters like if OpenMP, MPI or CPU-specific optimization should be used can be explicitly passed
+    to constructor instead of getting them from CMake
+    """
+
+    def __init__(self, openmp=False, optimize_for_localhost=False, mpi=True, double_accuracy=True):
+        self.openmp = openmp
+        self.optimize_for_localhost = optimize_for_localhost
+        self.mpi = mpi
+        self.double_accuracy = double_accuracy
+        self.files = dict()
+        self.cuda = False
+        self.config = ""
+
+    def write_file(self, name, content):
+        self.files[name] = content
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
new file mode 100644
index 000000000..fb665277b
--- /dev/null
+++ b/python/pystencils_walberla/codegen.py
@@ -0,0 +1,370 @@
+from collections import OrderedDict, defaultdict
+from itertools import product
+from typing import Dict, Optional, Sequence, Tuple
+
+from jinja2 import Environment, PackageLoader, StrictUndefined
+
+from pystencils import (
+    Assignment, AssignmentCollection, Field, FieldType, create_kernel, create_staggered_kernel)
+from pystencils.astnodes import KernelFunction
+from pystencils.backends.cbackend import get_headers
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.stencil import inverse_direction, offset_to_direction_string
+from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env
+
+__all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel',
+           'generate_mpidtype_info_from_kernel', 'default_create_kernel_parameters', 'KernelInfo']
+
+
+def generate_sweep(generation_context, class_name, assignments,
+                   namespace='pystencils', field_swaps=(), staggered=False, varying_parameters=(),
+                   inner_outer_split=False,
+                   **create_kernel_params):
+    """Generates a waLBerla sweep from a pystencils representation.
+
+    The constructor of the C++ sweep class expects all kernel parameters (fields and parameters) in alphabetical order.
+    Fields have to passed using BlockDataID's pointing to walberla fields
+
+    Args:
+        generation_context: build system context filled with information from waLBerla's CMake. The context for example
+                            defines where to write generated files, if OpenMP is available or which SIMD instruction
+                            set should be used. See waLBerla examples on how to get a context.
+        class_name: name of the generated sweep class
+        assignments: list of assignments defining the stencil update rule or a :class:`KernelFunction`
+        namespace: the generated class is accessible as walberla::<namespace>::<class_name>
+        field_swaps: sequence of field pairs (field, temporary_field). The generated sweep only gets the first field
+                     as argument, creating a temporary field internally which is swapped with the first field after
+                     each iteration.
+        staggered: set to True to create staggered kernels with `pystencils.create_staggered_kernel`
+        varying_parameters: Depending on the configuration, the generated kernels may receive different arguments for
+                            different setups. To not have to adapt the C++ application when then parameter change,
+                            the varying_parameters sequence can contain parameter names, which are always expected by
+                            the C++ class constructor even if the kernel does not need them.
+        inner_outer_split: if True generate a sweep that supports separate iteration over inner and outer regions
+                           to allow for communication hiding.
+        **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
+    """
+    create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
+
+    if not generation_context.cuda and create_kernel_params['target'] == 'gpu':
+        return
+
+    if isinstance(assignments, KernelFunction):
+        ast = assignments
+        create_kernel_params['target'] = ast.target
+    elif not staggered:
+        ast = create_kernel(assignments, **create_kernel_params)
+    else:
+        ast = create_staggered_kernel(assignments, **create_kernel_params)
+
+    def to_name(f):
+        return f.name if isinstance(f, Field) else f
+
+    field_swaps = tuple((to_name(e[0]), to_name(e[1])) for e in field_swaps)
+    temporary_fields = tuple(e[1] for e in field_swaps)
+
+    ast.function_name = class_name.lower()
+
+    env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
+    add_pystencils_filters_to_jinja_env(env)
+
+    if inner_outer_split is False:
+        jinja_context = {
+            'kernel': KernelInfo(ast, temporary_fields, field_swaps, varying_parameters),
+            'namespace': namespace,
+            'class_name': class_name,
+            'target': create_kernel_params.get("target", "cpu"),
+            'headers': get_headers(ast),
+        }
+        header = env.get_template("Sweep.tmpl.h").render(**jinja_context)
+        source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context)
+    else:
+        main_kernel_info = KernelInfo(ast, temporary_fields, field_swaps, varying_parameters)
+        representative_field = {p.field_name for p in main_kernel_info.parameters if p.is_field_parameter}
+        representative_field = sorted(representative_field)[0]
+
+        jinja_context = {
+            'kernel': main_kernel_info,
+            'namespace': namespace,
+            'class_name': class_name,
+            'target': create_kernel_params.get("target", "cpu"),
+            'field': representative_field,
+            'headers': get_headers(ast),
+        }
+        header = env.get_template("SweepInnerOuter.tmpl.h").render(**jinja_context)
+        source = env.get_template("SweepInnerOuter.tmpl.cpp").render(**jinja_context)
+
+    source_extension = "cpp" if create_kernel_params.get("target", "cpu") == "cpu" else "cu"
+    generation_context.write_file("{}.h".format(class_name), header)
+    generation_context.write_file("{}.{}".format(class_name, source_extension), source)
+
+
+def generate_pack_info_for_field(generation_context, class_name: str, field: Field,
+                                 direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None,
+                                 **create_kernel_params):
+    """Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements.
+
+    Args:
+        generation_context: see documentation of `generate_sweep`
+        class_name: name of the generated class
+        field: pystencils field for which to generate pack info
+        direction_subset: optional sequence of directions for which values should be packed
+                          otherwise a D3Q27 stencil is assumed
+        **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
+    """
+    if not direction_subset:
+        direction_subset = tuple((i, j, k) for i, j, k in product(*[(-1, 0, 1)] * 3))
+
+    all_index_accesses = [field(*ind) for ind in product(*[range(s) for s in field.index_shape])]
+    return generate_pack_info(generation_context, class_name, {direction_subset: all_index_accesses},
+                              **create_kernel_params)
+
+
+def generate_pack_info_from_kernel(generation_context, class_name: str, assignments: Sequence[Assignment],
+                                   kind='pull', **create_kernel_params):
+    """Generates a waLBerla GPU PackInfo from a (pull) kernel.
+
+    Args:
+        generation_context: see documentation of `generate_sweep`
+        class_name: name of the generated class
+        assignments: list of assignments from the compute kernel - generates PackInfo for "pull" part only
+                     i.e. the kernel is expected to only write to the center
+        kind:                      
+        **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
+    """
+    assert kind in ('push', 'pull')
+    reads = set()
+    writes = set()
+
+    if isinstance(assignments, AssignmentCollection):
+        assignments = assignments.all_assignments
+
+    for a in assignments:
+        if not isinstance(a, Assignment):
+            continue
+        reads.update(a.rhs.atoms(Field.Access))
+        writes.update(a.lhs.atoms(Field.Access))
+    spec = defaultdict(set)
+    if kind == 'pull':
+        for fa in reads:
+            assert all(abs(e) <= 1 for e in fa.offsets)
+            if all(offset == 0 for offset in fa.offsets):
+                continue
+            comm_direction = inverse_direction(fa.offsets)
+            for comm_dir in comm_directions(comm_direction):
+                spec[(comm_dir,)].add(fa.field.center(*fa.index))
+    elif kind == 'push':
+        for fa in writes:
+            assert all(abs(e) <= 1 for e in fa.offsets)
+            if all(offset == 0 for offset in fa.offsets):
+                continue
+            for comm_dir in comm_directions(fa.offsets):
+                spec[(comm_dir,)].add(fa)
+    else:
+        raise ValueError("Invalid 'kind' parameter")
+    return generate_pack_info(generation_context, class_name, spec, **create_kernel_params)
+
+
+def generate_pack_info(generation_context, class_name: str,
+                       directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]],
+                       namespace='pystencils',
+                       **create_kernel_params):
+    """Generates a waLBerla GPU PackInfo
+
+    Args:
+        generation_context: see documentation of `generate_sweep`
+        class_name: name of the generated class
+        directions_to_pack_terms: maps tuples of directions to read field accesses, specifying which values have to be
+                                  packed for which direction
+        namespace: inner namespace of the generated class
+        **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel`
+    """
+    items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()]
+    items = sorted(items, key=lambda e: e[0])
+    directions_to_pack_terms = OrderedDict(items)
+
+    create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
+    target = create_kernel_params.get('target', 'cpu')
+
+    template_name = "CpuPackInfo.tmpl" if target == 'cpu' else 'GpuPackInfo.tmpl'
+
+    fields_accessed = set()
+    for terms in directions_to_pack_terms.values():
+        for term in terms:
+            assert isinstance(term, Field.Access)  # and all(e == 0 for e in term.offsets)
+            fields_accessed.add(term)
+
+    field_names = {fa.field.name for fa in fields_accessed}
+
+    data_types = {fa.field.dtype for fa in fields_accessed}
+    if len(data_types) == 0:
+        raise ValueError("No fields to pack!")
+    if len(data_types) != 1:
+        err_detail = "\n".join(" - {} [{}]".format(f.name, f.dtype) for f in fields_accessed)
+        raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail)
+    dtype = data_types.pop()
+
+    pack_kernels = OrderedDict()
+    unpack_kernels = OrderedDict()
+    all_accesses = set()
+    elements_per_cell = OrderedDict()
+    for direction_set, terms in directions_to_pack_terms.items():
+        for d in direction_set:
+            if not all(abs(i) <= 1 for i in d):
+                raise NotImplementedError("Only first neighborhood supported")
+
+        buffer = Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER,
+                                      dtype=dtype.numpy_dtype, index_shape=(len(terms),))
+
+        direction_strings = tuple(offset_to_direction_string(d) for d in direction_set)
+        all_accesses.update(terms)
+
+        pack_assignments = [Assignment(buffer(i), term) for i, term in enumerate(terms)]
+        pack_ast = create_kernel(pack_assignments, **create_kernel_params, ghost_layers=0)
+        pack_ast.function_name = 'pack_{}'.format("_".join(direction_strings))
+        unpack_assignments = [Assignment(term, buffer(i)) for i, term in enumerate(terms)]
+        unpack_ast = create_kernel(unpack_assignments, **create_kernel_params, ghost_layers=0)
+        unpack_ast.function_name = 'unpack_{}'.format("_".join(direction_strings))
+
+        pack_kernels[direction_strings] = KernelInfo(pack_ast)
+        unpack_kernels[direction_strings] = KernelInfo(unpack_ast)
+        elements_per_cell[direction_strings] = len(terms)
+
+    fused_kernel = create_kernel([Assignment(buffer.center, t) for t in all_accesses], **create_kernel_params)
+
+    jinja_context = {
+        'class_name': class_name,
+        'pack_kernels': pack_kernels,
+        'unpack_kernels': unpack_kernels,
+        'fused_kernel': KernelInfo(fused_kernel),
+        'elements_per_cell': elements_per_cell,
+        'headers': get_headers(fused_kernel),
+        'target': target,
+        'dtype': dtype,
+        'field_name': field_names.pop(),
+        'namespace': namespace,
+    }
+    env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
+    add_pystencils_filters_to_jinja_env(env)
+    header = env.get_template(template_name + ".h").render(**jinja_context)
+    source = env.get_template(template_name + ".cpp").render(**jinja_context)
+
+    source_extension = "cpp" if target == "cpu" else "cu"
+    generation_context.write_file("{}.h".format(class_name), header)
+    generation_context.write_file("{}.{}".format(class_name, source_extension), source)
+
+
+def generate_mpidtype_info_from_kernel(generation_context, class_name: str,
+                                       assignments: Sequence[Assignment], kind='pull', namespace='pystencils', ):
+    assert kind in ('push', 'pull')
+    reads = set()
+    writes = set()
+
+    if isinstance(assignments, AssignmentCollection):
+        assignments = assignments.all_assignments
+
+    for a in assignments:
+        if not isinstance(a, Assignment):
+            continue
+        reads.update(a.rhs.atoms(Field.Access))
+        writes.update(a.lhs.atoms(Field.Access))
+
+    spec = defaultdict(set)
+    if kind == 'pull':
+        read_fields = set(fa.field for fa in reads)
+        assert len(read_fields) == 1, "Only scenarios where one fields neighbors are accessed"
+        field = read_fields.pop()
+        for fa in reads:
+            assert all(abs(e) <= 1 for e in fa.offsets)
+            if all(offset == 0 for offset in fa.offsets):
+                continue
+            comm_direction = inverse_direction(fa.offsets)
+            for comm_dir in comm_directions(comm_direction):
+                assert len(fa.index) == 1, "Supports only fields with a single index dimension"
+                spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0])
+    elif kind == 'push':
+        written_fields = set(fa.field for fa in writes)
+        assert len(written_fields) == 1, "Only scenarios where one fields neighbors are accessed"
+        field = written_fields.pop()
+
+        for fa in writes:
+            assert all(abs(e) <= 1 for e in fa.offsets)
+            if all(offset == 0 for offset in fa.offsets):
+                continue
+            for comm_dir in comm_directions(fa.offsets):
+                assert len(fa.index) == 1, "Supports only fields with a single index dimension"
+                spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0])
+    else:
+        raise ValueError("Invalid 'kind' parameter")
+
+    jinja_context = {
+        'class_name': class_name,
+        'namespace': namespace,
+        'kind': kind,
+        'field_name': field.name,
+        'f_size': field.index_shape[0],
+        'spec': spec,
+    }
+    env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined)
+    header = env.get_template("MpiDtypeInfo.tmpl.h").render(**jinja_context)
+    generation_context.write_file("{}.h".format(class_name), header)
+
+
+# ---------------------------------- Internal --------------------------------------------------------------------------
+
+
+class KernelInfo:
+    def __init__(self, ast, temporary_fields=(), field_swaps=(), varying_parameters=()):
+        self.ast = ast
+        self.temporary_fields = tuple(temporary_fields)
+        self.field_swaps = tuple(field_swaps)
+        self.varying_parameters = tuple(varying_parameters)
+        self.parameters = ast.get_parameters()  # cache parameters here
+
+
+def default_create_kernel_parameters(generation_context, params):
+    default_dtype = "float64" if generation_context.double_accuracy else 'float32'
+
+    if generation_context.optimize_for_localhost:
+        supported_instruction_sets = get_supported_instruction_sets()
+        if supported_instruction_sets:
+            default_vec_is = get_supported_instruction_sets()[-1]
+        else:  # if cpuinfo package is not installed
+            default_vec_is = 'sse'
+    else:
+        default_vec_is = None
+
+    params['target'] = params.get('target', 'cpu')
+    params['data_type'] = params.get('data_type', default_dtype)
+    params['cpu_openmp'] = params.get('cpu_openmp', generation_context.openmp)
+    params['cpu_vectorize_info'] = params.get('cpu_vectorize_info', {})
+
+    vec = params['cpu_vectorize_info']
+    vec['instruction_set'] = vec.get('instruction_set', default_vec_is)
+    vec['assume_inner_stride_one'] = True
+    vec['assume_aligned'] = vec.get('assume_aligned', False)
+    vec['nontemporal'] = vec.get('nontemporal', False)
+    return params
+
+
+def comm_directions(direction):
+    if all(e == 0 for e in direction):
+        yield direction
+    binary_numbers_list = binary_numbers(len(direction))
+    for comm_direction in binary_numbers_list:
+        for i in range(len(direction)):
+            if direction[i] == 0:
+                comm_direction[i] = 0
+            if direction[i] == -1 and comm_direction[i] == 1:
+                comm_direction[i] = -1
+        if not all(e == 0 for e in comm_direction):
+            yield tuple(comm_direction)
+
+
+def binary_numbers(n):
+    result = list()
+    for i in range(1 << n):
+        binary_number = bin(i)[2:]
+        binary_number = '0' * (n - len(binary_number)) + binary_number
+        result.append((list(map(int, binary_number))))
+    return result
diff --git a/python/pystencils_walberla/jinja_filters.py b/python/pystencils_walberla/jinja_filters.py
new file mode 100644
index 000000000..174b9dfec
--- /dev/null
+++ b/python/pystencils_walberla/jinja_filters.py
@@ -0,0 +1,376 @@
+import jinja2
+import sympy as sp
+
+from pystencils import TypedSymbol
+from pystencils.backends.cbackend import generate_c
+from pystencils.backends.cuda_backend import CudaSympyPrinter
+from pystencils.data_types import get_base_type
+from pystencils.field import FieldType
+from pystencils.kernelparameters import SHAPE_DTYPE
+from pystencils.sympyextensions import prod
+
+temporary_fieldMemberTemplate = """
+private: std::set< {type} *, field::SwapableCompare< {type} * > > cache_{original_field_name}_;"""
+
+temporary_fieldTemplate = """
+// Getting temporary field {tmp_field_name}
+auto it = cache_{original_field_name}_.find( {original_field_name} );
+if( it != cache_{original_field_name}_.end() )
+{{
+    {tmp_field_name} = *it;
+}}
+else 
+{{
+    {tmp_field_name} = {original_field_name}->cloneUninitialized();
+    cache_{original_field_name}_.insert({tmp_field_name});
+}}
+"""
+
+temporary_constructor = """
+~{class_name}() {{  {contents} }}
+"""
+
+delete_loop = """
+    for(auto p: cache_{original_field_name}_) {{
+        delete p;
+    }}
+"""
+
+
+def make_field_type(dtype, f_size, is_gpu):
+    if is_gpu:
+        return "cuda::GPUField<%s>" % (dtype,)
+    else:
+        return "GhostLayerField<%s, %d>" % (dtype, f_size)
+
+
+def get_field_fsize(field):
+    """Determines the size of the index coordinate. Since walberla fields only support one index dimension,
+    pystencils fields with multiple index dimensions are linearized to a single index dimension.
+    """
+    assert field.has_fixed_index_shape, \
+        "All Fields have to be created with fixed index coordinate shape using index_shape=(q,) " + str(field.name)
+
+    if field.index_dimensions == 0:
+        return 1
+    else:
+        return prod(field.index_shape)
+
+
+def get_field_stride(param):
+    field = param.fields[0]
+    type_str = get_base_type(param.symbol.dtype).base_name
+    stride_names = ['xStride()', 'yStride()', 'zStride()', 'fStride()']
+    stride_names = ["%s(%s->%s)" % (type_str, param.field_name, e) for e in stride_names]
+    strides = stride_names[:field.spatial_dimensions]
+    if field.index_dimensions > 0:
+        additional_strides = [1]
+        for shape in reversed(field.index_shape[1:]):
+            additional_strides.append(additional_strides[-1] * shape)
+        assert len(additional_strides) == field.index_dimensions
+        f_stride_name = stride_names[-1]
+        strides.extend(["%s(%d * %s)" % (type_str, e, f_stride_name) for e in reversed(additional_strides)])
+    return strides[param.symbol.coordinate]
+
+
+def generate_declaration(kernel_info, target='cpu'):
+    """Generates the declaration of the kernel function"""
+    ast = kernel_info.ast
+    result = generate_c(ast, signature_only=True, dialect='cuda' if target == 'gpu' else 'c') + ";"
+    result = "namespace internal_%s {\n%s\n}" % (ast.function_name, result,)
+    return result
+
+
+def generate_definition(kernel_info, target='cpu'):
+    """Generates the definition (i.e. implementation) of the kernel function"""
+    ast = kernel_info.ast
+    result = generate_c(ast, dialect='cuda' if target == 'gpu' else 'c')
+    result = "namespace internal_%s {\nstatic %s\n}" % (ast.function_name, result)
+    return result
+
+
+def field_extraction_code(field, is_temporary, declaration_only=False,
+                          no_declaration=False, is_gpu=False):
+    """Returns code string for getting a field pointer.
+
+    This can happen in two ways: either the field is extracted from a walberla block, or a temporary field to swap is
+    created.
+
+    Args:
+        field: the field for which the code should be created
+        is_temporary: new_filtered field from block (False) or create a temporary copy of an existing field (True)
+        declaration_only: only create declaration instead of the full code
+        no_declaration: create the extraction code, and assume that declarations are elsewhere
+        is_gpu: if the field is a GhostLayerField or a GpuField
+    """
+
+    # Determine size of f coordinate which is a template parameter
+    f_size = get_field_fsize(field)
+    field_name = field.name
+    dtype = get_base_type(field.dtype)
+    field_type = make_field_type(dtype, f_size, is_gpu)
+
+    if not is_temporary:
+        dtype = get_base_type(field.dtype)
+        field_type = make_field_type(dtype, f_size, is_gpu)
+        if declaration_only:
+            return "%s * %s;" % (field_type, field_name)
+        else:
+            prefix = "" if no_declaration else "auto "
+            return "%s%s = block->uncheckedFastGetData< %s >(%sID);" % (prefix, field_name, field_type, field_name)
+    else:
+        assert field_name.endswith('_tmp')
+        original_field_name = field_name[:-len('_tmp')]
+        if declaration_only:
+            return "%s * %s;" % (field_type, field_name)
+        else:
+            declaration = "{type} * {tmp_field_name};".format(type=field_type, tmp_field_name=field_name)
+            tmp_field_str = temporary_fieldTemplate.format(original_field_name=original_field_name,
+                                                           tmp_field_name=field_name, type=field_type)
+            return tmp_field_str if no_declaration else declaration + tmp_field_str
+
+
+@jinja2.contextfilter
+def generate_block_data_to_field_extraction(ctx, kernel_info, parameters_to_ignore=(), parameters=None,
+                                            declarations_only=False, no_declarations=False):
+    """Generates code that extracts all required fields of a kernel from a walberla block storage."""
+    if parameters is not None:
+        assert parameters_to_ignore == ()
+        field_parameters = []
+        for param in kernel_info.parameters:
+            if param.is_field_pointer and param.field_name in parameters:
+                field_parameters.append(param.fields[0])
+    else:
+        field_parameters = []
+        for param in kernel_info.parameters:
+            if param.is_field_pointer and param.field_name not in parameters_to_ignore:
+                field_parameters.append(param.fields[0])
+
+    normal_fields = {f for f in field_parameters if f.name not in kernel_info.temporary_fields}
+    temporary_fields = {f for f in field_parameters if f.name in kernel_info.temporary_fields}
+
+    args = {
+        'declaration_only': declarations_only,
+        'no_declaration': no_declarations,
+        'is_gpu': ctx['target'] == 'gpu',
+    }
+    result = "\n".join(field_extraction_code(field=field, is_temporary=False, **args) for field in normal_fields) + "\n"
+    result += "\n".join(field_extraction_code(field=field, is_temporary=True, **args) for field in temporary_fields)
+    return result
+
+
+def generate_refs_for_kernel_parameters(kernel_info, prefix, parameters_to_ignore):
+    symbols = {p.field_name for p in kernel_info.parameters if p.is_field_pointer}
+    symbols.update(p.symbol.name for p in kernel_info.parameters if not p.is_field_parameter)
+    symbols.difference_update(parameters_to_ignore)
+    return "\n".join("auto & %s = %s%s;" % (s, prefix, s) for s in symbols)
+
+
+@jinja2.contextfilter
+def generate_call(ctx, kernel_info, ghost_layers_to_include=0, cell_interval=None, stream='0',
+                  spatial_shape_symbols=()):
+    """Generates the function call to a pystencils kernel
+
+    Args:
+        kernel_info:
+        ghost_layers_to_include: if left to 0, only the inner part of the ghost layer field is looped over
+                                 a CHECK is inserted that the field has as many ghost layers as the pystencils AST
+                                 needs. This parameter specifies how many ghost layers the kernel should view as
+                                 "inner area". The ghost layer field has to have the required number of ghost layers
+                                 remaining. Parameter has to be left to default if cell_interval is given.
+        cell_interval: Defines the name (string) of a walberla CellInterval object in scope,
+                       that defines the inner region for the kernel to loop over. Parameter has to be left to default
+                       if ghost_layers_to_include is specified.
+        stream: optional name of cuda stream variable
+        spatial_shape_symbols: relevant only for gpu kernels - to determine CUDA block and grid sizes the iteration
+                               region (i.e. field shape) has to be known. This can normally be inferred by the kernel
+                               parameters - however in special cases like boundary conditions a manual specification
+                               may be necessary.
+    """
+    assert isinstance(ghost_layers_to_include, str) or ghost_layers_to_include >= 0
+    ast = kernel_info.ast
+    ast_params = kernel_info.parameters
+    is_cpu = ctx['target'] == 'cpu'
+
+    ghost_layers_to_include = sp.sympify(ghost_layers_to_include)
+    if ast.ghost_layers is None:
+        required_ghost_layers = 0
+    else:
+        # ghost layer info is ((x_gl_front, x_gl_end), (y_gl_front, y_gl_end).. )
+        required_ghost_layers = max(max(ast.ghost_layers))
+
+    kernel_call_lines = []
+
+    def get_start_coordinates(field_object):
+        if cell_interval is None:
+            return [-ghost_layers_to_include - required_ghost_layers] * field_object.spatial_dimensions
+        else:
+            assert ghost_layers_to_include == 0
+            return [sp.Symbol("{ci}.{coord}Min()".format(coord=coord_name, ci=cell_interval)) - required_ghost_layers
+                    for coord_name in ('x', 'y', 'z')]
+
+    def get_end_coordinates(field_object):
+        if cell_interval is None:
+            shape_names = ['xSize()', 'ySize()', 'zSize()'][:field_object.spatial_dimensions]
+            offset = 2 * ghost_layers_to_include + 2 * required_ghost_layers
+            return ["cell_idx_c(%s->%s) + %s" % (field_object.name, e, offset) for e in shape_names]
+        else:
+            assert ghost_layers_to_include == 0
+            return ["cell_idx_c({ci}.{coord}Size()) + {gl}".format(coord=coord_name, ci=cell_interval,
+                                                                   gl=2 * required_ghost_layers)
+                    for coord_name in ('x', 'y', 'z')]
+
+    for param in ast_params:
+        if param.is_field_parameter and FieldType.is_indexed(param.fields[0]):
+            continue
+
+        if param.is_field_pointer:
+            field = param.fields[0]
+            if field.field_type == FieldType.BUFFER:
+                kernel_call_lines.append("%s %s = %s;" % (param.symbol.dtype, param.symbol.name, param.field_name))
+            else:
+                coordinates = get_start_coordinates(field)
+                actual_gls = "int_c(%s->nrOfGhostLayers())" % (param.field_name, )
+                coord_set = set(coordinates)
+                coord_set = sorted(coord_set, key=lambda e: str(e))
+                for c in coord_set:
+                    kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, -%s);" %
+                                             (c, actual_gls))
+                while len(coordinates) < 4:
+                    coordinates.append(0)
+                coordinates = tuple(coordinates)
+                kernel_call_lines.append("%s %s = %s->dataAt(%s, %s, %s, %s);" %
+                                         ((param.symbol.dtype, param.symbol.name, param.field_name) + coordinates))
+        elif param.is_field_stride:
+            casted_stride = get_field_stride(param)
+            type_str = param.symbol.dtype.base_name
+            kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, casted_stride))
+        elif param.is_field_shape:
+            coord = param.symbol.coordinate
+            field = param.fields[0]
+            type_str = param.symbol.dtype.base_name
+            shape = "%s(%s)" % (type_str, get_end_coordinates(field)[coord])
+            assert coord < 3
+            max_value = "%s->%sSizeWithGhostLayer()" % (field.name, ('x', 'y', 'z')[coord])
+            kernel_call_lines.append("WALBERLA_ASSERT_GREATER_EQUAL(%s, %s);" % (max_value, shape))
+            kernel_call_lines.append("const %s %s = %s;" % (type_str, param.symbol.name, shape))
+
+    call_parameters = ", ".join([p.symbol.name for p in ast_params])
+    if not is_cpu:
+        if not spatial_shape_symbols:
+            spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape]
+            spatial_shape_symbols.sort(key=lambda e: e.coordinate)
+        else:
+            spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols]
+
+        indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
+        sp_printer_c = CudaSympyPrinter()
+        kernel_call_lines += [
+            "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']),
+            "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']),
+            "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name,
+                                                                stream, call_parameters),
+        ]
+    else:
+        kernel_call_lines.append("internal_%s::%s(%s);" % (ast.function_name, ast.function_name, call_parameters))
+    return "\n".join(kernel_call_lines)
+
+
+def generate_swaps(kernel_info):
+    """Generates code to swap main fields with temporary fields"""
+    swaps = ""
+    for src, dst in kernel_info.field_swaps:
+        swaps += "%s->swapDataPointers(%s);\n" % (src, dst)
+    return swaps
+
+
+def generate_constructor_initializer_list(kernel_info, parameters_to_ignore=None):
+    if parameters_to_ignore is None:
+        parameters_to_ignore = []
+
+    parameters_to_ignore += kernel_info.temporary_fields
+
+    parameter_initializer_list = []
+    for param in kernel_info.parameters:
+        if param.is_field_pointer and param.field_name not in parameters_to_ignore:
+            parameter_initializer_list.append("%sID(%sID_)" % (param.field_name, param.field_name))
+        elif not param.is_field_parameter and param.symbol.name not in parameters_to_ignore:
+            parameter_initializer_list.append("%s(%s_)" % (param.symbol.name, param.symbol.name))
+    return ", ".join(parameter_initializer_list)
+
+
+def generate_constructor_parameters(kernel_info, parameters_to_ignore=None):
+    if parameters_to_ignore is None:
+        parameters_to_ignore = []
+        
+    varying_parameters = []
+    if hasattr(kernel_info, 'varying_parameters'):
+        varying_parameters = kernel_info.varying_parameters
+    varying_parameter_names = tuple(e[1] for e in varying_parameters)
+    parameters_to_ignore += kernel_info.temporary_fields + varying_parameter_names
+
+    parameter_list = []
+    for param in kernel_info.parameters:
+        if param.is_field_pointer and param.field_name not in parameters_to_ignore:
+            parameter_list.append("BlockDataID %sID_" % (param.field_name, ))
+        elif not param.is_field_parameter and param.symbol.name not in parameters_to_ignore:
+            parameter_list.append("%s %s_" % (param.symbol.dtype, param.symbol.name,))
+    varying_parameters = ["%s %s_" % e for e in varying_parameters]
+    return ", ".join(parameter_list + varying_parameters)
+
+
+@jinja2.contextfilter
+def generate_members(ctx, kernel_info, parameters_to_ignore=(), only_fields=False):
+    ast = kernel_info.ast
+    fields = {f.name: f for f in ast.fields_accessed}
+
+    params_to_skip = tuple(parameters_to_ignore) + tuple(kernel_info.temporary_fields)
+    params_to_skip += tuple(e[1] for e in kernel_info.varying_parameters)
+    is_gpu = ctx['target'] == 'gpu'
+
+    result = []
+    for param in kernel_info.parameters:
+        if only_fields and not param.is_field_parameter:
+            continue
+        if param.is_field_pointer and param.field_name not in params_to_skip:
+            result.append("BlockDataID %sID;" % (param.field_name, ))
+        elif not param.is_field_parameter and param.symbol.name not in params_to_skip:
+            result.append("%s %s;" % (param.symbol.dtype, param.symbol.name,))
+
+    for field_name in kernel_info.temporary_fields:
+        f = fields[field_name]
+        if field_name in parameters_to_ignore:
+            continue
+        assert field_name.endswith('_tmp')
+        original_field_name = field_name[:-len('_tmp')]
+        f_size = get_field_fsize(f)
+        field_type = make_field_type(get_base_type(f.dtype), f_size, is_gpu)
+        result.append(temporary_fieldMemberTemplate.format(type=field_type, original_field_name=original_field_name))
+
+    if hasattr(kernel_info, 'varying_parameters'):
+        result.extend(["%s %s;" % e for e in kernel_info.varying_parameters])
+
+    return "\n".join(result)
+
+
+def generate_destructor(kernel_info, class_name):
+    if not kernel_info.temporary_fields:
+        return ""
+    else:
+        contents = ""
+        for field_name in kernel_info.temporary_fields:
+            contents += delete_loop.format(original_field_name=field_name[:-len('_tmp')])
+        return temporary_constructor.format(contents=contents, class_name=class_name)
+
+
+def add_pystencils_filters_to_jinja_env(jinja_env):
+    jinja_env.filters['generate_definition'] = generate_definition
+    jinja_env.filters['generate_declaration'] = generate_declaration
+    jinja_env.filters['generate_members'] = generate_members
+    jinja_env.filters['generate_constructor_parameters'] = generate_constructor_parameters
+    jinja_env.filters['generate_constructor_initializer_list'] = generate_constructor_initializer_list
+    jinja_env.filters['generate_call'] = generate_call
+    jinja_env.filters['generate_block_data_to_field_extraction'] = generate_block_data_to_field_extraction
+    jinja_env.filters['generate_swaps'] = generate_swaps
+    jinja_env.filters['generate_refs_for_kernel_parameters'] = generate_refs_for_kernel_parameters
+    jinja_env.filters['generate_destructor'] = generate_destructor
diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp
new file mode 100644
index 000000000..24fd29370
--- /dev/null
+++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.cpp
@@ -0,0 +1,105 @@
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "core/DataTypes.h"
+#include "{{class_name}}.h"
+
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+namespace walberla {
+namespace {{namespace}} {
+
+using walberla::cell::CellInterval;
+using walberla::stencil::Direction;
+
+
+{% for kernel in pack_kernels.values() %}
+{{kernel|generate_definition(target)}}
+{% endfor %}
+
+{% for kernel in unpack_kernels.values() %}
+{{kernel|generate_definition(target)}}
+{% endfor %}
+
+
+void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * block) const
+{
+    {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer);
+
+    {{fused_kernel|generate_block_data_to_field_extraction(parameters_to_ignore=['buffer'])|indent(4)}}
+    CellInterval ci;
+    {{field_name}}->getSliceBeforeGhostLayer(dir, ci, 1, false);
+
+    switch( dir )
+    {
+        {%- for direction_set, kernel in pack_kernels.items()  %}
+        {%- for dir in direction_set %}
+        case stencil::{{dir}}:
+        {%- endfor %}
+        {
+            {{kernel|generate_call(cell_interval="ci")|indent(12)}}
+            break;
+        }
+        {% endfor %}
+
+        default:
+            WALBERLA_ASSERT(false);
+    }
+}
+
+
+void {{class_name}}::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block) const
+{
+    {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer);
+
+    {{fused_kernel|generate_block_data_to_field_extraction(parameters_to_ignore=['buffer'])|indent(4)}}
+    CellInterval ci;
+    {{field_name}}->getGhostRegion(dir, ci, 1, false);
+    auto communciationDirection = stencil::inverseDir[dir];
+
+    switch( communciationDirection )
+    {
+        {%- for direction_set, kernel in unpack_kernels.items()  %}
+        {%- for dir in direction_set %}
+        case stencil::{{dir}}:
+        {%- endfor %}
+        {
+            {{kernel|generate_call(cell_interval="ci")|indent(12)}}
+            break;
+        }
+        {% endfor %}
+
+        default:
+            WALBERLA_ASSERT(false);
+    }
+}
+
+
+uint_t {{class_name}}::size(stencil::Direction dir, const IBlock * block) const
+{
+    {{fused_kernel|generate_block_data_to_field_extraction(parameters_to_ignore=['buffer'])|indent(4)}}
+    CellInterval ci;
+    {{field_name}}->getGhostRegion(dir, ci, 1, false);
+
+    uint_t elementsPerCell = 0;
+
+    switch( dir )
+    {
+        {%- for direction_set, elements in elements_per_cell.items()  %}
+        {%- for dir in direction_set %}
+        case stencil::{{dir}}:
+        {%- endfor %}
+            elementsPerCell = {{elements}};
+            break;
+        {% endfor %}
+        default:
+            elementsPerCell = 0;
+    }
+    return ci.numCells() * elementsPerCell * sizeof( {{dtype}} );
+}
+
+
+
+} // namespace {{namespace}}
+} // namespace walberla
\ No newline at end of file
diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h
new file mode 100644
index 000000000..ab7ef56fd
--- /dev/null
+++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h
@@ -0,0 +1,62 @@
+#pragma once
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "core/DataTypes.h"
+#include "field/GhostLayerField.h"
+#include "domain_decomposition/IBlock.h"
+#include "communication/UniformPackInfo.h"
+
+#define FUNC_PREFIX
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+class {{class_name}} : public ::walberla::communication::UniformPackInfo
+{
+public:
+    {{class_name}}( {{fused_kernel|generate_constructor_parameters(parameters_to_ignore=['buffer'])}} )
+        : {{ fused_kernel|generate_constructor_initializer_list(parameters_to_ignore=['buffer']) }}
+    {};
+    virtual ~{{class_name}}() {}
+
+   bool constantDataExchange() const { return true; }
+   bool threadsafeReceiving()  const { return true; }
+
+   void unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer) {
+        const auto dataSize = size(dir, receiver);
+        unpack(dir, buffer.skip(dataSize), receiver);
+   }
+
+   void communicateLocal(const IBlock * sender, IBlock * receiver, stencil::Direction dir) {
+       //TODO: optimize by generating kernel for this case
+       mpi::SendBuffer sBuffer;
+       packData( sender, dir, sBuffer );
+       mpi::RecvBuffer rBuffer( sBuffer );
+       unpackData( receiver, stencil::inverseDir[dir], rBuffer );
+   }
+
+private:
+   void packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const {
+        const auto dataSize = size(dir, sender);
+        pack(dir, outBuffer.forward(dataSize), const_cast<IBlock*>(sender));
+   }
+
+   void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block) const;
+   void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block) const;
+   uint_t size  (stencil::Direction dir, const IBlock * block) const;
+
+    {{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}}
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp
new file mode 100644
index 000000000..d2624b188
--- /dev/null
+++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.cpp
@@ -0,0 +1,111 @@
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "cuda/GPUField.h"
+#include "core/DataTypes.h"
+#include "{{class_name}}.h"
+
+
+{% if target is equalto 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target is equalto 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+
+namespace walberla {
+namespace {{namespace}} {
+
+using walberla::cell::CellInterval;
+using walberla::stencil::Direction;
+
+
+{% for kernel in pack_kernels.values() %}
+{{kernel|generate_definition(target)}}
+{% endfor %}
+
+{% for kernel in unpack_kernels.values() %}
+{{kernel|generate_definition(target)}}
+{% endfor %}
+
+
+
+void {{class_name}}::pack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream)
+{
+    {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer);
+
+    {{fused_kernel|generate_block_data_to_field_extraction(parameters_to_ignore=['buffer'])|indent(4)}}
+    CellInterval ci;
+    {{field_name}}->getSliceBeforeGhostLayer(dir, ci, 1, false);
+
+    switch( dir )
+    {
+        {%- for direction_set, kernel in pack_kernels.items()  %}
+        {%- for dir in direction_set %}
+        case stencil::{{dir}}:
+        {%- endfor %}
+        {
+            {{kernel|generate_call(cell_interval="ci", stream="stream")|indent(12)}}
+            break;
+        }
+        {% endfor %}
+
+        default:
+            WALBERLA_ASSERT(false);
+    }
+}
+
+
+void {{class_name}}::unpack(Direction dir, unsigned char * byte_buffer, IBlock * block, cudaStream_t stream)
+{
+    {{dtype}} * buffer = reinterpret_cast<{{dtype}}*>(byte_buffer);
+
+    {{fused_kernel|generate_block_data_to_field_extraction(parameters_to_ignore=['buffer'])|indent(4)}}
+    CellInterval ci;
+    {{field_name}}->getGhostRegion(dir, ci, 1, false);
+    auto communciationDirection = stencil::inverseDir[dir];
+
+    switch( communciationDirection )
+    {
+        {%- for direction_set, kernel in unpack_kernels.items()  %}
+        {%- for dir in direction_set %}
+        case stencil::{{dir}}:
+        {%- endfor %}
+        {
+            {{kernel|generate_call(cell_interval="ci", stream="stream")|indent(12)}}
+            break;
+        }
+        {% endfor %}
+
+        default:
+            WALBERLA_ASSERT(false);
+    }
+}
+
+
+uint_t {{class_name}}::size(stencil::Direction dir, IBlock * block)
+{
+    {{fused_kernel|generate_block_data_to_field_extraction(parameters_to_ignore=['buffer'])|indent(4)}}
+    CellInterval ci;
+    {{field_name}}->getGhostRegion(dir, ci, 1, false);
+
+    uint_t elementsPerCell = 0;
+
+    switch( dir )
+    {
+        {%- for direction_set, elements in elements_per_cell.items()  %}
+        {%- for dir in direction_set %}
+        case stencil::{{dir}}:
+        {%- endfor %}
+            elementsPerCell = {{elements}};
+            break;
+        {% endfor %}
+        default:
+            elementsPerCell = 0;
+    }
+    return ci.numCells() * elementsPerCell * sizeof( {{dtype}} );
+}
+
+
+
+} // namespace {{namespace}}
+} // namespace walberla
\ No newline at end of file
diff --git a/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h
new file mode 100644
index 000000000..8b70e1cb8
--- /dev/null
+++ b/python/pystencils_walberla/templates/GpuPackInfo.tmpl.h
@@ -0,0 +1,46 @@
+#pragma once
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "cuda/GPUField.h"
+#include "core/DataTypes.h"
+#include "domain_decomposition/IBlock.h"
+#include "cuda/communication/GeneratedGPUPackInfo.h"
+
+
+{% if target is equalto 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target is equalto 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+class {{class_name}} : public ::walberla::cuda::GeneratedGPUPackInfo
+{
+public:
+    {{class_name}}( {{fused_kernel|generate_constructor_parameters(parameters_to_ignore=['buffer'])}} )
+        : {{ fused_kernel|generate_constructor_initializer_list(parameters_to_ignore=['buffer']) }}
+    {};
+    virtual ~{{class_name}}() {}
+
+    virtual void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
+    virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
+    virtual uint_t size  (stencil::Direction dir, IBlock * block);
+
+private:
+    {{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}}
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
diff --git a/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h b/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h
new file mode 100644
index 000000000..3f9cbb2e6
--- /dev/null
+++ b/python/pystencils_walberla/templates/MpiDtypeInfo.tmpl.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include "core/debug/Debug.h"
+#include "communication/UniformMPIDatatypeInfo.h"
+#include "field/communication/MPIDatatypes.h"
+
+#include <set>
+
+namespace walberla {
+namespace {{namespace}} {
+
+class {{class_name}} : public ::walberla::communication::UniformMPIDatatypeInfo
+{
+public:
+    using GhostLayerField_T = GhostLayerField<real_t, {{f_size}}>;
+
+    {{class_name}}( BlockDataID {{field_name}} )
+        :{{field_name}}_({{field_name}})
+    {}
+    virtual ~{{class_name}}() {}
+
+    virtual shared_ptr<mpi::Datatype> getSendDatatype ( IBlock * block, const stencil::Direction dir )
+    {
+       {% if kind == 'pull' %}
+        return make_shared<mpi::Datatype>( field::communication::mpiDatatypeSliceBeforeGhostlayerXYZ(
+                *getField( block ), dir, uint_t( 1 ), getOptimizedCommunicationIndices( dir ), false ) );
+       {% else %}
+        return make_shared<mpi::Datatype>( field::communication::mpiDatatypeGhostLayerOnlyXYZ(
+                *getField( block ), dir, false, getOptimizedCommunicationIndices( dir ) ) );
+       {% endif %}
+    }
+
+    virtual shared_ptr<mpi::Datatype> getRecvDatatype ( IBlock * block, const stencil::Direction dir )
+    {
+        {% if kind == 'pull' %}
+        return make_shared<mpi::Datatype>( field::communication::mpiDatatypeGhostLayerOnlyXYZ(
+                *getField( block ), dir, false, getOptimizedCommunicationIndices( stencil::inverseDir[dir] ) ) );
+        {% else %}
+        return make_shared<mpi::Datatype>( field::communication::mpiDatatypeSliceBeforeGhostlayerXYZ(
+                *getField( block ), dir, uint_t( 1 ), getOptimizedCommunicationIndices( stencil::inverseDir[dir] ), false ) );
+        {% endif %}
+    }
+
+    virtual void * getSendPointer( IBlock * block, const stencil::Direction ) {
+        return getField(block)->data();
+    }
+
+    virtual void * getRecvPointer( IBlock * block, const stencil::Direction ) {
+        return getField(block)->data();
+    }
+
+private:
+
+    inline static std::set< cell_idx_t > getOptimizedCommunicationIndices( const stencil::Direction dir )
+    {
+        switch(dir)
+        {
+            {%- for direction_set, index_set in spec.items()  %}
+            {%- for dir in direction_set %}
+            case stencil::{{dir}}:
+            {%- endfor %}
+               return {{index_set}};
+            {% endfor %}
+            default:
+                WALBERLA_ASSERT(false);
+                return {};
+        }
+    }
+
+    GhostLayerField_T * getField( IBlock * block )
+    {
+        GhostLayerField_T * const f = block->getData<GhostLayerField_T>( {{field_name}}_ );
+        WALBERLA_ASSERT_NOT_NULLPTR( f );
+        return f;
+    }
+
+    BlockDataID {{field_name}}_;
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
new file mode 100644
index 000000000..8628e40a9
--- /dev/null
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp
@@ -0,0 +1,95 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+
+#include <cmath>
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "{{class_name}}.h"
+{% for header in headers %}
+#include {{header}}
+{% endfor %}
+
+
+{% if target is equalto 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target is equalto 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wfloat-equal"
+#   pragma GCC diagnostic ignored "-Wshadow"
+#   pragma GCC diagnostic ignored "-Wconversion"
+#   pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
+#pragma warning push
+#pragma warning( disable :  1599 )
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+{{kernel|generate_definition(target)}}
+
+void {{class_name}}::operator()( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+    {{kernel|generate_call(stream='stream')|indent(4)}}
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks,
+                                        const CellInterval & globalCellInterval,
+                                        cell_idx_t ghostLayers,
+                                        IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+{
+    CellInterval ci = globalCellInterval;
+    CellInterval blockBB = blocks->getBlockCellBB( *block);
+    blockBB.expand( ghostLayers );
+    ci.intersect( blockBB );
+    blocks->transformGlobalToBlockLocalCellInterval( ci, *block );
+    if( ci.empty() )
+        return;
+
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+    {{kernel|generate_call(stream='stream', cell_interval='ci')|indent(4)}}
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_INTEL )
+#pragma warning pop
+#endif
diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.h b/python/pystencils_walberla/templates/Sweep.tmpl.h
new file mode 100644
index 000000000..9de4b96c2
--- /dev/null
+++ b/python/pystencils_walberla/templates/Sweep.tmpl.h
@@ -0,0 +1,93 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.h
+//! \\author pystencils
+//======================================================================================================================
+
+#pragma once
+#include "core/DataTypes.h"
+
+{% if target is equalto 'cpu' -%}
+#include "field/GhostLayerField.h"
+{%- elif target is equalto 'gpu' -%}
+#include "cuda/GPUField.h"
+{%- endif %}
+#include "field/SwapableCompare.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+class {{class_name}}
+{
+public:
+    {{class_name}}( {{kernel|generate_constructor_parameters}})
+        : {{ kernel|generate_constructor_initializer_list }}
+    {};
+
+    {{ kernel| generate_destructor(class_name) |indent(4) }}
+
+    void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
+    void runOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
+                           const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block
+                           {%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %});
+
+
+
+    static std::function<void (IBlock*)> getSweep(const shared_ptr<{{class_name}}> & kernel) {
+        return [kernel](IBlock * b) { (*kernel)(b); };
+    }
+
+    static std::function<void (IBlock*{%if target is equalto 'gpu'%} , cudaStream_t {% endif %})>
+            getSweepOnCellInterval(const shared_ptr<{{class_name}}> & kernel,
+                                   const shared_ptr<StructuredBlockStorage> & blocks,
+                                   const CellInterval & globalCellInterval,
+                                   cell_idx_t ghostLayers=1 )
+    {
+        return [kernel, blocks, globalCellInterval, ghostLayers] (IBlock * b{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %}) {
+            kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b{%if target is equalto 'gpu'%},stream {% endif %});
+        };
+    }
+
+    {{ kernel|generate_members|indent(4) }}
+
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
diff --git a/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp b/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp
new file mode 100644
index 000000000..bdff84684
--- /dev/null
+++ b/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.cpp
@@ -0,0 +1,141 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.cpp
+//! \\ingroup lbm
+//! \\author lbmpy
+//======================================================================================================================
+
+#include <cmath>
+
+#include "core/DataTypes.h"
+#include "core/Macros.h"
+#include "{{class_name}}.h"
+
+
+{% if target is equalto 'cpu' -%}
+#define FUNC_PREFIX
+{%- elif target is equalto 'gpu' -%}
+#define FUNC_PREFIX __global__
+{%- endif %}
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wfloat-equal"
+#   pragma GCC diagnostic ignored "-Wshadow"
+#   pragma GCC diagnostic ignored "-Wconversion"
+#endif
+
+using namespace std;
+
+namespace walberla {
+namespace {{namespace}} {
+
+{{kernel|generate_definition(target)}}
+
+void {{class_name}}::operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+    {{kernel|generate_call(stream='stream')|indent(4)}}
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+void {{class_name}}::runOnCellInterval( const shared_ptr<StructuredBlockStorage> & blocks,
+                                        const CellInterval & globalCellInterval,
+                                        cell_idx_t ghostLayers,
+                                        IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+{
+    CellInterval ci = globalCellInterval;
+    CellInterval blockBB = blocks->getBlockCellBB( *block);
+    blockBB.expand( ghostLayers );
+    ci.intersect( blockBB );
+    blocks->transformGlobalToBlockLocalCellInterval( ci, *block );
+    if( ci.empty() )
+        return;
+
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+    {{kernel|generate_call(stream='stream', cell_interval='ci')|indent(4)}}
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+void {{class_name}}::inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream{% endif %} )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    CellInterval inner = {{field}}->xyzSize();
+    inner.expand(Cell(-outerWidth_[0], -outerWidth_[1], -outerWidth_[2]));
+
+    {{kernel|generate_call(stream='stream', cell_interval='inner')|indent(4)}}
+}
+
+
+void {{class_name}}::outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream {% endif %} )
+{
+    {{kernel|generate_block_data_to_field_extraction|indent(4)}}
+
+    if( layers_.size() == 0 )
+    {
+        CellInterval ci;
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::T, ci, outerWidth_[2], false);
+        layers_.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::B, ci, outerWidth_[2], false);
+        layers_.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::N, ci, outerWidth_[1], false);
+        ci.expand(Cell(0, 0, -outerWidth_[2]));
+        layers_.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::S, ci, outerWidth_[1], false);
+        ci.expand(Cell(0, 0, -outerWidth_[2]));
+        layers_.push_back(ci);
+
+        {{field}}->getSliceBeforeGhostLayer(stencil::E, ci, outerWidth_[0], false);
+        ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
+        layers_.push_back(ci);
+        {{field}}->getSliceBeforeGhostLayer(stencil::W, ci, outerWidth_[0], false);
+        ci.expand(Cell(0, -outerWidth_[1], -outerWidth_[2]));
+        layers_.push_back(ci);
+    }
+
+    {%if target is equalto 'gpu'%}
+    {
+        auto parallelSection_ = parallelStreams_.parallelSection( stream );
+        for( auto & ci: layers_ )
+        {
+            parallelSection_.run([&]( auto s ) {
+                {{kernel|generate_call(stream='s', cell_interval='ci')|indent(16)}}
+            });
+        }
+    }
+    {% else %}
+    for( auto & ci: layers_ )
+    {
+        {{kernel|generate_call(cell_interval='ci')|indent(8)}}
+    }
+    {% endif %}
+
+    {{kernel|generate_swaps|indent(4)}}
+}
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
diff --git a/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h b/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h
new file mode 100644
index 000000000..466f51071
--- /dev/null
+++ b/python/pystencils_walberla/templates/SweepInnerOuter.tmpl.h
@@ -0,0 +1,113 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \\file {{class_name}}.h
+//! \\author pystencils
+//======================================================================================================================
+
+#pragma once
+#include "core/DataTypes.h"
+
+{% if target is equalto 'cpu' -%}
+#include "field/GhostLayerField.h"
+{%- elif target is equalto 'gpu' -%}
+#include "cuda/GPUField.h"
+#include "cuda/ParallelStreams.h"
+{%- endif %}
+#include "field/SwapableCompare.h"
+#include "domain_decomposition/BlockDataID.h"
+#include "domain_decomposition/IBlock.h"
+#include "domain_decomposition/StructuredBlockStorage.h"
+#include <set>
+
+#ifdef __GNUC__
+#define RESTRICT __restrict__
+#elif _MSC_VER
+#define RESTRICT __restrict
+#else
+#define RESTRICT
+#endif
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic push
+#   pragma GCC diagnostic ignored "-Wunused-parameter"
+#   pragma GCC diagnostic ignored "-Wreorder"
+#endif
+
+namespace walberla {
+namespace {{namespace}} {
+
+
+class {{class_name}}
+{
+public:
+    {{class_name}}( {{kernel|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1))
+        : {{ kernel|generate_constructor_initializer_list }}, outerWidth_(outerWidth)
+    {};
+
+    {{ kernel| generate_destructor(class_name) |indent(4) }}
+
+
+    void operator() ( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
+
+    void runOnCellInterval(const shared_ptr<StructuredBlockStorage> & blocks,
+                           const CellInterval & globalCellInterval, cell_idx_t ghostLayers, IBlock * block
+                           {%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %});
+
+
+
+    static std::function<void (IBlock*)> getSweep(const shared_ptr<{{class_name}}> & kernel) {
+        return [kernel](IBlock * b) { (*kernel)(b); };
+    }
+
+    static std::function<void (IBlock*{%if target is equalto 'gpu'%} , cudaStream_t {% endif %})>
+            getSweepOnCellInterval(const shared_ptr<{{class_name}}> & kernel,
+                                   const shared_ptr<StructuredBlockStorage> & blocks,
+                                   const CellInterval & globalCellInterval,
+                                   cell_idx_t ghostLayers=1 )
+    {
+        return [kernel, blocks, globalCellInterval, ghostLayers] (IBlock * b{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %}) {
+            kernel->runOnCellInterval(blocks, globalCellInterval, ghostLayers, b{%if target is equalto 'gpu'%},stream {% endif %});
+        };
+    }
+
+
+    void inner( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
+    void outer( IBlock * block{%if target is equalto 'gpu'%} , cudaStream_t stream = 0{% endif %} );
+
+    void setOuterPriority(int priority ) {
+        {%if target is equalto 'gpu'%}
+        parallelStreams_.setStreamPriority(priority);
+        {%endif%}
+    }
+    {{kernel|generate_members|indent(4)}}
+
+private:
+    {%if target is equalto 'gpu'%}
+    cuda::ParallelStreams parallelStreams_;
+    {% endif %}
+
+    Cell outerWidth_;
+    std::vector<CellInterval> layers_;
+};
+
+
+} // namespace {{namespace}}
+} // namespace walberla
+
+
+#if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
+#   pragma GCC diagnostic pop
+#endif
diff --git a/python/pystencils_walberla/tests/__init__.py b/python/pystencils_walberla/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/pystencils_walberla/tests/test_packinfo_generation.py b/python/pystencils_walberla/tests/test_packinfo_generation.py
new file mode 100644
index 000000000..78ebdd92a
--- /dev/null
+++ b/python/pystencils_walberla/tests/test_packinfo_generation.py
@@ -0,0 +1,41 @@
+import unittest
+
+import pystencils as ps
+from pystencils_walberla import generate_pack_info_for_field, generate_pack_info_from_kernel
+from pystencils_walberla.cmake_integration import ManualCodeGenerationContext
+
+
+class PackinfoGenTest(unittest.TestCase):
+
+    @staticmethod
+    def test_packinfo_walberla_gen():
+        for openmp in (False, True):
+            for da in (False, True):
+                with ManualCodeGenerationContext(openmp=openmp, double_accuracy=da) as ctx:
+                    dtype = "float64" if ctx.double_accuracy else "float32"
+                    f, g = ps.fields("f, g(4): {}[3D]".format(dtype))
+                    generate_pack_info_for_field(ctx, 'PI1', f)
+
+                    src, dst = ps.fields("src, src_tmp: {}[2D]".format(dtype))
+                    stencil = [[0, -1, 0],
+                               [-1, 4, -1],
+                               [0, -1, 0]]
+                    assignments = [ps.assignment_from_stencil(stencil, src, dst, normalization_factor=4)]
+                    generate_pack_info_from_kernel(ctx, 'PI2', assignments)
+
+                    expected_files = ('PI1.cpp', 'PI1.h', 'PI2.cpp', 'PI2.h')
+                    assert all(e in ctx.files for e in expected_files)
+
+                    for file_name_to_test in ('PI1.cpp', 'PI2.cpp'):
+                        file_to_test = ctx.files[file_name_to_test]
+                        if openmp:
+                            assert '#pragma omp parallel' in file_to_test
+
+                        if da:
+                            assert 'float ' not in file_to_test
+                        else:
+                            assert 'double ' not in file_to_test
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/pystencils_walberla/tests/test_walberla_gen.py b/python/pystencils_walberla/tests/test_walberla_gen.py
new file mode 100644
index 000000000..3ffe156e9
--- /dev/null
+++ b/python/pystencils_walberla/tests/test_walberla_gen.py
@@ -0,0 +1,56 @@
+import unittest
+
+import sympy as sp
+
+import pystencils as ps
+from pystencils_walberla import generate_sweep
+from pystencils_walberla.cmake_integration import ManualCodeGenerationContext
+
+
+class CodegenTest(unittest.TestCase):
+
+    @staticmethod
+    def test_codegen():
+        for openmp in (False, True):
+            for da in (False, True):
+                with ManualCodeGenerationContext(openmp=openmp, double_accuracy=da) as ctx:
+                    h = sp.symbols("h")
+
+                    dtype = "float64" if ctx.double_accuracy else "float32"
+
+                    # ----- Jacobi 2D - created by specifying weights in nested list --------------------------
+                    src, dst = ps.fields("src, src_tmp: {}[2D]".format(dtype))
+                    stencil = [[0, -1, 0],
+                               [-1, 4, -1],
+                               [0, -1, 0]]
+                    assignments = ps.assignment_from_stencil(stencil, src, dst, normalization_factor=4 * h ** 2)
+                    generate_sweep(ctx, 'JacobiKernel2D', assignments, field_swaps=[(src, dst)])
+
+                    # ----- Jacobi 3D - created by using kernel_decorator with assignments in '@=' format -----
+                    src, dst = ps.fields("src, src_tmp: {}[3D]".format(dtype))
+
+                    @ps.kernel
+                    def kernel_func():
+                        dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] +
+                                         src[0, 1, 0] + src[0, -1, 0] +
+                                         src[0, 0, 1] + src[0, 0, -1]) / (6 * h ** 2)
+
+                    generate_sweep(ctx, 'JacobiKernel3D', kernel_func, field_swaps=[(src, dst)])
+
+                    expected_files = ('JacobiKernel3D.cpp', 'JacobiKernel3D.h',
+                                      'JacobiKernel2D.cpp', 'JacobiKernel2D.h')
+                    assert all(e in ctx.files for e in expected_files)
+
+                    for file_name_to_test in ('JacobiKernel3D.cpp', 'JacobiKernel2D.cpp'):
+                        file_to_test = ctx.files[file_name_to_test]
+                        if openmp:
+                            assert '#pragma omp parallel' in file_to_test
+
+                        if da:
+                            assert 'float ' not in file_to_test
+                        else:
+                            assert 'double ' not in file_to_test
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab