diff --git a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..934ab64ff3c49752d05ecfe99b39699b4b0272b3
--- /dev/null
+++ b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.cpp
@@ -0,0 +1,187 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file 02_LBMLatticeModelGeneration.cpp
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/all.h"
+
+#include "core/all.h"
+
+#include "domain_decomposition/all.h"
+
+#include "field/all.h"
+
+#include "geometry/all.h"
+
+#include "lbm/boundary/factories/DefaultBoundaryHandling.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/field/initializer/all.h"
+#include "lbm/vtk/VTKOutput.h"
+
+#include "timeloop/all.h"
+
+//    Codegen Includes
+#include "SRTLatticeModel.h"
+#include "SRTPackInfo.h"
+
+namespace walberla
+{
+///////////////////////
+/// Typedef Aliases ///
+///////////////////////
+
+// Typedef alias for the lattice model
+typedef lbm::SRTLatticeModel LatticeModel_T;
+
+// Communication Pack Info
+typedef pystencils::SRTPackInfo PackInfo_T;
+
+// Also set aliases for the stencils involved...
+typedef LatticeModel_T::Stencil Stencil_T;
+typedef LatticeModel_T::CommunicationStencil CommunicationStencil_T;
+
+// ... and for the required field types.
+typedef lbm::PdfField< LatticeModel_T > PdfField_T;
+
+// Also, for boundary handling, a flag data type and flag field are required.
+typedef walberla::uint8_t flag_t;
+typedef FlagField< flag_t > FlagField_T;
+typedef lbm::DefaultBoundaryHandlingFactory< LatticeModel_T, FlagField_T > BHFactory;
+
+/////////////////////////////////////////
+/// Shear Flow Initialization Functor ///
+/////////////////////////////////////////
+
+struct ShearFlowInit
+{
+ private:
+   lbm::initializer::ExprSystemInitFunction exprInitFunc_;
+   const real_t noiseMagnitude_;
+   math::RealRandom< real_t > rng_;
+
+ public:
+   ShearFlowInit(const shared_ptr< StructuredBlockForest >& blocks, const Config::BlockHandle& setup)
+      : exprInitFunc_(blocks), noiseMagnitude_(setup.getParameter< real_t >("noise_magnitude")),
+        rng_(setup.getParameter< std::mt19937::result_type >("noise_seed", 42))
+   {
+      if (!exprInitFunc_.parse(setup)) { WALBERLA_ABORT("Shear Flow Setup was incomplete."); }
+   }
+
+   std::vector< real_t > operator()(const Cell& globalCell)
+   {
+      std::vector< real_t > densityAndVelocity = exprInitFunc_(globalCell);
+      real_t yPerturbation                     = noiseMagnitude_ * rng_();
+      densityAndVelocity[2] += yPerturbation;
+
+      return densityAndVelocity;
+   }
+};
+
+/////////////////////
+/// Main Function ///
+/////////////////////
+
+int main(int argc, char** argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+
+   if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!"); }
+
+   ///////////////////////////////////////////////////////
+   /// Block Storage Creation and Simulation Parameter ///
+   ///////////////////////////////////////////////////////
+
+   auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config());
+
+   // read parameters
+   auto parameters = walberlaEnv.config()->getOneBlock("Parameters");
+
+   const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10));
+   const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.8));
+   const double remainingTimeLoggerFrequency =
+      parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
+
+   ///////////////////
+   /// Field Setup ///
+   ///////////////////
+
+   LatticeModel_T latticeModel = LatticeModel_T(omega);
+   BlockDataID pdfFieldId      = lbm::addPdfFieldToStorage(blocks, "pdf field", latticeModel, field::fzyx);
+   BlockDataID flagFieldId     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+   ////////////////////////
+   /// Shear Flow Setup ///
+   ////////////////////////
+
+   auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup");
+   ShearFlowInit shearFlowInitFunc(blocks, shearFlowSetup);
+   lbm::initializer::PdfFieldInitializer< LatticeModel_T > fieldInit(pdfFieldId, blocks);
+   fieldInit.initDensityAndVelocity(shearFlowInitFunc);
+
+   /////////////////////////
+   /// Boundary Handling ///
+   /////////////////////////
+
+   const FlagUID fluidFlagUID("Fluid");
+
+   auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries");
+
+   BlockDataID boundaryHandlingId =
+      BHFactory::addBoundaryHandlingToStorage(blocks, "boundary handling", flagFieldId, pdfFieldId, fluidFlagUID,
+                                              Vector3< real_t >(), Vector3< real_t >(), real_c(0.0), real_c(0.0));
+
+   geometry::initBoundaryHandling< BHFactory::BoundaryHandling >(*blocks, boundaryHandlingId, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< BHFactory::BoundaryHandling >(*blocks, boundaryHandlingId);
+
+   /////////////////
+   /// Time Loop ///
+   /////////////////
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   // Communication
+   blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > communication(blocks);
+   communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+
+   // Timeloop
+   timeloop.add() << BeforeFunction(communication, "communication")
+                  << Sweep(BHFactory::BoundaryHandling::getBlockSweep(boundaryHandlingId), "Boundary Handling");
+   timeloop.add() << Sweep(LatticeModel_T::Sweep(pdfFieldId), "LB stream & collide");
+
+   // Stability Checker
+   timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
+                                    walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID)),
+                                 "LBM stability check");
+
+   // Time logger
+   timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+                                 "remaining time logger");
+
+   // LBM VTK Output
+   lbm::VTKOutput< LatticeModel_T, FlagField_T >::addToTimeloop(timeloop, blocks, walberlaEnv.config(), pdfFieldId,
+                                                                flagFieldId, fluidFlagUID);
+
+   timeloop.run();
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
\ No newline at end of file
diff --git a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.dox b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.dox
new file mode 100644
index 0000000000000000000000000000000000000000..51e697cd20af8d49ca7a557cd8ee8df14a44f977
--- /dev/null
+++ b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.dox
@@ -0,0 +1,220 @@
+namespace walberla{
+
+/**
+\page tutorial_codegen02 Tutorial - Code Generation 2: Lattice Model Generation with lbmpy
+
+\section overview Overview
+
+This tutorial demonstrates how to use <a href="https://pycodegen.pages.i10git.cs.fau.de/lbmpy" target="_blank">lbmpy</a> with waLBerla to generate efficient implementations of Lattice Boltzmann Methods (LBM) to be included in large-scale distributed memory fluid flow simulations. While waLBerla provides an advanced framework for setting up and running complex fluid simulations, lbmpy brings the flexibility to generate highly optimized LBMs for CPUs and GPUs. Manually writing an efficient C++ or GPU implementation of an advanced LBM can be very cumbersome, especially because the compute kernel needs to be optimized for specific hardware. For this reason, lbmpy was developed. It is a Python framework which allows defining a set of LB equations at an abstract level, thus enabling development on the mathematical description of the problem directly and then generate a highly optimized C, OpenCL or CUDA implementation of these equations. An introduction to lbmpy can be found <a href="https://pycodegen.pages.i10git.cs.fau.de/lbmpy/notebooks/00_tutorial_lbmpy_walberla_overview.html" target="_blank">here</a>.
+
+As in the previous tutorial (\ref tutorial_codegen01), we will first define the numeric methods and set up code generation in a Python script. We will then include the generated code in a waLBerla application to simulate a two-dimensional shear flow in a periodic domain. Additionally, it will be shown how a waLBerla simulation with complex initial conditions can be initialized using parameter files.
+
+\section latticemodelcodegen_python Setting up code generation in Python
+
+In this section, we will define a single relaxation time (SRT) collision operator in Python using lbmpy and generate a waLBerla lattice model implementation from its equations. A lattice model defines several things, including:
+- The **velocity set** together with its weights. For our two-dimensional domain, we will be using the D2Q9 velocity set.
+- The **LBM sweep** which consists of the collision operator and the streaming pattern. We will be using a standard pull-scheme with a temporary field.
+- The **force model**. Since no forces will be present in our simulation, we will not use a force model in this tutorial. For details about force models, see <a href="https://pycodegen.pages.i10git.cs.fau.de/lbmpy/sphinx/forcemodels.html" target="_blank">lbmpy's documentation on force models</a>.
+
+In addition to the lattice model, it will be shown how to generate a method-specific pack info class for MPI communication to reduce communication load to the necessary minimum.
+
+In the code generation python script, we first require a few imports from lbmpy itself and from the waLBerla code generation libraries. lbmpy code generation is based on pystencils; the basic procedure is thus the same as in the previous tutorial. We need the `CodeGeneration` context from the `pystencils_walberla` module for the connection to the build system. For generating the communication pack info, we will use `generate_pack_info_from_kernel` from the same module. This method of pack info generation is not limited to LBM implementations but can be used with any sweep kernel. The function `generate_pack_info_from_kernel` takes a pystencils `AssignmentCollection` and extracts all field accesses to determine which cell entries need to be communicated.
+
+From the `lbmpy.creationfunctions` we require the functions to create collision and update rules. For the actual code generation, `generate_lattice_model` from `lbmpy_walberla` is required. Since we will define symbols, `SymPy` is also needed.
+
+\code
+import sympy as sp
+
+from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_update_rule
+
+from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel
+from lbmpy_walberla import generate_lattice_model
+\endcode
+
+First, we define a few general parameters. These include the stencil (D2Q9) and the memory layout (`fzyx`, see \ref tutorial_codegen01 ). We define a SymPy symbol for the relaxation rate \f$ \omega \f$. This means we can later set it to a specific value from the waLBerla code. A dictionary with optimization parameters is also set up. Here, we enable global common subexpression elimination (`cse_global`) and set the PDF field's memory layout.
+\code
+stencil = 'D2Q9'
+omega = sp.Symbol('omega')
+layout = 'fzyx'
+
+#   Optimization
+optimizations = {'target': 'cpu', 'cse_global': True, 'field_layout': layout}
+\endcode
+
+Next, we set the parameters for the SRT method in a dictionary and create both the collision and update rules by calling the respective lbmpy functions. They both return an `AssignmentCollection` containing all necessary equations. The only parameters needed for SRT are the stencil and the relaxation rate. For generating the lattice model, we only require the collision rule's equations since `generate_lattice_model` adds the two-fields pull scheme for the streaming step internally. At this point, the lattice model generation is limited to the standard stream-pull-collide scheme.
+
+The update rule is still needed in the code generation process; namely for the pack info generation. The collision step only acts within one cell. Thus, the collision rule's equations contain no neighbour accesses. Calling `create_lb_update_rule` inserts the two-fields pull scheme as `generate_lattice_model`, and resulting update rule contains exactly those neighbour accesses which are required for `generate_pack_info_from_kernel` to build the optimized pack info.
+
+\code
+srt_params = {'stencil': stencil,
+              'method': 'srt',
+              'relaxation_rate': omega}
+
+srt_collision_rule = create_lb_collision_rule(optimization=optimizations, **srt_params)
+srt_update_rule = create_lb_update_rule(collision_rule=srt_collision_rule, optimization=optimizations)
+\endcode
+
+Finally, we create the code generation context and call the respective functions for generating the lattice model and the pack info. Both require the context and a class name as parameters. To  `generate_lattice_model`, we also pass the collision rule and the field layout; `generate_pack_info_from_kernel` receives the update rule.
+
+\code
+with CodeGeneration() as ctx:
+    generate_lattice_model(ctx, "SRTLatticeModel", srt_collision_rule, field_layout=layout)
+    generate_pack_info_from_kernel(ctx, "SRTPackInfo", srt_update_rule)
+\endcode
+
+Notice that, other than in \ref tutorial_codegen01, we did not need to define any fields. Both the source and destination PDF fields are created internally by lbmpy and `generate_lattice_model`.
+
+Furthermore, if we optimise the waLBerla for the machine, it is compiled on with the CMake flag `OPTIMIZE_FOR_LOCALHOST`, the code generator automatically introduces vector intrinsics in the kernel files. Available intrinsics sets are `sse`, `avx` and `avx512`. These sets can be passed manually with the argument `cpu_vectorize_info`. More information on CPU optimisations available in `lbmpy` can be found <a href="https://pycodegen.pages.i10git.cs.fau.de/lbmpy/sphinx/kernelcreation.html" target="_blank">here</a>. By installing the `cpu_vectorize_info` package, it is also possible for `lbmpy` to automatically determine the support intrinsics set of the hardware.
+
+As a final touch, we still need to set up the CMake build target for the code generation script. This time, two distinct classes (the lattice model and the pack information) will be generated. Therefore, we need to list the header and source file names for both classes separately.
+
+\code
+walberla_generate_target_from_python( NAME 02_LBMLatticeModelGenerationPython
+                                      FILE 02_LBMLatticeModelGeneration.py
+                                      OUT_FILES SRTLatticeModel.cpp SRTLatticeModel.h
+                                                SRTPackInfo.cpp SRTPackInfo.h )
+\endcode
+
+This completes the code generation part.
+
+\section lbmpy_simulation_app The Simulation application
+
+This section is concerned with the implementation of a waLBerla C++ application for the simulation of a periodic shear flow using the generated SRT lattice model and pack info. After adding the code generation target defined above as a dependency to the application's CMake target, we can include both generated classes:
+
+\code
+#include "SRTLatticeModel.h"
+#include "SRTPackInfo.h"
+\endcode
+
+For convenience, we define typedef aliases for the several types and templates that will later be used. Since our generated LBM implementation is consistent with the waLBerla LatticeModel API, we can use the lbm::PdfField class template for our PDF fields.
+
+\code
+// Typedef alias for the lattice model
+typedef lbm::SRTLatticeModel LatticeModel_T;
+
+// Communication Pack Info
+typedef pystencils::SRTPackInfo PackInfo_T;
+
+// Also set aliases for the stencils involved...
+typedef LatticeModel_T::Stencil Stencil_T;
+typedef LatticeModel_T::CommunicationStencil CommunicationStencil_T;
+
+// ... and for the required field types.
+typedef lbm::PdfField< LatticeModel_T > PdfField_T;
+
+// Also, for boundary handling, a flag data type and flag field are required.
+typedef walberla::uint8_t flag_t;
+typedef FlagField< flag_t > FlagField_T;
+typedef lbm::DefaultBoundaryHandlingFactory< LatticeModel_T, FlagField_T > BHFactory;
+\endcode
+
+The application will be mostly similar to \ref tutorial_lbm01, so we will mainly focus on the differences here. It will also be configured by a parameter file as described in the other tutorial.
+
+\subsection lbmpy_add_lattice_model_sweep Adding the generated sweep
+
+The only significant difference caused by the usage of a generated lattice model is when the LBM sweep is added to the timeloop. The sweep is exposed as a static member of the generated lattice model class and can be added like this:
+
+\code
+timeloop.add() << Sweep(LatticeModel_T::Sweep(pdfFieldId), "LB stream & collide");
+\endcode
+
+The remaining extensions concern only the setup of boundaries and the initial velocities.
+
+\subsection lbmpy_shear_flow_init Setup of the Shear Flow Scenario
+
+We will set up a shear flow scenario in a rectangular, two-dimensional domain which is periodic in the x-direction and limited by NoSlip-boundaries (i.e. walls) to the north and south. The fluid will be moving rightwards in the northern and southern parts of the domain, and leftwards in the middle with the same speed. Its velocity in the y-direction will be slightly perturbed by random noise, which will cause the development of vortices between the shear layers during the simulation.
+
+For simplicity, the boundaries are set up using the lbm::DefaultBoundaryHandlingFactory as described in \ref tutorial_lbm01.
+
+\code
+BlockDataID boundaryHandlingId =
+      BHFactory::addBoundaryHandlingToStorage(blocks, "boundary handling", flagFieldId, pdfFieldId, fluidFlagUID,
+                                              Vector3< real_t >(), Vector3< real_t >(), real_c(0.0), real_c(0.0));
+\endcode
+
+In the parameter file, the boundary block only defines the NoSlip boundaries. Also, in the `DomainSetup` block, the domain needs to be periodic in the x-direction.
+
+\code
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  300, 80, 1 >;
+   periodic      <  1,    0, 1 >;
+}
+
+(...)
+
+Boundaries
+{
+   Border { direction S,N; walldistance -1; NoSlip {} }
+}
+\endcode
+
+The velocity initialization can be defined directly in the parameter file. We will be using the lbm::initializer::PdfFieldInitializer class with lbm::initializer::ExprSystemInitFunction which is capable of parsing mathematical expressions from the parameter file to set up complex initial flows. For this purpose, waLBerla uses the [exprtk](http://www.partow.net/programming/exprtk/index.html) C++ library. We will need to extend this functionality a little to introduce the random noise.
+
+The PdfFieldInitializer's `initDensityAndVelocity` function expects a function of type `std::vector< real_t > (const Cell&)`. This function will receive a `Cell` with global coordinates and should return a `std::vector` with four entries: One density and three cartesian velocity components for the given cell. For this purpose, we create a functor struct with these members:
+
+- An instance `exprInitFunc_` of lbm::initializer::ExprSystemInitFunction for initializing the x-velocities;
+- A random number generator `rng_` for the y-velocities, which is an instance of walberla::math::RealRandom;
+- The magnitude of the random noise.
+
+All members will be set using parameters specified in a parameter file block. The functor `exprInitFunc_` is initialized by calling its `parse` method inside the constructor.
+
+We also define the `operator()` with the above signature. It first calls `exprInitFunc_` to set density and velocity, and then adds the random perturbation to the y component.
+
+\code
+struct ShearFlowInit
+{
+ private:
+   lbm::initializer::ExprSystemInitFunction exprInitFunc_;
+   const real_t noiseMagnitude_;
+   math::RealRandom< real_t > rng_;
+
+ public:
+   ShearFlowInit(const shared_ptr< StructuredBlockForest >& blocks, const Config::BlockHandle& setup)
+      : exprInitFunc_(blocks), noiseMagnitude_(setup.getParameter< real_t >("noise_magnitude")),
+        rng_(setup.getParameter< std::mt19937::result_type >("noise_seed", 42))
+   {
+      if (!exprInitFunc_.parse(setup)) { WALBERLA_ABORT("Shear Flow Setup was incomplete."); }
+   }
+
+   std::vector< real_t > operator()(const Cell& globalCell)
+   {
+      std::vector< real_t > densityAndVelocity = exprInitFunc_(globalCell);
+      real_t yPerturbation                     = noiseMagnitude_ * rng_();
+      densityAndVelocity[2] += yPerturbation;
+
+      return densityAndVelocity;
+   }
+};
+\endcode
+
+The required parameters and expressions for initializing the density and velocity are defined in the parameter file. The block is called `ShearFlowSetup`. For `rho`, `u_x`, `u_y` and `u_z`, mathematical expressions can be specified which may include the variables `x`, `y`, `z` for a cell's global position and `n_x`, `n_y`, `n_z` representing the number of cells in each direction. These expressions will be evaluated for each domain cell.
+
+A seed for the random number generator is also specified, which controls the random noise and therefore makes the test case reproducible.
+
+\code
+ShearFlowSetup
+{
+   rho   1;
+
+   u_x   if( ( y / n_y < 0.3 ) or (y / n_y > 0.7) , 0.08, -0.08);
+   u_y   0;
+   u_z   0;
+
+   noise_magnitude     0.005;
+   noise_seed          42;
+}
+\endcode
+
+This completes the C++ implementation. It will produce VTK files which can then be viewed using ParaView, as described in previous tutorials.
+
+\section lbmpy_lattice_model_outlook Outlook
+
+Although generated lattice models are easy to define and easy to integrate with the waLBerla framework, they are somewhat limited in features. For example, the waLBerla code generation methods for lattice models do not allow advanced streaming patterns or the generation of CUDA code. This is remedied by generating sweeps using `generate_sweep` directly instead of `generate_lattice_model`. The result is a generic waLBerla sweep which is not consistent with the lattice model API, thus making implementations a little more complicated since many class templates created for the use with lattice models can not be used. For this problem too, code generation offers solutions which will be further explained in the next tutorial.
+
+\tableofcontents
+
+*/
+
+}
\ No newline at end of file
diff --git a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.prm b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.prm
new file mode 100644
index 0000000000000000000000000000000000000000..004374b2372873bc4af8a7114c00c3e4fe697d9c
--- /dev/null
+++ b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.prm
@@ -0,0 +1,64 @@
+
+Parameters 
+{
+    omega           1.8;
+	timesteps       10000;
+
+	remainingTimeLoggerFrequency 3; // in seconds
+}
+
+ShearFlowSetup
+{
+   rho   1;
+   
+   u_x   if( ( y / n_y < 0.3 ) or (y / n_y > 0.7) , 0.08, -0.08);
+   u_y   0;
+   u_z   0;
+
+   noise_magnitude     0.005;
+   noise_seed          42;
+}
+
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  300, 80, 1 >;
+   periodic      <  1,    0, 1 >;  
+}
+
+StabilityChecker
+{
+   checkFrequency 1;
+   streamOutput   false;
+   vtkOutput      true;
+}
+
+Boundaries 
+{   
+	Border { direction S, N; walldistance -1; NoSlip {} }
+}
+
+
+VTK 
+{
+   // for parameter documentation see src/vtk/Initialization.cpp
+   srt_velocity_field
+   {
+      writeFrequency 100;
+      ghostLayers    1;
+
+      baseFolder     vtk_out/srt_lattice_model;
+      
+      before_functions {
+         PDFGhostLayerSync;
+      }
+      
+      inclusion_filters {
+         DomainFilter;
+      }
+      
+      writers {
+         Velocity;
+      }
+   }
+}
diff --git a/apps/tutorials/codegen/02_LBMLatticeModelGeneration.py b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.py
new file mode 100644
index 0000000000000000000000000000000000000000..39deba33e739693965d7bc8119e013265342dbdf
--- /dev/null
+++ b/apps/tutorials/codegen/02_LBMLatticeModelGeneration.py
@@ -0,0 +1,38 @@
+import sympy as sp
+
+from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_update_rule
+
+from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel
+from lbmpy_walberla import generate_lattice_model
+
+#   ========================
+#      General Parameters
+#   ========================
+
+stencil = 'D2Q9'
+omega = sp.Symbol('omega')
+layout = 'fzyx'
+
+#   Optimizations to be used by the code generator
+optimizations = {'cse_global': True, 'field_layout': layout}
+
+#   ===========================
+#      SRT Method Definition
+#   ===========================
+
+srt_params = {'stencil': stencil,
+              'method': 'srt',
+              'relaxation_rate': omega}
+
+srt_collision_rule = create_lb_collision_rule(optimization=optimizations, **srt_params)
+srt_update_rule = create_lb_update_rule(collision_rule=srt_collision_rule, optimization=optimizations)
+
+#   =====================
+#      Code Generation
+#   =====================
+
+with CodeGeneration() as ctx:
+    # generation of the lattice model ...
+    generate_lattice_model(ctx, "SRTLatticeModel", srt_collision_rule, field_layout=layout)
+    # ... and generation of the pack information to be used for the MPI communication
+    generate_pack_info_from_kernel(ctx, "SRTPackInfo", srt_update_rule)
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f24dfffe59f267ae39d80bcdaeddb71a8db73de
--- /dev/null
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
@@ -0,0 +1,250 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file 03_AdvancedLBMCodegen.cpp
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/all.h"
+
+#include "core/all.h"
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#   include "cuda/AddGPUFieldToStorage.h"
+#   include "cuda/DeviceSelectMPI.h"
+#   include "cuda/HostFieldAllocator.h"
+#   include "cuda/NVTX.h"
+#   include "cuda/ParallelStreams.h"
+#   include "cuda/communication/GPUPackInfo.h"
+#   include "cuda/communication/UniformGPUScheme.h"
+#endif
+
+#include "domain_decomposition/all.h"
+
+#include "field/all.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/all.h"
+
+#include "stencil/D2Q9.h"
+
+#include "timeloop/all.h"
+
+//    Codegen Includes
+#include "CumulantMRTNoSlip.h"
+#include "CumulantMRTPackInfo.h"
+#include "CumulantMRTSweep.h"
+#include "InitialPDFsSetter.h"
+
+namespace walberla
+{
+///////////////////////
+/// Typedef Aliases ///
+///////////////////////
+
+// Communication Pack Info
+typedef pystencils::CumulantMRTPackInfo PackInfo_T;
+
+// LB Method Stencil
+typedef stencil::D2Q9 Stencil_T;
+
+// PDF field type
+typedef field::GhostLayerField< real_t, Stencil_T::Size > PdfField_T;
+
+// Velocity Field Type
+typedef field::GhostLayerField< real_t, Stencil_T::D > VectorField_T;
+
+// Boundary Handling
+typedef walberla::uint8_t flag_t;
+typedef FlagField< flag_t > FlagField_T;
+typedef lbm::CumulantMRTNoSlip NoSlip_T;
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+typedef cuda::GPUField< double > GPUField;
+#endif
+
+//////////////////////////////////////////
+/// Shear Flow Velocity Initialization ///
+//////////////////////////////////////////
+
+void initShearFlowVelocityField(const shared_ptr< StructuredBlockForest >& blocks, const BlockDataID& velocityFieldId,
+                                const Config::BlockHandle& config)
+{
+   math::RealRandom< real_t > rng(config.getParameter< std::mt19937::result_type >("noiseSeed", 42));
+
+   real_t velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08));
+   real_t noiseMagnitude    = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude);
+
+   real_t n_y = real_c(blocks->getNumberOfYCells());
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto u = (*blockIt).getData< VectorField_T >(velocityFieldId);
+
+      for (auto cellIt = u->beginWithGhostLayerXYZ(); cellIt != u->end(); ++cellIt)
+      {
+         Cell globalCell(cellIt.cell());
+         blocks->transformBlockLocalToGlobalCell(globalCell, *blockIt);
+
+         real_t relative_y = real_c(globalCell.y()) / n_y;
+
+         u->get(cellIt.cell(), 0) = relative_y < 0.3 || relative_y > 0.7 ? velocityMagnitude : -velocityMagnitude;
+
+         u->get(cellIt.cell(), 1) = noiseMagnitude * rng();
+      }
+   }
+}
+
+/////////////////////
+/// Main Function ///
+/////////////////////
+
+int main(int argc, char** argv)
+{
+   walberla::Environment walberlaEnv(argc, argv);
+
+   if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!"); }
+
+   ///////////////////////////////////////////////////////
+   /// Block Storage Creation and Simulation Parameter ///
+   ///////////////////////////////////////////////////////
+
+   auto blocks = blockforest::createUniformBlockGridFromConfig(walberlaEnv.config());
+
+   // read parameters
+   auto parameters = walberlaEnv.config()->getOneBlock("Parameters");
+
+   const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(10));
+   const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.8));
+   const double remainingTimeLoggerFrequency =
+      parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
+   const uint_t VTKwriteFrequency = parameters.getParameter< uint_t >("VTKwriteFrequency", 1000);
+
+   ////////////////////////////////////
+   /// PDF Field and Velocity Setup ///
+   ////////////////////////////////////
+
+   // Common Fields
+   BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
+   BlockDataID flagFieldId     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   // GPU Field for PDFs
+   BlockDataID pdfFieldId = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >(
+      blocks, "pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t(1));
+
+   // GPU Velocity Field
+   BlockDataID velocityFieldIdGPU =
+      cuda::addGPUFieldToStorage< VectorField_T >(blocks, velocityFieldId, "velocity on GPU", true);
+#else
+   // CPU Field for PDFs
+   BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx);
+#endif
+
+   // Velocity field setup
+   auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup");
+   initShearFlowVelocityField(blocks, velocityFieldId, shearFlowSetup);
+
+   real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0));
+
+   // pdfs setup
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::fieldCpy< GPUField, VectorField_T >(blocks, velocityFieldIdGPU, velocityFieldId);
+   pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho);
+#else
+   pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho);
+#endif
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      pdfSetter(&(*blockIt));
+   }
+
+   /////////////
+   /// Sweep ///
+   /////////////
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega);
+#else
+   pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega);
+#endif
+
+   /////////////////////////
+   /// Boundary Handling ///
+   /////////////////////////
+
+   const FlagUID fluidFlagUID("Fluid");
+
+   auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries");
+
+   NoSlip_T noSlip(blocks, pdfFieldId);
+
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
+
+   /////////////////
+   /// Time Loop ///
+   /////////////////
+
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+
+   // Communication
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::communication::UniformGPUScheme< Stencil_T > com(blocks, 0);
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+   auto communication = std::function< void() >([&]() { com.communicate(nullptr); });
+#else
+   blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
+   communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+#endif
+
+   // Timeloop
+   timeloop.add() << BeforeFunction(communication, "communication") << Sweep(noSlip);
+   timeloop.add() << Sweep(CumulantMRTSweep);
+
+   // Time logger
+   timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+                                 "remaining time logger");
+
+   if (VTKwriteFrequency > 0)
+   {
+      const std::string path = "vtk_out/tut03";
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0,
+                                                      false, path, "simulation_step", false, true, true, false, 0);
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      // Copy velocity data to CPU before output
+      vtkOutput->addBeforeFunction(
+         [&]() { cuda::fieldCpy< VectorField_T, GPUField >(blocks, velocityFieldId, velocityFieldIdGPU); });
+#endif
+
+      auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity");
+      vtkOutput->addCellDataWriter(velWriter);
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+   }
+
+   timeloop.run();
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
\ No newline at end of file
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox b/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox
new file mode 100644
index 0000000000000000000000000000000000000000..9eae32554b8a306b8e5a1defdff4b95c0e363fa0
--- /dev/null
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.dox
@@ -0,0 +1,244 @@
+namespace walberla{
+
+/**
+\page tutorial_codegen03 Tutorial - Code Generation 3: Advanced LBM Code Generation
+
+\section overview Overview
+
+This tutorial demonstrates how to use [pystencils](https://pycodegen.pages.i10git.cs.fau.de/pystencils) and [lbmpy](https://pycodegen.pages.i10git.cs.fau.de/lbmpy) to generate highly optimised and hardware-specific Lattice Boltzmann simulation code within the waLBerla framework. Other than in \ref tutorial_codegen02, we will be generating a full LBM sweep instead of a lattice model class. Furthermore, we will generate a communication pack info class and a sweep to initialise the PDF field. A hardware-specific implementation of a NoSlip boundary handler will also be generated. Those components will then be combined in a waLBerla application for simulating the same shear flow scenario as in the previous tutorial.
+
+For large-scale LB simulations, the highly parallel design of a general-purpose graphics processing unit (GPGPU) can yield significant improvements in performance. The waLBerla framework relies on CUDA to run simulations on NVIDIA GPUs. In this tutorial, we will also show how code generation can be used to generate native CUDA code for different kinds of kernels.
+
+In this tutorial, we will be using the more advanced cumulant-based multiple-relaxation-time (MRT) collision operator. Instead of relaxing the entire distribution functions toward their equilibrium values, their [cumulants](https://en.wikipedia.org/wiki/Cumulant) are relaxed with individual relaxation rates. We will also use the D2Q9 velocity set. For this velocity set, the zeroth- and first-order cumulants correspond to density and momentum which are conserved during collisions, so their relaxation rates can be set to zero. We will specify one common relaxation rate \f$ \omega \f$ for the three second-order cumulants to ensure the correct viscosity of the fluid; the higher-order cumulants will be set to their equilibrium values which correspond to a relaxation rate of 1.
+
+\section advancedlbmcodegen_python Code Generation in Python
+
+We start by defining some general parameters as well as [SymPy](https://www.sympy.org/) symbols and pystencils fields for later usage. Those include the stencil (`D2Q9`), the memory layout (`fzyx`, see \ref tutorial_codegen01) and the symbol \f$ \omega \f$ for the relaxation rate of the second-order cumulants.
+
+For the stream-pull-collide type kernel, we need two PDF fields which we set up using pystencils. We will use the built-in field swapping functionality of `generate_sweep`. It will generate the pointer swapping for the temporary destination field with the source field after each time step. This requires us to obey a specific naming convention: The temporary field *must* have the same *field name* (passed as a string to `ps.fields`) as the original field but with a suffix `_tmp`. Otherwise, an error will occur during compilation.
+
+For VTK output and the initial velocity setup, we define a velocity vector field as an output field for the LB method.
+
+\code
+stencil = 'D2Q9'
+omega = sp.Symbol('omega')
+layout = 'fzyx'
+
+#   PDF Fields
+pdfs, pdfs_tmp = ps.fields('pdfs(9), pdfs_tmp(9): [2D]', layout=layout)
+
+#   Velocity Output Field
+velocity = ps.fields("velocity(2): [2D]", layout=layout)
+output = {'velocity': velocity}
+
+#   Optimization
+optimization = {'cse_global': True,
+                'symbolic_field': pdfs,
+                'symbolic_temporary_field': pdfs_tmp,
+                'field_layout': layout}
+\endcode
+
+We set up the cumulant-based MRT method with relaxation rates as described above. We use `generate_lb_update_rule` from lbmpy to derive the set of equations describing the collision operator together with the *pull* streaming pattern. These equations define the entire LBM sweep.
+
+\code
+lbm_params = {'stencil': stencil,
+              'method': 'mrt_raw',
+              'relaxation_rates': [0, 0, 0, omega, omega, omega, 1, 1, 1],
+              'cumulant': True,
+              'compressible': True}
+
+lbm_update_rule = create_lb_update_rule(optimization=optimization,
+                                        output=output,
+                                        **lbm_params)
+
+lbm_method = lbm_update_rule.method
+\endcode
+
+In \ref tutorial_codegen02, we were able to use the framework built around the waLBerla lattice model template API for setting up the shear flow's initial velocity profile. Since we are not using a lattice model class this time, this API is not available to us. With lbmpy, though, we can generate a kernel which takes in scalar values or fields for the initial density and velocity and sets the initial PDF values to the corresponding equilibrium. The function `macroscopic_values_setter` from `lbmpy.macroscopic_value_kernels` returns a set of assignments for this initialization procedure. It takes the LB method definition as an argument, as well as either symbols or pystencils field accesses for the initial density `rho` and the initial velocity. Lastly, it takes the PDF field's centre vector as the destination for the PDF values. We define a separate symbol for the density and use the velocity field defined above.
+
+\code
+initial_rho = sp.Symbol('rho_0')
+
+pdfs_setter = macroscopic_values_setter(lbm_method,
+                                        initial_rho,
+                                        velocity.center_vector,
+                                        pdfs.center_vector)
+\endcode
+
+Everything is now prepared to generate the actual C++ code. We create the code generation context and evaluate the `ctx.cuda` flag to find out if waLBerla is configured to build GPU code. If CUDA is enabled, we set the `target` to `gpu`; otherwise to `cpu`.  The target is then passed to all code generation functions. If GPU code is to be generated, the generated classes will be implemented in `*.cu` files, and their sweeps will run on the GPU.
+
+Several functions from `pystencils_walberla` and `lbmpy_walberla` are called to generate the classes:
+
+- The LBM sweep is generated by the `lbm_update_rule` equations using `generate_sweep`. This function takes an additional parameter `field_swaps` which takes a list of pairs of fields to generate the pointer swapping.
+- The communication pack info is generated using `generate_pack_info_from_kernel` which infers from the update rule's write accesses the PDF indices that need to be communicated. Without further specification, it assumes a pull-type kernel.
+- The PDF initialization kernel is generated from the `pdfs_setter` assignment collection using `generate_sweep`.
+- Using `generate_boundary`, we generate an optimised implementation of a NoSlip boundary handler for the domain's walls.
+
+\code
+with CodeGeneration() as ctx:
+    if ctx.cuda:
+        target = 'gpu'
+    else:
+        target = 'cpu'
+
+    #   LBM Sweep
+    generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target)
+
+    #   Pack Info
+    generate_pack_info_from_kernel(ctx, "CumulantMRTPackInfo", lbm_update_rule, target=target)
+
+    #   Macroscopic Values Setter
+    generate_sweep(ctx, "InitialPDFsSetter", pdfs_setter, target=target)
+
+    #   NoSlip Boundary
+    generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method, target=target)
+\endcode
+
+As in \ref tutorial_codegen02, the classes generated by the above code need to be registered with CMake using the `walberla_generate_target_from_python` macro. Since the source file extension is different if CUDA code is generated (`*.cu` instead of `*.cpp`), the code generation target needs to be added twice. During the build process, the correct target is selected through the surrounding `if(WALBERLA_BUILD_WITH_CUDA)` block. Furthermore, the application depends on `cuda`, which is used from the waLBerla backend.
+
+\section advancedlbmcodegen_application The waLBerla application
+
+We will now integrate the generated classes into a waLBerla application. After adding the code generation target as a CMake dependency, we can include their header files:
+
+\code
+#include "CumulantMRTNoSlip.h"
+#include "CumulantMRTPackInfo.h"
+#include "CumulantMRTSweep.h"
+#include "DensityAndVelocityFieldSetter.h"
+\endcode
+
+We set up typedef aliases for the generated pack info and the D2Q9 stencil. For the PDF and velocity fields, we use instances of the field::GhostLayerField template. The number of entries of the PDF field is specified by the `Stencil_T::Size` parameter. As our domain is two-dimensional, the velocity at each lattice node is a two-dimensional vector. Thus, we set up the velocity field to have two index dimensions passing the stencil's dimension as a template parameter. Finally, we also define a typedef alias for our generated NoSlip boundary.
+
+\code
+// Communication Pack Info
+typedef pystencils::CumulantMRTPackInfo PackInfo_T;
+
+// LB Method Stencil
+typedef stencil::D2Q9 Stencil_T;
+
+// PDF field type
+typedef field::GhostLayerField< real_t, Stencil_T::Size > PdfField_T;
+
+// Velocity Field Type
+typedef field::GhostLayerField< real_t, Stencil_T::D > VectorField_T;
+
+// Boundary Handling
+typedef walberla::uint8_t flag_t;
+typedef FlagField< flag_t > FlagField_T;
+typedef lbm::CumulantMRTNoSlip NoSlip_T;
+\endcode
+
+The PDF field, the velocity field and the flag field which stores information about the cell types for boundary handling are set up using field::addToStorage. The memory layout `fzyx` is explicitly set for the PDF and velocity fields. We set initial values of `0.0` for both of them since they will be explicitly initialised later on.
+
+\subsection advancedlbmpy_app_sweep_and_packinfo Using the generated sweep and pack info
+
+The generated pack info class can be used in exactly the same way as a waLBerla-native pack info. It simply needs to be registered with the communication scheme by calling `addPackInfo`:
+
+\code
+   blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
+   communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+\endcode
+
+Upon construction, the generated sweep class requires the PDF field, the velocity field and the relaxation rate `omega`. The parameter `omega` is read from a parameter file. After creation, the sweep functor can be passed directly to the timeloop.
+
+\code
+   timeloop.add() << Sweep(pystencils::CumulantMRTSweep(pdfFieldId, velocityFieldId, omega));
+\endcode
+
+\subsection advancedlbmpy_app_boundary Adding the NoSlip Boundary
+
+The generated NoSlip boundary works independently of the native waLBerla LBM boundary handling classes, which are templated for the use with lattice models. Still, we use the functions exposed by the geometry module to initialise the flag field from the parameter file. The NoSlip boundary handler is constructed passing the block grid and the PDF field's identifier. After the flag field has been set up, the `fillFromFlagField` method is called to register the boundary cells with the boundary handler.
+
+\code
+   const FlagUID fluidFlagUID("Fluid");
+
+   auto boundariesConfig = walberlaEnv.config()->getOneBlock("Boundaries");
+
+   NoSlip_T noSlip(blocks, pdfFieldId);
+
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
+\endcode
+
+The generated boundary handler holds an internal list of tuples `(x, y, dir)`, the so-called index vector. Each entry corresponds to a fluid cell which is a direct neighbour of a boundary cell. The direction pointing towards the boundary cell and the position is saved for each of these fluid cells. In our scenario, the north and south domain borders are marked as NoSlip-boundaries. The boundary cells are located on the northern and southern ghost layers, so the top and bottom row of fluid cells are neighbours to these boundaries. Thus, in the index vector, for all cells of the bottom row the tuples `(x, 0, 2)`, `(x, 0, 7)` and `(x, 0, 8)` are saved. The indices 2, 7 and 8 correspond to the South, South-East and South-West directions of the D2Q9 lattice, respectively. The boundary sweep then runs over this index vector, reflecting the outgoing populations in the stored directions off the wall according to the NoSlip rule.
+
+The NoSlip sweep can now be added to the timeloop before the LBM sweep. We also register the communication functor as a BeforeFunction of the boundary sweep, since it needs to be run first in every time step.
+
+\code
+   timeloop.add() << BeforeFunction(communication, "communication") << Sweep(noSlip);
+\endcode
+
+\subsection advancedlbmpy_velinit Initialisation of the Velocity Profile
+
+Setting up the shear flow's initial velocity profile consists of two parts. First, the velocity field is initialised to the shear flow. Second, the prepared velocity field is passed to the generated PDF field setter, which then initialises the equilibrium PDF values from the velocities. The density is uniformly set to unity.
+
+The `initShearFlowVelocityField` function takes a pointer to the block storage, the velocity field's identifier and a handle to a parameter file block. It then reads the magnitudes of the velocity's $x$-component and the perturbation in the $y$-direction. For the random noise, a random number generator is created with a seed specified in the parameter file. Then, the function iterates over all blocks (distributed to MPI processes) and cells. A cell's velocity is then set according to its global position.
+
+\code
+void initShearFlowVelocityField(const shared_ptr< StructuredBlockForest >& blocks,
+                                const BlockDataID& velocityFieldId,
+                                const Config::BlockHandle& config)
+{
+   math::RealRandom< real_t > rng(config.getParameter< std::mt19937::result_type >("noiseSeed", 42));
+
+   real_t velocityMagnitude = config.getParameter< real_t >("velocityMagnitude", real_c(0.08));
+   real_t noiseMagnitude    = config.getParameter< real_t >("noiseMagnitude", real_c(0.1) * velocityMagnitude);
+
+   real_t n_y = real_c(blocks->getNumberOfYCells());
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      auto u = (*blockIt).getData< VectorField_T >(velocityFieldId);
+
+      for (auto cellIt = u->beginWithGhostLayerXYZ(); cellIt != u->end(); ++cellIt)
+      {
+         Cell globalCell(cellIt.cell());
+         blocks->transformBlockLocalToGlobalCell(globalCell, *blockIt);
+
+         real_t relative_y = real_c(globalCell.y()) / n_y;
+
+         u->get(cellIt.cell(), 0) = relative_y < 0.3 || relative_y > 0.7 ? velocityMagnitude : -velocityMagnitude;
+
+         u->get(cellIt.cell(), 1) = noiseMagnitude * rng();
+      }
+   }
+}
+\endcode
+
+After the velocity field has been initialised, the generated `InitialPDFsSetter` sweep is created and run once on each block. After this step, the PDF field will be initialised with the correct equilibrium distributions for the first iteration.
+
+\code
+   // Velocity field setup
+   auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup");
+   initShearFlowVelocityField(blocks, velocityFieldId, shearFlowSetup);
+
+   real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0));
+
+   // pdfs setup
+   pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho);
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      pdfSetter(&(*blockIt));
+   }
+\endcode
+
+The simulation is now ready to run.
+
+\subsection advancedlbmpy_cuda Differences in the GPU application
+
+If CUDA is enabled, some implementation details need to be different from a CPU-only version. This mainly concerns the creation and management of fields, MPI communication and VTK output. Since the initialisation, LBM and NoSlip sweeps run entirely on the GPU, the PDF field has to be set up only in graphics memory. In contrast to that is the velocity field required by CPU and GPU. The shear flow velocity profile is constructed by CPU code before the initialisation kernel maps it onto the PDF field on the GPU. Also, the VTK output routines which run on the CPU need to read the velocity field. It thus needs to be created twice: Once in the main memory, and once in GPU memory. It is then copied on-demand from the GPU to the CPU. Furthermore, we create a flag field, which is only needed on the CPU. After the initialisation, we use it to create the index-vectors for the boundary-handling. The index vectors are then transferred to the GPU and not the entire flag field.
+
+For the largest part, though, the C++ code is identical. The code snippets presented above represent only the CPU variant of the code. The GPU implementation can be found in the source file 03_AdvancedLBMCodegen.cpp. There, code blocks which are different from the CPU to the GPU implementation are toggled via preprocessor conditionals.
+
+\section advancedlbmpy_conclusion Conclusion and Outlook
+
+We have now successfully implemented a waLBerla LBM simulation application with an advanced collision operator, which can be specialised for both CPU and GPU hardware targets. Other possible extensions would be the use of advanced streaming patterns like the AA-pattern or EsoTwist to reduce the simulation's memory footprint. Also, lbmpy gives us the tools to develop advanced lattice Boltzmann methods for many kinds of applications. Thus, the basic principles demonstrated in these tutorials can be used for creating more complicated simulations with optimised lattice Boltzmann code.
+
+\tableofcontents
+
+*/
+
+}
\ No newline at end of file
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm b/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm
new file mode 100644
index 0000000000000000000000000000000000000000..d292c02d13450a5ae947f3284ded79baf2348538
--- /dev/null
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm
@@ -0,0 +1,38 @@
+
+Parameters 
+{
+	omega           1.8;
+	timesteps       10001;
+
+	remainingTimeLoggerFrequency 3; // in seconds
+	VTKwriteFrequency 1000;
+}
+
+ShearFlowSetup
+{
+   rho   1.0;
+   
+   velocityMagnitude   0.08;
+   noiseMagnitude     0.005;
+
+   noiseSeed            42;
+}
+
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  300, 80, 1 >;
+   periodic      <  1,    0, 1 >;  
+}
+
+StabilityChecker
+{
+   checkFrequency 1;
+   streamOutput   false;
+   vtkOutput      true;
+}
+
+Boundaries 
+{   
+	Border { direction S,N; walldistance -1; flag NoSlip; }		
+}
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..a397909805f5fb18f1f1d3a9235717219d7bdc7d
--- /dev/null
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py
@@ -0,0 +1,81 @@
+import sympy as sp
+import pystencils as ps
+
+from lbmpy.creationfunctions import create_lb_update_rule
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.boundaries import NoSlip
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
+from lbmpy_walberla import generate_boundary
+
+
+#   ========================
+#      General Parameters
+#   ========================
+
+stencil = 'D2Q9'
+omega = sp.Symbol('omega')
+layout = 'fzyx'
+
+#   PDF Fields
+pdfs, pdfs_tmp = ps.fields('pdfs(9), pdfs_tmp(9): [2D]', layout=layout)
+
+#   Velocity Output Field
+velocity = ps.fields("velocity(2): [2D]", layout=layout)
+output = {'velocity': velocity}
+
+#   Optimization
+optimization = {'cse_global': True,
+                'symbolic_field': pdfs,
+                'symbolic_temporary_field': pdfs_tmp,
+                'field_layout': layout}
+
+
+#   ==================
+#      Method Setup
+#   ==================
+
+lbm_params = {'stencil': stencil,
+              'method': 'mrt_raw',
+              'relaxation_rates': [0, 0, 0, omega, omega, omega, 1, 1, 1],
+              'cumulant': True,
+              'compressible': True}
+
+lbm_update_rule = create_lb_update_rule(optimization=optimization,
+                                        output=output,
+                                        **lbm_params)
+
+lbm_method = lbm_update_rule.method
+
+#   ========================
+#      PDF Initialization
+#   ========================
+
+initial_rho = sp.Symbol('rho_0')
+
+pdfs_setter = macroscopic_values_setter(lbm_method,
+                                        initial_rho,
+                                        velocity.center_vector,
+                                        pdfs.center_vector)
+
+#   =====================
+#      Code Generation
+#   =====================
+
+with CodeGeneration() as ctx:
+    if ctx.cuda:
+        target = 'gpu'
+    else:
+        target = 'cpu'
+
+    #   LBM Sweep
+    generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target)
+
+    #   Pack Info
+    generate_pack_info_from_kernel(ctx, "CumulantMRTPackInfo", lbm_update_rule, target=target)
+
+    #   Macroscopic Values Setter
+    generate_sweep(ctx, "InitialPDFsSetter", pdfs_setter, target=target)
+
+    #   NoSlip Boundary
+    generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method, target=target)
diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt
index d8234d517a84aec87c69e227dc875a3491f07cb7..339f648197b4d716c9357f19b5f03ad4fdb43ffd 100644
--- a/apps/tutorials/codegen/CMakeLists.txt
+++ b/apps/tutorials/codegen/CMakeLists.txt
@@ -1,6 +1,8 @@
 # Code Generation Tutorials
 
 if( WALBERLA_BUILD_WITH_CODEGEN )
+
+    #   Tutorial 1: Heat Equation with pystencils
     walberla_generate_target_from_python( NAME CodegenHeatEquationKernel
         FILE HeatEquationKernel.py
         OUT_FILES HeatEquationKernel.cpp HeatEquationKernel.h )
@@ -8,4 +10,42 @@ if( WALBERLA_BUILD_WITH_CODEGEN )
     walberla_add_executable ( NAME 01_CodegenHeatEquation 
                               FILES 01_CodegenHeatEquation.cpp
                               DEPENDS blockforest core field stencil timeloop vtk pde CodegenHeatEquationKernel )
+
+    #   Tutorial 2: lbmpy Lattice Model Generation
+    waLBerla_link_files_to_builddir( *.prm )
+    
+    walberla_generate_target_from_python( NAME 02_LBMLatticeModelGenerationPython
+        FILE 02_LBMLatticeModelGeneration.py
+        OUT_FILES   SRTLatticeModel.cpp SRTLatticeModel.h 
+                    SRTPackInfo.cpp SRTPackInfo.h )
+
+    walberla_add_executable ( NAME 02_LBMLatticeModelGenerationApp 
+                    FILES 02_LBMLatticeModelGeneration.cpp
+                    DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython )
+
+    #   Tutorial 3: Advanced lbmpy Code Generation
+    if(WALBERLA_BUILD_WITH_CUDA)
+        walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython
+            FILE 03_AdvancedLBMCodegen.py
+            OUT_FILES   CumulantMRTSweep.cu CumulantMRTSweep.h
+                        CumulantMRTPackInfo.cu CumulantMRTPackInfo.h
+                        InitialPDFsSetter.cu InitialPDFsSetter.h
+                        CumulantMRTNoSlip.cu CumulantMRTNoSlip.h)
+
+        walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp
+                        FILES 03_AdvancedLBMCodegen.cpp
+                        DEPENDS blockforest cuda core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
+    else()
+        walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython
+                FILE 03_AdvancedLBMCodegen.py
+                OUT_FILES   CumulantMRTSweep.cpp CumulantMRTSweep.h
+                CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h
+                InitialPDFsSetter.cpp InitialPDFsSetter.h
+                CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h)
+
+        walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp
+                FILES 03_AdvancedLBMCodegen.cpp
+                DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
+    endif()
+
 endif()
\ No newline at end of file
diff --git a/doc/Mainpage.dox b/doc/Mainpage.dox
index ed13b558e7565c5ea9ce6a273fc955c99157c9ac..0f35c3422b1e59f367c7339373bc4c57931c6a97 100644
--- a/doc/Mainpage.dox
+++ b/doc/Mainpage.dox
@@ -43,11 +43,17 @@ all the basic data strcutures and concepts of the framework.
 - \ref tutorial_lbm04 \n
   A LBM simulation with complex geometries is built.
 
-\subsection codegen Code Generation with pystencils
+\subsection codegen Code Generation
 
 - \ref tutorial_codegen01 \n
-  This tutorial shows how to use pystencils to generate waLBerla sweeps from symbolic descriptions. 
-   
+  This tutorial shows how to use pystencils to generate waLBerla sweeps from symbolic descriptions.
+- \ref tutorial_codegen02 \n
+  This tutorial demonstrates how to use lbmpy to generate an efficient lattice model for a waLBerla sweep.
+- \ref tutorial_codegen03 \n
+  This tutorial describes how to use lbmpy to generate a full LBM sweep for waLBerla.
+
+
+
 \section further_info Further information
 
 Not all features of the framework are covered in the tutorials.
diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
index a314baf7827c730fdae8a144faad25605f386953..37013cba6203cdd9a45da7b044361dc71fa457d1 100644
--- a/python/pystencils_walberla/codegen.py
+++ b/python/pystencils_walberla/codegen.py
@@ -186,8 +186,14 @@ def generate_pack_info(generation_context, class_name: str,
     items = sorted(items, key=lambda e: e[0])
     directions_to_pack_terms = OrderedDict(items)
 
+    if 'cpu_vectorize_info' in create_kernel_params:
+        vec_params = create_kernel_params['cpu_vectorize_info']
+        if 'instruction_set' in vec_params and vec_params['instruction_set'] is not None:
+            raise NotImplementedError("Vectorisation of the pack info is not implemented.")
+
     create_kernel_params = default_create_kernel_parameters(generation_context, create_kernel_params)
     target = create_kernel_params.get('target', 'cpu')
+    create_kernel_params['cpu_vectorize_info']['instruction_set'] = None
 
     template_name = "CpuPackInfo.tmpl" if target == 'cpu' else 'GpuPackInfo.tmpl'