diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index bc53eea36eb16cccb33006b4d6bf4eb669ab2fb8..694cf47bdcac8f6eaee541c471b13f03d1ef429b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ tests-and-coverage:
     - py.test -v -n $NUM_CORES --cov-report html --cov-report term --cov=. -m "not longrun"
   tags:
     - docker
-    - cuda
+    - cuda11
     - AVX
   artifacts:
     when: always
@@ -43,7 +43,7 @@ test-longrun:
     - py.test -v -n $NUM_CORES --cov-report html --cov-report term --cov=.
   tags:
     - docker
-    - cuda
+    - cuda11
     - AVX
   artifacts:
     paths:
@@ -79,7 +79,7 @@ ubuntu:
     - pytest-3 -v -m "not longrun"
   tags:
     - docker
-    - cuda
+    - cuda11
 
 minimal-conda:
   stage: test
@@ -93,6 +93,39 @@ minimal-conda:
   tags:
     - docker
 
+pycodegen-integration:
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/full
+  stage: test
+  when: manual
+  script:
+    # run lbmpy long test pipeline
+    - export NUM_CORES=$(nproc --all)
+    - mkdir -p ~/.config/matplotlib
+    - echo "backend:template" > ~/.config/matplotlib/matplotlibrc
+    - pip install git+https://gitlab-ci-token:${CI_JOB_TOKEN}@i10git.cs.fau.de/pycodegen/pystencils.git@master#egg=pystencils
+    - py.test -v -n $NUM_CORES --cov-report html --cov-report term --cov=.
+    # change to the parent directory of lbmpy and pip-install lbmpy in editable mode so that the environment is set up correctly
+    - cd ..
+    - pip install -e lbmpy
+    # fetch pycodegen repository with waLBerla as submodule and install waLBerla to run the integration tests
+    - git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@i10git.cs.fau.de/pycodegen/pycodegen.git
+    - cd pycodegen
+    - git submodule sync --recursive
+    - git submodule update --init --recursive
+    - git submodule foreach git fetch origin   # compare the latest master version!
+    - git submodule foreach git reset --hard origin/master
+    - ./install_walberla.sh
+    # build all integration tests
+    - cd walberla/build/
+    - make -j $NUM_CORES CodegenJacobiCPU CodegenJacobiGPU CodegenPoisson MicroBenchmarkGpuLbm LbCodeGenerationExample
+    - cd apps/benchmarks/UniformGridGPU
+    - make -j $NUM_CORES
+    - cd ../UniformGridGenerated
+    - make -j $NUM_CORES
+  tags:
+    - docker
+    - cuda11
+    - AVX
 
 # -------------------- Linter & Documentation --------------------------------------------------------------------------
 
@@ -107,7 +140,7 @@ flake8-lint:
     - flake8 lbmpy
   tags:
     - docker
-    - cuda
+    - cuda11
 
 
 build-documentation:
@@ -120,7 +153,7 @@ build-documentation:
     - sphinx-build -W -b html doc  html_doc
   tags:
     - docker
-    - cuda
+    - cuda11
   artifacts:
     paths:
       - html_doc
diff --git a/README.md b/README.md
index 617a9ce3b4779c883546077b478d019172665064..cd7fce1e2c5f58943c8957d31f57191632575c1b 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@ lbmpy
 =====
 
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/mabau/lbmpy/master?filepath=doc%2Fnotebooks)
-[![Docs](https://img.shields.io/badge/read-the_docs-brightgreen.svg)](http://pycodegen.pages.walberla.net/lbmpy)
+[![Docs](https://img.shields.io/badge/read-the_docs-brightgreen.svg)](http://pycodegen.pages.i10git.cs.fau.de/lbmpy)
 [![pipeline status](https://i10git.cs.fau.de/pycodegen/lbmpy/badges/master/pipeline.svg)](https://i10git.cs.fau.de/pycodegen/lbmpy/commits/master)
-[![coverage report](https://i10git.cs.fau.de/pycodegen/lbmpy/badges/master/coverage.svg)](http://pycodegen.pages.walberla.net/lbmpy/coverage_report)
+[![coverage report](https://i10git.cs.fau.de/pycodegen/lbmpy/badges/master/coverage.svg)](http://pycodegen.pages.i10git.cs.fau.de/lbmpy/coverage_report)
 
 
 Run fast fluid simulations based on the lattice Boltzmann method in Python on CPUs and GPUs.
@@ -39,7 +39,7 @@ pip install lbmpy[interactive]
 Without `[interactive]` you get a minimal version with very little dependencies.
 
 All options:
-- `gpu`: use this if nVidia GPU is available and CUDA is installed
+- `gpu`: use this if an NVIDIA GPU is available and CUDA is installed
 - `opencl`: use this to enable the target `opencl` (execution using OpenCL)
 - `alltrafos`: pulls in additional dependencies for loop simplification e.g. libisl
 - `interactive`: installs dependencies to work in Jupyter including image I/O, plotting etc.
@@ -53,5 +53,5 @@ pip install lbmpy[interactive,gpu,doc]
 Documentation
 -------------
 
-Read the docs [here](http://pycodegen.pages.walberla.net/lbmpy) and
+Read the docs [here](http://pycodegen.pages.i10git.cs.fau.de/lbmpy) and
 check out the Jupyter notebooks in `doc/notebooks`. 
diff --git a/conftest.py b/conftest.py
index 3ae58d02970293604366736556a8e85aed0e061a..2f4100ef2d97f6dc10b736bdaac96f0367669fe4 100644
--- a/conftest.py
+++ b/conftest.py
@@ -21,6 +21,12 @@ from lbmpy.phasefield.simplex_projection import simplex_projection_2d  # NOQA
 SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__))
 sys.path.insert(0, os.path.abspath('lbmpy'))
 
+# the Ubuntu pipeline uses an older version of pytest which uses deprecated functionality.
+# This leads to many warnings in the test and coverage pipeline.
+# Integer-encode major.minor.patch (5.4.3 -> 50403), least significant part first; parts like '0rc1' count as 0.
+pytest_numeric_version = [int(x) if x.isdigit() else 0 for x in (pytest.__version__.split('.') + ['0', '0'])[2::-1]]
+pytest_version = sum(x * (100 ** i) for i, x in enumerate(pytest_numeric_version))
+
 
 def add_path_to_ignore(path):
     if not os.path.exists(path):
@@ -121,7 +127,10 @@ class IPyNbFile(pytest.File):
             warnings.filterwarnings("ignore", "IPython.core.inputsplitter is deprecated")
             notebook = nbformat.read(notebook_contents, 4)
             code, _ = exporter.from_notebook_node(notebook)
-            yield IPyNbTest(self.name, self, code)
+            if pytest_version >= 50403:
+                yield IPyNbTest.from_parent(name=self.name, parent=self, code=code)
+            else:
+                yield IPyNbTest(self.name, self, code)
 
     def teardown(self):
         pass
@@ -130,4 +139,7 @@ class IPyNbFile(pytest.File):
 def pytest_collect_file(path, parent):
     glob_exprs = ["*demo*.ipynb", "*tutorial*.ipynb", "test_*.ipynb"]
     if any(path.fnmatch(g) for g in glob_exprs):
-        return IPyNbFile(path, parent)
+        if pytest_version >= 50403:
+            return IPyNbFile.from_parent(fspath=path, parent=parent)
+        else:
+            return IPyNbFile(path, parent)
diff --git a/lbmpy/creationfunctions.py b/lbmpy/creationfunctions.py
index c36f867c937aad8c56ff97b89657673ad73c0066..52c9bbbb4bbc7bb7f897dca5bdbb74d2adfa7c77 100644
--- a/lbmpy/creationfunctions.py
+++ b/lbmpy/creationfunctions.py
@@ -322,20 +322,6 @@ def create_lb_collision_rule(lb_method=None, optimization={}, **kwargs):
     else:
         collision_rule = lb_method.get_collision_rule(keep_rrs_symbolic=keep_rrs_symbolic)
 
-    if params['output'] and params['kernel_type'] == 'stream_pull_collide':
-        cqc = lb_method.conserved_quantity_computation
-        output_eqs = cqc.output_equations_from_pdfs(lb_method.pre_collision_pdf_symbols, params['output'])
-        collision_rule = collision_rule.new_merged(output_eqs)
-
-    if opt_params['simplification'] == 'auto':
-        simplification = create_simplification_strategy(lb_method, split_inner_loop=split_inner_loop)
-    else:
-        simplification = opt_params['simplification']
-    collision_rule = simplification(collision_rule)
-
-    if params['fluctuating']:
-        add_fluctuations_to_collision_rule(collision_rule, **params['fluctuating'])
-
     if params['entropic']:
         if params['smagorinsky']:
             raise ValueError("Choose either entropic or smagorinsky")
@@ -355,6 +341,20 @@ def create_lb_collision_rule(lb_method=None, optimization={}, **kwargs):
         if 'split_groups' in collision_rule.simplification_hints:
             collision_rule.simplification_hints['split_groups'][0].append(sp.Symbol("smagorinsky_omega"))
 
+    if params['output'] and params['kernel_type'] == 'stream_pull_collide':
+        cqc = lb_method.conserved_quantity_computation
+        output_eqs = cqc.output_equations_from_pdfs(lb_method.pre_collision_pdf_symbols, params['output'])
+        collision_rule = collision_rule.new_merged(output_eqs)
+
+    if opt_params['simplification'] == 'auto':
+        simplification = create_simplification_strategy(lb_method, split_inner_loop=split_inner_loop)
+    else:
+        simplification = opt_params['simplification']
+    collision_rule = simplification(collision_rule)
+
+    if params['fluctuating']:
+        add_fluctuations_to_collision_rule(collision_rule, **params['fluctuating'])
+
     cse_pdfs = False if 'cse_pdfs' not in opt_params else opt_params['cse_pdfs']
     cse_global = False if 'cse_global' not in opt_params else opt_params['cse_global']
     if cse_pdfs:
diff --git a/lbmpy/moments.py b/lbmpy/moments.py
index 84c25c4049e3c41543c98231a47850a96b2b8390..13a84ce62c52cff5ac155c96ec703dd20f04d8fa 100644
--- a/lbmpy/moments.py
+++ b/lbmpy/moments.py
@@ -456,7 +456,7 @@ def extract_monomials(sequence_of_polynomials, dim=3):
     >>> extract_monomials([x**2 + y**2 + y, y + y**2])
     {(0, 2, 0), (0, 1, 0), (2, 0, 0)}
     >>> extract_monomials([x**2 + y**2 + y, y + y**2], dim=2)
-    {(0, 1), (2, 0), (0, 2)}
+    {(0, 1), (0, 2), (2, 0)}
     """
     monomials = set()
     for polynomial in sequence_of_polynomials:
@@ -479,8 +479,8 @@ def monomial_to_polynomial_transformation_matrix(monomials, polynomials):
     >>> mons = list(extract_monomials(polys, dim=2))
     >>> monomial_to_polynomial_transformation_matrix(mons, polys)
     Matrix([
-    [7,  3, 2],
-    [9, -5, 0]])
+    [ 3, 2, 7],
+    [-5, 0, 9]])
     """
     dim = len(monomials[0])
 
diff --git a/lbmpy_tests/test_lbstep.py b/lbmpy_tests/test_lbstep.py
index eabbf2629718bf19796ca956bf424e0a15fc721d..02e3c44a565e239b0be1738085a659038948c894 100644
--- a/lbmpy_tests/test_lbstep.py
+++ b/lbmpy_tests/test_lbstep.py
@@ -54,24 +54,29 @@ def test_data_handling_2d_opencl():
     pystencils.opencl.opencljit.init_globally()
     print("--- LDC 2D test ---")
     results = []
-    for parallel in [True, False] if parallel_available else [False]:
-        for gpu in [True, False] if gpu_available else [False]:
-            if parallel and gpu and not hasattr(wLB, 'cuda'):
-                continue
 
-            print("Testing parallel: %s\tgpu: %s" % (parallel, gpu))
-            opt_params = {'target': 'opencl' if gpu else 'cpu',
-                          'gpu_indexing_params': {'block_size': (8, 4, 2)}}
-            if parallel:
-                from pystencils.datahandling import ParallelDataHandling
-                blocks = wLB.createUniformBlockGrid(blocks=(2, 3, 1), cellsPerBlock=(5, 5, 1),
-                                                    oneBlockPerProcess=False)
-                dh = ParallelDataHandling(blocks, dim=2)
-                rho = ldc_setup(data_handling=dh, optimization=opt_params)
-                results.append(rho)
-            else:
-                rho = ldc_setup(domain_size=(10, 15), parallel=False, optimization=opt_params)
-                results.append(rho)
+    # Since waLBerla has no OpenCL Backend yet, it is not possible to use the
+    # parallel Datahandling with OpenCL at the moment
+
+    # TODO: Activate parallel Datahandling once an OpenCL backend is available in waLBerla
+    parallel = False
+    for gpu in [True, False] if gpu_available else [False]:
+        if parallel and gpu and not hasattr(wLB, 'cuda'):
+            continue
+
+        print("Testing parallel: %s\tgpu: %s" % (parallel, gpu))
+        opt_params = {'target': 'opencl' if gpu else 'cpu',
+                      'gpu_indexing_params': {'block_size': (8, 4, 2)}}
+        if parallel:
+            from pystencils.datahandling import ParallelDataHandling
+            blocks = wLB.createUniformBlockGrid(blocks=(2, 3, 1), cellsPerBlock=(5, 5, 1),
+                                                oneBlockPerProcess=False)
+            dh = ParallelDataHandling(blocks, dim=2)
+            rho = ldc_setup(data_handling=dh, optimization=opt_params)
+            results.append(rho)
+        else:
+            rho = ldc_setup(domain_size=(10, 15), parallel=False, optimization=opt_params)
+            results.append(rho)
     for i, arr in enumerate(results[1:]):
         print("Testing equivalence version 0 with version %d" % (i + 1,))
         np.testing.assert_almost_equal(results[0], arr)
diff --git a/lbmpy_tests/test_split_optimization.py b/lbmpy_tests/test_split_optimization.py
index 2e0b744191eb4c8c477f2ccb6617807d604ce1bf..dd1093806086c5adc7875a72d69cc515bca4b282 100644
--- a/lbmpy_tests/test_split_optimization.py
+++ b/lbmpy_tests/test_split_optimization.py
@@ -4,6 +4,7 @@ import pytest
 from lbmpy.creationfunctions import create_lb_ast
 from lbmpy.scenarios import create_lid_driven_cavity
 from pystencils.sympyextensions import count_operations_in_ast
+from sympy.core.cache import clear_cache
 
 
 def test_split_number_of_operations():
@@ -36,6 +37,7 @@ def test_equivalence():
         for compressible in (True, False):
             for method in ('srt', 'mrt'):
                 for force in ((0, 0, 0), (1e-6, 1e-7, 2e-6)):
+                    clear_cache()
                     common_params = {'domain_size': (20, 30) if stencil.startswith('D2') else (10, 13, 7),
                                      'stencil': stencil,
                                      'method': method,
diff --git a/lbmpy_tests/test_srt_trt_simplifications.py b/lbmpy_tests/test_srt_trt_simplifications.py
index eb799d4484ab538eaac3f6d2a26c67b2ca0539e6..ca830c4d3ce5f36a1fdc459be6bd598194c3bdb8 100644
--- a/lbmpy_tests/test_srt_trt_simplifications.py
+++ b/lbmpy_tests/test_srt_trt_simplifications.py
@@ -38,7 +38,7 @@ def test_simplifications_srt_d2q9_incompressible():
 def test_simplifications_srt_d2q9_compressible():
     omega = sp.symbols('omega')
     method = create_srt(get_stencil("D2Q9"), omega, compressible=True, equilibrium_order=2)
-    check_method(method, [53, 57, 1], [53, 41, 1])
+    check_method(method, [53, 58, 1], [53, 42, 1])
 
 
 def test_simplifications_trt_d2q9_incompressible():
@@ -50,7 +50,7 @@ def test_simplifications_trt_d2q9_incompressible():
 def test_simplifications_trt_d2q9_compressible():
     o1, o2 = sp.symbols("omega_1 omega_2")
     method = create_trt(get_stencil("D2Q9"), o1, o2, compressible=True)
-    check_method(method, [77, 105, 1], [65, 55, 1])
+    check_method(method, [77, 106, 1], [65, 56, 1])
 
 
 def test_simplifications_trt_d3q19_force_incompressible():
@@ -64,4 +64,4 @@ def test_simplifications_trt_d3q19_force_compressible():
     o1, o2 = sp.symbols("omega_1 omega_2")
     force_model = Luo([sp.Rational(1, 3), sp.Rational(1, 2), sp.Rational(1, 5)])
     method = create_trt_with_magic_number(get_stencil("D3Q19"), o1, compressible=False, force_model=force_model)
-    check_method(method, [270, 283, 1], [243, 177, 1])
+    check_method(method, [270, 284, 1], [243, 178, 1])