Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (2165)
Showing
with 7375 additions and 1675 deletions
Language: Cpp
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: DontAlign
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: true
AfterControlStatement: true
AfterEnum: false
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: true
AfterUnion: true
AfterExternBlock: true
BeforeCatch: false
BeforeElse: true
IndentBraces: false
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: false
BreakConstructorInitializers: BeforeColon
BreakStringLiterals: true
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 3
ContinuationIndentWidth: 3
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^"blockforest/'
Priority: 1
- Regex: '^"boundary/'
Priority: 2
- Regex: '^"communication/'
Priority: 3
- Regex: '^"core/'
Priority: 4
- Regex: '^"domain_decomposition/'
Priority: 5
- Regex: '^"executiontree/'
Priority: 6
- Regex: '^"fft/'
Priority: 7
- Regex: '^"field/'
Priority: 8
- Regex: '^"gather/'
Priority: 9
- Regex: '^"geometry/'
Priority: 10
- Regex: '^"gpu/'
Priority: 11
- Regex: '^"gpu/'
Priority: 12
- Regex: '^"lbm/'
Priority: 13
- Regex: '^"lbm_mesapd_coupling/'
Priority: 14
- Regex: '^"mesh/'
Priority: 15
- Regex: '^"mesa_pd/'
Priority: 16
- Regex: '^"pde/'
Priority: 17
- Regex: '^"pe/'
Priority: 18
- Regex: '^"pe_coupling/'
Priority: 19
- Regex: '^"postprocessing/'
Priority: 20
- Regex: '^"python_coupling/'
Priority: 21
- Regex: '^"simd/'
Priority: 22
- Regex: '^"sqlite/'
Priority: 23
- Regex: '^"stencil/'
Priority: 24
- Regex: '^"timeloop/'
Priority: 25
- Regex: '^"vtk/'
Priority: 26
- Regex: '^<boost/'
Priority: 27
- Regex: '^<'
Priority: 28
IndentCaseLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 3
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: true
SpaceAfterTemplateKeyword: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: true
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 3
UseTab: Never
\ No newline at end of file
---
Checks: '
-*,
boost-*,
-boost-use-ranges,
bugprone-*,
-bugprone-branch-clone,
-bugprone-exception-escape,
-bugprone-easily-swappable-parameters,
-bugprone-crtp-constructor-accessibility,
-bugprone-implicit-widening-of-multiplication-result,
-bugprone-macro-parentheses,
-bugprone-narrowing-conversions,
-bugprone-switch-missing-default-case,
-bugprone-assignment-in-if-condition,
-bugprone-reserved-identifier,
misc-*,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-include-cleaner,
-misc-header-include-cycle,
-misc-use-internal-linkage,
-misc-use-anonymous-namespace,
-misc-const-correctness,
-misc-misplaced-const,
modernize-*,
-modernize-use-auto,
-modernize-pass-by-value,
-modernize-raw-string-literal,
-modernize-use-transparent-functors,
-modernize-use-trailing-return-type,
-modernize-use-nodiscard,
-modernize-macro-to-enum,
-modernize-concat-nested-namespaces,
mpi-*,
-mpi-type-mismatch,
openmp-*,
-openmp-exception-escape,
-openmp-use-default-none,
performance-*,
-performance-enum-size,
-performance-noexcept-swap,
-performance-unnecessary-value-param,
-performance-avoid-endl,
-performance-no-int-to-ptr,
portability-*,
readability-container-size-empty,
readability-delete-null-pointer,
readability-deleted-default,
readability-misplaced-array-index,
readability-non-const-parameter,
readability-redundant-control-flow,
readability-redundant-declaration,
readability-redundant-function-ptr-dereference,
readability-redundant-preprocessor,
readability-redundant-smartptr-get,
readability-redundant-string-cstr,
readability-simplify-subscript-expr,
readability-static-accessed-through-instance,
readability-static-definition-in-anonymous-namespace,
readability-string-compare,
readability-uniqueptr-delete-release,
readability-use-anyofallof
'
WarningsAsErrors: '*'
HeaderFilterRegex: '.*'
...
# See https://editorconfig.org/
# top-most .editorconfig-file
root = true
[*]
tab_width = 3
indent_style = space
indent_size = 3
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = false
[*.py]
tab_width = 4
indent_size = 4
insert_final_newline = true
\ No newline at end of file
[flake8]
max-line-length=120
exclude=apps/showcases/Mixer/GenerateModule.py, # contains only statements
apps/benchmarks/FieldCommunication/config.py # just a config file
utilities/bashhelper/createShortcuts.py # contains a lot of really long strings
ignore = W503 C901 E741
name: waLBerla Tests
on: push
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: create build directory
run: mkdir build
- name: CMake
run: cmake -S . -B build -DWALBERLA_BUILD_WITH_MPI=OFF -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_WITH_OPENMP=ON
- name: make
run: cmake --build build -j 2
- name: ctest
run: cd build; ctest
......@@ -4,48 +4,72 @@ ui_*
qrc_*
*~
# Backup files of kate/kwrite
# macOS
**/.DS_Store
# CLion indexing
*.uuid
.fleet
# Generated files
*.out
/src/waLBerlaDefinitions.h
/src/core/waLBerlaBuildInfo.cpp
# Eclipse
/.cproject
/.project
/.pydevproject
/.settings
# Visual Studio
/CMakeSettings.json
/.vs
# Visual Studio Code
/.vscode
# Zed
/.cache*
# CLion
*.idea
*.clion*
# QtCreator
CMakeLists.txt.user.*
# Binary Files
/bin/
/lib/
*.a
/build
/build*
# Logfiles
logfile*.txt
*TestLog_p*.txt
# Python venv
.venv
# Compiled python
*.pyc
# Jupyter Notebook
**/.ipynb_checkpoints
# Blockforest saves
*.sav
/utilities/.idea
/utilities/py_waLBerla/.idea
/python/.idea
/src/stencil/.idea
# CMake
CMakeUserPresets.json
/CMakeLists.txt.user
# CMake build files
......@@ -58,3 +82,11 @@ cmake_install.cmake
CMakeDefs.h
/moduleStatistics.json
/walberla-config.cmake
cmake-build-*
# Virtual environments
.venv/
env/
venv/
ENV/
###############################################################################
## ##
## General settings ##
## General settings ##
## ##
###############################################################################
stages:
- pretest
- "Code Quality"
- test
- deploy
- benchmark
###############################################################################
## ##
## Build templates ##
## ##
###############################################################################
.build_template: &build_definition
.build_template:
script:
- source /entrypoint.sh
- pip install -I cmake==3.16.3 jinja2
- export NUM_CORES=$(nproc --all)
- export MAX_BUILD_CORES=$(( $(awk '( $1 == "MemTotal:" ) { print $2 }' /proc/meminfo) / ( 4 * 1024 * 1024 ) ))
- "[[ $MAX_BUILD_CORES -lt $NUM_CORES ]] && export NUM_BUILD_CORES=$MAX_BUILD_CORES || export NUM_BUILD_CORES=$NUM_CORES"
......@@ -24,42 +29,43 @@ stages:
- cmake --version
- ccache --version
- mpirun --version
- python3 --version
- python3 python/mesa_pd.py -y .
- export CCACHE_BASEDIR=$CI_PROJECT_DIR
- mkdir $CI_PROJECT_DIR/build
- cd $CI_PROJECT_DIR/build
- if dpkg --compare-versions `ompi_info | head -2 | tail -1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/'` ge 1.10; then export MPIEXEC_PREFLAGS="--allow-run-as-root" ; fi
- cmake .. -DWALBERLA_BUFFER_DEBUG=$WALBERLA_BUFFER_DEBUG -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_CUDA=$WALBERLA_BUILD_WITH_CUDA -DWALBERLA_BUILD_WITH_PYTHON=$WALBERLA_BUILD_WITH_PYTHON -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS -DWALBERLA_DOUBLE_ACCURACY=$WALBERLA_DOUBLE_ACCURACY -DWARNING_ERROR=ON
- cmake . -LAH
- if command -v ompi_info && dpkg --compare-versions `ompi_info | head -2 | tail -1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/'` ge 1.10; then export MPIEXEC_PREFLAGS="--allow-run-as-root" ; fi
- cmake ..
-DCMAKE_CXX_FLAGS=$CMAKE_CXX_FLAGS
-DWALBERLA_BUFFER_DEBUG=$WALBERLA_BUFFER_DEBUG
-DWALBERLA_BUILD_TESTS=ON
-DWALBERLA_BUILD_BENCHMARKS=ON
-DWALBERLA_BUILD_TUTORIALS=ON
-DWALBERLA_BUILD_TOOLS=ON
-DWALBERLA_BUILD_SHOWCASES=ON
-DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI
-DWALBERLA_BUILD_WITH_CUDA=$WALBERLA_BUILD_WITH_CUDA
-DWALBERLA_BUILD_WITH_PYTHON=$WALBERLA_BUILD_WITH_PYTHON
-DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP
-DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS
-DWALBERLA_DOUBLE_ACCURACY=$WALBERLA_DOUBLE_ACCURACY
-DWARNING_ERROR=$WARNING_ERROR
-DWALBERLA_BUILD_WITH_METIS=$WALBERLA_BUILD_WITH_METIS
-DWALBERLA_BUILD_WITH_PARMETIS=$WALBERLA_BUILD_WITH_PARMETIS
-DWALBERLA_BUILD_WITH_FFTW=$WALBERLA_BUILD_WITH_FFTW
-DWALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT=$WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
-DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN
-DWALBERLA_STL_BOUNDS_CHECKS=$WALBERLA_STL_BOUNDS_CHECKS
-DWALBERLA_LOGLEVEL=$WALBERLA_LOGLEVEL
-DCMAKE_CUDA_ARCHITECTURES=60
- cmake . -LA
- make -j $NUM_BUILD_CORES -l $NUM_CORES
- ctest -LE $CTEST_EXCLUDE_LABELS -C $CMAKE_BUILD_TYPE --output-on-failure -j $NUM_CORES
- ctest -LE $CTEST_EXCLUDE_LABELS -C $CMAKE_BUILD_TYPE --output-on-failure -j $NUM_CORES -T Test
after_script:
- pip3 install lxml
- python3 cmake/ctest2junit.py build > report.xml
tags:
- docker
.build_serial_template: &build_serial_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
CMAKE_BUILD_TYPE: "Release"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
.build_mpionly_template: &build_mpionly_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
CMAKE_BUILD_TYPE: "Release"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
.build_hybrid_template: &build_hybrid_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
......@@ -69,52 +75,17 @@ stages:
CMAKE_BUILD_TYPE: "Release"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
.build_serial_dbg_template: &build_serial_dbg_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
.build_mpionly_dbg_template: &build_mpionly_dbg_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
.build_hybrid_dbg_template: &build_hybrid_dbg_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
OMP_NUM_THREADS: "4"
OMP_WAIT_POLICY: "PASSIVE"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
.build_hybrid_dbg_sp_template: &build_hybrid_dbg_sp_definition
<<: *build_definition
variables:
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
OMP_NUM_THREADS: "4"
OMP_WAIT_POLICY: "PASSIVE"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUFFER_DEBUG: "OFF"
WALBERLA_DOUBLE_ACCURACY: "OFF"
WALBERLA_BUILD_WITH_METIS: "ON"
WALBERLA_BUILD_WITH_PARMETIS: "ON"
WALBERLA_BUILD_WITH_FFTW: "ON"
WALBERLA_LOGLEVEL: "DETAIL"
WARNING_ERROR: "ON"
artifacts:
when: always
reports:
junit:
- report.xml
- python/report.xml
###############################################################################
......@@ -125,822 +96,1913 @@ stages:
intel_15_serial:
<<: *build_serial_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
icc_2022_serial:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WARNING_ERROR: "OFF"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
- intel
intel_15_mpionly:
<<: *build_mpionly_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
icc_2022_mpionly:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WARNING_ERROR: "OFF"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
- intel
intel_15_hybrid:
<<: *build_hybrid_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
only:
- triggers
icc_2022_hybrid:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WARNING_ERROR: "OFF"
tags:
- cuda11
- docker
- intel
intel_15_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
only:
- triggers
icc_2022_serial_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WARNING_ERROR: "OFF"
tags:
- cuda11
- docker
- intel
intel_15_mpionly_dbg:
<<: *build_mpionly_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
only:
- triggers
icc_2022_mpionly_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WARNING_ERROR: "OFF"
tags:
- cuda11
- docker
- intel
intel_15_hybrid_dbg:
<<: *build_hybrid_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
icc_2022_hybrid_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WARNING_ERROR: "OFF"
tags:
- cuda11
- docker
- intel
intel_15_hybrid_dbg_sp:
<<: *build_hybrid_dbg_sp_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:15
only:
- triggers
icc_2022_hybrid_dbg_sp:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icc-2022:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_DOUBLE_ACCURACY: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_METIS: "OFF"
WARNING_ERROR: "OFF"
tags:
- cuda11
- docker
- intel
intel_16_serial:
<<: *build_serial_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
icx_2024_serial:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda
- cuda11
- docker
- intel
intel_16_mpionly:
<<: *build_mpionly_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
icx_2024_mpionly:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda
- cuda11
- docker
- intel
intel_16_hybrid:
<<: *build_hybrid_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
only:
- triggers
icx_2024_hybrid:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
tags:
- cuda
- cuda11
- docker
- intel
intel_16_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
only:
- triggers
icx_2024_serial_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
tags:
- cuda
- cuda11
- docker
- intel
intel_16_mpionly_dbg:
<<: *build_mpionly_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
only:
- triggers
icx_2024_mpionly_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
tags:
- cuda
- cuda11
- docker
- intel
intel_16_hybrid_dbg:
<<: *build_hybrid_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
only:
- triggers
icx_2024_hybrid_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
tags:
- cuda
- cuda11
- docker
- intel
intel_16_hybrid_dbg_sp:
<<: *build_hybrid_dbg_sp_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:16
only:
- triggers
icx_2024_hybrid_dbg_sp:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/icx-2024:32
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_DOUBLE_ACCURACY: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_METIS: "OFF"
tags:
- cuda
- cuda11
- docker
- intel
intel_17_serial:
<<: *build_serial_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
gcc_10_serial:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
- intel
intel_17_mpionly:
<<: *build_mpionly_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
gcc_10_mpionly:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
tags:
- docker
- intel
intel_17_hybrid:
<<: *build_hybrid_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
tags:
- docker
- intel
intel_17_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
tags:
- docker
- intel
intel_17_mpionly_dbg:
<<: *build_mpionly_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
tags:
- docker
- intel
intel_17_hybrid_dbg:
<<: *build_hybrid_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
tags:
- docker
- intel
intel_17_hybrid_dbg_sp:
<<: *build_hybrid_dbg_sp_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel:17
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
- intel
gcc_4.7_serial:
<<: *build_serial_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
gcc_10_hybrid:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.7_mpionly:
<<: *build_mpionly_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
gcc_10_serial_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.7_hybrid:
<<: *build_hybrid_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
gcc_10_mpionly_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.7_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
only:
- triggers
gcc_10_hybrid_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
gcc_4.7_mpionly_dbg:
<<: *build_mpionly_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
gcc_10_hybrid_dbg_sp:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-10:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_DOUBLE_ACCURACY: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_METIS: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.7_hybrid_dbg:
<<: *build_hybrid_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
gcc_11_serial:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.7_hybrid_dbg_sp:
<<: *build_hybrid_dbg_sp_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.7
gcc_11_mpionly:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.8_serial:
<<: *build_serial_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_11_hybrid:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.8_mpionly:
<<: *build_mpionly_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_11_serial_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.8_hybrid:
<<: *build_hybrid_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_11_mpionly_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
gcc_4.8_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_11_hybrid_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
# ---------------------------------------------------------------------------
# Compiler build matrix (GitLab CI).  Two job families are interleaved here:
#   * legacy gcc-4.x jobs that pull in shared YAML anchors
#     (`<<: *build_..._definition`, declared earlier in this file), and
#   * gcc-11/gcc-12 codegen jobs that extend `.build_template`, create a
#     Python venv, run the pystencils/lbmpy code-generation test suite, and
#     then build with CUDA + codegen + the Python coupling enabled.
# The codegen jobs are nightly-only (triggered pipelines with
# $ENABLE_NIGHTLY_BUILDS set) and need runners tagged `cuda11` + `docker`.
# NOTE(review): in every before_script below, numpy is installed *after*
# `deactivate`, i.e. outside the ci-venv that Python_ROOT_DIR points at,
# while cupy is installed into the venv explicitly — confirm this split is
# intentional.
# NOTE(review): the `only:` sections pair `- triggers` with a more deeply
# indented `variables:` key; verify this parses as intended (it looks as if
# it may have been mangled from `only: { variables: [...] }`).
gcc_4.8_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_11_hybrid_dbg_sp:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-11:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_DOUBLE_ACCURACY: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_METIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.8_hybrid_dbg:
  <<: *build_hybrid_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_12_serial:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.8_hybrid_dbg_sp:
  <<: *build_hybrid_dbg_sp_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.8
gcc_12_mpionly:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
gcc_12_hybrid:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
gcc_12_serial_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_hybrid:
  <<: *build_hybrid_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
gcc_12_mpionly_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
# ---------------------------------------------------------------------------
# gcc-12 debug/single-precision codegen jobs interleaved with legacy gcc-4.9
# anchor-based jobs, followed by the first gcc-13 jobs.  The gcc-13 jobs
# additionally set WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT=ON, and
# `gcc_13_hybrid` runs in the early `pretest` stage on every pipeline
# (no `only:` restriction) on runners tagged `cuda` + `cuda11` + `docker`.
# NOTE(review): as elsewhere in this file, numpy is installed after
# `deactivate` (outside the ci-venv) in each before_script — confirm
# this is intentional.
gcc_12_hybrid_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
gcc_12_hybrid_dbg_sp:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-12:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_DOUBLE_ACCURACY: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_METIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_hybrid_dbg:
  <<: *build_hybrid_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
gcc_13_serial:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_4.9_hybrid_dbg_sp:
  <<: *build_hybrid_dbg_sp_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:4.9
gcc_13_mpionly:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_5_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
  only:
    - triggers
gcc_13_hybrid:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  # Runs in the early `pretest` stage and on every pipeline (no `only:`).
  stage: pretest
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  tags:
    - cuda
    - cuda11
    - docker
gcc_5_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
  only:
    - triggers
gcc_13_serial_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  tags:
    - cuda
    - cuda11
    - docker
# ---------------------------------------------------------------------------
# Remaining gcc-13 debug jobs (half-precision enabled, run on every
# pipeline), the first clang-14 codegen jobs (nightly-only), and the legacy
# gcc-5/gcc-6 anchor-based jobs.  clang jobs export CC=clang/CXX=clang++
# when installing cupy; gcc jobs export CC=gcc/CXX=g++.
# NOTE(review): `clang_14_hybrid` is tagged only `cuda11` + `docker` while
# its clang-14 siblings also require the `cuda` tag — confirm whether the
# missing tag is deliberate.
gcc_5_hybrid:
  <<: *build_hybrid_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
gcc_13_mpionly_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  tags:
    - cuda
    - cuda11
    - docker
gcc_5_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
gcc_13_hybrid_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  tags:
    - cuda
    - cuda11
    - docker
gcc_5_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
  only:
    - triggers
gcc_13_hybrid_dbg_sp:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=gcc CXX=g++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_DOUBLE_ACCURACY: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_METIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
    WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT: "ON"
  tags:
    - cuda
    - cuda11
    - docker
gcc_5_hybrid_dbg:
  <<: *build_hybrid_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
clang_14_serial:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
gcc_5_hybrid_dbg_sp:
  <<: *build_hybrid_dbg_sp_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:5
clang_14_mpionly:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
gcc_6_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
clang_14_hybrid:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
gcc_6_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
  only:
    - triggers
  tags:
    - docker
gcc_6_hybrid:
  <<: *build_hybrid_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
  tags:
    - docker
gcc_6_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
  tags:
    - docker
gcc_6_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
  tags:
    - docker
gcc_6_hybrid_dbg:
  <<: *build_hybrid_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
  tags:
    - docker
gcc_6_hybrid_dbg_sp:
  <<: *build_hybrid_dbg_sp_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc:6
  tags:
    - docker
# ---------------------------------------------------------------------------
# Legacy clang-3.4 … clang-3.8 anchor-based jobs (most nightly-trigger-only),
# interleaved with the clang-14 debug codegen jobs.  Anchor jobs only pick a
# compiler image and runner tags; the build recipe comes from the
# `build_..._definition` anchors declared earlier in this file.
clang_3.4_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.4
  only:
    - triggers
  tags:
    - docker
clang_3.4_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.4
  only:
    - triggers
  tags:
    - docker
clang_3.4_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.4
  only:
    - triggers
  tags:
    - docker
clang_3.4_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.4
  tags:
    - docker
clang_3.5_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.5
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.5_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.5
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.5_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.5
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.5_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.5
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.6_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.6
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.6_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.6
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.6_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.6
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.6_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.6
  only:
    - triggers
  tags:
    - cuda
    - docker
clang_3.7_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.7
clang_14_serial_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.7_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.7
clang_14_mpionly_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.7_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.7
  only:
    - triggers
clang_14_hybrid_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  tags:
    - cuda
    - cuda11
    - docker
clang_3.7_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.7
clang_14_hybrid_dbg_sp:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-14:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_DOUBLE_ACCURACY: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_METIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
# ---------------------------------------------------------------------------
# clang-15 codegen jobs (nightly-only, runners tagged cuda/cuda11/docker)
# interleaved with the legacy clang-3.8 anchor-based jobs.
# NOTE(review): as elsewhere, numpy is installed after `deactivate`
# (outside the ci-venv referenced by Python_ROOT_DIR) — confirm intended.
clang_15_serial:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
clang_15_mpionly:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_hybrid:
  <<: *build_hybrid_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
clang_15_hybrid:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
clang_15_serial_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
clang_15_mpionly_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_hybrid_dbg:
  <<: *build_hybrid_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
clang_15_hybrid_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.8_hybrid_dbg_sp:
  <<: *build_hybrid_dbg_sp_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.8
clang_15_hybrid_dbg_sp:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-15:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_DOUBLE_ACCURACY: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_METIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda
    - cuda11
    - docker
clang_3.9_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
# ---------------------------------------------------------------------------
# clang-16 and clang-17 codegen jobs (nightly-only, runners tagged
# cuda11/docker) interleaved with the legacy clang-3.9/clang-4.0
# anchor-based jobs.
# NOTE(review): as elsewhere, numpy is installed after `deactivate`
# (outside the ci-venv referenced by Python_ROOT_DIR) — confirm intended.
clang_16_serial:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_3.9_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
clang_16_mpionly:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_3.9_hybrid:
  <<: *build_hybrid_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
clang_16_hybrid:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_3.9_serial_dbg:
  <<: *build_serial_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
clang_16_serial_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_3.9_mpionly_dbg:
  <<: *build_mpionly_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
clang_16_mpionly_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_3.9_hybrid_dbg:
  <<: *build_hybrid_dbg_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
clang_16_hybrid_dbg:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_3.9_hybrid_dbg_sp:
  <<: *build_hybrid_dbg_sp_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:3.9
clang_16_hybrid_dbg_sp:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-16:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    CMAKE_BUILD_TYPE: "DebugOptimized"
    WALBERLA_DOUBLE_ACCURACY: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_METIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_4.0_serial:
  <<: *build_serial_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
clang_17_serial:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_MPI: "OFF"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_PARMETIS: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_4.0_mpionly:
  <<: *build_mpionly_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
clang_17_mpionly:
  extends: .build_template
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
  before_script:
    - python3 -m venv ci-venv
    - source ci-venv/bin/activate
    - python3 -m pip install lbmpy==1.3.7 jinja2 pytest
    - cd python
    - python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
    - python3 -m pip list
    - deactivate
    - python3 -m pip install numpy
    - cd ..
    - CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
  variables:
    WALBERLA_BUILD_WITH_CUDA: "ON"
    WALBERLA_BUILD_WITH_OPENMP: "OFF"
    WALBERLA_BUILD_WITH_CODEGEN: "ON"
    WALBERLA_BUILD_WITH_PYTHON: "ON"
    Python_ROOT_DIR: "./ci-venv"
  only:
    - triggers
      variables:
        - $ENABLE_NIGHTLY_BUILDS
  tags:
    - cuda11
    - docker
clang_4.0_hybrid:
  <<: *build_hybrid_definition
  image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
  only:
    - triggers
clang_17_hybrid:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_4.0_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
only:
- triggers
clang_17_serial_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_4.0_mpionly_dbg:
<<: *build_mpionly_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
only:
- triggers
clang_17_mpionly_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_4.0_hybrid_dbg:
<<: *build_hybrid_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
only:
- triggers
clang_17_hybrid_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_4.0_hybrid_dbg_sp:
<<: *build_hybrid_dbg_sp_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:4.0
only:
- triggers
clang_17_hybrid_dbg_sp:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17:32
stage: pretest
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_DOUBLE_ACCURACY: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_METIS: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_5.0_serial:
<<: *build_serial_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_serial:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
clang_5.0_mpionly:
<<: *build_mpionly_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_mpionly:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- cuda11
- docker
clang_5.0_hybrid:
<<: *build_hybrid_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_hybrid:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_5.0_serial_dbg:
<<: *build_serial_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_serial_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_5.0_mpionly_dbg:
<<: *build_mpionly_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_mpionly_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_5.0_hybrid_dbg:
<<: *build_hybrid_dbg_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_hybrid_dbg:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
clang_5.0_hybrid_dbg_sp:
<<: *build_hybrid_dbg_sp_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang:5.0
aocc_4_hybrid_dbg_sp:
extends: .build_template
image: i10git.cs.fau.de:5005/walberla/buildenvs/aocc-4:32
before_script:
- python3 -m venv ci-venv
- source ci-venv/bin/activate
- python3 -m pip install lbmpy==1.3.7 jinja2 pytest
- cd python
- python3 -m pytest --junitxml=report.xml pystencils_walberla lbmpy_walberla
- python3 -m pip list
- deactivate
- python3 -m pip install numpy
- cd ..
- CC=clang CXX=clang++ ci-venv/bin/python -m pip install cupy-cuda11x
variables:
WALBERLA_BUILD_WITH_CUDA: "ON"
CMAKE_BUILD_TYPE: "DebugOptimized"
WALBERLA_DOUBLE_ACCURACY: "OFF"
WALBERLA_BUILD_WITH_PARMETIS: "OFF"
WALBERLA_BUILD_WITH_METIS: "OFF"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
WALBERLA_BUILD_WITH_PYTHON: "ON"
Python_ROOT_DIR: "./ci-venv"
tags:
- cuda11
- docker
###############################################################################
## ##
## Documentation ##
......@@ -948,14 +2010,14 @@ clang_5.0_hybrid_dbg_sp:
###############################################################################
doc:
image: walberla/buildenv-ubuntu-basic:16.04
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
script:
- cmake --version
- doxygen --version
- mkdir $CI_PROJECT_DIR/build
- cd $CI_PROJECT_DIR/build
- cmake ..
- cmake . -LAH
- cmake . -LA
- make doc
tags:
- docker
......@@ -972,316 +2034,170 @@ doc:
## ##
###############################################################################
clang-tidy:
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-18
stage: "Code Quality"
needs: []
before_script:
- pip install pyyaml
script:
- $CXX --version
- clang-tidy -version
- cmake --version
- mkdir $CI_PROJECT_DIR/build
- cd $CI_PROJECT_DIR/build
- cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWALBERLA_BUFFER_DEBUG=ON -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=ON -DCMAKE_BUILD_TYPE=Debug -DWALBERLA_BUILD_WITH_METIS=ON -DWALBERLA_BUILD_WITH_PARMETIS=ON -DWALBERLA_BUILD_WITH_OPENMESH=ON -DWALBERLA_DOUBLE_ACCURACY=ON -DWALBERLA_LOGLEVEL=DETAIL
- cmake . -LA
- python3 utilities/clang-tidy/analyze.py -p utilities/clang-tidy/analyze.yml -r $CI_PROJECT_DIR -c compile_commands.json -o clang-tidy-output
after_script:
- mkdir -p $CI_PROJECT_DIR/artifacts
- mv $CI_PROJECT_DIR/build/clang-tidy-output $CI_PROJECT_DIR/artifacts/clang-tidy-output
artifacts:
when: always
paths:
- $CI_PROJECT_DIR/artifacts/clang-tidy-output
tags:
- docker
cppcheck:
image: walberla/cppcheck
image: i10git.cs.fau.de:5005/walberla/buildenvs/cppcheck
script:
- cppcheck --version
- cppcheck . --max-configs=10 --enable=warning --enable=style --enable=performance --enable=portability -i src/gui/extern -i src/geometry/structured/extern -i sqlite3.c -i StackWalker.cpp -I src/ -I tests/ -I apps/ -D WALBERLA_BUILD_WITH_MPI -D WALBERLA_BUILD_WITH_METIS -D WALBERLA_BUILD_WITH_BOOST_THREAD -D WALBERLA_BUILD_WITH_PYTHON --xml 2> report.xml
- cppcheck . --max-configs=10 --enable=warning --enable=style --enable=performance --enable=portability -i src/geometry/structured/extern -i sqlite3.c -i StackWalker.cpp -I src/ -I tests/ -I apps/ -D WALBERLA_BUILD_WITH_MPI -D WALBERLA_BUILD_WITH_METIS -D WALBERLA_BUILD_WITH_PYTHON --xml 2> report.xml
- cppcheck-htmlreport --file=report.xml --report-dir=html_report --source-dir=.
artifacts:
untracked: true
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- docker
coverage:
image: walberla/coverage
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
script:
- pip3 install gcovr
- export NUM_CORES=$(nproc --all)
- export MAX_BUILD_CORES=$(( $(awk '( $1 == "MemTotal:" ) { print $2 }' /proc/meminfo) / ( 4 * 1024 * 1024 ) ))
- $CXX --version
- cmake --version
- gcovr --version
- mkdir build
- cd build
- if dpkg --compare-versions `ompi_info | head -2 | tail -1 | sed 's/[^0-9.]*\([0-9.]*\).*/\1/'` ge 1.10; then export MPIEXEC_PREFLAGS="--allow-run-as-root" ; fi
- cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=OFF -DCMAKE_BUILD_TYPE=Debug -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS
- cmake . -LAH
- make -j 8
- ctest -LE longrun --output-on-failure -j 8_Hybrid
- cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_OPENMP=OFF -DCMAKE_BUILD_TYPE=DebugOptimized -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS -DWALBERLA_BUILD_WITH_CODEGEN=OFF -DWALBERLA_BUILD_WITH_GCOV=ON -DWALBERLA_LOGLEVEL=DETAIL
- cmake . -LA
- make -j $NUM_BUILD_CORES -l $NUM_CORES
- ctest -LE longrun --output-on-failure -j $NUM_CORES --timeout 3000
- cd ..
- mkdir coverage
- cd coverage
- pwd
- gcovr -r $CI_PROJECT_DIR -f ".*\\/src\\/.*" -k
- gcovr -r $CI_PROJECT_DIR -f ".*\\/src\\/.*" --html --html-details -o coverage.html -g
- gcovr -r $CI_PROJECT_DIR -k build -f "src" --print-summary --html coverage/coverage.html --html-details --xml coverage/coverage.xml
coverage: /^\s*lines:\s*\d+.\d+\%/
artifacts:
paths:
- coverage/
reports:
coverage_report:
coverage_format: cobertura
path: coverage/coverage.xml
only:
- triggers
variables:
- $ENABLE_NIGHTLY_BUILDS
tags:
- docker
variables:
CXXFLAGS: "-fprofile-arcs -ftest-coverage -fPIC -O0"
LDFLAGS: "-fprofile-arcs -ftest-coverage -fPIC -O0"
###############################################################################
## ##
## Windows Builds ##
## macOS Builds ##
## ##
###############################################################################
.win_build_template: &win_build_definition
tags:
- win
.mac_build_template: &mac_build_definition
script:
- export PreferredToolArchitecture=x64
- export OMP_NUM_THREADS=4
- export OMP_WAIT_POLICY="PASSIVE"
- export MSMPI_DISABLE_SHM=1
- export NUM_CORES=$(system_profiler SPHardwareDataType | grep 'Total Number of Cores' | awk '{print $5}')
- export MAX_BUILD_CORES=$(( $(system_profiler SPHardwareDataType | grep 'Memory' | awk '{print $2}') / 4 ))
- "[[ $MAX_BUILD_CORES -lt $NUM_CORES ]] && export NUM_BUILD_CORES=$MAX_BUILD_CORES || export NUM_BUILD_CORES=$NUM_CORES"
- c++ --version
- cmake --version
- mpirun --version
- mkdir build
- cd build
- cmake -LAH -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DWALBERLA_DOUBLE_ACCURACY=$WALBERLA_DOUBLE_ACCURACY -DWARNING_ERROR=ON -G "$CMAKE_GENERATOR" ..
- MSBuild.exe walberla.sln /property:Configuration=$BUILD_CONFIGURATION /verbosity:minimal /maxcpucount:4
- ctest -LE $CTEST_EXCLUDE_LABELS -C $BUILD_CONFIGURATION --output-on-failure -j 4
msvc-12_Hybrid_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "ON"
msvc-12_Hybrid_SP_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "OFF"
only:
- triggers
msvc-12_Hybrid:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-12_Serial_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-12_Serial:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-12_MpiOnly_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-12_MpiOnly:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 12 2013 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14_Hybrid_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14_Hybrid_SP_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "OFF"
only:
- triggers
msvc-14_Hybrid:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14_Serial_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14_Serial:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14_MpiOnly_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14_MpiOnly:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 14 2015 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14.1_Hybrid_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "ON"
msvc-14.1_Hybrid_SP_Dbg:
<<: *win_build_definition
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "OFF"
msvc-14.1_Hybrid:
<<: *win_build_definition
- cmake .. -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=ON -DWALBERLA_BUILD_WITH_MPI=$WALBERLA_BUILD_WITH_MPI -DWALBERLA_BUILD_WITH_PYTHON=$WALBERLA_BUILD_WITH_PYTHON -DWALBERLA_BUILD_WITH_CODEGEN=$WALBERLA_BUILD_WITH_CODEGEN -DWALBERLA_BUILD_WITH_OPENMP=$WALBERLA_BUILD_WITH_OPENMP -DWALBERLA_BUILD_WITH_CUDA=$WALBERLA_BUILD_WITH_CUDA -DWALBERLA_LOGLEVEL=$WALBERLA_LOGLEVEL -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DWARNING_ERROR=ON -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
- cmake . -LA
- make -j $NUM_BUILD_CORES -l $NUM_CORES
- ctest -LE "$CTEST_EXCLUDE_LABELS|cuda" -C $CMAKE_BUILD_TYPE --output-on-failure -j $NUM_CORES -T Test
after_script:
- pip3 install lxml
- python3 cmake/ctest2junit.py build > report.xml
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "ON"
WALBERLA_DOUBLE_ACCURACY: "ON"
msvc-14.1_Serial_Dbg:
<<: *win_build_definition
WALBERLA_LOGLEVEL: "DETAIL"
tags:
- macmini
artifacts:
when: always
reports:
junit:
- report.xml
- python/report.xml
mac_Serial_Dbg:
extends: .mac_build_template
before_script:
- pip3 install pystencils==1.3.6
- pip3 install lbmpy==1.3.7
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CMAKE_BUILD_TYPE: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
msvc-14.1_Serial:
<<: *win_build_definition
WALBERLA_BUILD_WITH_PYTHON: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
mac_Serial:
extends: .mac_build_template
before_script:
- pip3 install pystencils==1.3.6
- pip3 install lbmpy==1.3.7
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "Release"
CMAKE_BUILD_TYPE: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "OFF"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
msvc-14.1_MpiOnly_Dbg:
<<: *win_build_definition
WALBERLA_BUILD_WITH_PYTHON: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
mac_MpiOnly_Dbg:
extends: .mac_build_template
before_script:
- pip3 install pystencils==1.3.6
- pip3 install lbmpy==1.3.7
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "DebugOptimized"
CMAKE_BUILD_TYPE: "DebugOptimized"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
msvc-14.1_MpiOnly:
<<: *win_build_definition
WALBERLA_BUILD_WITH_PYTHON: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
OMPI_MCA_btl: "self,tcp"
mac_MpiOnly:
extends: .mac_build_template
before_script:
- pip3 install pystencils==1.3.6
- pip3 install lbmpy==1.3.7
variables:
CMAKE_GENERATOR: "Visual Studio 15 2017 Win64"
BUILD_CONFIGURATION: "Release"
CMAKE_BUILD_TYPE: "Release"
CTEST_EXCLUDE_LABELS: "longrun"
WALBERLA_BUILD_WITH_MPI: "ON"
WALBERLA_BUILD_WITH_OPENMP: "OFF"
WALBERLA_DOUBLE_ACCURACY: "ON"
only:
- triggers
WALBERLA_BUILD_WITH_PYTHON: "ON"
WALBERLA_BUILD_WITH_CODEGEN: "ON"
OMPI_MCA_btl: "self,tcp"
###############################################################################
## ##
......@@ -1301,24 +2217,24 @@ msvc-14.1_MpiOnly:
dependencies: []
when: manual
only:
- master@walberla/walberla
- master@walberla/walberla
- tags@walberla/walberla
conda-py36-win-withoutLbm:
conda-py36-win:
<<: *conda_deploy_definition
tags:
- win
script:
- conda build --python=3.6 --user=lssfau utilities\\conda\\withoutLbm
conda-py35-win-withoutLbm:
- conda build --python=3.6 --user=lssfau utilities\\conda\\walberla
conda-py37-win:
<<: *conda_deploy_definition
tags:
- win
script:
- conda build --python=3.5 --user=lssfau utilities\\conda\\withoutLbm
- conda build --python=3.7 --user=lssfau utilities\\conda\\walberla
conda-py36-linux-withoutLbm:
conda-py37-linux:
<<: *conda_deploy_definition
tags:
- docker
......@@ -1326,9 +2242,9 @@ conda-py36-linux-withoutLbm:
script:
- apt-get update
- apt-get install -y build-essential
- conda build --python=3.6 --user=lssfau utilities/conda/withoutLbm
conda-py35-linux-withoutLbm:
- conda build --python=3.7 --user=lssfau utilities/conda/walberla
conda-py36-linux:
<<: *conda_deploy_definition
tags:
- docker
......@@ -1336,24 +2252,114 @@ conda-py35-linux-withoutLbm:
script:
- apt-get update
- apt-get install -y build-essential
- conda build --python=3.5 --user=lssfau utilities/conda/withoutLbm
conda-py36-linux-withLbm:
<<: *conda_deploy_definition
tags:
- docker
image: continuumio/miniconda3
- conda build --python=3.6 --user=lssfau utilities/conda/walberla
###############################################################################
## ##
## Benchmarks ##
## ##
###############################################################################
.benchmark_template: &benchmark_definition
script:
- apt-get update
- apt-get install -y build-essential
- conda build --python=3.6 --user=lssfau utilities/conda/withLbm
conda-py35-linux-withLbm:
<<: *conda_deploy_definition
- apt-get update --fix-missing
- apt-get install -y python3-influxdb python3-git
- $CXX --version
- cmake --version
- ccache --version
- mpirun --version
- export CCACHE_BASEDIR=$CI_PROJECT_DIR
- mkdir $CI_PROJECT_DIR/build
- cd $CI_PROJECT_DIR/build
- cmake .. -DWALBERLA_BUFFER_DEBUG=OFF -DWALBERLA_BUILD_TESTS=OFF -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=OFF -DWALBERLA_BUILD_TOOLS=OFF -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_CUDA=OFF -DWALBERLA_BUILD_WITH_PYTHON=OFF -DWALBERLA_BUILD_WITH_OPENMP=OFF -DCMAKE_BUILD_TYPE=RELEASE -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS -DWALBERLA_DOUBLE_ACCURACY=ON -DWARNING_ERROR=ON -DWALBERLA_BUILD_WITH_METIS=OFF -DWALBERLA_BUILD_WITH_PARMETIS=OFF -DWALBERLA_OPTIMIZE_FOR_LOCALHOST=ON -DWALBERLA_BUILD_WITH_FASTMATH=ON -DWALBERLA_BUILD_WITH_LTO=ON
- cmake . -LA
- cd apps/benchmarks/GranularGas
- make -j 20
- export PATH=$PATH:/usr/local/likwid/bin
- likwid-setFrequencies -t 0
- likwid-setFrequencies -g performance
- likwid-setFrequencies -f 3.3 # set frequency to 3.3
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --DEM --syncNextNeighbor | tee GranularGas_DEM_NN.txt
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --DEM --syncShadowOwners | tee GranularGas_DEM_SO.txt
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --HCSITS --syncNextNeighbor --InelasticFrictionlessContact | tee GranularGas_HCSITS_NN_IFC.txt
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --HCSITS --syncNextNeighbor --ApproximateInelasticCoulombContactByDecoupling | tee GranularGas_HCSITS_NN_AICCBD.txt
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --HCSITS --syncNextNeighbor --InelasticCoulombContactByDecoupling | tee GranularGas_HCSITS_NN_ICCBD.txt
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --HCSITS --syncNextNeighbor --InelasticGeneralizedMaximumDissipationContact | tee GranularGas_HCSITS_NN_IGMDC.txt
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./PE_GranularGas PE_Benchmark.cfg --HCSITS --syncShadowOwners --InelasticFrictionlessContact | tee GranularGas_HCSITS_SO_IFC.txt
- python3 pe_upload.py
- mpirun --allow-run-as-root -np 8 --map-by core --bind-to core --report-bindings ./MESA_PD_KernelBenchmark MESA_PD_Benchmark.cfg | tee mesa_pd.txt
- python3 mesa_pd_upload.py
when: manual
needs: [ ]
stage: benchmark
tags:
- docker
image: continuumio/miniconda3
script:
- apt-get update
- apt-get install -y build-essential
- conda build --python=3.5 --user=lssfau utilities/conda/withLbm
- docker-benchmark
artifacts:
paths:
- $CI_PROJECT_DIR/build/apps/benchmarks/GranularGas/*.txt
- $CI_PROJECT_DIR/build/apps/benchmarks/GranularGas/*.sqlite
benchmark_intel19:
<<: *benchmark_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/intel-2022
benchmark_gcc8:
<<: *benchmark_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/gcc-13
benchmark_clang8:
<<: *benchmark_definition
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17
benchmark_ClangBuildAnalyzer:
script:
- cmake --version
- ccache --version
- mpirun --version
- export CC=clang
- export CXX=clang++
- $CXX --version
- cd /tmp
- git clone https://github.com/aras-p/ClangBuildAnalyzer.git
- cd ClangBuildAnalyzer
- cmake .
- make
- export PATH+=:$(pwd)
- mkdir $CI_PROJECT_DIR/build
- cd $CI_PROJECT_DIR/build
- cmake .. -DWALBERLA_BUFFER_DEBUG=OFF -DWALBERLA_BUILD_TESTS=ON -DWALBERLA_BUILD_BENCHMARKS=ON -DWALBERLA_BUILD_TUTORIALS=ON -DWALBERLA_BUILD_TOOLS=OFF -DWALBERLA_BUILD_WITH_MPI=ON -DWALBERLA_BUILD_WITH_CUDA=OFF -DWALBERLA_BUILD_WITH_PYTHON=OFF -DWALBERLA_BUILD_WITH_OPENMP=OFF -DCMAKE_BUILD_TYPE=RELEASE -DMPIEXEC_PREFLAGS=$MPIEXEC_PREFLAGS -DWALBERLA_DOUBLE_ACCURACY=ON -DWARNING_ERROR=ON -DWALBERLA_BUILD_WITH_METIS=OFF -DWALBERLA_BUILD_WITH_PARMETIS=OFF -DWALBERLA_OPTIMIZE_FOR_LOCALHOST=ON -DWALBERLA_BUILD_WITH_FASTMATH=ON -DWALBERLA_BUILD_WITH_LTO=ON -DCMAKE_CXX_FLAGS=-ftime-trace -G Ninja
- cmake . -LA
- ClangBuildAnalyzer --start .
- ninja all
- ClangBuildAnalyzer --stop . CBA
- ClangBuildAnalyzer --analyze CBA
image: i10git.cs.fau.de:5005/walberla/buildenvs/clang-17
tags:
- docker-benchmark
only:
variables:
- $ENABLE_NIGHTLY_BUILDS
continuous_benchmark_trigger:
stage: benchmark
image: curlimages/curl
tags:
- docker
script:
- curl
--fail
--request POST
--form "token=$CB_TRIGGER_TOKEN"
--form "ref=master"
--form "variables[WALBERLA_GITLAB_INSTANCE]=https://$CI_SERVER_HOST"
--form "variables[WALBERLA_PROJECT_ID]=$CI_PROJECT_PATH"
--form "variables[WALBERLA_BRANCH]=$CI_COMMIT_BRANCH"
--form "variables[WALBERLA_COMMIT]=$CI_COMMIT_SHA"
"$CB_TRIGGER_API_URL"
rules:
- if: '$CI_PROJECT_PATH == "walberla/walberla" && $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
when: on_success
- if: $CI_PIPELINE_SOURCE != "merge_request_event"
when: manual
allow_failure: true
\ No newline at end of file
File moved
[settings]
line_length=100
balanced_wrapping=True
multi_line_output=4
known_third_party=sympy
List of contributors
====================
Cameron Stewart
Christian Feichtinger
Christian Godenschwager
Christoph Rettinger
Christoph Schwarzmeier
Daniel Ritter
Daniela Anderl
David Staubach
......@@ -11,22 +13,34 @@ Dominik Bartuschat
Ehsan Fattahi
Felix Winterhalter
Florian Schornbaum
Frederik Hennig
Grigorii Drozdov
Helen Schottenhamml
Igor Ostanin
Jan Götz
Jan Hönig
João Victor Tozatti Risso
Johannes Habich
Klaus Iglberger
Kristina Pickl
Lorenz Hufnagel
Lukas Werner
Markus Holzer
Martin Bauer
Matthias Markl
Michael Kuron
Nils Kohl
Paulo Carvalho
Philipp Suffa
Regina Ammer
Sagar Dolas
Sebastian Eibl
Silke Bergler
Simon Bogner
Stefan Donath
Stephan Seitz
Sunil Kontham
Tobias Leemann
Tobias Preclik
Tobias Scharpff
Tobias Schruff
# Changelog
## [6.1] - 2022-07-25
### Added
- Free-surface LBM extension:
- Add implementation
- Add several showcases
- Add several tests
- LBM - MESA_PD coupling:
- Add partially saturated cells method (PSM)
- Add fluidized bed showcase
- Add virtual mass stabilization technique for light particles
- Add support for more shapes, e.g., convex polyhedron
- MESA_PD:
- Add extensive application for dense particle packing generation
- AMD - HIP support
- Support of the ROCm Toolchain and thus AMD HIP as second GPU language
- All CUDA-related files, namespaces, folders, etc. are renamed to gpu.
- Include "GPUWrapper.h" to use general GPU functions, e.g., cudaMalloc -> gpuMalloc
- WALBERLA_BUILD_WITH_HIP and WALBERLA_BUILD_WITH_GPU_SUPPORT introduced as new CMake variables
### Changed
- Update and extend phase-field LBM showcases
- Allow access to PDF centering information (for being used in generated LBM kernels)
- Adapt code generation backend to be compatible with pystencils 1.0 and lbmpy 1.0
- Required minimum dependencies:
- C++17-compliant compiler
- CMake 3.14
- pybind 2.6.2
- lbmpy 1.0
- pystencils 1.0
### Deprecated
- GUI
## [5.1] - 2020-04-09
### Added
- Add new tutorials and showcases
- Extend MESA-PD functionalities, including several molecular dynamics models
- Fluid-particle coupling with MESA-PD: functionalities, tests, benchmark scenarios
### Changed
- Update to C++17
- Update CUDA compiler support
- Extend Clang-Tidy coverage
- Add closer integration of code generation using pystencils and lbmpy
- Python Coupling now build upon pybind11. Boost.Python is no longer supported
- lbm module dropped from python coupling due to deprecation for a long time
- geometry, postprocessing and timeloop dropped from python coupling due to its low usage
- PEP8-ification of Python API. This means all keyword arguments are now in snake_case and not in CamelCase as before.
### Fixed
- Guo force model for non-SRT, may change simulation results
## [4.1] - 2019-04-19
### Added
- Galerkin coarsening for Multigrid
- LBM-PE-Coupling:
- new coupling approach for unresolved particle interactions (discrete particle method)
- adaptive grid refinement for coupled simulations
- load balancing functionalities for coupled simulations
- module description
- integrated *pystencils* and *lbmpy* code generation for kernels and pack infos
- new GPU communication, including support for GPUDirect
- load balancing functionality for the pe
- implemented IProbe communication as an alternative to two message communication for unknown size communication
- new creation helpers for BlockForest
- Minor:
- dynamic load balancing now possible with levels ignored
- `ExtendedBoundaryHandlingFactory` now uses `ParserUBB` instead of `UBB` so that velocity profiles can be specified as an equation in a parameter file
- Enabled the body selection functions for PSM coupling method
- grid_generators now allow range based for loops
### Changed
- A compiler with full C++14 support is now required
- All Boost usage has been replaced with the corresponding standard library functionality, except for Boost.Python (used for the `python_coupling` module), Boost.PropertyTree (used in `config::configToBoostPropertyTree`) and Boost.Graph (used by `math::EquationSystem`). This usually means you need to replace `boost::` with `std::` in your code and change some `#include`s.
- API changes in blockforest::PhantomBlockForest, blockforest::loadbalancing, pe::amr::weight_assignment
- API change for vtk::VTKOutput::getFilenames
- made SendBuffer memory access more high level
- PE coupling:
- changed body mapping functions: removed most of them, added mapping-decider function, accommodated changes to test cases, added new mapping test case
- changed pe coupling mapping functions interfaces, added new mapping functions, adapted test cases and benchmarks
- change in lubrication correction functionality to not require the lattice model but use the dynamic viscosity directly
- PE:
- rebased Union on BodyStorage
- `pe::Union`, `boundary::BoundaryHandling` and `boundary::BoundaryHandlingCollection` are now variadic templates instead of taking a tuple of bodies/boundaries/handlers. This means that you need to replace `std::tuple<A,B>` with `A,B` in these cases.
- using smart pointers for all memory management
- made setMass protected
- extended DEM collision model with dt
- pe::createBlockForest changed to support a variable number of processes
- changed BodyStatistics to use shared_ptr instead of reference
### Removed
- Remove dependency resolution from Singleton
- from PE
- Node
- PtrVector
- contacts from RigidBody
- attached bodies
- attachables
### Fixed
- Fix implicit/explicit naming in pe::cr
### Deprecated
- all dynamic level-wise balance functions (use the more general ones, without "level-wise")
......@@ -5,7 +5,7 @@
## Contents:
## - definition of build options
## - compiler variables ( c++ standard, warnings etc. )
## - Finding of service libraries. Required: boost, Optional: MPI, PE, METIS
## - Finding of service libraries. Required: none, Optional: MPI, FFTW3, METIS, OpenMesh, Python
## the include paths are set, and the libraries are added to variable SERVICE_LIBS
## - Subdirectory cmake lists are called
## -> src/ this folder contains all modules, each module (that contains c or cpp files) is linked to a
......@@ -25,22 +25,18 @@
##
############################################################################################################################
CMAKE_MINIMUM_REQUIRED (VERSION 2.8)
CMAKE_MINIMUM_REQUIRED (VERSION 3.24)
PROJECT ( walberla )
set ( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${walberla_SOURCE_DIR}/cmake )
set ( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${walberla_SOURCE_DIR}/cmake ${walberla_SOURCE_DIR}/cmake/compileroptions )
include ( waLBerlaFunctions )
set_version( 4 0 )
set_version( 7 0 )
if( CMAKE_VERSION VERSION_LESS 2.8.3 )
include( CMakeParseArgumentsCompat )
else()
include( CMakeParseArguments )
endif()
include( CMakeParseArguments )
# Enable CTest
enable_testing()
......@@ -61,30 +57,31 @@ include( CTest )
# Build options
option ( WALBERLA_DOUBLE_ACCURACY "Floating point accuracy, defaults to double" ON )
option ( WALBERLA_ENABLE_GUI "Compile with GUI" )
option ( WALBERLA_BUILD_TESTS "Build Testcases" )
option ( WALBERLA_BUILD_TESTS "Build Testcases" ON )
option ( WALBERLA_BUILD_BENCHMARKS "Build Benchmarks" ON )
option ( WALBERLA_BUILD_TOOLS "Build Tools" )
option ( WALBERLA_BUILD_TOOLS "Build Tools" ON )
option ( WALBERLA_BUILD_TUTORIALS "Build Tutorials" ON )
option ( WALBERLA_BUILD_SHOWCASES "Build Showcases" ON )
option ( WALBERLA_BUILD_DOC "Build Documentation" ON )
option ( WALBERLA_BUILD_WITH_MPI "Build with MPI" ON )
option ( WALBERLA_BUILD_WITH_METIS "Build with metis graph partitioner" OFF )
option ( WALBERLA_BUILD_WITH_PARMETIS "Build with ParMetis graph partitioner" OFF )
option ( WALBERLA_BUILD_WITH_FFTW "Build with FFTW Fourier Transform library" OFF )
option ( WALBERLA_BUILD_WITH_GPROF "Enables gprof" )
option ( WALBERLA_BUILD_WITH_GCOV "Enables gcov" )
option ( WALBERLA_BUILD_WITH_LTO "Enable link time optimizations" )
option ( WALBERLA_BUILD_WITH_OPENMP "Enable OpenMP support" )
option ( WALBERLA_BUILD_WITH_PYTHON "Support for embedding Python" )
option ( WALBERLA_BUILD_WITH_PYTHON_MODULE "Build waLBerla python module" )
option ( WALBERLA_BUILD_WITH_PYTHON_LBM "Include LBM module into python module" OFF )
option ( WALBERLA_BUILD_WITH_CODEGEN "Enable pystencils code generation" OFF )
option ( WALBERLA_BUILD_WITH_LIKWID_MARKERS "Compile in markers for likwid-perfctr" )
option ( WALBERLA_BUILD_WITH_CUDA "Enable CUDA support" )
option ( WALBERLA_BUILD_WITH_HIP "Enable ROCm HIP support" )
option ( WALBERLA_BUILD_WITH_FASTMATH "Fast math" )
......@@ -93,15 +90,19 @@ option ( WALBERLA_SIMD_FORCE_SCALAR "Do not use SIMD operations even whe
option ( WALBERLA_BUFFER_DEBUG "Type checking for BufferSystem ( slow )" OFF )
option ( WALBERLA_NO_OUTDATED_FEATURES "Show warning/errors when outdated features "
"(i.e. features that will be deprecated) are used" )
# Profile guided optimization
option ( WALBERLA_PROFILE_GENERATE "Generates Profile for Optimization" )
option ( WALBERLA_PROFILE_USE "Uses Profile to optimize" )
option ( WALBERLA_PROFILE_GENERATE "Generates Profile for Optimization" )
option ( WALBERLA_PROFILE_USE "Uses Profile to optimize" )
# Compiler Optimization
option ( WALBERLA_OPTIMIZE_FOR_LOCALHOST "Enable compiler optimizations specific to localhost" )
option ( WALBERLA_OPTIMIZE_FOR_LOCALHOST "Enable compiler optimizations specific to localhost" )
option ( WALBERLA_LOG_SKIPPED "Log skipped cmake targets" ON )
option ( WALBERLA_DEPS_ERROR "Fail if module dependencies are not met" OFF )
option ( WALBERLA_GIT_SUBMODULE_AUTO "Check submodules during cmake run" ON )
option ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT "Experimental half precision support" OFF )
# Installation Directory
set ( CMAKE_INSTALL_PREFIX /usr/local/waLBerla CACHE STRING "The default installation directory." )
......@@ -111,6 +112,7 @@ if ( NOT CMAKE_BUILD_TYPE )
set ( CMAKE_BUILD_TYPE Release CACHE STRING "Build Types: Debug Release DebugOptimized RelWithDebInfo MinSizeRel." FORCE )
endif()
SET_PROPERTY( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release DebugOptimized RelWithDebInfo MinSizeRel )
set( CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE} )
# Debugging options )
option ( WALBERLA_STL_BOUNDS_CHECKS "Use debug capabilites of libstd++: iterator and bounds checks" )
......@@ -132,8 +134,16 @@ list( APPEND WALBERLA_MODULE_DIRS "${walberla_SOURCE_DIR}/src" "${walberla_SOURC
list( REMOVE_DUPLICATES WALBERLA_MODULE_DIRS )
set ( WALBERLA_MODULE_DIRS ${WALBERLA_MODULE_DIRS} CACHE INTERNAL "All folders that contain modules or tests" )
# target_link_libraries needs to called with keywords everywhere if it is called with keywords once
if( DEFINED CUDA_LINK_LIBRARIES_KEYWORD AND NOT CUDA_LINK_LIBRARIES_KEYWORD STREQUAL "" )
set( WALBERLA_LINK_LIBRARIES_KEYWORD PUBLIC )
endif()
############################################################################################################################
set( CMAKE_CXX_STANDARD 17 )
set( CMAKE_CXX_STANDARD_REQUIRED ON )
set( CMAKE_CXX_EXTENSIONS OFF )
############################################################################################################################
##
......@@ -165,6 +175,11 @@ if( CMAKE_CXX_COMPILER MATCHES "icpc" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "icpc"
SET(CMAKE_LINKER "${XILD}")
ENDIF(XILD)
MARK_AS_ADVANCED(XILD)
if( CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19.0.5" )
# std::filesystem uses ABI tags, which don't work in 19.0.2 but do in 19.0.5
add_flag ( CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0" )
endif()
else()
option ( WALBERLA_CXX_COMPILER_IS_INTEL "Use Intel compiler" OFF )
endif()
......@@ -173,6 +188,7 @@ mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_INTEL )
# Check for Gnu compiler
if ( CMAKE_COMPILER_IS_GNUCXX AND NOT WALBERLA_CXX_COMPILER_IS_INTEL )
option ( WALBERLA_CXX_COMPILER_IS_GNU "Use gnu compiler" ON )
include(GNU)
else()
option ( WALBERLA_CXX_COMPILER_IS_GNU "Use gnu compiler" OFF )
endif()
......@@ -181,6 +197,7 @@ mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_GNU )
# Check for Visual Studio
if ( MSVC )
option ( WALBERLA_CXX_COMPILER_IS_MSVC "Use Visual Studio compiler" ON )
include( MSVC )
else()
option ( WALBERLA_CXX_COMPILER_IS_MSVC "Use Visual Studio compiler" OFF )
endif()
......@@ -189,37 +206,38 @@ mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_MSVC )
# Check for IBM compiler
if( CMAKE_CXX_COMPILER MATCHES "xlc" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "xlc" )
option ( WALBERLA_CXX_COMPILER_IS_IBM "Use IBM compiler" ON )
include(IBM)
else()
option ( WALBERLA_CXX_COMPILER_IS_IBM "Use IBM compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_IBM )
# Check for NEC SX compiler
if( CMAKE_CXX_COMPILER MATCHES "sxc" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "sxc" OR CMAKE_CXX_COMPILER MATCHES "sxmpic" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "sxmpic" )
option ( WALBERLA_CXX_COMPILER_IS_NEC "Use NEC compiler" ON )
else()
option ( WALBERLA_CXX_COMPILER_IS_NEC "Use NEC compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_NEC )
# Check for Clang compiler
if( CMAKE_CXX_COMPILER MATCHES "clang" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
if( CMAKE_CXX_COMPILER MATCHES "clang" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "clang" OR CMAKE_CXX_COMPILER MATCHES "hipcc" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
option ( WALBERLA_CXX_COMPILER_IS_CLANG "Use clang compiler" ON )
include(Clang)
else()
option ( WALBERLA_CXX_COMPILER_IS_CLANG "Use clang compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_CLANG )
# Check for Cray compiler
if( CMAKE_CXX_COMPILER_ID MATCHES Cray )
option ( WALBERLA_CXX_COMPILER_IS_CRAY "Use Cray compiler" ON )
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8.4)
message( FATAL_ERROR "Insufficient Cray Compiler Environment version" )
endif()
include(Cray)
else()
option ( WALBERLA_CXX_COMPILER_IS_CRAY "Use Cray compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_CRAY )
# Check for Fujitsu compiler
if( CMAKE_CXX_COMPILER_ID MATCHES FujitsuClang )
option ( WALBERLA_CXX_COMPILER_IS_FUJITSUCLANG "Use FujitsuClang compiler" ON )
include(FujitsuClang)
else()
option ( WALBERLA_CXX_COMPILER_IS_FUJITSUCLANG "Use FujitsuClang compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_FUJITSUCLANG )
# Check for MPI wrapper
get_filename_component( CXX_COMPILER_WITHOUT_PATH ${CMAKE_CXX_COMPILER} NAME )
if( CXX_COMPILER_WITHOUT_PATH MATCHES "mpi" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "mpi" )
......@@ -229,6 +247,15 @@ else()
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_MPI_WRAPPER )
# Check for intel llvm compiler
if( CMAKE_CXX_COMPILER MATCHES "icpx" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "icpx" )
option ( WALBERLA_CXX_COMPILER_IS_INTELLLVM "Use Intel LLVM compiler" ON )
include(IntelLLVM)
else()
option ( WALBERLA_CXX_COMPILER_IS_INTELLLVM "Use Intel LLVM compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_INTELLLVM )
############################################################################################################################
......@@ -239,175 +266,7 @@ mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_MPI_WRAPPER )
##
############################################################################################################################
# Profile guided optimization
if ( WALBERLA_PROFILE_GENERATE )
if( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag( CMAKE_CXX_FLAGS "-prof-gen" )
file( MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/profile" )
add_flag( CMAKE_CXX_FLAGS "-prof-dir${CMAKE_BINARY_DIR}/profile" )
elseif( WALBERLA_CXX_COMPILER_IS_GNU )
add_flag( CMAKE_CXX_FLAGS "-fprofile-generate" )
elseif( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag ( CMAKE_CXX_FLAGS "/GL" )
add_flag ( CMAKE_MODULE_LINKER_FLAGS "/LTCG:PGINSTRUMENT" )
add_flag ( CMAKE_SHARED_LINKER_FLAGS "/LTCG:PGINSTRUMENT" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "/LTCG:PGINSTRUMENT" )
endif()
endif()
if ( WALBERLA_PROFILE_USE )
if( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag( CMAKE_CXX_FLAGS "-prof-use" )
add_flag( CMAKE_CXX_FLAGS "-prof-dir${CMAKE_BINARY_DIR}/profile" )
elseif( WALBERLA_CXX_COMPILER_IS_GNU )
add_flag( CMAKE_CXX_FLAGS "-fprofile-use" )
elseif( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag ( CMAKE_CXX_FLAGS "/GL" )
add_flag ( CMAKE_MODULE_LINKER_FLAGS "/LTCG:PGOPTIMIZE" )
add_flag ( CMAKE_SHARED_LINKER_FLAGS "/LTCG:PGOPTIMIZE" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "/LTCG:PGOPTIMIZE" )
endif()
endif()
# common flags for intel and g++
if( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_INTEL )
if( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7 )
add_flag ( CMAKE_CXX_FLAGS "-std=c++0x" )
else()
add_flag ( CMAKE_CXX_FLAGS "-std=c++11" )
endif()
#add_flag ( CMAKE_C_FLAGS "-std=c99" )
add_flag ( CMAKE_CXX_FLAGS "-Wall -Wconversion -Wshadow" )
endif()
# C++11 language features for IBM compiler
if( WALBERLA_CXX_COMPILER_IS_IBM )
add_flag ( CMAKE_CXX_FLAGS "-qlanglvl=autotypededuction -qlanglvl=decltype -qlanglvl=static_assert -qlanglvl=rightanglebracket -qlanglvl=c99longlong" )
endif()
# C++11 language features for Cray compiler
if( WALBERLA_CXX_COMPILER_IS_CRAY )
add_flag ( CMAKE_CXX_FLAGS "-hstd=c++11" )
endif()
# C++11 language features for NEC compiler
if( WALBERLA_CXX_COMPILER_IS_NEC )
add_flag ( CMAKE_CXX_FLAGS "-Kcpp11 -Krtti -Kexceptions -size_t64 -Kgcc" )
add_flag ( CMAKE_CXX_FLAGS "-D__BIG_ENDIAN -D__BYTE_ORDER=__BIG_ENDIAN" )
add_flag ( CMAKE_CXX_FLAGS "-Tnoauto,used" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "-Wl,-h,muldefs" )
add_flag ( CMAKE_C_FLAGS "-size_t64 -Kgcc" )
add_flag ( CMAKE_C_FLAGS "-D__BIG_ENDIAN -D__BYTE_ORDER=__BIG_ENDIAN" )
add_flag ( CMAKE_C_FLAGS "-DSQLITE_OMIT_WAL -DHAVE_UTIME -DTHREADSAFE=0" )
set( CMAKE_RANLIB /bin/true )
set( CMAKE_SKIP_BUILD_RPATH TRUE )
set( CMAKE_C_FLAGS_DEBUGOPTIMIZED "-Chopt -g" )
set( CMAKE_C_FLAGS_DEBUG "-Cdebug -g" )
set( CMAKE_CXX_FLAGS_DEBUGOPTIMIZED "-Chopt -g" )
set( CMAKE_CXX_FLAGS_DEBUG "-Cdebug -g" )
endif()
# Fixes linker errors with IBM compiler
if( WALBERLA_CXX_COMPILER_IS_IBM )
add_flag ( CMAKE_CXX_FLAGS "-qpic=large" )
endif()
# Fixes linker errors with Cray compiler
if( WALBERLA_CXX_COMPILER_IS_CRAY )
add_flag ( CMAKE_EXE_LINKER_FLAGS "-dynamic -L/opt/gcc/4.9.3/snos/lib64" )
endif()
# Silences compiler and linker warnings and information with the IBM compiler
if( WALBERLA_CXX_COMPILER_IS_IBM )
add_flag ( CMAKE_CXX_FLAGS "-qsuppress=1586-267" ) # 1586-267 (I) Inlining of specified subprogram failed due to the presence of a C++ exception handler
add_flag ( CMAKE_CXX_FLAGS "-qsuppress=1586-266" ) # 1586-266 (I) Inlining of specified subprogram failed due to the presence of a global label
add_flag ( CMAKE_CXX_FLAGS "-qsuppress=1540-0724" ) # 1540-0724 (W) The non-type template argument "2147483648" of type "T" has wrapped [coming from boost/integer_traits.hpp]
add_flag ( CMAKE_CXX_FLAGS "-qsuppress=1540-0095" ) # 1540-0095 (W) The friend function declaration ... [coming from boost/mpl/map/aux_/map0.hpp]
add_flag ( CMAKE_CXX_FLAGS "-qsuppress=1500-030" ) # 1500-030: (I) INFORMATION: [...] Additional optimization may be attained by recompiling and specifying MAXMEM option with a value greater than 8192.
add_flag ( CMAKE_C_FLAGS "-qsuppress=1500-030" ) # 1500-030: (I) INFORMATION: [...] Additional optimization may be attained by recompiling and specifying MAXMEM option with a value greater than 8192.
endif()
# Silences compiler and linker warnings and information with the Cray compiler
if( WALBERLA_CXX_COMPILER_IS_CRAY )
set( CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem " )
add_flag ( CMAKE_CXX_FLAGS "-h nomessage=1" ) # CC-1 The source file does not end with a new-line character.
add_flag ( CMAKE_C_FLAGS "-DSQLITE_HAVE_ISNAN" ) # SQLite will not work correctly with the -ffast-math option of GCC.
add_flag ( CMAKE_CXX_FLAGS "-DSQLITE_HAVE_ISNAN" ) # SQLite will not work correctly with the -ffast-math option of GCC.
endif()
# architecture optimization
if( WALBERLA_OPTIMIZE_FOR_LOCALHOST )
if( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_INTEL OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag ( CMAKE_CXX_FLAGS "-march=native" )
add_flag ( CMAKE_C_FLAGS "-march=native" )
if( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag ( CMAKE_CXX_FLAGS "-xhost" )
add_flag ( CMAKE_C_FLAGS "-xhost" )
endif()
endif()
endif()
# warning flags
if( WALBERLA_CXX_COMPILER_IS_INTEL )
# system headers are also supported by intel, but cmake does not recognize that
set( CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem " )
add_flag ( CMAKE_CXX_FLAGS "-wd2928,2504,2259,1682,597" )
elseif( WALBERLA_CXX_COMPILER_IS_GNU )
add_flag ( CMAKE_CXX_FLAGS "-Wfloat-equal -Wextra" )
elseif( WALBERLA_CXX_COMPILER_IS_NEC )
add_flag ( CMAKE_CXX_FLAGS "-wall" )
endif()
if ( WARNING_PEDANTIC AND WALBERLA_CXX_COMPILER_IS_GNU )
add_flag ( CMAKE_CXX_FLAGS "-pedantic" )
endif ( )
# omit deprecated warnings
if( NOT WARNING_DEPRECATED)
if( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag( CMAKE_CXX_FLAGS "-wd1478" ) # Disable compiler warning # 1478: "declared as deprecated"
elseif( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag ( CMAKE_CXX_FLAGS "-Wno-deprecated-declarations")
endif()
endif()
# Treat warnings as errors
if ( WARNING_ERROR )
if( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_INTEL OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag ( CMAKE_CXX_FLAGS "-pedantic-errors -Werror" )
elseif( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag ( CMAKE_CXX_FLAGS "/WX" )
elseif ( WALBERLA_CXX_COMPILER_IS_CRAY )
add_flag ( CMAKE_CXX_FLAGS "-h error_on_warning" )
endif()
endif ( )
if ( WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag ( CMAKE_CXX_FLAGS "-Wall -Wconversion -Wshadow -Wno-c++11-extensions -std=c++11 -Qunused-arguments" )
add_flag ( CMAKE_CXX_FLAGS "-D'_LIBCPP_EXTERN_TEMPLATE(...)='")
endif ( )
if( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_INTEL OR WALBERLA_CXX_COMPILER_IS_CLANG )
if ( WALBERLA_STL_BOUNDS_CHECKS )
add_definitions ( "-D_GLIBCXX_DEBUG" )
endif()
endif()
#fastmath
if ( WALBERLA_BUILD_WITH_FASTMATH )
if ( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag( CMAKE_CXX_FLAGS "-fp-model fast=2 -no-prec-sqrt -no-prec-div" )
endif()
if ( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag( CMAKE_CXX_FLAGS "-ffast-math")
endif()
if( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag( CMAKE_CXX_FLAGS "/fp:fast" )
endif()
endif()
# Xcode generator disables -isystem flag, even though current versions of Xcode support it
if(CMAKE_GENERATOR STREQUAL "Xcode")
......@@ -416,86 +275,35 @@ if(CMAKE_GENERATOR STREQUAL "Xcode")
endif()
#GCC 5+ ABI selection
if( WALBERLA_CXX_COMPILER_IS_GNU )
if( NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0.0 )
option ( WALBERLA_USE_CPP11_ABI "On GCC 5+ use the C++11 ABI" ON )
if( WALBERLA_USE_CPP11_ABI )
add_flag( CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1" )
else()
add_flag( CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0" )
endif()
endif()
endif()
# disable Xcode 7.3+ linker deduplication pass to speed up linking in debug mode
if ( APPLE )
execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE LINKER_VERSION ERROR_VARIABLE LINKER_VERSION )
string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} )
string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} )
list( GET LINKER_VERSION 0 LINKER_TYPE )
list( GET LINKER_VERSION 1 LINKER_VERSION )
if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 )
add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
endif()
endif()
#if ( APPLE )
# execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE LINKER_VERSION ERROR_VARIABLE LINKER_VERSION )
# string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} )
# string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} )
# list( GET LINKER_VERSION 0 LINKER_TYPE )
# list( GET LINKER_VERSION 1 LINKER_VERSION )
# if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 )
# add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
# add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
# add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
# endif()
#endif()
############################################################################################################################
############################################################################################################################
##
## Visual Studio Setup
## Python
##
############################################################################################################################
if ( WALBERLA_CXX_COMPILER_IS_MSVC )
string( REGEX REPLACE "[/-]W[0-4]" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ) # remove default warning flags
option ( WALBERLA_GROUP_PROJECTS "Flag if the projects are grouped or in a flat hierarchy" ON )
option ( WALBERLA_GROUP_FILES "Flag if the files are grouped or in a flat hierarchy" ON )
set_property ( GLOBAL PROPERTY USE_FOLDERS ${WALBERLA_GROUP_PROJECTS} )
option ( WALBERLA_VS_MULTI_PROCESS_BUILD "Use the /mp option for VS builds" ON )
if( WALBERLA_VS_MULTI_PROCESS_BUILD )
add_flag ( CMAKE_CXX_FLAGS "-MP" ) # enable multi-threaded compiling
endif()
add_definitions ( "-DNOMINMAX" ) # Disable Min/Max-Macros
add_definitions ( "-D_WIN32_WINNT=0x501" ) # Minimum Windows version is Windows XP
add_definitions ( "-DWINVER=0x501" ) # Minimum Windows version is Windows XP
add_definitions ( "-D_CRT_SECURE_NO_WARNINGS" ) # disable warnings promoting Microsoft's security enhanced CRT
add_definitions ( "-D_SCL_SECURE_NO_WARNINGS" ) # disable warnings triggered by Microsoft's checked iterators
add_flag ( CMAKE_CXX_FLAGS "-W4" ) # set warning level to maximum
add_flag ( CMAKE_CXX_FLAGS "-bigobj" ) # enable big object files
add_flag ( CMAKE_CXX_FLAGS "-wd4127" ) # disable compiler warning C4127: "conditional expression is constant"
add_flag ( CMAKE_CXX_FLAGS "-wd4512" ) # disable compiler warning C4512: "assignment operator could not be generated"
add_flag ( CMAKE_CXX_FLAGS "-wd4913" ) # disable compiler warning C4913: "user defined binary operator ',' exists but
# no overload could convert all operands, default built-in binary operator ','
# used"
add_flag ( CMAKE_CXX_FLAGS "-wd4702" ) # disable compiler warning C4702: "unreachable code"
add_flag ( CMAKE_CXX_FLAGS "-wd4505" ) # disable compiler warning C4505: "unreferenced local function has been removed"
add_flag ( CMAKE_CXX_FLAGS "-wd4503" ) # disable compiler warning C4503: "'identifier' : decorated name length exceeded, name was truncated"
if ( WARNING_ERROR )
add_flag ( CMAKE_CXX_FLAGS "-WX" ) # Treat warnings as errors
endif ( )
if( NOT WARNING_DEPRECATED)
add_definitions( "-D_CRT_SECURE_NO_DEPRECATE" )
add_definitions( "-D_SCL_SECURE_NO_DEPRECATE" )
add_flag ( CMAKE_CXX_FLAGS "-wd4996" ) # Disable compiler warning C4996: "declared as deprecated"
endif()
endif ( )
############################################################################################################################
#############################################################################################################################
if ( WALBERLA_BUILD_WITH_CODEGEN OR WALBERLA_BUILD_WITH_PYTHON )
cmake_policy( SET CMP0094 NEW )
set( Python_FIND_FRAMEWORK LAST )
find_package( Python COMPONENTS Interpreter Development )
endif ()
############################################################################################################################
##
......@@ -503,12 +311,28 @@ endif ( )
##
#############################################################################################################################
if ( WALBERLA_BUILD_WITH_CODEGEN )
find_package( PythonInterp 3 QUIET REQUIRED)
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import pystencils_walberla" RESULT_VARIABLE PYTHON_RET_CODE)
if(NOT PYTHON_RET_CODE EQUAL 0)
message(FATAL_ERROR "WALBERLA_BUILD_WITH_CODEGEN activated and pystencils_walberla package not found")
set(LBMPY_MIN_VERSION 1.3.7)
execute_process(COMMAND ${Python_EXECUTABLE} -c "import lbmpy; print(lbmpy.__version__)"
RESULT_VARIABLE LBMPY_FOUND OUTPUT_VARIABLE LBMPY_VERSION)
if(NOT LBMPY_FOUND EQUAL 0)
message(FATAL_ERROR "WALBERLA_BUILD_WITH_CODEGEN activated but pystencils or lbmpy package not found.
Please install lbmpy e.g.: 'pip3 install lbmpy'")
elseif(LBMPY_VERSION VERSION_LESS LBMPY_MIN_VERSION)
string(STRIP ${LBMPY_VERSION} LBMPY_VERSION_STRIP)
message(WARNING
"lbmpy version ${LBMPY_VERSION_STRIP} was found.\n"
"We recommend to use at least version ${LBMPY_MIN_VERSION}.")
endif()
execute_process(COMMAND ${Python_EXECUTABLE} -c "from pystencils.include import get_pystencils_include_path; print(get_pystencils_include_path())"
OUTPUT_VARIABLE PYSTENCILS_INCLUDE_PATH)
include_directories( ${PYSTENCILS_INCLUDE_PATH} )
execute_process(COMMAND ${Python_EXECUTABLE} -c "import jinja2"
RESULT_VARIABLE JINJA2_FOUND )
if(NOT JINJA2_FOUND EQUAL 0)
message(FATAL_ERROR "WALBERLA_BUILD_WITH_CODEGEN activated and jinja2 package not found.
Please install jinja2 e.g.: 'pip3 install jinja2'")
endif()
endif()
############################################################################################################################
......@@ -522,205 +346,49 @@ endif()
#############################################################################################################################
if ( WALBERLA_BUILD_WITH_PYTHON )
set ( waLBerla_REQUIRED_MIN_PYTHON_VERSION "2.7")
find_package( PythonInterp 3 QUIET) # search for Python3 first
find_package( PythonInterp QUIET) # fallback to any Python version
find_package( PythonLibs QUIET REQUIRED)
if( PYTHONLIBS_VERSION_STRING VERSION_LESS ${waLBerla_REQUIRED_MIN_PYTHON_VERSION} )
message( FATAL_ERROR "Found old python library: ${PYTHONLIBS_VERSION_STRING} need at least ${waLBerla_REQUIRED_MIN_PYTHON_VERSION}" )
endif()
option( WALBERLA_USE_PYTHON_DEBUG_LIBRARY "Make use of the python debug library" OFF )
if( WALBERLA_USE_PYTHON_DEBUG_LIBRARY )
# you have to make sure this matches the settings you compiled boost with!
add_definitions( "-DBOOST_DEBUG_PYTHON" )
endif()
if( NOT (PYTHON_LIBRARY AND PYTHON_INCLUDE_DIR ) )
message( FATAL_ERROR "Couldn't find any python library" )
endif()
SET( WALBERLA_BUILD_WITH_PYTHON 1 )
include_directories( ${PYTHON_INCLUDE_DIR} )
list ( APPEND SERVICE_LIBS ${PYTHON_LIBRARY} )
if( NOT WALBERLA_CXX_COMPILER_IS_MSVC )
list ( APPEND SERVICE_LIBS -lutil )
endif()
if ( WALBERLA_BUILD_WITH_PYTHON_MODULE )
# a python module is a shared library - so everything has to be compiled to position independent code
# otherwise linking the static libs into the shared lib will result in errors
if( NOT WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag ( CMAKE_CXX_FLAGS "-fPIC" )
add_flag ( CMAKE_C_FLAGS "-fPIC" )
endif()
endif()
if( MSVC10 )
include(CMakeDependentOption)
CMAKE_DEPENDENT_OPTION( PYTHON_FIXED_HYPOT_REDEFINITION "fixed _hypot redefinition by python" OFF "WALBERLA_BUILD_WITH_PYTHON" OFF )
if( NOT PYTHON_FIXED_HYPOT_REDEFINITION )
message( WARNING "Make sure you modified your pyconfig.h that _hypot is not redefined -> see: http://connect.microsoft.com/VisualStudio/feedback/details/633988/warning-in-math-h-line-162-re-nonstandard-extensions-used" )
endif()
endif()
# Sphinx documentation
# to build documentation make sure to have sphinx and read-the-docs theme installed
# Install with: "pip install sphinx sphinx_rtd_theme"
add_custom_target( docPython sphinx-build -b html "${walberla_SOURCE_DIR}/python/waLBerla_docs" "${walberla_BINARY_DIR}/doc/python"
COMMENT "Building HTML documentation for Python extension with Sphinx")
endif()
############################################################################################################################
##
## BOOST Libraries
##
#############################################################################################################################
# Minimum Boost version accepted by the find_package calls below.
set ( waLBerla_REQUIRED_MIN_BOOST_VERSION "1.48")
option ( WALBERLA_BUILD_WITH_BOOST_THREAD "Build with boost thread library support" ON )
# There have been problems with the IBM compiler and boost thread, so WALBERLA_BUILD_WITH_BOOST_THREAD is disabled by default for this compiler
if( WALBERLA_CXX_COMPILER_IS_IBM )
set ( WALBERLA_BUILD_WITH_BOOST_THREAD OFF CACHE BOOL "Build with boost thread library support" FORCE )
endif()
# Base component set; boost.thread (and its chrono dependency) only when enabled.
if ( WALBERLA_BUILD_WITH_BOOST_THREAD )
list ( APPEND waLBerla_REQUIRED_BOOST_COMPONENTS chrono filesystem system thread )
else ()
list ( APPEND waLBerla_REQUIRED_BOOST_COMPONENTS chrono filesystem system )
endif ()
# boost.regex is only needed where std::regex is unavailable/broken (IBM, gcc 4.7, clang < 3.5).
if (( WALBERLA_CXX_COMPILER_IS_IBM ) OR
( WALBERLA_CXX_COMPILER_IS_GNU AND CMAKE_CXX_COMPILER_VERSION EQUAL 4.7 ) OR
( WALBERLA_CXX_COMPILER_IS_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.5 ) )
list ( APPEND waLBerla_REQUIRED_BOOST_COMPONENTS regex)
endif()
# MSVC builds with Python need the boost.python3 component explicitly.
if ( WALBERLA_BUILD_WITH_PYTHON AND WALBERLA_CXX_COMPILER_IS_MSVC )
list( APPEND waLBerla_REQUIRED_BOOST_COMPONENTS python3 )
endif()
include(FetchContent)
# This variable is necessary, if the CMAKE version used is not aware of a more recent boost version (keep this up to date!)
set ( Boost_ADDITIONAL_VERSIONS
"1.45" "1.45.0" "1.46" "1.46.0" "1.46.1" "1.47" "1.47.0" "1.48" "1.48.0" "1.49" "1.49.0"
"1.50" "1.50.0" "1.51" "1.51.0" "1.52" "1.52.0" "1.53" "1.53.0" "1.54" "1.54.0" "1.55" "1.55.0"
"1.56" "1.56.0" "1.57" "1.57.0" "1.58" "1.58.0" "1.59" "1.59.0" "1.60" "1.60.0" "1.61" "1.61.0" "1.62" "1.62.0" "1.63" "1.63.0")
# Default link configuration; the fallback loop below may overwrite these on failure.
set ( Boost_USE_STATIC_LIBS OFF CACHE BOOL "Use boost static libraries" )
set ( Boost_USE_MULTITHREADED OFF CACHE BOOL "Use boost multithreaded libraries" )
set ( Boost_USE_STATIC_RUNTIME OFF CACHE BOOL "Use boost libraries statically linked to runtime libs" )
# if you defined BOOST_ROOT or BOOST_BASE in your environment use it here to find boost too
if ( NOT BOOST_ROOT )
foreach ( var BOOST_ROOT BOOST_BASE )
if ( NOT "$ENV{${var}}" STREQUAL "" )
message ( STATUS "Use environment boost directory: $ENV{${var}}" )
set ( BOOST_ROOT $ENV{${var}} CACHE INTERNAL "")
break ( )
endif ( )
endforeach ( )
endif ( )
# Register the pybind11 sources for on-demand download.
# The actual population happens later via FetchContent_MakeAvailable(pybind11).
FetchContent_Declare( pybind11
                      GIT_REPOSITORY https://github.com/pybind/pybind11.git
                      GIT_TAG        v2.13.6 )
# First attempt with the user-selected (or default) static/multithreaded configuration.
find_package ( Boost ${waLBerla_REQUIRED_MIN_BOOST_VERSION} COMPONENTS ${waLBerla_REQUIRED_BOOST_COMPONENTS} QUIET )
if( NOT Boost_FOUND )
message ( WARNING
"The specified configuration of the BOOST libraries was not found on your system! Now trying some other configuration..." )
# Brute-force all four static/multithreaded combinations until one is found.
foreach ( Boost_USE_STATIC_LIBS ON OFF )
foreach ( Boost_USE_MULTITHREADED ON OFF )
find_package ( Boost ${waLBerla_REQUIRED_MIN_BOOST_VERSION} COMPONENTS ${waLBerla_REQUIRED_BOOST_COMPONENTS} QUIET )
if ( Boost_FOUND )
# Persist the working combination in the cache for subsequent configure runs.
set ( Boost_USE_STATIC_LIBS ${Boost_USE_STATIC_LIBS} CACHE BOOL "Use boost static libraries" FORCE )
set ( Boost_USE_MULTITHREADED ${Boost_USE_MULTITHREADED} CACHE BOOL "Use boost multithreaded libraries" FORCE )
set ( Boost_USE_MULTITHREADED_LIBRARY ${Boost_USE_MULTITHREADED} )
message ( STATUS "Working configuration of the BOOST libraries was found :o)!" )
message ( STATUS "Boost_USE_STATIC_LIBS and Boost_USE_MULTITHREADED was adapted accordingly." )
BREAK ( )
endif ( Boost_FOUND )
endforeach ( Boost_USE_MULTITHREADED )
if ( Boost_FOUND )
BREAK ( )
endif ( Boost_FOUND )
endforeach ( Boost_USE_STATIC_LIBS )
endif ( NOT Boost_FOUND )
if ( Boost_FOUND )
if(CMAKE_GENERATOR STREQUAL "Xcode")
# this is needed because the SYSTEM flag to include_directories does not work
add_flag ( CMAKE_CXX_FLAGS "-isystem ${Boost_INCLUDE_DIRS}" )
else()
include_directories ( SYSTEM ${Boost_INCLUDE_DIRS} )
endif()
link_directories ( ${Boost_LIBRARY_DIRS} )
list ( APPEND SERVICE_LIBS ${Boost_LIBRARIES} )
add_definitions ( -DBOOST_ALL_NO_LIB ) # Disable Boost auto-linking (CMAKE does that for us...)
# fix for static lib usage: http://stackoverflow.com/questions/11812463/boost-python-link-errors-under-windows-msvc10
if( PYTHONLIBS_FOUND AND Boost_USE_STATIC_LIBS)
add_definitions( -DBOOST_PYTHON_STATIC_LIB )
endif()
FetchContent_MakeAvailable(pybind11)
# fix for strange link behaviour of boost to python: boost only links to 'python*.lib' and not to the absolute path
if( WIN32 AND PYTHONLIBS_FOUND )
get_filename_component( PYTHON_LIBRARY_DIR ${PYTHON_INCLUDE_DIR} PATH )
link_directories( ${PYTHON_LIBRARY_DIR}/libs )
list( APPEND LINK_DIRS ${PYTHON_LIBRARY_DIR}/libs )
endif()
# a python module is a shared library - so everything has to be compiled to position independent code
# otherwise linking the static libs into the shared lib will result in errors
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else( Boost_FOUND )
# Search again, this time with the REQUIRED option. This will give a CMAKE error and a detailed error message for the user
find_package ( Boost ${waLBerla_REQUIRED_MIN_BOOST_VERSION} REQUIRED ${waLBerla_REQUIRED_BOOST_COMPONENTS} )
endif( Boost_FOUND )
# Check if Python3 was found and look for the matching boost.python library variant.
if ( WALBERLA_BUILD_WITH_PYTHON AND NOT WALBERLA_CXX_COMPILER_IS_MSVC)
# Library name suffix: distros append "-mt" for multithreaded Boost builds.
SET(_boost_MULTITHREADED "")
if (Boost_USE_MULTITHREADED OR Boost_USE_MULTITHREADED_LIBRARY)
SET(_boost_MULTITHREADED "-mt")
endif()
# Probe the various per-Python-minor-version library names used by different distros,
# newest first, restricted to the discovered Boost library directory.
if( PYTHON_LIBRARY MATCHES "python3" )
find_library( BOOST_PYTHON_LIBRARY NAMES
boost_python-py36${_boost_MULTITHREADED} boost_python-py35${_boost_MULTITHREADED}
boost_python-py34${_boost_MULTITHREADED} boost_python-py33${_boost_MULTITHREADED}
boost_python-py32${_boost_MULTITHREADED} boost_python3${_boost_MULTITHREADED}
boost_python${_boost_MULTITHREADED}
PATHS ${Boost_LIBRARY_DIRS} NO_DEFAULT_PATH )
else()
find_library( BOOST_PYTHON_LIBRARY NAMES boost_python${_boost_MULTITHREADED}
PATHS ${Boost_LIBRARY_DIRS} NO_DEFAULT_PATH )
endif()
message(STATUS "Using Boost Python Library ${BOOST_PYTHON_LIBRARY}" )
list ( APPEND SERVICE_LIBS ${BOOST_PYTHON_LIBRARY} )
if(WALBERLA_BUILD_DOC)
# Sphinx documentation
# to build documentation make sure to have sphinx and read-the-docs theme installed
# Install with: "pip install sphinx sphinx_rtd_theme"
add_custom_target( docPython sphinx-build -b html "${walberla_SOURCE_DIR}/python/waLBerla_docs" "${walberla_BINARY_DIR}/doc/python"
COMMENT "Building HTML documentation for Python extension with Sphinx")
endif()
endif()
############################################################################################################################
############################################################################################################################
##
## PThread is required in Linux environments by boost::thread / std::thread
##
############################################################################################################################
# NOTE(review): this region interleaves two revisions (direct "-pthread" vs. find_package(Threads));
# the outer if() is not closed within this excerpt — verify against upstream history.
if ( NOT WIN32 AND WALBERLA_BUILD_WITH_BOOST_THREAD )
add_flag( CMAKE_CXX_FLAGS "-pthread" )
# Prefer the -pthread compiler flag over linking libpthread directly.
set( THREADS_PREFER_PTHREAD_FLAG TRUE )
find_package(Threads)
if ( Threads_FOUND )
if( CMAKE_USE_PTHREADS_INIT )
add_flag( CMAKE_CXX_FLAGS "-pthread" )
else()
add_flag( CMAKE_CXX_FLAGS "${CMAKE_THREAD_LIBS_INIT}" )
endif()
endif()
############################################################################################################################
##
## MPI
# [extraction artifact — diff hunk header] @@ -740,18 +408,16 @@ if ( WALBERLA_BUILD_WITH_MPI AND NOT WALBERLA_CXX_COMPILER_IS_MPI_WRAPPER )
if ( WIN32 )
message ( STATUS "Enter Workaround Routine for Windows and OpenMPI: PRESS CONFIGURE ONE MORE TIME!" )
string ( REGEX REPLACE "(.*)/bin/.*" "\\1" MPI_PATH ${MPI_CXX_COMPILER} )
find_path ( MPI_INCLUDE_PATH mpi.h
find_path ( MPI_C_INCLUDE_PATH mpi.h
HINTS ${MPI_PATH}
PATH_SUFFIXES include Inc)
set ( MPI_CXX_INCLUDE_PATH ${MPI_INCLUDE_PATH} CACHE FILEPATH "" FORCE )
set ( MPI_C_INCLUDE_PATH ${MPI_INCLUDE_PATH} CACHE FILEPATH "" FORCE )
set ( MPI_CXX_INCLUDE_PATH ${MPI_C_INCLUDE_PATH} CACHE FILEPATH "" FORCE )
set ( MPI_CXX_LIBRARIES "MPI_CXX_LIBRARIES-NOTFOUND" CACHE FILEPATH "Cleared" FORCE )
find_library ( MPI_CXX_LIBRARIES
NAMES mpi++ mpicxx cxx mpi_cxx libmpi++ libmpicxx libcxx libmpi_cxx
HINTS ${MPI_PATH}
PATH_SUFFIXES lib )
set ( MPI_LIBRARY "MPI_CXX_LIBRARIES" CACHE FILEPATH "" FORCE )
if ( NOT MPI_CXX_LIBRARIES STREQUAL "MPI_CXX_LIBRARIES-NOTFOUND" )
set ( MPI_CXX_FOUND ON FORCE )
# [extraction artifact — diff hunk header] @@ -762,7 +428,6 @@ if ( WALBERLA_BUILD_WITH_MPI AND NOT WALBERLA_CXX_COMPILER_IS_MPI_WRAPPER )
NAMES mpi mpich mpich2 msmpi libmpi libmpich libmpich2 libmsmpi
HINTS ${MPI_PATH}
PATH_SUFFIXES lib )
set ( MPI_EXTRA_LIBRARY "MPI_C_LIBRARIES" CACHE FILEPATH "" FORCE )
if ( NOT MPI_C_LIBRARIES STREQUAL "MPI_C_LIBRARIES-NOTFOUND" )
set ( MPI_C_FOUND ON FORCE )
# [extraction artifact — diff hunk header] @@ -779,62 +444,26 @@ if ( WALBERLA_BUILD_WITH_MPI AND NOT WALBERLA_CXX_COMPILER_IS_MPI_WRAPPER )
endif ( )
# NOTE(review): the MPI_CXX_FOUND branch below interleaves two revisions of the same logic
# (an old "For older CMake versions" fallback and the newer per-language loop) — the nesting
# is unbalanced in this excerpt; verify against upstream history before editing.
if ( MPI_FOUND )
if ( MPI_CXX_FOUND )
# Propagate MPI headers/libs into the global include path and SERVICE_LIBS.
include_directories ( SYSTEM ${MPI_CXX_INCLUDE_PATH} ${MPI_C_INCLUDE_PATH} )
foreach( LIB ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES} )
if ( LIB )
list ( APPEND SERVICE_LIBS ${LIB} )
endif ( )
endforeach ( )
add_flag ( CMAKE_CXX_FLAGS "${MPI_CXX_COMPILE_FLAGS}" )
add_flag ( CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS}" )
else ( ) # For older CMake versions
include_directories ( SYSTEM ${MPI_INCLUDE_PATH} )
list ( APPEND SERVICE_LIBS ${MPI_LIBRARY} )
if ( MPI_EXTRA_LIBRARY )
list ( APPEND SERVICE_LIBS ${MPI_EXTRA_LIBRARY} )
include_directories ( SYSTEM ${MPI_CXX_INCLUDE_PATH} ${MPI_C_INCLUDE_PATH} )
foreach( LIB ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES} )
if ( LIB )
list ( APPEND SERVICE_LIBS ${LIB} )
endif ( )
add_flag ( CMAKE_C_FLAGS "${MPI_COMPILE_FLAGS}" )
endif ( )
endforeach ( )
add_flag ( CMAKE_CXX_FLAGS "${MPI_CXX_COMPILE_FLAGS}" )
add_flag ( CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS}" )
# MPI link flags must reach all three link steps (modules, executables, shared libs).
add_flag ( CMAKE_MODULE_LINKER_FLAGS "${MPI_CXX_LINK_FLAGS}" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "${MPI_CXX_LINK_FLAGS}" )
add_flag ( CMAKE_SHARED_LINKER_FLAGS "${MPI_CXX_LINK_FLAGS}" )
# When using Intel MPI, mpi.h has to be included before including the standard library
# therefore we use the -include flag to enforce this.
if ( MPI_INCLUDE_PATH MATCHES "intel" )
message (STATUS "Activating IntelMPI include workaround for mpi.h" )
add_flag ( CMAKE_CXX_FLAGS "-include mpi.h" )
add_flag ( CMAKE_C_FLAGS "-include mpi.h" )
endif ( )
endif ( )
endif ( )
############################################################################################################################
############################################################################################################################
##
## Qt
##
############################################################################################################################
option (WALBERLA_ENABLE_GUI "This flag builds the graphical user interface, depends on Qt Libraries")
if ( WALBERLA_ENABLE_GUI )
find_package( Qt4 COMPONENTS QtCore QtGui QtOpenGL QtXml REQUIRED )
find_package( OpenGL REQUIRED )
INCLUDE( ${QT_USE_FILE} )
list ( APPEND SERVICE_LIBS ${OPENGL_LIBRARIES} ${QT_LIBRARIES} )
# Workaround for Qt4 moc and newer boost versions - moc cannot parse BOOST_JOIN
# so additional defines are passed to the moc compiler that prevent the problematic header to be parsed
set( QT_MOC_EXECUTABLE ${QT_MOC_EXECUTABLE} -DBOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION -DBOOST_TT_HAS_OPERATOR_HPP_INCLUDED )
endif(WALBERLA_ENABLE_GUI)
# OpenMPI 3.0 and higher checks the number of processes against the number of CPUs
# -> pass -oversubscribe to mpiexec so tests can run more ranks than cores.
execute_process(COMMAND ${MPIEXEC} --version RESULT_VARIABLE mpi_version_result OUTPUT_VARIABLE mpi_version_output)
if (mpi_version_result EQUAL 0 AND mpi_version_output MATCHES "\\(Open(RTE| MPI)\\) ([3-9]\\.|1[0-9])")
set ( MPIEXEC_PREFLAGS "${MPIEXEC_PREFLAGS}" "-oversubscribe" CACHE STRING "" FORCE)
endif()
############################################################################################################################
# [extraction artifact — diff hunk header] @@ -845,41 +474,24 @@ endif(WALBERLA_ENABLE_GUI)
##
############################################################################################################################
if ( WALBERLA_BUILD_WITH_PARMETIS )
# metis is required for parmetis
# NOTE(review): set(<var> TRUE FORCE) without CACHE stores the list "TRUE;FORCE" in the variable
# (still truthy, but malformed) — should probably be set(... ON CACHE BOOL "..." FORCE). Verify.
set( WALBERLA_BUILD_WITH_METIS TRUE FORCE )
endif ()
# NOTE(review): the two find_package(Metis ...) calls below come from two interleaved revisions
# of this file (QUIET-with-fallback vs. REQUIRED) — the nesting here is unbalanced; verify upstream.
if ( WALBERLA_BUILD_WITH_METIS )
find_package ( Metis QUIET )
find_package( Metis REQUIRED )
if ( METIS_FOUND )
include_directories( ${METIS_INCLUDE_DIRS} )
link_directories ( ${METIS_LIBRARY_DIR} )
list ( APPEND SERVICE_LIBS ${METIS_LIBRARIES} )
set ( WALBERLA_BUILD_WITH_METIS TRUE )
else()
set ( WALBERLA_BUILD_WITH_METIS OFF CACHE BOOL "Build with metis graph partitioner" FORCE )
endif()
else()
set ( METIS_FOUND OFF CACHE BOOL "Metis found" FORCE )
endif()
include_directories( ${METIS_INCLUDE_DIRS} )
list( APPEND SERVICE_LIBS ${METIS_LIBRARIES} )
endif()
if ( WALBERLA_BUILD_WITH_PARMETIS )
# Manual search honoring PARMETIS_ROOT (cache variable or environment variable).
find_path(PARMETIS_INCLUDE_DIR parmetis.h
/usr/local/include
/usr/include
${PARMETIS_ROOT}/include
$ENV{PARMETIS_ROOT}/include
)
find_library(PARMETIS_LIBRARY parmetis
/usr/local/lib
/usr/lib
${PARMETIS_ROOT}/lib
$ENV{PARMETIS_ROOT}/lib
)
if( PARMETIS_INCLUDE_DIR AND PARMETIS_LIBRARY AND METIS_LIBRARY )
include_directories( ${PARMETIS_INCLUDE_DIR} )
list ( APPEND SERVICE_LIBS ${PARMETIS_LIBRARY} ${METIS_LIBRARY} )
endif()
find_package( Parmetis REQUIRED )
include_directories( ${PARMETIS_INCLUDE_DIR} )
list( APPEND SERVICE_LIBS ${PARMETIS_LIBRARY} )
endif()
############################################################################################################################
# [extraction artifact — diff hunk header] @@ -892,22 +504,24 @@ endif()
##
############################################################################################################################
# NOTE(review): two interleaved revisions — the newer one wraps the detection in
# WALBERLA_BUILD_WITH_FFTW (line below starting "if ( WALBERLA_BUILD_WITH_FFTW )");
# with MPI, PFFT + FFTW3-MPI are required, otherwise plain FFTW3 suffices.
if( WALBERLA_BUILD_WITH_MPI )
find_package( PFFT )
find_package( FFTW3 )
set( FFT_REQUIRED_LIBRARIES pfft fftw3_mpi fftw3 )
if( PFFT_FOUND AND FFTW3_MPI_FOUND )
set( WALBERLA_BUILD_WITH_FFT TRUE CACHE INTERNAL "Build with FFT" )
include_directories( SYSTEM ${PFFT_INCLUDE_DIR} ${FFTW3_MPI_INCLUDE_DIR} )
list( APPEND SERVICE_LIBS ${PFFT_LIBRARIES} ${FFTW3_LIBRARIES} ${FFTW3_MPI_LIBRARIES} )
endif()
else()
find_package( FFTW3 )
set( FFT_REQUIRED_LIBRARIES fftw3 )
if ( FFTW3_FOUND )
set( WALBERLA_BUILD_WITH_FFT TRUE CACHE INTERNAL "Build with FFT" )
include_directories( SYSTEM ${FFTW3_INCLUDE_DIR} )
list( APPEND SERVICE_LIBS ${FFTW3_LIBRARIES} )
if ( WALBERLA_BUILD_WITH_FFTW )
if( WALBERLA_BUILD_WITH_MPI )
find_package( PFFT )
find_package( FFTW3 )
set( FFT_REQUIRED_LIBRARIES pfft fftw3_mpi fftw3 )
if( PFFT_FOUND AND FFTW3_MPI_FOUND )
set( WALBERLA_BUILD_WITH_FFT TRUE CACHE INTERNAL "Build with FFT" )
include_directories( SYSTEM ${PFFT_INCLUDE_DIR} ${FFTW3_MPI_INCLUDE_DIR} )
list( APPEND SERVICE_LIBS ${PFFT_LIBRARIES} ${FFTW3_LIBRARIES} ${FFTW3_MPI_LIBRARIES} )
endif()
else()
find_package( FFTW3 )
set( FFT_REQUIRED_LIBRARIES fftw3 )
if ( FFTW3_FOUND )
set( WALBERLA_BUILD_WITH_FFT TRUE CACHE INTERNAL "Build with FFT" )
include_directories( SYSTEM ${FFTW3_INCLUDE_DIR} )
list( APPEND SERVICE_LIBS ${FFTW3_LIBRARIES} )
endif()
endif()
endif()
# [extraction artifact — diff hunk header] @@ -924,7 +538,11 @@ if( (NOT DEFINED WALBERLA_BUILD_WITH_OPENMESH) OR WALBERLA_BUILD_WITH_OPENMESH )
# Tail of the OpenMesh detection (the enclosing if() is above this excerpt).
set( WALBERLA_BUILD_WITH_OPENMESH ON CACHE BOOL "Build with OpenMesh support" )
include_directories( SYSTEM ${OPENMESH_INCLUDE_DIRS} )
list( APPEND SERVICE_LIBS ${OPENMESH_LIBRARIES} )
# MSVC does not expose M_PI etc. without this define; OpenMesh headers rely on them.
if( WALBERLA_CXX_COMPILER_IS_MSVC )
add_definitions(-D_USE_MATH_DEFINES )
endif()
else()
message(" If OpenMesh required, set OPENMESH_LIBRARY_DIR to the OpenMesh directory.")
set( WALBERLA_BUILD_WITH_OPENMESH OFF CACHE BOOL "Build with OpenMesh support" FORCE )
endif()
endif()
# [extraction artifact — diff hunk header] @@ -943,18 +561,6 @@ set( CMAKE_MODULE_LINKER_FLAGS_DEBUGOPTIMIZED ${CMAKE_MODULE_LINKER_FLAGS_DEBUG}
set_property(GLOBAL PROPERTY DEBUG_CONFIGURATIONS Debug DebugOptimized)
if ( WALBERLA_CXX_COMPILER_IS_MSVC )
string(REPLACE "/Od" "/O2" CMAKE_C_FLAGS_DEBUGOPTIMIZED ${CMAKE_C_FLAGS_DEBUGOPTIMIZED})
string(REPLACE "/Ob0" "/Ob2" CMAKE_C_FLAGS_DEBUGOPTIMIZED ${CMAKE_C_FLAGS_DEBUGOPTIMIZED})
string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUGOPTIMIZED ${CMAKE_C_FLAGS_DEBUGOPTIMIZED})
string(REPLACE "/Od" "/O2" CMAKE_CXX_FLAGS_DEBUGOPTIMIZED ${CMAKE_CXX_FLAGS_DEBUGOPTIMIZED})
string(REPLACE "/Ob0" "/Ob2" CMAKE_CXX_FLAGS_DEBUGOPTIMIZED ${CMAKE_CXX_FLAGS_DEBUGOPTIMIZED})
string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUGOPTIMIZED ${CMAKE_CXX_FLAGS_DEBUGOPTIMIZED})
elseif( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_INTEL OR WALBERLA_CXX_COMPILER_IS_CLANG )
set( CMAKE_C_FLAGS_DEBUGOPTIMIZED "${CMAKE_C_FLAGS_DEBUGOPTIMIZED} -O3" )
set( CMAKE_CXX_FLAGS_DEBUGOPTIMIZED "${CMAKE_CXX_FLAGS_DEBUGOPTIMIZED} -O3" )
endif()
set(CMAKE_C_FLAGS_DEBUGOPTIMIZED ${CMAKE_C_FLAGS_DEBUGOPTIMIZED} CACHE STRING
"Flags used by the compiler during DebugOptimized builds")
set(CMAKE_CXX_FLAGS_DEBUGOPTIMIZED ${CMAKE_CXX_FLAGS_DEBUGOPTIMIZED} CACHE STRING
# [extraction artifact — diff hunk header] @@ -996,32 +602,21 @@ endif()
option ( WALBERLA_THREAD_SAFE_LOGGING "Enables/Disables thread-safe logging" ON )
# NOTE(review): this whole OpenMP region interleaves two revisions — the old per-compiler
# flag table and the newer find_package(OpenMP)-based setup; the if/else nesting is
# unbalanced in this excerpt. Verify against upstream history before editing.
if ( WALBERLA_BUILD_WITH_OPENMP )
if ( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag ( CMAKE_C_FLAGS "-openmp" )
add_flag ( CMAKE_CXX_FLAGS "-openmp" )
elseif ( CMAKE_COMPILER_IS_GNUCXX )
add_flag ( CMAKE_C_FLAGS "-fopenmp" )
add_flag ( CMAKE_CXX_FLAGS "-fopenmp" )
elseif ( WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag ( CMAKE_C_FLAGS "-fopenmp" )
add_flag ( CMAKE_CXX_FLAGS "-fopenmp" )
elseif ( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag ( CMAKE_C_FLAGS "/openmp" )
add_flag ( CMAKE_CXX_FLAGS "/openmp" )
elseif ( WALBERLA_CXX_COMPILER_IS_IBM )
add_flag ( CMAKE_C_FLAGS "-qsmp=omp" )
add_flag ( CMAKE_CXX_FLAGS "-qsmp=omp" )
# There has been an internal compiler error with the IBM compiler, so WALBERLA_THREAD_SAFE_LOGGING is disabled by default for this compiler
set ( WALBERLA_THREAD_SAFE_LOGGING OFF CACHE BOOL "Enables/Disables thread-safe logging" FORCE )
elseif ( WALBERLA_CXX_COMPILER_IS_NEC )
add_flag ( CMAKE_C_FLAGS "-Popenmp" )
add_flag ( CMAKE_CXX_FLAGS "-Popenmp" )
if ( WALBERLA_BUILD_WITH_OPENMP AND NOT OpenMP_FOUND )
if( APPLE AND EXISTS /opt/local/lib/libomp AND EXISTS /opt/local/include/libomp ) # find libomp from MacPorts
set( CMAKE_FRAMEWORK_PATH /opt/local/lib/libomp )
set( CMAKE_INCLUDE_PATH /opt/local/include/libomp )
endif()
else()
if ( WALBERLA_CXX_COMPILER_IS_CRAY )
add_flag ( CMAKE_C_FLAGS "-h noomp" )
add_flag ( CMAKE_CXX_FLAGS "-h noomp" )
find_package( OpenMP )
if (OpenMP_FOUND)
add_flag ( CMAKE_C_FLAGS "${OpenMP_C_FLAGS}" )
add_flag ( CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}" )
list ( APPEND SERVICE_LIBS ${OpenMP_CXX_LIBRARIES} )
if( OpenMP_CXX_INCLUDE_DIRS )
include_directories( ${OpenMP_CXX_INCLUDE_DIRS} )
endif()
else()
message(FATAL_ERROR "Could NOT enable OpenMP")
endif()
endif()
############################################################################################################################
# [extraction artifact — diff hunk header] @@ -1034,74 +629,49 @@ endif()
##
############################################################################################################################
# NOTE(review): this CUDA region interleaves the old FindCUDA-based setup and the newer
# check_language(CUDA)/enable_language(CUDA) setup — verify against upstream history.
if ( WALBERLA_BUILD_WITH_CUDA )
# Save the directory-level compile definitions so they can be restored after CUDA setup.
get_directory_property(COMPILE_DEFINITIONS_SAVED_STATE DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMPILE_DEFINITIONS)
# cleanup compile definitions for CUDA (remove generator expression and empty elements which lead to warnings)
set(CLEANED_COMPILE_DEFINITIONS )
foreach( element ${COMPILE_DEFINITIONS_SAVED_STATE})
if(NOT ${element} MATCHES "^\\$")
list(APPEND CLEANED_COMPILE_DEFINITIONS ${element})
endif()
endforeach()
set_directory_properties(PROPERTIES COMPILE_DEFINITIONS CLEANED_COMPILE_DEFINITIONS)
# set ( BUILD_SHARED_LIBS ON )
set ( CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON )
if( NOT WALBERLA_CXX_COMPILER_IS_MSVC )
set ( CUDA_PROPAGATE_HOST_FLAGS OFF CACHE BOOL "" )
# CUDA and HIP are mutually exclusive GPU backends.
if (WALBERLA_BUILD_WITH_HIP)
message(FATAL_ERROR "For GPU support either use CUDA or HIP. Both simultaneously is not supported.")
endif()
# When the host compiler is wrapped in ccache, hand nvcc the real compiler binary.
if ( (NOT DEFINED CUDA_HOST_COMPILER) AND (${CMAKE_C_COMPILER} MATCHES "ccache") )
string ( STRIP "${CMAKE_C_COMPILER_ARG1}" stripped_compiler_string )
find_program ( CUDA_HOST_COMPILER ${stripped_compiler_string} )
endif ()
find_package ( CUDA REQUIRED )
include(CheckLanguage)
check_language(CUDA)
if( CMAKE_CUDA_COMPILER )
if ( CUDA_FOUND )
include_directories ( ${CUDA_INCLUDE_DIRS} )
list ( APPEND SERVICE_LIBS ${CUDA_LIBRARIES} )
list( APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
if(${CMAKE_VERSION} VERSION_GREATER "3.18.0" AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES OFF)
message(WARNING "CMAKE_CUDA_ARCHITECTURES was not set. It is automatically set to: ${CMAKE_CUDA_ARCHITECTURES}")
endif()
# FindCUDA does not respect system includes i.e. there are also warnings for boost etc reported (as of cmake 3.5.1)
# if all includes are added to the flags manually as system includes they occur double on the command line
# but the compiler seems to note the "isystem" not the "-I"
# it is also not possible to get all system include directories - so as a workaround we at least add boost here
# as system include
# Add each Boost include directory as a system include for nvcc.
# FIX: the original placed "AND NOT WALBERLA_CXX_COMPILER_IS_MSVC" inside the foreach()
# item list; foreach() has no boolean operators, so CMake iterated over the literal
# words "AND", "NOT", "WALBERLA_CXX_COMPILER_IS_MSVC" as well and appended bogus
# "-isystem AND" style flags. The compiler guard belongs in an if() around the loop.
if( NOT WALBERLA_CXX_COMPILER_IS_MSVC )
   foreach( boostInclude ${Boost_INCLUDE_DIRS} )
      list( APPEND CUDA_NVCC_FLAGS "-isystem ${boostInclude}" )
   endforeach()
endif()
enable_language(CUDA)
# Default nvcc to C++11 unless the user already pinned a standard.
if ( NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=" AND NOT WALBERLA_CXX_COMPILER_IS_MSVC )
list ( APPEND CUDA_NVCC_FLAGS "-std=c++11" )
endif ()
# include directories and cudart lib is needed for cpp files that use cuda headers/libs
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
list ( APPEND SERVICE_LIBS ${CUDART_LIBRARY} )
# Bug with gcc5 and cuda7.5:
#list( APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES -D__STRICT_ANSI__")
# NVTX is used for profiler annotations.
find_library( NVTX_LIBRARY nvToolsExt ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} )
list ( APPEND SERVICE_LIBS ${NVTX_LIBRARY} )
# NOTICE: existing cuda flags are overwritten
#set ( CUDA_NVCC_FLAGS "--compiler-bindir=/usr/bin/g++-4.3" )
#set ( CUDA_NVCC_FLAGS "-arch sm_20" )
# CUDA_FOUND is needed for our cmake mechanism
set ( CUDA_FOUND TRUE )
set (WALBERLA_BUILD_WITH_GPU_SUPPORT TRUE)
else()
message( WARNING "CUDA could not be enabled. The host compiler might not be compatible. Check CMakeFiles/CMakeError.log for more information" )
set ( WALBERLA_BUILD_WITH_CUDA FALSE )
endif ( )
# Restore the compile definitions saved before the CUDA-specific cleanup above.
set_directory_properties(PROPERTIES COMPILE_DEFINITIONS "${COMPILE_DEFINITIONS_SAVED_STATE}" )
endif ( )
############################################################################################################################
# Fall back to C++14 for CUDA translation units on old toolkits / old CMake.
if (WALBERLA_BUILD_WITH_CUDA AND (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0" OR CMAKE_VERSION VERSION_LESS 3.18.0))
# CUDA < 11 does not support C++17. std::experimental::any works with C++14, unlike std::any.
set(CMAKE_CUDA_STANDARD 14)
set(WALBERLA_USE_STD_EXPERIMENTAL_ANY 1)
endif()
############################################################################################################################
##
## Testing Coverage
##
############################################################################################################################
# NOTE(review): interleaved revisions — the gcov block's endif() is not visible here.
if (WALBERLA_BUILD_WITH_GCOV AND CMAKE_COMPILER_IS_GNUCXX )
add_flag ( CMAKE_CXX_FLAGS_DEBUG "-fprofile-arcs -ftest-coverage" )
add_flag ( LD_FLAGS "-fprofile-arcs -ftest-coverage" )
# Can be used in CMake files containing generated files where the file suffix is dependent on the target
if (WALBERLA_BUILD_WITH_CUDA AND CUDA_FOUND)
set(CODEGEN_FILE_SUFFIX "cu")
else()
set(CODEGEN_FILE_SUFFIX "cpp")
endif()
############################################################################################################################
# [extraction artifact — diff hunk header] @@ -1109,21 +679,36 @@ endif()
############################################################################################################################
##
## Profiling with gprof
## ROCm HIP
##
############################################################################################################################
# NOTE(review): interleaved revisions — the old gprof section and the new ROCm HIP section
# are merged here; the gprof if/elseif never closes in this excerpt. Verify upstream.
if ( WALBERLA_BUILD_WITH_GPROF )
if ( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag ( CMAKE_CXX_FLAGS "-pg" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "-pg" )
elseif ( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag ( CMAKE_CXX_FLAGS "-pg" )
if ( WALBERLA_BUILD_WITH_HIP )
# HIP and CUDA are mutually exclusive GPU backends.
if (WALBERLA_BUILD_WITH_CUDA)
message(FATAL_ERROR "For GPU support either use CUDA or HIP. Both simultaneously is not supported.")
endif()
endif()
# First-class HIP language support requires CMake 3.21+.
if (${CMAKE_VERSION} VERSION_LESS "3.21.0")
message(FATAL_ERROR "For HIP support CMake > 3.21.0 is needed. Please install a newer version")
endif()
include(CheckLanguage)
check_language(HIP)
if( CMAKE_HIP_COMPILER )
enable_language(HIP)
# since waLBerla also supports CUDA we only use HIP on an AMD platform
add_compile_definitions(__HIP_PLATFORM_AMD__)
# include_directories(${HSA_HEADER})
set (WALBERLA_BUILD_WITH_GPU_SUPPORT TRUE)
else()
message("HIP compiler not found. HIP support is not possible")
set ( WALBERLA_BUILD_WITH_HIP FALSE )
endif ( )
endif ( )
############################################################################################################################
############################################################################################################################
##
## Likwid Marker API
# [extraction artifact — diff hunk header] @@ -1131,7 +716,7 @@ ############################################################################################################################
############################################################################################################################
# NOTE(review): interleaved revisions — both the old and new guard line are present.
# Search for the likwid marker-API library/headers via LIKWID_* environment hints.
if ( WALBERLA_BUILD_WITH_LIKWID_MARKERS )
if ( WALBERLA_BUILD_WITH_LIKWID_MARKERS AND NOT LIKWID_FOUND )
find_library( LIKWID_LIB likwid HINTS $ENV{LIKWID_LIBDIR} $ENV{LIKWID_ROOT}/lib )
find_path( LIKWID_INCLUDE_DIR likwid.h HINTS $ENV{LIKWID_INCDIR} $ENV{LIKWID_ROOT}/include )
# [extraction artifact — diff hunk header] @@ -1154,105 +739,85 @@
##
############################################################################################################################
# NOTE(review): interleaved revisions — old per-compiler LTO flags plus the newer
# CheckIPOSupported-based approach appear together here. Verify upstream.
if ( WALBERLA_BUILD_WITH_LTO )
if( WALBERLA_CXX_COMPILER_IS_INTEL )
add_flag( CMAKE_CXX_FLAGS_RELEASE "-ip -ipo3" )
add_flag( CMAKE_C_FLAGS_RELEASE "-ip -ipo3" )
endif()
if ( CMAKE_COMPILER_IS_GNUCXX )
add_flag ( CMAKE_C_FLAGS_RELEASE "-flto=3" )
add_flag ( CMAKE_CXX_FLAGS_RELEASE "-flto=3" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "-fuse-linker-plugin" )
endif ( )
if( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag ( CMAKE_CXX_FLAGS_RELEASE "/GL" )
add_flag ( CMAKE_EXE_LINKER_FLAGS_RELEASE "/LTCG" )
add_flag ( CMAKE_SHARED_LINKER_FLAGS_RELEASE "/LTCG" )
add_flag ( CMAKE_MODULE_LINKER_FLAGS_RELEASE "/LTCG" )
endif ( )
if( WALBERLA_CXX_COMPILER_IS_IBM )
add_flag ( CMAKE_C_FLAGS_RELEASE "-qipa" )
add_flag ( CMAKE_CXX_FLAGS_RELEASE "-qipa" )
add_flag ( CMAKE_EXE_LINKER_FLAGS "-qipa" )
endif( )
# CMP0069 NEW: honor INTERPROCEDURAL_OPTIMIZATION for all generators.
cmake_policy( SET CMP0069 NEW )
include( CheckIPOSupported )
check_ipo_supported( RESULT LTO_SUPPORTED LANGUAGES CXX )
if( LTO_SUPPORTED )
set( CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE )
else()
message( WARNING "Link-time optimization is not supported with this compiler" )
endif()
endif ( )
############################################################################################################################
############################################################################################################################
##
## Sanitizer
## Some more compiler flags that need to happen after any try_compile (e.g. inside FindMPI) (not sure if still true)
##
############################################################################################################################
# AddressSanitizer / UndefinedBehaviorSanitizer — GCC and Clang only.
if ( WALBERLA_SANITIZE_ADDRESS )
if ( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag( CMAKE_CXX_FLAGS "-fsanitize=address")
endif()
endif()
if ( WALBERLA_SANITIZE_UNDEFINED )
if ( WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG )
add_flag( CMAKE_CXX_FLAGS "-fsanitize=undefined")
endif()
endif()
# Treat warnings as errors
if ( WARNING_ERROR )
set (CMAKE_COMPILE_WARNING_AS_ERROR ON)
endif ( )
############################################################################################################################
# Documentation Generation
#
# Build documentation using Doxygen (www.doxygen.org)
##
## Half precision
##
############################################################################################################################
# NOTE(review): interleaved revisions — these Doxygen/HTMLHelp lookups also appear inside
# the WALBERLA_BUILD_DOC block further below. Verify upstream.
find_package ( Doxygen )
find_package ( HTMLHelp )
if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
### Compiler requirements:
### Within this project, there are several checks to ensure that the template parameter 'ValueType'
### is a floating point number. The check is_floating_point<ValueType> is done primarily in our MPI implementation.
### The IEEE 754 floating type format _Float16, evaluates to true only if your compiler supports the
### open C++23 standard P1467R9 (Extended floating-point types and standard names).
### Compare:
### https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1467r9.html
###
### Right now (18.12.2023) this is the case only for gcc13.
### For more information see:
### https://gcc.gnu.org/projects/cxx-status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names
### https://clang.llvm.org/cxx_status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names
# Flag whether the HTML Help compiler was found.
# FIX: the original used `if ( HTML_HELP_COMPILER EQUAL "" )`. EQUAL performs a *numeric*
# comparison and is false whenever either operand is not a number, so the test could
# never take the "NO" branch. `if ( NOT HTML_HELP_COMPILER )` is false for an empty
# value and for the "-NOTFOUND" result of a failed find, which is the intended check.
if ( NOT HTML_HELP_COMPILER )
set ( HTML_HELP_FOUND "NO" )
else ( )
set ( HTML_HELP_FOUND "YES" )
endif ( )
# NOTE(review): interleaved revisions — the Doxygen configuration and the half-precision
# try_compile are merged here; the nesting is unbalanced in this excerpt. Verify upstream.
if ( DOXYGEN_FOUND )
set ( DOXYGEN_HTML_HEADER ${walberla_SOURCE_DIR}/doc/header.html )
set ( DOXYGEN_HTML_FOOTER ${walberla_SOURCE_DIR}/doc/footer.html )
set ( DOXYGEN_HTML_OUTPUT "html" )
# Probe whether the compiler accepts _Float16 under C++23 (see comment block above).
try_compile( WALBERLA_SUPPORT_HALF_PRECISION "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/TestFloat16.cpp"
CXX_STANDARD 23 OUTPUT_VARIABLE TRY_COMPILE_OUTPUT )
## message( STATUS ${TRY_COMPILE_OUTPUT} )
if ( NOT WALBERLA_SUPPORT_HALF_PRECISION )
message( FATAL_ERROR "Compiler: ${CMAKE_CXX_COMPILER} Version: ${CMAKE_CXX_COMPILER_VERSION} does not support half precision" )
endif ()
configure_file ( ${walberla_SOURCE_DIR}/doc/doxygen.in ${walberla_BINARY_DIR}/doc/doxygen.cfg @ONLY )
endif () # Check if WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is set
add_custom_target ( doc ${DOXYGEN_EXECUTABLE} ${walberla_BINARY_DIR}/doc/doxygen.cfg
COMMENT "Generating API documentation with Doxygen" VERBATIM )
endif ( )
############################################################################################################################
# Documentation Generation
#
# Documentation generation is optional and gated behind WALBERLA_BUILD_DOC.
if (WALBERLA_BUILD_DOC)
# Build documentation using Doxygen (www.doxygen.org)
############################################################################################################################
find_package ( Doxygen )
find_package ( HTMLHelp )
# Flag whether the HTML Help compiler was found.
# FIX: `EQUAL` is a numeric comparison and is false for non-numeric operands, so the
# original `if ( HTML_HELP_COMPILER EQUAL "" )` always reported "YES". Testing the
# variable directly treats both an empty value and a "-NOTFOUND" result as "NO".
if ( NOT HTML_HELP_COMPILER )
set ( HTML_HELP_FOUND "NO" )
else ( )
set ( HTML_HELP_FOUND "YES" )
endif ( )
if ( DOXYGEN_FOUND )
# Inject project header/footer and expand build paths into the Doxygen config.
set ( DOXYGEN_HTML_HEADER ${walberla_SOURCE_DIR}/doc/header.html )
set ( DOXYGEN_HTML_FOOTER ${walberla_SOURCE_DIR}/doc/footer.html )
set ( DOXYGEN_HTML_OUTPUT "html" )
configure_file ( ${walberla_SOURCE_DIR}/doc/doxygen.in ${walberla_BINARY_DIR}/doc/doxygen.cfg @ONLY )
# "doc" target runs Doxygen on the generated config.
add_custom_target ( doc ${DOXYGEN_EXECUTABLE} ${walberla_BINARY_DIR}/doc/doxygen.cfg
COMMENT "Generating API documentation with Doxygen" VERBATIM )
endif ( )
endif()
############################################################################################################################
#
# Fix compiler bugs
#
############################################################################################################################
# The NEC SX has a few issues in its standard library headers
# The NEC SX has a few issues in its standard library headers
# -> generate wrapper headers that undef/rename the offending symbols, then copy them
#    into ${walberla_BINARY_DIR}/src, which is on the include path before the system headers.
if( WALBERLA_CXX_COMPILER_IS_NEC )
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/math.h "#include_next <math.h>\n#undef fpclassify\n#undef signbit\n#undef isfinite\n#undef isinf\n#undef isnan\n#undef isnormal\n#undef isgreater\n#undef isgreaterequal\n#undef isless\n#undef islessequal\n#undef islessgreater\n#undef isunordered\n")
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/sys/types.h "#define uint_t SX_UINT_T\n#include \"/SX/usr/include/sys/types.h\" \n#undef uint_t\n")
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/sys/acl.h "#define uint_t SX_UINT_T\n#include \"/SX/usr/include/sys/acl.h\" \n#undef uint_t\n")
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/sys/if_ehcpl.h "#define uint_t SX_UINT_T\n#include \"/SX/usr/include/sys/if_ehcpl.h\"\n#undef uint_t\n")
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/sys/ptms.h "#define uint_t SX_UINT_T\n#include \"/SX/usr/include/sys/ptms.h\" \n#undef uint_t\n")
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/sys/stream.h "#define uint_t SX_UINT_T\n#include \"/SX/usr/include/sys/stream.h\" \n#undef uint_t\n")
file( WRITE ${walberla_BINARY_DIR}/CMakeFiles/src/sys/strsubr.h "#define uint_t SX_UINT_T\n#include \"/SX/usr/include/sys/strsubr.h\" \n#undef uint_t\n")
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/math.h ${walberla_BINARY_DIR}/src/math.h COPYONLY )
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/sys/types.h ${walberla_BINARY_DIR}/src/sys/types.h COPYONLY )
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/sys/acl.h ${walberla_BINARY_DIR}/src/sys/acl.h COPYONLY )
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/sys/if_ehcpl.h ${walberla_BINARY_DIR}/src/sys/if_ehcpl.h COPYONLY )
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/sys/ptms.h ${walberla_BINARY_DIR}/src/sys/ptms.h COPYONLY )
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/sys/stream.h ${walberla_BINARY_DIR}/src/sys/stream.h COPYONLY )
configure_file ( ${walberla_BINARY_DIR}/CMakeFiles/src/sys/strsubr.h ${walberla_BINARY_DIR}/src/sys/strsubr.h COPYONLY )
endif()
......@@ -1267,6 +832,11 @@ include_directories ( ${CMAKE_CURRENT_BINARY_DIR}/src )
# All include paths are specified relative to src/ directory
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/src )
# external
add_subdirectory( extern )
# sources
add_subdirectory ( src )
# Generate file with compile options, and add install rule for it
configure_file ( src/waLBerlaDefinitions.in.h
......@@ -1274,14 +844,10 @@ configure_file ( src/waLBerlaDefinitions.in.h
install( FILES ${walberla_BINARY_DIR}/src/waLBerlaDefinitions.h DESTINATION walberla/ )
# sources
add_subdirectory ( src )
# test
# Tests are always *configured*; EXCLUDE_FROM_ALL merely keeps them out of
# the default build target when testing is disabled, so individual test
# targets can still be built on demand.
if ( WALBERLA_BUILD_TESTS )
add_subdirectory ( tests )
else()
add_subdirectory( tests EXCLUDE_FROM_ALL )
endif()
......@@ -1290,3 +856,12 @@ add_subdirectory ( apps )
waLBerla_export()
############################################################################################################################
############################################################################################################################
##
## clang-tidy
##
############################################################################################################################
waLBerla_link_files_to_builddir( .clang-tidy )
add_subdirectory( utilities )
\ No newline at end of file
{
"version": 6,
"cmakeMinimumRequired": {
"major": 3,
"minor": 23,
"patch": 0
},
"configurePresets": [
{
"name": "clang-tidy",
"generator": "Unix Makefiles",
"binaryDir": "${sourceDir}/build/clang-tidy",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": true,
"WALBERLA_BUFFER_DEBUG": true,
"WALBERLA_BUILD_TESTS": true,
"WALBERLA_BUILD_BENCHMARKS": true,
"WALBERLA_BUILD_TUTORIALS": true,
"WALBERLA_BUILD_TOOLS": true,
"WALBERLA_BUILD_WITH_MPI": true,
"WALBERLA_BUILD_WITH_OPENMP": true,
"CMAKE_BUILD_TYPE": "Debug",
"WALBERLA_BUILD_WITH_METIS": true,
"WALBERLA_BUILD_WITH_PARMETIS": true,
"WALBERLA_BUILD_WITH_OPENMESH": true,
"WALBERLA_DOUBLE_ACCURACY": true,
"WALBERLA_LOGLEVEL": "DETAIL"
}
}
]
}
\ No newline at end of file
# waLBerla
waLBerla (widely applicable Lattice Boltzmann from Erlangen) is a massively
parallel framework for multi physics applications. Besides its original
objective, Lattice Boltzmann solvers for hydrodynamics, it now contains
modules for other applications like Multigrid and rigid body dynamics
as well. Great emphasis is placed on the interoperability between the modules
in particular the fluid-particle coupling.
It scales from laptops to current and future supercomputers while maintaining
waLBerla (widely applicable Lattice Boltzmann from Erlangen) is a massively
parallel framework for multi physics applications. Besides its original
objective, Lattice Boltzmann solvers for hydrodynamics, it now contains
modules for other applications like Multigrid and rigid body dynamics
as well. Great emphasis is placed on the interoperability between the modules
in particular the fluid-particle coupling.
It scales from laptops to current and future supercomputers while maintaining
near-perfect efficiency.
See http://walberla.net/ for more information and a showcase of applications.
See https://www.walberla.net/ for more information and a showcase of applications.
## Documentation and Tutorials
......@@ -19,13 +19,45 @@ is documented in [Sphinx](http://walberla.net/sphinx/index.html).
## Getting started
The minimum requirements are a C++11-compliant compiler (e.g. GCC or Clang),
the [Boost](http://www.boost.org) library and the [CMake](http://www.cmake.org)
The minimum requirements are a C++17-compliant compiler (e.g. GCC or Clang)
and the [CMake](http://www.cmake.org)
build system. Furthermore, you need an MPI library (like
[Open MPI](http://www.open-mpi.org)) if you want to make use of parallel
processing capabilities. All of these dependencies are typically available in
your operating system's package manager.
### CMake
The typical steps, assuming you are in the waLBerla source directory, are:
- `mkdir build; cd build` create a build directory and change into it
- `cmake ..` call CMake with the waLBerla source directory as an argument
- `make` build waLBerla
To specify a CMake option you need to use `-D(Option)=(Value)`. For example to set the C++ compiler one can use:
`cmake -DCMAKE_CXX_COMPILER=clang++`
To list and modify the CMake options the `ccmake` tool can be used. Just call `ccmake .` in your **build** directory to see and change the
CMake options and variables.
Some important CMake variables:
- `WALBERLA_BUILD_WITH_CODEGEN` Enable pystencils code generation
- `Python_ROOT_DIR` Specify the directory of the `python` executable. e.g. `~/miniconda/bin/`
- `MPI_HOME` Specify the base directory of the MPI installation.
- `WALBERLA_BUILD_WITH_PYTHON` Support for embedding Python
- `WALBERLA_BUILD_WITH_CUDA` Enable CUDA support
For a full list of CMake Option see the [CMakeLists.txt](CMakeLists.txt) file or use `ccmake` as described above.
### Codegen and Python
To use the `lbmpy`/`pystencils` code generation please install the packages with e.g. `pip3 install lbmpy` and specify the correct python
environment when calling CMake.
In previous versions of CMake one could use `PYTHON_EXECUTABLE` or `PYTHON_ROOT_DIR` (all upper case) to specify the python executable or
the directory. This does **NOT** work anymore. Please use `Python_ROOT_DIR`.
## Get involved
### Contributing
......@@ -45,12 +77,42 @@ Many thanks go to waLBerla's [contributors](AUTHORS.txt)
### Please cite us
If you use waLBerla in a publication, please cite the following article:
- C. Godenschwager, F. Schornbaum, M. Bauer, H. Köstler, and U. Rüde. A
framework for hybrid parallel flow simulations with a trillion cells in complex
geometries. In: Proceedings of the International Conference on High Performance
Computing, Networking, Storage and Analysis, page 35. ACM, 2013.
If you use waLBerla in a publication, please cite the following articles:
Overview:
- M. Bauer et al., *waLBerla: A block-structured high-performance framework for
multiphysics simulations*. Computers & Mathematics with Applications, 2020.
https://doi.org/10.1016/j.camwa.2020.01.007.
Grid Refinement:
- F. Schornbaum and U. Rüde, *Massively parallel algorithms for the lattice boltzmann
method on nonuniform grids*. SIAM Journal on Scientific Computing, 2016.
https://doi.org/10.1137/15M1035240
LBM - Particle Coupling:
- C. Rettinger and U. Rüde, *A comparative study of fluid-particle coupling methods for
fully resolved lattice Boltzmann simulations*. Computers & Fluids, 2017.
https://doi.org/10.1016/j.compfluid.2017.05.033
Free-surface LBM:
- C. Schwarzmeier et al., *Comparison of free-surface and conservative Allen-Cahn phase-field
lattice Boltzmann method*. Journal of Computational Physics, 2023.
https://doi.org/10.1016/j.jcp.2022.111753
Allen-Cahn phase-field LBM:
- M. Holzer et al., *Highly efficient lattice Boltzmann multiphase simulations of immiscible
fluids at high-density ratios on CPUs and GPUs through code generation*. The International Journal of High Performance Computing Applications, 2021.
https://doi.org/10.1177/10943420211016525
MESA-PD:
- S. Eibl and U. Rüde, *A Modular and Extensible Software Architecture for Particle Dynamics*.
Proceedings Of The 8Th International Conference On Discrete Element Methods.
https://mercurylab.co.uk/dem8/full-papers/#page-content
Carbon Nanotubes:
- G. Drozdov et al., *Densification of single-walled carbon nanotube films:
Mesoscopic distinct element method simulations and experimental validation*.
Journal of Applied Physics, 2020. https://doi.org/10.1063/5.0025505
## License
......
......@@ -2,31 +2,26 @@
# Benchmarks
# EXCLUDE_FROM_ALL keeps the targets out of the default build while still
# allowing them to be built explicitly.
if ( WALBERLA_BUILD_BENCHMARKS )
add_subdirectory ( benchmarks )
else ()
add_subdirectory ( benchmarks EXCLUDE_FROM_ALL )
endif()

# Tools
if ( WALBERLA_BUILD_TOOLS )
add_subdirectory ( tools )
else ()
add_subdirectory ( tools EXCLUDE_FROM_ALL )
endif()

# Tutorials
if ( WALBERLA_BUILD_TUTORIALS )
add_subdirectory ( tutorials )
else ()
add_subdirectory ( tutorials EXCLUDE_FROM_ALL )
endif()

# Showcases
if ( WALBERLA_BUILD_SHOWCASES )
add_subdirectory ( showcases )
endif()

# Python module
if ( WALBERLA_BUILD_WITH_PYTHON )
add_subdirectory( pythonmodule )
# no else with "EXCLUDE_FROM_ALL" here, since if WALBERLA_BUILD_WITH_PYTHON is not activated
# waLBerla was built without -fPIC, so no linking into a shared library is possible
endif()
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AMRSedimentSettling.cpp
//! \ingroup pe_coupling
//! \author Christoph Rettinger <christoph.rettinger@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/loadbalancing/InfoCollection.h"
#include "blockforest/loadbalancing/DynamicCurve.h"
#include "blockforest/loadbalancing/DynamicDiffusive.h"
#include "blockforest/loadbalancing/DynamicParMetis.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "blockforest/loadbalancing/StaticParMetis.h"
#include "blockforest/loadbalancing/weight_assignment/MetisAssignmentFunctor.h"
#include "blockforest/loadbalancing/weight_assignment/WeightAssignmentFunctor.h"
#include "blockforest/AABBRefinementSelection.h"
#include "boundary/all.h"
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/SharedFunctor.h"
#include "core/debug/Debug.h"
#include "core/debug/TestSubsystem.h"
#include "core/math/all.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/mpi/Broadcast.h"
#include "domain_decomposition/BlockSweepWrapper.h"
#include "field/AddToStorage.h"
#include "field/StabilityChecker.h"
#include "field/communication/PackInfo.h"
#include "lbm/boundary/all.h"
#include "lbm/field/AddToStorage.h"
#include "lbm/field/PdfField.h"
#include "lbm/field/VelocityFieldWriter.h"
#include "lbm/lattice_model/D3Q19.h"
#include "lbm/refinement/all.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "pe/basic.h"
#include "pe/Types.h"
#include "pe/fcd/GJKEPACollideFunctor.h"
#include "pe/vtk/SphereVtkOutput.h"
#include "pe/vtk/EllipsoidVtkOutput.h"
#include "pe/cr/ICR.h"
#include "pe/synchronization/ClearSynchronization.h"
#include "pe/amr/InfoCollection.h"
#include "pe_coupling/amr/all.h"
#include "pe_coupling/mapping/all.h"
#include "pe_coupling/momentum_exchange_method/all.h"
#include "pe_coupling/utility/all.h"
#include "timeloop/SweepTimeloop.h"
#include "vtk/all.h"
#include "field/vtk/all.h"
#include "lbm/vtk/all.h"
namespace amr_sediment_settling
{

///////////
// USING //
///////////

using namespace walberla;
using walberla::uint_t;

//////////////
// TYPEDEFS //
//////////////

// PDF field, flag field & body field
// D3Q19 lattice with a TRT collision operator; compressible = false
using LatticeModel_T = lbm::D3Q19<lbm::collision_model::TRT, false>;
using Stencil_T = LatticeModel_T::Stencil;
using PdfField_T = lbm::PdfField<LatticeModel_T>;

using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
// per-cell pointer to the rigid body overlapping that cell (pe coupling)
using BodyField_T = GhostLayerField<pe::BodyID, 1>;
using VelocityField_T = GhostLayerField<Vector3<real_t>, 1>;

// number of ghost layers on all fields
// NOTE(review): 4 layers appear to be required by the LBM refinement scheme — confirm
const uint_t FieldGhostLayers = 4;

// boundary handling
using NoSlip_T = lbm::NoSlip<LatticeModel_T, flag_t>;
// curved (linearly interpolated) moving-obstacle boundary condition
using MO_T = pe_coupling::CurvedLinear<LatticeModel_T, FlagField_T>;

using BoundaryHandling_T = BoundaryHandling<FlagField_T, Stencil_T, NoSlip_T, MO_T>;

// rigid body types the pe has to be able to handle in this app
using BodyTypeTuple = std::tuple<pe::Sphere, pe::Ellipsoid, pe::Plane>;

///////////
// FLAGS //
///////////

const FlagUID Fluid_Flag( "fluid" );
const FlagUID NoSlip_Flag( "no slip" );
const FlagUID MO_Flag( "moving obstacle" );
const FlagUID FormerMO_Flag( "former moving obstacle" );
//////////////////////////////////////
// DYNAMIC REFINEMENT FUNCTIONALITY //
//////////////////////////////////////
/*
 * Refinement check based on gradient magnitude
 * If gradient magnitude is below lowerLimit in all cells of a block, that block could be coarsened.
 * If the gradient value is above the upperLimit for at least one cell, that block gets marked for refinement.
 * Else, the block remains on the current level.
 */
template< typename LatticeModel_T, typename Filter_T >
class VectorGradientRefinement
{
public:
   using VectorField_T = GhostLayerField<Vector3<real_t>, 1>;
   using Stencil_T = typename LatticeModel_T::Stencil;

   // fieldID:    block data ID of the vector (velocity) field to evaluate
   // filter:     cell filter marking cells to exclude from gradient evaluation (e.g. boundaries)
   // upperLimit: gradient magnitude above which a block is marked for refinement
   // lowerLimit: gradient magnitude below which (in ALL cells) a block may be coarsened
   // maxLevel:   refinement never exceeds this level
   VectorGradientRefinement( const ConstBlockDataID & fieldID, const Filter_T & filter,
                             const real_t upperLimit, const real_t lowerLimit, const uint_t maxLevel ) :
      fieldID_( fieldID ), filter_( filter ),
      upperLimit_( upperLimit ), lowerLimit_( lowerLimit ), maxLevel_( maxLevel )
   {}

   // callback signature expected by BlockForest::setRefreshMinTargetLevelDeterminationFunction
   void operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
                    std::vector< const Block * > & blocksAlreadyMarkedForRefinement,
                    const BlockForest & forest );

private:
   ConstBlockDataID fieldID_;

   Filter_T filter_;

   real_t upperLimit_;
   real_t lowerLimit_;

   uint_t maxLevel_;

}; // class VectorGradientRefinement
// Determines, per block, the minimum target refinement level based on the
// maximum norm of the velocity gradient tensor, evaluated in every cell.
// The second and third parameters of the callback interface are unused here.
template< typename LatticeModel_T, typename Filter_T >
void VectorGradientRefinement< LatticeModel_T, Filter_T >::operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
                                                                       std::vector< const Block * > &, const BlockForest & )
{
   for(auto & minTargetLevel : minTargetLevels)
   {
      const Block * const block = minTargetLevel.first;
      const uint_t currentLevelOfBlock = block->getLevel();

      const VectorField_T * uField = block->template getData< VectorField_T >( fieldID_ );
      if( uField == nullptr )
      {
         // block carries no velocity field -> request the coarsest level
         minTargetLevel.second = uint_t(0);
         continue;
      }

      Matrix3<real_t> uGradient( real_t(0) );

      // refine if ANY cell exceeds the upper limit;
      // coarsen only if ALL cells stay below the lower limit
      bool refine( false );
      bool coarsen( true );
      filter_( *block );
      WALBERLA_FOR_ALL_CELLS_XYZ( uField,
         std::vector< Vector3<real_t> > uValues( Stencil_T::Size, Vector3<real_t>(real_t(0)) );

         Vector3<real_t> uInCenterCell = uField->get( x,y,z );
         for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
         {
            // check if boundary treatment is necessary
            if( filter_( x+dir.cx(),y+dir.cy(),z+dir.cz() ) )
            {
               // copy from center cell
               uValues[ *dir ] = uInCenterCell;
            } else {
               uValues[ *dir ] = uField->get( x+dir.cx(),y+dir.cy(),z+dir.cz() );
            }
         }

         // obtain the matrix grad(u) with the help of the gradient formula from
         // See: Ramadugu et al - Lattice differential operators for computational physics (2013)
         // with T = c_s**2
         const auto inv_c_s_sqr = real_t(3);
         uGradient = real_t(0);
         for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
         {
            real_t cx = real_c(dir.cx());
            real_t cy = real_c(dir.cy());
            real_t cz = real_c(dir.cz());

            // column-wise accumulation: uGradient is stored row-major,
            // entries 0,3,6 form the first column (grad of ux), etc.
            // grad(ux)
            real_t ux = uValues[ *dir ][0];
            uGradient[ 0 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * ux;
            uGradient[ 3 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * ux;
            uGradient[ 6 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * ux;

            // grad(uy)
            real_t uy = uValues[ *dir ][1];
            uGradient[ 1 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * uy;
            uGradient[ 4 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * uy;
            uGradient[ 7 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * uy;

            // grad(uz)
            real_t uz = uValues[ *dir ][2];
            uGradient[ 2 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * uz;
            uGradient[ 5 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * uz;
            uGradient[ 8 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * uz;
         }
         uGradient *= inv_c_s_sqr;

         auto norm = real_t(0);
         //compute maximums norm of 3x3 matrix
         for (auto i = uint_t(0); i < uint_t(3*3); ++i)
            norm = std::max(norm, std::fabs(uGradient[i]));

         if( norm > lowerLimit_ )
         {
            coarsen = false;
            if( norm > upperLimit_ )
               refine = true;
         }
      )

      if( refine && currentLevelOfBlock < maxLevel_ )
      {
         WALBERLA_ASSERT( !coarsen );
         minTargetLevel.second = currentLevelOfBlock + uint_t(1);
      }
      if( coarsen && currentLevelOfBlock > uint_t(0) )
      {
         WALBERLA_ASSERT( !refine );
         minTargetLevel.second = currentLevelOfBlock - uint_t(1);
      }
   }
}
// Load estimators for spheres and ellipsoids, obtained at SuperMUC Phase 2
// See Sec. 3 in the paper for more infos
/////////////
// Spheres //
/////////////
// Estimated runtime share of the LBM sweep on a block (sphere setup).
// Linear fit in the total cell count and the fluid cell count, obtained on
// SuperMUC Phase 2 with the optimized D3Q19 LBM kernel (no forces).
real_t fittedLBMWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
{
   const real_t cells      = real_c( blockInfo.numberOfCells );
   const real_t fluidCells = real_c( blockInfo.numberOfFluidCells );
   return real_t(9.990957667738165e-06) * cells
        + real_t(0.00015749920523711047) * fluidCells
        + real_t(-0.08232498905584973);
}
// Estimated runtime share of the boundary handling sweep on a block (sphere setup).
// Linear fit in the total cell count and the near-boundary cell count
// (optimized D3Q19 LBM kernel, no forces).
real_t fittedBHWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
{
   const real_t cells             = real_c( blockInfo.numberOfCells );
   const real_t nearBoundaryCells = real_c( blockInfo.numberOfNearBoundaryCells );
   return real_t(6.654810986939097e-06) * cells
        + real_t(0.0007061414693533274) * nearBoundaryCells
        + real_t(-0.1094292992294259);
}
// Estimated runtime share of the first coupling step on a block (sphere setup).
// Linear fit in cell counts and local/shadow body counts
// (optimized D3Q19 LBM kernel, no forces).
real_t fittedCoupling1WeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
{
   const real_t cells        = real_c( blockInfo.numberOfCells );
   const real_t fluidCells   = real_c( blockInfo.numberOfFluidCells );
   const real_t localBodies  = real_c( blockInfo.numberOfLocalBodies );
   const real_t shadowBodies = real_c( blockInfo.numberOfShadowBodies );
   return real_t(3.07542641675429e-06) * cells
        + real_t(2.419364600880769e-07) * fluidCells
        + real_t(0.01413718259604757) * localBodies
        + real_t(0.027761707343462727) * shadowBodies
        + real_t(-0.13991481483939272);
}
// Estimated runtime share of the second coupling step on a block (sphere setup).
// Linear fit in cell counts and local/shadow body counts
// (optimized D3Q19 LBM kernel, no forces).
real_t fittedCoupling2WeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
{
   const real_t cells        = real_c( blockInfo.numberOfCells );
   const real_t fluidCells   = real_c( blockInfo.numberOfFluidCells );
   const real_t localBodies  = real_c( blockInfo.numberOfLocalBodies );
   const real_t shadowBodies = real_c( blockInfo.numberOfShadowBodies );
   return real_t(5.988401232749505e-06) * cells
        + real_t(3.903532223977357e-06) * fluidCells
        + real_t(-0.008802674250816316) * localBodies
        + real_t(0.02505020738346139) * shadowBodies
        + real_t(-0.12970723676003335);
}
// Estimated runtime share of the rigid body solver (pe) on a block (sphere setup).
// Per-sub-cycle cost model: quadratic in the total body count, linear in
// local bodies, shadow bodies and contacts, scaled by the number of pe sub cycles.
real_t fittedPEWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
{
   // fit coefficients (SuperMUC Phase 2, optimized D3Q19 LBM kernel, no forces)
   const real_t cPlPs2 = real_t(1.1562854544700417e-06);
   const real_t cPl    = real_t(0.0009620525068318354);
   const real_t cPs    = real_t(0.00027549401081063894);
   const real_t cCt    = real_t(0.0014801932788115464);
   const real_t c      = real_t(0.01883682418448259);

   const real_t allBodies = real_c( blockInfo.numberOfLocalBodies + blockInfo.numberOfShadowBodies );
   const real_t perSubCycle = cPlPs2 * allBodies * allBodies
                            + cPl * real_c( blockInfo.numberOfLocalBodies )
                            + cPs * real_c( blockInfo.numberOfShadowBodies )
                            + cCt * real_c( blockInfo.numberOfContacts )
                            + c;
   return real_c( blockInfo.numberOfPeSubCycles ) * perSubCycle;
}
// Total estimated block weight for the sphere setup:
// sum of the per-algorithm-part estimators (LBM, boundary handling,
// both coupling steps, and the rigid body solver).
real_t fittedTotalWeightEvaluationFunctionSpheres(const pe_coupling::BlockInfo& blockInfo)
{
   real_t total = fittedLBMWeightEvaluationFunctionSpheres(blockInfo);
   total += fittedBHWeightEvaluationFunctionSpheres(blockInfo);
   total += fittedCoupling1WeightEvaluationFunctionSpheres(blockInfo);
   total += fittedCoupling2WeightEvaluationFunctionSpheres(blockInfo);
   total += fittedPEWeightEvaluationFunctionSpheres(blockInfo);
   return total;
}
////////////////
// Ellipsoids //
////////////////
// Estimated runtime share of the LBM sweep plus boundary handling on a block
// (ellipsoid setup). The boundary-handling term only applies to blocks that
// actually contain near-boundary cells.
real_t fittedLBMWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
{
   const real_t cells      = real_c( blockInfo.numberOfCells );
   const real_t fluidCells = real_c( blockInfo.numberOfFluidCells );

   // base LBM cost (fit: optimized D3Q19 kernel, no forces)
   real_t weight = real_t(4.69973868717e-05) * cells + real_t(0.000110568537442) * fluidCells;

   // additional boundary-handling cost
   if( blockInfo.numberOfNearBoundaryCells > uint_t(0) )
   {
      weight += real_t(5.96551488486e-05) * cells
              + real_t(-5.75351782026e-05) * fluidCells
              + real_t(0.000695800745231) * real_c( blockInfo.numberOfNearBoundaryCells );
   }
   return weight;
}
// Estimated runtime share of the coupling steps on a block (ellipsoid setup).
// Blocks without any (local or shadow) bodies incur no coupling cost.
real_t fittedCouplingWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
{
   const uint_t bodiesOnBlock = blockInfo.numberOfLocalBodies + blockInfo.numberOfShadowBodies;
   if( bodiesOnBlock == uint_t(0) )
      return real_t(0);

   // fit coefficients (optimized D3Q19 LBM kernel, no forces)
   return real_t(0.000176674935526) * real_c( blockInfo.numberOfCells )
        + real_t(-0.000170513513027) * real_c( blockInfo.numberOfFluidCells )
        + real_t(0.0252031634776) * real_c( blockInfo.numberOfLocalBodies )
        + real_t(0.0356835220918) * real_c( blockInfo.numberOfShadowBodies );
}
// Estimated runtime share of the rigid body solver (pe) on a block (ellipsoid setup).
// Per-sub-cycle cost: quadratic in the total body count, linear in local
// bodies, shadow bodies and contacts, scaled by the number of pe sub cycles.
real_t fittedPEWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
{
   const real_t allBodies = real_c( blockInfo.numberOfLocalBodies + blockInfo.numberOfShadowBodies );

   // fit coefficients (optimized D3Q19 LBM kernel, no forces)
   const real_t perSubCycle = real_t(8.24153555785e-06) * allBodies * allBodies
                            + real_t(0.00135966650494) * real_c( blockInfo.numberOfLocalBodies )
                            + real_t(0.00440464092538) * real_c( blockInfo.numberOfShadowBodies )
                            + real_t(0.0216278259881) * real_c( blockInfo.numberOfContacts );
   return real_c( blockInfo.numberOfPeSubCycles ) * perSubCycle;
}
// Total estimated block weight for the ellipsoid setup:
// sum of the LBM(+BH), coupling, and rigid-body-solver estimators.
real_t fittedTotalWeightEvaluationFunctionEllipsoids(const pe_coupling::BlockInfo& blockInfo)
{
   real_t total = fittedLBMWeightEvaluationFunctionEllipsoids(blockInfo);
   total += fittedCouplingWeightEvaluationFunctionEllipsoids(blockInfo);
   total += fittedPEWeightEvaluationFunctionEllipsoids(blockInfo);
   return total;
}
// Functor that logs the contents of a timing pool on the root process
// every 'interval' time steps (never, if interval == 0).
struct TimingPoolLogger
{
   TimingPoolLogger( const shared_ptr<WcTimingPool> & timingPool, const shared_ptr<SweepTimeloop> & timeloop, const uint_t interval )
      : timingPool_( timingPool ), timeloop_( timeloop ), interval_( interval )
   {
   }

   void operator()()
   {
      // interval == 0 disables logging entirely
      if( interval_ == uint_t(0) )
         return;
      if( timeloop_->getCurrentTimeStep() % interval_ != uint_t(0) )
         return;
      timingPool_->logResultOnRoot();
   }

private:
   shared_ptr<WcTimingPool> timingPool_;
   shared_ptr<SweepTimeloop> timeloop_;
   uint_t interval_;
};
// Functor that synchronizes, reduces, and logs a timing tree on the root
// process every 'interval' time steps (never, if interval == 0).
struct TimingTreeLogger
{
   TimingTreeLogger( const shared_ptr<WcTimingTree> & timingTree, const shared_ptr<SweepTimeloop> & timeloop, const uint_t interval )
      : timingTree_( timingTree ), timeloop_( timeloop ), interval_( interval )
   {
   }

   void operator()()
   {
      // interval == 0 disables logging entirely
      if( interval_ == uint_t(0) )
         return;
      if( timeloop_->getCurrentTimeStep() % interval_ != uint_t(0) )
         return;
      timingTree_->synchronize();
      auto reducedTimingTree = timingTree_->getReduced();
      WALBERLA_LOG_INFO_ON_ROOT( reducedTimingTree );
   }

private:
   shared_ptr<WcTimingTree> timingTree_;
   shared_ptr<SweepTimeloop> timeloop_;
   uint_t interval_;
};
/////////////////////
// BLOCK STRUCTURE //
/////////////////////
// Marks every block (below the finest level) for refinement whose
// ghost-layer-extended AABB intersects the given refinement box.
static void refinementSelection( SetupBlockForest& forest, uint_t levels, const AABB & refinementBox )
{
   const auto finestDx    = real_t(1); // dx on the finest level
   const uint_t finestLevel = levels - uint_t(1);

   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
   {
      const uint_t level = blockIt->getLevel();

      // grid spacing on this block's level
      const real_t dxOnLevel = finestDx * real_c( uint_t(1) << ( finestLevel - level ) );

      // extend block AABB by ghostlayers before the intersection test
      const AABB extendedBlockAABB = blockIt->getAABB().getExtended( dxOnLevel * real_c(FieldGhostLayers) );

      if( level < finestLevel && extendedBlockAABB.intersects( refinementBox ) )
         blockIt->setMarker( true );
   }
}
// Assigns a workload of 2^level to each block (work doubles per refinement
// level) and a uniform memory requirement of 1.
static void workloadAndMemoryAssignment( SetupBlockForest& forest )
{
   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
   {
      blockIt->setWorkload( numeric_cast< workload_t >( uint_t(1) << blockIt->getLevel() ) );
      blockIt->setMemory( numeric_cast< memory_t >(1) );
   }
}
/*
 * Creates the refined, load-balanced block structure for the simulation domain.
 *
 * \param domainAABB                 axis-aligned bounding box of the whole domain
 * \param blockSizeInCells           cells per block, per direction (on every level)
 * \param numberOfLevels             number of refinement levels
 * \param refinementBox              region that is statically refined to the finest level
 * \param useBox                     if true, x and y are walls instead of periodic
 * \param loadDistributionStrategy   "Hilbert", "Morton", "ParMetis", or "Diffusive"
 * \param keepGlobalBlockInformation forwarded to the BlockForest constructor
 *
 * Aborts if the domain cannot be decomposed into the requested blocks/levels
 * or if an unknown load distribution strategy is given.
 */
static shared_ptr< StructuredBlockForest > createBlockStructure( const AABB & domainAABB, Vector3<uint_t> blockSizeInCells,
                                                                 uint_t numberOfLevels, const AABB & refinementBox,
                                                                 bool useBox, const std::string & loadDistributionStrategy,
                                                                 bool keepGlobalBlockInformation = false )
{
   SetupBlockForest sforest;

   // number of blocks if the whole domain were resolved with finest-level blocks
   Vector3<uint_t> numberOfFineBlocksPerDirection( uint_c(domainAABB.size(0)) / blockSizeInCells[0],
                                                   uint_c(domainAABB.size(1)) / blockSizeInCells[1],
                                                   uint_c(domainAABB.size(2)) / blockSizeInCells[2] );

   for(uint_t i = 0; i < 3; ++i )
   {
      WALBERLA_CHECK_EQUAL( numberOfFineBlocksPerDirection[i] * blockSizeInCells[i], uint_c(domainAABB.size(i)),
                            "Domain can not be decomposed in direction " << i << " into fine blocks of size " << blockSizeInCells[i] );
   }

   // the coarse grid must contain a whole number of blocks as well
   uint_t levelScalingFactor = ( uint_t(1) << ( numberOfLevels - uint_t(1) ) );
   Vector3<uint_t> numberOfCoarseBlocksPerDirection( numberOfFineBlocksPerDirection / levelScalingFactor );

   for(uint_t i = 0; i < 3; ++i )
   {
      WALBERLA_CHECK_EQUAL(numberOfCoarseBlocksPerDirection[i] * levelScalingFactor, numberOfFineBlocksPerDirection[i],
                           "Domain can not be refined in direction " << i << " according to the specified number of levels!" );
   }

   WALBERLA_LOG_INFO_ON_ROOT(" - refinement box: " << refinementBox);

   MPIManager::instance()->useWorldComm();

   sforest.addRefinementSelectionFunction( [numberOfLevels, refinementBox](auto && PH1) { refinementSelection(std::forward<decltype(PH1)>(PH1), numberOfLevels, refinementBox); } );
   sforest.addWorkloadMemorySUIDAssignmentFunction( workloadAndMemoryAssignment );

   // periodic in x and y unless the domain is bounded by walls ("box")
   Vector3<bool> periodicity( true, true, false);
   if( useBox )
   {
      periodicity[0] = false;
      periodicity[1] = false;
   }
   sforest.init( domainAABB,
                 numberOfCoarseBlocksPerDirection[0], numberOfCoarseBlocksPerDirection[1], numberOfCoarseBlocksPerDirection[2],
                 periodicity[0], periodicity[1], periodicity[2]);

   // calculate process distribution
   const memory_t memoryLimit = math::Limits< memory_t >::inf();

   if( loadDistributionStrategy == "Hilbert" )
   {
      bool useHilbert = true;
      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
   } else if ( loadDistributionStrategy == "Morton" )
   {
      bool useHilbert = false;
      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
   } else if ( loadDistributionStrategy == "ParMetis" )
   {
      blockforest::StaticLevelwiseParMetis::Algorithm algorithm = blockforest::StaticLevelwiseParMetis::Algorithm::PARMETIS_PART_GEOM_KWAY;
      blockforest::StaticLevelwiseParMetis staticParMetis(algorithm);
      sforest.balanceLoad( staticParMetis, uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
   } else if (loadDistributionStrategy == "Diffusive" )
   {
      // diffusive balancing only acts during refreshes -> also use a Hilbert curve initially
      bool useHilbert = true;
      sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(useHilbert), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
   } else
   {
      // fixed: the message used to contain a stray "\t" where the closing quotation mark belongs
      WALBERLA_ABORT("Load distribution strategy \"" << loadDistributionStrategy << "\" not implemented! - Aborting" );
   }

   WALBERLA_LOG_INFO_ON_ROOT( sforest );

   // create StructuredBlockForest (encapsulates a newly created BlockForest)
   shared_ptr< StructuredBlockForest > sbf =
         make_shared< StructuredBlockForest >( make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, keepGlobalBlockInformation ),
                                               blockSizeInCells[0], blockSizeInCells[1], blockSizeInCells[2]);
   sbf->createCellBoundingBoxes();

   return sbf;
}
/////////////////////////////////////
// BOUNDARY HANDLING CUSTOMIZATION //
/////////////////////////////////////
// Block data handling that creates the boundary handling (no-slip walls +
// curved moving obstacles) on every block, including newly created ones
// after refinement ("AlwaysInitialize").
class MyBoundaryHandling : public blockforest::AlwaysInitializeBlockDataHandling< BoundaryHandling_T >
{
public:
   // blocks is held as weak_ptr to avoid a lifetime cycle with the block storage
   MyBoundaryHandling( const weak_ptr< StructuredBlockStorage > & blocks,
                       const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID, const BlockDataID & bodyFieldID ) :
      blocks_( blocks ), flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), bodyFieldID_ ( bodyFieldID )
   {}

   BoundaryHandling_T * initialize( IBlock * const block ) override;

private:
   weak_ptr< StructuredBlockStorage > blocks_;

   const BlockDataID flagFieldID_;
   const BlockDataID pdfFieldID_;
   const BlockDataID bodyFieldID_;

}; // class MyBoundaryHandling
// Creates the boundary handling object for one block; the returned raw
// pointer's ownership is taken over by the block data infrastructure
// (NOTE(review): this is the contract implied by BlockDataHandling — confirm).
BoundaryHandling_T * MyBoundaryHandling::initialize( IBlock * const block )
{
   WALBERLA_ASSERT_NOT_NULLPTR( block );

   auto * flagField = block->getData< FlagField_T >( flagFieldID_ );
   auto * pdfField = block->getData< PdfField_T > ( pdfFieldID_ );
   auto * bodyField = block->getData< BodyField_T >( bodyFieldID_ );

   // register the fluid flag only once per flag field
   const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag );

   auto blocksPtr = blocks_.lock();
   WALBERLA_CHECK_NOT_NULLPTR( blocksPtr );

   BoundaryHandling_T * handling = new BoundaryHandling_T( "moving obstacle boundary handling", flagField, fluid,
                                                           NoSlip_T( "NoSlip", NoSlip_Flag, pdfField ),
                                                           MO_T( "MO", MO_Flag, pdfField, flagField, bodyField, fluid, *blocksPtr, *block ),
                                                           BoundaryHandling_T::Mode::ENTIRE_FIELD_TRAVERSAL);

   // initially the whole field (incl. ghost layers) is fluid
   handling->fillWithDomain( FieldGhostLayers );

   return handling;
}
//*******************************************************************************************************************
//*******************************************************************************************************************
/*!\brief Evaluating the position and velocity of the sediments
*
*/
//*******************************************************************************************************************
class PropertyLogger
{
public:
   /*
    * \param timeloop      queried for the current time step when logging
    * \param blocks        block storage to iterate for local bodies
    * \param bodyStorageID block data ID of the pe body storage
    * \param fileName      output file (overwritten on construction)
    * \param fileIO        if true, results are appended to fileName each call
    */
   PropertyLogger( const shared_ptr<SweepTimeloop> & timeloop, const shared_ptr< StructuredBlockStorage > & blocks,
                   const BlockDataID & bodyStorageID, const std::string & fileName, bool fileIO) :
      timeloop_( timeloop ), blocks_( blocks ), bodyStorageID_( bodyStorageID ), fileName_( fileName ), fileIO_(fileIO),
      meanPos_( real_t(0) ), meanVel_( real_t(0) ), maxVel_( real_t(0) )
   {
      if ( fileIO_ )
      {
         // write the header once, on the root process only
         WALBERLA_ROOT_SECTION()
         {
            std::ofstream file;
            file.open( fileName_.c_str() );
            file << "#\t pos\t vel\t maxVel\n";
            file.close();
         }
      }
   }

   // Evaluates mean z-position, mean z-velocity and maximum |z-velocity|
   // over all sediments (globally reduced) and optionally logs them.
   void operator()()
   {
      const uint_t timestep (timeloop_->getCurrentTimeStep() );

      auto numSediments = uint_t(0);
      auto meanPos = real_t(0);
      auto meanVel = real_t(0);
      auto maxVel = real_t(0);

      // accumulate over all bodies local to this process
      for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
      {
         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
         {
            meanPos += bodyIt->getPosition()[2];
            meanVel += bodyIt->getLinearVel()[2];
            maxVel = std::max(maxVel, std::fabs(bodyIt->getLinearVel()[2]));
            ++numSediments;
         }
      }

      // global reduction over all processes
      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace( numSediments, mpi::SUM );
         mpi::allReduceInplace( meanPos, mpi::SUM );
         mpi::allReduceInplace( meanVel, mpi::SUM );
         mpi::allReduceInplace( maxVel, mpi::MAX );
      }

      // guard against division by zero when no sediments are present
      // (previously this produced NaN means)
      if( numSediments > uint_t(0) )
      {
         meanPos /= real_c(numSediments);
         meanVel /= real_c(numSediments);
      }

      meanPos_ = meanPos;
      meanVel_ = meanVel;
      maxVel_ = maxVel;

      if( fileIO_ )
         writeToFile( timestep );
   }

   real_t getMeanPosition() const
   {
      return meanPos_;
   }

   real_t getMaxVelocity() const
   {
      return maxVel_;
   }

   real_t getMeanVelocity() const
   {
      return meanVel_;
   }

private:
   // appends one line "<timestep> <meanPos> <meanVel> <maxVel>" (root only)
   void writeToFile( uint_t timestep )
   {
      WALBERLA_ROOT_SECTION()
      {
         std::ofstream file;
         file.open( fileName_.c_str(), std::ofstream::app );

         file << timestep << "\t" << meanPos_ << "\t" << meanVel_ << "\t" << maxVel_ << "\n";

         file.close();
      }
   }

   shared_ptr<SweepTimeloop> timeloop_;
   shared_ptr< StructuredBlockStorage > blocks_;
   const BlockDataID bodyStorageID_;
   std::string fileName_;
   bool fileIO_;

   real_t meanPos_;
   real_t meanVel_;
   real_t maxVel_;
};
/// Removes all boundary/domain flags (incl. ghost layers) on every block so the
/// boundary handling can be rebuilt after a refinement/load-balancing step.
void clearBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID )
{
   for( auto & block : forest )
   {
      block.getData< BoundaryHandling_T >( boundaryHandlingID )->clear( FieldGhostLayers );
   }
}
/// Resets the body field (incl. ghost layers) on every block, i.e. removes all
/// body references from the cells; the mapping is recreated afterwards.
void clearBodyField( BlockForest & forest, const BlockDataID & bodyFieldID )
{
   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
   {
      auto * bodyField = blockIt->getData<BodyField_T>(bodyFieldID);
      // nullptr instead of NULL for consistency with the rest of the file
      bodyField->setWithGhostLayer( nullptr );
   }
}
/// Re-initializes the boundary handling on every block by marking the whole
/// domain (incl. ghost layers) as fluid again.
void recreateBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID )
{
   for( auto & block : forest )
   {
      block.getData< BoundaryHandling_T >( boundaryHandlingID )->fillWithDomain( FieldGhostLayers );
   }
}
/// Accumulates timing results per grid level from a level-wise timing pool and,
/// on the finest level, additionally from the pe timing tree.
class TimingEvaluator
{
public:
   TimingEvaluator( const shared_ptr<WcTimingPool> & levelwiseTimingPool, const shared_ptr<WcTimingTree> & peTimingTree, uint_t numberOfLevels)
   : levelwiseTimingPool_( levelwiseTimingPool ), peTimingTree_( peTimingTree ), numberOfLevels_( numberOfLevels )
   {}

   /// Returns the summed total time of all given timers on the given level.
   /// Level-wise pool timers are named "<name> (<level>)"; if the name carries a
   /// "[suffix]", the level is inserted before it: "<name>(<level>) [suffix]".
   /// pe timers are not level-resolved and are only added on the finest level.
   real_t getTimings(const std::vector<std::string> & timerNames, uint_t level ) const
   {
      auto timing = real_t(0);
      for (const auto &timerName : timerNames)
      {
         std::string timerNameLvlWise = timerName;

         // put level between timer string and possible suffix
         auto suffixBegin = timerNameLvlWise.find_first_of('[');
         if( suffixBegin != std::string::npos)
         {
            // suffix detected
            auto suffixEnd = timerNameLvlWise.find_last_of(']');
            if( suffixEnd != std::string::npos)
            {
               auto timerString = timerNameLvlWise.substr(0,suffixBegin);
               auto suffixString = timerNameLvlWise.substr(suffixBegin,suffixEnd-suffixBegin+1);
               timerNameLvlWise = timerString + "(" + std::to_string(level) + ") " + suffixString; // NOLINT
            }
            else
            {
               WALBERLA_ABORT("Invalid timer string");
            }
         }
         else
         {
            timerNameLvlWise += " (" + std::to_string(level) + ")";
         }

         if( levelwiseTimingPool_->timerExists(timerNameLvlWise))
            timing += real_c((*levelwiseTimingPool_)[timerNameLvlWise].total());

         // pe timers carry no level information -> attribute them to the finest level only
         if( level == numberOfLevels_- 1)
         {
            if( peTimingTree_->timerExists(timerName))
               timing += real_c((*peTimingTree_)[timerName].total());
         }
      }
      return timing;
   }

private:
   shared_ptr<WcTimingPool> levelwiseTimingPool_;
   shared_ptr<WcTimingTree> peTimingTree_;
   uint_t numberOfLevels_;
};
/// Sums the estimated computational weight of all local blocks on the given level,
/// using the load estimator selected by loadEvaluationStrategy
/// ("LBM", "PE", or the fitted coupling-based estimators "Fit"/"FitMulti").
real_t weightEvaluation(BlockForest & forest,
                        const shared_ptr<pe_coupling::InfoCollection>& couplingInfoCollection,
                        const shared_ptr<blockforest::InfoCollection> & peInfoCollection,
                        real_t peBlockBaseWeight,
                        const std::string & loadEvaluationStrategy,
                        uint_t level,
                        bool useEllipsoids )
{
   auto totalWeight = real_t(0);
   for( auto & iblock : forest )
   {
      // only consider blocks residing on the requested level
      if( forest.getLevel(iblock) != level ) continue;

      const auto & blockID = static_cast<blockforest::Block &>(iblock).getId();

      if( loadEvaluationStrategy == "LBM" )
      {
         // constant workload per block (cell count)
         totalWeight += pe_coupling::amr::defaultWeightEvaluationFunction( couplingInfoCollection->find( blockID )->second );
      }
      else if( loadEvaluationStrategy == "PE" )
      {
         // number of local particles plus a base weight per block
         totalWeight += real_c( peInfoCollection->find( blockID )->second.computationalWeight ) + peBlockBaseWeight;
      }
      else if( loadEvaluationStrategy == "Fit" || loadEvaluationStrategy == "FitMulti" )
      {
         // fitted coupling-based load estimator (shape dependent)
         const auto & blockInfo = couplingInfoCollection->find( blockID )->second;
         totalWeight += useEllipsoids ? fittedTotalWeightEvaluationFunctionEllipsoids( blockInfo )
                                      : fittedTotalWeightEvaluationFunctionSpheres( blockInfo );
      }
      else
      {
         WALBERLA_ABORT("Load balancing strategy not defined");
      }
   }
   return totalWeight;
}
/// Estimates the edge cut of the current block partitioning, i.e. the summed
/// connection weights between blocks that live on different processes.
/// Face, edge and corner neighbors are weighted by the (approximate) shared
/// area, length and a constant of 1, respectively.
//note: only works for edges in uniform grids
uint_t evaluateEdgeCut(BlockForest & forest)
{
   auto edgecut = uint_t(0); // = edge weights between processes

   for( auto & iblock : forest )
   {
      auto * block = static_cast<blockforest::Block*>( &iblock );

      const real_t blockVolume           = block->getAABB().volume();
      const real_t approximateEdgeLength = std::cbrt( blockVolume );

      const uint_t faceNeighborWeight   = uint_c(approximateEdgeLength * approximateEdgeLength ); //common face
      const uint_t edgeNeighborWeight   = uint_c(approximateEdgeLength); //common edge
      const uint_t cornerNeighborWeight = uint_c( 1 ); //common corner

      // add 'weightPerNeighbor' for every remote neighbor in the given neighborhood sections
      auto addRemoteNeighborWeights = [block, &edgecut](const auto & sectionIndices, const uint_t weightPerNeighbor)
      {
         for( const uint_t idx : sectionIndices )
         {
            for (auto nb = uint_t(0); nb < block->getNeighborhoodSectionSize(idx); ++nb)
            {
               if( block->neighborExistsRemotely(idx,nb) ) edgecut += weightPerNeighbor;
            }
         }
      };

      addRemoteNeighborWeights( blockforest::getFaceNeighborhoodSectionIndices(),   faceNeighborWeight );
      addRemoteNeighborWeights( blockforest::getEdgeNeighborhoodSectionIndices(),   edgeNeighborWeight );
      addRemoteNeighborWeights( blockforest::getCornerNeighborhoodSectionIndices(), cornerNeighborWeight );
   }
   return edgecut;
}
/// Extracts the accumulated total time of the LBM time step and of the
/// refinement/load-balancing check from the time loop's timing pool.
/// Values are reduced onto (and thus only meaningful on) the root process.
void evaluateTotalSimulationTimePassed(WcTimingPool & timeloopTimingPool, real_t & totalSimTime, real_t & totalLBTime)
{
   shared_ptr< WcTimingPool > reduced = timeloopTimingPool.getReduced(timing::REDUCE_TOTAL, 0);

   auto simTime = real_t(0);
   auto lbTime  = real_t(0);
   WALBERLA_ROOT_SECTION()
   {
      simTime = real_c((*reduced)["LBM refinement time step"].total());
      lbTime  = real_c((*reduced)["refinement checking"].total());
   }

   totalSimTime = simTime;
   totalLBTime  = lbTime;
}
/*
 * Creates 'numberOfSediments' pe bodies (spheres, or prolate ellipsoids if useEllipsoids)
 * at random positions inside 'generationDomain' and packs them towards the top of the domain:
 *  1) random creation (positions are drawn on root and broadcast, so all processes agree)
 *  2) 100 pe-only time steps with velocities reset to zero each step, to resolve initial overlaps
 *  3) pe-only time steps with reduced upward gravity until all bodies lie above 'heightBorder'
 *     (or until the minimum body position has converged)
 *  4) a few relaxation steps with the gravity reverted, then all body velocities are reset to zero
 * Finally the global linear acceleration of 'cr' is reset to zero.
 */
void createSedimentLayer(uint_t numberOfSediments, const AABB & generationDomain, real_t diameter, real_t heightBorder,
                         pe::MaterialID peMaterial,
                         pe::cr::HCSITS & cr, const std::function<void(void)> & syncCall,
                         const shared_ptr< StructuredBlockForest > & blocks,
                         const shared_ptr<pe::BodyStorage> & globalBodyStorage, BlockDataID bodyStorageID,
                         real_t gravitationalAcceleration, bool useEllipsoids, bool shortRun)
{
   WALBERLA_LOG_INFO_ON_ROOT("Starting creation of sediments");

   auto xParticle = real_t(0);
   auto yParticle = real_t(0);
   auto zParticle = real_t(0);

   for( uint_t nSed = 0; nSed < numberOfSediments; ++nSed )
   {
      // draw the position on root only ...
      WALBERLA_ROOT_SECTION()
      {
         xParticle = math::realRandom<real_t>(generationDomain.xMin(), generationDomain.xMax());
         yParticle = math::realRandom<real_t>(generationDomain.yMin(), generationDomain.yMax());
         zParticle = math::realRandom<real_t>(generationDomain.zMin(), generationDomain.zMax());
      }
      // ... and broadcast it, so every process creates the body at the same position
      WALBERLA_MPI_SECTION()
      {
         mpi::broadcastObject( xParticle );
         mpi::broadcastObject( yParticle );
         mpi::broadcastObject( zParticle );
      }

      if( useEllipsoids )
      {
         // prolate ellipsoids: semi-axes chosen such that the volume matches a sphere of 'diameter'
         auto axisFactor = real_t(1.5);
         real_t axisFactor2 = std::sqrt(real_t(1)/axisFactor);
         real_t radius = diameter * real_t(0.5);
         pe::createEllipsoid( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), Vector3<real_t>(axisFactor*radius, axisFactor2*radius, axisFactor2*radius), peMaterial );
      }
      else
      {
         pe::createSphere( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), diameter * real_t(0.5), peMaterial );
      }
   }
   syncCall();

   // carry out 100 simulations to resolve all overlaps
   for (auto pet = uint_t(1); pet <= uint_t(100); ++pet)
   {
      cr.timestep( real_t(1) );
      syncCall();

      // reset all velocities to zero (we only want the overlap resolution, no dynamics)
      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
      {
         for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
         {
            bodyIt->setLinearVel(Vector3<real_t>(real_t(0)));
            bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
         }
      }
   }

   const auto maxInitialPeSteps = (shortRun) ? uint_t(10) : uint_t(200000);
   const auto dt_PE_init = real_t(1);

   // push the bodies upwards with a reduced (10%) gravitational acceleration
   real_t gravityGeneration = real_t(0.1) * gravitationalAcceleration;
   cr.setGlobalLinearAcceleration(Vector3<real_t>(real_t(0), real_t(0), gravityGeneration));

   auto oldMinBodyPosition = real_t(0);
   real_t convergenceLimit = std::fabs(gravityGeneration);

   for (auto pet = uint_t(1); pet <= maxInitialPeSteps; ++pet)
   {
      cr.timestep( dt_PE_init );
      syncCall();

      // find the globally lowest body position
      real_t minBodyPosition = generationDomain.zMax();
      for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
      {
         for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
         {
            minBodyPosition = std::min(bodyIt->getPosition()[2], minBodyPosition);
         }
      }
      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace(minBodyPosition, mpi::MIN);
      }

      // all bodies are above the target height -> packing is done
      if( minBodyPosition > heightBorder ) break;

      // every 500 steps: stop if the minimum position has converged
      // (relative change check; assumes minBodyPosition > 0 -- bodies sit above z = 0)
      if( pet % 500 == 0)
      {
         if( std::fabs(minBodyPosition - oldMinBodyPosition) / minBodyPosition < convergenceLimit ) break;
         oldMinBodyPosition = minBodyPosition;
      }

      WALBERLA_ROOT_SECTION()
      {
         if( pet % 100 == 0)
         {
            WALBERLA_LOG_INFO("[" << pet << "] Min position of all bodies = " << minBodyPosition << " with goal height " << heightBorder);
         }
      }
   }

   // revert gravitational acceleration to 'real' direction
   cr.setGlobalLinearAcceleration(Vector3<real_t>(real_t(0), real_t(0), -gravityGeneration));

   // carry out a few time steps to relax the system towards the real condition
   const auto relaxationTimeSteps = uint_t(std::sqrt(real_t(2)/std::fabs(gravitationalAcceleration)));
   WALBERLA_LOG_INFO_ON_ROOT("Carrying out " << relaxationTimeSteps << " more time steps with correct gravity");
   for (auto pet = uint_t(1); pet <= relaxationTimeSteps; ++pet)
   {
      cr.timestep(dt_PE_init);
      syncCall();
   }
   WALBERLA_LOG_INFO_ON_ROOT("Sediment layer creation done!");

   // reset all velocities to zero, so the coupled simulation starts from rest
   Vector3<real_t> initialBodyVelocity(real_t(0));
   WALBERLA_LOG_INFO_ON_ROOT("Setting initial velocity " << initialBodyVelocity << " of all bodies");
   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
   {
      for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
      {
         bodyIt->setLinearVel(initialBodyVelocity);
         bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
      }
   }

   // gravity is applied via the coupling later on -> disable it in the pe solver
   cr.setGlobalLinearAcceleration(Vector3<real_t>(real_t(0)));
}
//*******************************************************************************************************************
/*!\brief Simulation of settling particles inside a rectangular column filled with viscous fluid
*
* This application is used in the paper
* Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", submitted to Computation
* in Section 4 to apply the load estimator and to evaluate different load distribution strategies.
*
* It, however, features several different command line arguments that can be used to tweak the simulation.
 * The setup can be horizontally periodic, a box or a hopper geometry (configurable, as in the paper).
* The size, resolution and used blocks for the domain partitioning can be changed.
* It even features adaptive mesh refinement, with different refinement criteria:
* - particle based (always on, also for global bodies like bounding planes)
* - optionally: vorticity- or gradient-based (with lower and upper limits)
* Since the paper, however, uses a uniform grid, many evaluation functionalities might not work properly for this case.
* Initially, all particles are pushed upwards to obtain a dense packing at the top plane.
*
* Most importantly, the load balancing can be modified:
* - load estimation strategies:
* - pure LBM = number of cells per block = constant workload per block
* - pure PE = number of local particles + baseweight
* - coupling based load estimator = use fitted function from Sec. 3 of paper
* - load distribution strategies:
* - space-filling curves: Hilbert and Morton
* - ParMETIS (and several algorithms and parameters, also multiple constraints possible)
* - diffusive (and options)
* - load balancing (/refinement check ) frequency
*/
//*******************************************************************************************************************
int main( int argc, char **argv )
{
debug::enterTestMode();
mpi::Environment env( argc, argv );
///////////////////
// Customization //
///////////////////
// simulation control
bool shortRun = false;
bool funcTest = false;
bool fileIO = true;
bool logging = false; // logging of physical components
uint_t vtkWriteFreqDD = 0; //domain decomposition
uint_t vtkWriteFreqBo = 0; //bodies
uint_t vtkWriteFreqFl = 0; //fluid
uint_t vtkWriteFreq = 0; //general
std::string baseFolder = "vtk_out_AMRSedimentSettling"; // folder for vtk and file output
// physical setup
auto GalileoNumber = real_t(50);
auto densityRatio = real_t(1.5);
auto diameter = real_t(15);
auto solidVolumeFraction = real_t(0.1);
auto blockSize = uint_t(32);
auto XBlocks = uint_t(12);
auto YBlocks = uint_t(12);
auto ZBlocks = uint_t(16);
bool useBox = false;
bool useHopper = false;
bool useEllipsoids = false;
auto hopperRelHeight = real_t(0.5); // for hopper setup
auto hopperRelOpening = real_t(0.3); // for hopper setup
auto timestepsOnFinestLevel = uint_t(80000);
//numerical parameters
bool averageForceTorqueOverTwoTimSteps = true;
auto numberOfLevels = uint_t(1);
auto refinementCheckFrequency = uint_t(100);
auto numPeSubCycles = uint_t(10);
// refinement criteria
auto lowerFluidRefinementLimit = real_t(0);
auto upperFluidRefinementLimit = std::numeric_limits<real_t>::infinity();
bool useVorticityCriterion = false;
bool useGradientCriterion = false;
// load balancing
std::string loadEvaluationStrategy = "LBM"; //LBM, PE, Fit
std::string loadDistributionStrategy = "Hilbert"; //Morton, Hilbert, ParMetis, Diffusive
auto parMetis_ipc2redist = real_t(1000);
auto parMetisTolerance = real_t(-1);
std::string parMetisAlgorithmString = "ADAPTIVE_REPART";
auto diffusionFlowIterations = uint_t(15);
auto diffusionMaxIterations = uint_t(20);
for( int i = 1; i < argc; ++i )
{
if( std::strcmp( argv[i], "--shortRun" ) == 0 ) { shortRun = true; continue; }
if( std::strcmp( argv[i], "--funcTest" ) == 0 ) { funcTest = true; continue; }
if( std::strcmp( argv[i], "--fileIO" ) == 0 ) { fileIO = true; continue; }
if( std::strcmp( argv[i], "--logging" ) == 0 ) { logging = true; continue; }
if( std::strcmp( argv[i], "--vtkWriteFreqDD" ) == 0 ) { vtkWriteFreqDD = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--vtkWriteFreqBo" ) == 0 ) { vtkWriteFreqBo = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--vtkWriteFreqFl" ) == 0 ) { vtkWriteFreqFl = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--vtkWriteFreq" ) == 0 ) { vtkWriteFreq = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--baseFolder" ) == 0 ) { baseFolder = argv[++i]; continue; }
if( std::strcmp( argv[i], "--densityRatio" ) == 0 ) { densityRatio = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--Ga" ) == 0 ) { GalileoNumber = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--diameter" ) == 0 ) { diameter = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--blockSize" ) == 0 ) { blockSize = uint_c(std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--XBlocks" ) == 0 ) { XBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--YBlocks" ) == 0 ) { YBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--ZBlocks" ) == 0 ) { ZBlocks = uint_c(std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--useBox" ) == 0 ) { useBox = true; continue; }
if( std::strcmp( argv[i], "--useHopper" ) == 0 ) { useHopper = true; continue; }
if( std::strcmp( argv[i], "--hopperHeight" ) == 0 ) { hopperRelHeight = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--hopperOpening" ) == 0 ) { hopperRelOpening = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--timesteps" ) == 0 ) { timestepsOnFinestLevel = uint_c(std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--noForceAveraging" ) == 0 ) { averageForceTorqueOverTwoTimSteps = false; continue; }
if( std::strcmp( argv[i], "--numPeSubCycles" ) == 0 ) { numPeSubCycles = uint_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--numLevels" ) == 0 ) { numberOfLevels = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--refinementCheckFrequency" ) == 0 ) { refinementCheckFrequency = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--lowerLimit" ) == 0 ) { lowerFluidRefinementLimit = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--upperLimit" ) == 0 ) { upperFluidRefinementLimit = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--useVorticityCriterion" ) == 0 ) { useVorticityCriterion = true; continue; }
if( std::strcmp( argv[i], "--useGradientCriterion" ) == 0 ) { useGradientCriterion = true; continue; }
if( std::strcmp( argv[i], "--loadEvaluationStrategy" ) == 0 ) { loadEvaluationStrategy = argv[++i]; continue; }
if( std::strcmp( argv[i], "--loadDistributionStrategy" ) == 0 ) { loadDistributionStrategy = argv[++i]; continue; }
if( std::strcmp( argv[i], "--ipc2redist" ) == 0 ) { parMetis_ipc2redist = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--parMetisTolerance" ) == 0 ) { parMetisTolerance = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--parMetisAlgorithm" ) == 0 ) { parMetisAlgorithmString = argv[++i]; continue; }
if( std::strcmp( argv[i], "--diffusionFlowIterations" ) == 0 ) { diffusionFlowIterations = uint_c(std::atof(argv[++i])); continue; }
if( std::strcmp( argv[i], "--diffusionMaxIterations" ) == 0 ) { diffusionMaxIterations = uint_c(std::atof(argv[++i])); continue; }
if( std::strcmp( argv[i], "--useEllipsoids" ) == 0 ) { useEllipsoids = true; continue; }
WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
}
if( funcTest )
{
walberla::logging::Logging::instance()->setLogLevel(logging::Logging::LogLevel::WARNING);
}
if( fileIO || logging )
{
WALBERLA_ROOT_SECTION(){
// create base directory if it does not yet exist
filesystem::path tpath( baseFolder );
if( !filesystem::exists( tpath ) )
filesystem::create_directory( tpath );
}
}
if( useVorticityCriterion && useGradientCriterion )
{
WALBERLA_ABORT("Use either vorticity or gradient criterion for refinement!");
}
if( loadEvaluationStrategy != "LBM" && loadEvaluationStrategy != "PE" && loadEvaluationStrategy != "Fit" && loadEvaluationStrategy != "FitMulti")
{
WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
}
if( vtkWriteFreq != 0 )
{
vtkWriteFreqDD = vtkWriteFreq;
vtkWriteFreqBo = vtkWriteFreq;
vtkWriteFreqFl = vtkWriteFreq;
}
if( diameter > real_c(blockSize) )
{
WALBERLA_LOG_WARNING("PE Body Synchronization might not work since bodies are large compared to block size!");
}
if( useHopper )
{
WALBERLA_CHECK(hopperRelHeight >= real_t(0) && hopperRelHeight <= real_t(1), "Invalid relative hopper height of " << hopperRelHeight);
WALBERLA_CHECK(hopperRelOpening >= real_t(0) && hopperRelOpening <= real_t(1), "Invalid relative hopper opening of " << hopperRelOpening);
}
//////////////////////////
// NUMERICAL PARAMETERS //
//////////////////////////
const Vector3<uint_t> domainSize( XBlocks * blockSize, YBlocks * blockSize, ZBlocks * blockSize );
const auto domainVolume = real_t(domainSize[0] * domainSize[1] * domainSize[2]);
const real_t sphereVolume = math::pi / real_t(6) * diameter * diameter * diameter;
const uint_t numberOfSediments = uint_c(std::ceil(solidVolumeFraction * domainVolume / sphereVolume));
real_t expectedSedimentVolumeFraction = (useBox||useHopper) ? real_t(0.45) : real_t(0.52);
const real_t expectedSedimentedVolume = real_t(1)/expectedSedimentVolumeFraction * real_c(numberOfSediments) * sphereVolume;
const real_t expectedSedimentedHeight = std::max(diameter, expectedSedimentedVolume / real_c(domainSize[0] * domainSize[1]));
const auto uRef = real_t(0.02);
const real_t xRef = diameter;
const real_t tRef = xRef / uRef;
const real_t gravitationalAcceleration = uRef * uRef / ( (densityRatio-real_t(1)) * diameter );
const real_t viscosity = uRef * diameter / GalileoNumber;
const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
const real_t tau = real_t(1) / omega;
const auto loggingDisplayFrequency = uint_t(100);
const auto dx = real_t(1);
const real_t overlap = real_t( 1.5 ) * dx;
if( useVorticityCriterion && floatIsEqual(lowerFluidRefinementLimit, real_t(0)) && std::isinf(upperFluidRefinementLimit) )
{
// use computed criterion instead of user input
lowerFluidRefinementLimit = real_t(0.05) * uRef;
upperFluidRefinementLimit = real_t(0.1) * uRef;
}
const uint_t finestLevel = numberOfLevels - uint_t(1);
std::stringstream omega_msg;
for( uint_t i = 0; i < numberOfLevels; ++i )
{
real_t omegaLvl = lbm::collision_model::levelDependentRelaxationParameter( i, omega, finestLevel );
omega_msg << omegaLvl << " ( on level " << i << ", tau = " << real_t(1)/omega << " ), ";
}
const uint_t levelScalingFactor = ( uint_t(1) << finestLevel );
const uint_t lbmTimeStepsPerTimeLoopIteration = levelScalingFactor;
const uint_t timesteps = funcTest ? 1 : ( shortRun ? uint_t(100) : uint_t( timestepsOnFinestLevel / lbmTimeStepsPerTimeLoopIteration ) );
WALBERLA_LOG_INFO_ON_ROOT("Setup (in simulation, i.e. lattice, units):");
WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
WALBERLA_LOG_INFO_ON_ROOT(" - sediment diameter = " << diameter );
WALBERLA_LOG_INFO_ON_ROOT(" - Galileo number = " << GalileoNumber );
WALBERLA_LOG_INFO_ON_ROOT(" - number of sediments: " << numberOfSediments);
WALBERLA_LOG_INFO_ON_ROOT(" - densityRatio = " << densityRatio );
WALBERLA_LOG_INFO_ON_ROOT(" - fluid: relaxation time (tau) = " << tau << ", kin. visc = " << viscosity );
WALBERLA_LOG_INFO_ON_ROOT(" - gravitational acceleration = " << gravitationalAcceleration );
WALBERLA_LOG_INFO_ON_ROOT(" - reference values: x = " << xRef << ", t = " << tRef << ", vel = " << uRef);
WALBERLA_LOG_INFO_ON_ROOT(" - omega: " << omega_msg.str());
WALBERLA_LOG_INFO_ON_ROOT(" - number of levels: " << numberOfLevels);
WALBERLA_LOG_INFO_ON_ROOT(" - number of pe sub cycles: " << numPeSubCycles);
if( useVorticityCriterion )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using vorticity criterion with lower limit = " << lowerFluidRefinementLimit << " and upper limit = " << upperFluidRefinementLimit );
}
if( useGradientCriterion )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using gradient criterion with lower limit = " << lowerFluidRefinementLimit << " and upper limit = " << upperFluidRefinementLimit );
}
if( vtkWriteFreqDD > 0 )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of domain decomposition to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqDD);
}
if( vtkWriteFreqBo > 0 )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of bodies data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqBo);
}
if( vtkWriteFreqFl > 0 )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of fluid data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqFl);
}
if( useEllipsoids )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using (prolate) ellipsoids as sediments");
}
if( useBox )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using box setup");
}
else if ( useHopper )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using hopper setup");
}
else
{
WALBERLA_LOG_INFO_ON_ROOT(" - using horizontally periodic domain");
}
if( refinementCheckFrequency == 0 && numberOfLevels != 1 )
{
// determine check frequency automatically based on maximum admissible velocity and block sizes
auto uMax = real_t(0.1);
refinementCheckFrequency = uint_c(( overlap + real_c(blockSize) - real_t(2) * real_t(FieldGhostLayers) * dx) / uMax) / lbmTimeStepsPerTimeLoopIteration;
}
WALBERLA_LOG_INFO_ON_ROOT(" - refinement / load balancing check frequency (coarse time steps): " << refinementCheckFrequency);
WALBERLA_LOG_INFO_ON_ROOT(" - load evaluation strategy: " << loadEvaluationStrategy);
WALBERLA_LOG_INFO_ON_ROOT(" - load distribution strategy: " << loadDistributionStrategy);
///////////////////////////
// BLOCK STRUCTURE SETUP //
///////////////////////////
Vector3<uint_t> blockSizeInCells( blockSize );
AABB simulationDomain( real_t(0), real_t(0), real_t(0), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
AABB sedimentDomain( real_t(0), real_t(0), real_c(domainSize[2]) - expectedSedimentedHeight, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
AABB initialRefinementDomain = sedimentDomain;
if( useBox || useHopper )
{
// require finest levels also along bounding planes -> initially refine everywhere
initialRefinementDomain = simulationDomain;
}
auto blocks = createBlockStructure( simulationDomain, blockSizeInCells, numberOfLevels, initialRefinementDomain, (useBox||useHopper), loadDistributionStrategy );
//write initial domain decomposition to file
if( vtkWriteFreqDD > 0 )
{
vtk::writeDomainDecomposition( blocks, "initial_domain_decomposition", baseFolder );
}
/////////////////
// PE COUPLING //
/////////////////
// set up pe functionality
shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
auto bodyStorageID = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "pe Body Storage");
auto ccdID = blocks->addBlockData(pe::ccd::createHashGridsDataHandling( globalBodyStorage, bodyStorageID ), "CCD");
BlockDataID fcdID = (useEllipsoids) ? blocks->addBlockData( pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::GJKEPACollideFunctor>(), "FCD" )
: blocks->addBlockData(pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::AnalyticCollideFunctor>(), "FCD");
shared_ptr<WcTimingTree> timingTreePE = make_shared<WcTimingTree>();
// set up collision response
pe::cr::HCSITS cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, &(*timingTreePE) );
cr.setMaxIterations(10);
cr.setRelaxationModel( pe::cr::HardContactSemiImplicitTimesteppingSolvers::ApproximateInelasticCoulombContactByDecoupling );
// set up synchronization procedure
std::function<void(void)> syncCall = [&capture0 = blocks->getBlockForest(), bodyStorageID, capture1 = &(*timingTreePE), overlap] { pe::syncNextNeighbors<BodyTypeTuple>(capture0, bodyStorageID, capture1, overlap, false); };
// create pe bodies
// add the sediments
auto peMaterial = pe::createMaterial( "mat", densityRatio, real_t(1), real_t(0.25), real_t(0.25), real_t(0), real_t(200), real_t(100), real_t(100), real_t(100) );
// create two planes at bottom and top of domain for a horizontally periodic box
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,1), Vector3<real_t>(0,0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,-1), Vector3<real_t>(0,0,simulationDomain.zMax()), peMaterial );
if( useBox )
{
// add four more planes to obtain a closed box
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(1,0,0), Vector3<real_t>(0,0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(-1,0,0), Vector3<real_t>(simulationDomain.xMax(),0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,1,0), Vector3<real_t>(0,0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,-1,0), Vector3<real_t>(0,simulationDomain.yMax(),0), peMaterial );
}
else if ( useHopper )
{
// box bounding planes
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(1,0,0), Vector3<real_t>(0,0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(-1,0,0), Vector3<real_t>(simulationDomain.xMax(),0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,1,0), Vector3<real_t>(0,0,0), peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,-1,0), Vector3<real_t>(0,simulationDomain.yMax(),0), peMaterial );
//hopper planes
real_t xMax = simulationDomain.xMax();
real_t yMax = simulationDomain.yMax();
real_t zMax = simulationDomain.zMax();
Vector3<real_t> p1(0,0,hopperRelHeight*zMax);
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(p1[2],0,hopperRelOpening*xMax-p1[0]), p1, peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,p1[2],hopperRelOpening*yMax-p1[0]), p1, peMaterial );
Vector3<real_t> p2(xMax,yMax,hopperRelHeight*zMax);
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(-p2[2],0,-((real_t(1)-hopperRelOpening)*xMax-p2[0])), p2, peMaterial );
pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,-p2[2],-((real_t(1)-hopperRelOpening)*yMax-p2[1])), p2, peMaterial );
}
AABB sedimentGenerationDomain( real_t(0), real_t(0), real_t(0.5)*real_c(domainSize[2]), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
createSedimentLayer(numberOfSediments, sedimentGenerationDomain, diameter, sedimentDomain.zMin(), peMaterial, cr, syncCall, blocks, globalBodyStorage, bodyStorageID, gravitationalAcceleration, useEllipsoids, shortRun );
// reset timer to not cover init stats
timingTreePE.reset();
// now we can use the information about the body positions to adapt the refinement
///////////////////////////
// DYNAMIC REFINEMENT, 1 //
///////////////////////////
auto & blockforest = blocks->getBlockForest();
blockforest.recalculateBlockLevelsInRefresh( true );
blockforest.alwaysRebalanceInRefresh( true ); //load balancing every time refresh is triggered
blockforest.reevaluateMinTargetLevelsAfterForcedRefinement( false );
blockforest.allowRefreshChangingDepth( false );
blockforest.allowMultipleRefreshCycles( false ); // otherwise info collections are invalid
{
blockforest::CombinedMinTargetLevelDeterminationFunctions initialMinTargetLevelDeterminationFunctions;
blockforest::AABBRefinementSelection aabbRefinementSelection;
aabbRefinementSelection.addAABB(sedimentDomain,finestLevel );
initialMinTargetLevelDeterminationFunctions.add( aabbRefinementSelection );
// refinement along global bodies (bounding planes) to have consistent mapping (required for CLI always, or SimpleBB with non-AABB planes)
real_t blockExtension = real_c(FieldGhostLayers);
pe_coupling::amr::GlobalBodyPresenceLevelDetermination globalBodyPresenceRefinement( globalBodyStorage, finestLevel, blockExtension, pe_coupling::selectGlobalBodies );
initialMinTargetLevelDeterminationFunctions.add(globalBodyPresenceRefinement);
blockforest.setRefreshMinTargetLevelDeterminationFunction( initialMinTargetLevelDeterminationFunctions );
for ( auto refreshCycle = uint_t(0); refreshCycle < finestLevel; ++refreshCycle)
{
WALBERLA_LOG_INFO_ON_ROOT("Refreshing blockforest...")
// check refinement criteria and refine/coarsen if necessary
uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
blocks->refresh();
uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
if( stampBefore == stampAfter )
{
break;
}
WALBERLA_LOG_INFO_ON_ROOT("Recreating data structures..");
// rebuild PE data structures
pe::clearSynchronization( blockforest, bodyStorageID);
syncCall();
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
{
auto * ccd = blockIt->getData< pe::ccd::ICCD >( ccdID );
ccd->reloadBodies();
}
}
}
uint_t numberOfInitialFineBlocks = blockforest.getNumberOfBlocks(finestLevel);
mpi::allReduceInplace(numberOfInitialFineBlocks, mpi::SUM);
WALBERLA_LOG_INFO_ON_ROOT("Total number of initial fine blocks in simulation: " << numberOfInitialFineBlocks);
uint_t numberOfProcesses = uint_c(MPIManager::instance()->numProcesses());
///////////////////////
// ADD DATA TO BLOCKS //
////////////////////////
// create the lattice model
LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth, finestLevel ) );
// add PDF field
BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel,
Vector3< real_t >( real_t(0) ), real_t(1),
FieldGhostLayers, field::fzyx );
// add flag field
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers );
// add body field
BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers );
// add velocity field and utility
BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::fzyx, uint_t(2) );
using VelocityFieldWriter_T = lbm::VelocityFieldWriter<PdfField_T, VelocityField_T>;
BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldID, velocityFieldID ) );
shared_ptr<blockforest::communication::NonUniformBufferedScheme<stencil::D3Q27> > velocityCommunicationScheme = make_shared<blockforest::communication::NonUniformBufferedScheme<stencil::D3Q27> >( blocks );
velocityCommunicationScheme->addPackInfo( make_shared< field::refinement::PackInfo<VelocityField_T, stencil::D3Q27> >( velocityFieldID ) );
// add boundary handling & initialize outer domain boundaries
BlockDataID boundaryHandlingID = blocks->addBlockData( make_shared< MyBoundaryHandling >( blocks, flagFieldID, pdfFieldID, bodyFieldID ),
"boundary handling" );
// map planes into the LBM simulation -> act as no-slip boundaries
//pe_coupling::mapBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, NoSlip_Flag, pe_coupling::selectGlobalBodies );
pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_Flag, pe_coupling::selectGlobalBodies );
// map pe bodies into the LBM simulation
pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_Flag, pe_coupling::selectRegularBodies );
// force averaging functionality
shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
std::function<void(void)> storeForceTorqueInCont1 = [bodiesFTContainer1] { bodiesFTContainer1->store(); };
shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
std::function<void(void)> setForceTorqueOnBodiesFromCont2 = [bodiesFTContainer2] { bodiesFTContainer2->setOnBodies(); };
shared_ptr<pe_coupling::ForceTorqueOnBodiesScaler> forceScaler = make_shared<pe_coupling::ForceTorqueOnBodiesScaler>(blocks, bodyStorageID, real_t(0.5));
std::function<void(void)> setForceScalingFactorToOne = [forceScaler] { forceScaler->resetScalingFactor(real_t(1)); };
std::function<void(void)> setForceScalingFactorToHalf = [forceScaler] { forceScaler->resetScalingFactor(real_t(0.5)); };
if( averageForceTorqueOverTwoTimSteps ) {
bodiesFTContainer2->store();
setForceScalingFactorToOne();
}
///////////////////////////
// DYNAMIC REFINEMENT, 2 //
///////////////////////////
blockforest::CombinedMinTargetLevelDeterminationFunctions minTargetLevelDeterminationFunctions;
// add refinement criterion based on particle presence
shared_ptr<pe_coupling::InfoCollection> couplingInfoCollection = walberla::make_shared<pe_coupling::InfoCollection>();
pe_coupling::amr::BodyPresenceLevelDetermination particlePresenceRefinement( couplingInfoCollection, finestLevel );
minTargetLevelDeterminationFunctions.add( particlePresenceRefinement );
// also add (possible) refinement criteria based on fluid quantities
if( useVorticityCriterion )
{
// add refinement criterion based on vorticity magnitude
field::FlagFieldEvaluationFilter<FlagField_T> flagFieldFilter( flagFieldID, Fluid_Flag );
lbm::refinement::VorticityBasedLevelDetermination< field::FlagFieldEvaluationFilter<FlagField_T> > vorticityRefinement(
velocityFieldID, flagFieldFilter, upperFluidRefinementLimit, lowerFluidRefinementLimit, finestLevel );
minTargetLevelDeterminationFunctions.add( vorticityRefinement );
}
if( useGradientCriterion )
{
// add refinement criterion based on velocity gradient magnitude
field::FlagFieldEvaluationFilter<FlagField_T> flagFieldFilter( flagFieldID, Fluid_Flag );
VectorGradientRefinement< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> > gradientRefinement(
velocityFieldID, flagFieldFilter, upperFluidRefinementLimit, lowerFluidRefinementLimit, finestLevel );
minTargetLevelDeterminationFunctions.add( gradientRefinement );
}
// refinement along global bodies (bounding planes) to have consistent mapping (required for CLI always, or SimpleBB with non-AABB planes)
real_t blockExtension = real_c(FieldGhostLayers);
pe_coupling::amr::GlobalBodyPresenceLevelDetermination globalBodyPresenceRefinement( globalBodyStorage, finestLevel, blockExtension, pe_coupling::selectGlobalBodies );
minTargetLevelDeterminationFunctions.add(globalBodyPresenceRefinement);
blockforest.setRefreshMinTargetLevelDeterminationFunction( minTargetLevelDeterminationFunctions );
bool curveAllGather = true;
bool balanceLevelwise = true;
auto peBlockBaseWeight = real_t(1); //default value, might not be the best
shared_ptr<blockforest::InfoCollection> peInfoCollection = walberla::make_shared<blockforest::InfoCollection>();
if( loadDistributionStrategy == "Hilbert" || loadDistributionStrategy == "Morton")
{
if( loadDistributionStrategy == "Hilbert")
{
bool useHilbert = true;
blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( useHilbert, curveAllGather, balanceLevelwise ) );
}
else if (loadDistributionStrategy == "Morton" )
{
bool useHilbert = false;
blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( useHilbert, curveAllGather, balanceLevelwise ) );
}
blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
if( loadEvaluationStrategy == "Fit" )
{
if( useEllipsoids )
{
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionEllipsoids);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
} else{
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionSpheres);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
}
else if( loadEvaluationStrategy == "PE" )
{
blockforest::WeightAssignmentFunctor weightAssignmentFunctor(peInfoCollection, peBlockBaseWeight );
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
else if( loadEvaluationStrategy == "LBM" )
{
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
else
{
WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
}
}
else if( loadDistributionStrategy == "ParMetis")
{
#ifndef WALBERLA_BUILD_WITH_PARMETIS
WALBERLA_ABORT( "You are trying to use ParMetis functionality but waLBerla is not configured to use it. Set 'WALBERLA_BUILD_WITH_PARMETIS' to 'ON' in your CMake cache to build against an installed version of ParMetis!" );
#endif
uint_t ncon = 1;
if( loadEvaluationStrategy == "FitMulti")
{
ncon = 2;
}
blockforest::DynamicParMetis::Algorithm parMetisAlgorithm = blockforest::DynamicParMetis::stringToAlgorithm(parMetisAlgorithmString);
blockforest::DynamicParMetis::WeightsToUse parMetisWeightsToUse = blockforest::DynamicParMetis::WeightsToUse::PARMETIS_BOTH_WEIGHTS;
blockforest::DynamicParMetis::EdgeSource parMetisEdgeSource = blockforest::DynamicParMetis::EdgeSource::PARMETIS_EDGES_FROM_EDGE_WEIGHTS;
blockforest::DynamicParMetis dynamicParMetis(parMetisAlgorithm, parMetisWeightsToUse, parMetisEdgeSource, ncon);
dynamicParMetis.setipc2redist(parMetis_ipc2redist);
real_t loadImbalanceTolerance = (parMetisTolerance < real_t(1)) ? std::max(real_t(1.05), real_t(1) + real_t(1) / ( real_c(numberOfInitialFineBlocks) / real_c(numberOfProcesses) ) ) : parMetisTolerance;
std::vector<double> parMetisLoadImbalanceTolerance(ncon, double(loadImbalanceTolerance));
dynamicParMetis.setImbalanceTolerance(parMetisLoadImbalanceTolerance[0], 0);
WALBERLA_LOG_INFO_ON_ROOT(" - ParMetis configuration: ");
WALBERLA_LOG_INFO_ON_ROOT(" - algorithm = " << dynamicParMetis.algorithmToString() );
WALBERLA_LOG_INFO_ON_ROOT(" - weights to use = " << dynamicParMetis.weightsToUseToString() );
WALBERLA_LOG_INFO_ON_ROOT(" - edge source = " << dynamicParMetis.edgeSourceToString() );
WALBERLA_LOG_INFO_ON_ROOT(" - ncon = " << ncon );
WALBERLA_LOG_INFO_ON_ROOT(" - ipc2redist parameter = " << dynamicParMetis.getipc2redist() );
blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::DynamicParMetisBlockInfoPackUnpack());
blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::DynamicParMetisBlockInfoPackUnpack());
if( loadEvaluationStrategy == "Fit" )
{
WALBERLA_LOG_INFO_ON_ROOT(" - load imbalance tolerance = <" << parMetisLoadImbalanceTolerance[0] << ">" );
if( useEllipsoids )
{
pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionEllipsoids);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
} else{
pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionSpheres);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
}
else if( loadEvaluationStrategy == "FitMulti" )
{
double imbalanceTolerancePE = 10.;
parMetisLoadImbalanceTolerance[1] = std::min(imbalanceTolerancePE, static_cast<double>(MPIManager::instance()->numProcesses()));
WALBERLA_LOG_INFO_ON_ROOT(" - load imbalance tolerances = <" << parMetisLoadImbalanceTolerance[0] << ", " << parMetisLoadImbalanceTolerance[1] << ">" );
dynamicParMetis.setImbalanceTolerance(parMetisLoadImbalanceTolerance[1], 1);
if( useEllipsoids )
{
std::vector< std::function<real_t(const pe_coupling::BlockInfo&)> > weightEvaluationFunctions(ncon);
weightEvaluationFunctions[0] = fittedLBMWeightEvaluationFunctionEllipsoids;
weightEvaluationFunctions[1] = fittedPEWeightEvaluationFunctionEllipsoids;
pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, weightEvaluationFunctions);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
} else{
std::vector< std::function<real_t(const pe_coupling::BlockInfo&)> > weightEvaluationFunctions(ncon);
weightEvaluationFunctions[0] = fittedLBMWeightEvaluationFunctionSpheres;
weightEvaluationFunctions[1] = fittedPEWeightEvaluationFunctionSpheres;
pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, weightEvaluationFunctions);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
}
else if( loadEvaluationStrategy == "PE" )
{
blockforest::MetisAssignmentFunctor weightAssignmentFunctor(peInfoCollection, peBlockBaseWeight );
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
else if( loadEvaluationStrategy == "LBM" )
{
pe_coupling::amr::MetisAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
else
{
WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
}
blockforest.setRefreshPhantomBlockMigrationPreparationFunction( dynamicParMetis );
}
else if( loadDistributionStrategy == "Diffusive")
{
using DB_T = blockforest::DynamicDiffusionBalance< blockforest::PODPhantomWeight<real_t> >;
DB_T dynamicDiffusion(diffusionMaxIterations, diffusionFlowIterations );
dynamicDiffusion.setMode(DB_T::Mode::DIFFUSION_PUSH);
WALBERLA_LOG_INFO_ON_ROOT(" - Dynamic diffusion configuration: ");
WALBERLA_LOG_INFO_ON_ROOT(" - max iterations = " << dynamicDiffusion.getMaxIterations() );
WALBERLA_LOG_INFO_ON_ROOT(" - flow iterations = " << dynamicDiffusion.getFlowIterations());
blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
blockforest.setRefreshPhantomBlockMigrationPreparationFunction( dynamicDiffusion );
if( loadEvaluationStrategy == "Fit" )
{
if( useEllipsoids )
{
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionEllipsoids);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
} else{
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, fittedTotalWeightEvaluationFunctionSpheres);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
}
else if( loadEvaluationStrategy == "PE" )
{
blockforest::WeightAssignmentFunctor weightAssignmentFunctor(peInfoCollection, peBlockBaseWeight );
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
else if( loadEvaluationStrategy == "LBM" )
{
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
}
else
{
WALBERLA_ABORT("Invalid load evaluation strategy: " << loadEvaluationStrategy);
}
} else
{
WALBERLA_ABORT("Load distribution strategy \"" << loadDistributionStrategy << "\t not implemented! - Aborting" );
}
///////////////
// TIME LOOP //
///////////////
// create the timeloop
auto timeloop = make_shared<SweepTimeloop>( blocks->getBlockStorage(), timesteps );
if( vtkWriteFreqBo != uint_t(0) ) {
// pe bodies
if (useEllipsoids) {
auto bodyVtkOutput = make_shared<pe::EllipsoidVtkOutput>(bodyStorageID, blocks->getBlockStorage());
auto bodyVTK = vtk::createVTKOutput_PointData(bodyVtkOutput, "bodies", vtkWriteFreqBo, baseFolder);
timeloop->addFuncBeforeTimeStep(vtk::writeFiles(bodyVTK), "VTK (sediment data)");
} else {
auto bodyVtkOutput = make_shared<pe::SphereVtkOutput>(bodyStorageID, blocks->getBlockStorage());
auto bodyVTK = vtk::createVTKOutput_PointData(bodyVtkOutput, "bodies", vtkWriteFreqBo, baseFolder);
timeloop->addFuncBeforeTimeStep(vtk::writeFiles(bodyVTK), "VTK (sediment data)");
}
}
if( vtkWriteFreqFl != uint_t(0) ) {
// pdf field
auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid_field", vtkWriteFreqFl, 0, false, baseFolder);
field::FlagFieldCellFilter<FlagField_T> fluidFilter(flagFieldID);
fluidFilter.addFlag(Fluid_Flag);
pdfFieldVTK->addCellInclusionFilter(fluidFilter);
pdfFieldVTK->addCellDataWriter(
make_shared<lbm::VelocityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "VelocityFromPDF"));
pdfFieldVTK->addCellDataWriter(
make_shared<lbm::DensityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "DensityFromPDF"));
timeloop->addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
}
if( vtkWriteFreqDD != uint_t(0) ) {
auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkWriteFreqDD, baseFolder );
timeloop->addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
}
shared_ptr<WcTimingPool> timeloopTiming = make_shared<WcTimingPool>();
shared_ptr<WcTimingPool> timeloopRefinementTiming = make_shared<WcTimingPool>();
shared_ptr<WcTimingPool> timeloopRefinementTimingLevelwise = make_shared<WcTimingPool>();
auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
auto refinementTimestep = lbm::refinement::makeTimeStep< LatticeModel_T, BoundaryHandling_T >( blocks, sweep, pdfFieldID, boundaryHandlingID );
refinementTimestep->enableTiming( timeloopRefinementTiming, timeloopRefinementTimingLevelwise );
// Averaging the force/torque over two time steps is said to damp oscillations of the interaction force/torque.
// See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
if( averageForceTorqueOverTwoTimSteps ) {
// store force/torque from hydrodynamic interactions in container1
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(storeForceTorqueInCont1), "Force Storing", finestLevel);
// set force/torque from previous time step (in container2)
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(setForceTorqueOnBodiesFromCont2), "Force setting", finestLevel);
// average the force/torque by scaling it with factor 1/2 (except in first timestep and directly after refinement, there it is 1)
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(SharedFunctor<pe_coupling::ForceTorqueOnBodiesScaler>(forceScaler)), "Force averaging", finestLevel);
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(setForceScalingFactorToHalf), "Force scaling adjustment", finestLevel);
// swap containers
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::BodyContainerSwapper(bodiesFTContainer1, bodiesFTContainer2)), "Swap FT container", finestLevel);
}
Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, gravitationalForce )), "Gravitational force", finestLevel );
// add pe timesteps
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_t(1), numPeSubCycles)),
"pe Time Step", finestLevel );
// add sweep for updating the pe body mapping into the LBM simulation
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::SweepAsFunctorWrapper( pe_coupling::BodyMapping< LatticeModel_T, BoundaryHandling_T >( blocks, pdfFieldID, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, MO_Flag, FormerMO_Flag, pe_coupling::selectRegularBodies ), blocks ),
"Body Mapping", finestLevel );
// add sweep for restoring PDFs in cells previously occupied by pe bodies
using Reconstructor_T = pe_coupling::EquilibriumReconstructor<LatticeModel_T, BoundaryHandling_T>;
Reconstructor_T reconstructor( blocks, boundaryHandlingID, bodyFieldID );
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::SweepAsFunctorWrapper( pe_coupling::PDFReconstruction< LatticeModel_T, BoundaryHandling_T, Reconstructor_T > ( blocks, pdfFieldID,
boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, reconstructor, FormerMO_Flag, Fluid_Flag ), blocks ),
"PDF Restore", finestLevel );
// add LBM sweep with refinement
timeloop->addFuncBeforeTimeStep( makeSharedFunctor( refinementTimestep ), "LBM refinement time step" );
std::string loggingFileName( baseFolder + "/Logging_Ga");
loggingFileName += std::to_string(uint_c(GalileoNumber));
loggingFileName += "_lvl";
loggingFileName += std::to_string(numberOfLevels);
loggingFileName += ".txt";
if( logging )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing logging output to file \"" << loggingFileName << "\"");
}
shared_ptr< PropertyLogger > logger = walberla::make_shared< PropertyLogger >( timeloop, blocks, bodyStorageID,
loggingFileName, fileIO );
if(logging)
{
timeloop->addFuncAfterTimeStep( SharedFunctor< PropertyLogger >( logger ), "Property logger" );
}
timeloop->addFuncAfterTimeStep( RemainingTimeLogger( timeloop->getNrOfTimeSteps() ), "Remaining Time Logger" );
// add top level timing pool output
timeloop->addFuncAfterTimeStep( TimingPoolLogger( timeloopTiming, timeloop, loggingDisplayFrequency ), "Regular Timing Logger" );
// add regular refinement timing pool output
timeloop->addFuncAfterTimeStep( TimingPoolLogger( timeloopRefinementTiming, timeloop, loggingDisplayFrequency ), "Refinement Timing Logger" );
// add level wise timing pool output
//if( numberOfLevels != uint_t(1))
timeloop->addFuncAfterTimeStep( TimingPoolLogger( timeloopRefinementTimingLevelwise, timeloop, loggingDisplayFrequency ), "Refinement Levelwise Timing Logger" );
// add PE timing tree output
timeloop->addFuncAfterTimeStep( TimingTreeLogger( timingTreePE, timeloop, loggingDisplayFrequency ), "PE Timing Tree Timing Logger" );
////////////////////////
// EXECUTE SIMULATION //
////////////////////////
uint_t loadEvaluationFrequency = refinementCheckFrequency;
TimingEvaluator timingEvaluator( timeloopRefinementTimingLevelwise, timingTreePE, numberOfLevels );
// file for simulation infos
std::string infoFileName( baseFolder + "/simulation_info.txt");
WALBERLA_ROOT_SECTION()
{
std::ofstream file;
file.open( infoFileName.c_str(), std::fstream::out | std::fstream::trunc );
file << "#i\t t\t tSim\t tLB\t numProcs\t levelwise blocks (min/max/sum)\n";
file.close();
}
// process local timing measurements and predicted loads
std::string processLocalFiles(baseFolder + "/processLocalFiles");
WALBERLA_ROOT_SECTION()
{
filesystem::path tpath( processLocalFiles );
if( !filesystem::exists( tpath ) )
filesystem::create_directory( tpath );
}
std::string measurementFileProcessName(processLocalFiles + "/measurements_" + std::to_string(MPIManager::instance()->rank()) + ".txt");
{
std::ofstream file;
file.open( measurementFileProcessName.c_str(), std::fstream::out | std::fstream::trunc );
file << "#i\t t\t mTotSim\t mLB\t mLBM\t mBH\t mCoup1\t mCoup2\t mRB\t cLBM\t cRB\t numBlocks\n";
file.close();
}
std::string predictionFileProcessName(processLocalFiles + "/predictions_" + std::to_string(MPIManager::instance()->rank()) + ".txt");
{
std::ofstream file;
file.open( predictionFileProcessName.c_str(), std::fstream::out | std::fstream::trunc );
file << "#i\t t\t wlLBM\t wlBH\t wlCoup1\t wlCoup2\t wlRB\t edgecut\t numBlocks\n";
file.close();
}
std::vector<std::string> LBMTimer;
LBMTimer.emplace_back("collide");
LBMTimer.emplace_back("stream");
LBMTimer.emplace_back("stream & collide");
std::vector<std::string> bhTimer;
bhTimer.emplace_back("boundary handling");
std::vector<std::string> couplingTimer1;
couplingTimer1.emplace_back("Body Mapping");
std::vector<std::string> couplingTimer2;
couplingTimer2.emplace_back("PDF Restore");
std::vector<std::string> peTimer;
peTimer.emplace_back("Simulation Step.Collision Detection");
peTimer.emplace_back("Simulation Step.Collision Response Integration");
peTimer.emplace_back("Simulation Step.Collision Response Resolution.Collision Response Solving");
std::vector<std::string> LBMCommTimer;
LBMCommTimer.emplace_back("communication equal level [pack & send]");
LBMCommTimer.emplace_back("communication equal level [wait & unpack]");
std::vector<std::string> peCommTimer;
//Adapt if using different collision response (like DEM!)
peCommTimer.emplace_back("Simulation Step.Collision Response Resolution.Velocity Sync");
peCommTimer.emplace_back("Sync");
real_t terminationPosition = expectedSedimentedHeight;
real_t terminationVelocity = real_t(0.05) * uRef;
auto oldmTotSim = real_t(0);
auto oldmLB = real_t(0);
auto measurementFileCounter = uint_t(0);
auto predictionFileCounter = uint_t(0);
std::string loadEvaluationStep("load evaluation");
// time loop
for (uint_t i = 0; i < timesteps; ++i )
{
// evaluate measurements (note: reflect simulation behavior BEFORE the evaluation)
if( loadEvaluationFrequency > 0 && i % loadEvaluationFrequency == 0 && i > 0 && fileIO)
{
(*timeloopTiming)[loadEvaluationStep].start();
// write process local timing measurements to files (per process, per load balancing step)
{
// evaluate all required timers
uint_t evalLevel = finestLevel;
real_t mTotSim = ( (*timeloopTiming).timerExists("LBM refinement time step") ) ? real_c((*timeloopTiming)["LBM refinement time step"].total()) : real_t(0);
real_t mLB = ( (*timeloopTiming).timerExists("refinement checking") ) ? real_c((*timeloopTiming)["refinement checking"].total()) : real_t(0);
real_t mLBM = timingEvaluator.getTimings(LBMTimer, evalLevel);
real_t mBH = timingEvaluator.getTimings(bhTimer, evalLevel);
real_t mCoup1 = timingEvaluator.getTimings(couplingTimer1, evalLevel);
real_t mCoup2 = timingEvaluator.getTimings(couplingTimer2, evalLevel);
real_t mPE = timingEvaluator.getTimings(peTimer, evalLevel);
real_t cLBM = timingEvaluator.getTimings(LBMCommTimer, evalLevel);
real_t cRB = timingEvaluator.getTimings(peCommTimer, evalLevel);
auto & forest = blocks->getBlockForest();
uint_t numBlocks = forest.getNumberOfBlocks(finestLevel);
// write to process local file
std::ofstream file;
file.open( measurementFileProcessName.c_str(), std::ofstream::app );
file << measurementFileCounter << "\t " << real_c(i) / tRef << "\t"
<< mTotSim - oldmTotSim << "\t" << mLB - oldmLB << "\t" << mLBM << "\t" << mBH << "\t" << mCoup1 << "\t"
<< mCoup2 << "\t" << mPE << "\t" << cLBM << "\t" << cRB << "\t" << numBlocks << "\n";
file.close();
oldmTotSim = mTotSim;
oldmLB = mLB;
measurementFileCounter++;
// reset timers to have measurement from evaluation to evaluation point
timeloopRefinementTimingLevelwise->clear();
timingTreePE.reset();
}
// evaluate general simulation infos (on root)
{
real_t totalTimeToCurrentTimestep;
real_t totalLBTimeToCurrentTimestep;
evaluateTotalSimulationTimePassed(*timeloopTiming, totalTimeToCurrentTimestep, totalLBTimeToCurrentTimestep);
std::vector<math::DistributedSample> numberOfBlocksPerLevel(numberOfLevels);
auto & forest = blocks->getBlockForest();
for( uint_t lvl = 0; lvl < numberOfLevels; ++lvl)
{
uint_t numBlocks = forest.getNumberOfBlocks(lvl);
numberOfBlocksPerLevel[lvl].castToRealAndInsert(numBlocks);
}
for( uint_t lvl = 0; lvl < numberOfLevels; ++lvl)
{
numberOfBlocksPerLevel[lvl].mpiGatherRoot();
}
WALBERLA_ROOT_SECTION()
{
std::ofstream file;
file.open( infoFileName.c_str(), std::ofstream::app );
file << i << "\t " << real_c(i) / tRef << "\t"
<< totalTimeToCurrentTimestep << "\t " << totalLBTimeToCurrentTimestep << "\t " << numberOfProcesses << "\t ";
for( uint_t lvl = 0; lvl < numberOfLevels; ++lvl)
{
file << uint_c(numberOfBlocksPerLevel[numberOfLevels-1-lvl].min()) << "\t ";
file << uint_c(numberOfBlocksPerLevel[numberOfLevels-1-lvl].max()) << "\t ";
file << uint_c(numberOfBlocksPerLevel[numberOfLevels-1-lvl].sum()) << "\t ";
}
file << "\n";
file.close();
}
}
(*timeloopTiming)[loadEvaluationStep].end();
}
if( refinementCheckFrequency != 0 && i % refinementCheckFrequency == 0)
{
WALBERLA_LOG_INFO_ON_ROOT("Checking for refinement and load balancing...")
std::string refinementCheckStep("refinement checking");
(*timeloopTiming)[refinementCheckStep].start();
if( loadEvaluationStrategy != "LBM" ) {
// first evaluate all data that is required for the refinement checks
// update info collections for the particle presence based check and the load balancing:
auto &forest = blocks->getBlockForest();
pe_coupling::createWithNeighborhood<BoundaryHandling_T>(forest, boundaryHandlingID, bodyStorageID, ccdID,
fcdID, numPeSubCycles, *couplingInfoCollection);
pe::createWithNeighborhoodLocalShadow(forest, bodyStorageID, *peInfoCollection);
// for the fluid property based check:
if (useVorticityCriterion || useGradientCriterion) {
velocityFieldWriter();
(*velocityCommunicationScheme)();
}
WALBERLA_LOG_INFO_ON_ROOT("Refreshing blockforest...")
// check refinement criteria and refine/coarsen if necessary
uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
blocks->refresh();
uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
bool recreatingNecessary = false;
if (stampBefore != stampAfter) {
recreatingNecessary = true;
}
bool reducedRecreationFlag = mpi::allReduce(recreatingNecessary, mpi::LOGICAL_OR);
if (reducedRecreationFlag != recreatingNecessary) {
WALBERLA_LOG_INFO("Reduced recreation flag different from individual one");
}
recreatingNecessary = reducedRecreationFlag;
if (recreatingNecessary) {
WALBERLA_LOG_INFO_ON_ROOT("Recreating data structures..");
// rebuild PE data structures
pe::clearSynchronization(blockforest, bodyStorageID);
syncCall();
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
auto * ccd = blockIt->getData<pe::ccd::ICCD>(ccdID);
ccd->reloadBodies();
}
clearBoundaryHandling(forest, boundaryHandlingID);
clearBodyField(forest, bodyFieldID);
if (averageForceTorqueOverTwoTimSteps) {
// clear containers from old values
bodiesFTContainer1->clear();
bodiesFTContainer2->clear();
// initialize FT container on all blocks anew, i.e. with the currently acting force/torque, which is zero after the refinement step
bodiesFTContainer2->store();
// set force scaling factor to one after refinement since force history is not present on blocks after refinement
// thus the usual averaging of 1/2 (over two time steps) can not be carried out, i.e. it would lead to 1/2 of the acting force
// the scaling factor is thus adapted for the next timestep to 1, and then changed back to 1/2 (in the timeloop)
setForceScalingFactorToOne();
}
recreateBoundaryHandling(forest, boundaryHandlingID);
// re-set the no-slip flags along the walls
pe_coupling::mapMovingBodies<BoundaryHandling_T>(*blocks, boundaryHandlingID, bodyStorageID,
*globalBodyStorage, bodyFieldID, MO_Flag,
pe_coupling::selectGlobalBodies);
// re-map the body into the domain (initializing the bodyField as well)
pe_coupling::mapMovingBodies<BoundaryHandling_T>(*blocks, boundaryHandlingID, bodyStorageID,
*globalBodyStorage, bodyFieldID, MO_Flag,
pe_coupling::selectRegularBodies);
}
}
(*timeloopTiming)[refinementCheckStep].end();
}
// evaluate predictions (note: reflect the predictions for all upcoming simulations, thus the corresponding measurements have to be taken afterwards)
if( loadEvaluationFrequency > 0 && i % loadEvaluationFrequency == 0 && fileIO)
{
(*timeloopTiming)[loadEvaluationStep].start();
// write process local load predictions to files (per process, per load balancing step)
{
auto wlLBM = real_t(0);
auto wlBH = real_t(0);
auto wlCoup1 = real_t(0);
auto wlCoup2 = real_t(0);
auto wlRB = real_t(0);
auto & forest = blocks->getBlockForest();
pe_coupling::createWithNeighborhood<BoundaryHandling_T>(forest, boundaryHandlingID, bodyStorageID, ccdID, fcdID, numPeSubCycles, *couplingInfoCollection);
for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt ) {
auto * block = static_cast<blockforest::Block *> (&(*blockIt));
const auto &blockID = block->getId();
auto infoIt = couplingInfoCollection->find(blockID);
auto blockInfo = infoIt->second;
if( useEllipsoids )
{
WALBERLA_ABORT("Not yet implemented!");
}
else
{
wlLBM += fittedLBMWeightEvaluationFunctionSpheres(blockInfo);
wlBH += fittedBHWeightEvaluationFunctionSpheres(blockInfo);
wlCoup1 += fittedCoupling1WeightEvaluationFunctionSpheres(blockInfo);
wlCoup2 += fittedCoupling2WeightEvaluationFunctionSpheres(blockInfo);
wlRB += fittedPEWeightEvaluationFunctionSpheres(blockInfo);
}
}
// note: we count the edge weight doubled here in total (to and from the other process). ParMetis only counts one direction.
uint_t edgecut = evaluateEdgeCut(forest);
uint_t numBlocks = forest.getNumberOfBlocks(finestLevel);
std::ofstream file;
file.open( predictionFileProcessName.c_str(), std::ofstream::app );
file << predictionFileCounter << "\t " << real_c(i) / tRef << "\t"
<< wlLBM << "\t" << wlBH << "\t" << wlCoup1 << "\t" << wlCoup2 << "\t" << wlRB << "\t"
<< edgecut << "\t" << numBlocks << "\n";
file.close();
predictionFileCounter++;;
}
(*timeloopTiming)[loadEvaluationStep].end();
}
// perform a single simulation step
timeloop->singleStep( *timeloopTiming );
if( logging )
{
real_t curMeanPos = logger->getMeanPosition();
real_t curMeanVel = logger->getMeanVelocity();
if( curMeanPos <= terminationPosition && std::fabs(curMeanVel) < terminationVelocity )
{
WALBERLA_LOG_INFO_ON_ROOT("Sediments passed terminal mean position " << terminationPosition << " and reached velocity " << curMeanVel << " - terminating simulation!");
break;
}
}
}
(*timeloopTiming).logResultOnRoot();
return EXIT_SUCCESS;
}
} // namespace amr_sediment_settling
/// Entry point: forwards the command line arguments to the application's
/// main routine and propagates its exit status to the shell.
/// (The original discarded the inner return value, so the process always
/// exited with 0 even when amr_sediment_settling::main reported a failure.)
int main( int argc, char **argv ){
   return amr_sediment_settling::main(argc, argv);
}
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AMRSettlingSphere.cpp
//! \ingroup pe_coupling
//! \author Christoph Rettinger <christoph.rettinger@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/loadbalancing/all.h"
#include "boundary/all.h"
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/SharedFunctor.h"
#include "core/debug/Debug.h"
#include "core/debug/TestSubsystem.h"
#include "core/math/all.h"
#include "core/timing/RemainingTimeLogger.h"
#include "domain_decomposition/SharedSweep.h"
#include "domain_decomposition/BlockSweepWrapper.h"
#include "field/AddToStorage.h"
#include "field/StabilityChecker.h"
#include "field/communication/PackInfo.h"
#include "lbm/boundary/all.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/field/AddToStorage.h"
#include "lbm/field/PdfField.h"
#include "lbm/field/VelocityFieldWriter.h"
#include "lbm/lattice_model/D3Q19.h"
#include "lbm/refinement/all.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "lbm/sweeps/SweepWrappers.h"
#include "pe/amr/InfoCollection.h"
#include "pe/basic.h"
#include "pe/vtk/BodyVtkOutput.h"
#include "pe/vtk/SphereVtkOutput.h"
#include "pe/cr/ICR.h"
#include "pe/Types.h"
#include "pe/synchronization/ClearSynchronization.h"
#include "pe_coupling/amr/all.h"
#include "pe_coupling/mapping/all.h"
#include "pe_coupling/momentum_exchange_method/all.h"
#include "pe_coupling/utility/all.h"
#include "timeloop/SweepTimeloop.h"
#include "vtk/all.h"
#include "field/vtk/all.h"
#include "lbm/vtk/all.h"
namespace amr_settling_sphere
{
///////////
// USING //
///////////
using namespace walberla;
using walberla::uint_t;
//////////////
// TYPEDEFS //
//////////////
// PDF field, flag field & body field
using LatticeModel_T = lbm::D3Q19<lbm::collision_model::TRT, false>;
using Stencil_T = LatticeModel_T::Stencil;
using PdfField_T = lbm::PdfField<LatticeModel_T>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
using BodyField_T = GhostLayerField<pe::BodyID, 1>;
using VelocityField_T = GhostLayerField<Vector3<real_t>, 1>;
const uint_t FieldGhostLayers = 4;
// boundary handling
using NoSlip_T = lbm::NoSlip<LatticeModel_T, flag_t>;
using MO_T = pe_coupling::CurvedLinear<LatticeModel_T, FlagField_T>;
using BoundaryHandling_T = BoundaryHandling<FlagField_T, Stencil_T, NoSlip_T, MO_T>;
using BodyTypeTuple = std::tuple<pe::Sphere, pe::Ellipsoid, pe::Plane>;
///////////
// FLAGS //
///////////
const FlagUID Fluid_Flag( "fluid" );
const FlagUID NoSlip_Flag( "no slip" );
const FlagUID MO_Flag( "moving obstacle" );
const FlagUID FormerMO_Flag( "former moving obstacle" );
//////////////////////////////////////
// DYNAMIC REFINEMENT FUNCTIONALITY //
//////////////////////////////////////
/*
* Refinement check based on gradient magnitude
* If gradient magnitude is below lowerLimit in all cells of a block, that block could be coarsened.
* If the gradient value is above the upperLimit for at least one cell, that block gets marked for refinement.
* Else, the block remains on the current level.
*/
template< typename LatticeModel_T, typename Filter_T >
class VectorGradientRefinement
{
public:
using VectorField_T = GhostLayerField<Vector3<real_t>, 1>;
using Stencil_T = typename LatticeModel_T::Stencil;
// fieldID: block data ID of the vector (velocity) field to evaluate
// filter: cell filter used to mask out non-fluid cells
// upperLimit/lowerLimit: gradient-norm thresholds for refinement/coarsening
// maxLevel: finest level a block may be refined to
VectorGradientRefinement( const ConstBlockDataID & fieldID, const Filter_T & filter,
const real_t upperLimit, const real_t lowerLimit, const uint_t maxLevel ) :
fieldID_( fieldID ), filter_( filter ),
upperLimit_( upperLimit ), lowerLimit_( lowerLimit ), maxLevel_( maxLevel )
{}
// refinement-check callback: writes the desired target level per block into
// minTargetLevels (definition follows below the class)
void operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
std::vector< const Block * > & blocksAlreadyMarkedForRefinement,
const BlockForest & forest );
private:
ConstBlockDataID fieldID_;
Filter_T filter_;
real_t upperLimit_;
real_t lowerLimit_;
uint_t maxLevel_;
}; // class VectorGradientRefinement
/*
 * Determines the desired target level for each block:
 *  - refine (level+1) if the maximum norm of the velocity gradient exceeds
 *    upperLimit_ in at least one cell and the block is below maxLevel_,
 *  - coarsen (level-1) if the norm stays below lowerLimit_ in ALL cells,
 *  - otherwise keep the block's current level.
 */
template< typename LatticeModel_T, typename Filter_T >
void VectorGradientRefinement< LatticeModel_T, Filter_T >::operator()( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
std::vector< const Block * > &, const BlockForest & )
{
for(auto & minTargetLevel : minTargetLevels)
{
const Block * const block = minTargetLevel.first;
const uint_t currentLevelOfBlock = block->getLevel();
const VectorField_T * uField = block->template getData< VectorField_T >( fieldID_ );
// no velocity field present on this block -> request the coarsest level
if( uField == nullptr )
{
minTargetLevel.second = uint_t(0);
continue;
}
Matrix3<real_t> uGradient( real_t(0) );
bool refine( false );
bool coarsen( true );
// bind the filter to this block before evaluating it per cell below
filter_( *block );
WALBERLA_FOR_ALL_CELLS_XYZ( uField,
std::vector< Vector3<real_t> > uValues( Stencil_T::Size, Vector3<real_t>(real_t(0)) );
Vector3<real_t> uInCenterCell = uField->get( x,y,z );
// gather the velocity in all stencil neighbors of the current cell
for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
{
// check if boundary treatment is necessary
if( filter_( x+dir.cx(),y+dir.cy(),z+dir.cz() ) )
{
// copy from center cell
uValues[ *dir ] = uInCenterCell;
} else {
uValues[ *dir ] = uField->get( x+dir.cx(),y+dir.cy(),z+dir.cz() );
}
}
// obtain the matrix grad(u) with the help of the gradient formula from
// See: Ramadugu et al - Lattice differential operators for computational physics (2013)
// with T = c_s**2
const real_t inv_c_s_sqr = real_t(3);
uGradient = real_t(0);
for( auto dir = Stencil_T::beginNoCenter(); dir != Stencil_T::end(); ++dir)
{
real_t cx = real_c(dir.cx());
real_t cy = real_c(dir.cy());
real_t cz = real_c(dir.cz());
// grad(ux)
real_t ux = uValues[ *dir ][0];
uGradient[ 0 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * ux;
uGradient[ 3 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * ux;
uGradient[ 6 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * ux;
// grad(uy)
real_t uy = uValues[ *dir ][1];
uGradient[ 1 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * uy;
uGradient[ 4 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * uy;
uGradient[ 7 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * uy;
// grad(uz)
real_t uz = uValues[ *dir ][2];
uGradient[ 2 ] += LatticeModel_T::w[ dir.toIdx() ] * cx * uz;
uGradient[ 5 ] += LatticeModel_T::w[ dir.toIdx() ] * cy * uz;
uGradient[ 8 ] += LatticeModel_T::w[ dir.toIdx() ] * cz * uz;
}
uGradient *= inv_c_s_sqr;
real_t norm( real_t(0) );
//compute maximums norm of 3x3 matrix
for( uint_t i = uint_t(0); i < uint_t(3*3); ++i )
norm = std::max(norm, std::fabs(uGradient[i]));
// a single cell above lowerLimit_ forbids coarsening; above upperLimit_ it
// additionally requests refinement of the whole block
if( norm > lowerLimit_ )
{
coarsen = false;
if( norm > upperLimit_ )
refine = true;
}
)
if( refine && currentLevelOfBlock < maxLevel_ )
{
WALBERLA_ASSERT( !coarsen );
minTargetLevel.second = currentLevelOfBlock + uint_t(1);
}
if( coarsen && currentLevelOfBlock > uint_t(0) )
{
WALBERLA_ASSERT( !refine );
minTargetLevel.second = currentLevelOfBlock - uint_t(1);
}
}
}
struct TimingPoolLogger
{
TimingPoolLogger( const shared_ptr<WcTimingPool> & timingPool, const shared_ptr<SweepTimeloop> & timeloop, const uint_t interval )
: timingPool_( timingPool ), timeloop_( timeloop ), interval_( interval )
{
}
void operator()()
{
if( interval_ > uint_t(0) && timeloop_->getCurrentTimeStep() % interval_ == uint_t(0) )
{
timingPool_->logResultOnRoot();
}
}
private:
shared_ptr<WcTimingPool> timingPool_;
shared_ptr<SweepTimeloop> timeloop_;
uint_t interval_;
};
struct TimingTreeLogger
{
TimingTreeLogger( const shared_ptr<WcTimingTree> & timingTree, const shared_ptr<SweepTimeloop> & timeloop, const uint_t interval )
: timingTree_( timingTree ), timeloop_( timeloop ), interval_( interval )
{
}
void operator()()
{
if( interval_ > uint_t(0) && timeloop_->getCurrentTimeStep() % interval_ == uint_t(0) )
{
timingTree_->synchronize();
auto reducedTimingTree = timingTree_->getReduced();
WALBERLA_LOG_INFO_ON_ROOT( reducedTimingTree );
}
}
private:
shared_ptr<WcTimingTree> timingTree_;
shared_ptr<SweepTimeloop> timeloop_;
uint_t interval_;
};
/////////////////////
// BLOCK STRUCTURE //
/////////////////////
// Marks every block for refinement whose ghost-layer-extended AABB intersects
// the given refinement box and which is not yet on the finest level.
static void refinementSelection( SetupBlockForest& forest, uint_t levels, const AABB & refinementBox )
{
   const real_t finestDx = real_t(1); // cell size on the finest level
   const uint_t finestLevel = levels - uint_t(1);
   for( auto & block : forest )
   {
      const uint_t level = block.getLevel();
      // cell size on this block's level (coarser levels have larger cells)
      const real_t dxOnLevel = finestDx * real_c( uint_t(1) << ( finestLevel - level ) );
      // grow the block's AABB by the width of its ghost layers
      const AABB extendedAABB = block.getAABB().getExtended( dxOnLevel * real_c(FieldGhostLayers) );
      if( level < finestLevel && extendedAABB.intersects( refinementBox ) )
         block.setMarker( true );
   }
}
// Assigns each block a workload proportional to 2^level (finer blocks are
// processed more often per coarse step) and a unit memory requirement.
static void workloadAndMemoryAssignment( SetupBlockForest& forest )
{
   for( auto & block : forest )
   {
      const auto workload = numeric_cast< workload_t >( uint_t(1) << block.getLevel() );
      block.setWorkload( workload );
      block.setMemory( numeric_cast< memory_t >(1) );
   }
}
/*
 * Creates the domain decomposition for the simulation.
 * The domain is partitioned into blocks of blockSizeInCells cells (on the
 * finest level); a refinement region is placed around the initial sphere
 * position (a z-spanning column when useStaticRefinement is set, otherwise a
 * box tightly around the sphere), blocks are statically load balanced onto
 * the available processes, and the resulting StructuredBlockForest (with
 * cell bounding boxes) is returned.
 */
static shared_ptr< StructuredBlockForest > createBlockStructure( const AABB & domainAABB, Vector3<uint_t> blockSizeInCells,
uint_t numberOfLevels, real_t diameter, Vector3<real_t> spherePosition,
bool useStaticRefinement,
bool keepGlobalBlockInformation = false )
{
SetupBlockForest sforest;
// number of blocks per direction if the whole domain were on the finest level
Vector3<uint_t> numberOfFineBlocksPerDirection( uint_c(domainAABB.size(0)) / blockSizeInCells[0],
uint_c(domainAABB.size(1)) / blockSizeInCells[1],
uint_c(domainAABB.size(2)) / blockSizeInCells[2] );
// the domain extent must be an integer multiple of the block size
for(uint_t i = 0; i < 3; ++i )
{
WALBERLA_CHECK_EQUAL( numberOfFineBlocksPerDirection[i] * blockSizeInCells[i], uint_c(domainAABB.size(i)),
"Domain can not be decomposed in direction " << i << " into fine blocks of size " << blockSizeInCells[i] );
}
// the coarse grid must contain an integer number of blocks as well
uint_t levelScalingFactor = ( uint_t(1) << ( numberOfLevels - uint_t(1) ) );
Vector3<uint_t> numberOfCoarseBlocksPerDirection( numberOfFineBlocksPerDirection / levelScalingFactor );
for(uint_t i = 0; i < 3; ++i )
{
WALBERLA_CHECK_EQUAL(numberOfCoarseBlocksPerDirection[i] * levelScalingFactor, numberOfFineBlocksPerDirection[i],
"Domain can not be refined in direction " << i << " according to the specified number of levels!" );
}
// region that is refined to the finest level around the sphere
AABB refinementBox;
if(useStaticRefinement)
{
// column spanning the whole z extent, so the refined region never has to move
refinementBox = AABB( std::floor(spherePosition[0] - real_t(0.5) * diameter),
std::floor(spherePosition[1] - real_t(0.5) * diameter),
domainAABB.zMin(),
std::ceil( spherePosition[0] + real_t(0.5) * diameter),
std::ceil( spherePosition[1] + real_t(0.5) * diameter),
domainAABB.zMax() );
}else{
// box tightly around the initial sphere position
refinementBox = AABB( std::floor(spherePosition[0] - real_t(0.5) * diameter),
std::floor(spherePosition[1] - real_t(0.5) * diameter),
std::floor(spherePosition[2] - real_t(0.5) * diameter),
std::ceil( spherePosition[0] + real_t(0.5) * diameter),
std::ceil( spherePosition[1] + real_t(0.5) * diameter),
std::ceil( spherePosition[2] + real_t(0.5) * diameter) );
}
WALBERLA_LOG_INFO_ON_ROOT(" - refinement box: " << refinementBox);
sforest.addRefinementSelectionFunction( [numberOfLevels, refinementBox](auto && PH1) { refinementSelection(std::forward<decltype(PH1)>(PH1), numberOfLevels, refinementBox); } );
sforest.addWorkloadMemorySUIDAssignmentFunction( workloadAndMemoryAssignment );
// initialize with full periodicity (x, y, z)
sforest.init( domainAABB, numberOfCoarseBlocksPerDirection[0], numberOfCoarseBlocksPerDirection[1], numberOfCoarseBlocksPerDirection[2], true, true, true );
// calculate process distribution
const memory_t memoryLimit = math::Limits< memory_t >::inf();
sforest.balanceLoad( blockforest::StaticLevelwiseCurveBalance(true), uint_c( MPIManager::instance()->numProcesses() ), real_t(0), memoryLimit, true );
WALBERLA_LOG_INFO_ON_ROOT( sforest );
MPIManager::instance()->useWorldComm();
// create StructuredBlockForest (encapsulates a newly created BlockForest)
shared_ptr< StructuredBlockForest > sbf =
make_shared< StructuredBlockForest >( make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, keepGlobalBlockInformation ),
blockSizeInCells[0], blockSizeInCells[1], blockSizeInCells[2]);
sbf->createCellBoundingBoxes();
return sbf;
}
/////////////////////////////////////
// BOUNDARY HANDLING CUSTOMIZATION //
/////////////////////////////////////
/*
 * Block data handling that (re)creates the boundary handling on every block,
 * including newly created blocks after refinement/coarsening
 * (AlwaysInitializeBlockDataHandling). Stores the IDs of the flag, PDF and
 * body fields required to construct the boundary conditions.
 */
class MyBoundaryHandling : public blockforest::AlwaysInitializeBlockDataHandling< BoundaryHandling_T >
{
public:
MyBoundaryHandling( const weak_ptr< StructuredBlockStorage > & blocks,
const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID, const BlockDataID & bodyFieldID ) :
blocks_( blocks ), flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), bodyFieldID_ ( bodyFieldID )
{}
// creates the boundary handling instance for the given block (defined below)
BoundaryHandling_T * initialize( IBlock * const block ) override;
private:
// non-owning reference to the block storage; locked on use in initialize()
weak_ptr< StructuredBlockStorage > blocks_;
const BlockDataID flagFieldID_;
const BlockDataID pdfFieldID_;
const BlockDataID bodyFieldID_;
}; // class MyBoundaryHandling
// Creates and initializes the boundary handling for a single block: fetches
// the flag, PDF and body fields, registers the fluid flag if necessary, and
// sets up the no-slip and moving-obstacle (curved linear) boundary conditions.
BoundaryHandling_T * MyBoundaryHandling::initialize( IBlock * const block )
{
   WALBERLA_ASSERT_NOT_NULLPTR( block );
   auto * flags  = block->getData< FlagField_T >( flagFieldID_ );
   auto * pdfs   = block->getData< PdfField_T > ( pdfFieldID_ );
   auto * bodies = block->getData< BodyField_T >( bodyFieldID_ );
   // re-use the fluid flag if it was registered before, otherwise register it
   const auto fluid = flags->flagExists( Fluid_Flag ) ? flags->getFlag( Fluid_Flag ) : flags->registerFlag( Fluid_Flag );
   auto storage = blocks_.lock();
   WALBERLA_CHECK_NOT_NULLPTR( storage );
   auto * handling = new BoundaryHandling_T( "moving obstacle boundary handling", flags, fluid,
                                             NoSlip_T( "NoSlip", NoSlip_Flag, pdfs ),
                                             MO_T( "MO", MO_Flag, pdfs, flags, bodies, fluid, *storage, *block ) );
   // initially mark the whole block (incl. ghost layers) as fluid domain
   handling->fillWithDomain( FieldGhostLayers );
   return handling;
}
//*******************************************************************************************************************
//*******************************************************************************************************************
/*!\brief Evaluating the position and velocity of the sphere
*
*/
//*******************************************************************************************************************
class SpherePropertyLogger
{
public:
// timeloop: used to obtain the current (coarse) time step number
// blocks/bodyStorageID: where to find the local rigid bodies
// fileName/fileIO: output file for per-step sphere data (written on root)
// xRef/tRef: reference length and time used to non-dimensionalize the output
// lbmTimeStepsPerTimeLoopIteration: converts timeloop steps to LBM steps
// diameter/viscosity: used to compute the instantaneous Reynolds number
SpherePropertyLogger( const shared_ptr<SweepTimeloop> & timeloop, const shared_ptr< StructuredBlockStorage > & blocks,
const BlockDataID & bodyStorageID, const std::string & fileName, bool fileIO,
real_t xRef, real_t tRef, uint_t lbmTimeStepsPerTimeLoopIteration,
real_t diameter, real_t viscosity) :
timeloop_( timeloop ), blocks_( blocks ), bodyStorageID_( bodyStorageID ), fileName_( fileName ), fileIO_(fileIO),
xRef_( xRef ), tRef_( tRef ), lbmTimeStepsPerTimeLoopIteration_( lbmTimeStepsPerTimeLoopIteration ),
diameter_( diameter ), viscosity_( viscosity ),
position_( real_t(0) ), maxVelocity_( real_t(0) )
{
// write the file header once at construction time
if ( fileIO_ )
{
WALBERLA_ROOT_SECTION()
{
std::ofstream file;
file.open( fileName_.c_str() );
file << "#\t t\t posX\t posY\t posZ\t velX\t velY\t velZ\t posX*\t posY*\t posZ*\t velX*\t velY*\t velZ*\t Re\n";
file.close();
}
}
}
// Gathers the sphere's position and translational velocity (summed over all
// processes; only the process owning the sphere contributes non-zero values),
// updates the tracked quantities and appends a line to the output file.
void operator()()
{
const uint_t timestep (timeloop_->getCurrentTimeStep() * lbmTimeStepsPerTimeLoopIteration_ );
Vector3<real_t> pos(real_t(0));
Vector3<real_t> transVel(real_t(0));
for( auto blockIt = blocks_->begin(); blockIt != blocks_->end(); ++blockIt )
{
for( auto bodyIt = pe::LocalBodyIterator::begin( *blockIt, bodyStorageID_); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
{
pos = bodyIt->getPosition();
transVel = bodyIt->getLinearVel();
}
}
// SUM works because all processes not owning the body contribute zeros
WALBERLA_MPI_SECTION()
{
mpi::allReduceInplace( pos, mpi::SUM );
mpi::allReduceInplace( transVel, mpi::SUM );
}
position_ = pos[2];
// settling velocity is negative in z; track the largest settling speed
maxVelocity_ = std::max(maxVelocity_, -transVel[2]);
if( fileIO_ )
writeToFile( timestep, pos, transVel);
}
// current z position of the sphere (valid after the last call to operator())
real_t getPosition() const
{
return position_;
}
// largest settling speed (-velZ) observed so far
real_t getMaxVelocity() const
{
return maxVelocity_;
}
private:
// Appends one line of raw, non-dimensionalized and Reynolds-number data to
// the output file; executed on the root process only.
void writeToFile( uint_t timestep, const Vector3<real_t> & position, const Vector3<real_t> & velocity )
{
WALBERLA_ROOT_SECTION()
{
std::ofstream file;
file.open( fileName_.c_str(), std::ofstream::app );
auto scaledPosition = position / xRef_;
auto scaledVelocity = velocity / (xRef_ / tRef_);
real_t Re = std::fabs(velocity[2]) * diameter_ / viscosity_;
// NOTE(review): the back-to-back "\t" below emits an empty column that the
// header line does not declare - confirm whether downstream parsers rely on it
file << timestep << "\t" << real_c(timestep) / tRef_ << "\t"
<< "\t" << position[0] << "\t" << position[1] << "\t" << position[2]
<< "\t" << velocity[0] << "\t" << velocity[1] << "\t" << velocity[2]
<< "\t" << scaledPosition[0] << "\t" << scaledPosition[1] << "\t" << scaledPosition[2]
<< "\t" << scaledVelocity[0] << "\t" << scaledVelocity[1] << "\t" << scaledVelocity[2]
<< "\t" << Re << "\n";
file.close();
}
}
shared_ptr<SweepTimeloop> timeloop_;
shared_ptr< StructuredBlockStorage > blocks_;
const BlockDataID bodyStorageID_;
std::string fileName_;
bool fileIO_;
real_t xRef_, tRef_;
uint_t lbmTimeStepsPerTimeLoopIteration_;
real_t diameter_, viscosity_;
real_t position_;
real_t maxVelocity_;
};
// Removes all boundary/domain flags (incl. ghost layers) from the boundary
// handling on every local block, e.g. before the grid structure changes.
void clearBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID )
{
   for( auto & block : forest )
   {
      auto * handling = block.getData<BoundaryHandling_T>(boundaryHandlingID);
      handling->clear( FieldGhostLayers );
   }
}
// Resets the body field (incl. ghost layers) on every local block by setting
// every cell's body pointer to nullptr, i.e. "no body overlaps this cell".
void clearBodyField( BlockForest & forest, const BlockDataID & bodyFieldID )
{
   for( auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt )
   {
      BodyField_T * bodyField = blockIt->getData<BodyField_T>(bodyFieldID);
      // use nullptr instead of the C macro NULL for type safety; this matches
      // the rest of the file (e.g. the nullptr check in the refinement criterion)
      bodyField->setWithGhostLayer( nullptr );
   }
}
// Re-initializes the boundary handling on every local block by marking the
// whole block (incl. ghost layers) as fluid domain again.
void recreateBoundaryHandling( BlockForest & forest, const BlockDataID & boundaryHandlingID )
{
   for( auto & block : forest )
   {
      auto * handling = block.getData<BoundaryHandling_T>(boundaryHandlingID);
      handling->fillWithDomain( FieldGhostLayers );
   }
}
/*
 * Evaluates and logs the per-level load imbalance across all processes.
 * Sums up the runtimes of the relevant workload timers per level (LBM timers
 * on every level, coupling and PE timers on the finest level only), reduces
 * min/max/avg over all processes and prints the max/min and max/avg ratios.
 */
class LoadImbalanceEvaluator
{
public:
LoadImbalanceEvaluator( const shared_ptr<WcTimingPool> & levelwiseTimingPool, const shared_ptr<WcTimingTree> & peTimingTree, uint_t numberOfLevels)
: levelwiseTimingPool_( levelwiseTimingPool ), peTimingTree_( peTimingTree ), numberOfLevels_( numberOfLevels )
{
// workload timer that are present on each level
timerOnEachLevel_.emplace_back("collide");
timerOnEachLevel_.emplace_back("boundary handling");
timerOnEachLevel_.emplace_back("linear explosion");
timerOnEachLevel_.emplace_back("stream");
// workload timer only present on finest level
timerOnFinestLevel_.emplace_back("Body Mapping");
timerOnFinestLevel_.emplace_back("PDF Restore");
timerOnFinestLevel_.emplace_back("stream & collide");
//timerOnFinestLevel_.push_back("pe Time Step");
// workload timer used in PE
timerInPE_.emplace_back("CCD");
timerInPE_.emplace_back("FCD");
timerInPE_.emplace_back("Integration");
}
// Reduces the accumulated timer totals over all processes and logs the
// imbalance per level on the root process.
void operator()()
{
std::vector<real_t> minTimingsPerLevel(numberOfLevels_);
std::vector<real_t> maxTimingsPerLevel(numberOfLevels_);
std::vector<real_t> avgTimingsPerLevel(numberOfLevels_);
uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
// make sure every process has the same set of timers before reducing
levelwiseTimingPool_->unifyRegisteredTimersAcrossProcesses();
peTimingTree_->synchronize();
for( uint_t level = 0; level < numberOfLevels_; ++level)
{
real_t timeOnLevelProcessLocal = real_t(0);
// timers are registered per level with a " (<level>)" suffix
for(auto const & timerIt : timerOnEachLevel_)
{
std::string timerName = timerIt + " (" + std::to_string(level) + ")";
timeOnLevelProcessLocal += real_c((*levelwiseTimingPool_)[timerName].total());
}
if( level == numberOfLevels_- 1)
{
// evaluate more timers on finest level
for(auto const & timerIt : timerOnFinestLevel_)
{
std::string timerName = timerIt + " (" + std::to_string(level) + ")";
timeOnLevelProcessLocal += real_c((*levelwiseTimingPool_)[timerName].total());
}
for(auto const & timerName : timerInPE_)
{
timeOnLevelProcessLocal += real_c((*peTimingTree_)[timerName].total());
}
}
minTimingsPerLevel[level] = mpi::reduce( timeOnLevelProcessLocal, mpi::MIN );
maxTimingsPerLevel[level] = mpi::reduce( timeOnLevelProcessLocal, mpi::MAX );
avgTimingsPerLevel[level] = mpi::reduce( timeOnLevelProcessLocal, mpi::SUM ) / real_c(numProcesses);
}
for( uint_t level = 0; level < numberOfLevels_; ++level)
{
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << ": min = " << minTimingsPerLevel[level] << ", max = " << maxTimingsPerLevel[level] << ", avg = " << avgTimingsPerLevel[level]);
// NOTE(review): if a process spent no time on a level, min is 0 and the
// max/min ratio below is inf - harmless for logging, but worth confirming
WALBERLA_LOG_INFO_ON_ROOT("==> Imbalance(max/min) = " << maxTimingsPerLevel[level] / minTimingsPerLevel[level] << ", imbalance(max/avg) = " << maxTimingsPerLevel[level]/avgTimingsPerLevel[level]);
}
}
private:
shared_ptr<WcTimingPool> levelwiseTimingPool_;
shared_ptr<WcTimingTree> peTimingTree_;
uint_t numberOfLevels_;
std::vector<std::string> timerOnEachLevel_;
std::vector<std::string> timerOnFinestLevel_;
std::vector<std::string> timerInPE_;
};
class TimingResetter
{
public:
TimingResetter(const shared_ptr<WcTimingPool> & levelwiseTimingPool, const shared_ptr<WcTimingTree> & peTimingTree )
: levelwiseTimingPool_( levelwiseTimingPool ), peTimingTree_( peTimingTree )
{}
void operator()()
{
levelwiseTimingPool_->clear();
peTimingTree_->reset();
}
private:
shared_ptr<WcTimingPool> levelwiseTimingPool_;
shared_ptr<WcTimingTree> peTimingTree_;
};
//////////
// MAIN //
//////////
//*******************************************************************************************************************
/*!\brief Setup that simulates the settling of a sphere inside a rectangular column filled with viscous fluid
*
* The rectangular column is fully periodic and of size [xSizeNonDim x ySizeNonDim x zSizeNonDim] * diameter.
*
* The settling behavior can be modified via the Galileo number and the density ratio.
* Numerical parameters are the diameter (i.e. resolution) and the characteristic settling velocity ug.
*
* If numLevels = 0, then a uniform grid is used.
* Else, adaptive grid refinement according to the specified criteria is applied.
*
*/
//*******************************************************************************************************************
int main( int argc, char **argv )
{
debug::enterTestMode();
mpi::Environment env( argc, argv );
///////////////////
// Customization //
///////////////////
// simulation control
bool shortrun = false;
bool funcTest = false;
bool fileIO = true;
uint_t vtkWriteFreqDD = 0; //domain decomposition
uint_t vtkWriteFreqBo = 0; //bodies
uint_t vtkWriteFreqFl = 0; //fluid
uint_t vtkWriteFreq = 0; //general
bool vtkWriteFluidSlice = false;
std::string baseFolder = "vtk_out_AMRSettlingSphere"; // folder for vtk and file output
// physical setup
real_t GalileoNumber = real_t(200);
real_t densityRatio = real_t(1.1);
real_t diameter = real_t(20);
real_t ug = real_t(0.03); // characteristic settling velocity
uint_t xSizeNonDim = uint_t(16);
uint_t ySizeNonDim = uint_t(16);
uint_t zSizeNonDim = uint_t(32);
//numerical parameters
bool averageForceTorqueOverTwoTimSteps = true;
uint_t numberOfLevels = uint_t(3);
uint_t refinementCheckFrequency = uint_t(0);
bool useStaticRefinement = false;
real_t lowerFluidRefinementLimit = real_t(0);
real_t upperFluidRefinementLimit = std::numeric_limits<real_t>::infinity();
bool useVorticityCriterion = false;
bool useGradientCriterion = false;
// initialize the horizontal sphere velocity to avoid ambiguity due to physical instability that determines the horizontal movement direction
bool initializeSphereVelocity = false;
// add small offset to initial sphere position to break numerical symmetry of setup
bool offsetSphere = false;
// evaluate and print current imbalances in the workload
bool evaluateLoadImbalance = false;
for( int i = 1; i < argc; ++i )
{
if( std::strcmp( argv[i], "--shortrun" ) == 0 ) { shortrun = true; continue; }
if( std::strcmp( argv[i], "--funcTest" ) == 0 ) { funcTest = true; continue; }
if( std::strcmp( argv[i], "--fileIO" ) == 0 ) { fileIO = true; continue; }
if( std::strcmp( argv[i], "--vtkWriteFreqDD" ) == 0 ) { vtkWriteFreqDD = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--vtkWriteFreqBo" ) == 0 ) { vtkWriteFreqBo = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--vtkWriteFreqFl" ) == 0 ) { vtkWriteFreqFl = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--vtkWriteFluidSlice" ) == 0 ) { vtkWriteFluidSlice = true; continue; }
if( std::strcmp( argv[i], "--vtkWriteFreq" ) == 0 ) { vtkWriteFreq = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--densityRatio" ) == 0 ) { densityRatio = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--Ga" ) == 0 ) { GalileoNumber = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--ug" ) == 0 ) { ug = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--diameter" ) == 0 ) { diameter = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--xSizeNonDim" ) == 0 ) { xSizeNonDim = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--ySizeNonDim" ) == 0 ) { ySizeNonDim = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--zSizeNonDim" ) == 0 ) { zSizeNonDim = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--noForceAveraging" ) == 0 ) { averageForceTorqueOverTwoTimSteps = false; continue; }
if( std::strcmp( argv[i], "--numLevels" ) == 0 ) { numberOfLevels = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--refinementCheckFrequency" ) == 0 ) { refinementCheckFrequency = uint_c( std::atof( argv[++i] ) ); continue; }
if( std::strcmp( argv[i], "--useStaticRefinement" ) == 0 ) { useStaticRefinement = true; continue; }
if( std::strcmp( argv[i], "--lowerLimit" ) == 0 ) { lowerFluidRefinementLimit = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--upperLimit" ) == 0 ) { upperFluidRefinementLimit = real_c(std::atof( argv[++i] )); continue; }
if( std::strcmp( argv[i], "--baseFolder" ) == 0 ) { baseFolder = argv[++i]; continue; }
if( std::strcmp( argv[i], "--useVorticityCriterion" ) == 0 ) { useVorticityCriterion = true; continue; }
if( std::strcmp( argv[i], "--useGradientCriterion" ) == 0 ) { useGradientCriterion = true; continue; }
if( std::strcmp( argv[i], "--initializeSphereVelocity" ) == 0 ) { initializeSphereVelocity = true; continue; }
if( std::strcmp( argv[i], "--offsetSphere" ) == 0 ) { offsetSphere = true; continue; }
if( std::strcmp( argv[i], "--evaluateLoadImbalance" ) == 0 ) { evaluateLoadImbalance = true; continue; }
WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
}
if( funcTest )
{
walberla::logging::Logging::instance()->setLogLevel(logging::Logging::LogLevel::WARNING);
}
if( fileIO )
{
WALBERLA_ROOT_SECTION(){
// create base directory if it does not yet exist
filesystem::path tpath( baseFolder );
if( !filesystem::exists( tpath ) )
filesystem::create_directory( tpath );
}
}
if( useVorticityCriterion && useGradientCriterion )
{
WALBERLA_ABORT("Use either vorticity or gradient criterion for refinement!");
}
if( vtkWriteFreq != 0 )
{
vtkWriteFreqDD = vtkWriteFreq;
vtkWriteFreqBo = vtkWriteFreq;
vtkWriteFreqFl = vtkWriteFreq;
}
//////////////////////////
// NUMERICAL PARAMETERS //
//////////////////////////
const Vector3<uint_t> domainSize( uint_c(real_t(xSizeNonDim) * diameter ),
uint_c(real_t(ySizeNonDim) * diameter ),
uint_c(real_t(zSizeNonDim) * diameter ) );
const real_t sphereVolume = math::pi / real_t(6) * diameter * diameter * diameter;
const real_t xRef = diameter;
const real_t tRef = xRef / ug;
const real_t gravitationalAcceleration = ug * ug / ( (densityRatio-real_t(1)) * diameter );
const real_t viscosity = ug * diameter / GalileoNumber;
const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
const real_t tau = real_t(1) / omega;
const uint_t timesteps = funcTest ? 1 : ( shortrun ? uint_t(2000) : uint_t( 250000 ) );
const uint_t numPeSubCycles = uint_t(1);
const uint_t loggingDisplayFrequency = uint_t(100);
const real_t dx = real_t(1);
const real_t overlap = real_t( 1.5 ) * dx;
Vector3<real_t> initialSpherePosition( real_t(0.5) * real_c(domainSize[0]),
real_t(0.5) * real_c(domainSize[1]),
real_t(0.5) * real_c(domainSize[2]));
if( offsetSphere )
{
Vector3<real_t> offset( real_t(0.3), real_t(0.2), real_t(0));
initialSpherePosition += offset;
}
if( useVorticityCriterion && floatIsEqual(lowerFluidRefinementLimit, real_t(0)) && std::isinf(upperFluidRefinementLimit) )
{
// use computed criterion instead of user input
lowerFluidRefinementLimit = real_t(0.05) * ug;
upperFluidRefinementLimit = real_t(0.1) * ug;
}
const uint_t finestLevel = numberOfLevels - uint_t(1);
std::stringstream omega_msg;
for( uint_t i = 0; i < numberOfLevels; ++i )
{
omega_msg << lbm::collision_model::levelDependentRelaxationParameter( i, omega, finestLevel ) << " ( on level " << i << " ), ";
}
WALBERLA_LOG_INFO_ON_ROOT("Setup (in simulation, i.e. lattice, units):");
WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
WALBERLA_LOG_INFO_ON_ROOT(" - sphere diameter = " << diameter );
WALBERLA_LOG_INFO_ON_ROOT(" - Galileo number = " << GalileoNumber );
WALBERLA_LOG_INFO_ON_ROOT(" - densityRatio = " << densityRatio );
WALBERLA_LOG_INFO_ON_ROOT(" - fluid: relaxation time (tau) = " << tau << ", kin. visc = " << viscosity );
WALBERLA_LOG_INFO_ON_ROOT(" - gravitational acceleration = " << gravitationalAcceleration );
WALBERLA_LOG_INFO_ON_ROOT(" - initial sphere position = " << initialSpherePosition );
WALBERLA_LOG_INFO_ON_ROOT(" - reference values: x = " << xRef << ", t = " << tRef << ", vel = " << ug);
WALBERLA_LOG_INFO_ON_ROOT(" - omega: " << omega_msg.str());
WALBERLA_LOG_INFO_ON_ROOT(" - number of levels: " << numberOfLevels);
if(useStaticRefinement)
{
WALBERLA_LOG_INFO_ON_ROOT(" - using static refinement");
}
if( useVorticityCriterion )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using vorticity criterion with lower limit = " << lowerFluidRefinementLimit << " and upper limit = " << upperFluidRefinementLimit );
}
if( useGradientCriterion )
{
WALBERLA_LOG_INFO_ON_ROOT(" - using gradient criterion with lower limit = " << lowerFluidRefinementLimit << " and upper limit = " << upperFluidRefinementLimit );
}
if( vtkWriteFreqDD > 0 )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of domain decomposition to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqDD);
}
if( vtkWriteFreqBo > 0 )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of bodies data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqBo);
}
if( vtkWriteFreqFl > 0 )
{
if( vtkWriteFluidSlice ){
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of sliced fluid data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqFl);
}
else{
WALBERLA_LOG_INFO_ON_ROOT(" - writing vtk files of full fluid data to folder \"" << baseFolder << "\" with frequency " << vtkWriteFreqFl);
}
}
///////////////////////////
// BLOCK STRUCTURE SETUP //
///////////////////////////
const uint_t levelScalingFactor = ( uint_t(1) << finestLevel );
const uint_t lbmTimeStepsPerTimeLoopIteration = levelScalingFactor;
uint_t blockSize = std::max(uint_t(16), uint_c(diameter) );
Vector3<uint_t> blockSizeInCells( blockSize );
AABB simulationDomain( real_t(0), real_t(0), real_t(0), real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2]) );
auto blocks = createBlockStructure( simulationDomain, blockSizeInCells, numberOfLevels, diameter, initialSpherePosition, useStaticRefinement );
//write domain decomposition to file
if( vtkWriteFreqDD > 0 )
{
vtk::writeDomainDecomposition( blocks, "initial_domain_decomposition", baseFolder );
}
if( !useStaticRefinement && refinementCheckFrequency == 0 && numberOfLevels != 1 )
{
// determine check frequency automatically based on maximum admissible velocity and block sizes
real_t uMax = real_t(0.1);
real_t refinementCheckFrequencyFinestLevel = ( overlap + real_c(blockSize) - real_t(2) * real_t(FieldGhostLayers) * dx) / uMax;
refinementCheckFrequency = uint_c( refinementCheckFrequencyFinestLevel / real_t(lbmTimeStepsPerTimeLoopIteration));
}
WALBERLA_LOG_INFO_ON_ROOT(" - refinement check frequency: " << refinementCheckFrequency);
/////////////////
// PE COUPLING //
/////////////////
// set up pe functionality
shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
auto bodyStorageID = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "pe Body Storage");
auto ccdID = blocks->addBlockData(pe::ccd::createHashGridsDataHandling( globalBodyStorage, bodyStorageID ), "CCD");
auto fcdID = blocks->addBlockData(pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::AnalyticCollideFunctor>(), "FCD");
shared_ptr<WcTimingTree> timingTreePE = make_shared<WcTimingTree>();
// set up collision response, here DEM solver
pe::cr::DEM cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, &(*timingTreePE));
// set up synchronization procedure
std::function<void(void)> syncCall = [&capture0 = blocks->getBlockForest(), bodyStorageID, capture1 = &(*timingTreePE), overlap] { pe::syncShadowOwners<BodyTypeTuple>(capture0, bodyStorageID, capture1, overlap, false); };
// create pe bodies
// add the sphere
const auto sphereMaterial = pe::createMaterial( "mySphereMat", densityRatio , real_t(0.5), real_t(0.1), real_t(0.1), real_t(0.24), real_t(200), real_t(200), real_t(0), real_t(0) );
auto sphere = pe::createSphere( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, initialSpherePosition, real_t(0.5) * diameter, sphereMaterial );
if( initializeSphereVelocity && sphere != nullptr )
{
Vector3<real_t> initialSphereVelocity( real_t(0.01) * ug, real_t(0.01) * ug, real_t(0));
sphere->setLinearVel(initialSphereVelocity);
WALBERLA_LOG_INFO(" - setting initial sphere velocity of " << initialSphereVelocity);
}
uint_t minBlockSizeInCells = blockSizeInCells.min();
for( uint_t i = 0; i < uint_c(diameter / real_c(minBlockSizeInCells)) + 1; ++i)
syncCall();
///////////////////////
// ADD DATA TO BLOCKS //
////////////////////////
// create the lattice model
LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega, lbm::collision_model::TRT::threeSixteenth, finestLevel ) );
// add PDF field
BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel,
Vector3< real_t >( real_t(0) ), real_t(1),
FieldGhostLayers, field::fzyx );
// add flag field
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field", FieldGhostLayers );
// add body field
BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx, FieldGhostLayers );
// add velocity field and utility
BlockDataID velocityFieldID = field::addToStorage<VelocityField_T>( blocks, "velocity field", Vector3<real_t>(real_t(0)), field::fzyx, uint_t(2) );
using VelocityFieldWriter_T = lbm::VelocityFieldWriter<PdfField_T, VelocityField_T>;
BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldID, velocityFieldID ) );
shared_ptr<blockforest::communication::NonUniformBufferedScheme<stencil::D3Q27> > velocityCommunicationScheme = make_shared<blockforest::communication::NonUniformBufferedScheme<stencil::D3Q27> >( blocks );
velocityCommunicationScheme->addPackInfo( make_shared< field::refinement::PackInfo<VelocityField_T, stencil::D3Q27> >( velocityFieldID ) );
// add boundary handling & initialize outer domain boundaries
BlockDataID boundaryHandlingID = blocks->addBlockData( make_shared< MyBoundaryHandling >( blocks, flagFieldID, pdfFieldID, bodyFieldID ),
"boundary handling" );
// map planes into the LBM simulation -> act as no-slip boundaries
pe_coupling::mapBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, NoSlip_Flag, pe_coupling::selectGlobalBodies );
// map pe bodies into the LBM simulation
pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_Flag, pe_coupling::selectRegularBodies );
// force averaging functionality
shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
std::function<void(void)> storeForceTorqueInCont1 = [bodiesFTContainer1] { bodiesFTContainer1->store(); };
shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
std::function<void(void)> setForceTorqueOnBodiesFromCont2 = [bodiesFTContainer2] { bodiesFTContainer2->setOnBodies(); };
shared_ptr<pe_coupling::ForceTorqueOnBodiesScaler> forceScaler = make_shared<pe_coupling::ForceTorqueOnBodiesScaler>(blocks, bodyStorageID, real_t(0.5));
std::function<void(void)> setForceScalingFactorToOne = [forceScaler] { forceScaler->resetScalingFactor(real_t(1)); };
std::function<void(void)> setForceScalingFactorToHalf = [forceScaler] { forceScaler->resetScalingFactor(real_t(0.5)); };
if( averageForceTorqueOverTwoTimSteps ) {
bodiesFTContainer2->store();
setForceScalingFactorToOne();
}
////////////////////////
// DYNAMIC REFINEMENT //
////////////////////////
auto & blockforest = blocks->getBlockForest();
blockforest.recalculateBlockLevelsInRefresh( true );
blockforest.alwaysRebalanceInRefresh( false );
blockforest.reevaluateMinTargetLevelsAfterForcedRefinement( false );
blockforest.allowRefreshChangingDepth( false );
blockforest.allowMultipleRefreshCycles( false ); // otherwise info collections are invalid
blockforest::CombinedMinTargetLevelDeterminationFunctions minTargetLevelDeterminationFunctions;
// add refinement criterion based on particle presence
shared_ptr<pe_coupling::InfoCollection> couplingInfoCollection = walberla::make_shared<pe_coupling::InfoCollection>();
pe_coupling::amr::BodyPresenceLevelDetermination particlePresenceRefinement( couplingInfoCollection, finestLevel );
minTargetLevelDeterminationFunctions.add( particlePresenceRefinement );
if( useVorticityCriterion )
{
// add refinement criterion based on vorticity magnitude
field::FlagFieldEvaluationFilter<FlagField_T> flagFieldFilter( flagFieldID, Fluid_Flag );
lbm::refinement::VorticityBasedLevelDetermination< field::FlagFieldEvaluationFilter<FlagField_T> > vorticityRefinement(
velocityFieldID, flagFieldFilter, upperFluidRefinementLimit, lowerFluidRefinementLimit, finestLevel );
minTargetLevelDeterminationFunctions.add( vorticityRefinement );
}
if( useGradientCriterion )
{
// add refinement criterion based on velocity gradient magnitude
field::FlagFieldEvaluationFilter<FlagField_T> flagFieldFilter( flagFieldID, Fluid_Flag );
VectorGradientRefinement< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> > gradientRefinement(
velocityFieldID, flagFieldFilter, upperFluidRefinementLimit, lowerFluidRefinementLimit, finestLevel );
minTargetLevelDeterminationFunctions.add( gradientRefinement );
}
blockforest.setRefreshMinTargetLevelDeterminationFunction( minTargetLevelDeterminationFunctions );
bool curveHilbert = true; //false = use Morton
bool curveAllGather = true;
bool balanceLevelwise = true;
blockforest.setRefreshPhantomBlockMigrationPreparationFunction( blockforest::DynamicCurveBalance< blockforest::PODPhantomWeight<real_t> >( curveHilbert, curveAllGather, balanceLevelwise ) );
blockforest.setRefreshPhantomBlockDataPackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
blockforest.setRefreshPhantomBlockDataUnpackFunction(blockforest::PODPhantomWeightPackUnpack<real_t>());
pe_coupling::amr::WeightAssignmentFunctor weightAssignmentFunctor(couplingInfoCollection, pe_coupling::amr::defaultWeightEvaluationFunction);
blockforest.setRefreshPhantomBlockDataAssignmentFunction(weightAssignmentFunctor);
///////////////
// TIME LOOP //
///////////////
// create the timeloop
shared_ptr<SweepTimeloop> timeloop = make_shared<SweepTimeloop>( blocks->getBlockStorage(), timesteps );
shared_ptr<WcTimingPool> timeloopTiming = make_shared<WcTimingPool>();
shared_ptr<WcTimingPool> timeloopRefinementTiming = make_shared<WcTimingPool>();
shared_ptr<WcTimingPool> timeloopRefinementTimingLevelwise = make_shared<WcTimingPool>();
if( vtkWriteFreqDD != uint_t(0) ) {
auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkWriteFreqDD, baseFolder );
timeloop->addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
}
if( vtkWriteFreqBo != uint_t(0) ) {
// pe bodies
auto bodyVtkOutput = make_shared<pe::SphereVtkOutput>(bodyStorageID, blocks->getBlockStorage());
auto bodyVTK = vtk::createVTKOutput_PointData(bodyVtkOutput, "bodies", vtkWriteFreqBo, baseFolder);
timeloop->addFuncBeforeTimeStep(vtk::writeFiles(bodyVTK), "VTK (sphere data)");
}
if( vtkWriteFreqFl != uint_t(0) ) {
// flag field
//auto flagFieldVTK = vtk::createVTKOutput_BlockData( blocks, "flag_field", vtkIOFreq, 0, false, baseFolder );
//flagFieldVTK->addCellDataWriter( make_shared< field::VTKWriter< FlagField_T > >( flagFieldID, "FlagField" ) );
//timeloop.addFuncAfterTimeStep( vtk::writeFiles( flagFieldVTK ), "VTK (flag field data)" );
// pdf field
auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid_field", vtkWriteFreqFl, 0, false, baseFolder);
field::FlagFieldCellFilter< FlagField_T > fluidFilter( flagFieldID );
fluidFilter.addFlag( Fluid_Flag );
if(vtkWriteFluidSlice)
{
AABB sliceAABB( real_t(0), real_c(domainSize[1])*real_t(0.5)-real_t(1), real_t(0),
real_c(domainSize[0]), real_c(domainSize[1])*real_t(0.5)+real_t(1), real_c(domainSize[2]) );
vtk::AABBCellFilter aabbSliceFilter( sliceAABB );
vtk::ChainedFilter combinedSliceFilter;
combinedSliceFilter.addFilter( fluidFilter );
combinedSliceFilter.addFilter( aabbSliceFilter );
pdfFieldVTK->addCellInclusionFilter( combinedSliceFilter );
}
else {
pdfFieldVTK->addCellInclusionFilter( fluidFilter );
}
pdfFieldVTK->addCellDataWriter(
make_shared<lbm::VelocityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "VelocityFromPDF"));
pdfFieldVTK->addCellDataWriter(
make_shared<lbm::DensityVTKWriter<LatticeModel_T, float> >(pdfFieldID, "DensityFromPDF"));
timeloop->addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
}
auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
auto refinementTimestep = lbm::refinement::makeTimeStep< LatticeModel_T, BoundaryHandling_T >( blocks, sweep, pdfFieldID, boundaryHandlingID );
refinementTimestep->enableTiming( timeloopRefinementTiming, timeloopRefinementTimingLevelwise );
// Averaging the force/torque over two time steps is said to damp oscillations of the interaction force/torque.
// See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
if( averageForceTorqueOverTwoTimSteps ) {
// store force/torque from hydrodynamic interactions in container1
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(storeForceTorqueInCont1), "Force Storing", finestLevel);
// set force/torque from previous time step (in container2)
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(setForceTorqueOnBodiesFromCont2), "Force setting", finestLevel);
// average the force/torque by scaling it with factor 1/2 (except in first timestep and directly after refinement, there it is 1)
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(SharedFunctor<pe_coupling::ForceTorqueOnBodiesScaler>(forceScaler)), "Force averaging", finestLevel);
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(setForceScalingFactorToHalf), "Force scaling adjustment", finestLevel);
// swap containers
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::BodyContainerSwapper(bodiesFTContainer1, bodiesFTContainer2)), "Swap FT container", finestLevel);
}
Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, gravitationalForce )), "Gravitational force", finestLevel );
// add pe timesteps
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::FunctorWrapper(pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_t(1), numPeSubCycles)),
"pe Time Step", finestLevel );
// add sweep for updating the pe body mapping into the LBM simulation
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::SweepAsFunctorWrapper( pe_coupling::BodyMapping< LatticeModel_T, BoundaryHandling_T >( blocks, pdfFieldID, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, MO_Flag, FormerMO_Flag, pe_coupling::selectRegularBodies ), blocks ),
"Body Mapping", finestLevel );
// add sweep for restoring PDFs in cells previously occupied by pe bodies
using Reconstructor_T = pe_coupling::EquilibriumReconstructor<LatticeModel_T, BoundaryHandling_T>;
Reconstructor_T reconstructor( blocks, boundaryHandlingID, bodyFieldID );
refinementTimestep->addPostStreamVoidFunction(lbm::refinement::SweepAsFunctorWrapper( pe_coupling::PDFReconstruction< LatticeModel_T, BoundaryHandling_T, Reconstructor_T > ( blocks, pdfFieldID,
boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, reconstructor, FormerMO_Flag, Fluid_Flag ), blocks ),
"PDF Restore", finestLevel );
// add LBM sweep with refinement
timeloop->addFuncBeforeTimeStep( makeSharedFunctor( refinementTimestep ), "LBM refinement time step" );
// check for convergence of the particle position
std::string loggingFileName( baseFolder + "/LoggingAMRSettlingSphere_Ga");
loggingFileName += std::to_string(uint_c(GalileoNumber));
loggingFileName += "_lvl";
loggingFileName += std::to_string(numberOfLevels);
loggingFileName += ".txt";
if( fileIO )
{
WALBERLA_LOG_INFO_ON_ROOT(" - writing logging output to file \"" << loggingFileName << "\"");
}
shared_ptr< SpherePropertyLogger > logger = walberla::make_shared< SpherePropertyLogger >( timeloop, blocks, bodyStorageID,
loggingFileName, fileIO, xRef, tRef,
lbmTimeStepsPerTimeLoopIteration,
diameter, viscosity);
timeloop->addFuncAfterTimeStep( SharedFunctor< SpherePropertyLogger >( logger ), "Sphere property logger" );
timeloop->addFuncAfterTimeStep( RemainingTimeLogger( timeloop->getNrOfTimeSteps() ), "Remaining Time Logger" );
if( evaluateLoadImbalance ) timeloop->addFuncAfterTimeStep( LoadImbalanceEvaluator( timeloopRefinementTimingLevelwise, timingTreePE, numberOfLevels ), "Load Imbalance Evaluator" );
// add level wise timing pool output
timeloop->addFuncAfterTimeStep( TimingPoolLogger( timeloopTiming, timeloop, loggingDisplayFrequency ), "Regular Timing Logger" );
// add regular refinement timing pool output
timeloop->addFuncAfterTimeStep( TimingPoolLogger( timeloopRefinementTiming, timeloop, loggingDisplayFrequency ), "Refinement Timing Logger" );
// add level wise timing pool output
timeloop->addFuncAfterTimeStep( TimingPoolLogger( timeloopRefinementTimingLevelwise, timeloop, loggingDisplayFrequency ), "Refinement Levelwise Timing Logger" );
// add PE timing tree output
timeloop->addFuncAfterTimeStep( TimingTreeLogger( timingTreePE, timeloop, loggingDisplayFrequency ), "PE Timing Tree Timing Logger" );
timeloop->addFuncAfterTimeStep( TimingResetter( timeloopRefinementTimingLevelwise, timingTreePE ), "Timing Resetter" );
////////////////////////
// EXECUTE SIMULATION //
////////////////////////
real_t terminationPosition = diameter;
real_t curPos = initialSpherePosition[2];
real_t oldPos = initialSpherePosition[2];
uint_t numberOfPassesThroughTerminationPosition = 3;
uint_t passCounter = uint_t(0);
// time loop
for (uint_t i = 0; i < timesteps; ++i )
{
if( refinementCheckFrequency != 0 && i % refinementCheckFrequency == 0)
{
// first evaluate all data that is required for the refinement checks
// for the particle presence based check:
auto & forest = blocks->getBlockForest();
pe_coupling::createWithNeighborhood<BoundaryHandling_T>(forest, boundaryHandlingID, bodyStorageID, ccdID, fcdID, numPeSubCycles, *couplingInfoCollection);
// for the fluid property based check:
if( useVorticityCriterion || useGradientCriterion )
{
velocityFieldWriter();
(*velocityCommunicationScheme)();
}
// check refinement criteria and refine/coarsen if necessary
uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
blocks->refresh();
uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
if(stampBefore == stampAfter)
{
// nothing has changed
continue;
}
WALBERLA_LOG_INFO_ON_ROOT("Adapting grid and reinitializing data structures");
// rebuild PE data structures
pe::clearSynchronization( blockforest, bodyStorageID);
for( uint_t syncStep = 0; syncStep < uint_c(diameter / real_c(minBlockSizeInCells)) + 1; ++syncStep)
syncCall();
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
{
pe::ccd::ICCD* ccd = blockIt->getData< pe::ccd::ICCD >( ccdID );
ccd->reloadBodies();
}
clearBoundaryHandling(forest, boundaryHandlingID);
clearBodyField(forest, bodyFieldID);
if( averageForceTorqueOverTwoTimSteps ) {
// clear containers from old values
bodiesFTContainer1->clear();
bodiesFTContainer2->clear();
// initialize FT container on all blocks anew, i.e. with the currently acting force/torque, which is zero after the refinement step
bodiesFTContainer2->store();
// set force scaling factor to one after refinement since force history is not present on blocks after refinement
// thus the usual averaging of 1/2 (over two time steps) can not be carried out, i.e. it would lead to 1/2 of the acting force
// the scaling factor is thus adapted for the next timestep to 1, and then changed back to 1/2 (in the timeloop)
setForceScalingFactorToOne();
}
recreateBoundaryHandling(forest, boundaryHandlingID);
// re-set the no-slip flags along the walls
pe_coupling::mapBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, NoSlip_Flag, pe_coupling::selectGlobalBodies );
// re-map the body into the domain (initializing the bodyField as well)
pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_Flag, pe_coupling::selectRegularBodies );
// some evaluation
uint_t numBlocksFinestLevel = forest.getNumberOfBlocks(numberOfLevels-1);
mpi::allReduceInplace(numBlocksFinestLevel, mpi::SUM);
WALBERLA_LOG_INFO_ON_ROOT("Total number of blocks on finest level = " << numBlocksFinestLevel);
}
// perform a single simulation step
timeloop->singleStep( *timeloopTiming );
oldPos = curPos;
curPos = logger->getPosition();
if( curPos <= terminationPosition && oldPos > terminationPosition )
{
++passCounter;
if( passCounter == numberOfPassesThroughTerminationPosition )
{
WALBERLA_LOG_INFO_ON_ROOT("Sphere passed terminal position " << terminationPosition << " for the " << passCounter << ". time - terminating simulation!");
break;
}
}
}
timeloopTiming->logResultOnRoot();
return EXIT_SUCCESS;
}
} // namespace amr_settling_sphere
int main( int argc, char **argv )
{
   // Delegate to the benchmark's actual entry point and propagate its exit status.
   // Previously the return value was discarded, so the process always exited with
   // status 0 (implicit return from ::main), hiding failures from CI/scripts.
   return amr_settling_sphere::main( argc, argv );
}
\ No newline at end of file
# Benchmark executables of the pe_coupling AMR suite; each links against the full
# set of waLBerla modules required for LBM + rigid-body (pe) coupling with VTK output.
waLBerla_add_executable( NAME WorkloadEvaluation FILES WorkloadEvaluation.cpp DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::pe walberla::pe_coupling walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk )
waLBerla_add_executable( NAME AMRSedimentSettling FILES AMRSedimentSettling.cpp DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::pe walberla::pe_coupling walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk )
waLBerla_add_executable( NAME AMRSettlingSphere FILES AMRSettlingSphere.cpp DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::pe walberla::pe_coupling walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk )
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file WorkLoadEvaluation.cpp
//! \ingroup pe_coupling
//! \author Christoph Rettinger <christoph.rettinger@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "boundary/all.h"
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/debug/Debug.h"
#include "core/debug/TestSubsystem.h"
#include "core/logging/Logging.h"
#include "core/math/all.h"
#include "core/SharedFunctor.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/mpi/MPIManager.h"
#include "core/mpi/Reduce.h"
#include "core/mpi/Broadcast.h"
#include "domain_decomposition/SharedSweep.h"
#include "field/AddToStorage.h"
#include "field/communication/PackInfo.h"
#include "lbm/boundary/all.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/field/AddToStorage.h"
#include "lbm/field/MacroscopicValueCalculation.h"
#include "lbm/field/PdfField.h"
#include "lbm/lattice_model/D3Q19.h"
#include "lbm/lattice_model/ForceModel.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "lbm/sweeps/SweepWrappers.h"
#include "lbm/BlockForestEvaluation.h"
#include "stencil/D3Q27.h"
#include "timeloop/SweepTimeloop.h"
#include "pe/basic.h"
#include "pe/cr/ICR.h"
#include "pe/fcd/GJKEPACollideFunctor.h"
#include "pe/vtk/BodyVtkOutput.h"
#include "pe/vtk/EllipsoidVtkOutput.h"
#include "pe/vtk/SphereVtkOutput.h"
#include "pe_coupling/mapping/all.h"
#include "pe_coupling/momentum_exchange_method/all.h"
#include "pe_coupling/utility/all.h"
#include "vtk/all.h"
#include "field/vtk/all.h"
#include "lbm/vtk/all.h"
#include <vector>
#include <iomanip>
#include <iostream>
namespace workload_evaluation
{
///////////
// USING //
///////////
using namespace walberla;
using walberla::uint_t;

// PDF field, flag field & body field
// D3Q19 lattice with a TRT collision operator (second template argument: no compressibility flag set).
using LatticeModel_T = lbm::D3Q19<lbm::collision_model::TRT, false>;
using Stencil_T = LatticeModel_T::Stencil;
using PdfField_T = lbm::PdfField<LatticeModel_T>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
// per-cell pointer to the pe rigid body overlapping that cell (nullptr in pure fluid cells)
using BodyField_T = GhostLayerField<pe::BodyID, 1>;
// number of ghost layers used by the fields registered below
const uint_t FieldGhostLayers = 1;
// boundary handling
// MO_CLI: momentum-exchange moving-obstacle boundary using the CurvedLinear (CLI) interpolation scheme
using MO_CLI_T = pe_coupling::CurvedLinear<LatticeModel_T, FlagField_T>;
using BoundaryHandling_T = BoundaryHandling<FlagField_T, Stencil_T, MO_CLI_T>;
// pe body types that can occur in this simulation (needed for static type-ID setup)
using BodyTypeTuple = std::tuple<pe::Sphere, pe::Ellipsoid, pe::Plane>;
///////////
// FLAGS //
///////////
const FlagUID Fluid_Flag ( "fluid" );
const FlagUID MO_CLI_Flag ( "moving obstacle CLI" );
const FlagUID FormerMO_Flag( "former moving obstacle" );
/////////////////////////////////////
// BOUNDARY HANDLING CUSTOMIZATION //
/////////////////////////////////////
// Factory functor that creates one BoundaryHandling_T instance per block; it is
// registered as block data and invoked by the block storage for every block.
// The actual construction logic lives in the out-of-class operator() definition below.
class MyBoundaryHandling
{
public:
   // Stores the block-data IDs of the fields the boundary handling operates on.
   // useEntireFieldTraversal selects between full-field and sparse boundary sweeps.
   MyBoundaryHandling( const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID, const BlockDataID & bodyFieldID, bool useEntireFieldTraversal ) :
      flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), bodyFieldID_ ( bodyFieldID ), useEntireFieldTraversal_( useEntireFieldTraversal ) {}
   // Called by the block storage; returns a heap-allocated handling object (ownership
   // is taken over by the block data mechanism).
   BoundaryHandling_T * operator()( IBlock* const block, const StructuredBlockStorage* const storage ) const;
private:
   const BlockDataID flagFieldID_;   // flag field of the block
   const BlockDataID pdfFieldID_;    // PDF field of the block
   const BlockDataID bodyFieldID_;   // body (rigid-body pointer) field of the block
   bool useEntireFieldTraversal_;    // traversal mode switch, see operator()
}; // class MyBoundaryHandling
// Builds the boundary handling for a single block: fetches the block's fields,
// registers/looks up the fluid flag, selects the traversal mode, and initializes
// all domain cells (incl. ghost layers) as fluid.
BoundaryHandling_T * MyBoundaryHandling::operator()( IBlock * const block, const StructuredBlockStorage * const storage ) const
{
   WALBERLA_ASSERT_NOT_NULLPTR( block );
   WALBERLA_ASSERT_NOT_NULLPTR( storage );

   auto * flags  = block->getData< FlagField_T >( flagFieldID_ );
   auto * pdfs   = block->getData< PdfField_T > ( pdfFieldID_ );
   auto * bodies = block->getData< BodyField_T >( bodyFieldID_ );

   // obtain the fluid flag, registering it on first use within this block
   flag_t fluidFlag;
   if( flags->flagExists( Fluid_Flag ) )
      fluidFlag = flags->getFlag( Fluid_Flag );
   else
      fluidFlag = flags->registerFlag( Fluid_Flag );

   // traversal strategy for boundary sweeps
   BoundaryHandling_T::Mode traversalMode = BoundaryHandling_T::Mode::OPTIMIZED_SPARSE_TRAVERSAL;
   if( useEntireFieldTraversal_ )
      traversalMode = BoundaryHandling_T::Mode::ENTIRE_FIELD_TRAVERSAL;

   auto * handling = new BoundaryHandling_T( "fixed obstacle boundary handling", flags, fluidFlag,
                                             MO_CLI_T ( "MO_CLI", MO_CLI_Flag, pdfs, flags, bodies, fluidFlag, *storage, *block ),
                                             traversalMode );

   // boundaries are set by mapping the planes into the domain
   handling->fillWithDomain( FieldGhostLayers );

   return handling;
}
// Reports the maximum body-body penetration depth recorded by the collision
// response, reduced (MAX) over all MPI processes.
class CollisionPropertiesEvaluator
{
public:
   explicit CollisionPropertiesEvaluator( pe::cr::ICR & collisionResponse ) : collisionResponse_( collisionResponse )
   {}

   // Returns the global (all-process) maximum absolute penetration.
   real_t get()
   {
      real_t maximumPenetration = std::fabs( collisionResponse_.getMaximumPenetration() );
      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace( maximumPenetration, mpi::MAX );
      }
      return maximumPenetration;
   }

private:
   pe::cr::ICR & collisionResponse_;   // collision response queried for penetration data
};
// Regenerates the contact lists on every block (coarse CCD pass followed by the
// fine FCD pass) and reports the maximum absolute contact distance, reduced
// (MAX) over all MPI processes.
class ContactDistanceEvaluator
{
public:
   ContactDistanceEvaluator( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID ccdID, const BlockDataID fcdID ) :
      blocks_( blocks ), ccdID_(ccdID), fcdID_(fcdID)
   {}

   // Returns the global maximum |contact distance| over all generated contacts.
   real_t get()
   {
      real_t maxOverlap = real_t(0);
      for( auto & block : *blocks_ )
      {
         auto * coarseDetection = block.getData< pe::ccd::ICCD >( ccdID_ );
         auto * fineDetection   = block.getData< pe::fcd::IFCD >( fcdID_ );

         // broad phase followed by narrow phase contact generation
         coarseDetection->generatePossibleContacts();
         pe::Contacts & contacts = fineDetection->generateContacts( coarseDetection->getPossibleContacts() );

         const size_t contactCount = contacts.size();
         for( size_t contactIdx = 0; contactIdx < contactCount; ++contactIdx )
         {
            const pe::ContactID contact( &contacts[contactIdx] );
            maxOverlap = std::max( maxOverlap, std::fabs( contact->getDistance() ) );
         }
      }
      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace( maxOverlap, mpi::MAX );
      }
      return maxOverlap;
   }

private:
   shared_ptr< StructuredBlockStorage > blocks_;   // block storage to traverse
   const BlockDataID ccdID_;                       // coarse collision detection block data
   const BlockDataID fcdID_;                       // fine collision detection block data
};
// Evaluates maxima of the translational velocities of all local pe bodies,
// reduced (MAX) over all MPI processes.
class MaxVelocityEvaluator
{
public:
   MaxVelocityEvaluator( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID bodyStorageID ) :
      blocks_( blocks ), bodyStorageID_(bodyStorageID)
   {}

   // Returns the component-wise global maximum of |velocity| over all bodies.
   Vector3<real_t> get()
   {
      Vector3<real_t> componentMax( real_t(0) );
      for( auto & block : *blocks_ )
      {
         for( auto bodyIt = pe::LocalBodyIterator::begin( block, bodyStorageID_ ); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
         {
            const auto vel = bodyIt->getLinearVel();
            componentMax[0] = std::max( componentMax[0], std::fabs(vel[0]) );
            componentMax[1] = std::max( componentMax[1], std::fabs(vel[1]) );
            componentMax[2] = std::max( componentMax[2], std::fabs(vel[2]) );
         }
      }
      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace( componentMax[0], mpi::MAX );
         mpi::allReduceInplace( componentMax[1], mpi::MAX );
         mpi::allReduceInplace( componentMax[2], mpi::MAX );
      }
      return componentMax;
   }

   // Returns the global maximum of the velocity magnitude over all bodies.
   real_t getMagnitude()
   {
      real_t maxMagnitude = real_t(0);
      for( auto & block : *blocks_ )
      {
         for( auto bodyIt = pe::LocalBodyIterator::begin( block, bodyStorageID_ ); bodyIt != pe::LocalBodyIterator::end(); ++bodyIt )
         {
            maxMagnitude = std::max( maxMagnitude, bodyIt->getLinearVel().length() );
         }
      }
      WALBERLA_MPI_SECTION()
      {
         mpi::allReduceInplace( maxMagnitude, mpi::MAX );
      }
      return maxMagnitude;
   }

private:
   shared_ptr< StructuredBlockStorage > blocks_;   // block storage to traverse
   const BlockDataID bodyStorageID_;               // pe body storage block data
};
void evaluateFluidQuantities(const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID boundaryHandlingID,
uint_t & numCells, uint_t & numFluidCells, uint_t & numNBCells )
{
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
{
auto * boundaryHandling = blockIt->getData< BoundaryHandling_T >( boundaryHandlingID );
auto xyzSize = boundaryHandling->getFlagField()->xyzSize();
numCells += xyzSize.numCells();
for(auto z = cell_idx_t(xyzSize.zMin()); z <= cell_idx_t(xyzSize.zMax()); ++z ){
for(auto y = cell_idx_t(xyzSize.yMin()); y <= cell_idx_t(xyzSize.yMax()); ++y ){
for(auto x = cell_idx_t(xyzSize.xMin()); x <= cell_idx_t(xyzSize.xMax()); ++x ) {
if (boundaryHandling->isDomain(x, y, z)) {
++numFluidCells;
}
if (boundaryHandling->isNearBoundary(x, y, z)) {
++numNBCells;
}
}
}
}
}
}
// Accumulates particle-related quantities over all process-local blocks:
//  - numLocalParticles:  bodies owned by this process (LOCAL storage)
//  - numShadowParticles: ghost copies of remote bodies (SHADOW storage)
//  - numContacts:        contacts treated by the collision resolution solver
// The counters are incremented, not reset, by this function.
void evaluatePEQuantities( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID bodyStorageID,
                           const pe::cr::ICR & cr,
                           uint_t & numLocalParticles, uint_t & numShadowParticles, uint_t & numContacts)
{
   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
      auto * bodyStorage = blockIt->getData<pe::Storage>(bodyStorageID);
      pe::BodyStorage const & localStorage = (*bodyStorage)[pe::StorageType::LOCAL];
      pe::BodyStorage const & shadowStorage = (*bodyStorage)[pe::StorageType::SHADOW];
      numLocalParticles += localStorage.size();
      numShadowParticles += shadowStorage.size();
      // NOTE(review): getNumberOfContactsTreated() is queried once per block; if a process owned
      // more than one block, the same solver-wide count would be added multiple times. The
      // application below uses exactly one block per process - confirm this is intended.
      numContacts += cr.getNumberOfContactsTreated();
   }
}
// Accumulates the total run time (converted to milliseconds) of the timers listed in
// timerKeys into the matching slot of timings; timings[i] sums all timers named in
// timerKeys[i] that exist in either the timing pool or the pe timing tree.
// timings is zeroed first, so only the currently recorded measurements are reported.
void evaluateTimers(WcTimingPool & timingPool, WcTimingTree & peTimingTree,
                    const std::vector<std::vector<std::string> > & timerKeys,
                    std::vector<real_t> & timings )
{
   for (auto & timingsIt : timings)
   {
      timingsIt = real_t(0);
   }

   // make the set of registered timers consistent across all processes before querying them
   timingPool.unifyRegisteredTimersAcrossProcesses();
   peTimingTree.synchronize();

   auto scalingFactor = real_t(1000); // seconds -> milliseconds

   for (auto i = uint_t(0); i < timerKeys.size(); ++i )
   {
      // reference instead of a copy - avoids allocating a vector of strings per iteration
      const auto & keys = timerKeys[i];
      for (const auto &timerName : keys)
      {
         if(timingPool.timerExists(timerName))
         {
            timings[i] += real_c(timingPool[timerName].total()) * scalingFactor;
         }
         if(peTimingTree.timerExists(timerName))
         {
            timings[i] += real_c(peTimingTree[timerName].total()) * scalingFactor;
         }
      }
   }
}
// Clears both timing containers so that subsequent measurements contain only
// the time steps executed after this call.
void resetTimers(WcTimingPool & timingPool, WcTimingTree & peTimingTree)
{
   timingPool.clear();
   peTimingTree.reset();
}
//*******************************************************************************************************************
/*! Application to evaluate the workload (time measurements) for a fluid-particle simulation
*
* This application is used in the paper
* Rettinger, Ruede - "Dynamic Load Balancing Techniques for Particulate Flow Simulations", submitted to Computation
* in Section 3 to develop and calibrate the workload estimator.
* The setup features settling particle inside a horizontally periodic box.
* A comprehensive description is given in Sec. 3.3 of the paper.
* It uses 4 x 4 x 5 blocks for domain partitioning.
* For each block ( = each process), the block-local quantities are evaluated as well as the timing information of
* the fluid-particle interaction algorithm. This information is then written to files that can be used later on
* for function fitting to obtain a workload estimator.
*
* NOTE: Since this estimator relies on timing measurements, this evaluation procedure should be carried out every time
* a different implementation, hardware or algorithm is used.
*
*/
//*******************************************************************************************************************
int main( int argc, char **argv )
{
   debug::enterTestMode();

   mpi::Environment env( argc, argv );

   auto solidVolumeFraction = real_t(0.2);

   // LBM / numerical parameters
   auto blockSize = uint_t(32);
   auto uSettling = real_t(0.1); // characteristic settling velocity
   auto diameter = real_t(10);
   auto Ga = real_t(30); // Galileo number
   auto numPeSubCycles = uint_t(10);

   auto vtkIOFreq = uint_t(0);
   auto timestepsNonDim = real_t(2.5);
   auto numSamples = uint_t(2000);
   std::string baseFolder = "workload_files"; // folder for vtk and file output

   bool useEllipsoids = false;
   bool optimizeForSmallObstacleFraction = false;
   bool noFileOutput = false;
   bool fixBodies = false;
   // NOTE(review): "--useEntireFieldTraversal" below can only re-enable this already-true default,
   // i.e. the option cannot currently be switched off from the command line - confirm intended.
   bool useEntireFieldTraversal = true;
   bool averageForceTorqueOverTwoTimSteps = true;
   bool useFusedStreamCollide = false;

   // command line parsing
   for( int i = 1; i < argc; ++i )
   {
      if( std::strcmp( argv[i], "--vtkIOFreq" ) == 0 ) { vtkIOFreq = uint_c( std::atof( argv[++i] ) ); continue; }
      if( std::strcmp( argv[i], "--noFileOutput" ) == 0 ) { noFileOutput = true; continue; }
      if( std::strcmp( argv[i], "--basefolder" ) == 0 ) { baseFolder = argv[++i]; continue; }
      if( std::strcmp( argv[i], "--solidVolumeFraction" ) == 0 ) { solidVolumeFraction = real_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--diameter" ) == 0 ) { diameter = real_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--blockSize" ) == 0 ) { blockSize = uint_c(std::atof( argv[++i]) ); continue; }
      if( std::strcmp( argv[i], "--uSettling" ) == 0 ) { uSettling = real_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--Ga" ) == 0 ) { Ga = real_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--timestepsNonDim" ) == 0 ) { timestepsNonDim = real_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--numPeSubCycles" ) == 0 ) { numPeSubCycles = uint_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--useEllipsoids" ) == 0 ) { useEllipsoids = true; continue; }
      if( std::strcmp( argv[i], "--optSmallSVF" ) == 0 ) { optimizeForSmallObstacleFraction = true; continue; }
      if( std::strcmp( argv[i], "--fixBodies" ) == 0 ) { fixBodies = true; continue; }
      if( std::strcmp( argv[i], "--useEntireFieldTraversal" ) == 0 ) { useEntireFieldTraversal = true; continue; }
      if( std::strcmp( argv[i], "--numSamples" ) == 0 ) { numSamples = uint_c(std::atof( argv[++i] )); continue; }
      if( std::strcmp( argv[i], "--noForceAveraging" ) == 0 ) { averageForceTorqueOverTwoTimSteps = false; continue; }
      if( std::strcmp( argv[i], "--useFusedStreamCollide" ) == 0 ) { useFusedStreamCollide = true; continue; }
      WALBERLA_ABORT("Unrecognized command line argument found: " << argv[i]);
   }

   // sanity checks on the physical setup
   WALBERLA_CHECK(diameter > real_t(1));
   WALBERLA_CHECK(uSettling > real_t(0));
   WALBERLA_CHECK(Ga > real_t(0));
   WALBERLA_CHECK(solidVolumeFraction > real_t(0));
   WALBERLA_CHECK(solidVolumeFraction < real_t(0.65));

   ///////////////////////////
   // SIMULATION PROPERTIES //
   ///////////////////////////

   const auto XBlocks = uint_t(4);
   const auto YBlocks = uint_t(4);
   const auto ZBlocks = uint_t(5);

   if( MPIManager::instance()->numProcesses() != XBlocks * YBlocks * ZBlocks )
   {
      WALBERLA_LOG_WARNING_ON_ROOT("WARNING! You have specified less or more processes than number of blocks -> the time measurements are no longer blockwise!")
   }

   if( diameter > real_c(blockSize) )
   {
      WALBERLA_LOG_WARNING_ON_ROOT("The bodies might be too large to work with the currently used synchronization!");
   }

   WALBERLA_LOG_INFO_ON_ROOT("Using setup with sedimenting particles -> creating two planes and applying gravitational force")
   if( useEllipsoids ){ WALBERLA_LOG_INFO_ON_ROOT("using ELLIPSOIDS"); }
   else{ WALBERLA_LOG_INFO_ON_ROOT("using SPHERES"); }

   const uint_t XCells = blockSize * XBlocks;
   const uint_t YCells = blockSize * YBlocks;
   const uint_t ZCells = blockSize * ZBlocks;

   // move the top wall downwards to take away a certain portion of the overall domain
   const real_t topWallOffset = real_t(1.05) * real_t(blockSize);

   // determine number of spheres to generate, if necessary scale diameter a bit to reach desired solid volume fraction
   real_t domainHeight = real_c(ZCells) - topWallOffset;
   real_t fluidVolume = real_c( XCells * YCells ) * domainHeight;
   real_t solidVolume = solidVolumeFraction * fluidVolume;
   uint_t numberOfParticles = uint_c(std::ceil(solidVolume / ( math::pi / real_t(6) * diameter * diameter * diameter )));
   diameter = std::cbrt( solidVolume / ( real_c(numberOfParticles) * math::pi / real_t(6) ) );

   auto densityRatio = real_t(2.5);

   real_t viscosity = uSettling * diameter / Ga;
   const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);

   const real_t gravitationalAcceleration = uSettling * uSettling / ( (densityRatio-real_t(1)) * diameter );

   real_t tref = diameter / uSettling;
   real_t Tref = domainHeight / uSettling;

   uint_t timesteps = uint_c(timestepsNonDim * Tref);

   const real_t dx = real_c(1);
   WALBERLA_LOG_INFO_ON_ROOT("viscosity = " << viscosity);
   WALBERLA_LOG_INFO_ON_ROOT("tau = " << real_t(1)/omega);
   WALBERLA_LOG_INFO_ON_ROOT("diameter = " << diameter);
   WALBERLA_LOG_INFO_ON_ROOT("solid volume fraction = " << solidVolumeFraction);
   // bug fix: the y-extent was previously logged as ZCells
   WALBERLA_LOG_INFO_ON_ROOT("domain size (in cells) = " << XCells << " x " << YCells << " x " << ZCells);
   WALBERLA_LOG_INFO_ON_ROOT("number of bodies = " << numberOfParticles);
   WALBERLA_LOG_INFO_ON_ROOT("gravitational acceleration = " << gravitationalAcceleration);
   WALBERLA_LOG_INFO_ON_ROOT("Ga = " << Ga);
   WALBERLA_LOG_INFO_ON_ROOT("uSettling = " << uSettling);
   WALBERLA_LOG_INFO_ON_ROOT("tref = " << tref);
   WALBERLA_LOG_INFO_ON_ROOT("Tref = " << Tref);
   WALBERLA_LOG_INFO_ON_ROOT("timesteps = " << timesteps);
   WALBERLA_LOG_INFO_ON_ROOT("number of workload samples = " << numSamples);

   // create folder to store logging files
   WALBERLA_ROOT_SECTION()
   {
      walberla::filesystem::path path1( baseFolder );
      if( !walberla::filesystem::exists( path1 ) )
         walberla::filesystem::create_directory( path1 );
   }

   ///////////////////////////
   // BLOCK STRUCTURE SETUP //
   ///////////////////////////

   Vector3<bool> periodicity( true );
   periodicity[2] = false; // walls in z-direction

   // create domain
   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid( XBlocks, YBlocks, ZBlocks, blockSize, blockSize, blockSize, dx,
                                                                                    0, false, false, //one block per process!
                                                                                    periodicity[0], periodicity[1], periodicity[2], //periodicity
                                                                                    false );

   ////////
   // PE //
   ////////

   shared_ptr<pe::BodyStorage> globalBodyStorage = make_shared<pe::BodyStorage>();
   pe::SetBodyTypeIDs<BodyTypeTuple>::execute();
   auto bodyStorageID = blocks->addBlockData(pe::createStorageDataHandling<BodyTypeTuple>(), "pe Body Storage");
   auto ccdID = blocks->addBlockData(pe::ccd::createHashGridsDataHandling( globalBodyStorage, bodyStorageID ), "CCD");
   // ellipsoids need the GJK/EPA-based fine collision detection, spheres can use the analytic one
   BlockDataID fcdID = (useEllipsoids) ? blocks->addBlockData( pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::GJKEPACollideFunctor>(), "FCD" )
                                       : blocks->addBlockData(pe::fcd::createGenericFCDDataHandling<BodyTypeTuple, pe::fcd::AnalyticCollideFunctor>(), "FCD");

   WcTimingTree timingTreePE;

   pe::cr::HCSITS cr(globalBodyStorage, blocks->getBlockStoragePointer(), bodyStorageID, ccdID, fcdID, &timingTreePE );
   cr.setMaxIterations(10);
   cr.setRelaxationModel( pe::cr::HardContactSemiImplicitTimesteppingSolvers::ApproximateInelasticCoulombContactByDecoupling );
   cr.setErrorReductionParameter(real_t(0.8));

   /////////////////
   // PE COUPLING //
   /////////////////

   // connect to pe
   const real_t overlap = real_c( 1.5 ) * dx;
   std::function<void(void)> syncCall = [&capture0 = blocks->getBlockForest(), bodyStorageID, capture1 = &timingTreePE, overlap] { pe::syncNextNeighbors<BodyTypeTuple>(capture0, bodyStorageID, capture1, overlap, false); };

   auto generationDomain = AABB( real_t(0), real_t(0), real_t(0), real_c(XCells), real_c(YCells), real_c(ZCells) - topWallOffset);
   auto peMaterial = pe::createMaterial( "mat", densityRatio, real_t(1), real_t(0.25), real_t(0.25), real_t(0), real_t(200), real_t(100), real_t(100), real_t(100) );

   // create two planes at bottom and top of domain
   pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,1), Vector3<real_t>(0,0,0), peMaterial );
   pe::createPlane( *globalBodyStorage, 0, Vector3<real_t>(0,0,-1), Vector3<real_t>(0,0,real_c(ZCells)-topWallOffset), peMaterial );

   auto xParticle = real_t(0);
   auto yParticle = real_t(0);
   auto zParticle = real_t(0);

   // random positions are drawn on the root process and broadcast so that all processes create identical bodies
   for( uint_t nPart = 0; nPart < numberOfParticles; ++nPart )
   {
      WALBERLA_ROOT_SECTION()
      {
         xParticle = math::realRandom<real_t>(generationDomain.xMin(), generationDomain.xMax());
         yParticle = math::realRandom<real_t>(generationDomain.yMin(), generationDomain.yMax());
         zParticle = math::realRandom<real_t>(generationDomain.zMin(), generationDomain.zMax());
      }

      WALBERLA_MPI_SECTION()
      {
         mpi::broadcastObject( xParticle );
         mpi::broadcastObject( yParticle );
         mpi::broadcastObject( zParticle );
      }

      if( useEllipsoids )
      {
         // prolate ellipsoids; semi-axes chosen such that the volume equals that of a sphere with the given diameter
         auto axisFactor = real_t(1.5);
         real_t axisFactor2 = std::sqrt(real_t(1)/axisFactor);
         real_t radius = diameter * real_t(0.5);
         pe::createEllipsoid( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), Vector3<real_t>(axisFactor*radius, axisFactor2*radius, axisFactor2*radius), peMaterial );
      } else{
         pe::createSphere( *globalBodyStorage, blocks->getBlockStorage(), bodyStorageID, 0, Vector3<real_t>( xParticle, yParticle, zParticle ), diameter * real_t(0.5), peMaterial );
      }
   }

   syncCall();

   // resolve possible overlaps of the particles due to the random initialization
   // 100 iterations of solver to resolve all major overlaps
   {
      for (auto pet = uint_t(1); pet <= uint_t(100); ++pet )
      {
         cr.timestep( real_t(1) );
         syncCall();

         // reset all velocities to zero
         for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
         {
            for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
            {
               bodyIt->setLinearVel(Vector3<real_t>(real_t(0)));
               bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
            }
         }
      }
   }

   // resolve remaining overlaps via particle simulation
   {
      const auto initialPeSteps = uint_t(2000);
      const auto dt_PE_init = real_t(1);
      const real_t overlapLimit = real_t(0.001) * diameter;

      WALBERLA_LOG_INFO_ON_ROOT("Particle creation done --- resolving overlaps with goal all < " << overlapLimit / diameter * real_t(100) << "%");

      CollisionPropertiesEvaluator collisionPropertiesEvaluator( cr );
      // NOTE(review): contactDistanceEvaluator is constructed but never queried below - confirm whether
      // it is meant as an alternative to collisionPropertiesEvaluator or can be removed.
      ContactDistanceEvaluator contactDistanceEvaluator(blocks, ccdID, fcdID);
      MaxVelocityEvaluator maxVelEvaluator(blocks, bodyStorageID);

      for(auto pet = uint_t(1); pet <= initialPeSteps; ++pet )
      {
         cr.timestep( dt_PE_init );
         syncCall();

         real_t maxPen = collisionPropertiesEvaluator.get();
         if( maxPen < overlapLimit )
         {
            WALBERLA_LOG_INFO_ON_ROOT("Carried out " << pet << " PE-only time steps to resolve initial overlaps");
            WALBERLA_LOG_INFO_ON_ROOT("Final max penetration from cr is " << maxPen << " = " << maxPen / diameter * real_t(100) << "%");
            break;
         }

         real_t maxMagnitude = maxVelEvaluator.getMagnitude();
         if( maxMagnitude * dt_PE_init > overlapLimit)
         {
            // avoid too large response velocities by setting them to zero
            for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
            {
               for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
               {
                  bodyIt->setLinearVel(Vector3<real_t>(real_t(0)));
                  bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
               }
            }
         }
         else
         {
            cr.setErrorReductionParameter(real_t(0.8));
         }

         if( pet % uint_t(20) == uint_t(0) )
         {
            WALBERLA_LOG_INFO_ON_ROOT(pet << " - current max overlap = " << maxPen / diameter * real_t(100) << "%, max vel magnitude = " << maxMagnitude );
         }
      }
   }

   // reset all velocities to zero
   Vector3<real_t> initialBodyVelocity(real_t(0));

   WALBERLA_LOG_INFO_ON_ROOT("Setting initial velocity " << initialBodyVelocity << " of all bodies");
   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
   {
      for( auto bodyIt = pe::BodyIterator::begin( *blockIt, bodyStorageID); bodyIt != pe::BodyIterator::end(); ++bodyIt )
      {
         bodyIt->setLinearVel(initialBodyVelocity);
         bodyIt->setAngularVel(Vector3<real_t>(real_t(0)));
      }
   }

   ////////////////////////
   // ADD DATA TO BLOCKS //
   ////////////////////////

   // create the lattice model
   LatticeModel_T latticeModel = LatticeModel_T( lbm::collision_model::TRT::constructWithMagicNumber( omega ) );

   // add PDF field
   BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field (fzyx)", latticeModel,
                                                                         Vector3< real_t >( real_t(0) ), real_t(1),
                                                                         uint_t(1), field::fzyx );
   // add flag field
   BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );

   // add body field
   BlockDataID bodyFieldID = field::addToStorage<BodyField_T>( blocks, "body field", nullptr, field::fzyx );

   // add boundary handling & initialize outer domain boundaries
   BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >(
         MyBoundaryHandling( flagFieldID, pdfFieldID, bodyFieldID, useEntireFieldTraversal ), "boundary handling" );

   // initially map pe bodies into the LBM simulation
   pe_coupling::mapMovingBodies< BoundaryHandling_T >( *blocks, boundaryHandlingID, bodyStorageID, *globalBodyStorage, bodyFieldID, MO_CLI_Flag );

   lbm::BlockForestEvaluation<FlagField_T> bfEval(blocks, flagFieldID, Fluid_Flag);
   WALBERLA_LOG_INFO_ON_ROOT(bfEval.loggingString());

   ///////////////
   // TIME LOOP //
   ///////////////

   // create the timeloop
   SweepTimeloop timeloop( blocks->getBlockStorage(), timesteps );

   if( vtkIOFreq != uint_t(0) )
   {
      // pe bodies
      if(useEllipsoids)
      {
         auto bodyVtkOutput = make_shared<pe::EllipsoidVtkOutput>( bodyStorageID, blocks->getBlockStorage() );
         auto bodyVTK = vtk::createVTKOutput_PointData( bodyVtkOutput, "bodies", vtkIOFreq, baseFolder );
         timeloop.addFuncBeforeTimeStep( vtk::writeFiles( bodyVTK ), "VTK (body data)" );
      }else
      {
         auto bodyVtkOutput = make_shared<pe::SphereVtkOutput>( bodyStorageID, blocks->getBlockStorage() );
         auto bodyVTK = vtk::createVTKOutput_PointData( bodyVtkOutput, "bodies", vtkIOFreq, baseFolder );
         timeloop.addFuncBeforeTimeStep( vtk::writeFiles( bodyVTK ), "VTK (body data)" );
      }

      // flag field
      auto flagFieldVTK = vtk::createVTKOutput_BlockData( blocks, "flag_field", vtkIOFreq, 1, false, baseFolder );
      flagFieldVTK->addCellDataWriter( make_shared< field::VTKWriter< FlagField_T > >( flagFieldID, "FlagField" ) );
      timeloop.addFuncAfterTimeStep( vtk::writeFiles( flagFieldVTK ), "VTK (flag field data)" );

      // pdf field, restricted to fluid cells
      auto pdfFieldVTK = vtk::createVTKOutput_BlockData( blocks, "fluid_field", vtkIOFreq, 0, false, baseFolder );

      field::FlagFieldCellFilter< FlagField_T > fluidFilter( flagFieldID );
      fluidFilter.addFlag( Fluid_Flag );
      pdfFieldVTK->addCellInclusionFilter( fluidFilter );

      pdfFieldVTK->addCellDataWriter( make_shared< lbm::VelocityVTKWriter< LatticeModel_T, float > >( pdfFieldID, "VelocityFromPDF" ) );
      pdfFieldVTK->addCellDataWriter( make_shared< lbm::DensityVTKWriter < LatticeModel_T, float > >( pdfFieldID, "DensityFromPDF" ) );
      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( pdfFieldVTK ), "VTK (fluid field data)" );

      auto domainDecompVTK = vtk::createVTKOutput_DomainDecomposition(blocks, "domain_decomposition", vtkIOFreq, baseFolder );
      timeloop.addFuncBeforeTimeStep( vtk::writeFiles(domainDecompVTK), "VTK (domain decomposition)");
   }

   // sweep for updating the pe body mapping into the LBM simulation
   timeloop.add()
         << Sweep( pe_coupling::BodyMapping< LatticeModel_T, BoundaryHandling_T >( blocks, pdfFieldID, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, MO_CLI_Flag, FormerMO_Flag, pe_coupling::selectRegularBodies ), "Body Mapping" );

   // sweep for restoring PDFs in cells previously occupied by pe bodies
   using Reconstructor_T = pe_coupling::EquilibriumReconstructor<LatticeModel_T, BoundaryHandling_T>;
   Reconstructor_T reconstructor( blocks, boundaryHandlingID, bodyFieldID);
   timeloop.add()
         << Sweep( pe_coupling::PDFReconstruction< LatticeModel_T, BoundaryHandling_T, Reconstructor_T >
                   ( blocks, pdfFieldID, boundaryHandlingID, bodyStorageID, globalBodyStorage, bodyFieldID, reconstructor, FormerMO_Flag, Fluid_Flag, pe_coupling::selectRegularBodies, optimizeForSmallObstacleFraction ), "PDF Restore" );

   // containers and functors used for the optional force/torque averaging over two time steps
   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer1 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
   std::function<void(void)> storeForceTorqueInCont1 = [bodiesFTContainer1] { bodiesFTContainer1->store(); };
   shared_ptr<pe_coupling::BodiesForceTorqueContainer> bodiesFTContainer2 = make_shared<pe_coupling::BodiesForceTorqueContainer>(blocks, bodyStorageID);
   std::function<void(void)> setForceTorqueOnBodiesFromCont2 = [bodiesFTContainer2] { bodiesFTContainer2->setOnBodies(); };
   shared_ptr<pe_coupling::ForceTorqueOnBodiesScaler> forceScaler = make_shared<pe_coupling::ForceTorqueOnBodiesScaler>(blocks, bodyStorageID, real_t(1));
   std::function<void(void)> setForceScalingFactorToHalf = [forceScaler] { forceScaler->resetScalingFactor(real_t(0.5)); };

   if( averageForceTorqueOverTwoTimSteps ) {
      bodiesFTContainer2->store();
   }

   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
   std::function< void () > commFunction;
   blockforest::communication::UniformBufferedScheme< stencil::D3Q27 > scheme( blocks );
   scheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField_T > >( pdfFieldID ) );
   commFunction = scheme;

   auto sweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );

   if( !useFusedStreamCollide )
   {
      // separate collide step; the stream step is added after the boundary handling below
      timeloop.add() << Sweep( makeCollideSweep(sweep), "Collide" );
   }

   // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
   timeloop.add() << BeforeFunction( commFunction, "LBM Communication" )
                  << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );

   if( useFusedStreamCollide )
   {
      // fused streaming & collide
      timeloop.add() << Sweep( makeSharedSweep(sweep), "Stream&Collide" );
   } else
   {
      // separate streaming step
      timeloop.add() << Sweep( makeStreamSweep(sweep), "Stream" );
   }

   // Averaging the force/torque over two time steps is said to damp oscillations of the interaction force/torque.
   // See Ladd - " Numerical simulations of particulate suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation", 1994, p. 302
   if( averageForceTorqueOverTwoTimSteps ) {

      // store force/torque from hydrodynamic interactions in container1
      timeloop.addFuncAfterTimeStep(storeForceTorqueInCont1, "Force Storing");

      // set force/torque from previous time step (in container2)
      timeloop.addFuncAfterTimeStep(setForceTorqueOnBodiesFromCont2, "Force setting");

      // average the force/torque by scaling it with factor 1/2 (except in first timestep, there it is 1, which it is initially)
      timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesScaler(blocks, bodyStorageID, real_t(0.5)), "Force averaging");
      timeloop.addFuncAfterTimeStep( setForceScalingFactorToHalf, "Force scaling adjustment" );

      // swap containers
      timeloop.addFuncAfterTimeStep( pe_coupling::BodyContainerSwapper( bodiesFTContainer1, bodiesFTContainer2 ), "Swap FT container" );
   }

   // buoyancy-corrected gravitational force, applied every time step
   real_t sphereVolume = diameter * diameter * diameter * math::pi / real_t(6);
   Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityRatio - real_t(1)) * gravitationalAcceleration * sphereVolume );
   timeloop.addFuncAfterTimeStep(pe_coupling::ForceOnBodiesAdder( blocks, bodyStorageID, gravitationalForce ), "Gravitational force" );

   if( fixBodies ) {
      // reset all forces
      timeloop.addFuncAfterTimeStep( pe_coupling::ForceTorqueOnBodiesResetter(blocks, bodyStorageID), "Force Resetting");
   } else{
      // add pe timesteps
      timeloop.addFuncAfterTimeStep( pe_coupling::TimeStep( blocks, bodyStorageID, cr, syncCall, real_t(1), numPeSubCycles ), "pe Time Step" );
   }

   timeloop.addFuncAfterTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );

   ////////////////////////
   // EXECUTE SIMULATION //
   ////////////////////////

   WcTimingPool timeloopTiming;

   // names of the timers that form the evaluated workload categories;
   // several timer names per category are summed up (e.g. split vs fused stream/collide)
   std::vector< std::vector<std::string> > timerKeys;
   std::vector<std::string> LBMTimer;
   LBMTimer.emplace_back("Stream&Collide");
   LBMTimer.emplace_back("Stream");
   LBMTimer.emplace_back("Collide");
   timerKeys.push_back(LBMTimer);

   std::vector<std::string> bhTimer;
   bhTimer.emplace_back("Boundary Handling");
   timerKeys.push_back(bhTimer);

   std::vector<std::string> couplingTimer1;
   couplingTimer1.emplace_back("Body Mapping");
   std::vector<std::string> couplingTimer2;
   couplingTimer2.emplace_back("PDF Restore");
   timerKeys.push_back(couplingTimer1);
   timerKeys.push_back(couplingTimer2);

   std::vector<std::string> peTimer;
   peTimer.emplace_back("Simulation Step.Collision Detection");
   peTimer.emplace_back("Simulation Step.Collision Response Integration");
   peTimer.emplace_back("Simulation Step.Collision Response Resolution.Collision Response Solving");
   timerKeys.push_back(peTimer);

   // per-sample counters (initialized here instead of being declared uninitialized)
   uint_t numCells = uint_t(0);
   uint_t numFluidCells = uint_t(0);
   uint_t numNBCells = uint_t(0);
   uint_t numLocalParticles = uint_t(0);
   uint_t numShadowParticles = uint_t(0);
   uint_t numContacts = uint_t(0);

   std::vector<real_t> timings(timerKeys.size());

   resetTimers(timeloopTiming, timingTreePE);

   // every rank writes its own file -> numProcesses number of samples!
   int myRank = MPIManager::instance()->rank();

   std::string logFileName = baseFolder + "/load";
   logFileName += "_settling";
   if( useEllipsoids)
   {
      logFileName += "_ellipsoids";
   }
   else
   {
      logFileName += "_spheres";
   }
   logFileName += "_d" + std::to_string(int_c(std::ceil(diameter)));
   logFileName += "_bs" + std::to_string(blockSize);
   logFileName += "_" + std::to_string(myRank) + ".txt";

   std::ofstream file;
   if(!noFileOutput)
   {
      WALBERLA_LOG_INFO_ON_ROOT("Writing load info to file " << logFileName);
      file.open( logFileName.c_str(), std::ofstream::app );
      file << "# svf = " << solidVolumeFraction << ", d = " << diameter << ", domain = " << XCells << "x" << YCells << "x" << ZCells << "\n";
   }

   // skip the first time steps to exclude start-up effects from the workload samples
   auto timeStepOfFirstTiming = uint_t(50);

   if( timesteps - timeStepOfFirstTiming < numSamples )
   {
      WALBERLA_LOG_WARNING_ON_ROOT("Less actual time steps than number of required samples!");
   }

   uint_t nSample( 0 ); // number of current sample
   real_t samplingFrequency = real_c(timesteps - timeStepOfFirstTiming) / real_c(numSamples);

   // time loop
   for (uint_t i = 1; i <= timesteps; ++i )
   {
      // perform a single simulation step
      timeloop.singleStep( timeloopTiming );

      // check if current time step should be included in sample
      if( i >= uint_c( samplingFrequency * real_c(nSample) ) + timeStepOfFirstTiming )
      {
         // include -> evaluate all timers and quantities
         evaluateFluidQuantities(blocks, boundaryHandlingID, numCells, numFluidCells, numNBCells);
         evaluatePEQuantities(blocks, bodyStorageID, cr, numLocalParticles, numShadowParticles, numContacts);

         evaluateTimers(timeloopTiming, timingTreePE, timerKeys, timings);

         if(!noFileOutput)
         {
            real_t totalTime = std::accumulate(timings.begin(), timings.end(), real_t(0) );
            file << timeloop.getCurrentTimeStep() << " " << real_c(timeloop.getCurrentTimeStep()) / Tref << " "
                 << numCells << " " << numFluidCells << " " << numNBCells << " "
                 << numLocalParticles << " " << numShadowParticles << " " << numContacts << " " << numPeSubCycles;
            for (real_t timing : timings) {
               file << " " << timing;
            }
            file << " " << totalTime << "\n";
         }

         // reset the counters for the next sample
         numCells = uint_t(0);
         numFluidCells = uint_t(0);
         numNBCells = uint_t(0);
         numLocalParticles = uint_t(0);
         numShadowParticles = uint_t(0);
         numContacts = uint_t(0);

         ++nSample;
      }

      // reset timers to always include only a single time step in them
      resetTimers(timeloopTiming, timingTreePE);
   }

   if(!noFileOutput) {
      file.close();
   }

   //timeloopTiming.logResultOnRoot();

   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished!");

   return 0;
}
} //namespace workload_evaluation
// Global entry point: forward the exit code of the application instead of silently discarding it.
int main( int argc, char **argv ){
   return workload_evaluation::main(argc, argv);
}
\ No newline at end of file
# Benchmark sub-projects; each directory provides its own CMakeLists.txt.
# (fix: Percolation entry now follows the file's "add_subdirectory( X )" spacing convention)
add_subdirectory( AdaptiveMeshRefinementFluidParticleCoupling )
add_subdirectory( CNT )
add_subdirectory( ComplexGeometry )
add_subdirectory( DEM )
add_subdirectory( MeshDistance )
add_subdirectory( CouetteFlow )
add_subdirectory( FreeSurfaceAdvection )
add_subdirectory( FluidizedBed )
add_subdirectory( FluidParticleCoupling )
add_subdirectory( FluidParticleCouplingWithLoadBalancing )
add_subdirectory( ForcesOnSphereNearPlaneInShearFlow )
add_subdirectory( Percolation )
add_subdirectory( GranularGas )
add_subdirectory( IntegratorAccuracy )
add_subdirectory( LennardJones )
add_subdirectory( NonUniformGrid )
add_subdirectory( MotionSingleHeavySphere )
add_subdirectory( PoiseuilleChannel )
add_subdirectory( ProbeVsExtraMessage )
add_subdirectory( SchaeferTurek )
add_subdirectory( UniformGrid )
\ No newline at end of file
# Always-built benchmark
add_subdirectory( UniformGrid )

# The following benchmarks are only added when the Python interface is enabled
if ( WALBERLA_BUILD_WITH_PYTHON )
   add_subdirectory( FieldCommunication )

   # benchmarks that additionally rely on code generation
   if ( WALBERLA_BUILD_WITH_CODEGEN )
      add_subdirectory( UniformGridCPU )
      add_subdirectory( PhaseFieldAllenCahn )
      add_subdirectory( NonUniformGridCPU )
      add_subdirectory( TurbulentChannel )
   endif()

   # GPU variants additionally require device support
   if ( WALBERLA_BUILD_WITH_CODEGEN AND WALBERLA_BUILD_WITH_GPU_SUPPORT )
      add_subdirectory( UniformGridGPU )
      add_subdirectory( NonUniformGridGPU )
   endif()
endif()