diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad983d0599758187c7b207bf6bee094acc0f26fd..e3b808b8d12c0c708bef8d1fbd1c280aab9a62e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1255,28 +1255,34 @@ endif()
 ##  Half precision
 ##
 ############################################################################################################################
-if (WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT)
-    if (WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG)
-        message(STATUS "Configuring with *experimental* half precision (float16) support. You better know what you are doing.")
-        if (WALBERLA_CXX_COMPILER_IS_GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0.0)
-            message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
-                    "Half precision support for gcc has only been tested with version >= 12. "
-                    "You are using a previous version - it may not work correctly.")
-        endif ()
-        if (WALBERLA_CXX_COMPILER_IS_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0)
-            message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
-                    "Half precision support for clang has only been tested with version >= 15. "
-                    "You are using a previous version - it may not work correctly.")
-        endif ()
-        if (NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST)
-            message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
-                    "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!")
-        endif ()
-    else ()
-        message(FATAL_ERROR "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
-                "Half precision support is currently only available for gcc and clang.")
-    endif ()
-endif ()
+if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
+   ### Compiler requirements:
+   ### Within this project, there are several checks to ensure that the template parameter 'ValueType'
+   ### is a floating point number. The check is_floating_point<ValueType> is done primarily in our MPI implementation.
+   ### The IEE 754 floating type format _Float16, evaluates to true only if your compiler supports the
+   ### open C++23 standard P1467R9 (Extended floating-point types and standard names).
+   ### Compare:
+   ###  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1467r9.html
+   ###
+   ### Right now (18.12.2023) this is the case only for gcc13.
+   ### For more information see:
+   ###   https://gcc.gnu.org/projects/cxx-status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names
+   ###   https://clang.llvm.org/cxx_status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names
+
+   try_compile( WALBERLA_SUPPORT_HALF_PRECISION "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/TestFloat16.cpp"
+         CXX_STANDARD 23 OUTPUT_VARIABLE TRY_COMPILE_OUTPUT )
+   ## message( STATUS ${TRY_COMPILE_OUTPUT} )
+   if ( NOT WALBERLA_SUPPORT_HALF_PRECISION )
+      message( FATAL_ERROR "Compiler: ${CMAKE_CXX_COMPILER} Version: ${CMAKE_CXX_COMPILER_VERSION} does not support half precision" )
+   endif ()
+
+   # Local host optimization
+   if ( NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST )
+      message( WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
+            "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!" )
+   endif () # Local host check
+
+endif () # Check if WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is set
 
 ############################################################################################################################
 # Documentation Generation
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
index 08e5de4d928d9a23e76104823721dfbb5d811f69..494424a562e25503c6b189fa5cf2d957fafca313 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
@@ -194,11 +194,11 @@ int main(int argc, char** argv)
 
       gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false);
       comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU));
-      auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); });
+      auto evenComm = std::function< void() >([&]() { comEven.communicate(); });
 
       gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false);
       comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU));
-      auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); });
+      auto oddComm = std::function< void() >([&]() { comODD.communicate(); });
 #else
       blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks);
       evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID));
diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
index de1a2c1db0cbd078a80879f964f6a046f98afc95..2b37ed6fb19797229ae5507f4d26f3f031491a87 100644
--- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
@@ -13,9 +13,9 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
 
 waLBerla_add_executable( NAME NonUniformGridGenerator
                          FILES NonUniformGridGenerator.cpp LdcSetup.h
-                         DEPENDS blockforest core domain_decomposition field geometry python_coupling vtk )
+                         DEPENDS blockforest core field python_coupling )
 
 
 waLBerla_add_executable( NAME NonUniformGridCPU
                          FILES NonUniformGridCPU.cpp LdcSetup.h
-                         DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated )
+                         DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
diff --git a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
index 1657e85e0e58dc0a1107ba62505e438c1821a1c1..070656cb23582a4da83f954c3e108aa70e69b315 100644
--- a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
@@ -14,12 +14,15 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \file LdcSetup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
 //! \author Frederik Hennig <frederik.hennig@fau.de>
 //
 //======================================================================================================================
+#pragma once
 
 #include "blockforest/SetupBlock.h"
 #include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
 
 #include "core/all.h"
 
@@ -45,14 +48,14 @@ class LDCRefinement
    {
       const AABB & domain = forest.getDomain();
 
-      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+      const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
 
-      AABB leftCorner( domain.xMin(), domain.yMax() - ySize, domain.zMin(),
-                      domain.xMin() + xSize, domain.yMax() + ySize, domain.zMax() );
+      const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
+                            domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
 
-      AABB rightCorner( domain.xMax() - xSize, domain.yMax() - ySize, domain.zMin(),
-                       domain.xMax(), domain.yMax(), domain.zMax() );
+      const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
+                             domain.xMax(), domain.yMin() + ySize, domain.zMax() );
 
       for(auto & block : forest)
       {
diff --git a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
index d6840007e14d5f5af685bb5b262c8bcfd6138d6e..f6b4e1ff3779f624c8fb9845425d9d6a86103ee9 100644
--- a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
@@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated
         NonUniformGridGPUBoundaryCollection.h
         NonUniformGridGPUInfoHeader.h)
 waLBerla_add_executable( NAME NonUniformGridGPU
-                         FILES NonUniformGridGPU.cpp
-                         DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
+                         FILES NonUniformGridGPU.cpp LdcSetup.h
+                         DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
diff --git a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
new file mode 100644
index 0000000000000000000000000000000000000000..238943a7daa9745054980e6011d46ef037ef27ec
--- /dev/null
+++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
@@ -0,0 +1,110 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file LdcSetup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/all.h"
+
+#include "field/FlagField.h"
+
+#include "field/FlagUID.h"
+
+using namespace walberla;
+using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+using FlagField_T          = FlagField< uint8_t >;
+
+class LDCRefinement
+{
+ private:
+   const uint_t refinementDepth_;
+
+ public:
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+
+   void operator()(SetupBlockForest& forest) const
+   {
+      const AABB & domain = forest.getDomain();
+
+      const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+
+      const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
+                             domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
+
+      const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
+                              domain.xMax(), domain.yMin() + ySize, domain.zMax() );
+
+      for(auto & block : forest)
+      {
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         {
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
+         }
+      }
+   }
+};
+
+class LDC
+{
+ private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID noSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+
+ public:
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+
+   RefinementSelectionFunctor refinementSelector() const
+   {
+      return LDCRefinement(refinementDepth_);
+   }
+
+   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
+   {
+      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
+      {
+         auto& b           = dynamic_cast< Block& >(*bIt);
+         const uint_t level       = b.getLevel();
+         auto flagField     = b.getData< FlagField_T >(flagFieldID);
+         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
+         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
+         {
+            const Cell localCell = cIt.cell();
+            Cell globalCell(localCell);
+            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
+            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
+            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
+                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
+            {
+               flagField->addFlag(localCell, noslipFlag);
+            }
+         }
+      }
+   }
+};
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index 8110dbbb4437b8d3c56d9b855de501fd55521454..919755d6d7cd481dd90a2f1310965e0c6947432c 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -19,7 +19,6 @@
 //======================================================================================================================
 
 #include "blockforest/Initialization.h"
-#include "blockforest/SetupBlockForest.h"
 #include "blockforest/loadbalancing/StaticCurve.h"
 
 #include "core/Environment.h"
@@ -54,6 +53,7 @@
 
 #include <cmath>
 
+#include "LdcSetup.h"
 #include "NonUniformGridGPUInfoHeader.h"
 using namespace walberla;
 
@@ -69,82 +69,6 @@ using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField
 using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection;
 
 using gpu::communication::NonUniformGPUScheme;
-using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
-
-class LDCRefinement
-{
- private:
-   const uint_t refinementDepth_;
-
- public:
-   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
-
-   void operator()(SetupBlockForest& forest) const
-   {
-      const AABB & domain = forest.getDomain();
-
-      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
-
-      AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
-                       domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
-
-      AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
-                        domain.xMax(), domain.yMin() + ySize, domain.zMax() );
-
-      for(auto & block : forest)
-      {
-         auto & aabb = block.getAABB();
-         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
-         {
-            if( block.getLevel() < refinementDepth_)
-               block.setMarker( true );
-         }
-      }
-   }
-};
-
-class LDC
-{
- private:
-   const std::string refinementProfile_;
-   const uint_t refinementDepth_;
-
-   const FlagUID noSlipFlagUID_;
-   const FlagUID ubbFlagUID_;
-
- public:
-   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
-
-   RefinementSelectionFunctor refinementSelector() const
-   {
-      return LDCRefinement(refinementDepth_);
-   }
-
-   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
-   {
-      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
-      {
-         auto& b           = dynamic_cast< Block& >(*bIt);
-         const uint_t level       = b.getLevel();
-         auto flagField     = b.getData< FlagField_T >(flagFieldID);
-         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
-         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
-         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
-         {
-            const Cell localCell = cIt.cell();
-            Cell globalCell(localCell);
-            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
-            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
-            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
-                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
-            {
-               flagField->addFlag(localCell, noslipFlag);
-            }
-         }
-      }
-   }
-};
 
 namespace {
 void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) {
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index ccfcecacfb5c0f5a79b56aea13409cc3da4d2748..34bc6caa92b92d5239c9ca1409660b062247a469 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -26,7 +26,7 @@ class Scenario:
             },
             'Parameters': {
                 'omega': 1.95,
-                'timesteps': 10001,
+                'timesteps': 30001,
 
                 'refinementDepth': self.refinement_depth,
                 'writeSetupForestAndReturn': False,
@@ -37,7 +37,7 @@ class Scenario:
 
                 'remainingTimeLoggerFrequency': 3,
 
-                'vtkWriteFrequency': 5000,
+                'vtkWriteFrequency': 10000,
             },
             'Logging': {
                 'logLevel': "info",
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
index c8992a65afb93fa7dae572959a654651f14aabde..2a59e6be99b49942a169d1c24921a93cbe8abd1e 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
@@ -18,7 +18,6 @@
 //
 //======================================================================================================================
 #include "blockforest/Initialization.h"
-#include "blockforest/communication/UniformDirectScheme.h"
 
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
@@ -66,7 +65,7 @@ typedef gpu::GPUField< real_t > GPUField;
 
 int main(int argc, char** argv)
 {
-   mpi::Environment env(argc, argv);
+   const mpi::Environment env(argc, argv);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
    gpu::selectDeviceBasedOnMpiRank();
 #endif
@@ -92,14 +91,14 @@ int main(int argc, char** argv)
 
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       // CPU fields
-      BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
+      const BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
       BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
       // GPU fields
-      BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
-      BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1);
-      BlockDataID vel_field_gpu =
+      const BlockDataID vel_field_gpu =
          gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
       BlockDataID phase_field_gpu =
          gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
@@ -215,15 +214,15 @@ int main(int argc, char** argv)
       auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       auto normalTimeStep = [&]() {
-         Comm_velocity_based_distributions->startCommunication(defaultStream);
+         Comm_velocity_based_distributions->startCommunication();
          for (auto& block : *blocks)
             phase_field_LB_step(&block, defaultStream);
-         Comm_velocity_based_distributions->wait(defaultStream);
+         Comm_velocity_based_distributions->wait();
 
-         Comm_phase_field_distributions->startCommunication(defaultStream);
+         Comm_phase_field_distributions->startCommunication();
          for (auto& block : *blocks)
             hydro_LB_step(&block, defaultStream);
-         Comm_phase_field_distributions->wait(defaultStream);
+         Comm_phase_field_distributions->wait();
       };
       auto phase_only = [&]() {
          for (auto& block : *blocks)
diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
index fc229eb116f86ee6c967a8fba5a1948f7b73fc64..dfcd22a87e6942fb7ac2bc5789ac92fdd65fec9f 100644
--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
@@ -68,8 +68,8 @@ int main(int argc, char** argv)
 {
    const mpi::Environment env(argc, argv);
 
-   std::string input_filename(argv[1]);
-   bool inputIsPython = string_ends_with(input_filename, ".py");
+   const std::string input_filename(argv[1]);
+   const bool inputIsPython = string_ends_with(input_filename, ".py");
 
    for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
index c3076cb3e1f457ecae1f6a5090adac2314c34e97..21235056434f3daeebdbac212ec8de60d58810b4 100644
--- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
@@ -22,12 +22,12 @@ def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK
 
 
 ldc_setup = {'Border': [
-    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
     {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'T', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
 ]}
 
 
diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 66a5b0fa4f4a3588f36ba4dbd5feb732131f76d0..2607004f3749f366f8155a0dd200202f00e45867 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -25,7 +25,7 @@ foreach(streaming_pattern pull push aa esotwist)
 
             waLBerla_add_executable(NAME UniformGridGPU_${config}
                     FILES UniformGridGPU.cpp
-                    DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config})
+                    DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk UniformGridGPUGenerated_${config})
 
             # all configs are excluded from all except for pull d3q27.
             if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index ee022f457738fb6f8aa71f615441e9279fd25eca..fdc8969d626b866b978dfd1260565c50f96f01b8 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -167,22 +167,22 @@ int main(int argc, char** argv)
 
       if (timeStepStrategy == "noOverlap") {
          if (boundariesConfig){
-            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication")
+            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
                            << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
             timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");
          }else {
-            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication")
+            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");}
 
       } else if (timeStepStrategy == "simpleOverlap") {
          if (boundariesConfig){
-            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication")
+            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
                            << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
             timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
             timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame");
          }else{
-            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication")
+            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
             timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");}
@@ -240,14 +240,14 @@ int main(int argc, char** argv)
          WALBERLA_GPU_CHECK(gpuPeekAtLastError())
 
          timeLoop.setCurrentTimeStepToZero();
-         WcTimingPool const timeloopTiming;
+         WcTimingPool timeloopTiming;
          WcTimer simTimer;
 
          WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
          WALBERLA_GPU_CHECK( gpuPeekAtLastError() )
          WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
          simTimer.start();
-         timeLoop.run();
+         timeLoop.run(timeloopTiming);
          WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
          simTimer.end();
 
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
index e1972d914a5acc26ab54aaf3cb86c615ac4d3b77..74be4378e0e2acef0bcb3c36f0f6d64916bba6c8 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -26,12 +26,13 @@ BASE_CONFIG = {
 }
 
 ldc_setup = {'Border': [
-    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+
 ]}
 
 
@@ -55,7 +56,7 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
 
 
 class Scenario:
-    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1),
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1),
                  timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
                  inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
                  init_shear_flow=False, boundary_setup=False,
@@ -110,7 +111,11 @@ class Scenario:
                 'innerOuterSplit': self.inner_outer_split,
                 'vtkWriteFrequency': self.vtk_write_frequency,
                 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
+            },
+            'Logging': {
+                'logLevel': 'info',  # info progress detail tracing
             }
+
         }
         if self.boundary_setup:
             config_dict["Boundaries"] = ldc_setup
@@ -184,12 +189,14 @@ def overlap_benchmark():
     # no overlap
     scenarios.add(Scenario(time_step_strategy='noOverlap',
                            inner_outer_split=(1, 1, 1),
-                           cuda_enabled_mpi=cuda_enabled_mpi))
+                           cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1))
 
     for inner_outer_split in inner_outer_splits:
         scenario = Scenario(time_step_strategy='simpleOverlap',
                             inner_outer_split=inner_outer_split,
-                            cuda_enabled_mpi=cuda_enabled_mpi)
+                            cuda_enabled_mpi=cuda_enabled_mpi,
+                            outer_iterations=1)
         scenarios.add(scenario)
 
 
@@ -228,6 +235,7 @@ def single_gpu_benchmark():
                                 cuda_blocks=cuda_block_size,
                                 time_step_strategy='kernelOnly',
                                 timesteps=num_time_steps(block_size, 2000),
+                                outer_iterations=1,
                                 additional_info=additional_info)
             scenarios.add(scenario)
 
@@ -237,18 +245,18 @@ def validation_run():
     wlb.log_info_on_root("Validation run")
     wlb.log_info_on_root("")
 
-    time_step_strategy = "noOverlap"  # "noOverlap"
+    time_step_strategy = "noOverlap"  # "simpleOverlap"
 
     scenarios = wlb.ScenarioManager()
-    scenario = Scenario(cells_per_block=(64, 64, 64),
+    scenario = Scenario(cells_per_block=(128, 128, 128),
                         time_step_strategy=time_step_strategy,
-                        timesteps=1000,
+                        timesteps=10001,
                         outer_iterations=1,
                         warmup_steps=0,
                         init_shear_flow=False,
                         boundary_setup=True,
-                        vtk_write_frequency=0,
-                        remaining_time_logger_frequency=10)
+                        vtk_write_frequency=5000,
+                        remaining_time_logger_frequency=30)
     scenarios.add(scenario)
 
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
index 2800b98cb65008ef5d66aa98853fb5589087c8d5..9b883282a0437628203cfb04de660f0301b5758a 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
@@ -72,7 +72,7 @@ typedef gpu::GPUField< uint8_t > GPUField_int;
 
 int main(int argc, char** argv)
 {
-   mpi::Environment Env(argc, argv);
+   const mpi::Environment Env(argc, argv);
    gpu::selectDeviceBasedOnMpiRank();
    exportDataStructuresToPython();
 
@@ -114,17 +114,17 @@ int main(int argc, char** argv)
       BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
       BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
       // GPU fields
-      BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
-      BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1);
       BlockDataID vel_field_gpu =
          gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
       BlockDataID phase_field_gpu =
          gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
       // Flag field
-      BlockDataID flagFieldID     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
-      BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true);
+      const BlockDataID flagFieldID     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+      const BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true);
 
       auto physical_parameters     = config->getOneBlock("PhysicalParameters");
       const real_t density_liquid  = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0));
@@ -181,11 +181,11 @@ int main(int argc, char** argv)
                                                               interface_thickness);
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
 
-      pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu,
+      const pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu,
                                                           vel_field_gpu, mobility, interface_thickness, gpuBlockSize[0],
                                                           gpuBlockSize[1], gpuBlockSize[2]);
 
-      pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
+      const pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
                                               gravitational_acceleration, interface_thickness, density_liquid,
                                               density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas,
                                               gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
@@ -193,8 +193,8 @@ int main(int argc, char** argv)
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
-      int streamLowPriority  = 0;
-      int streamHighPriority = 0;
+      const int streamLowPriority  = 0;
+      const int streamHighPriority = 0;
       auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
       auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
 
@@ -204,20 +204,20 @@ int main(int argc, char** argv)
          make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
       UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
       auto Comm_velocity_based_distributions =
-         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(); });
       auto Comm_velocity_based_distributions_start =
-         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(); });
       auto Comm_velocity_based_distributions_wait =
-         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(); });
 
       auto UniformGPUSchemePhaseField =
          make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
       UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
-      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); });
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
       auto Comm_phase_field_start =
-         std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(defaultStream); });
-      auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(); });
+      auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(); });
 
       auto UniformGPUSchemePhaseFieldDistributions =
          make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
@@ -225,11 +225,11 @@ int main(int argc, char** argv)
          make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
       UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
       auto Comm_phase_field_distributions =
-         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
       auto Comm_phase_field_distributions_start =
-         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
       auto Comm_phase_field_distributions_wait =
-         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
 
       ////////////////////////
       // BOUNDARY HANDLING //
@@ -394,7 +394,7 @@ int main(int argc, char** argv)
                                                                                      targetRank, MPI_COMM_WORLD);
                   WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank)
                   {
-                     std::string path = "";
+                     const std::string path = "";
                      std::ostringstream out;
                      out << std::internal << std::setfill('0') << std::setw(6) << counter;
                      geometry::writeMesh(
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
index 1856106c5b61880752c7216ee10eabda140485b1..2dbeb9c8e2d82d28bc508c4ff215d6fa04bc2ca4 100644
--- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
@@ -203,9 +203,9 @@ int main(int argc, char** argv)
    // Communication
 #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT)
    const bool sendDirectlyFromGPU = false;
-   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU);
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU,  false);
    com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
-   auto communication = std::function< void() >([&]() { com.communicate(nullptr); });
+   auto communication = std::function< void() >([&]() { com.communicate(); });
 #else
    blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
    communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
diff --git a/cmake/FindOpenMesh.cmake b/cmake/FindOpenMesh.cmake
index dfe98d2dd3758f50fd042bb3d265e79131447ff0..4dbb7f7dd06fe02b5019b4a52e76287eafbc8651 100644
--- a/cmake/FindOpenMesh.cmake
+++ b/cmake/FindOpenMesh.cmake
@@ -15,7 +15,7 @@
 #===========================================================================
 #                                                                           
 #                               OpenMesh                                    
-#           Copyright (c) 2001-2015, RWTH-Aachen University                 
+#           Copyright (c) 2001-2023, RWTH-Aachen University                 
 #           Department of Computer Graphics and Multimedia                 
 #                          All rights reserved.                             
 #                            www.openmesh.org                               
@@ -53,7 +53,7 @@
 #                                                                            
 #===========================================================================
 
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.3.0)
 
 #if already found via finder or simulated finder in openmesh CMakeLists.txt, skip the search
 IF (NOT OPENMESH_FOUND) 
@@ -63,6 +63,13 @@ IF (NOT OPENMESH_FOUND)
     "${CMAKE_SOURCE_DIR}/OpenMesh/src/OpenMesh"
     "${CMAKE_SOURCE_DIR}/libs_required/OpenMesh/src/OpenMesh"
     "${CMAKE_SOURCE_DIR}/../OpenMesh/src/OpenMesh"
+    "C:/Program Files/OpenMesh 10.0"
+    "C:/Program Files/OpenMesh 9.0"
+    "C:/Program Files/OpenMesh 8.1"
+    "C:/Program Files/OpenMesh 8.0"
+    "C:/Program Files/OpenMesh 7.2"
+    "C:/Program Files/OpenMesh 7.1"
+    "C:/Program Files/OpenMesh 7.0"
     "C:/Program Files/OpenMesh 6.3"
     "C:/Program Files/OpenMesh 6.2"
     "C:/Program Files/OpenMesh 6.1"
@@ -81,6 +88,12 @@ IF (NOT OPENMESH_FOUND)
     "C:/Program Files/OpenMesh 2.4.1"
     "C:/Program Files/OpenMesh 2.4"
     "C:/Program Files/OpenMesh 2.0/include"
+    "C:/libs/OpenMesh 10.0"
+    "C:/libs/OpenMesh 9.0"
+    "C:/libs/OpenMesh 8.1"
+    "C:/libs/OpenMesh 8.0"
+    "C:/libs/OpenMesh 7.1"
+    "C:/libs/OpenMesh 7.0"
     "C:/libs/OpenMesh 6.3"
     "C:/libs/OpenMesh 6.2"
     "C:/libs/OpenMesh 6.1"
@@ -98,7 +111,7 @@ IF (NOT OPENMESH_FOUND)
     "C:/libs/OpenMesh 3.0"
     "C:/libs/OpenMesh 2.4.1"
     "C:/libs/OpenMesh 2.4"
-    "${OPENMESH_DIR}"
+    "${OPENMESH_LIBRARY_DIR}"
   )
 
   FIND_PATH (OPENMESH_INCLUDE_DIR OpenMesh/Core/Mesh/PolyMeshT.hh
diff --git a/cmake/TestFloat16.cpp b/cmake/TestFloat16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bae373fbb5af94c152c405f4572bdbb764a43c04
--- /dev/null
+++ b/cmake/TestFloat16.cpp
@@ -0,0 +1,7 @@
+#include <iostream>
+
+
+int main()
+{
+   static_assert(std::is_floating_point_v<_Float16>);
+}
\ No newline at end of file
diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py
index 16e5cb35f8ed64ae77eeda06c08812bbb57e950a..efd86fe92043777b69620b99d138fc1331a98580 100644
--- a/python/lbmpy_walberla/additional_data_handler.py
+++ b/python/lbmpy_walberla/additional_data_handler.py
@@ -3,7 +3,11 @@ from pystencils.stencil import inverse_direction
 from pystencils.typing import BasicType
 
 from lbmpy.advanced_streaming import AccessPdfValues, numeric_offsets, numeric_index, Timestep, is_inplace
-from lbmpy.advanced_streaming.indexing import MirroredStencilDirections
+# until lbmpy version 1.3.2 
+try:
+    from lbmpy.advanced_streaming.indexing import MirroredStencilDirections
+except ImportError:
+    from lbmpy.custom_code_nodes import MirroredStencilDirections
 from lbmpy.boundaries.boundaryconditions import LbBoundary
 from lbmpy.boundaries import ExtrapolationOutflow, FreeSlip, UBB
 
diff --git a/src/blockforest/Initialization.cpp b/src/blockforest/Initialization.cpp
index b91923eebd5b53156fbe3e916c8aeb84963a283b..d800a75b10a112edba39a6655c84bfad53cb1426 100644
--- a/src/blockforest/Initialization.cpp
+++ b/src/blockforest/Initialization.cpp
@@ -130,7 +130,7 @@ shared_ptr< StructuredBlockForest > createUniformBlockGridFromConfig( const Conf
                                               cell_idx_c(cells[1])-1,
                                               cell_idx_c(cells[2])-1 );
 
-      uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() );
+      const uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() );
 
       calculateCellDistribution( cells, nrOfProcesses, blocks, cellsPerBlock );
    }
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 099e0b5732a9d5b112600d150162f728bf8dcb8f..71bb209d70167ecd5ef056c61350269667029930 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -29,6 +29,15 @@ add_library( core )
 if( MPI_FOUND )
    target_link_libraries( core PUBLIC MPI::MPI_CXX )
 endif()
+
+if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
+    # Actual support for float16 is available only since C++23
+    #   before is_arithmetic and is_floating_point evaluated to false,
+    #   also many STL functions are compatible with float16 only since C++23.
+    # Which features are actually supported depend on the compiler
+    target_compile_features(core PUBLIC cxx_std_23)
+endif ()
+
 target_link_libraries( core PUBLIC ${SERVICE_LIBS} )
 target_sources( core
       PRIVATE
diff --git a/src/core/DataTypes.cpp b/src/core/DataTypes.cpp
index ead9f6fb680c12a053c9ebf4fe9fe9333c443027..0b9dcad1fe5e4c0e8973cf2f2719b003fb7134a0 100644
--- a/src/core/DataTypes.cpp
+++ b/src/core/DataTypes.cpp
@@ -26,6 +26,11 @@ namespace walberla {
 
 namespace real_comparison
 {
+   #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+//   const    bfloat16 Epsilon<    bfloat16 >::value = static_cast<    bfloat16 >(1e-2); // machine eps is 2^-7
+   const     float16 Epsilon<     float16 >::value = static_cast<     float16 >(1e-3); // machine eps is 2^-10
+   // Note, depending on the kind of float16 <bfloat, float16> another Epsilon must be used.
+   #endif
    const       float Epsilon<       float >::value = static_cast<       float >(1e-4);
    const      double Epsilon<      double >::value = static_cast<      double >(1e-8);
    const long double Epsilon< long double >::value = static_cast< long double >(1e-10);
diff --git a/src/core/DataTypes.h b/src/core/DataTypes.h
index bae5b7651eaa17bc67c9fe822eeb386de38f61ca..4e7c019a86dcf0ce23fb7c8a9e66e6add8500bde 100644
--- a/src/core/DataTypes.h
+++ b/src/core/DataTypes.h
@@ -175,21 +175,33 @@ using real_t = float;
 /// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
 ///
 #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
-#   if defined(WALBERLA_CXX_COMPILER_IS_CLANG) || defined(WALBERLA_CXX_COMPILER_IS_GNU)
-/// Clang version must be 15 or higher for x86 half precision support.
-/// GCC version must be 12 or higher for x86 half precision support.
-/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
+/// FIXME: (not really right) Clang version must be 15 or higher for x86 half precision support.
+/// FIXME: (not really right) GCC version must be 12 or higher for x86 half precision support.
+/// FIXME: (I don't know) Also support seems to require SSE, so ensure that respective instruction sets are enabled.
 /// See
 ///   https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
 ///   https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
 /// for more information.
+/// ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+/// Compiler requirements:
+/// Within this project, there are several checks to ensure that the template parameter 'ValueType'
+/// is a floating point number. The check is_floating_point<ValueType> is done primarily in our MPI implementation.
+/// The IEE 754 floating type format _Float16, evaluates to true only if your compiler supports the
+/// open C++23 standard P1467R9 (Extended floating-point types and standard names).
+/// Compare:
+///  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1467r9.html
+///
+/// Right now (18.12.2023) this is the case only for gcc13.
+/// For more information see:
+///   https://gcc.gnu.org/projects/cxx-status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names
+///   https://clang.llvm.org/cxx_status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names
+
 using half    = _Float16;
+// Note: there are two possible float16 formats.
+// The one used right now is the IEE 754 float16 standard, consisting of a 5 bit exponent and a 10 bit mantissa.
+// Another possible half precision format would be the one from Google Brain (bfloat16) with an 8 bit exponent and a 7 bit mantissa.
+// Compare https://i10git.cs.fau.de/ab04unyc/walberla/-/issues/23
 using float16 = half;
-#   else
-static_assert(false, "\n\n### Attempting to built walberla with half precision support.\n"
-                     "### However, the compiler you chose is not suited for that, or we simply have not implemented "
-                     "support for half precision and your compiler.\n");
-#   endif
 #endif
 using float32 = float;
 using float64 = double;
@@ -228,6 +240,10 @@ inline bool realIsIdentical( const real_t a, const real_t b )
 namespace real_comparison
 {
    template< class T > struct Epsilon;
+   #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+   using walberla::float16;
+   template<> struct Epsilon<     float16 > { static const     float16 value; };
+   #endif
    template<> struct Epsilon<       float > { static const       float value; };
    template<> struct Epsilon<      double > { static const      double value; };
    template<> struct Epsilon< long double > { static const long double value; };
@@ -254,6 +270,14 @@ inline bool floatIsEqual( float lhs, float rhs, const float epsilon = real_compa
    return std::fabs( lhs - rhs ) < epsilon;
 }
 
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+inline bool floatIsEqual( walberla::float16 lhs, walberla::float16 rhs, const walberla::float16 epsilon = real_comparison::Epsilon<walberla::float16>::value )
+{
+   const auto difference = lhs - rhs;
+   return ( (difference < 0) ? -difference : difference ) < epsilon;
+}
+#endif
+
 } // namespace walberla
 
 #define WALBERLA_UNUSED(x)  (void)(x)
diff --git a/src/core/Environment.h b/src/core/Environment.h
index 492467d52b62f266e3c348615369010da208d03b..61ef4f2b13db47c5f2ba7e3bed11cc95c43980e9 100644
--- a/src/core/Environment.h
+++ b/src/core/Environment.h
@@ -44,9 +44,8 @@ public:
    *
    * \param argc,argv  If command line parameters are present they have to contain at least
    *                   the path to the configuration file and optionally pairs of param-value:
-   *                   "-myParameter myValue"
-   *                   These values are then substituted in configuration files at
-   *                   positions marked with $(myParameter)
+   *                   "-blockName.parameterName=parameterValue"
+   *                   These values are then substituted in configuration files.
    *                   It is also possible to pass no command line options (see description below )
    *
    * If command line arguments are present the constructor initializes static objects
diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h
index 51ab22e26ed3b9e38a858ab8040c39325751ff12..eecee3136c702a61e7987d25705076a7c780a635 100644
--- a/src/core/mpi/MPIWrapper.h
+++ b/src/core/mpi/MPIWrapper.h
@@ -353,6 +353,9 @@ WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned short int , MPI_UNSIGNED_SHORT
 WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned int       , MPI_UNSIGNED           );
 WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long int  , MPI_UNSIGNED_LONG      );
 WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long long , MPI_UNSIGNED_LONG_LONG );
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+   WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( walberla::float16  , MPI_WCHAR              );
+#endif
 WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( float              , MPI_FLOAT              );
 WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( double             , MPI_DOUBLE             );
 WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( long double        , MPI_LONG_DOUBLE        );
diff --git a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h
index 38a97d82451d6fc114541f892aa023660c3a9b02..c3ae21b55cd3709ff28bdf7012151d51b6b311eb 100644
--- a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h
+++ b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h
@@ -65,7 +65,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block
    BoundarySetter<Handling> boundarySetter;
    boundarySetter.setConfigBlock( blockHandle );
 
-   std::string directionStr = blockHandle.getParameter<std::string>( "direction" );
+   const std::string directionStr = blockHandle.getParameter<std::string>( "direction" );
    cell_idx_t  wallDistance            = blockHandle.getParameter<cell_idx_t>( "walldistance", 0 );
    cell_idx_t  ghostLayersToInitialize = blockHandle.getParameter<cell_idx_t>( "ghostLayersToInitialize", std::numeric_limits<cell_idx_t>::max() );
 
@@ -75,8 +75,8 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block
    using stencil::D3Q7;
    for( auto dirIt = D3Q7::beginNoCenter(); dirIt != D3Q7::end(); ++dirIt )
    {
-      bool isAll = string_icompare( directionStr, "all" ) == 0;
-      bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(),
+      const bool isAll = string_icompare( directionStr, "all" ) == 0;
+      const bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(),
                                              stencil::dirToString[*dirIt] ) != directionStrings.end();
 
       if( isAll || isInDirectionStrings )
@@ -87,7 +87,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block
    }
 
    if ( ! atLeastOneBoundarySet )
-      WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B ");
+      WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B ")
 }
 
 
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index 093ec4cad2a830a80042073f905bb1c7316bf8ae..745d28cc5f18e0df1ce6eeeda0cfbf5d478656ee 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -307,6 +307,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
       for (auto it : headers_[EQUAL_LEVEL][index])
          bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear();
 
+   // wait until communication dependent kernels are finished
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
    // Start filling send buffers
    for (auto& iBlock : *forest)
    {
@@ -397,6 +400,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
       for (auto it : headers_[COARSE_TO_FINE][index])
          bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear();
 
+   // wait until communication dependent kernels are finished
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
    // Start filling send buffers
    for (auto& iBlock : *forest)
    {
@@ -431,7 +437,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
                {
                   WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
                   WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
-                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]);
+                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr);
                }
             }
             else
@@ -500,6 +506,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
       for (auto it : headers_[FINE_TO_COARSE][index])
          bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear();
 
+   // wait until communication dependent kernels are finished
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
    // Start filling send buffers
    for (auto& iBlock : *forest)
    {
@@ -532,7 +541,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
             {
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
                WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
-               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]);
+               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr);
             }
          }
          else
diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h
index 5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5..bc481d8950c25d4aa5196316c641e8b67e34318a 100644
--- a/src/gpu/communication/UniformGPUScheme.h
+++ b/src/gpu/communication/UniformGPUScheme.h
@@ -18,7 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
 #pragma once
 
 #include "blockforest/StructuredBlockForest.h"
@@ -32,9 +31,7 @@
 
 #include <thread>
 
-#include "gpu/GPURAII.h"
 #include "gpu/GPUWrapper.h"
-#include "gpu/ParallelStreams.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedGPUPackInfo.h"
 
@@ -49,29 +46,34 @@ namespace communication {
    class UniformGPUScheme
    {
    public:
-       explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
-                                  bool sendDirectlyFromGPU = false,
-                                  bool useLocalCommunication = true,
+       explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
+                                  const bool sendDirectlyFromGPU = false,
+                                  const bool useLocalCommunication = true,
                                   const int tag = 5432 );
 
-       explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
-                                 const Set<SUID> & requiredBlockSelectors,
-                                 const Set<SUID> & incompatibleBlockSelectors,
-                                 bool sendDirectlyFromGPU = false,
-                                 bool useLocalCommunication = true,
-                                 const int tag = 5432 );
+       explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
+                                  const Set<SUID> & requiredBlockSelectors,
+                                  const Set<SUID> & incompatibleBlockSelectors,
+                                  const bool sendDirectlyFromGPU = false,
+                                  const bool useLocalCommunication = true,
+                                  const int tag = 5432 );
+       ~UniformGPUScheme()
+       {
+          for (uint_t i = 0; i < Stencil::Q; ++i)
+             WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i]))
+       }
 
        void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
-       void startCommunication( gpuStream_t stream = nullptr);
-       void wait( gpuStream_t stream = nullptr);
+       void startCommunication();
+       void wait();
 
-       void operator()( gpuStream_t stream = nullptr )         { communicate( stream ); }
-       inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); }
+       void operator()()         { communicate( ); }
+       inline void communicate() { startCommunication(); wait(); }
 
-       std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr );
-       std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr );
-       std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr );
+       std::function<void()> getCommunicateFunctor();
+       std::function<void()> getStartCommunicateFunctor();
+       std::function<void()> getWaitFunctor();
 
    private:
        void setupCommunication();
@@ -81,8 +83,8 @@ namespace communication {
 
        bool setupBeforeNextCommunication_;
        bool communicationInProgress_;
-       bool sendFromGPU_;
-       bool useLocalCommunication_;
+       const bool sendFromGPU_;
+       const bool useLocalCommunication_;
 
        using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer;
        using GpuBuffer_T = gpu::communication::GPUMemoryBuffer;
@@ -92,8 +94,6 @@ namespace communication {
 
        std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;
 
-       ParallelStreams parallelSectionManager_;
-
        struct Header
        {
            BlockID blockId;
@@ -103,6 +103,8 @@ namespace communication {
 
        Set<SUID> requiredBlockSelectors_;
        Set<SUID> incompatibleBlockSelectors_;
+
+       gpuStream_t streams_[Stencil::Q];
    };
 
 
diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h
index 93f6dd85e0e3f44293b9943e1bf252ce52c6ad33..84d9e0f22dd5661d1d428525d3758a5bb9a29488 100644
--- a/src/gpu/communication/UniformGPUScheme.impl.h
+++ b/src/gpu/communication/UniformGPUScheme.impl.h
@@ -19,10 +19,6 @@
 //
 //======================================================================================================================
 
-#include "core/mpi/MPIWrapper.h"
-
-#include "gpu/ParallelStreams.h"
-
 namespace walberla {
 namespace gpu
 {
@@ -30,9 +26,9 @@ namespace communication {
 
 
    template<typename Stencil>
-   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
-                                                bool sendDirectlyFromGPU,
-                                                bool useLocalCommunication,
+   UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
+                                                const bool sendDirectlyFromGPU,
+                                                const bool useLocalCommunication,
                                                 const int tag )
         : blockForest_( bf ),
           setupBeforeNextCommunication_( true ),
@@ -41,7 +37,6 @@ namespace communication {
           useLocalCommunication_(useLocalCommunication),
           bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
           bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-          parallelSectionManager_( -1 ),
           requiredBlockSelectors_( Set<SUID>::emptySet() ),
           incompatibleBlockSelectors_( Set<SUID>::emptySet() )
    {
@@ -52,14 +47,17 @@ namespace communication {
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }
+
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+         WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
    }
 
    template<typename Stencil>
-   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+   UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
                                                 const Set<SUID> & requiredBlockSelectors,
                                                 const Set<SUID> & incompatibleBlockSelectors,
-                                                bool sendDirectlyFromGPU,
-                                                bool useLocalCommunication,
+                                                const bool sendDirectlyFromGPU,
+                                                const bool useLocalCommunication,
                                                 const int tag )
       : blockForest_( bf ),
         setupBeforeNextCommunication_( true ),
@@ -68,7 +66,6 @@ namespace communication {
         useLocalCommunication_(useLocalCommunication),
         bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
         bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-        parallelSectionManager_( -1 ),
         requiredBlockSelectors_( requiredBlockSelectors ),
         incompatibleBlockSelectors_( incompatibleBlockSelectors )
    {
@@ -78,11 +75,14 @@ namespace communication {
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }
+
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+         WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
    }
 
 
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream )
+   void UniformGPUScheme<Stencil>::startCommunication( )
    {
       WALBERLA_ASSERT( !communicationInProgress_ )
       auto forest = blockForest_.lock();
@@ -102,9 +102,11 @@ namespace communication {
          for( auto it : headers_ )
             bufferSystemGPU_.sendBuffer( it.first ).clear();
 
+      // wait until communication dependent kernels are finished
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
       // Start filling send buffers
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto &iBlock : *forest )
          {
             auto senderBlock = dynamic_cast< Block * >( &iBlock );
@@ -127,7 +129,7 @@ namespace communication {
                   auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
                   for (auto& pi : packInfos_)
                   {
-                     pi->communicateLocal(*dir, senderBlock, receiverBlock, stream);
+                     pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]);
                   }
                }
                else
@@ -136,26 +138,27 @@ namespace communication {
 
                   for( auto &pi : packInfos_ )
                   {
-                     parallelSection.run([&](auto s) {
                      auto size = pi->size( *dir, senderBlock );
                      auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
                      WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                     pi->pack( *dir, gpuDataPtr, senderBlock, s );
+                     pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] );
 
                      if( !sendFromGPU_ )
                      {
                         auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
                         WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
-                        WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s ))
+                        WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] ))
                      }
-                     });
                   }
                }
             }
          }
       }
       // wait for packing to finish
-      WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) );
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+      }
 
       if( sendFromGPU_ )
          bufferSystemGPU_.sendAll();
@@ -167,7 +170,7 @@ namespace communication {
 
 
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::wait( gpuStream_t stream )
+   void UniformGPUScheme<Stencil>::wait()
    {
       WALBERLA_ASSERT( communicationInProgress_ )
 
@@ -175,7 +178,6 @@ namespace communication {
 
       if( sendFromGPU_ )
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
          {
             recvInfo.buffer().clear();
@@ -188,16 +190,13 @@ namespace communication {
                   auto size = pi->size( header.dir, block );
                   auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                  parallelSection.run([&](auto s) {
-                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-                  });
+                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] );
                }
             }
          }
       }
       else
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
          {
             auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());
@@ -214,17 +213,17 @@ namespace communication {
                   auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
                   WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                  parallelSection.run([&](auto s) {
                      WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                           gpuMemcpyHostToDevice, s ))
-                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-                  });
+                                                           gpuMemcpyHostToDevice, streams_[header.dir] ))
+                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] );
                }
             }
          }
       }
-
-      WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+      }
       communicationInProgress_ = false;
    }
 
@@ -312,21 +311,21 @@ namespace communication {
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor()
    {
-      return [this, stream]() { communicate( stream ); };
+      return [this]() { communicate( ); };
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor()
    {
-      return [this, stream]() { startCommunication( stream ); };
+      return [this]() { startCommunication(); };
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor()
    {
-      return [this, stream]() { wait( stream ); };
+      return [this]() { wait(); };
    }
 
 } // namespace communication
diff --git a/src/lbm_generated/evaluation/PerformanceEvaluation.h b/src/lbm_generated/evaluation/PerformanceEvaluation.h
index 9fb7e934a2506ca360af12882a0775bcf8281eb6..36f112ac3360f1125fcc11733c8ab56955daceb0 100644
--- a/src/lbm_generated/evaluation/PerformanceEvaluation.h
+++ b/src/lbm_generated/evaluation/PerformanceEvaluation.h
@@ -24,6 +24,7 @@
 #include "core/DataTypes.h"
 #include "core/Hostname.h"
 #include "core/Set.h"
+#include "core/OpenMP.h"
 #include "core/waLBerlaBuildInfo.h"
 #include "core/debug/CheckFunctions.h"
 #include "core/logging/Logging.h"
@@ -223,10 +224,17 @@ PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::PerformanceEvalu
      fluidCells_( fluidCellCounter )
 {
 #ifdef _OPENMP
-   if( std::getenv( "OMP_NUM_THREADS" ) == NULL )
-      WALBERLA_ABORT( "If you are using a version of the program that was compiled with OpenMP you have to "
-                      "specify the environment variable \'OMP_NUM_THREADS\' accordingly!" );
-   threadsPerProcess_ = std::atoi( std::getenv( "OMP_NUM_THREADS" ) );
+   if( std::getenv( "OMP_NUM_THREADS" ) )
+   {
+      threadsPerProcess_ = std::atoi( std::getenv( "OMP_NUM_THREADS" ) );
+   }
+   else
+   {
+      WALBERLA_LOG_WARNING( "You are using a version of the program that was compiled with OpenMP and the environment "
+                           "variable \'OMP_NUM_THREADS\' was not set!\nThe number of threads will now determined with "
+                           "\'omp_get_max_threads\'.\nIf this not correct in your case please set \'OMP_NUM_THREADS\'" );
+      threadsPerProcess_ =  omp_get_max_threads();
+   }
 #endif
 
    if( std::getenv( "THREADS_PER_CORE" ) )
@@ -256,7 +264,7 @@ std::string PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::logg
 {
    std::ostringstream oss;
 
-   std::string na( "n/a *)" );
+   const std::string na( "n/a *)" );
 
    std::ostringstream threadsPerCoreString;
    threadsPerCoreString << threadsPerCore_;
diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
index 5bb43c3c874e160253dd096e87ee8c50e2aa08b3..6665cf3513df16ca09e7c0cfc7c66c35639b1869 100644
--- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
+++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h
@@ -243,7 +243,7 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio
    for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){
       uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it);
       // Propagate on ghost layers shadowing coarse or no blocks
-      if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){
+      if(block->neighborhoodSectionHasLargerBlock(nSecIdx)){
          CellInterval ci;
          pdfField->getGhostRegion(*it, ci, 1);
          sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci, gpuStream);
diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt
index 46b98eb48c90884feda14e4dde15341b4b5f3687..8d3f0298ac3bf387dfb19d18b46bfaff8caa6912 100644
--- a/tests/core/CMakeLists.txt
+++ b/tests/core/CMakeLists.txt
@@ -222,3 +222,17 @@ if( WALBERLA_BUILD_WITH_PARMETIS )
    waLBerla_compile_test( NAME PlainParMetisTest FILES load_balancing/PlainParMetisTest.cpp )
    waLBerla_execute_test( NAME PlainParMetisTest PROCESSES 3 )
 endif()
+
+###################
+# Mixed Precision #
+###################
+
+if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT )
+   waLBerla_compile_test( Name Float16SupportTest FILES Float16SupportTest.cpp DEPENDS core)
+   # Actual support for float16 is available only since C++23
+   #   before is_arithmetic and is_floating_point evaluated to false,
+   #   also many STL functions are compatible with float16 only since C++23.
+   # Which features are actually supported depend on the compiler
+   target_compile_features( Float16SupportTest PUBLIC cxx_std_23 )
+   waLBerla_execute_test(NAME Float16SupportTest)
+endif ()
\ No newline at end of file
diff --git a/tests/core/Float16SupportTest.cpp b/tests/core/Float16SupportTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04ea9378f54eee4ee78f81177fc609d732da21c5
--- /dev/null
+++ b/tests/core/Float16SupportTest.cpp
@@ -0,0 +1,163 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Float16SupportTest.cpp
+//! \ingroup core
+//! \author Michael Zikeli <michael.zikeli@fau.de>
+//
+//======================================================================================================================
+
+#include <memory>
+#include <numeric>
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/logging/Logging.h"
+
+namespace walberla::simple_Float16_test {
+using walberla::floatIsEqual;
+using walberla::real_t;
+using walberla::uint_c;
+using walberla::uint_t;
+
+// === Choosing Accuracy ===
+//+++ Precision : fp16 +++
+using walberla::float16;
+using walberla::float32;
+using walberla::float64;
+using dst_t                         = float16;
+using src_t                         = real_t;
+constexpr real_t     precisionLimit = walberla::float16( 1e-3 );
+const std::string    precisionType  = "float16";
+constexpr const auto maxLevel       = uint_t( 3 );
+
+void simple_array_test()
+{
+   auto fpSrc = std::make_shared< src_t[] >( 10 );
+   auto fpDst = std::make_shared< dst_t[] >( 10 );
+
+   std::fill_n( fpSrc.get(), 10, 17. );
+   std::fill_n( fpDst.get(), 10, (dst_t) 17. );
+
+   fpSrc[5] = 8.;
+   fpDst[5] = (dst_t) 8.;
+
+   // Test equality with custom compare
+   WALBERLA_CHECK_LESS( std::fabs( fpSrc[9] - (src_t) fpDst[9] ), precisionLimit );
+   WALBERLA_CHECK_LESS( std::fabs( fpSrc[5] - (src_t) fpDst[5] ), precisionLimit );
+   // Test specialized floatIsEqual
+   WALBERLA_CHECK( floatIsEqual( fpSrc[9], (src_t) fpDst[9], (src_t) precisionLimit ) );
+   WALBERLA_CHECK( floatIsEqual( (dst_t) fpSrc[9], fpDst[9], (dst_t) precisionLimit ) );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], fpDst[9] );
+
+   // Test std::fill_n
+   auto other_fpDst = std::make_shared< dst_t[] >( 10 );
+   std::fill_n( other_fpDst.get(), 10, (dst_t) 2. );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 2., other_fpDst[9] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 2., other_fpDst[5] );
+
+   // Test std::swap
+   std::swap( fpDst, other_fpDst );
+   fpDst[5] = (dst_t) 9.;
+
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], other_fpDst[9] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[5], other_fpDst[5] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 2., fpDst[9] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 9., fpDst[5] );
+
+} // simple_Float16_test::simple_array_test()
+
+void vector_test()
+{
+   auto fpSrc      = std::vector< src_t >( 10 );
+   auto fpDst_cast = std::vector< dst_t >( 10 );
+   auto fp32       = std::vector< walberla::float32 >( 10 );
+   auto fpDst      = std::vector< dst_t >( 10 );
+
+   fpSrc.assign( 10, 1.5 );
+   fpDst_cast.assign( 10, (dst_t) 1.5 );
+   fp32.assign( 10, 1.5f );
+   std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() );
+   WALBERLA_LOG_WARNING_ON_ROOT(
+       " std::vector.assign is not able to assign "
+       << typeid( src_t ).name() << " values to container of type " << precisionType << ".\n"
+       << " Therefore, the floating point value for assign must be cast beforehand or std::copy must be used, since it uses a static_cast internally." );
+
+   fpSrc[5]      = 2.3;
+   fpDst_cast[5] = (dst_t) 2.3;
+   fp32[5]       = 2.3f;
+   fpDst[5]      = (dst_t) 2.3;
+
+   WALBERLA_CHECK_FLOAT_EQUAL( (walberla::float32) fpSrc[0], fp32[0] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (walberla::float32) fpSrc[9], fp32[9] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (walberla::float32) fpSrc[5], fp32[5] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[0], fpDst_cast[0] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], fpDst_cast[9] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[5], fpDst_cast[5] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[0], fpDst[0] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], fpDst[9] );
+   WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[5], fpDst[5] );
+   WALBERLA_CHECK_EQUAL( typeid( fpDst ), typeid( fpDst_cast ) );
+
+   // Add up all elements of the vector to check whether the result is sufficiently correct.
+   {
+      const auto sumSrc = std::reduce(fpSrc.begin(), fpSrc.end());
+      const auto sumDst = std::reduce(fpDst.begin(), fpDst.end());
+      WALBERLA_CHECK_FLOAT_EQUAL( (dst_t)sumSrc, sumDst );
+   }
+   {
+      fpSrc.assign( 13, 1.3 );
+      std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() );
+      const auto sumSrc = std::reduce(fpSrc.begin(), fpSrc.end());
+      const auto sumDst = std::reduce(fpDst.begin(), fpDst.end());
+      WALBERLA_CHECK_FLOAT_UNEQUAL( (dst_t)sumSrc, sumDst );
+   }
+} // simple_Float16_test::vector_test()
+
+int main( int argc, char** argv )
+{
+   // This check only works since C++23 and is used in many implementations, so it's important, that it works.
+   WALBERLA_CHECK( std::is_arithmetic< dst_t >::value );
+
+   walberla::Environment env( argc, argv );
+   walberla::logging::Logging::instance()->setLogLevel( walberla::logging::Logging::INFO );
+   walberla::MPIManager::instance()->useWorldComm();
+
+   WALBERLA_LOG_INFO_ON_ROOT( " This run is executed with " << precisionType );
+   WALBERLA_LOG_INFO_ON_ROOT( " machine precision limit is " << precisionLimit );
+   const std::string stringLine( 125, '=' );
+   WALBERLA_LOG_INFO_ON_ROOT( stringLine );
+
+   WALBERLA_LOG_INFO_ON_ROOT( " Start a test with shared_pointer<float16[]>." );
+   simple_array_test();
+
+   WALBERLA_LOG_INFO_ON_ROOT( " Start a test with std::vector<float16>." );
+   vector_test();
+
+   WALBERLA_LOG_INFO_ON_ROOT( " Start a where float32 is sufficient but float16 is not." );
+   WALBERLA_CHECK_FLOAT_UNEQUAL( dst_t(1.0)-dst_t(0.3), 1.0-0.3 );
+   WALBERLA_CHECK_FLOAT_EQUAL( 1.0f-0.3f, 1.0-0.3 );
+
+   return 0;
+} // simple_Float16_test::main()
+
+} // namespace walberla::simple_Float16_test
+
+int main( int argc, char** argv )
+{
+   walberla::simple_Float16_test::main( argc, argv );
+
+   return EXIT_SUCCESS;
+} // main()
diff --git a/tests/gpu/communication/CommTest.cpp b/tests/gpu/communication/CommTest.cpp
index 5bc87aa13f9a2c72351b532fb20e3614cdfc8d82..a8a60321d9ee45f131843dbcb9f974f41ace0b01 100644
--- a/tests/gpu/communication/CommTest.cpp
+++ b/tests/gpu/communication/CommTest.cpp
@@ -47,7 +47,7 @@ void hostToHost()
       hostField2.set(hostField1);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void hostToDevice()
@@ -61,7 +61,7 @@ void hostToDevice()
       gpu::fieldCpy(deviceField, hostField);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void deviceToHost()
@@ -76,7 +76,7 @@ void deviceToHost()
       gpu::fieldCpy(hostField, deviceField);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiHostToHost()
@@ -100,7 +100,7 @@ void mpiHostToHost()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiHostToDevice()
@@ -124,7 +124,7 @@ void mpiHostToDevice()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiDeviceToHost()
@@ -148,7 +148,7 @@ void mpiDeviceToHost()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiDeviceToDevice()
@@ -172,7 +172,7 @@ void mpiDeviceToDevice()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiCopyHostToDevice()
@@ -199,7 +199,7 @@ void mpiCopyHostToDevice()
       gpu::fieldCpy(deviceField, hostField2);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiCopyDeviceToHost()
@@ -226,7 +226,7 @@ void mpiCopyDeviceToHost()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 int main(int argc, char** argv)