diff --git a/.clang-format b/.clang-format
index 5f0ff6558e68a5373308323e7d4205b357e135eb..e5114ffd062d399b881d114f50e94b45988832dc 100644
--- a/.clang-format
+++ b/.clang-format
@@ -61,8 +61,6 @@ IncludeCategories:
     Priority:        3
   - Regex:           '^"core/'
     Priority:        4
-  - Regex:           '^"cuda/'
-    Priority:        5
   - Regex:           '^"domain_decomposition/'
     Priority:        6
   - Regex:           '^"executiontree/'
@@ -75,6 +73,8 @@ IncludeCategories:
     Priority:        10
   - Regex:           '^"geometry/'
     Priority:        11
+  - Regex:           '^"gpu/'
+    Priority:        12
   - Regex:           '^"gui/'
     Priority:        12
   - Regex:           '^"lbm/'
diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h
index 6b406c631072d43fc8d95b8a9c7f25e6b0472be6..51ab22e26ed3b9e38a858ab8040c39325751ff12 100644
--- a/src/core/mpi/MPIWrapper.h
+++ b/src/core/mpi/MPIWrapper.h
@@ -66,6 +66,9 @@ namespace mpistubs {
 #   pragma GCC diagnostic ignored "-Wsign-conversion"
 #endif
 #include <mpi.h>
+#if defined(OPEN_MPI) && OPEN_MPI
+#include <mpi-ext.h>
+#endif
 #if ( defined WALBERLA_CXX_COMPILER_IS_GNU ) || ( defined WALBERLA_CXX_COMPILER_IS_CLANG )
 #   pragma GCC diagnostic pop
 #endif
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index 1d3583a12f83295114c5758769689aac186e8399..093ec4cad2a830a80042073f905bb1c7316bf8ae 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -30,13 +30,13 @@
 
 #include "domain_decomposition/IBlock.h"
 
-#include "stencil/Directions.h"
-
 #include "gpu/ErrorChecking.h"
 #include "gpu/GPUWrapper.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedNonUniformGPUPackInfo.h"
 
+#include "stencil/Directions.h"
+
 #include <memory>
 #include <thread>
 
@@ -138,7 +138,8 @@ NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBl
 {
    WALBERLA_MPI_SECTION()
    {
-#if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT)
+// Compile-time CUDA-awareness check, applied only for Open MPI builds (relies on MPIX_CUDA_AWARE_SUPPORT)
+#if (defined(OPEN_MPI) && OPEN_MPI) && !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT)
       WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
    }
diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h
index 8a8616c1e6cd371a987bd45a86e677b09d289883..93f6dd85e0e3f44293b9943e1bf252ce52c6ad33 100644
--- a/src/gpu/communication/UniformGPUScheme.impl.h
+++ b/src/gpu/communication/UniformGPUScheme.impl.h
@@ -19,6 +19,8 @@
 //
 //======================================================================================================================
 
+#include "core/mpi/MPIWrapper.h"
+
 #include "gpu/ParallelStreams.h"
 
 namespace walberla {
@@ -45,7 +47,8 @@ namespace communication {
    {
       WALBERLA_MPI_SECTION()
       {
-#if !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT)
+// Compile-time CUDA-awareness check, applied only for Open MPI builds (relies on MPIX_CUDA_AWARE_SUPPORT)
+#if (defined(OPEN_MPI) && OPEN_MPI) && !(defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT)
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }