From 2049c5bff82134f565ad2a5c28f62bef0c710fb6 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Thu, 7 Mar 2019 16:09:28 +0100
Subject: [PATCH] CUDA: fallbacks for older CUDA versions
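
Older MPI and CUDA toolchains could not build these files. Two
fallbacks are added:

- DeviceSelectMPI.cpp: MPI_Comm_split_type / MPI_COMM_TYPE_SHARED
  require MPI 3, so with MPI 1 or 2 selectDeviceBasedOnMpiRank()
  aborts with a clear message instead of failing to compile; the
  real implementation is additionally compiled only when building
  with WALBERLA_BUILD_WITH_MPI.
- ExecutionTreeGPU.h: cudaLaunchHostFunc was introduced with CUDA
  10.0. For older runtimes a warn-once no-op stub is provided so the
  code still compiles; timing of CUDA functions is then unavailable.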

---
 src/cuda/DeviceSelectMPI.cpp | 34 +++++++++++++++-------------------
 src/cuda/ExecutionTreeGPU.h  | 14 ++++++++++++++
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/cuda/DeviceSelectMPI.cpp b/src/cuda/DeviceSelectMPI.cpp
index d934abe79..f8eee582e 100644
--- a/src/cuda/DeviceSelectMPI.cpp
+++ b/src/cuda/DeviceSelectMPI.cpp
@@ -29,8 +29,6 @@ namespace cuda {
 
 #if MPI_VERSION == 2 || MPI_VERSION == 1
 
-#ifndef MPI_COMM_TYPE_SHARED
-
 void selectDeviceBasedOnMpiRank() {
    WALBERLA_ABORT("Your MPI implementation is tool old - it does not support CUDA device selection based on MPI rank");
 }
@@ -39,43 +37,41 @@ void selectDeviceBasedOnMpiRank() {
 
 void selectDeviceBasedOnMpiRank()
 {
+#ifdef WALBERLA_BUILD_WITH_MPI
    int deviceCount;
-   WALBERLA_CUDA_CHECK( cudaGetDeviceCount ( &deviceCount ) );
-
+   WALBERLA_CUDA_CHECK( cudaGetDeviceCount( &deviceCount ));
+   WALBERLA_LOG_INFO_ON_ROOT( "Selecting CUDA device based on MPI rank" );
 
    MPI_Info info;
    MPI_Info_create( &info );
    MPI_Comm newCommunicator;
-   MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );
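+   // Split MPI_COMM_WORLD into one communicator per shared-memory node to obtain the node-local rank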
+   MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );
 
    int processesOnNode;
    int rankOnNode;
    MPI_Comm_size( newCommunicator, &processesOnNode );
    MPI_Comm_rank( newCommunicator, &rankOnNode );
 
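+   // Assign one GPU per node-local rank; if processes outnumber GPUs, wrap around with modulo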
-   if( deviceCount == processesOnNode )
+   if ( deviceCount == processesOnNode )
    {
-      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ) );
+      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
    }
    else if ( deviceCount > processesOnNode )
    {
-      WALBERLA_LOG_WARNING("Not using all available GPUs on node. Processes on node "
-                                   << processesOnNode << " available GPUs on node " << deviceCount );
-      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ) );
+      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node "
+                               << processesOnNode << " available GPUs on node " << deviceCount );
+      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
    }
    else
    {
-      WALBERLA_LOG_WARNING("Too many processes started per node - should be one per GPU. Number of processes per node "
-                                   << processesOnNode << ", available GPUs on node " << deviceCount );
-      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount ) );
+      WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node "
+                               << processesOnNode << ", available GPUs on node " << deviceCount );
+      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount ));
    }
-}
 #endif
-
-
-#else
-
-void selectDeviceBasedOnMpiRank() {}
+}
 
 #endif
 
diff --git a/src/cuda/ExecutionTreeGPU.h b/src/cuda/ExecutionTreeGPU.h
index 9f458f289..0b4e565d5 100644
--- a/src/cuda/ExecutionTreeGPU.h
+++ b/src/cuda/ExecutionTreeGPU.h
@@ -26,6 +26,20 @@
 
 #include <cuda_runtime.h>
 
+#ifdef CUDART_VERSION
+#if CUDART_VERSION <= 9020
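+// CUDA < 10.0 lacks cudaLaunchHostFunc; this warn-once stub ignores the callback so the timing code still compiles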
+inline cudaError_t cudaLaunchHostFunc( cudaStream_t, void (CUDART_CB*)( void* userData ), void* ) {
+   static bool printedWarning = false;
+   if( !printedWarning ) {
+      WALBERLA_LOG_WARNING_ON_ROOT( "Timing of CUDA functions only implemented for CUDA versions >= 10.0" );
+      printedWarning = true;
+   }
+   return cudaSuccess;
+}
+#endif
+#endif
+
 namespace walberla {
 namespace executiontree {
 
-- 
GitLab