diff --git a/src/cuda/DeviceSelectMPI.cpp b/src/cuda/DeviceSelectMPI.cpp
index d934abe797175a89ee5006a305755ccd2de25270..f8eee582e69528c32caf4f34f0d742c7a1548a28 100644
--- a/src/cuda/DeviceSelectMPI.cpp
+++ b/src/cuda/DeviceSelectMPI.cpp
@@ -29,8 +29,6 @@ namespace cuda {
 
 #if MPI_VERSION == 2 || MPI_VERSION == 1
 
-#ifndef MPI_COMM_TYPE_SHARED
-
 void selectDeviceBasedOnMpiRank() {
    WALBERLA_ABORT("Your MPI implementation is too old - it does not support CUDA device selection based on MPI rank");
 }
@@ -39,43 +37,39 @@
 
 void selectDeviceBasedOnMpiRank()
 {
+#ifdef WALBERLA_BUILD_WITH_MPI
    int deviceCount;
-   WALBERLA_CUDA_CHECK( cudaGetDeviceCount ( &deviceCount ) );
-
+   WALBERLA_CUDA_CHECK( cudaGetDeviceCount( &deviceCount ));
+
    WALBERLA_LOG_INFO_ON_ROOT( "Selecting CUDA device depending on MPI Rank" );
 
    MPI_Info info;
    MPI_Info_create( &info );
    MPI_Comm newCommunicator;
-   MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );
+   MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );
    int processesOnNode;
    int rankOnNode;
    MPI_Comm_size( newCommunicator, &processesOnNode );
    MPI_Comm_rank( newCommunicator, &rankOnNode );
 
-   if( deviceCount == processesOnNode )
+   if ( deviceCount == processesOnNode )
    {
-      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ) );
+      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
    }
    else if ( deviceCount > processesOnNode )
    {
-      WALBERLA_LOG_WARNING("Not using all available GPUs on node. Processes on node "
-                           << processesOnNode << " available GPUs on node " << deviceCount );
-      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ) );
+      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node "
+                            << processesOnNode << " available GPUs on node " << deviceCount );
+      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
    }
    else
    {
-      WALBERLA_LOG_WARNING("Too many processes started per node - should be one per GPU. Number of processes per node "
-                           << processesOnNode << ", available GPUs on node " << deviceCount );
-      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount ) );
+      WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node "
+                            << processesOnNode << ", available GPUs on node " << deviceCount );
+      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount ));
    }
-}
 #endif
-
-
-#else
-
-void selectDeviceBasedOnMpiRank() {}
+}
 
 #endif
 
diff --git a/src/cuda/ExecutionTreeGPU.h b/src/cuda/ExecutionTreeGPU.h
index 9f458f289bf356e3343d177d43aa1395e4b4428f..0b4e565d56d5fd15a4985d0962eea300d99eea4b 100644
--- a/src/cuda/ExecutionTreeGPU.h
+++ b/src/cuda/ExecutionTreeGPU.h
@@ -26,6 +26,19 @@
 
 #include <cuda_runtime.h>
 
+#ifdef CUDART_VERSION
+#if CUDART_VERSION <= 9020
+inline cudaError_t cudaLaunchHostFunc( cudaStream_t, void (CUDART_CB*)( void* userData ), void* ) {
+   static bool printedWarning = false;
+   if( ! printedWarning ) {
+      WALBERLA_LOG_WARNING_ON_ROOT( "Timing of CUDA functions only implemented for CUDA versions >= 10.0" );
+      printedWarning = true;
+   }
+   return cudaSuccess;
+}
+#endif
+#endif
+
 namespace walberla {
 namespace executiontree {
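
Note on the DeviceSelectMPI.cpp change: the patch binds each MPI rank to one GPU by splitting MPI_COMM_WORLD into node-local communicators, so the rank within the shared-memory communicator doubles as the CUDA device index. A minimal standalone sketch of the same technique, assuming MPI >= 3 and the CUDA runtime; the nodeComm name and the plain error handling are illustrative stand-ins for waLBerla's macros, not code from the patch:

   // Bind each MPI rank to a GPU on its node via a node-local communicator.
   #include <mpi.h>
   #include <cuda_runtime.h>
   #include <cstdio>

   int main( int argc, char** argv )
   {
      MPI_Init( &argc, &argv );

      // Ranks sharing a node land in one communicator; the rank inside it
      // is a stable 0..N-1 index per node.
      MPI_Comm nodeComm;
      MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &nodeComm );

      int rankOnNode;
      MPI_Comm_rank( nodeComm, &rankOnNode );

      int deviceCount = 0;
      if( cudaGetDeviceCount( &deviceCount ) == cudaSuccess && deviceCount > 0 )
      {
         // Wrap around when more ranks than GPUs were started, as the patch does.
         cudaSetDevice( rankOnNode % deviceCount );
      }
      else
      {
         std::fprintf( stderr, "no CUDA device visible on this node\n" );
      }

      MPI_Comm_free( &nodeComm );
      MPI_Finalize();
      return 0;
   }

Launched as, e.g., mpirun -np 4 ./app with four GPUs per node, each rank selects its own device; with more ranks than GPUs, devices are shared round-robin, matching the warning branch in the patch.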
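
Note on the ExecutionTreeGPU.h hunk: it backfills cudaLaunchHostFunc as a warn-once no-op for CUDA runtimes older than 10.0, which do not provide the function. On CUDA >= 10, the real cudaLaunchHostFunc enqueues a host callback that runs once all preceding work on the stream has finished, which is what stream timing can rely on. A sketch of that usage pattern, with hypothetical TimingData/recordStop/timeStreamTail names (not from the patch):

   #include <cuda_runtime.h>
   #include <chrono>

   struct TimingData { std::chrono::steady_clock::time_point stop; };

   // Host callback: invoked on a CUDA-internal thread after all prior work
   // in the stream completes; it must not call CUDA API functions itself.
   void CUDART_CB recordStop( void* userData )
   {
      static_cast< TimingData* >( userData )->stop = std::chrono::steady_clock::now();
   }

   void timeStreamTail( cudaStream_t stream, TimingData* timing )
   {
      // ... kernels previously enqueued on 'stream' ...
      cudaLaunchHostFunc( stream, recordStop, timing );
   }

With the pre-10.0 shim from the patch, the same call still compiles but only warns once and records nothing, so older toolkits lose timing output without breaking the build.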