diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index e1b183d8532a11643345d7cecd3df4cea784d64c..32d06a834473ff86e05f75a4d0c4cccec3a07e67 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -89,15 +89,27 @@ int main( int argc, char **argv ) lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); - //lbm::GeneratedFixedDensity pressure(blocks, pdfFieldGpuID); ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); - //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID ); // Communication setup bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); - int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline ); + + const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline"); + CommunicationSchemeType communicationScheme; + if( communicationSchemeStr == "GPUPackInfo_Baseline") + communicationScheme = GPUPackInfo_Baseline; + else if (communicationSchemeStr == "GPUPackInfo_Streams") + communicationScheme = GPUPackInfo_Streams; + else if (communicationSchemeStr == "UniformGPUScheme_Baseline") + communicationScheme = UniformGPUScheme_Baseline; + else if (communicationSchemeStr == "UniformGPUScheme_Memcpy") + communicationScheme = UniformGPUScheme_Memcpy; + else { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme") + } + Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm index c6c1e18b2e4ce242241912d2cfb46291a3cfab12..81a0996f872ab2f409a0bef1a9619de8a2095e6b 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -7,18 +7,21 @@ DomainSetup Parameters { - omega 1.8; - timesteps 10000; - warmupSteps 0; - outerIterations 1; - remainingTimeLoggerFrequency 30; - vtkWriteFrequency 500; + timesteps 10000; // time steps of one performance measurement + warmupSteps 0; // number of steps to run before measurement starts + outerIterations 1; // how many measurements to conduct - cudaEnabledMPI false; + // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy + communicationScheme UniformGPUScheme_Baseline; - timeStepStrategy noOverlap; - innerOuterSplit < 64, 1, 1>; + vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled + cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation + + timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly + innerOuterSplit < 32, 1, 1>; // slice-thickness that 'outer'-kernels process when overlapping + + remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time } /* diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py old mode 100644 new mode 100755 index 123b953817391e225275209ba00641d8f6d9ffd9..420be74fed17999af756a54aa7330bd9217580cf --- a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py @@ -60,15 +60,21 @@ def overlap_benchmark(): (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] - scenarios.add(Scenario(timeStepStrategy='noOverlap')) - for strategy in ['simpleOverlap', 'complexOverlap']: - for inner_outer_split in inner_outer_splits: - scenario = Scenario(timeStepStrategy=strategy, innerOuterSplit=inner_outer_split) - scenarios.add(scenario) + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams' + # no overlap + scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1))) + # overlap + for overlap_strategy in ['simpleOverlap', 'complexOverlap']: + for inner_outer_split in inner_outer_splits: + scenario = Scenario(timeStepStrategy=overlap_strategy, + communicationScheme=comm_strategy, + innerOuterSplit=inner_outer_split) + scenarios.add(scenario) -if __name__ == '__main__': - for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]: + +def generate_jobscripts(machine='pizdaint_hybrid'): + for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f: js = createJobscript(nodes=node_count, output_file='overlap_bench_{:04d}_%j.txt'.format(node_count), @@ -77,9 +83,14 @@ if __name__ == '__main__': exe_name='UniformGridBenchmarkGPU', parameter_files=['overlap_benchmark.py'], wall_time=timedelta(minutes=25), - machine='pizdaint_hybrid', + machine=machine, account='d105', ) f.write(js) + + +if __name__ == '__main__': + print("Called without waLBerla - generating job scripts for PizDaint") + generate_jobscripts() else: overlap_benchmark() diff --git a/src/cuda/DeviceSelectMPI.cpp b/src/cuda/DeviceSelectMPI.cpp index f8eee582e69528c32caf4f34f0d742c7a1548a28..3ba255d9f6fd926721477158243022c0012611ed 100644 --- a/src/cuda/DeviceSelectMPI.cpp +++ b/src/cuda/DeviceSelectMPI.cpp @@ -58,8 +58,8 @@ void selectDeviceBasedOnMpiRank() } else if ( deviceCount > processesOnNode ) { - WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node " - << processesOnNode << " available GPUs on node " << deviceCount ); + WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: " + << processesOnNode << ", available GPUs on node: " << deviceCount ); WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode )); } else