Skip to content
Snippets Groups Projects
Commit 6d6abeec authored by Martin Bauer's avatar Martin Bauer
Browse files

UniformGridGPU: select comm type by string instead of number

parent 755972f9
No related merge requests found
...@@ -89,15 +89,27 @@ int main( int argc, char **argv ) ...@@ -89,15 +89,27 @@ int main( int argc, char **argv )
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
//lbm::GeneratedFixedDensity pressure(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
//pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
// Communication setup // Communication setup
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline );
const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
CommunicationSchemeType communicationScheme;
if( communicationSchemeStr == "GPUPackInfo_Baseline")
communicationScheme = GPUPackInfo_Baseline;
else if (communicationSchemeStr == "GPUPackInfo_Streams")
communicationScheme = GPUPackInfo_Streams;
else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
communicationScheme = UniformGPUScheme_Baseline;
else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
communicationScheme = UniformGPUScheme_Memcpy;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
......
...@@ -7,18 +7,21 @@ DomainSetup ...@@ -7,18 +7,21 @@ DomainSetup
Parameters Parameters
{ {
omega 1.8;
timesteps 10000;
warmupSteps 0;
outerIterations 1;
remainingTimeLoggerFrequency 30; timesteps 10000; // time steps of one performance measurement
vtkWriteFrequency 500; warmupSteps 0; // number of steps to run before measurement starts
outerIterations 1; // how many measurements to conduct
cudaEnabledMPI false; // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
communicationScheme UniformGPUScheme_Baseline;
timeStepStrategy noOverlap; vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
innerOuterSplit < 64, 1, 1>; cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation
timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
innerOuterSplit < 32, 1, 1>; // slice-thickness that 'outer'-kernels process when overlapping
remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time
} }
/* /*
......
...@@ -60,15 +60,21 @@ def overlap_benchmark(): ...@@ -60,15 +60,21 @@ def overlap_benchmark():
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
scenarios.add(Scenario(timeStepStrategy='noOverlap')) for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
for strategy in ['simpleOverlap', 'complexOverlap']: # no overlap
for inner_outer_split in inner_outer_splits: scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1)))
scenario = Scenario(timeStepStrategy=strategy, innerOuterSplit=inner_outer_split)
scenarios.add(scenario)
# overlap
for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
for inner_outer_split in inner_outer_splits:
scenario = Scenario(timeStepStrategy=overlap_strategy,
communicationScheme=comm_strategy,
innerOuterSplit=inner_outer_split)
scenarios.add(scenario)
if __name__ == '__main__':
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]: def generate_jobscripts(machine='pizdaint_hybrid'):
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f: with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
js = createJobscript(nodes=node_count, js = createJobscript(nodes=node_count,
output_file='overlap_bench_{:04d}_%j.txt'.format(node_count), output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
...@@ -77,9 +83,14 @@ if __name__ == '__main__': ...@@ -77,9 +83,14 @@ if __name__ == '__main__':
exe_name='UniformGridBenchmarkGPU', exe_name='UniformGridBenchmarkGPU',
parameter_files=['overlap_benchmark.py'], parameter_files=['overlap_benchmark.py'],
wall_time=timedelta(minutes=25), wall_time=timedelta(minutes=25),
machine='pizdaint_hybrid', machine=machine,
account='d105', account='d105',
) )
f.write(js) f.write(js)
if __name__ == '__main__':
print("Called without waLBerla - generating job scripts for PizDaint")
generate_jobscripts()
else: else:
overlap_benchmark() overlap_benchmark()
...@@ -58,8 +58,8 @@ void selectDeviceBasedOnMpiRank() ...@@ -58,8 +58,8 @@ void selectDeviceBasedOnMpiRank()
} }
else if ( deviceCount > processesOnNode ) else if ( deviceCount > processesOnNode )
{ {
WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node " WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: "
<< processesOnNode << " available GPUs on node " << deviceCount ); << processesOnNode << ", available GPUs on node: " << deviceCount );
WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode )); WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
} }
else else
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment