UniformGridGPU: Fix in timing for shorter runs

628f2060 · Martin Bauer · 79668534
Commit 628f2060 authored 5 years ago by Martin Bauer
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
 #include "core/Environment.h"
+#include "core/logging/Initialization.h"
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 #include "python_coupling/DictWrapper.h"
@@ -55,8 +56,11 @@ int main( int argc, char **argv )
   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
   {
      auto config = *cfg;
+      logging::configureLogging( config );
      auto blocks = blockforest::createUniformBlockGridFromConfig( config );

+      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
+
      // Reading parameters
      auto parameters = config->getOneBlock( "Parameters" );
      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
@@ -177,28 +181,25 @@ int main( int argc, char **argv )
      timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );


-      auto performanceReportFrequency = parameters.getParameter< uint_t >( "performanceReportFrequency", 500 ); // in timesteps
-      lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, performanceReportFrequency);
-      timeLoop.addFuncAfterTimeStep([&performanceLogger] { performanceLogger(); }, "performance logger" );
-
+      WcTimer simTimer;
+      cudaDeviceSynchronize();
      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
+      simTimer.start();
+      cudaDeviceSynchronize();
      timeLoop.run();
+      simTimer.end();
      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");
-
-      std::map< std::string, int > integerProperties;
-      std::map< std::string, double > realProperties;
-      std::map< std::string, std::string > stringProperties;
-
-      performanceLogger.logOverallResultsOnRoot();
-      performanceLogger.getBestResultsForSQLOnRoot(integerProperties, realProperties, stringProperties);
-
+      auto time = simTimer.last();
+      auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+      auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess);
+      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c( timesteps ) );
      WALBERLA_ROOT_SECTION()
      {
         python_coupling::PythonCallback pythonCallbackResults ( "results_callback" );
         if ( pythonCallbackResults.isCallable() )
         {
-            pythonCallbackResults.data().exposeValue( "mlups_total", realProperties["MLUPS"] );
-            pythonCallbackResults.data().exposeValue( "mlups_process", realProperties["MLUPS_process"] );
+            pythonCallbackResults.data().exposeValue( "mlups_per_process", mlupsPerProcess );

            // Call Python function to report results
            pythonCallbackResults();

--- a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
@@ -26,7 +26,7 @@ communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'Un
 def calculate_time_steps(runtime, expected_mlups, domain_size):
    cells = reduce(operator.mul, domain_size, 1)
    time_steps_per_second = expected_mlups * 1e6 / cells
-    return time_steps_per_second * runtime
+    return int(time_steps_per_second * runtime)


 def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):

--- a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
+#!/usr/bin/env python3
 from os import getcwd
 from waLBerla.tools.jobscripts import createJobscript
 from datetime import timedelta
@@ -6,12 +7,13 @@ from datetime import timedelta
 for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
    with open("job_weak_scaling_{:04d}.sh".format(node_count), 'w') as f:
        js = createJobscript(nodes=node_count,
-                             output_file='out_lbm_bench_%j.txt',
-                             error_file='err_lbm_bench_%j.txt',
+                             output_file='out_lbm_bench_{:04d}_%j.txt'.format(node_count),
+                             error_file='err_lbm_bench_{:04d}_%j.txt'.format(node_count),
                             initial_dir=getcwd(),
                             exe_name='UniformGridBenchmarkGPU',
                             parameter_files=['weak_scaling.py'],
                             wall_time=timedelta(minutes=25),
-                             machine='pizdaint_hybrid'
+                             machine='pizdaint_hybrid',
+                             account='d105',
                             )
        f.write(js)
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
@@ -19,7 +19,7 @@ gpu_memory_gb = 16
 cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)]

 expected_mlups = 200  # to compute how many time steps have to be done
-time_per_scenarios = 3  # benchmark time in seconds
+time_per_scenarios = 5  # benchmark time in seconds

 fully_periodic = [False, True]