From aeff901c3cfcbe3ecb0d8a386508a5718a155c00 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Mon, 7 Oct 2019 13:55:40 +0200
Subject: [PATCH] UniformGridGenerated: detailed output

---
 .../UniformGridGenerated/CMakeLists.txt       |   1 +
 .../UniformGridGenerated/UniformGrid.prm      |   1 -
 .../UniformGridGenerated.cpp                  |  36 ++++---
 .../UniformGridGenerated.py                   |  63 +++++++----
 .../benchmarks/UniformGridGenerated/params.py | 100 ++++++++++++++++++
 5 files changed, 164 insertions(+), 37 deletions(-)
 create mode 100644 apps/benchmarks/UniformGridGenerated/params.py

diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
index 5d3f2dca0..b3f9795f7 100644
--- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
@@ -1,4 +1,5 @@
 waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "*.py" )
 
 
 waLBerla_python_file_generates(UniformGridGenerated.py
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
index e53b3cd4f..cf2f26ad8 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
@@ -13,7 +13,6 @@ Parameters
     outerIterations 1;      // how many measurements to conduct
 
 	vtkWriteFrequency 100;           // write a VTK file every n'th step, if zero VTK output is disabled
-	cudaEnabledMPI false;            // switch on if you have a CUDA-enabled MPI implementation
 	timeStepMode aa;                 // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
 	remainingTimeLoggerFrequency 0;  // interval in seconds to log the estimated remaining time
     directComm 1;
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
index d6671b3b2..d5515c0c7 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -1,5 +1,6 @@
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
+#include "core/OpenMP.h"
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 #include "python_coupling/DictWrapper.h"
@@ -11,6 +12,7 @@
 #include "timeloop/all.h"
 #include "core/timing/TimingPool.h"
 #include "core/timing/RemainingTimeLogger.h"
+#include "core/waLBerlaBuildInfo.h"
 #include "domain_decomposition/SharedSweep.h"
 #include "gui/Gui.h"
 #include "InitShearVelocity.h"
@@ -30,6 +32,7 @@
 #include "GenMpiDtypeInfoAAPull.h"
 #include "GenMpiDtypeInfoAAPush.h"
 
+#include <iomanip>
 
 using namespace walberla;
 
@@ -162,24 +165,26 @@ int main( int argc, char **argv )
           {
               timeLoop.setCurrentTimeStepToZero();
               WcTimer simTimer;
-              WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+
+              auto threads = omp_get_max_threads();
+
               simTimer.start();
               timeLoop.run();
-              /*
-              pystencils::GenLbKernelAAEven k1(pdfFieldId, omega);
-              pystencils::GenLbKernelAAOdd k2(pdfFieldId, omega);
-              for(int t=0; t < timesteps / 2; ++t)
-              { for( auto & b : *blocks) {
-                k1(&b);
-                k2(&b);
-              }}*/
               simTimer.end();
-              WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
               auto time = simTimer.last();
               auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
               auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-              WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-              WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+
+              using std::setw;
+              WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
+                                                     "  procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
+                                                     "  threads: " << threads <<
+                                                     "  direct_comm: " << directComm <<
+                                                     "  time steps: " << timesteps <<
+                                                     setw(15) << "  block size: " << cellsPerBlock <<
+                                                     "  mlups/core:  " << int(mlupsPerProcess/ threads) <<
+                                                     "  mlups:  " << int(mlupsPerProcess * MPIManager::instance()->numProcesses()));
+
               WALBERLA_ROOT_SECTION()
               {
                   python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
@@ -188,8 +193,11 @@ int main( int argc, char **argv )
                       pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
                       pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
                       pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                      pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
-                      pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
+                      pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
+                      pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
+                      pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
+                      pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
+
                       // Call Python function to report results
                       pythonCallbackResults();
                   }
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
index 3958d308e..1228bf95f 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -69,25 +69,33 @@ info_header = """
 #include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; 
 const char * infoStencil = "{stencil}";
 const char * infoConfigName = "{configName}";
-const bool infoCseGlobal = {cse_global};
-const bool infoCsePdfs = {cse_pdfs};
+const char * optimizationDict = "{optimizationDict}";
 """
 
-
 with CodeGeneration() as ctx:
     common_options = {
         'field_name': 'pdfs',
         'temporary_field_name': 'pdfs_tmp',
-        'optimization': {'cse_global': False,
-                         'cse_pdfs': False,
-                         'split': False}
+    }
+    opts = {
+        'two_field_cse_pdfs': False,
+        'two_field_cse_global': False,
+        'two_field_split': True,
+        'two_field_nt_stores': True,
+
+        'aa_even_cse_pdfs': False,
+        'aa_even_cse_global': False,
+        'aa_even_split': False,
+        'aa_even_nt_stores': False,
+
+        'aa_odd_cse_pdfs': False,
+        'aa_odd_cse_global': False,
+        'aa_odd_split': True,
+        'aa_odd_nt_stores': False,
     }
     config_name = ctx.config
     noopt = False
     d3q27 = False
-    if config_name.endswith("_noopt"):
-        noopt = True
-        config_name = config_name[:-len("_noopt")]
     if config_name.endswith("_d3q27"):
         d3q27 = True
         config_name = config_name[:-len("_d3q27")]
@@ -104,20 +112,33 @@ with CodeGeneration() as ctx:
     stencil_str = options['stencil']
     q = int(stencil_str[stencil_str.find('Q')+1:])
     pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
-    options['optimization']['symbolic_field'] = pdfs
 
-    update_rule_two_field = create_lb_update_rule(**options)
-    update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(), **options)
-    options['optimization']['split'] = True
-    update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(), **options)
-
-    vec = {'nontemporal': False, 'assume_aligned': True, 'assume_inner_stride_one': True}
+    update_rule_two_field = create_lb_update_rule(optimization={'symbolic_field': pdfs,
+                                                                'split': opts['two_field_split'],
+                                                                'cse_global': opts['two_field_cse_global'],
+                                                                'cse_pdfs': opts['two_field_cse_pdfs']}, **options)
+    update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(),
+                                                optimization={'symbolic_field': pdfs,
+                                                              'split': opts['aa_even_split'],
+                                                              'cse_global': opts['aa_even_cse_global'],
+                                                              'cse_pdfs': opts['aa_even_cse_pdfs']}, **options)
+    update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(),
+                                               optimization={'symbolic_field': pdfs,
+                                                             'split': opts['aa_odd_split'],
+                                                             'cse_global': opts['aa_odd_cse_global'],
+                                                             'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options)
+
+    vec = { 'assume_aligned': True, 'assume_inner_stride_one': True}
 
     # Sweeps
-    generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')])
-    generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info={'assume_aligned': True},
+    vec['nontemporal'] = opts['two_field_nt_stores']
+    generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
+                   cpu_vectorize_info=vec)
+    vec['nontemporal'] = opts['aa_even_nt_stores']
+    generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
-    generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info={'assume_aligned': True},
+    vec['nontemporal'] = opts['aa_odd_nt_stores']
+    generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
 
     setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector,
@@ -144,8 +165,6 @@ with CodeGeneration() as ctx:
         'stencil': stencil_str,
         'q': q,
         'configName': ctx.config,
-        'cse_global': int(options['optimization']['cse_global']),
-        'cse_pdfs': int(options['optimization']['cse_pdfs']),
+        'optimizationDict': str(opts),
     }
     ctx.write_file("GenDefines.h", info_header.format(**infoHeaderParams))
-
diff --git a/apps/benchmarks/UniformGridGenerated/params.py b/apps/benchmarks/UniformGridGenerated/params.py
new file mode 100644
index 000000000..399d29fd0
--- /dev/null
+++ b/apps/benchmarks/UniformGridGenerated/params.py
@@ -0,0 +1,100 @@
+import math
+import os
+import operator
+import waLBerla as wlb
+from waLBerla.tools.sqlitedb import *
+from functools import reduce
+
+
+def prod(seq):
+    return reduce(operator.mul, seq, 1)
+
+
+def get_block_decomposition(block_decomposition, num_processes):
+    bx = by = bz = 1
+    blocks_per_axis = int(math.log(num_processes, 2))
+    for i in range(blocks_per_axis):
+        decomposition_axis = block_decomposition[i % len(block_decomposition)]
+        if decomposition_axis == 'y':
+            by *= 2
+        elif decomposition_axis == 'z':
+            bz *= 2
+        elif decomposition_axis == 'x':
+            bx *= 2
+
+    assert (bx * by * bz) == num_processes
+    return bx, by, bz
+
+
+def calculate_time_steps(runtime, expected_mlups, domain_size):
+    cells = prod(domain_size)
+    time_steps_per_second = expected_mlups * 1e6 / cells
+    return int(time_steps_per_second * runtime)
+
+
+class BenchmarkScenario:
+    def __init__(self, block_size=(256, 128, 128), direct_comm=True, time_step_mode='aa', db_file_name='uniform_grid_gen.sqlite'):
+        self.block_size = block_size
+        self.direct_comm = direct_comm
+        self.time_step_mode = time_step_mode
+        self.threads = int(os.environ['OMP_NUM_THREADS'])
+        self.processes = wlb.mpi.numProcesses()
+        self.db_file_name = db_file_name
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        time_steps_for_128_cubed = 50
+        time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed)
+        time_steps = max(10, time_steps)
+        cfg = {
+            'DomainSetup': {
+                'blocks': (1, 1, self.processes),
+                'cellsPerBlock': (self.block_size[0], self.block_size[1], self.block_size[2] * self.threads),
+                'periodic': (1, 1, 1),
+            },
+            'Parameters': {
+                'timesteps': time_steps,
+                'warmupSteps': 6,
+                'outerIterations': 3,
+                'vtkWriteFrequency': 0,
+                'remainingTimeLoggerFrequency': 0,
+                'omega': 1.6,
+                'timeStepMode': self.time_step_mode,
+                'directComm': self.direct_comm,
+            }
+        }
+        return cfg
+
+    @wlb.member_callback
+    def results_callback(self, mlupsPerProcess, optimizations, **kwargs):
+        cfg = self.config()
+        result = {
+            'block_size': self.block_size,
+            'mlups_per_core': mlupsPerProcess / self.threads,
+            'threads': self.threads,
+            'processes': self.processes,
+            'time_step_mode': self.time_step_mode,
+            'direct_comm': self.direct_comm,
+            'time_steps': cfg['Parameters']['timesteps'],
+            'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''),
+                'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_DOMAIN', ''),
+        }
+
+        optimizations = eval(optimizations)  # NOTE(review): string comes from generated C++ header; prefer ast.literal_eval to avoid arbitrary code execution
+        result.update(optimizations)
+        result.update(kwargs)
+        sequenceValuesToScalars(result)
+        checkAndUpdateSchema(result, "runs", self.db_file_name)
+        storeSingle(result, "runs", self.db_file_name)
+
+
+def benchmark():
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 64, 64), (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]:
+        for direct_comm in (True, False):
+            for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
+                sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode)
+                scenarios.add(sc)
+
+benchmark()
+
-- 
GitLab