From a1f716d0e757ac8e33abeb7868e98757b33a0b2a Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Tue, 1 Oct 2019 08:25:52 +0200
Subject: [PATCH] New benchmark for generated LBM on uniform grid on CPU

---
 apps/benchmarks/CMakeLists.txt                |   1 +
 .../simulation_setup/communication_compare.py | 196 ++++++++++++++++++
 .../UniformGridGenerated/CMakeLists.txt       |  12 ++
 .../UniformGridGenerated.cpp                  | 157 ++++++++++++++
 .../UniformGridGenerated.py                   | 133 ++++++++++++
 5 files changed, 499 insertions(+)
 create mode 100755 apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py
 create mode 100644 apps/benchmarks/UniformGridGenerated/CMakeLists.txt
 create mode 100644 apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
 create mode 100644 apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py

diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index 8e83319a..a59d3bc1 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -14,3 +14,4 @@ add_subdirectory( ProbeVsExtraMessage )
 add_subdirectory( SchaeferTurek )
 add_subdirectory( UniformGrid )
 add_subdirectory( UniformGridGPU )
+add_subdirectory( UniformGridGenerated )
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py b/apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py
new file mode 100755
index 00000000..4a20b377
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+import os
+import pandas as pd
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from copy import deepcopy
+import sys
+
+CSV_FILE = "new.csv"
+
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'timesteps': 400,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+        'initShearFlow': True,
+    }
+}
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
+        self.config_dict = deepcopy(BASE_CONFIG)
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        return self.config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+
+        df = pd.DataFrame.from_records([data])
+        if not os.path.isfile(CSV_FILE):
+            df.to_csv(CSV_FILE, index=False)
+        else:
+            df.to_csv(CSV_FILE, index=False, mode='a', header=False)
+
+
+def overlap_benchmark():
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
+
+    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:  # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+        # no overlap
+        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1)))
+
+        # overlap
+        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
+            for inner_outer_split in inner_outer_splits:
+                scenario = Scenario(timeStepStrategy=overlap_strategy,
+                                    communicationScheme=comm_strategy,
+                                    innerOuterSplit=inner_outer_split)
+                scenarios.add(scenario)
+
+
+def communication_compare():
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
+        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+            sc = Scenario(cells_per_block=block_size,
+                          gpuBlockSize=(128, 1, 1),
+                          timeStepStrategy='noOverlap',
+                          communicationScheme=comm_strategy)
+            scenarios.add(sc)
+            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
+                if 3 * inner_outer_split[0] < block_size[0]:
+                    continue
+                sc = Scenario(cells_per_block=block_size,
+                              gpuBlockSize=(128, 1, 1),
+                              timeStepStrategy='simpleOverlap',
+                              innerOuterSplit=inner_outer_split,
+                              communicationScheme=comm_strategy)
+                scenarios.add(sc)
+
+
+def single_gpu_benchmark():
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            cells = block_size[0] * block_size[1] * block_size[2]
+            time_steps_for_128_cubed = 1000
+            time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=int(time_steps))
+            scenarios.add(scenario)
+
+
+job_script_header = """
+#!/bin/bash -l
+#SBATCH --job-name=scaling
+#SBATCH --time=0:30:00
+#SBATCH --nodes={nodes}
+#SBATCH -o out_scaling_{nodes}_%j.txt
+#SBATCH -e err_scaling_{nodes}_%j.txt
+#SBATCH --ntasks-per-core=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --account=d105
+
+cd {folder}
+
+source ~/env.sh
+
+module load daint-gpu
+module load craype-accel-nvidia60
+export MPICH_RDMA_ENABLED_CUDA=1  # allow GPU-GPU data transfer
+export CRAY_CUDA_MPS=1            # allow GPU sharing
+export MPICH_G2G_PIPELINE=256     # adapt maximum number of concurrent in-flight messages
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export CRAY_CUDA_MPS=1
+
+export MPICH_RANK_REORDER_METHOD=3
+export PMI_MMAP_SYNC_WAIT_TIME=300
+
+
+# grid_order -R -H -c 1,1,8 -g 16,16,8
+
+ulimit -c 0
+"""
+
+job_script_exe_part = """
+
+export WALBERLA_SCENARIO_IDX=0
+while srun -n {nodes} ./{app} {config}
+do
+ ((WALBERLA_SCENARIO_IDX++))
+done
+"""
+
+
+all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
+                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+                   'UniformGridBenchmarkGPU_cumulant'
+                   'UniformGridBenchmarkGPU_cumulant_d3q27',
+                   )
+
+
+def generate_jobscripts(exe_names=all_executables):
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+        folder_name = "scaling_{:04d}".format(node_count)
+        os.makedirs(folder_name, exist_ok=True)
+
+        # run grid_order
+        import subprocess
+        decomposition = block_decomposition(node_count)
+        decomposition_str = ",".join(str(e) for e in decomposition)
+        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
+
+        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
+        for exe in exe_names:
+            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, config='../communication_compare.py')
+
+        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+            f.write(job_script)
+
+
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
+else:
+    single_gpu_benchmark()
diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
new file mode 100644
index 00000000..def7d93f
--- /dev/null
+++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
@@ -0,0 +1,12 @@
+waLBerla_python_file_generates(UniformGridGenerated.py
+        UniformGridGenerated_LatticeModel.cpp
+        UniformGridGenerated_Defines.h)
+
+
+foreach(config trt )
+    waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config}
+            FILES UniformGridGenerated.cpp UniformGridGenerated.py
+            DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui
+            CODEGEN_CFG ${config})
+endforeach()
+
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
new file mode 100644
index 00000000..f48b0f75
--- /dev/null
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -0,0 +1,157 @@
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Random.h"
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+#include "python_coupling/DictWrapper.h"
+#include "blockforest/Initialization.h"
+#include "field/vtk/VTKWriter.h"
+#include "field/communication/PackInfo.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "timeloop/all.h"
+#include "core/timing/TimingPool.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "domain_decomposition/SharedSweep.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/vtk/VTKOutput.h"
+#include "lbm/gui/Connection.h"
+#include "lbm/vtk/Velocity.h"
+#include "gui/Gui.h"
+
+#include "UniformGridGenerated_LatticeModel.h"
+#include "UniformGridGenerated_Defines.h"
+
+
+using namespace walberla;
+
+typedef lbm::UniformGridGenerated_LatticeModel LatticeModel_T;
+typedef LatticeModel_T::Stencil                Stencil_T;
+typedef LatticeModel_T::CommunicationStencil   CommunicationStencil_T;
+typedef lbm::PdfField< LatticeModel_T >        PdfField_T;
+
+
+void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldId,
+                       const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
+{
+    math::seedRandomGenerator(0);
+    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
+    for( auto & block: *blocks)
+    {
+        auto pdfField = block.getData<PdfField_T>( pdfFieldId );
+        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField,
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
+
+            if( globalCell[2] >= halfZ ) {
+                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0));
+            } else {
+                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0, randomReal), real_t(1.0));
+            }
+        );
+    }
+}
+
+
+int main( int argc, char **argv )
+{
+   mpi::Environment env( argc, argv );
+
+   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      WALBERLA_MPI_WORLD_BARRIER();
+
+      auto config = *cfg;
+      logging::configureLogging( config );
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
+      // Reading parameters
+      auto parameters = config->getOneBlock( "Parameters" );
+      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
+      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
+      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
+
+      // Creating fields
+      LatticeModel_T latticeModel = LatticeModel_T( omega );
+      BlockDataID pdfFieldId = lbm::addPdfFieldToStorage( blocks, "pdf field", latticeModel);
+
+      if( initShearFlow ) {
+          initShearVelocity(blocks, pdfFieldId);
+      }
+
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
+      blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > communication( blocks );
+      communication.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldId ) );
+
+      timeLoop.add() << BeforeFunction( communication, "communication" )
+                     << Sweep( LatticeModel_T::Sweep( pdfFieldId ), "LB stream & collide" );
+
+      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
+      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
+      for(int i=0; i < warmupSteps; ++i )
+         timeLoop.singleStep();
+
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+      if (remainingTimeLoggerFrequency > 0) {
+          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
+          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+      }
+
+      // VTK
+      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
+      if( vtkWriteFrequency > 0 )
+      {
+          auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                           "simulation_step", false, true, true, false, 0 );
+          auto velWriter = make_shared< lbm::VelocityVTKWriter<LatticeModel_T> >(pdfFieldId, "vel");
+          vtkOutput->addCellDataWriter(velWriter);
+          timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+      }
+
+
+      bool useGui = parameters.getParameter<bool>( "useGui", false );
+      if( useGui )
+      {
+          GUI gui( timeLoop, blocks, argc, argv);
+          lbm::connectToGui<LatticeModel_T>(gui);
+          gui.run();
+      }
+      else
+      {
+          for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+          {
+              timeLoop.setCurrentTimeStepToZero();
+              WcTimer simTimer;
+              WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+              simTimer.start();
+              timeLoop.run();
+              simTimer.end();
+              WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+              auto time = simTimer.last();
+              auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+              auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+              WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+              WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+              WALBERLA_ROOT_SECTION()
+              {
+                  python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+                  if ( pythonCallbackResults.isCallable())
+                  {
+                      pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                      pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
+                      pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
+                      pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
+                      pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
+                      // Call Python function to report results
+                      pythonCallbackResults();
+                  }
+              }
+          }
+      }
+   }
+
+   return 0;
+}
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
new file mode 100644
index 00000000..8f7bf791
--- /dev/null
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -0,0 +1,133 @@
+import sympy as sp
+import pystencils as ps
+from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
+from lbmpy_walberla import generate_lattice_model
+from pystencils_walberla import CodeGeneration
+
+omega = sp.symbols("omega")
+omega_fill = sp.symbols("omega_:10")
+
+options_dict = {
+    'srt': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+        'compressible': False,
+    },
+    'trt': {
+        'method': 'trt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+    },
+    'mrt': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
+    },
+    'mrt_full': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], omega_fill[3], omega_fill[4], omega_fill[5]],
+    },
+    'mrt3': {
+        'method': 'mrt3',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega, 1.1, 1.2],
+    },
+    'entropic': {
+        'method': 'mrt3',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
+        'entropic': True,
+    },
+    'entropic_kbc_n4': {
+        'method': 'trt-kbc-n4',
+        'stencil': 'D3Q27',
+        'compressible': True,
+        'relaxation_rates': [omega, sp.Symbol("omega_free")],
+        'entropic': True,
+    },
+    'smagorinsky': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'smagorinsky': True,
+        'relaxation_rate': omega,
+    },
+    'cumulant': {
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'method': 'mrt',
+        'cumulant': True,
+        'relaxation_rates': [0, omega, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+    },
+}
+
+info_header = """
+#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; 
+const char * infoStencil = "{stencil}";
+const char * infoConfigName = "{configName}";
+const bool infoCseGlobal = {cse_global};
+const bool infoCsePdfs = {cse_pdfs};
+"""
+
+
+with CodeGeneration() as ctx:
+    accessor = StreamPullTwoFieldsAccessor()
+    assert not accessor.is_inplace, "This app does not work for inplace accessors"
+
+    common_options = {
+        'field_name': 'pdfs',
+        'temporary_field_name': 'pdfs_tmp',
+        'kernel_type': accessor,
+        'optimization': {'cse_global': True,
+                         'cse_pdfs': False}
+    }
+    config_name = ctx.config
+    noopt = False
+    d3q27 = False
+    if config_name.endswith("_noopt"):
+        noopt = True
+        config_name = config_name[:-len("_noopt")]
+    if config_name.endswith("_d3q27"):
+        d3q27 = True
+        config_name = config_name[:-len("_d3q27")]
+
+    options = options_dict[config_name]
+    options.update(common_options)
+    options = options.copy()
+
+    if noopt:
+        options['optimization']['cse_global'] = False
+        options['optimization']['cse_pdfs'] = False
+    if d3q27:
+        options['stencil'] = 'D3Q27'
+
+    stencil_str = options['stencil']
+    q = int(stencil_str[stencil_str.find('Q')+1:])
+    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
+    options['optimization']['symbolic_field'] = pdfs
+
+    vp = [
+        ('double', 'omega_0'),
+        ('double', 'omega_1'),
+        ('double', 'omega_2'),
+        ('double', 'omega_3'),
+        ('double', 'omega_4'),
+        ('double', 'omega_5'),
+        ('double', 'omega_6'),
+        ('int32_t', 'cudaBlockSize0'),
+        ('int32_t', 'cudaBlockSize1'),
+    ]
+    update_rule = create_lb_collision_rule(**options)
+    generate_lattice_model(ctx, 'UniformGridGenerated_LatticeModel', update_rule)
+
+    infoHeaderParams = {
+        'stencil': stencil_str,
+        'q': q,
+        'configName': ctx.config,
+        'cse_global': int(options['optimization']['cse_global']),
+        'cse_pdfs': int(options['optimization']['cse_pdfs']),
+    }
+    ctx.write_file("UniformGridGenerated_Defines.h", info_header.format(**infoHeaderParams))
-- 
GitLab