Commit 58b3e682 authored by Martin Bauer

Extensions in UniformGridGPU benchmark

- support for different stencils
- MPI datatype communication: full fields vs. required PDFs only
- adapted config file
parent f040919f
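A minimal sketch of how the new scheme can be selected from a benchmark scenario script, using the Scenario helper defined in overlap_benchmark.py below (parameter values are illustrative, not taken from this commit):

import waLBerla as wlb

# Sketch (illustrative values): both MPI-datatype-based schemes require a
# CUDA-enabled MPI implementation, see the abort check further down.
scenarios = wlb.ScenarioManager()
scenario = Scenario(communicationScheme='MPIDatatypesFull',
                    cudaEnabledMPI=True)
scenarios.add(scenario)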
@@ -29,10 +29,12 @@ waLBerla_python_file_generates(UniformGridGPU_AA.py
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
UniformGridGPU_AA_Defines.h
)
set(config "srt")
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
foreach(config srt trt mrt smagorinsky entropic )
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
endforeach()
@@ -6,7 +6,7 @@ namespace walberla {
inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
const real_t xMagnitude=0.005, const real_t fluctuationMagnitude=0.05 )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
@@ -151,6 +151,8 @@ int main( int argc, char **argv )
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
DomainSetup
{
blocks < 1, 1, 1 >;
cellsPerBlock < 512, 256, 256 >;
cellsPerBlock < 128, 128, 128 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 2000; //10000; // time steps of one performance measurement
warmupSteps 0; // number of steps to run before measurement starts
outerIterations 1; // how many measurements to conduct
timesteps 1000; // time steps of one performance measurement
warmupSteps 100; // number of steps to run before measurement starts
outerIterations 3; // how many measurements to conduct
// Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy, MPIDatatypes, MPIDatatypesFull
communicationScheme UniformGPUScheme_Baseline;
vtkWriteFrequency 0; //100; // write a VTK file every n'th step, if zero VTK output is disabled
vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation
timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
timeStepStrategy kernelOnly; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
innerOuterSplit < 8, 1, 1>; // slice-thickness that 'outer'-kernels process when overlapping
remainingTimeLoggerFrequency 5; // interval in seconds to log the estimated remaining time
gpuBlockSize < 128, 1, 1>;
omega 1.92;
initShearFlow 1;
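The overlap-related parameters above can likewise be set per scenario from Python; a sketch using the Scenario helper from overlap_benchmark.py (values illustrative):

# Sketch (illustrative values): with 'simpleOverlap', interior cells are
# updated while ghost-layer communication is in flight; innerOuterSplit is
# the slice thickness handled by the 'outer' kernels.
scenario = Scenario(timeStepStrategy='simpleOverlap',
                    innerOuterSplit=(8, 1, 1),
                    communicationScheme='UniformGPUScheme_Baseline')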
@@ -12,10 +12,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
compile_time_block_size = False
if compile_time_block_size:
sweep_block_size = (128, 1, 1)
else:
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
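With compile_time_block_size left False, the block size stays a TypedSymbol and is chosen at kernel launch, so one compiled binary can sweep CUDA launch configurations. A sketch, assuming the config's gpuBlockSize parameter maps onto the cudaBlockSize0/1 symbols (block sizes illustrative):

# Sketch: sweeping launch configurations at runtime with a single binary.
for cuda_block_size in [(32, 1, 1), (128, 1, 1), (64, 2, 1)]:
    scenarios.add(Scenario(gpuBlockSize=cuda_block_size,
                           timeStepStrategy='kernelOnly'))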
@@ -21,9 +21,13 @@
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "stencil/D3Q19.h"
#include "stencil/D3Q27.h"
#include "InitShearVelocity.h"
#include "gui/Gui.h"
#ifdef WALBERLA_ENABLE_GUI
#include "lbm/gui/PdfFieldDisplayAdaptor.h"
#endif
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
@@ -31,11 +35,12 @@
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
#include "UniformGridGPU_AA_Defines.h"
#include <cmath>
using namespace walberla;
using Stencil_T = stencil::D3Q19; //TODO make generic - and determine from python script
using CommunicationStencil_T = Stencil_T;
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
using VelocityField_T = GhostLayerField< real_t, 3 >;
@@ -50,6 +55,8 @@ int main( int argc, char **argv )
{
WALBERLA_MPI_WORLD_BARRIER();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
@@ -61,7 +68,7 @@ int main( int argc, char **argv )
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx );
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
@@ -215,32 +222,52 @@ int main( int argc, char **argv )
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
GUI gui( timeLoop, blocks, argc, argv);
gui.registerDisplayAdaptorCreator(
[&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
if ( block.isDataOfType< PdfField_T >( blockDataID) )
return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
return nullptr;
});
gui.run();
}
else
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
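The MLUPS figure logged above is cells * time steps / seconds * 1e-6 (million lattice-cell updates per second per process); a worked example with illustrative numbers:

# Worked example (illustrative numbers): a 128^3 block advanced 1000 steps
# in 2.0 s of wall-clock time.
nr_of_cells = 128 ** 3
mlups_per_process = nr_of_cells * 1000 / 2.0 * 1e-6  # ~1048.6 MLUPS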
@@ -10,10 +10,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
compile_time_block_size = False
if compile_time_block_size:
sweep_block_size = (128, 1, 1)
else:
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
@@ -58,7 +62,9 @@ with CodeGeneration() as ctx:
common_options = {
'field_name': 'pdfs',
'optimization': {'cse_global': True,
'cse_pdfs': False}
'cse_pdfs': False,
'field_layout': 'fzyx',
}
}
options = options_dict.get(ctx.config, options_dict['srt'])
options.update(common_options)
@@ -98,3 +104,6 @@ with CodeGeneration() as ctx:
# communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu')
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu')
ctx.write_file("UniformGridGPU_AA_Defines.h",
'#include "stencil/D3Q{0}.h"\nusing Stencil_T = walberla::stencil::D3Q{0}; \n '.format(q))
@@ -6,6 +6,7 @@
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "field/communication/StencilRestrictedMPIDatatypeInfo.h"
#include "field/communication/UniformMPIDatatypeInfo.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/communication/MemcpyPackInfo.h"
@@ -20,7 +21,8 @@ enum CommunicationSchemeType {
GPUPackInfo_Streams = 1,
UniformGPUScheme_Baseline = 2,
UniformGPUScheme_Memcpy = 3,
MPIDatatypes = 4
MPIDatatypes = 4,
MPIDatatypesFull = 5
};
@@ -36,6 +38,7 @@ public:
auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId );
auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId );
switch(_commSchemeType)
{
@@ -63,6 +66,12 @@ public:
}
_directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfo );
break;
case MPIDatatypesFull:
if( ! cudaEnabledMPI ) {
WALBERLA_ABORT("MPI datatype-based communication is only possible with a CUDA-enabled MPI implementation.");
}
_directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfoFull );
break;
default:
WALBERLA_ABORT("Invalid GPU communication scheme specified!");
}
@@ -103,6 +112,7 @@ public:
_gpuCommunicationScheme->startCommunication( communicationStream );
break;
case MPIDatatypes:
case MPIDatatypesFull:
WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
_directScheme->startCommunication();
break;
@@ -131,6 +141,7 @@ public:
_gpuCommunicationScheme->wait( communicationStream );
break;
case MPIDatatypes:
case MPIDatatypesFull:
WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
_directScheme->wait();
break;
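The two datatype-based schemes differ only in payload: StencilRestrictedMPIDatatypeInfo transfers just the PDFs that cross each block boundary, while UniformMPIDatatypeInfo transfers all Q values per ghost cell. A rough sketch under stated assumptions (D3Q19, 8-byte reals, a 128x128 face, one ghost layer; in D3Q19 five directions cross a face):

# Rough payload estimate per face neighbor (assumptions as stated above).
face_cells = 128 * 128
full_bytes = face_cells * 19 * 8      # MPIDatatypesFull: all 19 PDFs
required_bytes = face_cells * 5 * 8   # MPIDatatypes: the 5 face-crossing PDFs
print(full_bytes / required_bytes)    # -> 3.8x more data in the full variant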
@@ -9,6 +9,7 @@ from os import getcwd
from waLBerla.tools.jobscripts import createJobscript
from datetime import timedelta
from copy import deepcopy
import sys
CSV_FILE = "overlap_benchmark.csv"
@@ -22,16 +23,18 @@ BASE_CONFIG = {
'timesteps': 400,
'cudaEnabledMPI': False,
'warmupSteps': 5,
'outerIterations': 1,
'outerIterations': 3,
'initShearFlow': True,
}
}
class Scenario:
def __init__(self, **kwargs):
def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
self.config_dict = deepcopy(BASE_CONFIG)
self.config_dict['Parameters'].update(kwargs)
self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
@wlb.member_callback
def config(self, **kwargs):
@@ -45,6 +48,10 @@ class Scenario:
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
@@ -73,15 +80,50 @@ def overlap_benchmark():
scenarios.add(scenario)
def generate_jobscripts(machine='pizdaint_hybrid'):
def single_gpu_benchmark():
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
(32, 8, 1), (64, 8, 1),
(32, 16, 1)]
for block_size in block_sizes:
for cuda_block_size in cuda_blocks:
cells = block_size[0] * block_size[1] * block_size[2]
time_steps_for_128_cubed = 1000
time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
scenario = Scenario(cells_per_block=block_size,
gpuBlockSize=cuda_block_size,
timeStepStrategy='kernelOnly',
timesteps=int(time_steps))
scenarios.add(scenario)
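The time-step scaling above keeps the total number of cell updates roughly constant across block sizes; a worked example:

# Worked example: a 256^3 block has 8x the cells of 128^3, so it runs
# (128**3 / 256**3) * 1000 = 125 time steps.
cells = 256 ** 3
time_steps = int((128 ** 3 / cells) * 1000)  # -> 125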
all_executables = ('UniformGridBenchmarkGPU_AA_entropic',
'UniformGridBenchmarkGPU_AA_mrt',
'UniformGridBenchmarkGPU_AA_smagorinsky',
'UniformGridBenchmarkGPU_AA_srt',
'UniformGridBenchmarkGPU_AA_trt',
'UniformGridBenchmarkGPU_entropic',
'UniformGridBenchmarkGPU_mrt',
'UniformGridBenchmarkGPU_smagorinsky',
'UniformGridBenchmarkGPU_srt',
'UniformGridBenchmarkGPU_trt')
def generate_jobscripts(machine='pizdaint_hybrid',
exe_names=all_executables):
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
js = createJobscript(nodes=node_count,
output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
initial_dir=getcwd(),
exe_name='UniformGridBenchmarkGPU',
parameter_files=['overlap_benchmark.py'],
#exe_name='UniformGridBenchmarkGPU',
#parameter_files=['overlap_benchmark.py'],
commands=list((exe, 'overlap_benchmark.py') for exe in exe_names),
wall_time=timedelta(minutes=25),
machine=machine,
account='d105',
@@ -93,4 +135,4 @@ if __name__ == '__main__':
print("Called without waLBerla - generating job scripts for PizDaint")
generate_jobscripts()
else:
overlap_benchmark()
single_gpu_benchmark()