UniformGridGPU: add more configurations

e33c19be · Martin Bauer · 2ad061e8 · e33c19be · e33c19be · e33c19be
Commit e33c19be authored 5 years ago by Martin Bauer
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -13,7 +13,7 @@ waLBerla_python_file_generates(UniformGridGPU.py
        UniformGridGPU_Defines.h
        )

-foreach(config srt trt mrt smagorinsky entropic )
+foreach(config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt)
    waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config}
                              FILES UniformGridGPU.cpp UniformGridGPU.py
                              DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -95,26 +95,32 @@ int main( int argc, char **argv )
      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
      // Reading parameters
      auto parameters = config->getOneBlock( "Parameters" );
+      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);

      // Creating fields
-      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(99.8), field::fzyx);
+      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);

-      if( initShearFlow ) {
-          WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow");
-          initShearVelocity( blocks, velFieldCpuID );
-      }
-      pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
-      for( auto & block : *blocks )
-          setterSweep( &block );
-      // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
-      blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
-      initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
-      initialComm();
+      if( timeStepStrategy != "kernelOnlyNoInit")
+      {
+          if ( initShearFlow )
+          {
+              WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
+              initShearVelocity( blocks, velFieldCpuID );
+          }
+
+          pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
+          for( auto & block : *blocks )
+              setterSweep( &block );

+          // The setter sweep only initializes the interior of the domain; for push schemes to work, a first communication is required here.
+          blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
+          initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
+          initialComm();
+      }

      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
@@ -165,7 +171,9 @@ int main( int argc, char **argv )
      int streamLowPriority = 0;
      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
      WALBERLA_CHECK(gpuBlockSize[2] == 1);
-      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1],
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
+                                                    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
+                                                    gpuBlockSize[0], gpuBlockSize[1],
                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
      lbKernel.setOuterPriority( streamHighPriority );
      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
@@ -252,7 +260,6 @@ int main( int argc, char **argv )

      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );

-      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
      std::function<void()> timeStep;
      if (timeStepStrategy == "noOverlap")
          timeStep = std::function<void()>( normalTimeStep );
@@ -260,7 +267,7 @@ int main( int argc, char **argv )
          timeStep = std::function<void()>( overlapTimeStep );
      else if (timeStepStrategy == "simpleOverlap")
          timeStep = simpleOverlapTimeStep;
-      else if (timeStepStrategy == "kernelOnly") {
+      else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
          WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
          timeStep = kernelOnlyFunc;
      }

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -12,6 +12,7 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter

 omega = sp.symbols("omega")
+omega_fill = sp.symbols("omega_:10")
 compile_time_block_size = False

 if compile_time_block_size:
@@ -40,6 +41,11 @@ options_dict = {
        'stencil': 'D3Q19',
        'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
    },
+    'mrt_full': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], omega_fill[3], omega_fill[4], omega_fill[5]],
+    },
    'entropic': {
        'method': 'mrt3',
        'stencil': 'D3Q19',
@@ -47,6 +53,13 @@ options_dict = {
        'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
        'entropic': True,
    },
+    'entropic_kbc_n4': {
+        'method': 'trt-kbc-n4',
+        'stencil': 'D3Q27',
+        'compressible': True,
+        'relaxation_rates': [omega, sp.Symbol("omega_free")],
+        'entropic': True,
+    },
    'smagorinsky': {
        'method': 'srt',
        'stencil': 'D3Q19',
@@ -76,8 +89,19 @@ with CodeGeneration() as ctx:
        'optimization': {'cse_global': True,
                         'cse_pdfs': False}
    }
-    options = options_dict.get(ctx.config, options_dict['srt'])
+    config_name = ctx.config
+    noopt = False
+    if config_name.endswith("_noopt"):
+        noopt = True
+        config_name = config_name[:-len("_noopt")]
+
+    options = options_dict[config_name]
    options.update(common_options)
+    options = options.copy()
+
+    if noopt:
+        options['optimization']['cse_global'] = False
+        options['optimization']['cse_pdfs'] = False

    stencil_str = options['stencil']
    q = int(stencil_str[stencil_str.find('Q')+1:])
@@ -85,14 +109,22 @@ with CodeGeneration() as ctx:
    options['optimization']['symbolic_field'] = pdfs

    vp = [
+        ('double', 'omega_0'),
+        ('double', 'omega_1'),
+        ('double', 'omega_2'),
+        ('double', 'omega_3'),
+        ('double', 'omega_4'),
+        ('double', 'omega_5'),
+        ('double', 'omega_6'),
        ('int32_t', 'cudaBlockSize0'),
-        ('int32_t', 'cudaBlockSize1')
+        ('int32_t', 'cudaBlockSize1'),
    ]
    lb_method = create_lb_method(**options)
    update_rule = create_lb_update_rule(lb_method=lb_method, **options)

-    update_rule = insert_fast_divisions(update_rule)
-    update_rule = insert_fast_sqrts(update_rule)
+    if not noopt:
+        update_rule = insert_fast_divisions(update_rule)
+        update_rule = insert_fast_sqrts(update_rule)

    # CPU lattice model - required for macroscopic value computation, VTK output etc.
    options_without_opt = options.copy()