Skip to content
Snippets Groups Projects
Commit 6dc55108 authored by Martin Bauer's avatar Martin Bauer
Browse files

UniformGridGPU

- fix in time measurement
- options to disable boundaries and/or communication
- priority for communication & outer part of kernel
parent 628f2060
No related merge requests found
......@@ -87,7 +87,10 @@ int main( int argc, char **argv )
noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
//pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
// Communication setup
bool disableBoundaries = parameters.getParameter<bool>( "disableBoundaries", false );
bool kernelOnly = parameters.getParameter<bool>( "kernelOnly", false );
// Communication setup
bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline );
......@@ -98,8 +101,6 @@ int main( int argc, char **argv )
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega );
lbKernel.setOuterPriority( streamHighPriority );
//CommScheme_T gpuComm( blocks, cudaEnabledMPI );
//gpuComm.addPackInfo( make_shared<pystencils::UniformGridGPU_PackInfo>( pdfFieldGpuID ));
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
......@@ -116,6 +117,7 @@ int main( int argc, char **argv )
{
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryInnerStreams.parallelSection( innerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
......@@ -131,6 +133,7 @@ int main( int argc, char **argv )
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryOuterStreams.parallelSection( outerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
......@@ -148,6 +151,7 @@ int main( int argc, char **argv )
gpuComm();
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryStreams.parallelSection( defaultStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
......@@ -157,9 +161,21 @@ int main( int argc, char **argv )
}
};
auto kernelOnlyFunc = [&] ()
{
for( auto &block: *blocks )
lbKernel( &block );
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
std::function<void()> timeStep = overlapCommunication ? std::function<void()>( overlapTimeStep ) :
std::function<void()>( normalTimeStep );
if( kernelOnly )
{
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
}
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
......@@ -185,8 +201,8 @@ int main( int argc, char **argv )
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
simTimer.start();
cudaDeviceSynchronize();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");
auto time = simTimer.last();
......
......@@ -2,13 +2,16 @@
Parameters
{
omega 1.8;
timesteps 1000;
timesteps 500;
remainingTimeLoggerFrequency 3;
vtkWriteFrequency 0;
overlapCommunication true;
cudaEnabledMPI false;
kernelOnly false;
disableBoundaries false;
}
DomainSetup
......@@ -20,8 +23,10 @@ DomainSetup
Boundaries
{
/*
Border { direction W; walldistance -1; flag NoSlip; }
Border { direction E; walldistance -1; flag NoSlip; }
Border { direction S; walldistance -1; flag NoSlip; }
Border { direction N; walldistance -1; flag UBB; }
*/
}
......@@ -6,6 +6,10 @@ from lbmpy_walberla import generate_lattice_model, generate_boundary
from pystencils_walberla import CodeGeneration, generate_sweep
sweep_block_size = (128, 1, 1)
sweep_params = {'block_size': sweep_block_size}
with CodeGeneration() as ctx:
# LB options
options = {
......@@ -16,8 +20,8 @@ with CodeGeneration() as ctx:
'compressible': False,
'temporary_field_name': 'pdfs_tmp',
'optimization': {'cse_global': True,
'cse_pdfs': True,
'gpu_indexing_params': {'block_size': (128, 1, 1)}}
'cse_pdfs': False,
}
}
lb_method = create_lb_method(**options)
update_rule = create_lb_update_rule(lb_method=lb_method, **options)
......@@ -27,7 +31,7 @@ with CodeGeneration() as ctx:
# gpu LB sweep & boundaries
generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')],
inner_outer_split=True, target='gpu')
inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params)
generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment