UniformGridGPU.cpp 15.6 KB
Newer Older
1
#include "core/Environment.h"
2
#include "core/logging/Initialization.h"
3
#include "core/math/Random.h"
4
#include "python_coupling/CreateConfig.h"
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
5
6
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
7
8
9
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
10
11
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
12
13
14
15
16
17
18
19
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
Martin Bauer's avatar
Martin Bauer committed
20
#include "cuda/NVTX.h"
21
22
23
24
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
25
#include "cuda/DeviceSelectMPI.h"
26
#include "domain_decomposition/SharedSweep.h"
27
28
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
29
30
31
32
33
34

#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
35
#include "UniformGridGPU_Communication.h"
36
37
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
38
#include "UniformGridGPU_Defines.h"
39
40
41
42
43
44


using namespace walberla;

using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;

45
46
47
const auto Q = LatticeModel_T::Stencil::Q;


48
49
using Stencil_T = LatticeModel_T::Stencil;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
50
using PdfField_T = GhostLayerField<real_t, Q>;
51
using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
52
using VelocityField_T = GhostLayerField<real_t, 3>;
53
54
55
56
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;


57
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
Markus Holzer's avatar
Markus Holzer committed
58
                       const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) )
59
60
61
62
63
{
    math::seedRandomGenerator(0);
    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
    for( auto & block: *blocks)
    {
64
65
        auto velField = block.getData<VelocityField_T>( velFieldID );
        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
66
67
68
            Cell globalCell;
            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
69
70
71
            velField->get(x, y, z, 1) = real_t(0);
            velField->get(x, y, z, 2) = randomReal;

72
            if( globalCell[2] >= halfZ ) {
73
                velField->get(x, y, z, 0) = xMagnitude;
74
            } else {
75
                velField->get(x, y, z, 0) = -xMagnitude;
76
77
78
79
80
            }
        );
    }
}

81
82
83
84

int main( int argc, char **argv )
{
   mpi::Environment env( argc, argv );
85
   cuda::selectDeviceBasedOnMpiRank();
86
87
88

   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
   {
Martin Bauer's avatar
Martin Bauer committed
89
90
      WALBERLA_MPI_WORLD_BARRIER();

91
      auto config = *cfg;
92
      logging::configureLogging( config );
93
94
      auto blocks = blockforest::createUniformBlockGridFromConfig( config );

95
      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
96
97
      // Reading parameters
      auto parameters = config->getOneBlock( "Parameters" );
98
      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
99
100
      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
101
      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
102
103

      // Creating fields
104
      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
105
106
      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);

107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
      if( timeStepStrategy != "kernelOnlyNoInit")
      {
          if ( initShearFlow )
          {
              WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
              initShearVelocity( blocks, velFieldCpuID );
          }

          pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
          for( auto & block : *blocks )
              setterSweep( &block );

          // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
          blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
          initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
          initialComm();
123
124
      }

125
126
127
      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );

128

129
130
      // Boundaries
      const FlagUID fluidFlagUID( "Fluid" );
Martin Bauer's avatar
Martin Bauer committed
131
132
133
134
135
136
137
138
      auto boundariesConfig = config->getBlock( "Boundaries" );
      bool disableBoundaries = true;
      if( boundariesConfig )
      {
          disableBoundaries = false;
          geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
          geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
      }
139
140
141
142
143
144
145

      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
      lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);

      ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );

Martin Bauer's avatar
Martin Bauer committed
146
       // Communication setup
147
      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
148
      Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
149
150
151
152
153
154
155
156
157
158
      const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
      CommunicationSchemeType communicationScheme;
      if( communicationSchemeStr == "GPUPackInfo_Baseline")
          communicationScheme = GPUPackInfo_Baseline;
      else if (communicationSchemeStr == "GPUPackInfo_Streams")
          communicationScheme = GPUPackInfo_Streams;
      else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
          communicationScheme = UniformGPUScheme_Baseline;
      else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
          communicationScheme = UniformGPUScheme_Memcpy;
159
160
      else if (communicationSchemeStr == "MPIDatatypes")
          communicationScheme = MPIDatatypes;
161
162
      else if (communicationSchemeStr == "MPIDatatypesFull")
          communicationScheme = MPIDatatypesFull;
163
164
165
166
      else {
          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
      }

Martin Bauer's avatar
Martin Bauer committed
167
      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
168
      for(uint_t i=0; i< 3; ++i)
169
170
171
172
173
      {
          if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
              WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
          }
      }
174
175
176
177

      int streamHighPriority = 0;
      int streamLowPriority = 0;
      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
178
      WALBERLA_CHECK(gpuBlockSize[2] == 1);
179
180
181
      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
                                                    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
                                                    gpuBlockSize[0], gpuBlockSize[1],
182
                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
183
      lbKernel.setOuterPriority( streamHighPriority );
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
184
185
      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
         gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
186
187
188
189
190
191

      auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
      auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
      auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
      auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );

Martin Bauer's avatar
Martin Bauer committed
192
193
194
195
196
197
198
199
200
201
202
203
      uint_t currentTimeStep = 0;

      auto simpleOverlapTimeStep = [&] ()
      {
          gpuComm.startCommunication(defaultStream);
          for( auto &block: *blocks )
              lbKernel.inner( &block, defaultStream );
          gpuComm.wait(defaultStream);
          for( auto &block: *blocks )
              lbKernel.outer( &block, defaultStream );
      };

204
205
      auto overlapTimeStep = [&]()
      {
Martin Bauer's avatar
Martin Bauer committed
206
         cuda::NvtxRange namedRange("timestep");
207
208
209
210
         auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );

         innerOuterSection.run([&]( auto innerStream )
         {
Martin Bauer's avatar
Martin Bauer committed
211
            cuda::nameStream(innerStream, "inner stream");
212
213
            for( auto &block: *blocks )
            {
Martin Bauer's avatar
Martin Bauer committed
214
               if(!disableBoundaries)
215
216
               {
                  auto p = boundaryInnerStreams.parallelSection( innerStream );
217
218
                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
219
220
221
222
223
224
225
               }
               lbKernel.inner( &block, innerStream );
            }
         });

         innerOuterSection.run([&]( auto outerStream )
         {
Martin Bauer's avatar
Martin Bauer committed
226
            cuda::nameStream(outerStream, "outer stream");
227
            gpuComm( outerStream );
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
228

229
230
            for( auto &block: *blocks )
            {
Martin Bauer's avatar
Martin Bauer committed
231
               if(!disableBoundaries)
232
233
               {
                  auto p = boundaryOuterStreams.parallelSection( outerStream );
234
235
                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
236
237
238
239
               }
               lbKernel.outer( &block, outerStream );
            }
         });
Martin Bauer's avatar
Martin Bauer committed
240
         currentTimeStep += 1;
241
242
243
244
245
246
247
248
249
      };


      auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
      auto normalTimeStep = [&]()
      {
         gpuComm();
         for( auto &block: *blocks )
         {
Martin Bauer's avatar
Martin Bauer committed
250
            if(!disableBoundaries)
251
252
            {
               auto p = boundaryStreams.parallelSection( defaultStream );
253
254
               p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
               p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
255
256
257
258
259
            }
            lbKernel( &block );
         }
      };

Martin Bauer's avatar
Martin Bauer committed
260
261
262
263
264
265
      auto kernelOnlyFunc = [&] ()
      {
          for( auto &block: *blocks )
              lbKernel( &block );
      };

266
      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
Martin Bauer's avatar
Martin Bauer committed
267
268
269
270
271
272
273
274

      std::function<void()> timeStep;
      if (timeStepStrategy == "noOverlap")
          timeStep = std::function<void()>( normalTimeStep );
      else if (timeStepStrategy == "complexOverlap")
          timeStep = std::function<void()>( overlapTimeStep );
      else if (timeStepStrategy == "simpleOverlap")
          timeStep = simpleOverlapTimeStep;
275
      else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
Martin Bauer's avatar
Martin Bauer committed
276
277
278
          WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
          timeStep = kernelOnlyFunc;
      }
Martin Bauer's avatar
Martin Bauer committed
279
280
281
      else {
          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
      }
Martin Bauer's avatar
Martin Bauer committed
282

283
284
285
      timeLoop.add() << BeforeFunction( timeStep  )
                     << Sweep( []( IBlock * ) {}, "time step" );

286
287
      pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );

288
289
290
291
292
293
      // VTK
      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
      if( vtkWriteFrequency > 0 )
      {
         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
                                                          "simulation_step", false, true, true, false, 0 );
294
295
296
297
298
299
300
         auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
         vtkOutput->addCellDataWriter(velWriter);
         vtkOutput->addBeforeFunction( [&]() {
             cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
             for( auto & block : *blocks )
                 getterSweep( &block );
         });
301
302
303
         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
      }

João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
304

Martin Bauer's avatar
Martin Bauer committed
305
306
307
308
309
310
311
312

      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
      for(int i=0; i < warmupSteps; ++i )
         timeLoop.singleStep();

      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
      if (remainingTimeLoggerFrequency > 0) {
313
          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
Martin Bauer's avatar
Martin Bauer committed
314
315
316
          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
      }

317
318
319
320
321
322
323
324
      bool useGui = parameters.getParameter<bool>( "useGui", false );
      if( useGui )
      {
          GUI gui( timeLoop, blocks, argc, argv);
          lbm::connectToGui<LatticeModel_T>(gui);
          gui.run();
      }
      else
Martin Bauer's avatar
Martin Bauer committed
325
      {
326
          for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
Martin Bauer's avatar
Martin Bauer committed
327
          {
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
              timeLoop.setCurrentTimeStepToZero();
              WcTimer simTimer;
              cudaDeviceSynchronize();
              WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
              simTimer.start();
              timeLoop.run();
              cudaDeviceSynchronize();
              simTimer.end();
              WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
              auto time = simTimer.last();
              auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
              auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
              WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
              WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
              WALBERLA_ROOT_SECTION()
Martin Bauer's avatar
Martin Bauer committed
343
              {
344
345
346
                  python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
                  if ( pythonCallbackResults.isCallable())
                  {
347
                      const char * storagePattern = "twofield";
348
                      pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
349
350
                      pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
                      pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
351
                      pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
352
353
                      pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
                      pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
354
355
356
                      // Call Python function to report results
                      pythonCallbackResults();
                  }
Martin Bauer's avatar
Martin Bauer committed
357
358
              }
          }
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
359
      }
360
361
362
   }

   return 0;
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
363
}