UniformGridGPU.cpp 13.6 KB
Newer Older
1
2
#include "blockforest/Initialization.h"

3
#include "core/Environment.h"
4
#include "core/logging/Initialization.h"
5
#include "core/timing/RemainingTimeLogger.h"
6
7
#include "core/timing/TimingPool.h"

8
#include "cuda/AddGPUFieldToStorage.h"
9
#include "cuda/DeviceSelectMPI.h"
10
11
12
13
#include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
14

15
16
17
18
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/communication/PackInfo.h"
#include "field/vtk/VTKWriter.h"
19

20
#include "geometry/InitBoundaryHandling.h"
21

22
#include "lbm/inplace_streaming/TimestepTracker.h"
23

24
25
26
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
27

28
#include "timeloop/SweepTimeloop.h"
29

30
#include "InitShearVelocity.h"
31

32
#include <cmath>
33

34
35
#include "UniformGridGPU_InfoHeader.h"
using namespace walberla;
36

37
using FlagField_T            = FlagField<uint8_t>;
38

39
int main(int argc, char** argv)
40
{
41
   mpi::Environment env(argc, argv);
42
   cuda::selectDeviceBasedOnMpiRank();
43

44
   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
45
   {
46
47
48
49
50
51
52
      WALBERLA_MPI_WORLD_BARRIER()

      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())

      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                        SETUP AND CONFIGURATION                                             ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Martin Bauer's avatar
Martin Bauer committed
53

54
      auto config = *cfg;
55
56
      logging::configureLogging(config);
      auto blocks = blockforest::createUniformBlockGridFromConfig(config);
57

58
59
      Vector3< uint_t > cellsPerBlock =
         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
60
      // Reading parameters
61
62
63
      auto parameters        = config->getOneBlock("Parameters");
      const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.4));
      const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
64
      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
65
66

      // Creating fields
67
68
69
70
71
72
73
74
75
76
77
78
79
80
      BlockDataID pdfFieldCpuID =
         field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_t(std::nan("")), field::fzyx);
      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);

      // Initialize velocity on cpu
      if( initShearFlow ){
         WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
         initShearVelocity(blocks, velFieldCpuID);
      }

      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true);
      // Velocity field is copied to the GPU
      BlockDataID velFieldGpuID =
         cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
81

82
83
84
85
86
87
88
89
90
91
      pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID);

      // Set up initial PDF values
      for (auto& block : *blocks)
         setterSweep(&block);

      Vector3< int > innerOuterSplit =
         parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));

      for (uint_t i = 0; i < 3; ++i)
92
      {
93
94
95
96
         if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
         {
            WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
         }
97
98
      }

99
100
101
102
      Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
      bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
      Vector3< int32_t > gpuBlockSize =
         parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
103

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
      int streamHighPriority = 0;
      int streamLowPriority  = 0;
      WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))

      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      using LbSweep      = lbm::UniformGridGPU_LbKernel;
      using PackInfoEven = lbm::UniformGridGPU_PackInfoEven;
      using PackInfoOdd  = lbm::UniformGridGPU_PackInfoOdd;
      using cuda::communication::UniformGPUScheme;

      LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
      lbSweep.setOuterPriority(streamHighPriority);
119

120
121
      // Boundaries
      const FlagUID fluidFlagUID( "Fluid" );
122
      BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
Martin Bauer's avatar
Martin Bauer committed
123
      auto boundariesConfig = config->getBlock( "Boundaries" );
124
      bool boundaries = false;
Martin Bauer's avatar
Martin Bauer committed
125
126
      if( boundariesConfig )
      {
127
128
129
         boundaries = true;
         geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
         geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
Martin Bauer's avatar
Martin Bauer committed
130
      }
131
132

      lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
133
      noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
134

135
136
      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
      ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
137

138
139
      // Initial setup is the post-collision state of an even time step
      auto tracker = make_shared< lbm::TimestepTracker >(0);
140

141
142
143
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                           COMMUNICATION SCHEME                                             ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Martin Bauer's avatar
Martin Bauer committed
144

145
146
147
      UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
      auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
      comm.addPackInfo(packInfo);
148

149
150
151
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                          TIME STEP DEFINITIONS                                             ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
152

153
      auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
154

155
156
157
      auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
         noSlip.run(block, t, stream);
         ubb.run(block, t, stream);
158
159
      };

160
161
162
163
      auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
         noSlip.inner(block, t, stream);
         ubb.inner(block, t, stream);
      };
164

165
166
167
168
169
170
171
172
173
174
175
      auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
         noSlip.outer(block, t, stream);
         ubb.outer(block, t, stream);
      };

      auto simpleOverlapTimeStep = [&]() {
         // Communicate post-collision values of previous timestep...
         comm.startCommunication(defaultStream);
         for (auto& block : *blocks){
            if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
            lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
176
         }
177
178
179
180
181
182
183
         comm.wait(defaultStream);
         for (auto& block : *blocks){
            if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
            lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
         }

         tracker->advance();
184
185
      };

186
187
188
189
190
191
192
193
      auto normalTimeStep = [&]() {
         comm.communicate(defaultStream);
         for (auto& block : *blocks){
            if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
            lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
         }

         tracker->advance();
Martin Bauer's avatar
Martin Bauer committed
194
195
      };

196
197
198
199
200
201
202
      // With two-fields patterns, ghost layer cells act as constant stream-in boundaries;
      // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
      auto kernelOnlyFunc = [&]() {
         tracker->advance();
         for (auto& block : *blocks)
            lbSweep(&block, tracker->getCounter(), defaultStream);
      };
Martin Bauer's avatar
Martin Bauer committed
203

204
205
206
207
208
209
210
211
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                             TIME LOOP SETUP                                                ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

      SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);

      const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
      std::function< void() > timeStep;
Martin Bauer's avatar
Martin Bauer committed
212
      if (timeStepStrategy == "noOverlap")
213
         timeStep = std::function< void() >(normalTimeStep);
Martin Bauer's avatar
Martin Bauer committed
214
      else if (timeStepStrategy == "simpleOverlap")
215
216
217
218
219
220
221
222
         timeStep = simpleOverlapTimeStep;
      else if (timeStepStrategy == "kernelOnly")
      {
         WALBERLA_LOG_INFO_ON_ROOT(
            "Running only compute kernel without boundary - this makes only sense for benchmarking!")
         // Run initial communication once to provide any missing stream-in populations
         comm.communicate();
         timeStep = kernelOnlyFunc;
Martin Bauer's avatar
Martin Bauer committed
223
      }
224
225
226
227
      else
      {
         WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
                                      "'simpleOverlap', 'kernelOnly'")
Martin Bauer's avatar
Martin Bauer committed
228
      }
Martin Bauer's avatar
Martin Bauer committed
229

230
      timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
231

232
      // VTK
233
234
      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
      if (vtkWriteFrequency > 0)
235
      {
236
237
238
         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
                                                         "simulation_step", false, true, true, false, 0);
         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel");
239
         vtkOutput->addCellDataWriter(velWriter);
240
241
242

         vtkOutput->addBeforeFunction([&]() {
           cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
243
         });
244
         timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
245
246
      }

247
248
249
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                               BENCHMARK                                                    ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
250

251
252
253
      int warmupSteps     = parameters.getParameter< int >("warmupSteps", 2);
      int outerIterations = parameters.getParameter< int >("outerIterations", 1);
      for (int i = 0; i < warmupSteps; ++i)
Martin Bauer's avatar
Martin Bauer committed
254
255
         timeLoop.singleStep();

256
257
258
      double remainingTimeLoggerFrequency =
         parameters.getParameter< double >("remainingTimeLoggerFrequency", -1.0); // in seconds
      if (remainingTimeLoggerFrequency > 0)
259
      {
260
261
262
         auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
                                                   remainingTimeLoggerFrequency);
         timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
263
      }
264
265

      for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
Martin Bauer's avatar
Martin Bauer committed
266
      {
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())

         timeLoop.setCurrentTimeStepToZero();
         WcTimer simTimer;
         cudaDeviceSynchronize();
         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
         WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
         simTimer.start();
         timeLoop.run();
         cudaDeviceSynchronize();
         simTimer.end();
         WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
         auto time      = simTimer.last();
         auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);

         auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
         WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
         WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
         WALBERLA_ROOT_SECTION()
         {
            python_coupling::PythonCallback pythonCallbackResults("results_callback");
            if (pythonCallbackResults.isCallable())
            {
               pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
               // Call Python function to report results
               pythonCallbackResults();
            }
         }
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
300
      }
301
302
303
   }

   return 0;
João Victor Tozatti Risso's avatar
João Victor Tozatti Risso committed
304
}