From 16c96838e03c514360c071634bb78a0b3a77f5a0 Mon Sep 17 00:00:00 2001
From: markus holzer <markus.holzer@fau.de>
Date: Fri, 25 Sep 2020 11:00:40 +0200
Subject: [PATCH] Implemented tutorial 03 for GPU usage

---
 .../codegen/03_AdvancedLBMCodegen.cpp         | 72 +++++++++++++++----
 .../codegen/03_AdvancedLBMCodegen.prm         | 21 +-----
 .../codegen/03_AdvancedLBMCodegen.py          | 12 ++--
 apps/tutorials/codegen/CMakeLists.txt         | 34 ++++++---
 4 files changed, 91 insertions(+), 48 deletions(-)

diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
index 78723495f..feb0a66e0 100644
--- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
@@ -22,6 +22,16 @@
 
 #include "core/all.h"
 
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/ParallelStreams.h"
+#include "cuda/NVTX.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/DeviceSelectMPI.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#endif
+
 #include "domain_decomposition/all.h"
 
 #include "field/all.h"
@@ -29,8 +39,6 @@
 
 #include "geometry/all.h"
 
-#include "lbm/vtk/VTKOutput.h"
-
 #include "stencil/D2Q9.h"
 
 #include "timeloop/all.h"
@@ -64,6 +72,10 @@ typedef walberla::uint8_t flag_t;
 typedef FlagField< flag_t > FlagField_T;
 typedef lbm::CumulantMRTNoSlip NoSlip_T;
 
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+typedef cuda::GPUField<double> GPUField;
+#endif
+
 //////////////////////////////////////////
 /// Shear Flow Velocity Initialization ///
 //////////////////////////////////////////
@@ -119,15 +131,24 @@ int main(int argc, char** argv)
    const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.8));
    const double remainingTimeLoggerFrequency =
       parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
+   const int32_t VTKwriteFrequency = parameters.getParameter< int32_t >("VTKwriteFrequency", 1000);
 
    ////////////////////////////////////
    /// PDF Field and Velocity Setup ///
    ////////////////////////////////////
-
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   // CPU fields
+   BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
+   BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+   // GPU fields
+   BlockDataID pdfFieldId = cuda::addGPUFieldToStorage<cuda::GPUField<real_t >>( blocks,"pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t (1));
+   BlockDataID velocityFieldIdGPU = cuda::addGPUFieldToStorage<VectorField_T >( blocks, velocityFieldId, "velocity on GPU", true );
+#else
    BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
 
    BlockDataID pdfFieldId  = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx);
    BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+#endif
 
    // Velocity field setup
    auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup");
@@ -136,13 +157,28 @@ int main(int argc, char** argv)
    real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0));
 
    // pdfs setup
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::fieldCpy<GPUField, VectorField_T>(blocks, velocityFieldIdGPU, velocityFieldId);
+   pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho);
+#else
    pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho);
+#endif
 
    for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
    {
       pdfSetter(&(*blockIt));
    }
 
+   /////////////
+   /// Sweep ///
+   /////////////
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega);
+#else
+   pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega);
+#endif
+
    /////////////////////////
    /// Boundary Handling ///
    /////////////////////////
@@ -165,29 +201,37 @@ int main(int argc, char** argv)
    SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
 
    // Communication
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   cuda::communication::UniformGPUScheme< Stencil_T > com( blocks, 0 );
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+   auto simpleOverlapTimeStep = [&]()
+   {
+     com.communicate(nullptr);
+   };
+   auto communication = std::function<void()>( simpleOverlapTimeStep );
+#else
    blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
    communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+#endif
 
    // Timeloop
    timeloop.add() << BeforeFunction(communication, "communication") << Sweep(noSlip);
-   timeloop.add() << Sweep(pystencils::CumulantMRTSweep(pdfFieldId, velocityFieldId, omega));
-
-   // Stability Checker
-   timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
-                                    walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID)),
-                                 "LBM stability check");
+   timeloop.add() << Sweep(CumulantMRTSweep);
 
    // Time logger
    timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
                                  "remaining time logger");
 
-   uint_t vtkWriteFrequency = uint_c(100);
-   if (vtkWriteFrequency > 0)
+   if (VTKwriteFrequency > 0)
    {
       const std::string path = "vtk_out/tut03";
-      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", vtkWriteFrequency, 0,
-                                                      false, path, "simulation_step", false, true, true, false, 0);
-
+      auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0,
+                                                false, path, "simulation_step", false, true, true, false, 0);
+      #if defined(WALBERLA_BUILD_WITH_CUDA)
+         vtkOutput->addBeforeFunction( [&]() {
+            cuda::fieldCpy<VectorField_T, GPUField>(blocks, velocityFieldId, velocityFieldIdGPU);
+          });
+      #endif
       auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity");
       vtkOutput->addCellDataWriter(velWriter);
 
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm b/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm
index b194b4149..d292c02d1 100644
--- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm
@@ -2,9 +2,10 @@
 Parameters 
 {
 	omega           1.8;
-	timesteps       100000;
+	timesteps       10001;
 
 	remainingTimeLoggerFrequency 3; // in seconds
+	VTKwriteFrequency 1000;
 }
 
 ShearFlowSetup
@@ -35,21 +36,3 @@ Boundaries
 {   
 	Border { direction S,N; walldistance -1; flag NoSlip; }		
 }
-
-
-VTK 
-{
-   velocity_field
-   {
-      writeFrequency 10;
-      ghostLayers    1;
-      
-      inclusion_filters {
-         DomainFilter;
-      }
-      
-      writers {
-         Velocity;
-      }
-   }
-}
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py
index 302afa414..f659051f6 100644
--- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py
@@ -64,15 +64,19 @@ pdfs_setter = macroscopic_values_setter(lbm_method,
 #   =====================
 
 with CodeGeneration() as ctx:
+    if ctx.cuda:
+        target = 'gpu'
+    else:
+        target = 'cpu'
 
     #   LBM Sweep
-    generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)])
+    generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target)
 
     #   Pack Info
-    generate_pack_info_from_kernel(ctx, "CumulantMRTPackInfo", lbm_update_rule)
+    generate_pack_info_from_kernel(ctx, "CumulantMRTPackInfo", lbm_update_rule, target=target)
 
     #   Macroscopic Values Setter
-    generate_sweep(ctx, "InitialPDFsSetter", pdfs_setter)
+    generate_sweep(ctx, "InitialPDFsSetter", pdfs_setter, target=target)
 
     #   NoSlip Boundary
-    generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method)
+    generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method, target=target)
diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt
index c9eb292d3..339f64819 100644
--- a/apps/tutorials/codegen/CMakeLists.txt
+++ b/apps/tutorials/codegen/CMakeLists.txt
@@ -24,16 +24,28 @@ if( WALBERLA_BUILD_WITH_CODEGEN )
                     DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython )
 
     #   Tutorial 3: Advanced lbmpy Code Generation
-    walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython
-        FILE 03_AdvancedLBMCodegen.py
-        OUT_FILES   CumulantMRTSweep.cpp CumulantMRTSweep.h
-                    CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h
-                    InitialPDFsSetter.cpp InitialPDFsSetter.h
-                    CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h)
-
-    walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp 
-                    FILES 03_AdvancedLBMCodegen.cpp
-                    DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
-
+    if(WALBERLA_BUILD_WITH_CUDA)
+        walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython
+            FILE 03_AdvancedLBMCodegen.py
+            OUT_FILES   CumulantMRTSweep.cu CumulantMRTSweep.h
+                        CumulantMRTPackInfo.cu CumulantMRTPackInfo.h
+                        InitialPDFsSetter.cu InitialPDFsSetter.h
+                        CumulantMRTNoSlip.cu CumulantMRTNoSlip.h)
+
+        walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp
+                        FILES 03_AdvancedLBMCodegen.cpp
+                        DEPENDS blockforest cuda core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
+    else()
+        walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython
+                FILE 03_AdvancedLBMCodegen.py
+                OUT_FILES   CumulantMRTSweep.cpp CumulantMRTSweep.h
+                CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h
+                InitialPDFsSetter.cpp InitialPDFsSetter.h
+                CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h)
+
+        walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp
+                FILES 03_AdvancedLBMCodegen.cpp
+                DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython )
+    endif()
 
 endif()
\ No newline at end of file
-- 
GitLab