From 16c96838e03c514360c071634bb78a0b3a77f5a0 Mon Sep 17 00:00:00 2001 From: markus holzer <markus.holzer@fau.de> Date: Fri, 25 Sep 2020 11:00:40 +0200 Subject: [PATCH] Implemented tutorial 03 for GPU usage --- .../codegen/03_AdvancedLBMCodegen.cpp | 72 +++++++++++++++---- .../codegen/03_AdvancedLBMCodegen.prm | 21 +----- .../codegen/03_AdvancedLBMCodegen.py | 12 ++-- apps/tutorials/codegen/CMakeLists.txt | 34 ++++++--- 4 files changed, 91 insertions(+), 48 deletions(-) diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp index 78723495f..feb0a66e0 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp @@ -22,6 +22,16 @@ #include "core/all.h" +#if defined(WALBERLA_BUILD_WITH_CUDA) +#include "cuda/HostFieldAllocator.h" +#include "cuda/ParallelStreams.h" +#include "cuda/NVTX.h" +#include "cuda/AddGPUFieldToStorage.h" +#include "cuda/communication/GPUPackInfo.h" +#include "cuda/DeviceSelectMPI.h" +#include "cuda/communication/UniformGPUScheme.h" +#endif + #include "domain_decomposition/all.h" #include "field/all.h" @@ -29,8 +39,6 @@ #include "geometry/all.h" -#include "lbm/vtk/VTKOutput.h" - #include "stencil/D2Q9.h" #include "timeloop/all.h" @@ -64,6 +72,10 @@ typedef walberla::uint8_t flag_t; typedef FlagField< flag_t > FlagField_T; typedef lbm::CumulantMRTNoSlip NoSlip_T; +#if defined(WALBERLA_BUILD_WITH_CUDA) +typedef cuda::GPUField<double> GPUField; +#endif + ////////////////////////////////////////// /// Shear Flow Velocity Initialization /// ////////////////////////////////////////// @@ -119,15 +131,24 @@ int main(int argc, char** argv) const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.8)); const double remainingTimeLoggerFrequency = parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds + const int32_t VTKwriteFrequency = parameters.getParameter< int32_t >("VTKwriteFrequency", 1000); //////////////////////////////////// /// PDF Field and Velocity Setup /// //////////////////////////////////// - +#if defined(WALBERLA_BUILD_WITH_CUDA) + // CPU fields + BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx); + BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + // GPU fields + BlockDataID pdfFieldId = cuda::addGPUFieldToStorage<cuda::GPUField<real_t >>( blocks,"pdf field on GPU", Stencil_T::Size, field::fzyx, uint_t (1)); + BlockDataID velocityFieldIdGPU = cuda::addGPUFieldToStorage<VectorField_T >( blocks, velocityFieldId, "velocity on GPU", true ); +#else BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), field::fzyx); BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), field::fzyx); BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); +#endif // Velocity field setup auto shearFlowSetup = walberlaEnv.config()->getOneBlock("ShearFlowSetup"); @@ -136,13 +157,28 @@ int main(int argc, char** argv) real_t rho = shearFlowSetup.getParameter("rho", real_c(1.0)); // pdfs setup +#if defined(WALBERLA_BUILD_WITH_CUDA) + cuda::fieldCpy<GPUField, VectorField_T>(blocks, velocityFieldIdGPU, velocityFieldId); + pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldIdGPU, rho); +#else pystencils::InitialPDFsSetter pdfSetter(pdfFieldId, velocityFieldId, rho); +#endif for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) { pdfSetter(&(*blockIt)); } + ///////////// + /// Sweep /// + ///////////// + +#if defined(WALBERLA_BUILD_WITH_CUDA) + pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldIdGPU, omega); +#else + pystencils::CumulantMRTSweep CumulantMRTSweep(pdfFieldId, velocityFieldId, omega); +#endif + ///////////////////////// /// Boundary Handling /// ///////////////////////// @@ -165,29 +201,37 @@ int main(int argc, char** argv) SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps); // Communication +#if defined(WALBERLA_BUILD_WITH_CUDA) + cuda::communication::UniformGPUScheme< Stencil_T > com( blocks, 0 ); + com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); + auto simpleOverlapTimeStep = [&]() + { + com.communicate(nullptr); + }; + auto communication = std::function<void()>( simpleOverlapTimeStep ); +#else blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); +#endif // Timeloop timeloop.add() << BeforeFunction(communication, "communication") << Sweep(noSlip); - timeloop.add() << Sweep(pystencils::CumulantMRTSweep(pdfFieldId, velocityFieldId, omega)); - - // Stability Checker - timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >( - walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID)), - "LBM stability check"); + timeloop.add() << Sweep(CumulantMRTSweep); // Time logger timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency), "remaining time logger"); - uint_t vtkWriteFrequency = uint_c(100); - if (vtkWriteFrequency > 0) + if (VTKwriteFrequency > 0) { const std::string path = "vtk_out/tut03"; - auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", vtkWriteFrequency, 0, - false, path, "simulation_step", false, true, true, false, 0); - + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "cumulant_mrt_velocity_field", VTKwriteFrequency, 0, + false, path, "simulation_step", false, true, true, false, 0); + #if defined(WALBERLA_BUILD_WITH_CUDA) + vtkOutput->addBeforeFunction( [&]() { + cuda::fieldCpy<VectorField_T, GPUField>(blocks, velocityFieldId, velocityFieldIdGPU); + }); + #endif auto velWriter = make_shared< field::VTKWriter< VectorField_T > >(velocityFieldId, "Velocity"); vtkOutput->addCellDataWriter(velWriter); diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm b/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm index b194b4149..d292c02d1 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.prm @@ -2,9 +2,10 @@ Parameters { omega 1.8; - timesteps 100000; + timesteps 10001; remainingTimeLoggerFrequency 3; // in seconds + VTKwriteFrequency 1000; } ShearFlowSetup @@ -35,21 +36,3 @@ Boundaries { Border { direction S,N; walldistance -1; flag NoSlip; } } - - -VTK -{ - velocity_field - { - writeFrequency 10; - ghostLayers 1; - - inclusion_filters { - DomainFilter; - } - - writers { - Velocity; - } - } -} diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py index 302afa414..f659051f6 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.py +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.py @@ -64,15 +64,19 @@ pdfs_setter = macroscopic_values_setter(lbm_method, # ===================== with CodeGeneration() as ctx: + if ctx.cuda: + target = 'gpu' + else: + target = 'cpu' # LBM Sweep - generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)]) + generate_sweep(ctx, "CumulantMRTSweep", lbm_update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target) # Pack Info - generate_pack_info_from_kernel(ctx, "CumulantMRTPackInfo", lbm_update_rule) + generate_pack_info_from_kernel(ctx, "CumulantMRTPackInfo", lbm_update_rule, target=target) # Macroscopic Values Setter - generate_sweep(ctx, "InitialPDFsSetter", pdfs_setter) + generate_sweep(ctx, "InitialPDFsSetter", pdfs_setter, target=target) # NoSlip Boundary - generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method) + generate_boundary(ctx, "CumulantMRTNoSlip", NoSlip(), lbm_method, target=target) diff --git a/apps/tutorials/codegen/CMakeLists.txt b/apps/tutorials/codegen/CMakeLists.txt index c9eb292d3..339f64819 100644 --- a/apps/tutorials/codegen/CMakeLists.txt +++ b/apps/tutorials/codegen/CMakeLists.txt @@ -24,16 +24,28 @@ if( WALBERLA_BUILD_WITH_CODEGEN ) DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 02_LBMLatticeModelGenerationPython ) # Tutorial 3: Advanced lbmpy Code Generation - walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython - FILE 03_AdvancedLBMCodegen.py - OUT_FILES CumulantMRTSweep.cpp CumulantMRTSweep.h - CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h - InitialPDFsSetter.cpp InitialPDFsSetter.h - CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h) - - walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp - FILES 03_AdvancedLBMCodegen.cpp - DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) - + if(WALBERLA_BUILD_WITH_CUDA) + walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython + FILE 03_AdvancedLBMCodegen.py + OUT_FILES CumulantMRTSweep.cu CumulantMRTSweep.h + CumulantMRTPackInfo.cu CumulantMRTPackInfo.h + InitialPDFsSetter.cu InitialPDFsSetter.h + CumulantMRTNoSlip.cu CumulantMRTNoSlip.h) + + walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp + FILES 03_AdvancedLBMCodegen.cpp + DEPENDS blockforest cuda core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) + else() + walberla_generate_target_from_python( NAME 03_AdvancedLBMCodegenPython + FILE 03_AdvancedLBMCodegen.py + OUT_FILES CumulantMRTSweep.cpp CumulantMRTSweep.h + CumulantMRTPackInfo.cpp CumulantMRTPackInfo.h + InitialPDFsSetter.cpp InitialPDFsSetter.h + CumulantMRTNoSlip.cpp CumulantMRTNoSlip.h) + + walberla_add_executable ( NAME 03_AdvancedLBMCodegenApp + FILES 03_AdvancedLBMCodegen.cpp + DEPENDS blockforest core domain_decomposition field geometry timeloop lbm stencil vtk 03_AdvancedLBMCodegenPython ) + endif() endif() \ No newline at end of file -- GitLab