From 6a61fc77a95f5ea417403d0ad38a65dc10b62a74 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Fri, 14 Jun 2019 13:51:04 +0200 Subject: [PATCH] UniformGridGPU: New app with AA pattern (single PDF field) --- apps/benchmarks/UniformGridGPU/CMakeLists.txt | 22 +- .../UniformGridGPU/InitShearVelocity.h | 33 +++ .../UniformGridGPU/UniformGridGPU.cpp | 115 +++++--- .../UniformGridGPU/UniformGridGPU.prm | 9 +- .../UniformGridGPU/UniformGridGPU.py | 25 +- .../UniformGridGPU/UniformGridGPU_AA.cpp | 250 ++++++++++++++++++ .../UniformGridGPU/UniformGridGPU_AA.py | 100 +++++++ 7 files changed, 503 insertions(+), 51 deletions(-) create mode 100644 apps/benchmarks/UniformGridGPU/InitShearVelocity.h create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index c3052b523..6e704b430 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -8,11 +8,31 @@ waLBerla_python_file_generates(UniformGridGPU.py UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h UniformGridGPU_UBB.cu UniformGridGPU_UBB.h UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h + UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h + UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h ) foreach(config srt trt mrt smagorinsky entropic ) waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config} FILES UniformGridGPU.cpp UniformGridGPU.py - DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk + DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui CODEGEN_CFG ${config}) endforeach() + + + + +waLBerla_python_file_generates(UniformGridGPU_AA.py + UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h + UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h + UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h + UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h + UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h + UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h + ) + +set(config "srt") +waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config} + FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py + DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui + CODEGEN_CFG ${config}) diff --git a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h new file mode 100644 index 000000000..fe038ebdb --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h @@ -0,0 +1,33 @@ +#include "core/math/Random.h" +#include "domain_decomposition/StructuredBlockStorage.h" + + +namespace walberla { + + +inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID, + const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 ) +{ + math::seedRandomGenerator(0); + auto halfZ = blocks->getDomainCellBB().zMax() / 2; + for( auto & block: *blocks) + { + auto velField = block.getData<GhostLayerField<real_t, 3> >( velFieldID ); + WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, + Cell globalCell; + blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); + real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, 
fluctuationMagnitude); + velField->get(x, y, z, 1) = real_t(0); + velField->get(x, y, z, 2) = randomReal; + + if( globalCell[2] >= halfZ ) { + velField->get(x, y, z, 0) = xMagnitude; + } else { + velField->get(x, y, z, 0) = -xMagnitude; + } + ); + } +} + + +} \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 181f981bc..ff2c93c4d 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -5,12 +5,10 @@ #include "python_coupling/PythonCallback.h" #include "python_coupling/DictWrapper.h" #include "blockforest/Initialization.h" -#include "lbm/field/PdfField.h" -#include "lbm/field/AddToStorage.h" #include "field/FlagField.h" #include "field/AddToStorage.h" -#include "lbm/communication/PdfFieldPackInfo.h" -#include "lbm/vtk/VTKOutput.h" +#include "field/vtk/VTKWriter.h" +#include "field/communication/PackInfo.h" #include "lbm/PerformanceLogger.h" #include "blockforest/communication/UniformBufferedScheme.h" #include "timeloop/all.h" @@ -25,8 +23,9 @@ #include "cuda/AddGPUFieldToStorage.h" #include "cuda/communication/UniformGPUScheme.h" #include "cuda/DeviceSelectMPI.h" -#include "lbm/sweeps/CellwiseSweep.h" #include "domain_decomposition/SharedSweep.h" +#include "gui/Gui.h" +#include "lbm/gui/Connection.h" #include "UniformGridGPU_LatticeModel.h" #include "UniformGridGPU_LbKernel.h" @@ -34,37 +33,45 @@ #include "UniformGridGPU_UBB.h" #include "UniformGridGPU_NoSlip.h" #include "UniformGridGPU_Communication.h" +#include "UniformGridGPU_MacroSetter.h" +#include "UniformGridGPU_MacroGetter.h" using namespace walberla; using LatticeModel_T = lbm::UniformGridGPU_LatticeModel; +const auto Q = LatticeModel_T::Stencil::Q; + + using Stencil_T = LatticeModel_T::Stencil; using CommunicationStencil_T = LatticeModel_T::CommunicationStencil; -using PdfField_T = lbm::PdfField<LatticeModel_T>; +using PdfField_T = GhostLayerField<real_t, Q>; using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; - +using VelocityField_T = GhostLayerField<real_t, 3>; using flag_t = walberla::uint8_t; using FlagField_T = FlagField<flag_t>; -void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldID, +void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID, const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 ) { math::seedRandomGenerator(0); auto halfZ = blocks->getDomainCellBB().zMax() / 2; for( auto & block: *blocks) { - PdfField_T * pdfField = block.getData<PdfField_T>( pdfFieldID ); - WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField, + auto velField = block.getData<VelocityField_T>( velFieldID ); + WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + velField->get(x, y, z, 1) = real_t(0); + velField->get(x, y, z, 2) = randomReal; + if( globalCell[2] >= halfZ ) { - pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0)); + velField->get(x, y, z, 0) = xMagnitude; } else { - pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0,randomReal), real_t(1.0)); + velField->get(x, y, z, 0) = -xMagnitude; } ); } @@ -89,16 +96,24 @@ int main( int argc, char **argv ) auto parameters = 
config->getOneBlock( "Parameters" ); const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); - const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false); // Creating fields - auto latticeModel = LatticeModel_T( omega ); - BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx ); + BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(99.8), field::fzyx); + BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx); + if( initShearFlow ) { WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow"); - initShearVelocity( blocks, pdfFieldCpuID ); + initShearVelocity( blocks, velFieldCpuID ); } + pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID); + for( auto & block : *blocks ) + setterSweep( &block ); + // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here + blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks); + initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) ); + initialComm(); + BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -253,17 +268,21 @@ int main( int argc, char **argv ) timeLoop.add() << BeforeFunction( timeStep ) << Sweep( []( IBlock * ) {}, "time step" ); + pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID ); + // VTK uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); if( vtkWriteFrequency > 0 ) { auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step", false, true, true, false, 0 ); - vtkOutput->addCellDataWriter( - make_shared<lbm::VelocityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Velocity" )); - vtkOutput->addCellDataWriter( make_shared<lbm::DensityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Density" )); - vtkOutput->addBeforeFunction( - cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID )); + auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); + vtkOutput->addCellDataWriter(velWriter); + vtkOutput->addBeforeFunction( [&]() { + cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); + for( auto & block : *blocks ) + getterSweep( &block ); + }); timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); } @@ -280,31 +299,41 @@ int main( int argc, char **argv ) timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); } - for(int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) + bool useGui = parameters.getParameter<bool>( "useGui", false ); + if( useGui ) + { + GUI gui( timeLoop, blocks, argc, argv); + lbm::connectToGui<LatticeModel_T>(gui); + gui.run(); + } + else { - timeLoop.setCurrentTimeStepToZero(); - WcTimer simTimer; - cudaDeviceSynchronize(); - WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " 
time steps" ); - simTimer.start(); - timeLoop.run(); - cudaDeviceSynchronize(); - simTimer.end(); - WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); - auto time = simTimer.last(); - auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); - auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); - WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); - WALBERLA_ROOT_SECTION() + for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) { - python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); - if ( pythonCallbackResults.isCallable()) + timeLoop.setCurrentTimeStepToZero(); + WcTimer simTimer; + cudaDeviceSynchronize(); + WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); + simTimer.start(); + timeLoop.run(); + cudaDeviceSynchronize(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); + auto time = simTimer.last(); + auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); + auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); + WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); + WALBERLA_ROOT_SECTION() { - pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); - pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 ); - // Call Python function to report results - pythonCallbackResults(); + python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); + if ( pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); + pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 ); + // Call Python function to report results + pythonCallbackResults(); + } } } } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm index 9c0796a49..de2821b82 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -1,21 +1,21 @@ DomainSetup { - blocks < 2, 1, 1 >; - cellsPerBlock < 64, 64, 64 >; + blocks < 1, 1, 1 >; + cellsPerBlock < 512, 256, 256 >; periodic < 1, 1, 1 >; } Parameters { - timesteps 10000; // time steps of one performance measurement + timesteps 2000; //10000; // time steps of one performance measurement warmupSteps 0; // number of steps to run before measurement starts outerIterations 1; // how many measurements to conduct // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy communicationScheme UniformGPUScheme_Baseline; - vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled + vtkWriteFrequency 0; //100; // write a VTK file every n'th step, if zero VTK output is disabled cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly @@ -25,6 +25,7 @@ Parameters omega 1.92; initShearFlow 1; + useGui 0; } /* diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index 6e0539753..80a4a3490 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ 
b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -1,12 +1,15 @@ import sympy as sp import numpy as np +import pystencils as ps from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule from lbmpy.boundaries import NoSlip, UBB +from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor, StreamPushTwoFieldsAccessor from pystencils_walberla import generate_pack_info_from_kernel from lbmpy_walberla import generate_lattice_model, generate_boundary from pystencils_walberla import CodeGeneration, generate_sweep from pystencils.data_types import TypedSymbol from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions +from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter omega = sp.symbols("omega") # sweep_block_size = (128, 1, 1) @@ -16,7 +19,7 @@ sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), sweep_params = {'block_size': sweep_block_size} -options = { +options_dict = { 'srt': { 'method': 'srt', 'stencil': 'D3Q19', @@ -49,17 +52,25 @@ options = { } with CodeGeneration() as ctx: + accessor = StreamPullTwoFieldsAccessor() + #accessor = StreamPushTwoFieldsAccessor() + assert not accessor.is_inplace, "This app does not work for inplace accessors" common_options = { 'field_name': 'pdfs', 'temporary_field_name': 'pdfs_tmp', - #'kernel_type': 'collide_stream_push', + 'kernel_type': accessor, 'optimization': {'cse_global': True, 'cse_pdfs': False} } - options = options[ctx.config] + options = options_dict.get(ctx.config, options_dict['srt']) options.update(common_options) + stencil_str = options['stencil'] + q = int(stencil_str[stencil_str.find('Q')+1:]) + pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') + options['optimization']['symbolic_field'] = pdfs + vp = [ ('int32_t', 'cudaBlockSize0'), ('int32_t', 'cudaBlockSize1') @@ -84,5 +95,13 @@ with CodeGeneration() as ctx: generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu') generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu') + # getter & setter + setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs.center_vector, density=1) + getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs.center_vector, density=None) + generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments) + generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments) + # communication generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu') diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp new file mode 100644 index 000000000..348f63fd0 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp @@ -0,0 +1,250 @@ +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "python_coupling/CreateConfig.h" +#include "python_coupling/PythonCallback.h" +#include "python_coupling/DictWrapper.h" +#include "blockforest/Initialization.h" +#include "field/FlagField.h" +#include "field/AddToStorage.h" +#include "field/vtk/VTKWriter.h" +#include "field/communication/PackInfo.h" +#include "lbm/PerformanceLogger.h" +#include "blockforest/communication/UniformBufferedScheme.h" +#include "timeloop/all.h" +#include "geometry/all.h" +#include "cuda/HostFieldAllocator.h" +#include "cuda/communication/GPUPackInfo.h" +#include 
"cuda/ParallelStreams.h" +#include "core/timing/TimingPool.h" +#include "core/timing/RemainingTimeLogger.h" +#include "cuda/AddGPUFieldToStorage.h" +#include "cuda/communication/UniformGPUScheme.h" +#include "cuda/DeviceSelectMPI.h" +#include "domain_decomposition/SharedSweep.h" +#include "stencil/D3Q19.h" +#include "stencil/D3Q27.h" +#include "InitShearVelocity.h" + +#include "UniformGridGPU_AA_PackInfoPush.h" +#include "UniformGridGPU_AA_PackInfoPull.h" +#include "UniformGridGPU_AA_MacroSetter.h" +#include "UniformGridGPU_AA_MacroGetter.h" +#include "UniformGridGPU_AA_LbKernelEven.h" +#include "UniformGridGPU_AA_LbKernelOdd.h" + + +using namespace walberla; + +using Stencil_T = stencil::D3Q19; //TODO make generic - and determine from python script +using CommunicationStencil_T = Stencil_T; +using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >; +using VelocityField_T = GhostLayerField< real_t, 3 >; + + +int main( int argc, char **argv ) +{ + mpi::Environment env( argc, argv ); + cuda::selectDeviceBasedOnMpiRank(); + + for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) + { + WALBERLA_MPI_WORLD_BARRIER(); + + auto config = *cfg; + logging::configureLogging( config ); + auto blocks = blockforest::createUniformBlockGridFromConfig( config ); + + Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" ); + // Reading parameters + auto parameters = config->getOneBlock( "Parameters" ); + const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 )); + const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 )); + + // Creating fields + BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx ); + BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx ); + + WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" ); + initShearVelocity( blocks, velFieldCpuID ); + + pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID ); + pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID ); + + for ( auto &block : *blocks ) + setterSweep( &block ); + + BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); + + Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); + Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); + bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); + Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1)); + + int streamHighPriority = 0; + int streamLowPriority = 0; + WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority )); + WALBERLA_CHECK( gpuBlockSize[2] == 1 ); + + + using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven; + using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd; + using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull; + using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush; + using cuda::communication::UniformGPUScheme; + + KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell ); + KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell ); 
+ + kernelEven.setOuterPriority( streamHighPriority ); + kernelOdd .setOuterPriority( streamHighPriority ); + + auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI ); + auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI ); + pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) ); + pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) ); + + + auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); + + auto setupPhase = [&]() { + for ( auto &block: *blocks ) + kernelEven( &block ); + + pullScheme->communicate(); + + for ( auto &block: *blocks ) + kernelOdd( &block ); + }; + + + auto tearDownPhase = [&]() { + pushScheme->communicate(); + cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID ); + for ( auto &block : *blocks ) + getterSweep( &block ); + }; + + + auto simpleOverlapTimeStep = [&]() + { + // Even + pushScheme->startCommunication( defaultStream ); + for ( auto &block: *blocks ) + kernelEven.inner( &block, defaultStream ); + pushScheme->wait( defaultStream ); + for ( auto &block: *blocks ) + kernelEven.outer( &block, defaultStream ); + + // Odd + pullScheme->startCommunication( defaultStream ); + for ( auto &block: *blocks ) + kernelOdd.inner( &block, defaultStream ); + pullScheme->wait( defaultStream ); + for ( auto &block: *blocks ) + kernelOdd.outer( &block, defaultStream ); + }; + + auto normalTimeStep = [&]() + { + pushScheme->communicate( defaultStream ); + for ( auto &block: *blocks ) + kernelEven( &block, defaultStream ); + + pullScheme->communicate( defaultStream ); + for ( auto &block: *blocks ) + kernelOdd( &block, defaultStream ); + }; + + auto kernelOnlyFunc = [&]() + { + for ( auto &block: *blocks ) + kernelEven( &block, defaultStream ); + for ( auto &block: *blocks ) + kernelOdd( &block, defaultStream ); + }; + + SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 ); + + const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" ); + std::function< void() > timeStep; + if ( timeStepStrategy == "noOverlap" ) + timeStep = std::function< void() >( normalTimeStep ); + else if ( timeStepStrategy == "simpleOverlap" ) + timeStep = simpleOverlapTimeStep; + else if ( timeStepStrategy == "kernelOnly" ) + { + WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" ) + timeStep = kernelOnlyFunc; + } + else + { + WALBERLA_ABORT_NO_DEBUG_INFO( + "Invalid value for 'timeStepStrategy'. 
Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" ); + } + + timeLoop.add() << BeforeFunction( timeStep ) + << Sweep( []( IBlock * ) {}, "time step" ); + + + // VTK + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 ); + if ( vtkWriteFrequency > 0 ) + { + auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0 ); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" ); + vtkOutput->addCellDataWriter( velWriter ); + vtkOutput->addBeforeFunction( [&]() + { + tearDownPhase(); + setupPhase(); + } ); + timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); + } + + int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 ); + int outerIterations = parameters.getParameter< int >( "outerIterations", 1 ); + setupPhase(); + for ( int i = 0; i < warmupSteps; ++i ) + timeLoop.singleStep(); + + double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds + if ( remainingTimeLoggerFrequency > 0 ) + { + auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency ); + timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); + } + + for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) + { + timeLoop.setCurrentTimeStepToZero(); + WcTimer simTimer; + cudaDeviceSynchronize(); + WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); + simTimer.start(); + timeLoop.run(); + cudaDeviceSynchronize(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); + auto time = simTimer.last(); + auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); + + auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); + WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); + WALBERLA_ROOT_SECTION() + { + python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); + if ( pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); + pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 ); + // Call Python function to report results + pythonCallbackResults(); + } + } + } + } + + return 0; +} diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py new file mode 100644 index 000000000..45ea8043d --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py @@ -0,0 +1,100 @@ +import sympy as sp +import numpy as np +import pystencils as ps +from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule +from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor +from pystencils_walberla import generate_pack_info_from_kernel +from pystencils_walberla import CodeGeneration, generate_sweep +from pystencils.data_types import TypedSymbol +from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions +from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter + +omega = sp.symbols("omega") +# sweep_block_size = (128, 1, 1) +sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), + 
TypedSymbol("cudaBlockSize1", np.int32), + 1) + +sweep_params = {'block_size': sweep_block_size} + +options_dict = { + 'srt': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + 'compressible': False, + }, + 'trt': { + 'method': 'trt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + }, + 'mrt': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1], + }, + 'entropic': { + 'method': 'mrt3', + 'stencil': 'D3Q19', + 'compressible': True, + 'relaxation_rates': [omega, omega, sp.Symbol("omega_free")], + 'entropic': True, + }, + 'smagorinsky': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'smagorinsky': True, + 'relaxation_rate': omega, + } +} + +with CodeGeneration() as ctx: + accessors = { + 'Even': AAEvenTimeStepAccessor(), + 'Odd': AAOddTimeStepAccessor() + } + + common_options = { + 'field_name': 'pdfs', + 'optimization': {'cse_global': True, + 'cse_pdfs': False} + } + options = options_dict.get(ctx.config, options_dict['srt']) + options.update(common_options) + + stencil_str = options['stencil'] + q = int(stencil_str[stencil_str.find('Q') + 1:]) + pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') + options['optimization']['symbolic_field'] = pdfs + + vp = [ + ('int32_t', 'cudaBlockSize0'), + ('int32_t', 'cudaBlockSize1') + ] + lb_method = create_lb_method(**options) + + # Kernels + options_without_opt = options.copy() + del options_without_opt['optimization'] + update_rules = {} + for name, accessor in accessors.items(): + update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options) + update_rule = insert_fast_divisions(update_rule) + update_rule = insert_fast_sqrts(update_rule) + update_rules[name] = update_rule + generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule, + inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params, + varying_parameters=vp) + + # getter & setter + setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs.center_vector, density=1) + getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs.center_vector, density=None) + generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments) + generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments) + + # communication + generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu') + generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu') -- GitLab