diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index b53cb24ca8ad9be070ab93d40ae86346a9c20f32..c3052b52393513f988d00b8730aad3ba6d6d1e81 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -10,6 +10,9 @@ waLBerla_python_file_generates(UniformGridGPU.py UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h ) -waLBerla_add_executable ( NAME UniformGridBenchmarkGPU - FILES UniformGridGPU.cpp UniformGridGPU.py - DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk ) +foreach(config srt trt mrt smagorinsky entropic ) + waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config} + FILES UniformGridGPU.cpp UniformGridGPU.py + DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk + CODEGEN_CFG ${config}) +endforeach() diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 32d06a834473ff86e05f75a4d0c4cccec3a07e67..181f981bc05c74a122f770fb88e3164f560d100a 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -1,5 +1,6 @@ #include "core/Environment.h" #include "core/logging/Initialization.h" +#include "core/math/Random.h" #include "python_coupling/CreateConfig.h" #include "python_coupling/PythonCallback.h" #include "python_coupling/DictWrapper.h" @@ -48,6 +49,27 @@ using flag_t = walberla::uint8_t; using FlagField_T = FlagField<flag_t>; +void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldID, + const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 ) +{ + math::seedRandomGenerator(0); + auto halfZ = blocks->getDomainCellBB().zMax() / 2; + for( auto & block: *blocks) + { + PdfField_T * pdfField = block.getData<PdfField_T>( pdfFieldID ); + WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField, + Cell globalCell; + blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); + real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + if( globalCell[2] >= halfZ ) { + pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0)); + } else { + pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0,randomReal), real_t(1.0)); + } + ); + } +} + int main( int argc, char **argv ) { @@ -63,19 +85,25 @@ int main( int argc, char **argv ) auto blocks = blockforest::createUniformBlockGridFromConfig( config ); Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" ); - // Reading parameters auto parameters = config->getOneBlock( "Parameters" ); const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); + const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false); // Creating fields auto latticeModel = LatticeModel_T( omega ); BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx ); + if( initShearFlow ) { + WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow"); + initShearVelocity( blocks, pdfFieldCpuID ); + } + BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); + // Boundaries const FlagUID fluidFlagUID( "Fluid" ); auto boundariesConfig = config->getBlock( "Boundaries" ); @@ -95,7 +123,7 @@ int main( int argc, char **argv ) // Communication setup bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); - + Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1)); const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline"); CommunicationSchemeType communicationScheme; if( communicationSchemeStr == "GPUPackInfo_Baseline") @@ -106,6 +134,8 @@ int main( int argc, char **argv ) communicationScheme = UniformGPUScheme_Baseline; else if (communicationSchemeStr == "UniformGPUScheme_Memcpy") communicationScheme = UniformGPUScheme_Memcpy; + else if (communicationSchemeStr == "MPIDatatypes") + communicationScheme = MPIDatatypes; else { WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme") } @@ -116,8 +146,9 @@ int main( int argc, char **argv ) int streamHighPriority = 0; int streamLowPriority = 0; WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); - - pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); + WALBERLA_CHECK(gpuBlockSize[2] == 1); + pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], + Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); lbKernel.setOuterPriority( streamHighPriority ); UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI ); diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm index 81a0996f872ab2f409a0bef1a9619de8a2095e6b..9c0796a49e48af5e8ae999f80a69c8b8de19a6d3 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -1,7 +1,7 @@ DomainSetup { - blocks < 1, 1, 1 >; - cellsPerBlock < 256, 256, 128 >; + blocks < 2, 1, 1 >; + cellsPerBlock < 64, 64, 64 >; periodic < 1, 1, 1 >; } @@ -15,13 +15,16 @@ Parameters // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy communicationScheme UniformGPUScheme_Baseline; - vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled + vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly - innerOuterSplit < 32, 1, 1>; // slice-thickness that 'outer'-kernels process when overlapping + innerOuterSplit < 8, 1, 1>; // slice-thickness that 'outer'-kernels process when overlapping - remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time + remainingTimeLoggerFrequency 5; // interval in seconds to log the estimated remaining time + + omega 1.92; + initShearFlow 1; } /* diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index e5b05555a6c5cf47a6a01bba93f8da312a22818d..6e0539753bf8e0efbbae8d2475cba6a9f6e4f92d 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -1,37 +1,86 @@ import sympy as sp +import numpy as np from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule from lbmpy.boundaries import NoSlip, UBB from pystencils_walberla import generate_pack_info_from_kernel from lbmpy_walberla import generate_lattice_model, generate_boundary from pystencils_walberla import CodeGeneration, generate_sweep +from pystencils.data_types import TypedSymbol +from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions +omega = sp.symbols("omega") +# sweep_block_size = (128, 1, 1) +sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32), + TypedSymbol("cudaBlockSize1", np.int32), + 1) -sweep_block_size = (128, 1, 1) sweep_params = {'block_size': sweep_block_size} - -with CodeGeneration() as ctx: - # LB options - options = { +options = { + 'srt': { 'method': 'srt', 'stencil': 'D3Q19', - 'relaxation_rate': sp.Symbol("omega"), - 'field_name': 'pdfs', + 'relaxation_rate': omega, 'compressible': False, + }, + 'trt': { + 'method': 'trt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + }, + 'mrt': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1], + }, + 'entropic': { + 'method': 'mrt3', + 'stencil': 'D3Q19', + 'compressible': True, + 'relaxation_rates': [omega, omega, sp.Symbol("omega_free")], + 'entropic': True, + }, + 'smagorinsky': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'smagorinsky': True, + 'relaxation_rate': omega, + } +} + +with CodeGeneration() as ctx: + + common_options = { + 'field_name': 'pdfs', 'temporary_field_name': 'pdfs_tmp', + #'kernel_type': 'collide_stream_push', 'optimization': {'cse_global': True, - 'cse_pdfs': False, - } + 'cse_pdfs': False} } + options = options[ctx.config] + options.update(common_options) + + vp = [ + ('int32_t', 'cudaBlockSize0'), + ('int32_t', 'cudaBlockSize1') + ] lb_method = create_lb_method(**options) update_rule = create_lb_update_rule(lb_method=lb_method, **options) + update_rule = insert_fast_divisions(update_rule) + update_rule = insert_fast_sqrts(update_rule) + # CPU lattice model - required for macroscopic value computation, VTK output etc. - generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method) + options_without_opt = options.copy() + del options_without_opt['optimization'] + generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method, update_rule_params=options_without_opt) # gpu LB sweep & boundaries - generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')], - inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params) + generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, + field_swaps=[('pdfs', 'pdfs_tmp')], + inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params, + varying_parameters=vp) + generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu') generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu') diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h index 3e45caa94d6f34bafdc302e97422522b072a615e..6a3bf6b532fe67b753cf2d64737a25d7747e067d 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h @@ -4,6 +4,8 @@ #include "blockforest/Block.h" #include "blockforest/StructuredBlockForest.h" #include "blockforest/communication/UniformBufferedScheme.h" +#include "blockforest/communication/UniformDirectScheme.h" +#include "field/communication/StencilRestrictedMPIDatatypeInfo.h" #include "cuda/communication/GPUPackInfo.h" #include "cuda/communication/UniformGPUScheme.h" #include "cuda/communication/MemcpyPackInfo.h" @@ -17,7 +19,8 @@ enum CommunicationSchemeType { GPUPackInfo_Baseline = 0, GPUPackInfo_Streams = 1, UniformGPUScheme_Baseline = 2, - UniformGPUScheme_Memcpy = 3 + UniformGPUScheme_Memcpy = 3, + MPIDatatypes = 4 }; @@ -28,8 +31,12 @@ public: explicit UniformGridGPU_Communication(weak_ptr_wrapper<StructuredBlockForest> bf, const BlockDataID & bdId, CommunicationSchemeType commSchemeType, bool cudaEnabledMPI = false) : _commSchemeType(commSchemeType), _cpuCommunicationScheme(nullptr), _gpuPackInfo(nullptr), - _gpuCommunicationScheme(nullptr), _generatedPackInfo(nullptr) + _gpuCommunicationScheme(nullptr), _directScheme(nullptr) { + auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId ); + auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId ); + auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId ); + switch(_commSchemeType) { case GPUPackInfo_Baseline: @@ -44,13 +51,17 @@ public: break; case UniformGPUScheme_Baseline: _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); - _generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId ); - _gpuCommunicationScheme->addPackInfo( _generatedPackInfo ); + _gpuCommunicationScheme->addPackInfo( generatedPackInfo ); break; case UniformGPUScheme_Memcpy: _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI ); - _memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId ); - _gpuCommunicationScheme->addPackInfo( _memcpyPackInfo ); + _gpuCommunicationScheme->addPackInfo( memcpyPackInfo ); + break; + case MPIDatatypes: + if( ! cudaEnabledMPI ) { + WALBERLA_ABORT("MPI datatype-based communication not possible if no cudaEnabledMPI is available."); + } + _directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfo ); break; default: WALBERLA_ABORT("Invalid GPU communication scheme specified!"); @@ -91,6 +102,10 @@ public: WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme ); _gpuCommunicationScheme->startCommunication( communicationStream ); break; + case MPIDatatypes: + WALBERLA_ASSERT_NOT_NULLPTR( _directScheme ); + _directScheme->startCommunication(); + break; } } @@ -115,6 +130,10 @@ public: WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme ); _gpuCommunicationScheme->wait( communicationStream ); break; + case MPIDatatypes: + WALBERLA_ASSERT_NOT_NULLPTR( _directScheme ); + _directScheme->wait(); + break; } } @@ -123,6 +142,5 @@ private: shared_ptr< blockforest::communication::UniformBufferedScheme< StencilType > > _cpuCommunicationScheme; shared_ptr< cuda::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo; shared_ptr< cuda::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme; - shared_ptr< pystencils::UniformGridGPU_PackInfo > _generatedPackInfo; - shared_ptr< cuda::communication::MemcpyPackInfo< GPUFieldType > > _memcpyPackInfo; + shared_ptr< blockforest::communication::UniformDirectScheme<StencilType> > _directScheme; };