diff --git a/.gitignore b/.gitignore index c755c9f0b48f05521eb5de8638ae4b3229f16044..4c63e3b65e5c1acb0f32cbc2d7f9975427a68c57 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ qrc_* # macOS **/.DS_Store +# CLion indexing +*.uuid + # Generated files *.out diff --git a/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp b/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp index 817097ccab6f4a08d1eb0bd80b169d1dca326111..4a674d48b66838c4de39e4316c551b7ef6d2de89 100644 --- a/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp +++ b/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp @@ -243,23 +243,23 @@ static void refinementSelection( SetupBlockForest& forest ) AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMax() - zSize, domain.xMax(), domain.yMax(), domain.zMax() ); - for( auto block = forest.begin(); block != forest.end(); ++block ) + for(auto & block : forest) { - auto & aabb = block->getAABB(); + auto & aabb = block.getAABB(); if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) { - if( block->getLevel() < ( BlockForestLevels - uint_t(1) ) ) - block->setMarker( true ); + if( block.getLevel() < ( BlockForestLevels - uint_t(1) ) ) + block.setMarker( true ); } } } static void workloadAndMemoryAssignment( SetupBlockForest & forest, const memory_t memoryPerBlock ) { - for( auto block = forest.begin(); block != forest.end(); ++block ) + for(auto & block : forest) { - block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) ); - block->setMemory( memoryPerBlock ); + block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) ); + block.setMemory( memoryPerBlock ); } } @@ -295,7 +295,7 @@ void createSetupBlockForest( blockforest::SetupBlockForest & sforest, const Conf #ifndef NDEBUG WALBERLA_ASSERT_EQUAL( sforest.getNumberOfBlocks(), numberOfBlocks( workerProcesses, fineBlocksPerProcess ) ); - for( uint_t i = uint_t(0); i != BlockForestLevels; ++i ) + for( auto i = uint_t(0); i != BlockForestLevels; ++i ) { std::vector< blockforest::SetupBlock * > blocks; sforest.getBlocks( blocks, i ); @@ -415,13 +415,13 @@ void ReGrid::operator()( std::vector< std::pair< const Block *, uint_t > > & min domain.xMax(), domain.yMax(), domain.zMin() + real_c(0.99) * zSize ); } - for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it ) + for(auto & minTargetLevel : minTargetLevels) { - auto & aabb = it->first->getAABB(); + auto & aabb = minTargetLevel.first->getAABB(); if( left.intersects( aabb ) || right.intersects( aabb ) ) - it->second = BlockForestLevels - uint_t(1); + minTargetLevel.second = BlockForestLevels - uint_t(1); else - it->second = uint_t(0); + minTargetLevel.second = uint_t(0); } } diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp index 3a44867523a9b94f8f2f9b57bc8b5aeb6aac6819..4d3ce332f4f6b23e7fbfb5e885b61c3268232155 100644 --- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp @@ -66,22 +66,32 @@ using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction class LDCRefinement { - private: +private: const uint_t refinementDepth_; - public: - LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; +public: + explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; - void operator()(SetupBlockForest& forest) + void operator()(SetupBlockForest& forest) const { - std::vector< SetupBlock* > blocks; - forest.getBlocks(blocks); + const AABB & domain = forest.getDomain(); - for (auto block : blocks) + real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + + AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), + domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); + + AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), + domain.xMax(), domain.yMin() + ySize, domain.zMax() ); + + for(auto & block : forest) { - if (forest.atDomainYMaxBorder(*block)) + auto & aabb = block.getAABB(); + if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) { - if (block->getLevel() < refinementDepth_) { block->setMarker(true); } + if( block.getLevel() < refinementDepth_) + block.setMarker( true ); } } } @@ -89,17 +99,26 @@ class LDCRefinement class LDC { - public: - LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; +private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; + +public: + explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; - Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } - RefinementSelectionFunctor refinementSelector() { return LDCRefinement(refinementDepth_); } + RefinementSelectionFunctor refinementSelector() const + { + return LDCRefinement(refinementDepth_); + } void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) { for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) { - Block& b = dynamic_cast< Block& >(*bIt); + auto& b = dynamic_cast< Block& >(*bIt); const uint_t level = b.getLevel(); auto flagField = b.getData< FlagField_T >(flagFieldID); const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); @@ -118,26 +137,20 @@ class LDC } } } - private: - const std::string refinementProfile_; - const uint_t refinementDepth_; - - const FlagUID noSlipFlagUID_; - const FlagUID ubbFlagUID_; }; static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) { - Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); - Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); - Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); + Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize"); + Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks"); + Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic"); auto refSelection = ldcSetup.refinementSelector(); - setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); + setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection)); const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); - setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); + setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment); setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); - setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), numProcesses); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); } int main(int argc, char** argv) @@ -147,7 +160,7 @@ int main(int argc, char** argv) for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { - WALBERLA_MPI_WORLD_BARRIER() + WALBERLA_MPI_BARRIER() ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// SETUP AND CONFIGURATION /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -197,8 +210,6 @@ int main(int argc, char** argv) std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); blocks->createCellBoundingBoxes(); - WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } - WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks()) for (uint_t level = 0; level <= refinementDepth; level++) { @@ -223,6 +234,8 @@ int main(int argc, char** argv) { sweepCollection.initialise(&block, 2); } + WALBERLA_MPI_BARRIER() + WALBERLA_LOG_INFO_ON_ROOT("Initialisation done") ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -294,9 +307,13 @@ int main(int argc, char** argv) WcTimingPool timeloopTiming; WcTimer simTimer; + WALBERLA_MPI_BARRIER() WALBERLA_LOG_INFO_ON_ROOT("Starting benchmark with " << timesteps << " time steps") + WALBERLA_MPI_BARRIER() + simTimer.start(); timeLoop.run(timeloopTiming); + WALBERLA_MPI_BARRIER() simTimer.end(); WALBERLA_LOG_INFO_ON_ROOT("Benchmark finished") diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py index 1de18d9f0f6ed8ef684eddee74f4712c8f72c852..ccfcecacfb5c0f5a79b56aea13409cc3da4d2748 100644 --- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py @@ -2,12 +2,13 @@ import waLBerla as wlb class Scenario: - def __init__(self, domain_size=(32, 32, 32), root_blocks=(2, 2, 2), - cells_per_block=(16, 16, 16)): + def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2), + cells_per_block=(32, 32, 32), refinement_depth=0): self.domain_size = domain_size self.root_blocks = root_blocks self.cells_per_block = cells_per_block + self.refinement_depth = refinement_depth self.periodic = (0, 0, 0) @@ -25,21 +26,25 @@ class Scenario: }, 'Parameters': { 'omega': 1.95, - 'timesteps': 101, + 'timesteps': 10001, - 'refinementDepth': 1, + 'refinementDepth': self.refinement_depth, 'writeSetupForestAndReturn': False, 'numProcesses': 1, + 'cudaEnabledMPI': False, 'benchmarkKernelOnly': False, 'remainingTimeLoggerFrequency': 3, - 'vtkWriteFrequency': 50, + 'vtkWriteFrequency': 5000, + }, + 'Logging': { + 'logLevel': "info", } } - if print_dict: + if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False: wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) return config_dict @@ -47,10 +52,17 @@ class Scenario: def validation_run(): """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" wlb.log_info_on_root("Validation run") - wlb.log_info_on_root("") + + domain_size = (96, 96, 96) + cells_per_block = (32, 32, 32) + + root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)]) scenarios = wlb.ScenarioManager() - scenario = Scenario() + scenario = Scenario(domain_size=domain_size, + root_blocks=root_blocks, + cells_per_block=cells_per_block, + refinement_depth=1) scenarios.add(scenario) diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp index fa3905b4236295275d82e2e4aad91be4ddcbb5ba..8110dbbb4437b8d3c56d9b855de501fd55521454 100644 --- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp @@ -50,11 +50,8 @@ #include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h" #include "python_coupling/CreateConfig.h" -#include "python_coupling/DictWrapper.h" #include "python_coupling/PythonCallback.h" -#include "timeloop/SweepTimeloop.h" - #include <cmath> #include "NonUniformGridGPUInfoHeader.h" @@ -80,18 +77,28 @@ class LDCRefinement const uint_t refinementDepth_; public: - LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; - void operator()(SetupBlockForest& forest) + void operator()(SetupBlockForest& forest) const { - std::vector< SetupBlock* > blocks; - forest.getBlocks(blocks); + const AABB & domain = forest.getDomain(); + + real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + + AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), + domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); - for (auto block : blocks) + AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), + domain.xMax(), domain.yMin() + ySize, domain.zMax() ); + + for(auto & block : forest) { - if (forest.atDomainYMaxBorder(*block)) + auto & aabb = block.getAABB(); + if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) { - if (block->getLevel() < refinementDepth_) { block->setMarker(true); } + if( block.getLevel() < refinementDepth_) + block.setMarker( true ); } } } @@ -107,10 +114,9 @@ class LDC const FlagUID ubbFlagUID_; public: - LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; - Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); } - RefinementSelectionFunctor refinementSelector() + RefinementSelectionFunctor refinementSelector() const { return LDCRefinement(refinementDepth_); } @@ -119,7 +125,7 @@ class LDC { for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) { - Block& b = dynamic_cast< Block& >(*bIt); + auto& b = dynamic_cast< Block& >(*bIt); const uint_t level = b.getLevel(); auto flagField = b.getData< FlagField_T >(flagFieldID); const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); @@ -140,18 +146,19 @@ class LDC } }; -static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) -{ - Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize"); - Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks"); - Vector3< bool > periodic = domainSetup.getParameter< Vector3< bool > >("periodic"); - - auto refSelection = ldcSetup.refinementSelector(); - setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection)); - const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); - setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment ); - setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); - setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); +namespace { +void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) { + Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize"); + Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks"); + Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic"); + + auto refSelection = ldcSetup.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection)); + const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); +} } int main(int argc, char** argv) @@ -203,7 +210,7 @@ int main(int argc, char** argv) totalCellUpdates += timesteps * math::uintPow2(level) * numberOfCells; WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) } - cudaDeviceProp prop; + cudaDeviceProp prop{}; WALBERLA_GPU_CHECK(gpuGetDeviceProperties(&prop, 0)) const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]; @@ -232,8 +239,6 @@ int main(int argc, char** argv) auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); blocks->createCellBoundingBoxes(); - WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); } - WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks()) for (uint_t level = 0; level <= refinementDepth; level++) { @@ -263,6 +268,10 @@ int main(int argc, char** argv) { sweepCollection.initialise(&iBlock, 2, nullptr); } + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + WALBERLA_MPI_BARRIER() + WALBERLA_LOG_INFO_ON_ROOT("Initialisation done") ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// LB SWEEPS AND BOUNDARY HANDLING /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -280,6 +289,7 @@ int main(int argc, char** argv) auto communication = std::make_shared< NonUniformGPUScheme <CommunicationStencil_T>> (blocks, cudaEnabledMPI); auto packInfo = lbm_generated::setupNonuniformGPUPdfCommunication<GPUPdfField_T>(blocks, pdfFieldGpuID); communication->addPackInfo(packInfo); + WALBERLA_MPI_BARRIER() ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME STEP DEFINITIONS /// @@ -343,10 +353,14 @@ int main(int argc, char** argv) WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) WALBERLA_GPU_CHECK(gpuPeekAtLastError()) + WALBERLA_MPI_BARRIER() WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + WALBERLA_MPI_BARRIER() + simTimer.start(); timeLoop.run(timeloopTiming); WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + WALBERLA_MPI_BARRIER() simTimer.end(); WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py index d05852fd1934c71ea67d6cce3a8ae3f4cc80e61a..ccfcecacfb5c0f5a79b56aea13409cc3da4d2748 100644 --- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py @@ -26,7 +26,7 @@ class Scenario: }, 'Parameters': { 'omega': 1.95, - 'timesteps': 1501, + 'timesteps': 10001, 'refinementDepth': self.refinement_depth, 'writeSetupForestAndReturn': False, @@ -37,7 +37,10 @@ class Scenario: 'remainingTimeLoggerFrequency': 3, - 'vtkWriteFrequency': 500, + 'vtkWriteFrequency': 5000, + }, + 'Logging': { + 'logLevel': "info", } } @@ -50,7 +53,7 @@ def validation_run(): """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" wlb.log_info_on_root("Validation run") - domain_size = (64, 64, 64) + domain_size = (96, 96, 96) cells_per_block = (32, 32, 32) root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)]) diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index ac475f72c9489d9e7b74ce25d9bf303413ae7834..59fd6028fafcd2bab784c1a595f8511464108def 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -428,6 +428,7 @@ class KernelInfo: def generate_kernel_invocation_code(self, **kwargs): ast = self.ast ast_params = self.parameters + fnc_name = ast.function_name is_cpu = self.ast.target == Target.CPU call_parameters = ", ".join([p.symbol.name for p in ast_params]) @@ -446,20 +447,18 @@ class KernelInfo: indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = CudaSympyPrinter() - block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) - kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" - kernel_call_lines = [ - f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));", - f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));", - kernel_launch] + f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));", + f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));", + f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" + ] return "\n".join(kernel_call_lines) else: - return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" + return f"internal_{fnc_name}::{fnc_name}({call_parameters});" def get_vectorize_instruction_set(generation_context): diff --git a/python/pystencils_walberla/kernel_info.py b/python/pystencils_walberla/kernel_info.py index 1382d94f4220495da28bf02113636fdf8addbaf1..019843f903407717a222c5a44cc945f59a88fcfa 100644 --- a/python/pystencils_walberla/kernel_info.py +++ b/python/pystencils_walberla/kernel_info.py @@ -35,6 +35,7 @@ class KernelInfo: def generate_kernel_invocation_code(self, **kwargs): ast = self.ast ast_params = self.parameters + fnc_name = ast.function_name is_cpu = self.ast.target == Target.CPU call_parameters = ", ".join([p.symbol.name for p in ast_params]) @@ -53,15 +54,15 @@ class KernelInfo: indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = CudaSympyPrinter() + block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) + grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) + kernel_call_lines = [ - "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['block']), - "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e) - for e in indexing_dict['grid']), - "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name, - stream, call_parameters), + f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));", + f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));", + f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" ] return "\n".join(kernel_call_lines) else: - return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" + return f"internal_{fnc_name}::{fnc_name}({call_parameters});" diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py index c946f85105185159e317ea18d4740667cd7761c7..544b86de027af9f810df34f9bf10a21db7444fd6 100644 --- a/python/pystencils_walberla/kernel_selection.py +++ b/python/pystencils_walberla/kernel_selection.py @@ -172,6 +172,7 @@ class KernelCallNode(AbstractKernelSelectionNode): def get_code(self, **kwargs): ast = self.ast ast_params = self.parameters + fnc_name = ast.function_name is_cpu = self.ast.target == Target.CPU call_parameters = ", ".join([p.symbol.name for p in ast_params]) @@ -190,20 +191,18 @@ class KernelCallNode(AbstractKernelSelectionNode): indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) sp_printer_c = CudaSympyPrinter() - block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) - kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" - kernel_call_lines = [ - f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));", - f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));", - kernel_launch] + f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));", + f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));", + f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" + ] return "\n".join(kernel_call_lines) else: - return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});" + return f"internal_{fnc_name}::{fnc_name}({call_parameters});" class SimpleBooleanCondition(AbstractConditionNode): diff --git a/src/blockforest/BlockForest.h b/src/blockforest/BlockForest.h index 6108ea3e2ce523ca6f8e11549e354f97324524b9..58df1ba4cfa8d99f7a5c1b437c67dfec61109d4e 100644 --- a/src/blockforest/BlockForest.h +++ b/src/blockforest/BlockForest.h @@ -239,7 +239,7 @@ public: bool storesUniformBlockGrid() const { return depth_ == 0; } uint_t getTreeIdDigits() const { return treeIdDigits_; } - uint_t getBlockIdBytes() const { uint_t bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); } + uint_t getBlockIdBytes() const { uint_t const bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); } uint_t getDepth() const { return depth_; } uint_t getNumberOfLevels() const { return depth_ + uint_t(1); } @@ -635,7 +635,7 @@ inline void BlockForest::getBlocksOverlappedByAABB( std::vector< IBlock* >& bloc inline const Block* BlockForest::getBlock( const IBlockID& id ) const { - WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ) auto it = blocks_.find( *static_cast< const BlockID* >( &id ) ); @@ -649,7 +649,7 @@ inline const Block* BlockForest::getBlock( const IBlockID& id ) const { inline Block* BlockForest::getBlock( const IBlockID& id ) { - WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ) auto it = blocks_.find( *static_cast< const BlockID* >( &id ) ); @@ -708,7 +708,7 @@ inline void BlockForest::getAllBlocks( std::vector< shared_ptr< IBlockID > >& bl for( auto block = begin(); block != end(); ++block ) { const IBlockID& id = block->getId(); - WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ) blocks.push_back( walberla::make_shared< BlockID >( *static_cast< const BlockID* >( &id ) ) ); } } @@ -744,7 +744,7 @@ inline bool BlockForest::blockExistsRemotely( const real_t x, const real_t y, co inline bool BlockForest::blockExists( const IBlockID& id ) const { - WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ) if( blockInformation_->active() ) return blockInformation_->exists( *static_cast< const BlockID* >( &id ) ); @@ -763,7 +763,7 @@ inline bool BlockForest::blockExistsLocally( const IBlockID& id ) const { inline bool BlockForest::blockExistsRemotely( const IBlockID& id ) const { - WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id ) if( blockInformation_->active() ) return blockInformation_->existsRemotely( *static_cast< const BlockID* >( &id ) ); @@ -801,7 +801,7 @@ inline bool BlockForest::rootBlockExistsRemotely( const uint_t x, const uint_t y inline uint_t BlockForest::getLevel( const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block ) return static_cast< const Block* >( &block )->getLevel(); } @@ -810,8 +810,8 @@ inline uint_t BlockForest::getLevel( const IBlock& block ) const { inline uint_t BlockForest::getLevelFromBlockId( const BlockID& id ) const { - WALBERLA_ASSERT_GREATER_EQUAL( id.getUsedBits(), treeIdDigits_ ); - WALBERLA_ASSERT_EQUAL( ( id.getUsedBits() - treeIdDigits_ ) % 3, 0 ); + WALBERLA_ASSERT_GREATER_EQUAL( id.getUsedBits(), treeIdDigits_ ) + WALBERLA_ASSERT_EQUAL( ( id.getUsedBits() - treeIdDigits_ ) % 3, 0 ) return ( id.getUsedBits() - treeIdDigits_ ) / 3; } @@ -820,7 +820,7 @@ inline uint_t BlockForest::getLevelFromBlockId( const BlockID& id ) const { inline uint_t BlockForest::getAABBFromBlockId( AABB& aabb, const BlockID& id ) const { - BlockReconstruction::AABBReconstruction aabbReconstruction( domain_, size_[0], size_[1], size_[2], treeIdDigits_ ); + BlockReconstruction::AABBReconstruction const aabbReconstruction( domain_, size_[0], size_[1], size_[2], treeIdDigits_ ); return aabbReconstruction( aabb, id ); } @@ -838,9 +838,9 @@ inline AABB BlockForest::getAABBFromBlockId( const BlockID& id ) const { inline void BlockForest::getRootBlockID( BlockID& id, const uint_t x, const uint_t y, const uint_t z ) const { - WALBERLA_ASSERT_LESS( x, size_[0] ); - WALBERLA_ASSERT_LESS( y, size_[1] ); - WALBERLA_ASSERT_LESS( z, size_[2] ); + WALBERLA_ASSERT_LESS( x, size_[0] ) + WALBERLA_ASSERT_LESS( y, size_[1] ) + WALBERLA_ASSERT_LESS( z, size_[2] ) id.clear(); id.init( z * size_[1] * size_[0] + y * size_[0] + x, uint_c(1) << ( treeIdDigits_ - 1 ) ); @@ -1017,7 +1017,7 @@ public: for( auto function = functions_.begin(); function != functions_.end(); ++function, ++iter ) { (*function)( *iter, blocksAlreadyMarkedForRefinement, forest ); - WALBERLA_ASSERT_EQUAL(iter->size(), numberOfBlocks, "Number of blocks has changed during min target level determination!"); + WALBERLA_ASSERT_EQUAL(iter->size(), numberOfBlocks, "Number of blocks has changed during min target level determination!") } // combine the outcome of the different functions into a single target level @@ -1026,7 +1026,7 @@ public: { for( uint_t fct = 0; fct < functions_.size(); ++fct) { - WALBERLA_ASSERT_EQUAL(minTargetLevelsPerFunction[fct][block].first->getId(), minTargetLevels[block].first->getId()); + WALBERLA_ASSERT_EQUAL(minTargetLevelsPerFunction[fct][block].first->getId(), minTargetLevels[block].first->getId()) targetLevels[fct] = minTargetLevelsPerFunction[fct][block].second; } minTargetLevels[block].second = targetLevelReductionFct_(targetLevels); diff --git a/src/blockforest/SetupBlockForest.h b/src/blockforest/SetupBlockForest.h index cc6268bde5190492d072717ae2d37d84358b8a41..1d105148f1d1c6beff3114d70b26099528ede467 100644 --- a/src/blockforest/SetupBlockForest.h +++ b/src/blockforest/SetupBlockForest.h @@ -175,7 +175,7 @@ public: uint_t getMinLevel() const; uint_t getMaxLevel() const; uint_t getTreeIdDigits() const { return treeIdDigits_; } - uint_t getBlockIdBytes() const { uint_t bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); } + uint_t getBlockIdBytes() const { uint_t const bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); } uint_t getNumberOfTrees() const { return forest_.size(); } uint_t getNumberOfRootBlocks() const { return numberOfRootBlocks_; } @@ -312,7 +312,7 @@ public: uint_t getNumberOfProcesses () const { return numberOfProcesses_; } uint_t getNumberOfWorkerProcesses() const { return numberOfProcesses_ - numberOfBufferProcesses_; } uint_t getNumberOfBufferProcesses() const { return numberOfBufferProcesses_; } - uint_t getProcessIdBytes () const { uint_t bits = uintMSBPosition( numberOfProcesses_ - 1 ); + uint_t getProcessIdBytes () const { uint_t const bits = uintMSBPosition( numberOfProcesses_ - 1 ); return ( bits >> 3 ) + ( ( bits & 7 ) ? uint_c(1) : uint_c(0) ); } inline bool insertBuffersIntoProcessNetwork() const { return insertBuffersIntoProcessNetwork_; } @@ -505,19 +505,20 @@ inline uint_t SetupBlockForest::mapForestCoordinatesToTreeIndex( const uint_t x, inline void SetupBlockForest::mapTreeIndexToForestCoordinates( const uint_t treeIndex, const uint_t xSize, const uint_t ySize, uint_t& x, uint_t& y, uint_t& z ) -{ - z = treeIndex / ( xSize * ySize ); - uint_t index = treeIndex % ( xSize * ySize ); - y = index / xSize; - x = index % xSize; + { + uint_t const index = treeIndex % ( xSize * ySize ); + + z = treeIndex / ( xSize * ySize ); + y = index / xSize; + x = index % xSize; } -inline void SetupBlockForest::mapTreeIndexToForestCoordinates( const uint_t treeIndex, uint_t& x, uint_t& y, uint_t& z ) const { +inline void SetupBlockForest::mapTreeIndexToForestCoordinates( const uint_t treeIndex, uint_t& x, uint_t& y, uint_t& z ) const +{ WALBERLA_ASSERT_LESS( treeIndex, size_[0] * size_[1] * size_[2] ) - mapTreeIndexToForestCoordinates( treeIndex, size_[0], size_[1], x, y, z ); } @@ -530,7 +531,8 @@ inline void SetupBlockForest::getRootBlockAABB( AABB& aabb, const uint_t x, cons -inline void SetupBlockForest::getRootBlockAABB( AABB& aabb, const uint_t treeIndex ) const { +inline void SetupBlockForest::getRootBlockAABB( AABB& aabb, const uint_t treeIndex ) const +{ uint_t x; uint_t y; @@ -615,10 +617,10 @@ inline void SetupBlockForest::initWorkloadMemorySUID( const Set<SUID>& selector if( workloadMemorySUIDAssignmentFunctions.empty() ) { WALBERLA_LOG_PROGRESS( "Initializing SetupBlockForest: No callback functions for assigning workload, memory requirements, and SUIDs to blocks found.\n" - " Default values (zero/nothing) will be assigned ..." ); + " Default values (zero/nothing) will be assigned ..." ) } else { - WALBERLA_LOG_PROGRESS( "Initializing SetupBlockForest: Assigning workload, memory requirements, and SUIDs to blocks ..." ); + WALBERLA_LOG_PROGRESS( "Initializing SetupBlockForest: Assigning workload, memory requirements, and SUIDs to blocks ..." ) } for( uint_t i = 0; i != workloadMemorySUIDAssignmentFunctions.size(); ++i ) diff --git a/src/blockforest/StructuredBlockForest.h b/src/blockforest/StructuredBlockForest.h index 04b61a94fbc7b4b84ab60714db58a78932471a7a..4e1f0cb5f9619f29a63c9f195b4f44bac909985c 100644 --- a/src/blockforest/StructuredBlockForest.h +++ b/src/blockforest/StructuredBlockForest.h @@ -67,9 +67,9 @@ public: uint_t getNumberOfYCells( const IBlock& /*block*/ ) const override { return blockCells_[1]; } uint_t getNumberOfZCells( const IBlock& /*block*/ ) const override { return blockCells_[2]; } #else - uint_t getNumberOfXCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); return blockCells_[0]; } - uint_t getNumberOfYCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); return blockCells_[1]; } - uint_t getNumberOfZCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); return blockCells_[2]; } + uint_t getNumberOfXCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) return blockCells_[0]; } + uint_t getNumberOfYCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) return blockCells_[1]; } + uint_t getNumberOfZCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) return blockCells_[2]; } #endif using StructuredBlockStorage::getNumberOfCells; @@ -231,7 +231,7 @@ inline bool StructuredBlockForest::blockExistsRemotely( const Cell& cell, const inline uint_t StructuredBlockForest::getLevel( const IBlock& block ) const { - WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block ); + WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block ) return static_cast< const Block* >( &block )->getLevel(); } @@ -244,8 +244,8 @@ inline uint_t StructuredBlockForest::getNumberOfCells( const IBlock& /*block*/, inline uint_t StructuredBlockForest::getNumberOfCells( const IBlock& block, const uint_t index ) const { #endif - WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); - WALBERLA_ASSERT_LESS( index, 3 ); + WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) + WALBERLA_ASSERT_LESS( index, 3 ) return blockCells_[ index ]; } diff --git a/src/blockforest/communication/NonUniformBufferedScheme.h b/src/blockforest/communication/NonUniformBufferedScheme.h index be27a51ec805285144983d2d3a3618c502596d50..ff1d3ba3b85f97d558378f1125b00816d734307b 100644 --- a/src/blockforest/communication/NonUniformBufferedScheme.h +++ b/src/blockforest/communication/NonUniformBufferedScheme.h @@ -43,9 +43,7 @@ #include <vector> -namespace walberla { -namespace blockforest { -namespace communication { +namespace walberla::blockforest::communication { template< typename Stencil > @@ -66,12 +64,12 @@ public: /*! \name Construction & Destruction */ //@{ explicit NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, - const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 + int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, - const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 + int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97 ~NonUniformBufferedScheme(); //@} @@ -87,15 +85,17 @@ public: //** Synchronous Communication ************************************************************************************** /*! \name Synchronous Communication */ //@{ - inline void operator() () { communicateEqualLevel(); communicateCoarseToFine(); communicateFineToCoarse(); } + inline void operator() () { communicate(); } + + inline void communicate() {communicateEqualLevel(); communicateCoarseToFine(); communicateFineToCoarse();} inline void communicateEqualLevel(); inline void communicateCoarseToFine(); inline void communicateFineToCoarse(); - inline void communicateEqualLevel ( const uint_t level ); - inline void communicateCoarseToFine( const uint_t fineLevel ); - inline void communicateFineToCoarse( const uint_t fineLevel ); + inline void communicateEqualLevel ( uint_t level ); + inline void communicateCoarseToFine( uint_t fineLevel ); + inline void communicateFineToCoarse( uint_t fineLevel ); std::function<void()> communicateEqualLevelFunctor(const uint_t level) { return [level, this](){ NonUniformBufferedScheme::communicateEqualLevel(level);}; @@ -110,7 +110,7 @@ public: //******************************************************************************************************************* - LocalCommunicationMode localMode() const { return localMode_; } + [[nodiscard]] LocalCommunicationMode localMode() const { return localMode_; } inline void setLocalMode( const LocalCommunicationMode & mode ); @@ -125,9 +125,9 @@ public: inline void startCommunicateCoarseToFine(); inline void startCommunicateFineToCoarse(); - inline void startCommunicateEqualLevel ( const uint_t level ); - inline void startCommunicateCoarseToFine( const uint_t fineLevel ); - inline void startCommunicateFineToCoarse( const uint_t fineLevel ); + inline void startCommunicateEqualLevel ( uint_t level ); + inline void startCommunicateCoarseToFine( uint_t fineLevel ); + inline void startCommunicateFineToCoarse( uint_t fineLevel ); void wait() { waitCommunicateEqualLevel(); waitCommunicateCoarseToFine(); waitCommunicateFineToCoarse(); } std::function<void() > getWaitFunctor() { return std::bind( &NonUniformBufferedScheme::wait, this ); } @@ -136,9 +136,9 @@ public: inline void waitCommunicateCoarseToFine(); inline void waitCommunicateFineToCoarse(); - inline void waitCommunicateEqualLevel ( const uint_t level ); - inline void waitCommunicateCoarseToFine( const uint_t fineLevel ); - inline void waitCommunicateFineToCoarse( const uint_t fineLevel ); + inline void waitCommunicateEqualLevel ( uint_t level ); + inline void waitCommunicateCoarseToFine( uint_t fineLevel ); + inline void waitCommunicateFineToCoarse( uint_t fineLevel ); //@} //******************************************************************************************************************* @@ -147,14 +147,14 @@ protected: void init(); void refresh(); - void startCommunicationEqualLevel ( const uint_t index, std::set< uint_t > & participatingLevels ); - void startCommunicationCoarseToFine( const uint_t index, const uint_t coarsestLevel, const uint_t finestLevel ); - void startCommunicationFineToCoarse( const uint_t index, const uint_t coarsestLevel, const uint_t finestLevel ); + void startCommunicationEqualLevel ( uint_t index, std::set< uint_t > & participatingLevels ); + void startCommunicationCoarseToFine( uint_t index, uint_t coarsestLevel, uint_t finestLevel ); + void startCommunicationFineToCoarse( uint_t index, uint_t coarsestLevel, uint_t finestLevel ); void resetBufferSystem( shared_ptr< mpi::OpenMPBufferSystem > & bufferSystem ); - void start( const INDEX i, const uint_t j ); - void wait( const INDEX i, const uint_t j ); + void start( INDEX i, uint_t j ); + void wait( INDEX i, uint_t j ); static void writeHeader( SendBuffer & buffer, const BlockID & sender, const BlockID & receiver, const stencil::Direction & dir ); static void readHeader( RecvBuffer & buffer, BlockID & sender, BlockID & receiver, stencil::Direction & dir ); @@ -162,17 +162,17 @@ protected: static void send( SendBuffer & buffer, std::vector< SendBufferFunction > & functions ); void receive( RecvBuffer & buffer ); - void localBufferPacking( const INDEX i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo, + void localBufferPacking( INDEX i, uint_t j, uint_t bufferIndex, const PackInfo & packInfo, const Block * sender, const Block * receiver, const stencil::Direction & dir ); - void localBufferUnpacking( const INDEX i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo, + void localBufferUnpacking( INDEX i, uint_t j, uint_t bufferIndex, const PackInfo & packInfo, Block * receiver, const Block * sender, const stencil::Direction & dir ); - bool isAnyCommunicationInProgress() const; + [[nodiscard]] bool isAnyCommunicationInProgress() const; weak_ptr<StructuredBlockForest> blockForest_; - uint_t forestModificationStamp_; + uint_t forestModificationStamp_{uint_c(0)}; std::vector< PackInfo > packInfos_; @@ -279,8 +279,8 @@ void NonUniformBufferedScheme<Stencil>::refresh() } #ifndef NDEBUG - for( auto p = packInfos_.begin(); p != packInfos_.end(); ++p ) - (*p)->clearBufferSizeCheckMap(); + for(auto & packInfo : packInfos_) + packInfo->clearBufferSizeCheckMap(); #endif forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); @@ -312,8 +312,8 @@ inline void NonUniformBufferedScheme<Stencil>::addPackInfo( const PackInfo & pac packInfos_.push_back( packInfo ); for( uint_t i = 0; i != 3; ++i ) - for( auto level = setupBeforeNextCommunication_[i].begin(); level != setupBeforeNextCommunication_[i].end(); ++level ) - *level = char(1); + for(char & level : setupBeforeNextCommunication_[i]) + level = char(1); } @@ -380,8 +380,8 @@ inline void NonUniformBufferedScheme<Stencil>::setLocalMode( const LocalCommunic localMode_ = mode; for( uint_t i = 0; i != 3; ++i ) - for( auto level = setupBeforeNextCommunication_[i].begin(); level != setupBeforeNextCommunication_[i].end(); ++level ) - *level = char(1); + for(char & level : setupBeforeNextCommunication_[i]) + level = char(1); } } @@ -419,7 +419,7 @@ inline void NonUniformBufferedScheme<Stencil>::startCommunicateCoarseToFine() if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); - const uint_t coarsestLevel = uint_t(0); + const auto coarsestLevel = uint_t(0); const uint_t finestLevel = levelIndex - uint_t(1); startCommunicationCoarseToFine( levelIndex, coarsestLevel, finestLevel ); @@ -440,7 +440,7 @@ inline void NonUniformBufferedScheme<Stencil>::startCommunicateFineToCoarse() if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() ) refresh(); - const uint_t coarsestLevel = uint_t(0); + const auto coarsestLevel = uint_t(0); const uint_t finestLevel = levelIndex - uint_t(1); startCommunicationFineToCoarse( levelIndex, coarsestLevel, finestLevel ); @@ -633,7 +633,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint for( auto it = forest->begin(); it != forest->end(); ++it ) { - Block * block = dynamic_cast< Block * >( it.get() ); + auto * block = dynamic_cast< Block * >( it.get() ); if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; @@ -660,7 +660,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) + for(auto & packInfo : packInfos_) { if( localMode_ == BUFFER ) { @@ -669,14 +669,14 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); VoidFunction pack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferPacking, this, - EQUAL_LEVEL, index, bufferIndex, std::cref( *packInfo ), block, neighbor, *dir ); + EQUAL_LEVEL, index, bufferIndex, std::cref( packInfo ), block, neighbor, *dir ); threadsafeLocalCommunication.push_back( pack ); VoidFunction unpack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferUnpacking, this, - EQUAL_LEVEL, index, bufferIndex, std::cref( *packInfo ), neighbor, block, *dir ); + EQUAL_LEVEL, index, bufferIndex, std::cref( packInfo ), neighbor, block, *dir ); - if( (*packInfo)->threadsafeReceiving() ) + if( packInfo->threadsafeReceiving() ) threadsafeLocalCommunicationUnpack.push_back( unpack ); else localCommunicationUnpack.push_back( unpack ); @@ -684,8 +684,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint else { VoidFunction localCommunicationFunction = std::bind( &blockforest::communication::NonUniformPackInfo::communicateLocalEqualLevel, - *packInfo, block, neighbor, *dir ); - if( (*packInfo)->threadsafeReceiving() ) + packInfo, block, neighbor, *dir ); + if( packInfo->threadsafeReceiving() ) threadsafeLocalCommunication.push_back( localCommunicationFunction ); else localCommunication.push_back( localCommunicationFunction ); @@ -699,18 +699,18 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint if( !packInfos_.empty() ) sendFunctions[ nProcess ].push_back( std::bind( NonUniformBufferedScheme<Stencil>::writeHeader, std::placeholders::_1, block->getId(), receiverId, *dir ) ); - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) - sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataEqualLevel, *packInfo, block, *dir, std::placeholders::_1 ) ); + for(auto & packInfo : packInfos_) + sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataEqualLevel, packInfo, block, *dir, std::placeholders::_1 ) ); } } } resetBufferSystem( bufferSystem ); - for( auto sender = sendFunctions.begin(); sender != sendFunctions.end(); ++sender ) + for( auto & sender : sendFunctions) { - bufferSystem->addSendingFunction ( int_c(sender->first), std::bind( NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender->second ) ); - bufferSystem->addReceivingFunction( int_c(sender->first), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) ); + bufferSystem->addSendingFunction ( int_c(sender.first), std::bind( NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender.second ) ); + bufferSystem->addReceivingFunction( int_c(sender.first), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) ); } setupBeforeNextCommunication = char(0); @@ -758,7 +758,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" ) for( auto it = forest->begin(); it != forest->end(); ++it ) { - Block * block = dynamic_cast< Block * >( it.get() ); + auto * block = dynamic_cast< Block * >( it.get() ); if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; @@ -786,7 +786,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) + for(auto & packInfo : packInfos_) { if( localMode_ == BUFFER ) { @@ -795,14 +795,14 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); VoidFunction pack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferPacking, this, - COARSE_TO_FINE, index, bufferIndex, std::cref( *packInfo ), block, neighbor, *dir ); + COARSE_TO_FINE, index, bufferIndex, std::cref( packInfo ), block, neighbor, *dir ); threadsafeLocalCommunication.push_back( pack ); VoidFunction unpack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferUnpacking, this, - COARSE_TO_FINE, index, bufferIndex, std::cref( *packInfo ), neighbor, block, *dir ); + COARSE_TO_FINE, index, bufferIndex, std::cref( packInfo ), neighbor, block, *dir ); - if( (*packInfo)->threadsafeReceiving() ) + if( packInfo->threadsafeReceiving() ) threadsafeLocalCommunicationUnpack.push_back( unpack ); else localCommunicationUnpack.push_back( unpack ); @@ -810,8 +810,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui else { VoidFunction localCommunicationFunction = std::bind( &blockforest::communication::NonUniformPackInfo::communicateLocalCoarseToFine, - *packInfo, block, neighbor, *dir ); - if( (*packInfo)->threadsafeReceiving() ) + packInfo, block, neighbor, *dir ); + if( packInfo->threadsafeReceiving() ) threadsafeLocalCommunication.push_back( localCommunicationFunction ); else localCommunication.push_back( localCommunicationFunction ); @@ -821,12 +821,11 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui else { auto nProcess = block->getNeighborProcess( neighborIdx, n ); - if( !packInfos_.empty() ) sendFunctions[ nProcess ].push_back( std::bind( NonUniformBufferedScheme<Stencil>::writeHeader, std::placeholders::_1, block->getId(), receiverId, *dir ) ); - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) - sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataCoarseToFine, *packInfo, block, receiverId, *dir, std::placeholders::_1 ) ); + for(auto & packInfo : packInfos_) + sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataCoarseToFine, packInfo, block, receiverId, *dir, std::placeholders::_1 ) ); } } } @@ -852,11 +851,11 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui resetBufferSystem( bufferSystem ); - for( auto sender = sendFunctions.begin(); sender != sendFunctions.end(); ++sender ) - bufferSystem->addSendingFunction( int_c(sender->first), std::bind( NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender->second ) ); + for( auto sender : sendFunctions ) + bufferSystem->addSendingFunction( int_c(sender.first), std::bind( NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender.second ) ); - for( auto receiver = ranksToReceiveFrom.begin(); receiver != ranksToReceiveFrom.end(); ++receiver ) - bufferSystem->addReceivingFunction( int_c(*receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) ); + for(auto receiver : ranksToReceiveFrom) + bufferSystem->addReceivingFunction( int_c(receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) ); setupBeforeNextCommunication = char(0); } @@ -904,7 +903,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui for( auto it = forest->begin(); it != forest->end(); ++it ) { - Block * block = dynamic_cast< Block * >( it.get() ); + auto * block = dynamic_cast< Block * >( it.get() ); if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) ) continue; @@ -932,7 +931,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) ); WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() ) - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) + for(auto & packInfo : packInfos_) { if( localMode_ == BUFFER ) { @@ -941,14 +940,14 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1); VoidFunction pack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferPacking, this, - FINE_TO_COARSE, index, bufferIndex, std::cref( *packInfo ), block, neighbor, *dir ); + FINE_TO_COARSE, index, bufferIndex, std::cref( packInfo ), block, neighbor, *dir ); threadsafeLocalCommunication.push_back( pack ); VoidFunction unpack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferUnpacking, this, - FINE_TO_COARSE, index, bufferIndex, std::cref( *packInfo ), neighbor, block, *dir ); + FINE_TO_COARSE, index, bufferIndex, std::cref( packInfo ), neighbor, block, *dir ); - if( (*packInfo)->threadsafeReceiving() ) + if( packInfo->threadsafeReceiving() ) threadsafeLocalCommunicationUnpack.push_back( unpack ); else localCommunicationUnpack.push_back( unpack ); @@ -956,8 +955,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui else { VoidFunction localCommunicationFunction = std::bind( &blockforest::communication::NonUniformPackInfo::communicateLocalFineToCoarse, - *packInfo, block, neighbor, *dir ); - if( (*packInfo)->threadsafeReceiving() ) + packInfo, block, neighbor, *dir ); + if( packInfo->threadsafeReceiving() ) threadsafeLocalCommunication.push_back( localCommunicationFunction ); else localCommunication.push_back( localCommunicationFunction ); @@ -971,8 +970,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui if( !packInfos_.empty() ) sendFunctions[ nProcess ].push_back( std::bind( NonUniformBufferedScheme<Stencil>::writeHeader, std::placeholders::_1, block->getId(), receiverId, *dir ) ); - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) - sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataFineToCoarse, *packInfo, block, receiverId, *dir, std::placeholders::_1 ) ); + for(auto & packInfo : packInfos_) + sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataFineToCoarse, packInfo, block, receiverId, *dir, std::placeholders::_1 ) ); } } } @@ -997,11 +996,11 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui resetBufferSystem( bufferSystem ); - for( auto sender = sendFunctions.begin(); sender != sendFunctions.end(); ++sender ) - bufferSystem->addSendingFunction( int_c(sender->first), std::bind( NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender->second ) ); + for( auto sender : sendFunctions ) + bufferSystem->addSendingFunction( int_c(sender.first), std::bind( NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender.second ) ); - for( auto receiver = ranksToReceiveFrom.begin(); receiver != ranksToReceiveFrom.end(); ++receiver ) - bufferSystem->addReceivingFunction( int_c(*receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) ); + for(auto receiver : ranksToReceiveFrom) + bufferSystem->addReceivingFunction( int_c(receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) ); setupBeforeNextCommunication = char(0); } @@ -1019,10 +1018,10 @@ void NonUniformBufferedScheme<Stencil>::resetBufferSystem( shared_ptr< mpi::Open bool constantSizes = true; bool threadsafeReceive = true; - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) + for(auto & packInfo : packInfos_) { - if( !(*packInfo)->constantDataExchange() ) constantSizes = false; - if( !(*packInfo)->threadsafeReceiving() ) threadsafeReceive = false; + if( !packInfo->constantDataExchange() ) constantSizes = false; + if( !packInfo->threadsafeReceiving() ) threadsafeReceive = false; } bufferSystem->setReceiverInfo( !constantSizes ); @@ -1048,8 +1047,8 @@ void NonUniformBufferedScheme<Stencil>::start( const INDEX i, const uint_t j ) if( localMode_ == START ) { - for( auto function = localCommunication.begin(); function != localCommunication.end(); ++function ) - (*function)(); + for(auto & function : localCommunication) + function(); const int threadsafeLocalCommunicationSize = int_c( threadsafeLocalCommunication.size() ); #ifdef _OPENMP @@ -1089,8 +1088,8 @@ void NonUniformBufferedScheme<Stencil>::wait( const INDEX i, const uint_t j ) if( localMode_ == WAIT ) { - for( auto function = localCommunication.begin(); function != localCommunication.end(); ++function ) - (*function)(); + for(auto & function : localCommunication) + function(); const int threadsafeLocalCommunicationSize = int_c( threadsafeLocalCommunication.size() ); #ifdef _OPENMP @@ -1101,8 +1100,8 @@ void NonUniformBufferedScheme<Stencil>::wait( const INDEX i, const uint_t j ) } else if( localMode_ == BUFFER ) { - for( auto function = localCommunicationUnpack.begin(); function != localCommunicationUnpack.end(); ++function ) - (*function)(); + for(auto & function : localCommunicationUnpack) + function(); const int threadsafeLocalCommunicationUnpackSize = int_c( threadsafeLocalCommunicationUnpack.size() ); #ifdef _OPENMP @@ -1144,8 +1143,8 @@ void NonUniformBufferedScheme<Stencil>::readHeader( RecvBuffer & buffer, BlockID template< typename Stencil > void NonUniformBufferedScheme<Stencil>::send( SendBuffer & buffer, std::vector< SendBufferFunction > & functions ) { - for( auto function = functions.begin(); function != functions.end(); ++function ) - (*function)( buffer ); + for(auto & function : functions) + function( buffer ); } @@ -1171,18 +1170,18 @@ void NonUniformBufferedScheme<Stencil>::receive( RecvBuffer & buffer ) if( senderLevel == receiverLevel ) { - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) - (*packInfo)->unpackDataEqualLevel( block, stencil::inverseDir[dir], buffer ); + for(auto & packInfo : packInfos_) + packInfo->unpackDataEqualLevel( block, stencil::inverseDir[dir], buffer ); } else if( senderLevel < receiverLevel ) // coarse to fine { - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) - (*packInfo)->unpackDataCoarseToFine( block, sender, stencil::inverseDir[dir], buffer ); + for(auto & packInfo : packInfos_) + packInfo->unpackDataCoarseToFine( block, sender, stencil::inverseDir[dir], buffer ); } else if( senderLevel > receiverLevel ) // fine to coarse { - for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo ) - (*packInfo)->unpackDataFineToCoarse( block, sender, stencil::inverseDir[dir], buffer ); + for(auto & packInfo : packInfos_) + packInfo->unpackDataFineToCoarse( block, sender, stencil::inverseDir[dir], buffer ); } } } @@ -1244,15 +1243,13 @@ void NonUniformBufferedScheme<Stencil>::localBufferUnpacking( const INDEX i, con template< typename Stencil > bool NonUniformBufferedScheme<Stencil>::isAnyCommunicationInProgress() const { - for( auto caseIt = communicationInProgress_.begin(); caseIt != communicationInProgress_.end(); ++caseIt ) - for( auto levelIt = caseIt->begin(); levelIt != caseIt->end(); ++levelIt ) - if( *levelIt ) + for(const auto & communicationInProgres : communicationInProgress_) + for(bool inProgress : communicationInProgres) + if( inProgress ) return true; return false; } -} // namespace communication -} // namespace blockforest } // namespace walberla diff --git a/src/core/mpi/BufferSystem.h b/src/core/mpi/BufferSystem.h index 04161810a408aab7bc0baf6ddb6159714333494a..02bdd60bb38452899d84abeb78d19c68b03f1b69 100644 --- a/src/core/mpi/BufferSystem.h +++ b/src/core/mpi/BufferSystem.h @@ -245,9 +245,9 @@ protected: struct SendInfo { - SendInfo() : alreadySent(false) {} + SendInfo()= default; SendBuffer_T buffer; - bool alreadySent; + bool alreadySent{false}; }; std::map<MPIRank, SendInfo> sendInfos_; diff --git a/src/core/mpi/BufferSystem.impl.h b/src/core/mpi/BufferSystem.impl.h index 183d29bd86b7412090916e0b2be92c3dc7a4c352..3715a7ea9f3daf95c0bde6163954ca72159e48d3 100644 --- a/src/core/mpi/BufferSystem.impl.h +++ b/src/core/mpi/BufferSystem.impl.h @@ -52,7 +52,7 @@ void GenericBufferSystem<Rb, Sb>::iterator::operator++() { currentRecvBuffer_ = bufferSystem_.waitForNext( currentSenderRank_ ); if ( ! currentRecvBuffer_ ) { - WALBERLA_ASSERT_EQUAL( currentSenderRank_, -1 ); + WALBERLA_ASSERT_EQUAL( currentSenderRank_, -1 ) } else { bufferSystem_.bytesReceived_ += currentRecvBuffer_->size() * sizeof(RecvBuffer::ElementType); @@ -64,7 +64,7 @@ template< typename Rb, typename Sb> bool GenericBufferSystem<Rb, Sb>::iterator::operator==( const typename GenericBufferSystem<Rb, Sb>::iterator & other ) { // only equality checks with end iterators are allowed - WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 ); + WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 ) return ( currentSenderRank_ == other.currentSenderRank_ ); } @@ -73,7 +73,7 @@ template< typename Rb, typename Sb> bool GenericBufferSystem<Rb, Sb>::iterator::operator!=( const typename GenericBufferSystem<Rb, Sb>::iterator & other ) { // only equality checks with end iterators are allowed - WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 ); + WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 ) return ( currentSenderRank_ != other.currentSenderRank_ ); } @@ -90,7 +90,7 @@ template< typename Rb, typename Sb> template<typename RankIter> void GenericBufferSystem<Rb, Sb>::setReceiverInfo( RankIter rankBegin, RankIter rankEnd, bool changingSize ) { - WALBERLA_ASSERT( ! communicationRunning_ ); + WALBERLA_ASSERT( ! communicationRunning_ ) recvInfos_.clear(); for ( auto it = rankBegin; it != rankEnd; ++it ) @@ -147,7 +147,7 @@ GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const GenericBufferSystem &oth recvInfos_( other.recvInfos_ ), sendInfos_( other.sendInfos_ ) { - WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" ); + WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" ) if( other.currentComm_ == &other.knownSizeComm_ ) currentComm_ = &knownSizeComm_; else if ( other.currentComm_ == &other.unknownSizeComm_ ) @@ -163,7 +163,7 @@ GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const GenericBufferSystem &oth template< typename Rb, typename Sb> GenericBufferSystem<Rb, Sb> & GenericBufferSystem<Rb, Sb>::operator=( const GenericBufferSystem<Rb, Sb> & other ) { - WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" ); + WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" ) sizeChangesEverytime_ = other.sizeChangesEverytime_; communicationRunning_ = other.communicationRunning_; @@ -204,12 +204,11 @@ GenericBufferSystem<Rb, Sb> & GenericBufferSystem<Rb, Sb>::operator=( const Gene template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::set<MPIRank> & ranksToRecvFrom, bool changingSize ) { - WALBERLA_ASSERT( ! communicationRunning_ ); + WALBERLA_ASSERT( ! communicationRunning_ ) recvInfos_.clear(); - for ( auto it = ranksToRecvFrom.begin(); it != ranksToRecvFrom.end(); ++it ) + for ( auto sender : ranksToRecvFrom ) { - const MPIRank sender = *it; recvInfos_[ sender ].size = INVALID_SIZE; } @@ -234,10 +233,10 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::map<MPIRank,MPISiz WALBERLA_ASSERT( ! communicationRunning_ ) recvInfos_.clear(); - for ( auto it = ranksToRecvFrom.begin(); it != ranksToRecvFrom.end(); ++it ) + for ( auto it : ranksToRecvFrom ) { - const MPIRank sender = it->first; - const MPISize senderSize = it->second; + const MPIRank sender = it.first; + const MPISize senderSize = it.second; WALBERLA_ASSERT_GREATER( senderSize, 0 ) recvInfos_[ sender ].size = senderSize; } @@ -269,7 +268,7 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::map<MPIRank,MPISiz template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::setReceiverInfoFromSendBufferState( bool useSizeFromSendBuffers, bool changingSize ) { - WALBERLA_ASSERT( ! communicationRunning_ ); + WALBERLA_ASSERT( ! communicationRunning_ ) recvInfos_.clear(); for ( auto it = sendInfos_.begin(); it != sendInfos_.end(); ++it ) @@ -303,7 +302,7 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfoFromSendBufferState( bool useSi template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::sizeHasChanged( bool alwaysChangingSize ) { - WALBERLA_ASSERT( ! communicationRunning_ ); + WALBERLA_ASSERT( ! communicationRunning_ ) sizeChangesEverytime_ = alwaysChangingSize; setCommunicationType( false ); @@ -347,7 +346,7 @@ Sb & GenericBufferSystem<Rb, Sb>::sendBuffer( MPIRank rank ) template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::sendAll() { - WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ); // call setReceiverInfo first! + WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ) // call setReceiverInfo first! if ( !communicationRunning_ ) startCommunication(); @@ -380,14 +379,14 @@ void GenericBufferSystem<Rb, Sb>::sendAll() template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::send( MPIRank rank ) { - WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ); // call setReceiverInfo first! + WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ) // call setReceiverInfo first! if ( !communicationRunning_ ) startCommunication(); auto iter = sendInfos_.find(rank); - WALBERLA_ASSERT( iter != sendInfos_.end() ); // no send buffer was created for this rank - WALBERLA_ASSERT( ! iter->second.alreadySent ); // this buffer has already been sent + WALBERLA_ASSERT( iter != sendInfos_.end() ) // no send buffer was created for this rank + WALBERLA_ASSERT( ! iter->second.alreadySent ) // this buffer has already been sent if ( iter->second.buffer.size() > 0 ) { @@ -424,10 +423,10 @@ void GenericBufferSystem<Rb, Sb>::startCommunication() const auto tag = currentComm_->getTag(); WALBERLA_CHECK_EQUAL(activeTags_.find(tag), activeTags_.end(), - "Another communication with the same MPI tag is currently in progress."); + "Another communication with the same MPI tag is currently in progress.") activeTags_.insert(tag); - WALBERLA_CHECK( ! communicationRunning_ ); + WALBERLA_CHECK( ! communicationRunning_ ) currentComm_->scheduleReceives( recvInfos_ ); communicationRunning_ = true; @@ -452,7 +451,7 @@ void GenericBufferSystem<Rb, Sb>::startCommunication() template< typename Rb, typename Sb> void GenericBufferSystem<Rb, Sb>::endCommunication() { - WALBERLA_CHECK( communicationRunning_ ); + WALBERLA_CHECK( communicationRunning_ ) currentComm_->waitForSends(); // Clear send buffers @@ -482,7 +481,7 @@ void GenericBufferSystem<Rb, Sb>::endCommunication() template< typename Rb, typename Sb> Rb * GenericBufferSystem<Rb, Sb>::waitForNext( MPIRank & fromRank ) { - WALBERLA_ASSERT( communicationRunning_ ); + WALBERLA_ASSERT( communicationRunning_ ) fromRank = currentComm_->waitForNextReceive( recvInfos_ ); @@ -531,7 +530,7 @@ void GenericBufferSystem<Rb, Sb>::setCommunicationType( const bool knownSize ) template< typename Rb, typename Sb> typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::noRanks() { - return RankRange(); + return {}; } template< typename Rb, typename Sb> typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allRanks() @@ -544,7 +543,7 @@ typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allR template< typename Rb, typename Sb> typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allRanksButRoot() { - int numProcesses = MPIManager::instance()->numProcesses(); + int const numProcesses = MPIManager::instance()->numProcesses(); RankRange r; std::generate_n( std::inserter(r, r.end()), numProcesses-1, [&]{ return MPIRank(r.size()+size_t(1)); } ); return r; @@ -552,7 +551,7 @@ typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allR template< typename Rb, typename Sb> typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::onlyRank( int includedRank ) { - WALBERLA_ASSERT_LESS( includedRank, MPIManager::instance()->numProcesses() ); + WALBERLA_ASSERT_LESS( includedRank, MPIManager::instance()->numProcesses() ) return RankRange { includedRank }; } diff --git a/src/core/mpi/BufferSystemHelper.impl.h b/src/core/mpi/BufferSystemHelper.impl.h index e6236f100ca3d8f58817824c16ac5844e41f4af6..beee5bf78d4b03926b516a09b19b482e77254ba7 100644 --- a/src/core/mpi/BufferSystemHelper.impl.h +++ b/src/core/mpi/BufferSystemHelper.impl.h @@ -147,14 +147,14 @@ MPIRank KnownSizeCommunication<Rb, Sb>::waitForNextReceive( std::map<MPIRank, Re recvRequests_[ uint_c( requestIndex ) ] = MPI_REQUEST_NULL; - MPIRank senderRank = status.MPI_SOURCE; + MPIRank const senderRank = status.MPI_SOURCE; WALBERLA_ASSERT_GREATER_EQUAL( senderRank, 0 ); #ifndef NDEBUG int receivedBytes; MPI_Get_count( &status, MPI_BYTE, &receivedBytes ); - WALBERLA_ASSERT_EQUAL ( recvInfos[senderRank].size, receivedBytes ); + WALBERLA_ASSERT_EQUAL ( recvInfos[senderRank].size, receivedBytes ) #else WALBERLA_UNUSED( recvInfos ); #endif @@ -493,10 +493,7 @@ void NoMPICommunication<Rb, Sb>::send( MPIRank receiver, const Sb & sendBuffer ) } template< typename Rb, typename Sb> -void NoMPICommunication<Rb, Sb>::waitForSends() -{ - return; -} +void NoMPICommunication<Rb, Sb>::waitForSends(){} template< typename Rb, typename Sb> void NoMPICommunication<Rb, Sb>::scheduleReceives( std::map<MPIRank, ReceiveInfo> & recvInfos ) diff --git a/src/core/mpi/OpenMPBufferSystem.impl.h b/src/core/mpi/OpenMPBufferSystem.impl.h index 5230d5b9de74a93ba98db6081e53ee752cc86bf7..f87895d7a3467ae4f7984e91e95c1c75472e13b3 100644 --- a/src/core/mpi/OpenMPBufferSystem.impl.h +++ b/src/core/mpi/OpenMPBufferSystem.impl.h @@ -57,7 +57,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::addSendingFunction( MPIRank rank, const dirty_ = true; sendRanks_.push_back( rank ); sendFunctions_.push_back( sendFunction ); - WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() ); + WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() ) } @@ -72,8 +72,8 @@ void GenericOpenMPBufferSystem<Rb, Sb>::setupBufferSystem() []( typename decltype(recvFunctions_)::value_type r ){ return r.first; }); bs_.setReceiverInfo( recvRanks, sizeChangesEverytime_ ); - for( auto sendRank = sendRanks_.begin(); sendRank != sendRanks_.end(); ++sendRank ) // Do NOT delete this for loop! This loop is needed ... - bs_.sendBuffer( *sendRank ); // ... so that the "sendBuffer(rank)" call in startCommunicationOpenMP is thread-safe! + for(auto sendRank : sendRanks_) // Do NOT delete this for loop! This loop is needed ... + bs_.sendBuffer( sendRank ); // ... so that the "sendBuffer(rank)" call in startCommunicationOpenMP is thread-safe! dirty_ = false; } @@ -101,7 +101,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::startCommunicationSerial() { bs_.scheduleReceives(); - WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() ); + WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() ) const uint_t nrOfSendFunctions = sendFunctions_.size(); @@ -122,7 +122,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::startCommunicationOpenMP() { bs_.scheduleReceives(); - WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() ); + WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() ) const int nrOfSendFunctions = int_c( sendFunctions_.size() ); @@ -173,7 +173,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::waitSerial() RecvBuffer & recvBuffer = recvIt.buffer(); // call unpacking - WALBERLA_ASSERT( recvFunctions_.find( rank ) != recvFunctions_.end() ); + WALBERLA_ASSERT( recvFunctions_.find( rank ) != recvFunctions_.end() ) recvFunctions_[rank] ( recvBuffer ); } } @@ -198,19 +198,19 @@ void GenericOpenMPBufferSystem<Rb, Sb>::waitOpenMP() recvBuffer = bs_.waitForNext( recvRank ); - WALBERLA_ASSERT_GREATER_EQUAL( recvRank, 0 ); - WALBERLA_ASSERT_NOT_NULLPTR( recvBuffer ); + WALBERLA_ASSERT_GREATER_EQUAL( recvRank, 0 ) + WALBERLA_ASSERT_NOT_NULLPTR( recvBuffer ) - WALBERLA_ASSERT( recvFunctions_.find( recvRank ) != recvFunctions_.end() ); + WALBERLA_ASSERT( recvFunctions_.find( recvRank ) != recvFunctions_.end() ) recvFunctions_[recvRank] ( *recvBuffer ); } MPIRank rank; RecvBuffer * ret = bs_.waitForNext( rank ); - WALBERLA_ASSERT_NULLPTR( ret ); // call last time to finish communication + WALBERLA_ASSERT_NULLPTR( ret ) // call last time to finish communication WALBERLA_UNUSED( ret ); - WALBERLA_ASSERT( ! bs_.isCommunicationRunning() ); + WALBERLA_ASSERT( ! bs_.isCommunicationRunning() ) } diff --git a/src/gpu/communication/CustomMemoryBuffer.h b/src/gpu/communication/CustomMemoryBuffer.h index e01e873708d84788fcecfb33a83ab3616b07c752..ce8fbc49acea51b736c79e11525c87ec482f069e 100644 --- a/src/gpu/communication/CustomMemoryBuffer.h +++ b/src/gpu/communication/CustomMemoryBuffer.h @@ -73,6 +73,7 @@ namespace communication { void resize( std::size_t newSize ); inline std::size_t allocSize() const { return std::size_t(end_ - begin_); } inline std::size_t size() const { return std::size_t(cur_ - begin_); } + inline std::size_t remainingSize() const { return std::size_t(end_ - cur_); } ElementType *ptr() const { return begin_; } ElementType *cur() const { return cur_; } diff --git a/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h index f6b39d9b0fe0dd1b9c90c5d63eb7b8ca00bd3d0f..a5dc8d8cfccf70decacc7b33ecb5318e0490b933 100644 --- a/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h +++ b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h @@ -38,32 +38,29 @@ namespace walberla::gpu { class GeneratedNonUniformGPUPackInfo { public: - using VoidFunction = std::function< void( gpuStream_t) >; GeneratedNonUniformGPUPackInfo() = default; virtual ~GeneratedNonUniformGPUPackInfo() = default; virtual bool constantDataExchange() const = 0; - virtual bool threadsafeReceiving() const = 0; + virtual bool threadsafeReceiving() const = 0; - inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer) const; - virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; + inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream = nullptr) const; + virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir, gpuStream_t stream) = 0; - virtual void getLocalEqualLevelCommFunction( std::vector< VoidFunction >& commFunctions, const Block * sender, Block * receiver, stencil::Direction dir) = 0; - inline void packDataCoarseToFine ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const; - virtual void unpackDataCoarseToFine ( Block * fineReceiver, const BlockID & coarseSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0; - virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir ) = 0; + inline void packDataCoarseToFine ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream = nullptr) const; + virtual void unpackDataCoarseToFine ( Block * fineReceiver, const BlockID & coarseSender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; + virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, gpuStream_t stream) = 0; virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; - virtual void getLocalCoarseToFineCommFunction( std::vector< VoidFunction >& commFunctions, const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; - inline void packDataFineToCoarse ( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const; - virtual void unpackDataFineToCoarse ( Block * coarseReceiver, const BlockID & fineSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0; - virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir) = 0; + inline void packDataFineToCoarse ( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream = nullptr) const; + virtual void unpackDataFineToCoarse ( Block * coarseReceiver, const BlockID & fineSender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; + virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, gpuStream_t stream) = 0; virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0; - virtual void getLocalFineToCoarseCommFunction( std::vector< VoidFunction >& commFunctions, const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0; virtual uint_t sizeEqualLevelSend( const Block * sender, stencil::Direction dir) = 0; virtual uint_t sizeCoarseToFineSend ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir) = 0; + virtual uint_t sizeCoarseToFineReceive ( Block* fineReceiver, stencil::Direction dir) = 0; virtual uint_t sizeFineToCoarseSend ( const Block * fineSender, stencil::Direction dir) = 0; @@ -72,9 +69,9 @@ class GeneratedNonUniformGPUPackInfo #endif protected: - virtual void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; - virtual void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; - virtual void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0; + virtual void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const = 0; + virtual void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const = 0; + virtual void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const = 0; #ifndef NDEBUG mutable std::map< const Block *, std::map< stencil::Direction, std::map< uint_t, size_t > > > bufferSize_; @@ -82,13 +79,13 @@ class GeneratedNonUniformGPUPackInfo }; -inline void GeneratedNonUniformGPUPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer ) const +inline void GeneratedNonUniformGPUPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const { #ifndef NDEBUG size_t const sizeBefore = buffer.size(); #endif - packDataEqualLevelImpl( sender, dir, buffer ); + packDataEqualLevelImpl( sender, dir, buffer, stream); #ifndef NDEBUG size_t const sizeAfter = buffer.size(); @@ -107,13 +104,13 @@ if( constantDataExchange() ) -inline void GeneratedNonUniformGPUPackInfo::packDataCoarseToFine( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const +inline void GeneratedNonUniformGPUPackInfo::packDataCoarseToFine( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const { #ifndef NDEBUG size_t const sizeBefore = buffer.size(); #endif - packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer ); + packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer, stream); #ifndef NDEBUG size_t const sizeAfter = buffer.size(); @@ -132,13 +129,13 @@ if( constantDataExchange() ) -inline void GeneratedNonUniformGPUPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const +inline void GeneratedNonUniformGPUPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const { #ifndef NDEBUG size_t const sizeBefore = buffer.size(); #endif - packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer ); + packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer, stream); #ifndef NDEBUG size_t const sizeAfter = buffer.size(); diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h index 4b9576434d52dc8726daa35ab91a32fd8984a780..1d3583a12f83295114c5758769689aac186e8399 100644 --- a/src/gpu/communication/NonUniformGPUScheme.h +++ b/src/gpu/communication/NonUniformGPUScheme.h @@ -23,40 +23,41 @@ #include "blockforest/StructuredBlockForest.h" +#include "core/DataTypes.h" #include "core/mpi/BufferSystem.h" +#include "core/mpi/MPIManager.h" #include "core/mpi/MPIWrapper.h" #include "domain_decomposition/IBlock.h" #include "stencil/Directions.h" -#include <thread> - #include "gpu/ErrorChecking.h" -#include "gpu/GPURAII.h" #include "gpu/GPUWrapper.h" -#include "gpu/ParallelStreams.h" #include "gpu/communication/CustomMemoryBuffer.h" #include "gpu/communication/GeneratedNonUniformGPUPackInfo.h" +#include <memory> +#include <thread> + namespace walberla::gpu::communication { template< typename Stencil > class NonUniformGPUScheme { - public: +public: enum INDEX { EQUAL_LEVEL = 0, COARSE_TO_FINE = 1, FINE_TO_COARSE = 2 }; using CpuBuffer_T = walberla::gpu::communication::PinnedMemoryBuffer; using GpuBuffer_T = walberla::gpu::communication::GPUMemoryBuffer; explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU = false, - const int tag = 5432); + int tag = 5432); explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, const Set< SUID >& requiredBlockSelectors, const Set< SUID >& incompatibleBlockSelectors, bool sendDirectlyFromGPU = false, - const int tag = 5432); + int tag = 5432); ~NonUniformGPUScheme(); @@ -67,9 +68,9 @@ class NonUniformGPUScheme //@} //******************************************************************************************************************* - inline void communicateEqualLevel(const uint_t level); - inline void communicateCoarseToFine(const uint_t fineLevel); - inline void communicateFineToCoarse(const uint_t fineLevel); + inline void communicateEqualLevel(uint_t level); + inline void communicateCoarseToFine(uint_t fineLevel); + inline void communicateFineToCoarse(uint_t fineLevel); std::function<void()> communicateEqualLevelFunctor(const uint_t level) { return [level, this](){ NonUniformGPUScheme::communicateEqualLevel(level);}; @@ -81,28 +82,28 @@ class NonUniformGPUScheme return [fineLevel, this](){ NonUniformGPUScheme::communicateFineToCoarse(fineLevel);}; } - inline void startCommunicateEqualLevel(const uint_t level); - inline void startCommunicateCoarseToFine(const uint_t fineLevel); - inline void startCommunicateFineToCoarse(const uint_t fineLevel); + inline void startCommunicateEqualLevel(uint_t level); + inline void startCommunicateCoarseToFine(uint_t fineLevel); + inline void startCommunicateFineToCoarse(uint_t fineLevel); - inline void waitCommunicateEqualLevel(const uint_t level); - inline void waitCommunicateCoarseToFine(const uint_t fineLevel); - inline void waitCommunicateFineToCoarse(const uint_t fineLevel); + inline void waitCommunicateEqualLevel(uint_t level); + inline void waitCommunicateCoarseToFine(uint_t fineLevel); + inline void waitCommunicateFineToCoarse(uint_t fineLevel); - private: +private: void setupCommunication(); void init(); void refresh(); - bool isAnyCommunicationInProgress() const; + [[nodiscard]] bool isAnyCommunicationInProgress() const; - void startCommunicationEqualLevel(const uint_t index, std::set< uint_t >& participatingLevels); - void startCommunicationCoarseToFine(const uint_t index, const uint_t coarsestLevel); - void startCommunicationFineToCoarse(const uint_t index, const uint_t finestLevel); + void startCommunicationEqualLevel(uint_t index, std::set< uint_t >& participatingLevels); + void startCommunicationCoarseToFine(uint_t index, uint_t coarsestLevel); + void startCommunicationFineToCoarse(uint_t index, uint_t finestLevel); weak_ptr< StructuredBlockForest > blockForest_; - uint_t forestModificationStamp_; + uint_t forestModificationStamp_{uint_c(0)}; std::vector< std::vector< bool > > communicationInProgress_; bool sendFromGPU_; @@ -111,11 +112,10 @@ class NonUniformGPUScheme std::vector< std::vector< mpi::GenericBufferSystem< CpuBuffer_T, CpuBuffer_T > > > bufferSystemCPU_; std::vector< std::vector< mpi::GenericBufferSystem< GpuBuffer_T, GpuBuffer_T > > > bufferSystemGPU_; std::vector< std::vector< GpuBuffer_T > > localBuffer_; + GpuBuffer_T adaptiveGPUBuffer; std::vector< shared_ptr< GeneratedNonUniformGPUPackInfo > > packInfos_; - ParallelStreams parallelSectionManager_; - struct Header { BlockID receiverId; @@ -126,13 +126,15 @@ class NonUniformGPUScheme Set< SUID > requiredBlockSelectors_; Set< SUID > incompatibleBlockSelectors_; + + gpuStream_t streams_[Stencil::Q]; }; template< typename Stencil > NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU, const int tag) - : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), parallelSectionManager_(-1), - requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet()) + : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), + requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet()) { WALBERLA_MPI_SECTION() { @@ -148,9 +150,8 @@ NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBl const Set< SUID >& requiredBlockSelectors, const Set< SUID >& incompatibleBlockSelectors, bool sendDirectlyFromGPU, const int tag) - : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors), - incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), - parallelSectionManager_(-1) + : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors), + incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag) { WALBERLA_MPI_SECTION() { @@ -172,6 +173,11 @@ void NonUniformGPUScheme< Stencil >::init() communicationInProgress_.resize(3); refresh(); + + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) + } } template< typename Stencil > @@ -195,10 +201,8 @@ void NonUniformGPUScheme< Stencil >::refresh() for (uint_t j = 0; j <= levels; ++j) { headers_[i][j].clear(); - bufferSystemCPU_[i].emplace_back( - mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); - bufferSystemGPU_[i].emplace_back( - mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); + bufferSystemCPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); + bufferSystemGPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j)); localBuffer_[i].emplace_back(); } @@ -206,10 +210,9 @@ void NonUniformGPUScheme< Stencil >::refresh() } #ifndef NDEBUG - for (auto p = packInfos_.begin(); p != packInfos_.end(); ++p) - (*p)->clearBufferSizeCheckMap(); + for (auto & packInfo : packInfos_) + packInfo->clearBufferSizeCheckMap(); #endif - forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); } @@ -304,63 +307,66 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear(); // Start filling send buffers + for (auto& iBlock : *forest) { - for (auto& iBlock : *forest) + auto senderBlock = dynamic_cast< Block* >(&iBlock); + + if (!selectable::isSetSelected(senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; + + if (participatingLevels.find(senderBlock->getLevel()) == participatingLevels.end()) + continue; + + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) { - auto senderBlock = dynamic_cast< Block* >(&iBlock); + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); - if (!selectable::isSetSelected(senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + if (!(senderBlock->neighborhoodSectionHasEquallySizedBlock(neighborIdx))) continue; - - if (participatingLevels.find(senderBlock->getLevel()) == participatingLevels.end()) + WALBERLA_ASSERT_EQUAL(senderBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + if (!selectable::isSetSelected(senderBlock->getNeighborState(neighborIdx, uint_t(0)),requiredBlockSelectors_, incompatibleBlockSelectors_)) continue; - for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) { - const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); - - if (!(senderBlock->neighborhoodSectionHasEquallySizedBlock(neighborIdx))) - continue; - WALBERLA_ASSERT_EQUAL(senderBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) - if (!selectable::isSetSelected(senderBlock->getNeighborState(neighborIdx, uint_t(0)),requiredBlockSelectors_, incompatibleBlockSelectors_)) - continue; + auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); + for (auto& pi : packInfos_) + { + pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, streams_[*dir]); + } + } + else + { + auto nProcess = mpi::MPIRank(senderBlock->getNeighborProcess(neighborIdx, uint_t(0))); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(nProcess); - if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + for (auto& pi : packInfos_) { - auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); - for (auto& pi : packInfos_) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeEqualLevelSend(senderBlock, *dir)) + if(sendFromGPU_) { - pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, nullptr); + pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[*dir]); } - } - else - { - auto nProcess = mpi::MPIRank(senderBlock->getNeighborProcess(neighborIdx, uint_t(0))); - GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(nProcess); - - for (auto& pi : packInfos_) + else { - WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeEqualLevelSend(senderBlock, *dir)) - - pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer); - - if (!sendFromGPU_) - { - auto gpuDataPtr = gpuDataBuffer.cur(); - auto size = pi->sizeEqualLevelSend(senderBlock, *dir); - auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size); - WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) - } + auto gpuDataPtr = gpuDataBuffer.cur(); + // packDataEqualLevel moves the pointer with advanceNoResize + pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[*dir]); + auto size = pi->sizeEqualLevelSend(senderBlock, *dir); + auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir])) } } } } } - // wait for packing to finish - WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } if (sendFromGPU_) @@ -391,78 +397,78 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); // Start filling send buffers + for (auto& iBlock : *forest) { - for (auto& iBlock : *forest) - { - auto coarseBlock = dynamic_cast< Block* >(&iBlock); - auto nLevel = coarseBlock->getLevel(); + auto coarseBlock = dynamic_cast< Block* >(&iBlock); + auto nLevel = coarseBlock->getLevel(); - if (!selectable::isSetSelected(coarseBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) - continue; + if (!selectable::isSetSelected(coarseBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; - if (nLevel != coarsestLevel) continue; + if (nLevel != coarsestLevel) continue; - for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) - { - const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + + if (coarseBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + if (!(coarseBlock->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue; - if (coarseBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; - if (!(coarseBlock->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue; + for (uint_t n = 0; n != coarseBlock->getNeighborhoodSectionSize(neighborIdx); ++n) + { + const BlockID& fineReceiverId = coarseBlock->getNeighborId(neighborIdx, n); + if (!selectable::isSetSelected(coarseBlock->getNeighborState(neighborIdx, n), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; - for (uint_t n = 0; n != coarseBlock->getNeighborhoodSectionSize(neighborIdx); ++n) + if( coarseBlock->neighborExistsLocally( neighborIdx, n ) ) { - const BlockID& fineReceiverId = coarseBlock->getNeighborId(neighborIdx, n); - if (!selectable::isSetSelected(coarseBlock->getNeighborState(neighborIdx, n), requiredBlockSelectors_, - incompatibleBlockSelectors_)) - continue; + auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) ); + GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index]; - if( coarseBlock->neighborExistsLocally( neighborIdx, n ) ) + for (auto& pi : packInfos_) { - auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) ); - // for (auto& pi : packInfos_) - // { - // pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir); - // } - - GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index]; WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - - for (auto& pi : packInfos_) - { - WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) - pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr); - } + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) + pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]); } - else + } + else + { + auto nProcess = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n)); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess); + gpuDataBuffer.clear(); + for (auto& pi : packInfos_) { - auto nProcess = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n)); - GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess); - for (auto& pi : packInfos_) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) + if (sendFromGPU_) { - WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) - - pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer); - - if (!sendFromGPU_) - { - auto gpuDataPtr = gpuDataBuffer.cur(); - auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir); - auto cpuDataPtr = - bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size); - WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) - } + pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + } + else + { + auto gpuDataPtr = gpuDataBuffer.cur(); + // packDataCoarseToFine moves the pointer with advanceNoResize + pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir); + auto cpuDataPtr = bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir])) } } } } - localBuffer_[COARSE_TO_FINE][index].clear(); } + localBuffer_[COARSE_TO_FINE][index].clear(); } + // wait for packing to finish - WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } if (sendFromGPU_) bufferSystemGPU_[COARSE_TO_FINE][index].sendAll(); @@ -494,74 +500,73 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); // Start filling send buffers + for (auto& iBlock : *forest) { - for (auto& iBlock : *forest) - { - auto fineBlock = dynamic_cast< Block* >(&iBlock); - auto nLevel = fineBlock->getLevel(); + auto fineBlock = dynamic_cast< Block* >(&iBlock); + auto nLevel = fineBlock->getLevel(); - if (!selectable::isSetSelected(fineBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) - continue; + if (!selectable::isSetSelected(fineBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) + continue; - if (nLevel != finestLevel) continue; + if (nLevel != finestLevel) continue; - for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) - { - const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); + for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); - if (fineBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; - if (!(fineBlock->neighborhoodSectionHasLargerBlock(neighborIdx))) continue; - WALBERLA_ASSERT_EQUAL(fineBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) + if (fineBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + if (!(fineBlock->neighborhoodSectionHasLargerBlock(neighborIdx))) continue; + WALBERLA_ASSERT_EQUAL(fineBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) - const BlockID& coarseReceiverId = fineBlock->getNeighborId(neighborIdx, uint_t(0)); - if (!selectable::isSetSelected(fineBlock->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, - incompatibleBlockSelectors_)) - continue; - if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) - { - auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) ); - // for (auto& pi : packInfos_) - // { - // pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir); - // } + const BlockID& coarseReceiverId = fineBlock->getNeighborId(neighborIdx, uint_t(0)); + if (!selectable::isSetSelected(fineBlock->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, + incompatibleBlockSelectors_)) + continue; + if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) ) + { + auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) ); + GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index]; - GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index]; + for (auto& pi : packInfos_) + { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - for (auto& pi : packInfos_) - { - WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) - pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr); - } + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) + pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]); } - else + } + else + { + auto nProcess = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0))); + GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess); + gpuDataBuffer.clear(); + for (auto& pi : packInfos_) { - auto nProcess = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0))); - GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess); - - for (auto& pi : packInfos_) + WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) + WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeFineToCoarseSend(fineBlock, *dir)) + if (sendFromGPU_) { - WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) - - pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer); - - if (!sendFromGPU_) - { - auto gpuDataPtr = gpuDataBuffer.cur(); - auto size = pi->sizeFineToCoarseSend(fineBlock, *dir); - auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size); - WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost)) - } + pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + } + else + { + auto gpuDataPtr = gpuDataBuffer.cur(); + // packDataFineToCoarse moves the pointer with advanceNoResize + pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[*dir]); + auto size = pi->sizeFineToCoarseSend(fineBlock, *dir); + auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size); + WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) + WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir])) } } } - localBuffer_[FINE_TO_COARSE][index].clear(); } + localBuffer_[FINE_TO_COARSE][index].clear(); } - // wait for packing to finish - WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } if (sendFromGPU_) bufferSystemGPU_[FINE_TO_COARSE][index].sendAll(); @@ -596,9 +601,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve { GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - // parallelSection.run([&](auto s) { - pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer); - // }); + pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]); } } } @@ -608,29 +611,31 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve for (auto recvInfo = bufferSystemCPU_[EQUAL_LEVEL][level].begin(); recvInfo != bufferSystemCPU_[EQUAL_LEVEL][level].end(); ++recvInfo) { - auto& gpuBuffer = bufferSystemGPU_[EQUAL_LEVEL][level].sendBuffer(recvInfo.rank()); + auto &gpuBuffer = bufferSystemGPU_[EQUAL_LEVEL][level].sendBuffer(recvInfo.rank()); recvInfo.buffer().clear(); gpuBuffer.clear(); - for (auto& header : headers_[EQUAL_LEVEL][level][recvInfo.rank()]) + + for (auto &header : headers_[EQUAL_LEVEL][level][recvInfo.rank()]) { auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); - auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); - for (auto& pi : packInfos_) { - auto size = pi->sizeEqualLevelSend(senderBlock, header.dir); + auto size = pi->sizeEqualLevelSend(block, stencil::inverseDir[header.dir]); auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) - WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, nullptr)) - pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer); + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]])) + pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer, streams_[stencil::inverseDir[header.dir]]); } } } - WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + } + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) } communicationInProgress_[EQUAL_LEVEL][level] = false; } @@ -647,61 +652,51 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi "Trying to access communication for a block storage object that doesn't exist anymore") WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) - if (sendFromGPU_) - { - // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); + if (sendFromGPU_) { for (auto recvInfo = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].begin(); - recvInfo != bufferSystemGPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) - { + recvInfo != bufferSystemGPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) { recvInfo.buffer().clear(); - for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) - { - auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); - auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); - - for (auto& pi : packInfos_) - { - // auto size = pi->sizeCoarseToFineSend( senderBlock, block->getId(), header.dir ); - GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); + for (auto &header: headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) { + auto fineReceiver = dynamic_cast< Block * >(forest->getBlock(header.receiverId)); + for (auto &pi: packInfos_) { + GpuBuffer_T &gpuDataBuffer = recvInfo.buffer(); WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - // parallelSection.run([&](auto s) { - pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer); - // }); + pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], + gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]); } } } - } - else + } else { - auto parallelSection = parallelSectionManager_.parallelSection(nullptr); for (auto recvInfo = bufferSystemCPU_[COARSE_TO_FINE][fineLevel].begin(); - recvInfo != bufferSystemCPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) - { - auto& gpuBuffer = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].sendBuffer(recvInfo.rank()); + recvInfo != bufferSystemCPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) { + adaptiveGPUBuffer.clear(); + adaptiveGPUBuffer.resize(recvInfo.buffer().allocSize()); recvInfo.buffer().clear(); - gpuBuffer.clear(); - for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) - { - auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); - auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); - for (auto& pi : packInfos_) - { - auto size = pi->sizeCoarseToFineSend(senderBlock, block->getId(), header.dir); + for (auto &header: headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) { + auto fineReceiver = dynamic_cast< Block * >(forest->getBlock(header.receiverId)); + // WALBERLA_ASSERT_NOT_NULLPTR(fineReceiver) + for (auto &pi: packInfos_) { + auto size = pi->sizeCoarseToFineReceive(fineReceiver, stencil::inverseDir[header.dir]); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); - auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + auto gpuDataPtr = adaptiveGPUBuffer.cur(); // advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) - parallelSection.run([&](auto s) { - WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s)) - pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer); - }); + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]])) + pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[stencil::inverseDir[header.dir]]); } } } } + + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } communicationInProgress_[COARSE_TO_FINE][fineLevel] = false; } @@ -716,11 +711,9 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi WALBERLA_CHECK_NOT_NULLPTR(forest, "Trying to access communication for a block storage object that doesn't exist anymore") WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels()) - // WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() ); if (sendFromGPU_) { - // auto parallelSection = parallelSectionManager_.parallelSection( nullptr ); for (auto recvInfo = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].begin(); recvInfo != bufferSystemGPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo) { @@ -728,50 +721,44 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()]) { auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); - auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); - for (auto& pi : packInfos_) { GpuBuffer_T& gpuDataBuffer = recvInfo.buffer(); WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) - // parallelSection.run([&](auto s) { - pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer); - // }); + pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]); } } } } else { - auto parallelSection = parallelSectionManager_.parallelSection(nullptr); for (auto recvInfo = bufferSystemCPU_[FINE_TO_COARSE][fineLevel].begin(); recvInfo != bufferSystemCPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo) { - auto& gpuBuffer = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].sendBuffer(recvInfo.rank()); - recvInfo.buffer().clear(); - gpuBuffer.clear(); + adaptiveGPUBuffer.clear(); + adaptiveGPUBuffer.resize(recvInfo.buffer().allocSize()); for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()]) { auto block = dynamic_cast< Block* >(forest->getBlock(header.receiverId)); - auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId)); - for (auto& pi : packInfos_) { - auto size = pi->sizeFineToCoarseSend(senderBlock, header.dir); + auto size = pi->sizeFineToCoarseSend(block, stencil::inverseDir[header.dir]); auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size); - auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size ); + auto gpuDataPtr = adaptiveGPUBuffer.cur(); // advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr) WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr) - parallelSection.run([&](auto s) { - WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s)) - pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer); - }); + WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]])) + pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[stencil::inverseDir[header.dir]]); } } } } + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } communicationInProgress_[FINE_TO_COARSE][fineLevel] = false; } @@ -785,25 +772,32 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() "Trying to access communication for a block storage object that doesn't exist anymore") const uint_t levels = forest->getNumberOfLevels(); - std::vector< std::vector< std::map< mpi::MPIRank, mpi::MPISize > > > - receiverInfo; // how many bytes to send to each neighbor - std::vector< std::vector< mpi::BufferSystem > > headerExchangeBs; + std::vector< std::vector< std::map< mpi::MPIRank, mpi::MPISize > > > senderInfo; // how many bytes to send to each neighbor + std::vector< std::vector< std::set< mpi::MPIRank > > > receiverInfo; // how many bytes to receive from each neighbor + std::vector< std::vector< shared_ptr< mpi::BufferSystem > > > headerExchangeBs; + + std::vector< std::vector< mpi::MPISize > > localBufferSize; + + localBufferSize.resize(3); + senderInfo.resize(3); receiverInfo.resize(3); + + senderInfo[EQUAL_LEVEL].resize(levels + uint_c(1)); + senderInfo[COARSE_TO_FINE].resize(levels + uint_c(1)); + senderInfo[FINE_TO_COARSE].resize(levels + uint_c(1)); + receiverInfo[EQUAL_LEVEL].resize(levels + uint_c(1)); receiverInfo[COARSE_TO_FINE].resize(levels + uint_c(1)); receiverInfo[FINE_TO_COARSE].resize(levels + uint_c(1)); - std::vector< std::vector< mpi::MPISize > > localBufferSize; - headerExchangeBs.resize(3); - localBufferSize.resize(3); for (uint_t j = 0; j <= levels; ++j) { - headerExchangeBs[EQUAL_LEVEL].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); - headerExchangeBs[COARSE_TO_FINE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); - headerExchangeBs[FINE_TO_COARSE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[EQUAL_LEVEL].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[COARSE_TO_FINE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123)); + headerExchangeBs[FINE_TO_COARSE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123)); localBufferSize[EQUAL_LEVEL].push_back(mpi::MPISize(0)); localBufferSize[COARSE_TO_FINE].push_back(mpi::MPISize(0)); @@ -816,7 +810,7 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() if (!selectable::isSetSelected(block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) continue; const BlockID& senderId = block->getId(); - auto nLevel = block->getLevel(); + auto level = block->getLevel(); for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir) { @@ -824,6 +818,7 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir); if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue; + // EQUAL_LEVEL communication if (block->neighborhoodSectionHasEquallySizedBlock(neighborIdx)) { WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) @@ -838,25 +833,28 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() for (auto& pi : packInfos_) { - receiverInfo[EQUAL_LEVEL][nLevel][nProcess] += mpi::MPISize(pi->sizeEqualLevelSend(block, *dir)); + senderInfo[EQUAL_LEVEL][level][nProcess] += mpi::MPISize(pi->sizeEqualLevelSend(block, *dir)); } - auto& headerBuffer = headerExchangeBs[EQUAL_LEVEL][nLevel].sendBuffer(nProcess); + auto& headerBuffer = headerExchangeBs[EQUAL_LEVEL][level]->sendBuffer(nProcess); receiverId.toBuffer(headerBuffer); senderId.toBuffer(headerBuffer); headerBuffer << *dir; + + receiverInfo[EQUAL_LEVEL][level].insert( nProcess ); } else if (block->neighborhoodSectionHasSmallerBlocks(neighborIdx)) { - auto fineLevel = nLevel + uint_c(1); // For indexing always the fineLevel is taken to be consistent. + auto fineLevel = level + uint_c(1); // For indexing always the fineLevel is taken to be consistent. WALBERLA_ASSERT_LESS(fineLevel, levels) for (uint_t n = 0; n != block->getNeighborhoodSectionSize(neighborIdx); ++n) { const BlockID& receiverId = block->getNeighborId(neighborIdx, n); - if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, n), requiredBlockSelectors_, - incompatibleBlockSelectors_)) + + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, n), requiredBlockSelectors_, incompatibleBlockSelectors_)) continue; + if( block->neighborExistsLocally( neighborIdx, n ) ) { for (auto& pi : packInfos_) @@ -866,19 +864,21 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, n)); for (auto& pi : packInfos_) - receiverInfo[COARSE_TO_FINE][fineLevel][nProcess] += - mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir)); - auto& headerBuffer = headerExchangeBs[COARSE_TO_FINE][fineLevel].sendBuffer(nProcess); + senderInfo[COARSE_TO_FINE][fineLevel][nProcess] += mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir)); + + auto& headerBuffer = headerExchangeBs[COARSE_TO_FINE][fineLevel]->sendBuffer(nProcess); receiverId.toBuffer(headerBuffer); senderId.toBuffer(headerBuffer); headerBuffer << *dir; + + receiverInfo[FINE_TO_COARSE][fineLevel].insert( nProcess ); } } else if (block->neighborhoodSectionHasLargerBlock(neighborIdx)) { WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1)) - const BlockID& receiverId = block->getNeighborId(neighborIdx, uint_t(0)); + if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_, incompatibleBlockSelectors_)) continue; @@ -886,18 +886,20 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) ) { for (auto& pi : packInfos_) - localBufferSize[FINE_TO_COARSE][nLevel] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); + localBufferSize[FINE_TO_COARSE][level] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); continue; } auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, uint_t(0))); for (auto& pi : packInfos_) - receiverInfo[FINE_TO_COARSE][nLevel][nProcess] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); + senderInfo[FINE_TO_COARSE][level][nProcess] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir)); - auto& headerBuffer = headerExchangeBs[FINE_TO_COARSE][nLevel].sendBuffer(nProcess); + auto& headerBuffer = headerExchangeBs[FINE_TO_COARSE][level]->sendBuffer(nProcess); receiverId.toBuffer(headerBuffer); senderId.toBuffer(headerBuffer); headerBuffer << *dir; + + receiverInfo[COARSE_TO_FINE][level].insert( nProcess ); } } } @@ -906,14 +908,12 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() { for (uint_t j = 0; j <= levels; ++j) { - headerExchangeBs[i][j].setReceiverInfoFromSendBufferState(false, true); - headerExchangeBs[i][j].sendAll(); - for (auto recvIter = headerExchangeBs[i][j].begin(); recvIter != headerExchangeBs[i][j].end(); ++recvIter) - { - auto& headerVector = headers_[i][j][recvIter.rank()]; - auto& buffer = recvIter.buffer(); - while (buffer.size()) - { + headerExchangeBs[i][j]->setReceiverInfo(receiverInfo[i][j], false); + headerExchangeBs[i][j]->sendAll(); + for (auto recvIter = headerExchangeBs[i][j]->begin(); recvIter != headerExchangeBs[i][j]->end(); ++recvIter) { + auto &headerVector = headers_[i][j][recvIter.rank()]; + auto &buffer = recvIter.buffer(); + while (buffer.size()) { Header header; header.receiverId.fromBuffer(buffer); header.senderId.fromBuffer(buffer); @@ -921,11 +921,9 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() headerVector.push_back(header); } } - - bufferSystemCPU_[i][j].setReceiverInfo(receiverInfo[i][j]); - bufferSystemGPU_[i][j].setReceiverInfo(receiverInfo[i][j]); - - for (auto it : receiverInfo[i][j]) + bufferSystemCPU_[i][j].setReceiverInfo(receiverInfo[i][j], false); + bufferSystemGPU_[i][j].setReceiverInfo(receiverInfo[i][j], false); + for (auto it : senderInfo[i][j]) { bufferSystemCPU_[i][j].sendBuffer(it.first).resize(size_t(it.second)); bufferSystemGPU_[i][j].sendBuffer(it.first).resize(size_t(it.second)); @@ -934,16 +932,16 @@ void NonUniformGPUScheme< Stencil >::setupCommunication() localBuffer_[i][j].resize(size_t(localBufferSize[i][j])); } } - forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); } template< typename Stencil > bool NonUniformGPUScheme< Stencil >::isAnyCommunicationInProgress() const { - for (auto caseIt = communicationInProgress_.begin(); caseIt != communicationInProgress_.end(); ++caseIt) - for (auto levelIt = caseIt->begin(); levelIt != caseIt->end(); ++levelIt) - if (*levelIt) return true; + const uint_t levels = uint_c(communicationInProgress_[0].size()); + for (uint_t i = 0; i != 3; ++i) + for (uint_t j = 0; j != levels; ++j) + if (communicationInProgress_[i][j]) return true; return false; } @@ -957,6 +955,11 @@ NonUniformGPUScheme< Stencil >::~NonUniformGPUScheme() waitCommunicateCoarseToFine(i); waitCommunicateFineToCoarse(i); } + + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i])) + } } template< typename Stencil > @@ -969,5 +972,4 @@ void NonUniformGPUScheme< Stencil >::addPackInfo(const shared_ptr< GeneratedNonU packInfos_.push_back(pi); setupCommunication(); } - } // namespace walberla::gpu::communication diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h index d6ac87010a6889b899380514ec51d717159bd6f8..96c514fcc4369084273c098ac9bf4ad21310ae29 100644 --- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h @@ -42,23 +42,23 @@ template< typename PdfField_T, bool inplace > class NonuniformGPUPackingKernelsWrapper { public: - void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const = 0; - void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const = 0; + void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream ) const = 0; + void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream ) const = 0; void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, - CellInterval dstInterval, gpuStream_t stream = nullptr) const = 0; + CellInterval dstInterval, gpuStream_t stream ) const = 0; - void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; - void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream ) const = 0; + void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream ) const = 0; void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField, CellInterval dstInterval, Direction dir, gpuStream_t stream) const = 0; void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, - stencil::Direction dir, gpuStream_t stream = nullptr) const = 0; + stencil::Direction dir, gpuStream_t stream ) const = 0; void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci, - unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + unsigned char* outBuffer, Direction dir, gpuStream_t stream ) const = 0; void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir) const = 0; - void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0; + void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream ) const = 0; uint_t size(CellInterval ci, Direction dir) const = 0; uint_t size(CellInterval ci) const = 0; @@ -239,9 +239,7 @@ template< typename PdfField_T > class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUniformGPUPackInfo { public: - using VoidFunction = std::function< void(gpuStream_t) >; using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification; - using PackingKernels_T = typename LatticeStorageSpecification_T::PackKernels; using Stencil = typename LatticeStorageSpecification_T::Stencil; using CommunicationStencil = typename LatticeStorageSpecification_T::CommunicationStencil; using CommData_T = NonuniformGPUCommData< LatticeStorageSpecification_T >; @@ -253,58 +251,53 @@ class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUnif bool threadsafeReceiving() const override { return false; }; /// Equal Level - void unpackDataEqualLevel(Block* receiver, Direction dir, GpuBuffer_T& buffer) override; + void unpackDataEqualLevel(Block* receiver, Direction dir, GpuBuffer_T& buffer, gpuStream_t stream) override; void communicateLocalEqualLevel(const Block* sender, Block* receiver, stencil::Direction dir, gpuStream_t stream) override; - void getLocalEqualLevelCommFunction(std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver, - stencil::Direction dir) override; /// Coarse to Fine void unpackDataCoarseToFine(Block* fineReceiver, const BlockID& coarseSender, stencil::Direction dir, - GpuBuffer_T& buffer) override; - void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) override; + GpuBuffer_T& buffer, gpuStream_t stream) override; + void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, gpuStream_t stream) override; void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T& buffer, gpuStream_t stream) override; - void getLocalCoarseToFineCommFunction(std::vector< VoidFunction >& commFunctions, const Block* coarseSender, - Block* fineReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override; /// Fine to Coarse void prepareCoalescence(Block* coarseReceiver, gpuStream_t gpuStream = nullptr); void unpackDataFineToCoarse(Block* coarseReceiver, const BlockID& fineSender, stencil::Direction dir, - GpuBuffer_T& buffer) override; + GpuBuffer_T& buffer, gpuStream_t stream) override; - void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir) override; + void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir, gpuStream_t stream) override; void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir, GpuBuffer_T& buffer, gpuStream_t stream) override; - void getLocalFineToCoarseCommFunction(std::vector< VoidFunction >& commFunctions, const Block* fineSender, - Block* coarseReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override; uint_t sizeEqualLevelSend(const Block* sender, stencil::Direction dir) override; uint_t sizeCoarseToFineSend(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir) override; + uint_t sizeCoarseToFineReceive ( Block* fineReceiver, stencil::Direction dir) override; uint_t sizeFineToCoarseSend(const Block* fineSender, stencil::Direction dir) override; protected: - void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T& buffer) const override; + void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T& buffer, gpuStream_t stream) const override; void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, - GpuBuffer_T& buffer) const override; + GpuBuffer_T& buffer, gpuStream_t stream) const override; void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, - GpuBuffer_T& buffer) const override; + GpuBuffer_T& buffer, gpuStream_t stream) const override; private: /// Helper Functions /// As in PdfFieldPackInfo.h Vector3< cell_idx_t > getNeighborShift(const BlockID& fineBlock, stencil::Direction dir) const; bool areNeighborsInDirection(const Block* block, const BlockID& neighborID, - const Vector3< cell_idx_t > dirVec) const; + Vector3< cell_idx_t > dirVec) const; - CellInterval intervalHullInDirection(const CellInterval& ci, const Vector3< cell_idx_t > tangentialDir, + CellInterval intervalHullInDirection(const CellInterval& ci, Vector3< cell_idx_t > tangentialDir, cell_idx_t width) const; - bool skipsThroughCoarseBlock(const Block* block, const Direction dir) const; + bool skipsThroughCoarseBlock(const Block* block, Direction dir) const; - void getCoarseBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + void getCoarseBlockCommIntervals(const BlockID& fineBlockID, Direction dir, const PdfField_T* field, std::vector< std::pair< Direction, CellInterval > >& intervals) const; - void getFineBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field, + void getFineBlockCommIntervals(const BlockID& fineBlockID, Direction dir, const PdfField_T* field, std::vector< std::pair< Direction, CellInterval > >& intervals) const; CellInterval getCoarseBlockCoalescenceInterval(const Block* coarseBlock, const BlockID& fineBlockID, Direction dir, @@ -324,7 +317,7 @@ class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUnif template< typename PdfField_T > std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > > setupNonuniformGPUPdfCommunication(const std::weak_ptr< StructuredBlockForest >& blocks, - const BlockDataID pdfFieldID, + BlockDataID pdfFieldID, const std::string& dataIdentifier = "NonuniformGPUCommData"); } // namespace walberla::lbm_generated diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h index adfbb419a8d3a3c82217fecf974977b28bb2a19b..987cebe9b2bfd343ed0277ed3faefef4dddaa753 100644 --- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h +++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h @@ -66,7 +66,7 @@ std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > > template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block* receiver, Direction dir, - GpuBuffer_T & buffer) + GpuBuffer_T & buffer, gpuStream_t stream) { auto field = receiver->getData< PdfField_T >(pdfFieldID_); CellInterval ci; @@ -74,7 +74,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block field->getGhostRegion(dir, ci, gls, false); uint_t size = kernels_.size(ci, dir); auto bufferPtr = buffer.advanceNoResize(size); - kernels_.unpackDirection(field, ci, bufferPtr, dir); + kernels_.unpackDirection(field, ci, bufferPtr, dir, stream); } template< typename PdfField_T> @@ -92,37 +92,10 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalEqualLevel kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, stream); } -template< typename PdfField_T> -void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalEqualLevelCommFunction( - std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver, - stencil::Direction dir) -{ - auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); - auto dstField = receiver->getData< PdfField_T >(pdfFieldID_); - - CellInterval srcRegion; - CellInterval dstRegion; - cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 2 : 1; - srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false); - dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false); - -// VoidFunction t = std::bind(kernels_.localCopyDirection, -// srcField, srcRegion, dstField, dstRegion, dir, std::placeholders::_1 ); - -// CellInterval test(srcRegion.min(), srcRegion.max()); -// CellInterval test2(dstRegion.min(), dstRegion.max()); - - - auto commFunction = [this, srcField, srcRegion, dstField, dstRegion, dir](gpuStream_t gpuStream) - { - kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, gpuStream); - }; - commFunctions.emplace_back(commFunction); -} template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( - const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const + const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const { auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); CellInterval ci; @@ -130,7 +103,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( field->getSliceBeforeGhostLayer(dir, ci, gls, false); uint_t size = kernels_.size(ci, dir); auto bufferPtr = buffer.advanceNoResize(size); - kernels_.packDirection(field, ci, bufferPtr, dir); + kernels_.packDirection(field, ci, bufferPtr, dir, stream); } /*********************************************************************************************************************** @@ -139,7 +112,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl( template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl( - const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const + const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const { auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); @@ -149,15 +122,14 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl( for (auto t : intervals) { CellInterval ci = t.second; - uint_t size = kernels_.size(ci); - auto bufferPtr = buffer.advanceNoResize(size); - kernels_.packAll(field, ci, bufferPtr); + auto bufferPtr = buffer.advanceNoResize(kernels_.size(ci)); + kernels_.packAll(field, ci, bufferPtr, stream); } } template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataCoarseToFine( - Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, GpuBuffer_T & buffer) + Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) { auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_); @@ -170,13 +142,13 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataCoarseToFine( CellInterval ci = t.second; uint_t size = kernels_.redistributeSize(ci); auto bufferPtr = buffer.advanceNoResize(size); - kernels_.unpackRedistribute(field, ci, bufferPtr, d); + kernels_.unpackRedistribute(field, ci, bufferPtr, d, stream); } } template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine( - const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) + const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, gpuStream_t stream) { auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); @@ -208,8 +180,8 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFi // TODO: This is a dirty workaround. Code-generate direct redistribution! unsigned char *buffer; WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize)) - kernels_.packAll(srcField, srcInterval, buffer); - kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir); + kernels_.packAll(srcField, srcInterval, buffer, stream); + kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir, stream); WALBERLA_GPU_CHECK(gpuFree(buffer)) } } @@ -251,50 +223,6 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFi } } -template< typename PdfField_T> -void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalCoarseToFineCommFunction( - std::vector< VoidFunction >& commFunctions, - const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) -{ - auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_); - auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_); - - std::vector< std::pair< Direction, CellInterval > > srcIntervals; - getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals); - - std::vector< std::pair< Direction, CellInterval > > dstIntervals; - getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals); - - WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size()) - - for(size_t index = 0; index < srcIntervals.size(); index++) - { - CellInterval srcInterval = srcIntervals[index].second; - - Direction const unpackDir = dstIntervals[index].first; - CellInterval dstInterval = dstIntervals[index].second; - - uint_t packSize = kernels_.size(srcInterval); - -#ifndef NDEBUG - Direction const packDir = srcIntervals[index].first; - WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir]) - uint_t unpackSize = kernels_.redistributeSize(dstInterval); - WALBERLA_ASSERT_EQUAL(packSize, unpackSize) -#endif - - auto bufferPtr = buffer.advanceNoResize(packSize); - - auto commFunction = [this, srcField, srcInterval, bufferPtr, dstField, dstInterval, unpackDir](gpuStream_t gpuStream) - { - kernels_.packAll(srcField, srcInterval, bufferPtr, gpuStream); - kernels_.unpackRedistribute(dstField, dstInterval, bufferPtr, unpackDir, gpuStream); - }; - commFunctions.emplace_back(commFunction); - } -} - - /*********************************************************************************************************************** * Fine to Coarse Communication * @@ -318,19 +246,19 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::prepareCoalescence(Block* template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataFineToCoarse( Block* coarseReceiver, const walberla::BlockID& fineSender, walberla::stencil::Direction dir, - GpuBuffer_T & buffer) + GpuBuffer_T & buffer, gpuStream_t stream) { auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); CellInterval ci = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender, dir, dstField); uint_t size = kernels_.size(ci, dir); unsigned char* bufferPtr = buffer.advanceNoResize(size); - kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir); + kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir, stream); } template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse( - const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir) + const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, gpuStream_t stream) { auto varFineSender = const_cast< Block * >(fineSender); auto srcField = varFineSender->getData< PdfField_T >(pdfFieldID_); @@ -354,8 +282,8 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar // TODO: This is a dirty workaround. Code-generate direct redistribution! unsigned char *buffer; WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize)) - kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir); - kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir); + kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir, stream); + kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir, stream); WALBERLA_GPU_CHECK(gpuFree(buffer)) } @@ -388,39 +316,6 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, stream); } -template< typename PdfField_T> -void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalFineToCoarseCommFunction( - std::vector< VoidFunction >& commFunctions, - const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, GpuBuffer_T & buffer) -{ - auto varFineSender = const_cast< Block * >(fineSender); - auto srcField = varFineSender->getData< PdfField_T >(pdfFieldID_); - auto srcCommData = varFineSender->getData< CommData_T >(commDataID_); - PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU()); - auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_); - Direction invDir = stencil::inverseDir[dir]; - - CellInterval srcInterval; - srcField->getGhostRegion(dir, srcInterval, 2); - uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir); - - CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(), - invDir, dstField); - -#ifndef NDEBUG - uint_t unpackSize = kernels_.size(dstInterval, invDir); - WALBERLA_ASSERT_EQUAL(packSize, unpackSize) -#endif - - auto bufferPtr = buffer.advanceNoResize(packSize); - auto commFunction = [this, srcField, maskField, srcInterval, bufferPtr, dir, dstField, dstInterval, invDir](gpuStream_t gpuStream) - { - kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, gpuStream); - kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, gpuStream); - }; - commFunctions.emplace_back(commFunction); -} - template< typename PdfField_T> uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeEqualLevelSend( const Block * sender, stencil::Direction dir) @@ -453,16 +348,35 @@ uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeCoarseToFineSend ( c return size; } +template< typename PdfField_T> +uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeCoarseToFineReceive ( Block* fineReceiver, stencil::Direction dir) +{ + + auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_); + + std::vector< std::pair< Direction, CellInterval > > intervals; + getFineBlockCommIntervals(fineReceiver->getId(), dir, field, intervals); + + uint_t size = 0; + for (auto t : intervals) + { + size += kernels_.redistributeSize(t.second); + } + WALBERLA_ASSERT_GREATER(size, 0) + return size; +} + template< typename PdfField_T> uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeFineToCoarseSend ( const Block * sender, stencil::Direction dir) { - auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_); + auto block = const_cast< Block* >(sender); + auto srcField = block->getData< PdfField_T >(pdfFieldID_); - CellInterval ci; - field->getGhostRegion(dir, ci, 2); - return kernels_.partialCoalescenceSize(ci, dir); + CellInterval ci; + srcField->getGhostRegion(dir, ci, 2); + return kernels_.partialCoalescenceSize(ci, dir); } @@ -470,7 +384,7 @@ uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeFineToCoarseSend ( c template< typename PdfField_T> void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl( const Block* fineSender, const walberla::BlockID& /*coarseReceiver*/, walberla::stencil::Direction dir, - GpuBuffer_T & buffer) const + GpuBuffer_T & buffer, gpuStream_t stream) const { auto varBlock = const_cast< Block* >(fineSender); auto srcField = varBlock->getData< PdfField_T >(pdfFieldID_); @@ -481,7 +395,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl( srcField->getGhostRegion(dir, ci, 2); uint_t size = kernels_.partialCoalescenceSize(ci, dir); auto bufferPtr = buffer.advanceNoResize(size); - kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir); + kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir, stream); } /*********************************************************************************************************************** diff --git a/src/mesh/blockforest/BlockForestInitialization.cpp b/src/mesh/blockforest/BlockForestInitialization.cpp index 91b702813a8c20d590bc4ab3a8213c53223b72cf..48a7edf06a08d1e0fe2ddb82c91bd4440bea4bf3 100644 --- a/src/mesh/blockforest/BlockForestInitialization.cpp +++ b/src/mesh/blockforest/BlockForestInitialization.cpp @@ -35,19 +35,16 @@ namespace walberla { namespace mesh { -static inline uint_t uintAbsDiff( const uint_t x, const uint_t y ) -{ - return x > y ? x - y : y - x; -} - -static inline void compareAABB( const AABB & oldAABB, const AABB & newAABB ) -{ - if(oldAABB != newAABB) - { - WALBERLA_LOG_INFO_ON_ROOT( "Domain AABB has been adapted to requested block size:\n" \ - "Old AABB: " << oldAABB << "\n" \ - "New AABB: " << newAABB << "\n" ) - } +namespace { + inline uint_t uintAbsDiff(const uint_t x, const uint_t y) {return x > y ? x - y : y - x;} + + inline void compareAABB(const AABB &oldAABB, const AABB &newAABB) { + if (oldAABB != newAABB) { + WALBERLA_LOG_INFO_ON_ROOT("Domain AABB has been adapted to requested block size:\n" \ + "Old AABB: " << oldAABB << "\n" \ + "New AABB: " << newAABB << "\n") + } + } } ComplexGeometryBlockforestCreator::ComplexGeometryBlockforestCreator( const AABB & aabb ) @@ -213,7 +210,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryBlockforestCreator::createSetupBlock shared_ptr<BlockForest> ComplexGeometryBlockforestCreator::createBlockForest( const uint_t targetNumRootBlocks ) const { - shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( targetNumRootBlocks ); + shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( targetNumRootBlocks ); auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest ); @@ -222,7 +219,7 @@ shared_ptr<BlockForest> ComplexGeometryBlockforestCreator::createBlockForest( co shared_ptr<BlockForest> ComplexGeometryBlockforestCreator::createBlockForest( const Vector3<real_t> & blockSize ) const { - shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( blockSize ); + shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( blockSize ); auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest ); @@ -304,7 +301,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::create { Vector3<uint_t> numCells = domainSizeCells(); - real_t domainVolume = real_c( numCells[0] ) * real_c( numCells[1] ) * real_c( numCells[2] ); + real_t const domainVolume = real_c( numCells[0] ) * real_c( numCells[1] ) * real_c( numCells[2] ); std::set< uint_t > blockSizeTested; @@ -437,7 +434,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::create shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::createSetupBlockForest( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const { - uint_t numProcesses = numBlocks[0] * numBlocks[1] * numBlocks[2]; + uint_t const numProcesses = numBlocks[0] * numBlocks[1] * numBlocks[2]; AABB newAABB(real_t(0), real_t(0), real_t(0), real_c(numBlocks[0] * cellsPerBlock[0]) * cellSize_[0], @@ -465,7 +462,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::create shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::createStructuredBlockForest( const uint_t targetNumRootBlocks ) const { - shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( targetNumRootBlocks ); + shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( targetNumRootBlocks ); Vector3< real_t > blockSize( setupBlockForest->getRootBlockXSize(), setupBlockForest->getRootBlockYSize(), @@ -488,7 +485,7 @@ shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::c shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::createStructuredBlockForest( const Vector3<uint_t> & blockSize ) const { - shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( blockSize ); + shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( blockSize ); auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest ); @@ -501,7 +498,7 @@ shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::c shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::createStructuredBlockForest( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const { - shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( cellsPerBlock, numBlocks ); + shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( cellsPerBlock, numBlocks ); auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest ); diff --git a/src/mesh/blockforest/BlockForestInitialization.h b/src/mesh/blockforest/BlockForestInitialization.h index 976e298b95def7a52e46b29c8887d1281c868b90..6ce1f3f9b2c8a96f4fcfe872bdd03154259a895f 100644 --- a/src/mesh/blockforest/BlockForestInitialization.h +++ b/src/mesh/blockforest/BlockForestInitialization.h @@ -55,10 +55,10 @@ public: void setPeriodicity( const Vector3< bool > & periodicity ) { periodicity_ = periodicity; } - shared_ptr<SetupBlockForest> createSetupBlockForest ( const uint_t targetNumRootBlocks, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; - shared_ptr<SetupBlockForest> createSetupBlockForest ( const Vector3<real_t> & blockSize, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; + shared_ptr<SetupBlockForest> createSetupBlockForest ( uint_t targetNumRootBlocks, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; + shared_ptr<SetupBlockForest> createSetupBlockForest ( const Vector3<real_t> & blockSize, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; - shared_ptr<BlockForest> createBlockForest ( const uint_t targetNumRootBlocks ) const; + shared_ptr<BlockForest> createBlockForest ( uint_t targetNumRootBlocks ) const; shared_ptr<BlockForest> createBlockForest ( const Vector3<real_t> & blockSize ) const; private: @@ -95,11 +95,11 @@ public: void setPeriodicity( const Vector3< bool > & periodicity ) { periodicity_ = periodicity; } - shared_ptr<SetupBlockForest> createSetupBlockForest ( const uint_t targetNumRootBlocks, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; - shared_ptr<SetupBlockForest> createSetupBlockForest ( const Vector3<uint_t> & blockSize, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; + shared_ptr<SetupBlockForest> createSetupBlockForest ( uint_t targetNumRootBlocks, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; + shared_ptr<SetupBlockForest> createSetupBlockForest ( const Vector3<uint_t> & blockSize, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const; shared_ptr<SetupBlockForest> createSetupBlockForest ( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const; - shared_ptr<StructuredBlockForest> createStructuredBlockForest( const uint_t targetNumRootBlocks ) const; + shared_ptr<StructuredBlockForest> createStructuredBlockForest( uint_t targetNumRootBlocks ) const; shared_ptr<StructuredBlockForest> createStructuredBlockForest( const Vector3<uint_t> & blockSize ) const; shared_ptr<StructuredBlockForest> createStructuredBlockForest( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const; diff --git a/src/mesh/boundary/BoundarySetup.cpp b/src/mesh/boundary/BoundarySetup.cpp index e4d443b5378f24f7986c29e6e7e2d901bf58c74d..7003fa77260d548d8db0a7e73aac29728e818ec5 100644 --- a/src/mesh/boundary/BoundarySetup.cpp +++ b/src/mesh/boundary/BoundarySetup.cpp @@ -24,55 +24,56 @@ #include "field/AddToStorage.h" #include "field/vtk/VTKWriter.h" -namespace walberla { -namespace mesh { +namespace walberla::mesh { - -BoundarySetup::BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, const uint_t numGhostLayers ) +BoundarySetup::BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, const uint_t numGhostLayers, const bool doRefinementCorrection) : structuredBlockStorage_( structuredBlockStorage ), distanceFunction_( distanceFunction ), numGhostLayers_( numGhostLayers ), cellVectorChunkSize_( size_t(1000) ) { voxelize(); - try { - auto & blockForest = dynamic_cast< StructuredBlockForest & >( *structuredBlockStorage_ ); - if( !blockForest.storesUniformBlockGrid() ) - refinementCorrection( blockForest ); - } - catch( const std::bad_cast & ) {} // If it's not a BlockForest no refinement correction is done + if (doRefinementCorrection) + { + try { + auto & blockForest = dynamic_cast< StructuredBlockForest & >( *structuredBlockStorage_ ); + if( !blockForest.storesUniformBlockGrid() ) + refinementCorrection( blockForest ); + } + catch( const std::bad_cast & ) {} // If it's not a BlockForest no refinement correction is done + } } void BoundarySetup::divideAndPushCellInterval( const CellInterval & ci, std::queue< CellInterval > & outputQueue ) { - WALBERLA_ASSERT( !ci.empty() ); + WALBERLA_ASSERT( !ci.empty() ) Cell newMax( ci.xMin() + std::max( cell_idx_c( ci.xSize() ) / cell_idx_t(2) - cell_idx_t(1), cell_idx_t(0) ), ci.yMin() + std::max( cell_idx_c( ci.ySize() ) / cell_idx_t(2) - cell_idx_t(1), cell_idx_t(0) ), ci.zMin() + std::max( cell_idx_c( ci.zSize() ) / cell_idx_t(2) - cell_idx_t(1), cell_idx_t(0) ) ); - WALBERLA_ASSERT( ci.contains( newMax ) ); + WALBERLA_ASSERT( ci.contains( newMax ) ) Cell newMin( newMax[0] + cell_idx_c( 1 ), newMax[1] + cell_idx_c( 1 ), newMax[2] + cell_idx_c( 1 ) ); - outputQueue.push( CellInterval( ci.xMin(), ci.yMin(), ci.zMin(), newMax[0], newMax[1], newMax[2] ) ); + outputQueue.emplace( ci.xMin(), ci.yMin(), ci.zMin(), newMax[0], newMax[1], newMax[2] ); if( newMin[2] <= ci.zMax() ) - outputQueue.push( CellInterval( ci.xMin(), ci.yMin(), newMin[2], newMax[0], newMax[1], ci.zMax() ) ); + outputQueue.emplace( ci.xMin(), ci.yMin(), newMin[2], newMax[0], newMax[1], ci.zMax() ); if( newMin[1] <= ci.yMax() ) { - outputQueue.push( CellInterval( ci.xMin(), newMin[1], ci.zMin(), newMax[0], ci.yMax(), newMax[2]) ); + outputQueue.emplace( ci.xMin(), newMin[1], ci.zMin(), newMax[0], ci.yMax(), newMax[2] ); if( newMin[2] <= ci.zMax() ) - outputQueue.push( CellInterval( ci.xMin(), newMin[1], newMin[2], newMax[0], ci.yMax(), ci.zMax() ) ); + outputQueue.emplace( ci.xMin(), newMin[1], newMin[2], newMax[0], ci.yMax(), ci.zMax() ); } if( newMin[0] <= ci.xMax() ) { - outputQueue.push( CellInterval( newMin[0], ci.yMin(), ci.zMin(), ci.xMax(), newMax[1], newMax[2] ) ); + outputQueue.emplace( newMin[0], ci.yMin(), ci.zMin(), ci.xMax(), newMax[1], newMax[2] ); if( newMin[2] <= ci.zMax() ) - outputQueue.push( CellInterval( newMin[0], ci.yMin(), newMin[2], ci.xMax(), newMax[1], ci.zMax() ) ); + outputQueue.emplace( newMin[0], ci.yMin(), newMin[2], ci.xMax(), newMax[1], ci.zMax() ); if( newMin[1] <= ci.yMax() ) { - outputQueue.push( CellInterval( newMin[0], newMin[1], ci.zMin(), ci.xMax(), ci.yMax(), newMax[2] ) ); + outputQueue.emplace( newMin[0], newMin[1], ci.zMin(), ci.xMax(), ci.yMax(), newMax[2] ); if( newMin[2] <= ci.zMax() ) - outputQueue.push( CellInterval( newMin[0], newMin[1], newMin[2], ci.xMax(), ci.yMax(), ci.zMax() ) ); + outputQueue.emplace( newMin[0], newMin[1], newMin[2], ci.xMax(), ci.yMax(), ci.zMax() ); } } } @@ -92,7 +93,7 @@ void BoundarySetup::allocateOrResetVoxelizationField() voxelizationFieldId_ = make_shared< BlockDataID >( field::addToStorage< VoxelizationField >( structuredBlockStorage_, "voxelization field", uint8_t(0), field::fzyx, numGhostLayers_ ) ); } - WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ ); + WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ ) } void BoundarySetup::deallocateVoxelizationField() @@ -112,8 +113,8 @@ void BoundarySetup::voxelize() { VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ); - WALBERLA_ASSERT_EQUAL( numGhostLayers_, voxelizationField->nrOfGhostLayers() ); + WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ) + WALBERLA_ASSERT_EQUAL( numGhostLayers_, voxelizationField->nrOfGhostLayers() ) CellInterval blockCi = voxelizationField->xyzSizeWithGhostLayer(); structuredBlockStorage_->transformBlockLocalToGlobalCellInterval( blockCi, block ); @@ -125,11 +126,11 @@ void BoundarySetup::voxelize() { const CellInterval & curCi = ciQueue.front(); - WALBERLA_ASSERT( !curCi.empty(), "Cell Interval: " << curCi ); + WALBERLA_ASSERT( !curCi.empty(), "Cell Interval: " << curCi ) - AABB curAABB = structuredBlockStorage_->getAABBFromCellBB( curCi, structuredBlockStorage_->getLevel( block ) ); + AABB const curAABB = structuredBlockStorage_->getAABBFromCellBB( curCi, structuredBlockStorage_->getLevel( block ) ); - WALBERLA_ASSERT( !curAABB.empty(), "AABB: " << curAABB ); + WALBERLA_ASSERT( !curAABB.empty(), "AABB: " << curAABB ) Vector3<real_t> cellCenter = curAABB.center(); structuredBlockStorage_->mapToPeriodicDomain( cellCenter ); @@ -170,7 +171,7 @@ void BoundarySetup::voxelize() continue; } - WALBERLA_ASSERT_GREATER( curCi.numCells(), uint_t(1) ); + WALBERLA_ASSERT_GREATER( curCi.numCells(), uint_t(1) ) divideAndPushCellInterval( curCi, ciQueue ); ciQueue.pop(); @@ -184,7 +185,7 @@ void BoundarySetup::refinementCorrection( StructuredBlockForest & blockForest ) || blockForest.getNumberOfYCellsPerBlock() < uint_t( 16 ) || blockForest.getNumberOfZCellsPerBlock() < uint_t( 16 ) ) { - WALBERLA_ABORT( "The mesh boundary setup requires a block size of at least 16 in each dimension, when refinement is used!" ); + WALBERLA_ABORT( "The mesh boundary setup requires a block size of at least 16 in each dimension, when refinement is used!" ) } for( auto & iBlock : blockForest ) @@ -253,12 +254,11 @@ void BoundarySetup::refinementCorrection( StructuredBlockForest & blockForest ) void BoundarySetup::writeVTKVoxelfile( const std::string & identifier, bool writeGhostLayers, const std::string & baseFolder, const std::string & executionFolder ) { - WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( structuredBlockStorage_ ); + WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ ) + WALBERLA_ASSERT_NOT_NULLPTR( structuredBlockStorage_ ) field::createVTKOutput< VoxelizationField >( *voxelizationFieldId_, *structuredBlockStorage_, identifier, uint_t(1), writeGhostLayers ? numGhostLayers_ : uint_t(0), false, baseFolder, executionFolder )(); } -} // namespace mesh -} // namespace walberla \ No newline at end of file +} // namespace walberla::mesh \ No newline at end of file diff --git a/src/mesh/boundary/BoundarySetup.h b/src/mesh/boundary/BoundarySetup.h index 6486bb75d88667e2e560d6dbada4e91df2046828..1993738499166006a8ef0a9eaa50589192709632 100644 --- a/src/mesh/boundary/BoundarySetup.h +++ b/src/mesh/boundary/BoundarySetup.h @@ -41,8 +41,7 @@ #include <queue> -namespace walberla { -namespace mesh { +namespace walberla::mesh { class BoundarySetup { @@ -53,23 +52,23 @@ public: enum Location { INSIDE, OUTSIDE }; - BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, const uint_t numGhostLayers ); + BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, uint_t numGhostLayers, bool doRefinementCorrection = true ); ~BoundarySetup() { deallocateVoxelizationField(); } template< typename BoundaryHandlingType > - void setDomainCells( const BlockDataID boundaryHandlingID, const Location domainLocation ); + void setDomainCells( BlockDataID boundaryHandlingID, Location domainLocation ); template< typename BoundaryHandlingType, typename BoundaryFunction, typename Stencil = stencil::D3Q27 > - void setBoundaries( const BlockDataID boundaryHandlingID, const BoundaryFunction & boundaryFunction, Location boundaryLocation ); + void setBoundaries( BlockDataID boundaryHandlingID, const BoundaryFunction & boundaryFunction, Location boundaryLocation ); template<typename FlagField_T> - void setFlag( const BlockDataID flagFieldID, field::FlagUID flagUID, Location boundaryLocation ); + void setFlag( BlockDataID flagFieldID, field::FlagUID flagUID, Location boundaryLocation ); // set flag with FlagUID wherever boundaryUID is set template< typename FlagField_T, typename BoundaryFunction, typename Stencil = stencil::D3Q27 > - void setBoundaryFlag(const BlockDataID flagFieldID, field::FlagUID flagUID, boundary::BoundaryUID boundaryUID, + void setBoundaryFlag(BlockDataID flagFieldID, field::FlagUID flagUID, boundary::BoundaryUID boundaryUID, const BoundaryFunction& boundaryFunction, Location boundaryLocation); void writeVTKVoxelfile( const std::string & identifier = "voxelization", bool writeGhostLayers = false, const std::string & baseFolder = std::string("vtk_out"), @@ -100,12 +99,12 @@ void BoundarySetup::setDomainCells( const BlockDataID boundaryHandlingId, const for( auto & block : *structuredBlockStorage_ ) { BoundaryHandlingType * boundaryHandling = block.getData< BoundaryHandlingType >( boundaryHandlingId ); - WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" ); + WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" ) const VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ); + WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ) WALBERLA_CHECK_LESS_EQUAL( numGhostLayers_, boundaryHandling->getFlagField()->nrOfGhostLayers(), "You want to use mesh boundary setup with " \ - << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" ); + << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" ) const uint8_t domainValue = domainLocation == INSIDE ? uint8_t(1) : uint8_t(0); @@ -132,13 +131,13 @@ void BoundarySetup::setFlag( const BlockDataID flagFieldID, field::FlagUID flagU for( auto & block : *structuredBlockStorage_ ) { FlagField_T * flagField = block.getData< FlagField_T >( flagFieldID ); - WALBERLA_CHECK_NOT_NULLPTR( flagField, "flagFieldID invalid!" ); + WALBERLA_CHECK_NOT_NULLPTR( flagField, "flagFieldID invalid!" ) auto flag = flagField->getFlag(flagUID); const VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ); + WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ) WALBERLA_CHECK_LESS_EQUAL( numGhostLayers_, flagField->nrOfGhostLayers(), "You want to use mesh boundary setup with " \ - << numGhostLayers_ << " but your flag field has only " << flagField->nrOfGhostLayers() << " ghost layers!" ); + << numGhostLayers_ << " but your flag field has only " << flagField->nrOfGhostLayers() << " ghost layers!" ) const uint8_t domainValue = boundaryLocation == INSIDE ? uint8_t(0) : uint8_t(1); @@ -156,15 +155,15 @@ void BoundarySetup::setBoundaryFlag(const BlockDataID flagFieldID, field::FlagUI for (auto& block : *structuredBlockStorage_) { FlagField_T* flagField = block.getData< FlagField_T >(flagFieldID); - WALBERLA_CHECK_NOT_NULLPTR(flagField, "flagFieldID invalid!"); + WALBERLA_CHECK_NOT_NULLPTR(flagField, "flagFieldID invalid!") auto flag = flagField->getFlag(flagUID); const VoxelizationField* voxelizationField = block.getData< VoxelizationField >(*voxelizationFieldId_); - WALBERLA_ASSERT_NOT_NULLPTR(voxelizationField); + WALBERLA_ASSERT_NOT_NULLPTR(voxelizationField) WALBERLA_CHECK_LESS_EQUAL(numGhostLayers_, flagField->nrOfGhostLayers(), "You want to use mesh boundary setup with " << numGhostLayers_ << " but your flag field has only " - << flagField->nrOfGhostLayers() << " ghost layers!"); + << flagField->nrOfGhostLayers() << " ghost layers!") // get where the (fluid) domain is located (on the inside/outside of the mesh) const uint8_t domainValue = boundaryLocation == INSIDE ? uint8_t(0) : uint8_t(1); @@ -211,12 +210,12 @@ void BoundarySetup::setBoundaries( const BlockDataID boundaryHandlingID, const B for( auto & block : *structuredBlockStorage_ ) { BoundaryHandlingType * boundaryHandling = block.getData< BoundaryHandlingType >( boundaryHandlingID ); - WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" ); + WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" ) const VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ ); - WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ); + WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField ) WALBERLA_CHECK_LESS_EQUAL( numGhostLayers_, boundaryHandling->getFlagField()->nrOfGhostLayers(), "You want to use mesh boundary setup with " \ - << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" ); + << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" ) const uint8_t domainValue = boundaryLocation == INSIDE ? uint8_t(0) : uint8_t(1); @@ -244,5 +243,4 @@ void BoundarySetup::setBoundaries( const BlockDataID boundaryHandlingID, const B } } -} // namespace mesh -} // namespace walberla \ No newline at end of file +} // namespace walberla::mesh \ No newline at end of file