diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/cuda/communication/UniformGPUScheme.h
index dcd93210ae137e8c9afadcbd6d735e460bbf6447..173cfcc4c44166f7ff05e8963fbe7135123aba40 100644
--- a/src/cuda/communication/UniformGPUScheme.h
+++ b/src/cuda/communication/UniformGPUScheme.h
@@ -48,6 +48,12 @@ namespace communication {
                                  bool sendDirectlyFromGPU = false,
                                  const int tag = 5432 );
 
+      explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
+                                 const Set<SUID> & requiredBlockSelectors,
+                                 const Set<SUID> & incompatibleBlockSelectors,
+                                 bool sendDirectlyFromGPU = false,
+                                 const int tag = 5432 );
+
       void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
       void startCommunication( cudaStream_t stream = nullptr);
@@ -82,6 +88,9 @@ namespace communication {
          stencil::Direction dir;
       };
       std::map<mpi::MPIRank, std::vector<Header> > headers_;
+
+      Set<SUID> requiredBlockSelectors_;
+      Set<SUID> incompatibleBlockSelectors_;
   };
diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/cuda/communication/UniformGPUScheme.impl.h
index fa8e0c2aa7289e67280761d94ee8d98b79f3cf0b..089f03e78ec30b5ff7d2ca451ba0f82e41bcc0c8 100644
--- a/src/cuda/communication/UniformGPUScheme.impl.h
+++ b/src/cuda/communication/UniformGPUScheme.impl.h
@@ -26,17 +26,36 @@ namespace cuda {
 namespace communication {
 
-template<typename Stencil>
-UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
-                                             bool sendDirectlyFromGPU,
-                                             const int tag )
+   template<typename Stencil>
+   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+                                                bool sendDirectlyFromGPU,
+                                                const int tag )
      : blockForest_( bf ),
        setupBeforeNextCommunication_( true ),
        communicationInProgress_( false ),
        sendFromGPU_( sendDirectlyFromGPU ),
        bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
        bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-       parallelSectionManager_( -1 )
+       parallelSectionManager_( -1 ),
+       requiredBlockSelectors_( Set<SUID>::emptySet() ),
+       incompatibleBlockSelectors_( Set<SUID>::emptySet() )
+   {}
+
+   template<typename Stencil>
+   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+                                                const Set<SUID> & requiredBlockSelectors,
+                                                const Set<SUID> & incompatibleBlockSelectors,
+                                                bool sendDirectlyFromGPU,
+                                                const int tag )
+   : blockForest_( bf ),
+     setupBeforeNextCommunication_( true ),
+     communicationInProgress_( false ),
+     sendFromGPU_( sendDirectlyFromGPU ),
+     bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
+     bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
+     parallelSectionManager_( -1 ),
+     requiredBlockSelectors_( requiredBlockSelectors ),
+     incompatibleBlockSelectors_( incompatibleBlockSelectors )
    {}
@@ -67,6 +86,10 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
          for( auto &iBlock : *forest )
          {
             auto block = dynamic_cast< Block * >( &iBlock );
+
+            if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
+               continue;
+
             for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
             {
                const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
@@ -74,6 +97,9 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
                   continue;
 
                auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
+               if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
+                  continue;
+
               for( auto &pi : packInfos_ )
               {
                  parallelSection.run([&](auto s) {
@@ -183,6 +209,9 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
       for( auto &iBlock : *forest )
       {
          auto block = dynamic_cast< Block * >( &iBlock );
+         if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
+            continue;
+
          for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
          {
            // skip if block has no neighbors in this direction
            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
@@ -195,6 +224,10 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
                           "Works for uniform setups only" )
 
            const BlockID &nBlockId = block->getNeighborId( neighborIdx, uint_t( 0 ));
+
+           if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
+              continue;
+
            auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
 
            for( auto &pi : packInfos_ )
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index d301b7eb5ccc1fd707cdb5b40d9ecd6f1b63a836..aae265a7b70a4d09aeb8530a193ee743c5398c56 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -12,6 +12,9 @@ waLBerla_execute_test( NAME GPUPackInfoTest )
 waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
 waLBerla_execute_test( NAME GPUPackInfoCommunicationTest )
 
+waLBerla_compile_test( FILES communication/GPUBlockSelectorCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
+waLBerla_execute_test( NAME GPUBlockSelectorCommunicationTest )
+
 waLBerla_compile_test( FILES FieldTransferTest )
 waLBerla_execute_test( NAME FieldTransferTest )
 
diff --git a/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp b/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4f5f140551ffaf9a995ab0dfe257fb89a19a188
--- /dev/null
+++ b/tests/cuda/communication/GPUBlockSelectorCommunicationTest.cpp
@@ -0,0 +1,189 @@
+//========================================================================================================================
+//
+// This file is part of waLBerla. waLBerla is free software: you can
+// redistribute it and/or modify it under the terms of the GNU General Public
+// License as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+//
+// waLBerla is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GPUBlockSelectorCommunicationTest.cpp
+//! \ingroup cuda
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//! \brief Short communication test for the usage of block selectors in UniformGPUScheme.
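+//! \details Sets up a 3x1x1 block grid whose x-min block carries the incompatible
+//!          selector while all other blocks carry the required one, runs a single
+//!          communication step, and verifies that the middle block's ghost layers
+//!          are updated only from the selected right-hand neighbor.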
+//
+//========================================================================================================================
+
+#include <blockforest/GlobalLoadBalancing.h>
+#include <blockforest/Initialization.h>
+#include <blockforest/SetupBlockForest.h>
+#include <core/DataTypes.h>
+#include <core/debug/TestSubsystem.h>
+#include <core/math/Random.h>
+#include <core/Environment.h>
+#include <cuda/AddGPUFieldToStorage.h>
+#include <cuda/ErrorChecking.h>
+#include <cuda/FieldCopy.h>
+#include <cuda/GPUField.h>
+#include <cuda/communication/MemcpyPackInfo.h>
+#include <cuda/communication/UniformGPUScheme.h>
+#include <cuda_runtime.h>
+#include <domain_decomposition/BlockDataID.h>
+#include <field/AddToStorage.h>
+#include <field/GhostLayerField.h>
+#include <stencil/D3Q27.h>
+#include <stencil/Directions.h>
+#include <stencil/Iterator.h>
+#include <vector>
+
+namespace walberla
+{
+using Type_T = int;
+
+using Stencil_T = stencil::D3Q27;
+using ScalarField_T = field::GhostLayerField< Type_T, 1 >;
+using GPUScalarField_T = cuda::GPUField< Type_T >;
+
+const Set< SUID > requiredBlockSelector("communication");
+const Set< SUID > incompatibleBlockSelector("no communication");
+
+void suidAssignmentFunction( blockforest::SetupBlockForest & forest ) {
+
+   for( auto & sblock : forest ) {
+      if( forest.atDomainXMinBorder( sblock ) ) {
+         sblock.addState(incompatibleBlockSelector);
+      } else {
+         sblock.addState(requiredBlockSelector);
+      }
+      sblock.setWorkload(walberla::numeric_cast<walberla::workload_t>(1));
+   }
+}
+
+void initScalarField(std::shared_ptr< StructuredBlockForest >& blocks, const BlockDataID& fieldID)
+{
+   for (auto& block : *blocks)
+   {
+      Type_T val;
+      if (blocks->atDomainXMinBorder(block)) {
+         val = Type_T(-1);
+      } else if (blocks->atDomainXMaxBorder(block)) {
+         val = Type_T(1);
+      } else {
+         val = Type_T(0);
+      }
+
+      auto* field = block.getData< ScalarField_T >(fieldID);
+      WALBERLA_ASSERT_NOT_NULLPTR(field)
+
+      const auto cells = field->xyzSizeWithGhostLayer();
+
+      for (auto cell : cells)
+      {
+         field->get(cell) = val;
+      }
+   }
+}
+
+std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid (
+   const uint_t numberOfXBlocks, const uint_t numberOfYBlocks, const uint_t numberOfZBlocks,
+   const uint_t numberOfXCellsPerBlock, const uint_t numberOfYCellsPerBlock, const uint_t numberOfZCellsPerBlock,
+   const real_t dx,
+   const bool xPeriodic, const bool yPeriodic, const bool zPeriodic,
+   const bool keepGlobalBlockInformation )
+{
+   // initialize SetupBlockForest = determine domain decomposition
+
+   SetupBlockForest sforest;
+
+   sforest.addWorkloadMemorySUIDAssignmentFunction(suidAssignmentFunction);
+
+   AABB domainAABB{ real_c(0), real_c(0), real_c(0),
+                    dx * real_c( numberOfXBlocks * numberOfXCellsPerBlock ),
+                    dx * real_c( numberOfYBlocks * numberOfYCellsPerBlock ),
+                    dx * real_c( numberOfZBlocks * numberOfZCellsPerBlock ) };
+   sforest.init(domainAABB, numberOfXBlocks, numberOfYBlocks, numberOfZBlocks, xPeriodic, yPeriodic, zPeriodic);
+
+   // calculate process distribution
+
+   const memory_t memoryLimit = numeric_cast< memory_t >(sforest.getNumberOfBlocks());
+
+   blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig(
+      true, false,
+      std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2,
+                numberOfXCellsPerBlock, numberOfYCellsPerBlock, numberOfZCellsPerBlock));
+
+   sforest.calculateProcessDistribution_Default(uint_c(MPIManager::instance()->numProcesses()), memoryLimit, "hilbert",
+                                                10, false, metisConfig);
+
+   // fall back to the world communicator if the load balancing has not set up valid process ranks
+   if (!MPIManager::instance()->rankValid()) MPIManager::instance()->useWorldComm();
+
+   // create StructuredBlockForest (encapsulates a newly created BlockForest)
+
+   auto bf =
+      std::make_shared< BlockForest >(uint_c(MPIManager::instance()->rank()), sforest, keepGlobalBlockInformation);
+
+   auto sbf = std::make_shared< StructuredBlockForest >(bf, numberOfXCellsPerBlock, numberOfYCellsPerBlock,
+                                                        numberOfZCellsPerBlock);
+   sbf->createCellBoundingBoxes();
+
+   return sbf;
+}
+
+int main(int argc, char** argv)
+{
+   debug::enterTestMode();
+   walberla::Environment walberlaEnv(argc, argv);
+
+   const Vector3<uint_t> nBlocks { 3, 1, 1 };
+   const Vector3<uint_t> cells { 2, 2, 1 };
+   Vector3<real_t> domainSize;
+   for( uint_t d = 0; d < 3; ++d ) {
+      domainSize[d] = real_c(cells[d] * nBlocks[d]);
+   }
+
+   auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2],
+                                         cells[0], cells[1], cells[2], real_c(1), false, true, true, true);
+
+   BlockDataID fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1));
+   initScalarField(blocks, fieldID);
+
+   BlockDataID gpuFieldID = cuda::addGPUFieldToStorage< ScalarField_T >(blocks, fieldID, "GPU scalar");
+
+   // Set up the communication scheme with a MemcpyPackInfo
+   cuda::communication::UniformGPUScheme< Stencil_T > communication(blocks, requiredBlockSelector, incompatibleBlockSelector);
+   communication.addPackInfo(std::make_shared< cuda::communication::MemcpyPackInfo< GPUScalarField_T > >(gpuFieldID));
+
+   // Perform one communication step
+   communication();
+
+   // Copy to CPU
+   cuda::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID );
+
+   // Check for correct data in the ghost layers of the middle block
+   auto middleBlock = blocks->getBlock( domainSize[0] / real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2) );
+   auto cpuField = middleBlock->getData<ScalarField_T>(fieldID);
+   WALBERLA_ASSERT_NOT_NULLPTR(cpuField)
+
+   // avoid unused variable warning in release mode
+   (void) cpuField;
+
+   // check for missing communication with the left neighbor (first block, incompatible selector)
+   WALBERLA_ASSERT_EQUAL(cpuField->get(-1, 0, 0), 0, "Communication with left neighbor detected.")
+   WALBERLA_ASSERT_EQUAL(cpuField->get(-1, 1, 0), 0, "Communication with left neighbor detected.")
+
+   // check for correct communication with the right neighbor (third block, required selector)
+   WALBERLA_ASSERT_EQUAL(cpuField->get(cell_idx_t(cells[0]), 0, 0), 1, "No communication with right neighbor detected.")
+   WALBERLA_ASSERT_EQUAL(cpuField->get(cell_idx_t(cells[0]), 1, 0), 1, "No communication with right neighbor detected.")
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
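
Usage sketch (illustration only, not part of the patch): how an application might drive the
selector-aware scheme added above. The names `communicateSelectedBlocks`, `blocks` and
`gpuFieldID` are assumptions for this sketch; the constructor, addPackInfo() and operator()
calls are the ones exercised by the test.

#include <blockforest/StructuredBlockForest.h>
#include <cuda/GPUField.h>
#include <cuda/communication/MemcpyPackInfo.h>
#include <cuda/communication/UniformGPUScheme.h>
#include <domain_decomposition/BlockDataID.h>
#include <stencil/D3Q27.h>
#include <memory>

namespace walberla {

// `blocks` and `gpuFieldID` come from the application setup; block states are
// assumed to be assigned as in the test above.
void communicateSelectedBlocks( const std::shared_ptr< StructuredBlockForest > & blocks,
                                const BlockDataID & gpuFieldID )
{
   const Set< SUID > required( "communication" );        // a block must carry this state ...
   const Set< SUID > incompatible( "no communication" ); // ... and must not carry this one

   cuda::communication::UniformGPUScheme< stencil::D3Q27 > scheme( blocks, required, incompatible );
   scheme.addPackInfo( std::make_shared< cuda::communication::MemcpyPackInfo< cuda::GPUField< int > > >( gpuFieldID ) );

   scheme();   // one ghost-layer exchange; de-selected blocks and their neighbors are skipped
}

} // namespace walberla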