Commit e3cc06cd authored by Helen Schottenhamml, committed by Markus Holzer

GPU Communication Block Selectors

parent 7fb0d512
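
This commit adds an overload of the cuda::communication::UniformGPUScheme constructor that takes a set of required and a set of incompatible block selectors (SUIDs). During setup and packing, blocks whose state, or whose neighbor's state, does not pass the selector check are skipped, so only selected block pairs exchange ghost-layer data. The existing constructor keeps its behavior by defaulting both sets to Set<SUID>::emptySet(), and a new test (GPUBlockSelectorCommunicationTest) exercises the feature.

A minimal usage sketch of the new overload — communicateSelectedBlocks is a hypothetical helper, gpuFieldID is a caller-provided BlockDataID, and the selector names are taken from the test at the bottom of this diff:

#include <blockforest/StructuredBlockForest.h>
#include <core/DataTypes.h>
#include <cuda/GPUField.h>
#include <cuda/communication/MemcpyPackInfo.h>
#include <cuda/communication/UniformGPUScheme.h>
#include <domain_decomposition/BlockDataID.h>
#include <stencil/D3Q27.h>
#include <memory>

namespace walberla {

// Hypothetical helper: one ghost-layer exchange restricted to blocks whose
// state contains "communication" and does not contain "no communication".
void communicateSelectedBlocks( const std::shared_ptr< StructuredBlockForest > & blocks,
                                const BlockDataID & gpuFieldID )
{
   const Set< SUID > required( "communication" );
   const Set< SUID > incompatible( "no communication" );

   cuda::communication::UniformGPUScheme< stencil::D3Q27 > scheme( blocks, required, incompatible );
   scheme.addPackInfo( std::make_shared< cuda::communication::MemcpyPackInfo< cuda::GPUField< real_t > > >( gpuFieldID ) );
   scheme(); // pack, exchange, unpack; non-selected blocks do not participate
}

} // namespace walberla
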
@@ -48,6 +48,12 @@ namespace communication {
bool sendDirectlyFromGPU = false,
const int tag = 5432 );
explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
const Set<SUID> & requiredBlockSelectors,
const Set<SUID> & incompatibleBlockSelectors,
bool sendDirectlyFromGPU = false,
const int tag = 5432 );
void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
void startCommunication( cudaStream_t stream = nullptr);
@@ -82,6 +88,9 @@ namespace communication {
stencil::Direction dir;
};
std::map<mpi::MPIRank, std::vector<Header> > headers_;
Set<SUID> requiredBlockSelectors_;
Set<SUID> incompatibleBlockSelectors_;
};
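
Both constructors feed these two sets into selectable::isSetSelected when iterating over blocks and their neighbors (see the implementation changes below). The rule: a block state passes when it contains every required selector and none of the incompatible ones; with both sets empty, every block passes, which preserves the previous behavior. A self-contained model of the predicate, using std::set of strings in place of walberla::Set<SUID> (an illustrative sketch, not the library code):

#include <algorithm>
#include <set>
#include <string>

// Illustrative model of selectable::isSetSelected: a state passes when it
// contains every required selector and none of the incompatible ones.
bool isSelectedModel( const std::set< std::string > & state,
                      const std::set< std::string > & required,
                      const std::set< std::string > & incompatible )
{
   const bool hasAllRequired    = std::includes( state.begin(), state.end(),
                                                 required.begin(), required.end() );
   const bool hasNoIncompatible = std::none_of( incompatible.begin(), incompatible.end(),
                                                [&state]( const std::string & s ) { return state.count( s ) > 0; } );
   return hasAllRequired && hasNoIncompatible;
}
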
@@ -26,17 +26,36 @@ namespace cuda {
namespace communication {
template<typename Stencil>
UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
bool sendDirectlyFromGPU,
const int tag )
template<typename Stencil>
UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
bool sendDirectlyFromGPU,
const int tag )
: blockForest_( bf ),
setupBeforeNextCommunication_( true ),
communicationInProgress_( false ),
sendFromGPU_( sendDirectlyFromGPU ),
bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
parallelSectionManager_( -1 )
parallelSectionManager_( -1 ),
requiredBlockSelectors_( Set<SUID>::emptySet() ),
incompatibleBlockSelectors_( Set<SUID>::emptySet() )
{}
template<typename Stencil>
UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
const Set<SUID> & requiredBlockSelectors,
const Set<SUID> & incompatibleBlockSelectors,
bool sendDirectlyFromGPU,
const int tag )
: blockForest_( bf ),
setupBeforeNextCommunication_( true ),
communicationInProgress_( false ),
sendFromGPU_( sendDirectlyFromGPU ),
bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
parallelSectionManager_( -1 ),
requiredBlockSelectors_( requiredBlockSelectors ),
incompatibleBlockSelectors_( incompatibleBlockSelectors )
{}
@@ -67,6 +86,10 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
for( auto &iBlock : *forest )
{
auto block = dynamic_cast< Block * >( &iBlock );
if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
continue;
for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
{
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
@@ -74,6 +97,9 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
continue;
auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
continue;
for( auto &pi : packInfos_ )
{
parallelSection.run([&](auto s) {
@@ -183,6 +209,9 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
for( auto &iBlock : *forest ) {
auto block = dynamic_cast< Block * >( &iBlock );
if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
continue;
for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) {
// skip if block has no neighbors in this direction
const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
@@ -195,6 +224,10 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
"Works for uniform setups only" )
const BlockID &nBlockId = block->getNeighborId( neighborIdx, uint_t( 0 ));
if( !selectable::isSetSelected( block->getNeighborState( neighborIdx, uint_t(0) ), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
continue;
auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
for( auto &pi : packInfos_ )
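Note that the guard is applied symmetrically in both of the loops patched above: a communication link is skipped unless the local block's state and the neighbor block's state both pass the selector check, so ghost-layer data moves only between pairs of selected blocks.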
@@ -12,6 +12,9 @@ waLBerla_execute_test( NAME GPUPackInfoTest )
waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
waLBerla_execute_test( NAME GPUPackInfoCommunicationTest )
waLBerla_compile_test( FILES communication/GPUBlockSelectorCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
waLBerla_execute_test( NAME GPUBlockSelectorCommunicationTest )
waLBerla_compile_test( FILES FieldTransferTest )
waLBerla_execute_test( NAME FieldTransferTest )
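
In a CUDA-enabled build, the new test is registered with CTest under the name GPUBlockSelectorCommunicationTest, alongside the existing GPU pack-info tests. Its full source follows.
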
//========================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GPUBlockSelectorCommunicationTest.cpp
//! \ingroup cuda
//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
//! \brief Short communication test for the usage of block selectors in UniformGPUScheme.
//
//========================================================================================================================
#include <blockforest/GlobalLoadBalancing.h>
#include <blockforest/Initialization.h>
#include <blockforest/SetupBlockForest.h>
#include <core/DataTypes.h>
#include <core/debug/TestSubsystem.h>
#include <core/math/Random.h>
#include <core/Environment.h>
#include <cuda/AddGPUFieldToStorage.h>
#include <cuda/ErrorChecking.h>
#include <cuda/FieldCopy.h>
#include <cuda/GPUField.h>
#include <cuda/communication/MemcpyPackInfo.h>
#include <cuda/communication/UniformGPUScheme.h>
#include <cuda_runtime.h>
#include <domain_decomposition/BlockDataID.h>
#include <field/AddToStorage.h>
#include <field/GhostLayerField.h>
#include <stencil/D3Q27.h>
#include <stencil/Directions.h>
#include <stencil/Iterator.h>
#include <vector>
namespace walberla
{
using Type_T = int;
using Stencil_T = stencil::D3Q27;
using ScalarField_T = field::GhostLayerField< Type_T, 1 >;
using GPUScalarField_T = cuda::GPUField< Type_T >;
const Set< SUID > requiredBlockSelector("communication");
const Set< SUID > incompatibleBlockSelector("no communication");
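// Tag every SetupBlock: blocks touching the domain's x-min border are marked
// "no communication" (the incompatible selector), all others "communication"
// (the required selector).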
void suidAssignmentFunction( blockforest::SetupBlockForest & forest ) {
for( auto & sblock : forest ) {
if( forest.atDomainXMinBorder( sblock ) ) {
sblock.addState(incompatibleBlockSelector);
} else {
sblock.addState(requiredBlockSelector);
}
sblock.setWorkload(walberla::numeric_cast<walberla::workload_t>(1));
}
}
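// Fill the CPU field per block, including ghost layers: -1 in x-min border
// blocks, +1 in x-max border blocks, 0 in the interior.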
void initScalarField(std::shared_ptr< StructuredBlockForest >& blocks, const BlockDataID& fieldID)
{
for (auto& block : *blocks)
{
Type_T val;
if (blocks->atDomainXMinBorder(block)) {
val = Type_T(-1);
} else if (blocks->atDomainXMaxBorder(block)) {
val = Type_T(1);
} else {
val = Type_T(0);
}
auto* field = block.getData< ScalarField_T >(fieldID);
WALBERLA_ASSERT_NOT_NULLPTR(field)
const auto cells = field->xyzSizeWithGhostLayer();
for (auto cell : cells)
{
field->get(cell) = val;
}
}
}
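// Manually set up a StructuredBlockForest (instead of using
// blockforest::createUniformBlockGrid) so that the SUID assignment function
// above can tag the blocks before load balancing.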
std::shared_ptr< StructuredBlockForest > createSelectorBlockGrid (
const uint_t numberOfXBlocks, const uint_t numberOfYBlocks, const uint_t numberOfZBlocks,
const uint_t numberOfXCellsPerBlock, const uint_t numberOfYCellsPerBlock, const uint_t numberOfZCellsPerBlock,
const real_t dx,
const bool xPeriodic, const bool yPeriodic, const bool zPeriodic,
const bool keepGlobalBlockInformation )
{
// initialize SetupBlockForest = determine domain decomposition
SetupBlockForest sforest;
sforest.addWorkloadMemorySUIDAssignmentFunction(suidAssignmentFunction);
AABB domainAABB{ real_c(0), real_c(0), real_c(0),
dx * real_c( numberOfXBlocks * numberOfXCellsPerBlock ),
dx * real_c( numberOfYBlocks * numberOfYCellsPerBlock ),
dx * real_c( numberOfZBlocks * numberOfZCellsPerBlock ) };
sforest.init(domainAABB, numberOfXBlocks, numberOfYBlocks, numberOfZBlocks, xPeriodic, yPeriodic, zPeriodic);
// calculate process distribution
const memory_t memoryLimit = numeric_cast< memory_t >(sforest.getNumberOfBlocks());
blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig(
true, false,
std::bind(blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2, numberOfXCellsPerBlock,
numberOfYCellsPerBlock, numberOfZCellsPerBlock));
sforest.calculateProcessDistribution_Default(uint_c(MPIManager::instance()->numProcesses()), memoryLimit, "hilbert",
10, false, metisConfig);
if (!MPIManager::instance()->rankValid()) MPIManager::instance()->useWorldComm();
// create StructuredBlockForest (encapsulates a newly created BlockForest)
auto bf =
std::make_shared< BlockForest >(uint_c(MPIManager::instance()->rank()), sforest, keepGlobalBlockInformation);
auto sbf = std::make_shared< StructuredBlockForest >(bf, numberOfXCellsPerBlock, numberOfYCellsPerBlock,
numberOfZCellsPerBlock);
sbf->createCellBoundingBoxes();
return sbf;
}
int main(int argc, char** argv)
{
debug::enterTestMode();
walberla::Environment walberlaEnv(argc, argv);
const Vector3<uint_t> nBlocks { 3, 1, 1 };
const Vector3<uint_t> cells { 2, 2, 1 };
Vector3<real_t> domainSize;
for( uint_t d = 0; d < 3; ++d ) {
domainSize[d] = real_c(cells[d] * nBlocks[d]);
}
auto blocks = createSelectorBlockGrid(nBlocks[0], nBlocks[1], nBlocks[2],
cells[0], cells[1], cells[2], 1, false, true, true, true);
BlockDataID fieldID = field::addToStorage< ScalarField_T >(blocks, "scalar", Type_T(0), field::fzyx, uint_t(1));
initScalarField(blocks, fieldID);
BlockDataID gpuFieldID = cuda::addGPUFieldToStorage< ScalarField_T >(blocks, fieldID, "GPU scalar");
// Setup communication schemes for GPUPackInfo
cuda::communication::UniformGPUScheme< Stencil_T > communication(blocks, requiredBlockSelector, incompatibleBlockSelector);
communication.addPackInfo(std::make_shared< cuda::communication::MemcpyPackInfo< GPUScalarField_T > >(gpuFieldID));
// Perform one communication step
communication();
// Copy to CPU
cuda::fieldCpy< ScalarField_T, GPUScalarField_T >( blocks, fieldID, gpuFieldID );
// Check for correct data in ghost layers of middle block
auto middleBlock = blocks->getBlock( domainSize[0] / real_c(2), domainSize[1] / real_c(2), domainSize[2] / real_c(2) );
auto cpuField = middleBlock->getData<ScalarField_T>(fieldID);
WALBERLA_ASSERT_NOT_NULLPTR(cpuField)
// avoid unused variable warning in release mode
(void) cpuField;
// check for missing communication with left neighbor (first block, incompatible selector)
WALBERLA_ASSERT_EQUAL(cpuField->get(-1, 0, 0), 0, "Communication with left neighbor detected.")
WALBERLA_ASSERT_EQUAL(cpuField->get(-1, 1, 0), 0, "Communication with left neighbor detected.")
// check for correct communication with right neighbor (third block, required selector)
WALBERLA_ASSERT_EQUAL(cpuField->get(cell_idx_t(cells[0]), 0, 0), 1, "No communication with right neighbor detected.")
WALBERLA_ASSERT_EQUAL(cpuField->get(cell_idx_t(cells[0]), 1, 0), 1, "No communication with right neighbor detected.")
return EXIT_SUCCESS;
}
} // namespace walberla
int main(int argc, char** argv) { return walberla::main(argc, argv); }