diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 9372c0b5275d6057ea188cc5667883a5d8e30d30..4e38013a36e8d79680b6683c502441a6cd7824b8 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -1,5 +1,6 @@
 waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "simulation_setup" )
 
 waLBerla_python_file_generates(UniformGridGPU.py
         UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index 572326ea6acdc02390dac1ac13be5877bb817b1b..d749568caed53a30379bafeb8c65fd742b45bbc3 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -1,5 +1,7 @@
 #include "core/Environment.h"
 #include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+#include "python_coupling/DictWrapper.h"
 #include "blockforest/Initialization.h"
 #include "lbm/field/PdfField.h"
 #include "lbm/field/AddToStorage.h"
@@ -28,6 +30,7 @@
 #include "UniformGridGPU_PackInfo.h"
 #include "UniformGridGPU_UBB.h"
 #include "UniformGridGPU_NoSlip.h"
+#include "UniformGridGPU_Communication.h"
 
 using namespace walberla;
 
@@ -80,11 +83,10 @@ int main( int argc, char **argv )
       noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
       //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
 
-
-      // Communication setup
       bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
       bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
+      int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline );
 
       int streamHighPriority = 0;
       int streamLowPriority = 0;
@@ -92,9 +94,10 @@ int main( int argc, char **argv )
       pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega );
       lbKernel.setOuterPriority( streamHighPriority );
 
-      CommScheme_T gpuComm( blocks, cudaEnabledMPI );
-      gpuComm.addPackInfo( make_shared<pystencils::UniformGridGPU_PackInfo>( pdfFieldGpuID ));
-
+      //CommScheme_T gpuComm( blocks, cudaEnabledMPI );
+      //gpuComm.addPackInfo( make_shared<pystencils::UniformGridGPU_PackInfo>( pdfFieldGpuID ));
+      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
+         gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
 
       auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
       auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
@@ -121,6 +124,7 @@ int main( int argc, char **argv )
          innerOuterSection.run([&]( auto outerStream )
         {
            gpuComm( outerStream );
+
            for( auto &block: *blocks )
            {
               {
@@ -172,11 +176,42 @@ int main( int argc, char **argv )
      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
      timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );
 
+      /*
      lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500);
      timeLoop.addFuncAfterTimeStep( performanceLogger, "remaining time logger" );
 
      timeLoop.run();
+      */
+
+      auto performanceReportFrequency = parameters.getParameter< uint_t >( "performanceReportFrequency", 500 ); // in timesteps
+      lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, performanceReportFrequency);
+      timeLoop.addFuncAfterTimeStep([&performanceLogger] { performanceLogger(); }, "performance logger" );
+
+      timeLoop.run();
+
+      std::map< std::string, int > integerProperties;
+      std::map< std::string, double > realProperties;
+      std::map< std::string, std::string > stringProperties;
+
+      performanceLogger.logOverallResultsOnRoot();
+      performanceLogger.getBestResultsForSQLOnRoot(integerProperties, realProperties, stringProperties);
+
+      WALBERLA_ROOT_SECTION()
+      {
+         python_coupling::PythonCallback pythonCallbackResults ( "results_callback" );
+         if ( pythonCallbackResults.isCallable() )
+         {
+            pythonCallbackResults.data().exposeValue( "mlups_total", realProperties["MLUPS"] );
+            pythonCallbackResults.data().exposeValue( "mlups_process", realProperties["MLUPS_process"] );
+
+            pythonCallbackResults.data().exposeValue( "mflups_total", realProperties["MFLUPS"] );
+            pythonCallbackResults.data().exposeValue( "mflups_process", realProperties["MFLUPS_process"] );
+
+            // Call Python function to report results
+            pythonCallbackResults();
+         }
+      }
   }
   return 0;
-}
\ No newline at end of file
+}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e45caa94d6f34bafdc302e97422522b072a615e
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include "core/debug/Debug.h"
+#include "blockforest/Block.h"
+#include "blockforest/StructuredBlockForest.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "cuda/communication/MemcpyPackInfo.h"
+#include "UniformGridGPU_PackInfo.h"
+
+
+using namespace walberla;
+
+
+enum CommunicationSchemeType {
+   GPUPackInfo_Baseline = 0,
+   GPUPackInfo_Streams = 1,
+   UniformGPUScheme_Baseline = 2,
+   UniformGPUScheme_Memcpy = 3
+};
+
+
+template<typename StencilType, typename GPUFieldType >
+class UniformGridGPU_Communication
+{
+public:
+   explicit UniformGridGPU_Communication(weak_ptr_wrapper<StructuredBlockForest> bf, const BlockDataID & bdId,
+                                         CommunicationSchemeType commSchemeType, bool cudaEnabledMPI = false)
+      : _commSchemeType(commSchemeType), _cpuCommunicationScheme(nullptr), _gpuPackInfo(nullptr),
+        _gpuCommunicationScheme(nullptr), _generatedPackInfo(nullptr)
+   {
+      switch(_commSchemeType)
+      {
+         case GPUPackInfo_Baseline:
+            _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId );
+            _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf );
+            _cpuCommunicationScheme->addPackInfo( _gpuPackInfo );
+            break;
+         case GPUPackInfo_Streams:
+            _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId );
+            _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf );
+            _cpuCommunicationScheme->addPackInfo( _gpuPackInfo );
+            break;
+         case UniformGPUScheme_Baseline:
+            _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
+            _generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
+            _gpuCommunicationScheme->addPackInfo( _generatedPackInfo );
+            break;
+         case UniformGPUScheme_Memcpy:
+            _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
+            _memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
+            _gpuCommunicationScheme->addPackInfo( _memcpyPackInfo );
+            break;
+         default:
+            WALBERLA_ABORT("Invalid GPU communication scheme specified!");
+      }
+   }
+
+   UniformGridGPU_Communication(UniformGridGPU_Communication &) = delete;
+
+   void operator()( cudaStream_t communicationStream = 0 )
+   {
+      startCommunication( communicationStream );
+      wait( communicationStream );
+   }
+
+   void startCommunication( cudaStream_t communicationStream )
+   {
+      switch( _commSchemeType )
+      {
+         case GPUPackInfo_Streams:
+            // Set communication stream to enable asynchronous operations
+            // in GPUPackInfo.
+            WALBERLA_ASSERT_NOT_NULLPTR( _gpuPackInfo );
+            _gpuPackInfo->setCommunicationStream( communicationStream );
+            // Start communication using UniformBufferedScheme
+            WALBERLA_ASSERT_NOT_NULLPTR( _cpuCommunicationScheme );
+            _cpuCommunicationScheme->startCommunication();
+            break;
+         case GPUPackInfo_Baseline:
+            // Start communication using UniformBufferedScheme
+            WALBERLA_ASSERT_NOT_NULLPTR( _cpuCommunicationScheme );
+            _cpuCommunicationScheme->startCommunication();
+            break;
+         case UniformGPUScheme_Baseline:
+            WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme );
+            _gpuCommunicationScheme->startCommunication( communicationStream );
+            break;
+         case UniformGPUScheme_Memcpy:
+            WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme );
+            _gpuCommunicationScheme->startCommunication( communicationStream );
+            break;
+      }
+   }
+
+   void wait( cudaStream_t communicationStream )
+   {
+      switch( _commSchemeType )
+      {
+         case GPUPackInfo_Baseline:
+            WALBERLA_ASSERT_NOT_NULLPTR( _cpuCommunicationScheme );
+            _cpuCommunicationScheme->wait();
+            break;
+         case GPUPackInfo_Streams:
+            WALBERLA_ASSERT_NOT_NULLPTR( _cpuCommunicationScheme );
+            _gpuPackInfo->setCommunicationStream( communicationStream );
+            _cpuCommunicationScheme->wait();
+            break;
+         case UniformGPUScheme_Baseline:
+            WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme );
+            _gpuCommunicationScheme->wait( communicationStream );
+            break;
+         case UniformGPUScheme_Memcpy:
+            WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme );
+            _gpuCommunicationScheme->wait( communicationStream );
+            break;
+      }
+   }
+
+private:
+   CommunicationSchemeType _commSchemeType;
+   shared_ptr< blockforest::communication::UniformBufferedScheme< StencilType > > _cpuCommunicationScheme;
+   shared_ptr< cuda::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo;
+   shared_ptr< cuda::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme;
+   shared_ptr< pystencils::UniformGridGPU_PackInfo > _generatedPackInfo;
+   shared_ptr< cuda::communication::MemcpyPackInfo< GPUFieldType > > _memcpyPackInfo;
+};
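The concrete scheme is selected at run time through the integer `communicationScheme` parameter read in UniformGridGPU.cpp above. A minimal sketch of picking a scheme from a driver script, assuming only that the integer values match the CommunicationSchemeType enum (the `scheme_ids` dict and `config` fragment are illustrative, not part of the patch):

scheme_ids = {'GPUPackInfo_Baseline': 0, 'GPUPackInfo_Streams': 1,
              'UniformGPUScheme_Baseline': 2, 'UniformGPUScheme_Memcpy': 3}

# Example config fragment; the parameter values shown are placeholders.
config = {'Parameters': {'communicationScheme': scheme_ids['UniformGPUScheme_Memcpy'],
                         'overlapCommunication': True,
                         'cudaEnabledMPI': False}}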
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c72988a8bc772426e794b2e5660fa34fd8d17126
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
@@ -0,0 +1,41 @@
+# encoding: utf-8
+
+import math
+import waLBerla as wlb
+
+
+# Constants defining the block sizes used in the benchmarks
+MIN_CELLS_PER_BLOCK = 16
+MAX_CELLS_PER_BLOCK = 256
+INC_CELLS_PER_BLOCK = 16
+# Range of cells per block along each axis
+cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, INC_CELLS_PER_BLOCK)
+# Block sizes in [16, 32, 48, ..., 256], in steps of 16 cells per axis
+cells_per_block = list(cells_per_block_interval)
+# Number of active MPI processes
+num_processes = wlb.mpi.numProcesses()
+# Whether to overlap computation with communication
+overlap_communication = [False, True]
+# Whether MPI supports buffers in GPU memory
+cuda_enabled_mpi = [False, True]
+# Supported communication schemes
+communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
+
+
+def get_block_decomposition(block_decomposition, num_processes):
+    """Distribute num_processes blocks over the given axes; num_processes must be a power of two."""
+    bx = by = bz = 1
+    # Total number of times the block count is doubled along one of the given axes
+    num_doublings = int(math.log(num_processes, 2))
+    for i in range(num_doublings):
+        decomposition_axis = block_decomposition[i % len(block_decomposition)]
+        if decomposition_axis == 'y':
+            by *= 2
+        elif decomposition_axis == 'z':
+            bz *= 2
+        elif decomposition_axis == 'x':
+            bx *= 2
+
+    assert (bx * by * bz) == num_processes
+
+    return (bx, by, bz)
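For reference, a standalone restatement of get_block_decomposition with two worked examples (the helper name blocks_for is hypothetical; base.py itself needs the waLBerla module, so the logic is repeated here, and a power-of-two process count is assumed, as the assert in base.py enforces):

import math

def blocks_for(axes, procs):
    # Illustrative re-implementation of base.get_block_decomposition.
    counts = {'x': 1, 'y': 1, 'z': 1}
    for i in range(int(math.log(procs, 2))):
        counts[axes[i % len(axes)]] *= 2   # double the block count along one axis
    return counts['x'], counts['y'], counts['z']

assert blocks_for('xy', 8) == (4, 2, 1)    # three doublings hit x, y, x
assert blocks_for('xyz', 8) == (2, 2, 2)   # one doubling per axis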
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..76c8a8967711f7e902acda23b4ad9bf56f587f31
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
@@ -0,0 +1,101 @@
+# encoding: utf-8
+
+import os
+import pandas as pd
+import waLBerla as wlb
+import copy
+from datetime import datetime
+
+
+CommunicationSchemeType = {
+    'GPUPackInfo_Baseline': 0,
+    'GPUPackInfo_Streams': 1,
+    'UniformGPUScheme_Baseline': 2,
+    'UniformGPUScheme_Memcpy': 3,
+}
+
+CommunicationSchemeName = {
+    0: 'GPUPackInfo_Baseline',
+    1: 'GPUPackInfo_Streams',
+    2: 'UniformGPUScheme_Baseline',
+    3: 'UniformGPUScheme_Memcpy',
+}
+
+# Base configuration for the benchmark
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (64, 64, 64),
+        'blocks': (1, 1, 1),
+        'nrOfProcesses': (1, 1, 1),
+        'periodic': (0, 0, 1),
+        'dx': 1.0
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'timesteps': 1001,
+        'remainingTimeLoggerFrequency': 250,
+        'vtkWriteFrequency': 0,
+        'overlapCommunication': False,
+        'cudaEnabledMPI': False,
+        'initialVelocity': (0, 0, 0),
+        'performanceReportFrequency': 250,
+        'communicationScheme': CommunicationSchemeType['UniformGPUScheme_Baseline'],
+    },
+    'Boundaries': {
+        'Border': [
+            {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
+            {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
+            {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+            {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+        ]
+    }
+}
+
+
+class BenchmarkScenario:
+    def __init__(self, testcase, decomposition_axes=None):
+        self.testcase = testcase
+        self.scenario_config = copy.deepcopy(BASE_CONFIG)
+        self.decomposition_axes = decomposition_axes
+
+        now = datetime.now().replace(second=0, microsecond=0)
+        self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        return self.scenario_config
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        block_setup = self.scenario_config.get('DomainSetup')
+        params = self.scenario_config.get('Parameters')
+
+        data = [{
+            'processesX': block_setup.get('nrOfProcesses')[0],
+            'processesY': block_setup.get('nrOfProcesses')[1],
+            'processesZ': block_setup.get('nrOfProcesses')[2],
+            'blocksX': block_setup.get('blocks')[0],
+            'blocksY': block_setup.get('blocks')[1],
+            'blocksZ': block_setup.get('blocks')[2],
+            'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
+            'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
+            'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
+            'cudaEnabledMPI': params.get('cudaEnabledMPI'),
+            'overlapCommunication': params.get('overlapCommunication'),
+            'domainDecomposition': self.decomposition_axes,
+            'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
+            'mlupsTotal': kwargs.get('mlups_total'),
+            'mlupsProcess': kwargs.get('mlups_process'),
+            'mflupsTotal': kwargs.get('mflups_total'),
+            'mflupsProcess': kwargs.get('mflups_process'),
+        }]
+
+        self.save_data(data)
+
+    def save_data(self, data):
+        df = pd.DataFrame(data)
+        if not os.path.isfile(self.output_filename):
+            df.to_csv(self.output_filename, index=False)
+        else:
+            df.to_csv(self.output_filename, index=False, mode='a', header=False)
+
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..6498878fa229f8f2a7219fb2dc85abd32b211d19
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py
@@ -0,0 +1,47 @@
+# encoding: utf-8
+
+import itertools
+import waLBerla as wlb
+from base import get_block_decomposition, communication_schemes, overlap_communication, \
+    cuda_enabled_mpi, cells_per_block, num_processes
+from benchmark import BenchmarkScenario, CommunicationSchemeType
+
+
+# Stores the scenarios for the current simulation
+scenarios = wlb.ScenarioManager()
+
+# Block decompositions to benchmark, two axes at a time
+#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
+block_decompositions = ['xy', 'yz', 'xz']
+
+scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
+
+testcase_name = "inter-node"
+
+for scenario_params in scenario_generator:
+    # Extract parameters from tuple
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
+        # Only UniformGPUScheme_Baseline is benchmarked with CUDA-enabled MPI
+        continue
+    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
+        # Skip communication overlap tests for GPUPackInfo baseline
+        continue
+    # Convert the axes decomposition to a string
+    decomposition_axes_str = ''.join(decomposition_axes)
+    # Compute block decomposition based on the specified axes and the number of processes
+    blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Create a benchmark scenario
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    # Domain Setup parameters
+    domain_setup = scenario.scenario_config['DomainSetup']
+    domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
+    domain_setup['nrOfProcesses'] = blocks
+    domain_setup['blocks'] = blocks
+    # Additional parameters for benchmarking
+    params = scenario.scenario_config['Parameters']
+    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
+    params['overlapCommunication'] = is_communication_overlapped
+    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
+    # Add scenario for execution
+    scenarios.add(scenario)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..46b6c06002d0ac663dfeef6ae2dbb32bb8ba300d
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py
@@ -0,0 +1,47 @@
+# encoding: utf-8
+
+import itertools
+import waLBerla as wlb
+from base import get_block_decomposition, communication_schemes, overlap_communication, \
+    cuda_enabled_mpi, cells_per_block, num_processes
+from benchmark import BenchmarkScenario, CommunicationSchemeType
+
+
+# Stores the scenarios for the current simulation
+scenarios = wlb.ScenarioManager()
+
+# Block decompositions to benchmark, two axes at a time
+#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
+block_decompositions = ['xy', 'yz', 'zx']
+
+scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
+
+testcase_name = "intra-node"
+
+for scenario_params in scenario_generator:
+    # Extract parameters from tuple
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
+        # Only UniformGPUScheme_Baseline is benchmarked with CUDA-enabled MPI
+        continue
+    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
+        # Skip communication overlap tests for GPUPackInfo baseline
+        continue
+    # Convert the axes decomposition to a string
+    decomposition_axes_str = ''.join(decomposition_axes)
+    # Compute block decomposition based on the specified axes and the number of processes
+    blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Create a benchmark scenario
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    # Domain Setup parameters
+    domain_setup = scenario.scenario_config['DomainSetup']
+    domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
+    domain_setup['nrOfProcesses'] = blocks
+    domain_setup['blocks'] = blocks
+    # Additional parameters for benchmarking
+    params = scenario.scenario_config['Parameters']
+    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
+    params['overlapCommunication'] = is_communication_overlapped
+    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
+    # Add scenario for execution
+    scenarios.add(scenario)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..78da581df4c61cb84a2e94fae4a9b4ed5505d393
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py
@@ -0,0 +1,20 @@
+# encoding: utf-8
+
+import itertools
+import waLBerla as wlb
+from base import cells_per_block, num_processes
+from benchmark import BenchmarkScenario
+
+
+scenarios = wlb.ScenarioManager()
+
+testcase_name = "single-node"
+
+assert num_processes == 1
+
+for num_cells_per_block in cells_per_block:
+    # Create a benchmark scenario
+    scenario = BenchmarkScenario(testcase=testcase_name)
+    scenario.scenario_config['DomainSetup']['cellsPerBlock'] = 3 * (num_cells_per_block,)
+    # Add scenario for execution
+    scenarios.add(scenario)
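The inter-node, intra-node, and scaling scripts all share the two skip rules shown above. A self-contained sketch of how those rules prune the scheme/overlap/CUDA-MPI combinations (local names, illustrative only):

import itertools

schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
kept = [(s, overlap, gpu_mpi)
        for s, overlap, gpu_mpi in itertools.product(schemes, [False, True], [False, True])
        if not (s != 'UniformGPUScheme_Baseline' and gpu_mpi)   # CUDA-MPI only for the baseline GPU scheme
        and not (s == 'GPUPackInfo_Baseline' and overlap)]      # no overlap for GPUPackInfo_Baseline
print(len(kept))  # 8 of the 12 raw combinations survive the two rules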
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f038c837b81d836fba828685e15d634bad3f5dc
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py
@@ -0,0 +1,57 @@
+# encoding: utf-8
+
+import itertools
+import waLBerla as wlb
+from base import get_block_decomposition, communication_schemes, overlap_communication, \
+    cuda_enabled_mpi, num_processes
+from benchmark import BenchmarkScenario, CommunicationSchemeType
+
+
+# Stores the scenarios for the current simulation
+scenarios = wlb.ScenarioManager()
+
+# Block decompositions to benchmark, cycling through all three axes
+#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
+block_decompositions = ['xyz', 'yzx', 'yxz', 'zyx']
+
+cells_per_block = [256,]
+
+if num_processes == 1:
+    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
+                                           block_decompositions, cells_per_block)
+else:
+    scenario_generator = itertools.product(communication_schemes, overlap_communication,
+                                           cuda_enabled_mpi, block_decompositions, cells_per_block)
+
+testcase_name = "strong-scaling"
+
+for scenario_params in scenario_generator:
+    # Extract parameters from tuple
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
+        # Only UniformGPUScheme_Baseline is benchmarked with CUDA-enabled MPI
+        continue
+    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
+        # Skip communication overlap tests for GPUPackInfo baseline
+        continue
+
+    # Convert the axes decomposition to a string
+    decomposition_axes_str = ''.join(decomposition_axes)
+    # Compute block decomposition based on the specified axes and the number of processes
+    blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Create a benchmark scenario
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    # Domain Setup parameters
+    domain_setup = scenario.scenario_config['DomainSetup']
+    # Strong scaling: the global domain stays fixed, so cells per block shrink with the block count
+    domain_setup['cellsPerBlock'] = tuple(num_cells_per_block // block for block in blocks)
+    domain_setup['nrOfProcesses'] = blocks
+    domain_setup['blocks'] = blocks
+    # Additional parameters for benchmarking
+    params = scenario.scenario_config['Parameters']
+    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
+    params['overlapCommunication'] = is_communication_overlapped
+    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
+    # Add scenario for execution
+    scenarios.add(scenario)
+
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
new file mode 100644
index 0000000000000000000000000000000000000000..c29d935dca8306856ef4646c0565f636bb0fe7fb
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
@@ -0,0 +1,56 @@
+# encoding: utf-8
+
+import itertools
+import waLBerla as wlb
+from base import get_block_decomposition, communication_schemes, overlap_communication, \
+    cuda_enabled_mpi, num_processes
+from benchmark import BenchmarkScenario, CommunicationSchemeType
+
+
+# Stores the scenarios for the current simulation
+scenarios = wlb.ScenarioManager()
+
+# Block decompositions to benchmark, cycling through all three axes
+#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
+block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']
+
+cells_per_block = [64, 128, 240, 256]
+
+if num_processes == 1:
+    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
+                                           block_decompositions, cells_per_block)
+else:
+    scenario_generator = itertools.product(communication_schemes, overlap_communication,
+                                           cuda_enabled_mpi, block_decompositions, cells_per_block)
+
+testcase_name = "weak-scaling"
+
+for scenario_params in scenario_generator:
+    # Extract parameters from tuple
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
+        # Only UniformGPUScheme_Baseline is benchmarked with CUDA-enabled MPI
+        continue
+    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
+        # Skip communication overlap tests for GPUPackInfo without streams
+        continue
+
+    # Convert the axes decomposition to a string
+    decomposition_axes_str = ''.join(decomposition_axes)
+    # Compute block decomposition based on the specified axes and the number of processes
+    blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Create a benchmark scenario
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    # Domain Setup parameters
+    domain_setup = scenario.scenario_config['DomainSetup']
+    # Weak scaling: each block keeps the same size, so the global domain grows with the process count
+    domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
+    domain_setup['nrOfProcesses'] = blocks
+    domain_setup['blocks'] = blocks
+    # Additional parameters for benchmarking
+    params = scenario.scenario_config['Parameters']
+    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
+    params['overlapCommunication'] = is_communication_overlapped
+    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
+    # Add scenario for execution
+    scenarios.add(scenario)
diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h
index 7e1bad001bbaa9e912c972eef0004373262135cb..661029b40dc1f39e55a20c7a708d19bc89da4cfa 100644
--- a/src/cuda/communication/GPUPackInfo.h
+++ b/src/cuda/communication/GPUPackInfo.h
@@ -57,42 +57,16 @@ class GPUPackInfo : public walberla::communication::UniformPackInfo
 public:
    typedef typename GPUField_T::value_type FieldType;
 
-   GPUPackInfo( const BlockDataID & bdId, cudaStream_t stream = 0 )
-      : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 )
+   GPUPackInfo( const BlockDataID & bdId )
+      : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 ),
+        copyAsync_( false ), communicationStream_( 0 )
    {
-      streams_.push_back( stream );
-      copyAsync_ = (stream != 0);
    }
 
-   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers, cudaStream_t stream = 0 )
-      : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGHostLayers )
+   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGhostLayers )
+      : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGhostLayers ),
+        copyAsync_( false ), communicationStream_( 0 )
    {
-      streams_.push_back( stream );
-      copyAsync_ = (stream != 0);
-   }
-
-   GPUPackInfo( const BlockDataID & bdId, std::vector< cudaStream_t > & streams )
-      : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 ), streams_( streams )
-   {
-      copyAsync_ = true;
-      for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt )
-         if ( *streamIt == 0 )
-         {
-            copyAsync_ = false;
-            break;
-         }
-   }
-
-   GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers, std::vector< cudaStream_t > & streams )
-      : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGHostLayers ), streams_( streams )
-   {
-      copyAsync_ = true;
-      for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt )
-         if ( *streamIt == 0 )
-         {
-            copyAsync_ = false;
-            break;
-         }
    }
 
    virtual ~GPUPackInfo() {}
@@ -104,18 +78,25 @@ public:
 
    void communicateLocal(const IBlock * sender, IBlock * receiver, stencil::Direction dir);
 
+   void setCommunicationStream( cudaStream_t stream )
+   {
+      if ( stream != 0 )
+      {
+         copyAsync_ = true;
+         communicationStream_ = stream;
+      }
+   }
+
 protected:
    void packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const;
 
    uint_t numberOfGhostLayersToCommunicate( const GPUField_T * const field ) const;
 
-   inline cudaStream_t & getStream(stencil::Direction & dir) { return streams_[dir % streams_.size()]; }
-
    const BlockDataID bdId_;
    bool communicateAllGhostLayers_;
    uint_t numberOfGhostLayers_;
-   std::vector< cudaStream_t > streams_;
    bool copyAsync_;
+   cudaStream_t communicationStream_;
    std::map< stencil::Direction, PinnedMemoryBuffer > pinnedRecvBuffers_;
    mutable std::map< stencil::Direction, PinnedMemoryBuffer > pinnedSendBuffers_;
 };
@@ -145,7 +126,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d
       std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr );
    }
 
-   cudaStream_t & unpackStream = getStream(dir);
+   cudaStream_t & unpackStream = communicationStream_;
 
    auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
                                      uint_c(fieldCi.yMin() + nrOfGhostLayers),
@@ -201,7 +182,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r
    CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false );
    CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false );
 
-   cudaStream_t & commStream = getStream(dir);
+   cudaStream_t & commStream = communicationStream_;
 
    auto dstOffset = std::make_tuple( uint_c(rCi.xMin() + nrOfGhostLayers),
                                      uint_c(rCi.yMin() + nrOfGhostLayers),
@@ -255,7 +236,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc
 
    unsigned char * outBufferPtr = outBuffer.forward( nrOfBytesToPack );
 
-   const cudaStream_t & packStream = streams_[dir % streams_.size()];
+   const cudaStream_t & packStream = communicationStream_;
 
    unsigned char * copyBufferPtr = outBufferPtr;
    if ( copyAsync_ )
diff --git a/src/cuda/communication/MemcpyPackInfo.h b/src/cuda/communication/MemcpyPackInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..5041f231efd23a4e364c42f82ab14dbc87fc17f4
--- /dev/null
+++ b/src/cuda/communication/MemcpyPackInfo.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "cuda/GPUField.h"
+#include "core/DataTypes.h"
+#include "domain_decomposition/IBlock.h"
+#include "cuda/communication/GeneratedGPUPackInfo.h"
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+template<typename GPUFieldType>
+class MemcpyPackInfo : public ::walberla::cuda::GeneratedGPUPackInfo
+{
+public:
+   MemcpyPackInfo( BlockDataID pdfsID_ )
+      : pdfsID(pdfsID_), numberOfGhostLayers_(0), communicateAllGhostLayers_(true)
+   {}
+
+
+   virtual void pack  (stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
+   virtual void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block, cudaStream_t stream);
+   virtual uint_t size(stencil::Direction dir, IBlock * block);
+
+private:
+   BlockDataID pdfsID;
+   uint_t numberOfGhostLayers_;
+   bool communicateAllGhostLayers_;
+
+   uint_t numberOfGhostLayersToCommunicate( const GPUFieldType * const field ) const;
+};
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
+
+#include "MemcpyPackInfo.impl.h"
diff --git a/src/cuda/communication/MemcpyPackInfo.impl.h b/src/cuda/communication/MemcpyPackInfo.impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..166a7611225a643f095e40863d529ffbdac377e5
--- /dev/null
+++ b/src/cuda/communication/MemcpyPackInfo.impl.h
@@ -0,0 +1,172 @@
+#include "blockforest/Block.h"
+#include "field/GhostRegions.h"
+#include "field/Layout.h"
+#include "stencil/Directions.h"
+#include "core/cell/CellInterval.h"
+#include "cuda/GPUField.h"
+#include "cuda/GPUCopy.h"
+#include "core/DataTypes.h"
+#include "MemcpyPackInfo.h"
+#include <cuda_runtime.h>
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+template<typename GPUFieldType>
+void MemcpyPackInfo< GPUFieldType >::pack(stencil::Direction dir, unsigned char * byte_buffer,
+                                          IBlock * block, cudaStream_t stream)
+{
+   // Extract field data pointer from the block
+   const GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+   WALBERLA_ASSERT_NOT_NULLPTR( fieldPtr );
+
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
+   CellInterval fieldCi = field::getSliceBeforeGhostLayer( *fieldPtr, dir, nrOfGhostLayers, false );
+
+   // Base offsets into the buffer and the GPUField, respectively
+   auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
+   auto srcOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.yMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+
+   // Size of data to pack, in terms of elements of the field
+   auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(),
+                                        fieldCi.zSize(), fieldPtr->fSize() );
+
+   if ( fieldPtr->layout() == field::fzyx )
+   {
+      const uint_t dstAllocSizeZ = fieldCi.zSize();
+      const uint_t srcAllocSizeZ = fieldPtr->zAllocSize();
+
+      cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer,
+                                                                 fieldCi.xSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldCi.xSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldCi.ySize() );
+
+      copyDevToDevFZYX( byteBufferPitchedPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset,
+                        dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type),
+                        intervalSize, stream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeZ = fieldCi.ySize();
+      const uint_t srcAllocSizeZ = fieldPtr->yAllocSize();
+
+      cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer,
+                                                                 fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldCi.xSize() );
+      copyDevToDevZYXF( byteBufferPitchedPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset,
+                        dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type),
+                        intervalSize, stream );
+   }
+}
+
+template<typename GPUFieldType>
+void MemcpyPackInfo< GPUFieldType >::unpack(stencil::Direction dir, unsigned char * byte_buffer,
+                                            IBlock * block, cudaStream_t stream)
+{
+   GPUFieldType * fieldPtr = block->getData< GPUFieldType >( pdfsID );
+   WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr);
+
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) );
+
+   CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false );
+
+   auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.yMin() + nrOfGhostLayers),
+                                     uint_c(fieldCi.zMin() + nrOfGhostLayers),
+                                     uint_c(0) );
+   auto srcOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
+
+   auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), fieldPtr->fSize() );
+
+   if ( fieldPtr->layout() == field::fzyx )
+   {
+      const uint_t dstAllocSizeZ = fieldPtr->zAllocSize();
+      const uint_t srcAllocSizeZ = fieldCi.zSize();
+
+      cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer,
+                                                                 fieldCi.xSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldCi.xSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldCi.ySize() );
+
+      copyDevToDevFZYX( fieldPtr->pitchedPtr(), byteBufferPitchedPtr, dstOffset, srcOffset,
+                        dstAllocSizeZ, srcAllocSizeZ, sizeof(typename GPUFieldType::value_type),
+                        intervalSize, stream );
+   }
+   else
+   {
+      const uint_t dstAllocSizeY = fieldPtr->yAllocSize();
+      const uint_t srcAllocSizeY = fieldCi.ySize();
+      cudaPitchedPtr byteBufferPitchedPtr = make_cudaPitchedPtr( byte_buffer,
+                                                                 fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldPtr->fSize() * sizeof(typename GPUFieldType::value_type),
+                                                                 fieldCi.xSize() );
+      copyDevToDevZYXF( fieldPtr->pitchedPtr(), byteBufferPitchedPtr, dstOffset, srcOffset,
+                        dstAllocSizeY, srcAllocSizeY, sizeof(typename GPUFieldType::value_type),
+                        intervalSize, stream );
+   }
+}
+
+template<typename GPUFieldType>
+uint_t MemcpyPackInfo< GPUFieldType >::size(stencil::Direction dir, IBlock * block)
+{
+   auto pdfs = block->getData< GPUFieldType >(pdfsID);
+
+   CellInterval ci;
+   cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( pdfs ) );
+   pdfs->getGhostRegion(dir, ci, nrOfGhostLayers, false);
+
+   /*
+   uint_t elementsPerCell = 0;
+
+   switch( dir )
+   {
+      case stencil::SW: elementsPerCell = 1; break;
+      case stencil::S:  elementsPerCell = 5; break;
+      case stencil::W:  elementsPerCell = 5; break;
+      case stencil::B:  elementsPerCell = 5; break;
+      case stencil::T:  elementsPerCell = 5; break;
+      case stencil::BN: elementsPerCell = 1; break;
+      case stencil::N:  elementsPerCell = 5; break;
+      case stencil::TE: elementsPerCell = 1; break;
+      case stencil::E:  elementsPerCell = 5; break;
+      case stencil::BE: elementsPerCell = 1; break;
+      case stencil::SE: elementsPerCell = 1; break;
+      case stencil::C:  elementsPerCell = 1; break;
+      case stencil::TN: elementsPerCell = 1; break;
+      case stencil::TS: elementsPerCell = 1; break;
+      case stencil::NE: elementsPerCell = 1; break;
+      case stencil::BW: elementsPerCell = 1; break;
+      case stencil::NW: elementsPerCell = 1; break;
+      case stencil::BS: elementsPerCell = 1; break;
+      case stencil::TW: elementsPerCell = 1; break;
+      default:          elementsPerCell = 0;
+   }
+
+   return ci.numCells() * elementsPerCell * sizeof(typename GPUFieldType::value_type);
+   */
+   uint_t totalBytes = ci.xSize() * ci.ySize() * ci.zSize() * pdfs->fSize() * sizeof(typename GPUFieldType::value_type);
+   return totalBytes;
+}
+
+template<typename GPUFieldType>
+uint_t MemcpyPackInfo< GPUFieldType >::numberOfGhostLayersToCommunicate( const GPUFieldType * const field ) const
+{
+   if( communicateAllGhostLayers_ )
+   {
+      return field->nrOfGhostLayers();
+   }
+   else
+   {
+      WALBERLA_ASSERT_LESS_EQUAL( numberOfGhostLayers_, field->nrOfGhostLayers() );
+      return numberOfGhostLayers_;
+   }
+}
+
+} // namespace communication
+} // namespace cuda
+} // namespace walberla
\ No newline at end of file
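The active size() computation reserves room for every PDF component of every cell, while the commented-out variant counts only the PDFs that actually cross the communicated face or edge. A back-of-the-envelope comparison for a single 64x64 face slice, assuming a D3Q19 double-precision field (all numbers illustrative):

cells = 64 * 64                   # cells in a one-layer face slice
per_direction = cells * 5 * 8     # commented-out variant: 5 PDFs cross a face
full_field = cells * 19 * 8       # active variant: all 19 PDFs per cell
print(per_direction, full_field)  # 163840 vs 622592 bytes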