Commit 58b3e682 authored by Martin Bauer's avatar Martin Bauer
Browse files

Extensions in UniformGridGPU benchmark

- different stencils supported
- mpi datatype communication: full vs. required pdfs only
- adapted config file
parent f040919f
......@@ -29,10 +29,12 @@ waLBerla_python_file_generates(UniformGridGPU_AA.py
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
UniformGridGPU_AA_Defines.h
)
set(config "srt")
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
foreach(config srt trt mrt smagorinsky entropic )
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
endforeach()
......@@ -6,7 +6,7 @@ namespace walberla {
inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
const real_t xMagnitude=0.005, const real_t fluctuationMagnitude=0.05 )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
......
......@@ -151,6 +151,8 @@ int main( int argc, char **argv )
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
......
DomainSetup
{
blocks < 1, 1, 1 >;
cellsPerBlock < 512, 256, 256 >;
cellsPerBlock < 128, 128, 128 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 2000; //10000; // time steps of one performance measurement
warmupSteps 0; // number of steps to run before measurement starts
outerIterations 1; // how many measurements to conduct
timesteps 1000; // time steps of one performance measurement
warmupSteps 100; // number of steps to run before measurement starts
outerIterations 3; // how many measurements to conduct
// Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
communicationScheme UniformGPUScheme_Baseline;
vtkWriteFrequency 0; //100; // write a VTK file every n'th step, if zero VTK output is disabled
vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation
timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
timeStepStrategy kernelOnly; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
innerOuterSplit < 8, 1, 1>; // slice-thickness that 'outer'-kernels process when overlapping
remainingTimeLoggerFrequency 5; // interval in seconds to log the estimated remaining time
gpuBlockSize < 128, 1, 1>;
omega 1.92;
initShearFlow 1;
......
......@@ -12,10 +12,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
compile_time_block_size = False
if compile_time_block_size:
sweep_block_size = (128, 1, 1)
else:
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
......
......@@ -21,9 +21,13 @@
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "stencil/D3Q19.h"
#include "stencil/D3Q27.h"
#include "InitShearVelocity.h"
#include "gui/Gui.h"
#ifdef WALBERLA_ENABLE_GUI
#include "lbm/gui/PdfFieldDisplayAdaptor.h"
#endif
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
......@@ -31,11 +35,12 @@
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
#include "UniformGridGPU_AA_Defines.h"
#include <cmath>
using namespace walberla;
using Stencil_T = stencil::D3Q19; //TODO make generic - and determine from python script
using CommunicationStencil_T = Stencil_T;
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
using VelocityField_T = GhostLayerField< real_t, 3 >;
......@@ -50,6 +55,8 @@ int main( int argc, char **argv )
{
WALBERLA_MPI_WORLD_BARRIER();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
......@@ -61,7 +68,7 @@ int main( int argc, char **argv )
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx );
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
......@@ -215,32 +222,52 @@ int main( int argc, char **argv )
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
GUI gui( timeLoop, blocks, argc, argv);
gui.registerDisplayAdaptorCreator(
[&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
if ( block.isDataOfType< PdfField_T >( blockDataID) )
return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
return nullptr;
});
gui.run();
}
else
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
......
......@@ -10,10 +10,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
compile_time_block_size = False
if compile_time_block_size:
sweep_block_size = (128, 1, 1)
else:
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
......@@ -58,7 +62,9 @@ with CodeGeneration() as ctx:
common_options = {
'field_name': 'pdfs',
'optimization': {'cse_global': True,
'cse_pdfs': False}
'cse_pdfs': False,
'field_layout': 'fzyx',
}
}
options = options_dict.get(ctx.config, options_dict['srt'])
options.update(common_options)
......@@ -98,3 +104,6 @@ with CodeGeneration() as ctx:
# communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu')
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu')
ctx.write_file("UniformGridGPU_AA_Defines.h",
'#include "stencil/D3Q{0}.h"\nusing Stencil_T = walberla::stencil::D3Q{0}; \n '.format(q))
......@@ -6,6 +6,7 @@
#include "blockforest/communication/UniformBufferedScheme.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "field/communication/StencilRestrictedMPIDatatypeInfo.h"
#include "field/communication/UniformMPIDatatypeInfo.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/communication/MemcpyPackInfo.h"
......@@ -20,7 +21,8 @@ enum CommunicationSchemeType {
GPUPackInfo_Streams = 1,
UniformGPUScheme_Baseline = 2,
UniformGPUScheme_Memcpy = 3,
MPIDatatypes = 4
MPIDatatypes = 4,
MPIDatatypesFull = 5
};
......@@ -36,6 +38,7 @@ public:
auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId );
auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId );
switch(_commSchemeType)
{
......@@ -63,6 +66,12 @@ public:
}
_directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfo );
break;
case MPIDatatypesFull:
if( ! cudaEnabledMPI ) {
WALBERLA_ABORT("MPI datatype-based communication not possible if no cudaEnabledMPI is available.");
}
_directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfoFull );
break;
default:
WALBERLA_ABORT("Invalid GPU communication scheme specified!");
}
......@@ -103,6 +112,7 @@ public:
_gpuCommunicationScheme->startCommunication( communicationStream );
break;
case MPIDatatypes:
case MPIDatatypesFull:
WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
_directScheme->startCommunication();
break;
......@@ -131,6 +141,7 @@ public:
_gpuCommunicationScheme->wait( communicationStream );
break;
case MPIDatatypes:
case MPIDatatypesFull:
WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
_directScheme->wait();
break;
......
......@@ -9,6 +9,7 @@ from os import getcwd
from waLBerla.tools.jobscripts import createJobscript
from datetime import timedelta
from copy import deepcopy
import sys
CSV_FILE = "overlap_benchmark.csv"
......@@ -22,16 +23,18 @@ BASE_CONFIG = {
'timesteps': 400,
'cudaEnabledMPI': False,
'warmupSteps': 5,
'outerIterations': 1,
'outerIterations': 3,
'initShearFlow': True,
}
}
class Scenario:
def __init__(self, **kwargs):
def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
self.config_dict = deepcopy(BASE_CONFIG)
self.config_dict['Parameters'].update(kwargs)
self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
@wlb.member_callback
def config(self, **kwargs):
......@@ -45,6 +48,10 @@ class Scenario:
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
......@@ -73,15 +80,50 @@ def overlap_benchmark():
scenarios.add(scenario)
def generate_jobscripts(machine='pizdaint_hybrid'):
def single_gpu_benchmark():
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
(32, 8, 1), (64, 8, 1),
(32, 16, 1)]
for block_size in block_sizes:
for cuda_block_size in cuda_blocks:
cells = block_size[0] * block_size[1] * block_size[2]
time_steps_for_128_cubed = 1000
time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
scenario = Scenario(cells_per_block=block_size,
gpuBlockSize=cuda_block_size,
timeStepStrategy='kernelOnly',
timesteps=int(time_steps))
scenarios.add(scenario)
all_executables = ('UniformGridBenchmarkGPU_AA_entropic',
'UniformGridBenchmarkGPU_AA_mrt',
'UniformGridBenchmarkGPU_AA_smagorinsky',
'UniformGridBenchmarkGPU_AA_srt',
'UniformGridBenchmarkGPU_AA_trt',
'UniformGridBenchmarkGPU_entropic',
'UniformGridBenchmarkGPU_mrt',
'UniformGridBenchmarkGPU_smagorinsky',
'UniformGridBenchmarkGPU_srt',
'UniformGridBenchmarkGPU_trt')
def generate_jobscripts(machine='pizdaint_hybrid',
exe_names=all_executables):
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
js = createJobscript(nodes=node_count,
output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
initial_dir=getcwd(),
exe_name='UniformGridBenchmarkGPU',
parameter_files=['overlap_benchmark.py'],
#exe_name='UniformGridBenchmarkGPU',
#parameter_files=['overlap_benchmark.py'],
commands=list((exe, 'overlap_benchmark.py') for exe in exe_names),
wall_time=timedelta(minutes=25),
machine=machine,
account='d105',
......@@ -93,4 +135,4 @@ if __name__ == '__main__':
print("Called without waLBerla - generating job scripts for PizDaint")
generate_jobscripts()
else:
overlap_benchmark()
single_gpu_benchmark()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment