Commit 6a61fc77 authored by Martin Bauer

UniformGridGPU: New app with AA pattern (single PDF field)

parent 0b75cce0
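
In the AA pattern, stream and collide run in place on a single PDF field by alternating two kernels: an even step that reads and writes only cell-local values, and an odd step that pulls from and pushes to neighboring cells. A minimal sketch of the resulting time loop, mirroring the generated sweeps and communication schemes used in UniformGridGPU_AA.cpp below (names taken from that app; the loop itself is illustrative, not the committed code):

// Illustrative AA time loop: two LB steps per timeloop iteration, one PDF field.
for( uint_t t = 0; t < timesteps; t += 2 )
{
   pushScheme->communicate();                          // merge the previous odd step's ghost-layer writes
   for( auto & block : *blocks ) kernelEven( &block ); // cell-local read/write, no neighbor access
   pullScheme->communicate();                          // fill ghost layers for the odd step's neighbor reads
   for( auto & block : *blocks ) kernelOdd( &block );  // pulls from and pushes to neighbor cells
}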
@@ -8,11 +8,31 @@ waLBerla_python_file_generates(UniformGridGPU.py
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
)
foreach(config srt trt mrt smagorinsky entropic )
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config}
FILES UniformGridGPU.cpp UniformGridGPU.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
endforeach()
waLBerla_python_file_generates(UniformGridGPU_AA.py
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
)
set(config "srt")
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
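# Note: CODEGEN_CFG is passed through to the generator script as ctx.config and
# selects the matching entry in its options_dict (see UniformGridGPU_AA.py below).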
#include "core/math/Random.h"
#include "domain_decomposition/StructuredBlockStorage.h"
namespace walberla {
inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks)
{
auto velField = block.getData<GhostLayerField<real_t, 3> >( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
velField->get(x, y, z, 0) = xMagnitude;
} else {
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
}
}
\ No newline at end of file
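
For reference, a minimal call site for the new helper, using the field setup from the apps below and hypothetical non-default magnitudes:

// Hypothetical usage of initShearVelocity (defaults: xMagnitude=0.1, fluctuationMagnitude=0.05)
BlockDataID velFieldID = field::addToStorage< GhostLayerField<real_t, 3> >( blocks, "vel", real_t(0), field::fzyx );
initShearVelocity( blocks, velFieldID, real_t(0.08), real_t(0.02) );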
@@ -5,12 +5,10 @@
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "lbm/field/PdfField.h"
#include "lbm/field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/vtk/VTKOutput.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
@@ -25,8 +23,9 @@
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
@@ -34,37 +33,45 @@
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
using namespace walberla;
using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
const auto Q = LatticeModel_T::Stencil::Q;
using Stencil_T = LatticeModel_T::Stencil;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = lbm::PdfField<LatticeModel_T>;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldID,
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks)
{
PdfField_T * pdfField = block.getData<PdfField_T>( pdfFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField,
auto velField = block.getData<VelocityField_T>( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0));
velField->get(x, y, z, 0) = xMagnitude;
} else {
pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0,randomReal), real_t(1.0));
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
@@ -89,16 +96,24 @@ int main( int argc, char **argv )
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() );
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
// Creating fields
auto latticeModel = LatticeModel_T( omega );
BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx );
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(99.8), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
if( initShearFlow ) {
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow");
initShearVelocity( blocks, pdfFieldCpuID );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep initializes only the interior of the domain - for push schemes to work, an initial communication is required here to fill the ghost layers
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
@@ -253,17 +268,21 @@ int main( int argc, char **argv )
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
vtkOutput->addCellDataWriter(
make_shared<lbm::VelocityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Velocity" ));
vtkOutput->addCellDataWriter( make_shared<lbm::DensityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Density" ));
vtkOutput->addBeforeFunction(
cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ));
auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction( [&]() {
cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for( auto & block : *blocks )
getterSweep( &block );
});
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
@@ -280,31 +299,41 @@ int main( int argc, char **argv )
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
for(int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
GUI gui( timeLoop, blocks, argc, argv);
lbm::connectToGui<LatticeModel_T>(gui);
gui.run();
}
else
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
......
DomainSetup
{
blocks < 2, 1, 1 >;
cellsPerBlock < 64, 64, 64 >;
blocks < 1, 1, 1 >;
cellsPerBlock < 512, 256, 256 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 10000; // time steps of one performance measurement
timesteps 2000; //10000; // time steps of one performance measurement
warmupSteps 0; // number of steps to run before measurement starts
outerIterations 1; // how many measurements to conduct
// Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
communicationScheme UniformGPUScheme_Baseline;
vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled
vtkWriteFrequency 0; //100; // write a VTK file every n'th step, if zero VTK output is disabled
cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation
timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
@@ -25,6 +25,7 @@ Parameters
omega 1.92;
initShearFlow 1;
useGui 0;
}
/*
......
import sympy as sp
import numpy as np
import pystencils as ps
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor, StreamPushTwoFieldsAccessor
from pystencils_walberla import generate_pack_info_from_kernel
from lbmpy_walberla import generate_lattice_model, generate_boundary
from pystencils_walberla import CodeGeneration, generate_sweep
from pystencils.data_types import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
@@ -16,7 +19,7 @@ sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
sweep_params = {'block_size': sweep_block_size}
options = {
options_dict = {
'srt': {
'method': 'srt',
'stencil': 'D3Q19',
@@ -49,17 +52,25 @@ options = {
}
with CodeGeneration() as ctx:
accessor = StreamPullTwoFieldsAccessor()
#accessor = StreamPushTwoFieldsAccessor()
assert not accessor.is_inplace, "This app does not work for inplace accessors"
common_options = {
'field_name': 'pdfs',
'temporary_field_name': 'pdfs_tmp',
#'kernel_type': 'collide_stream_push',
'kernel_type': accessor,
'optimization': {'cse_global': True,
'cse_pdfs': False}
}
options = options[ctx.config]
options = options_dict.get(ctx.config, options_dict['srt'])
options.update(common_options)
stencil_str = options['stencil']
q = int(stencil_str[stencil_str.find('Q')+1:])
pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
options['optimization']['symbolic_field'] = pdfs
vp = [
('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1')
@@ -84,5 +95,13 @@ with CodeGeneration() as ctx:
generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1)
getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=None)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
# communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "stencil/D3Q19.h"
#include "stencil/D3Q27.h"
#include "InitShearVelocity.h"
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
#include "UniformGridGPU_AA_MacroSetter.h"
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
using namespace walberla;
using Stencil_T = stencil::D3Q19; //TODO make generic - and determine from python script
using CommunicationStencil_T = Stencil_T;
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
using VelocityField_T = GhostLayerField< real_t, 3 >;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
cuda::selectDeviceBasedOnMpiRank();
for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
for ( auto &block : *blocks )
setterSweep( &block );
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
WALBERLA_CHECK( gpuBlockSize[2] == 1 );
using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
using cuda::communication::UniformGPUScheme;
KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
kernelEven.setOuterPriority( streamHighPriority );
kernelOdd .setOuterPriority( streamHighPriority );
auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
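// The pull scheme fills ghost layers the odd kernel reads from; the push scheme
// merges ghost-layer entries the odd kernel writes to neighbors. The even kernel
// is purely cell-local and needs no communication of its own.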
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto setupPhase = [&]() {
for ( auto &block: *blocks )
kernelEven( &block );
pullScheme->communicate();
for ( auto &block: *blocks )
kernelOdd( &block );
};
auto tearDownPhase = [&]() {
pushScheme->communicate();
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for ( auto &block : *blocks )
getterSweep( &block );
};
auto simpleOverlapTimeStep = [&]()
{
// Even
pushScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelEven.inner( &block, defaultStream );
pushScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelEven.outer( &block, defaultStream );
// Odd
pullScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelOdd.inner( &block, defaultStream );
pullScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelOdd.outer( &block, defaultStream );
};
auto normalTimeStep = [&]()
{
pushScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
pullScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
auto kernelOnlyFunc = [&]()
{
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
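// Each timeloop iteration performs one even and one odd LB step, so timesteps / 2
// iterations execute the requested number of time steps.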
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" );
std::function< void() > timeStep;
if ( timeStepStrategy == "noOverlap" )
timeStep = std::function< void() >( normalTimeStep );
else if ( timeStepStrategy == "simpleOverlap" )
timeStep = simpleOverlapTimeStep;
else if ( timeStepStrategy == "kernelOnly" )
{
WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" )
timeStep = kernelOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" );
}
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
if ( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" );
vtkOutput->addCellDataWriter( velWriter );
vtkOutput->addBeforeFunction( [&]()
{
tearDownPhase();
setupPhase();
} );
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
setupPhase();
for ( int i = 0; i < warmupSteps; ++i )
timeLoop.singleStep();
double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if ( remainingTimeLoggerFrequency > 0 )
{
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return 0;
}
import sympy as sp
import numpy as np
import pystencils as ps
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
from pystencils_walberla import generate_pack_info_from_kernel
from pystencils_walberla import CodeGeneration, generate_sweep
from pystencils.data_types import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
options_dict = {
'srt': {
'method': 'srt',
'stencil': 'D3Q19',
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': 'trt',
'stencil': 'D3Q19',
'relaxation_rate': omega,
},
'mrt': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
},
'entropic': {
'method': 'mrt3',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
'entropic': True,
},
'smagorinsky': {
'method': 'srt',
'stencil': 'D3Q19',
'smagorinsky': True,
'relaxation_rate': omega,
}
}
with CodeGeneration() as ctx:
accessors = {
'Even': AAEvenTimeStepAccessor(),
'Odd': AAOddTimeStepAccessor()
}
common_options = {
'field_name': 'pdfs',
'optimization': {'cse_global': True,
'cse_pdfs': False}
}
options = options_dict.get(ctx.config, options_dict['srt'])
options.update(common_options)
stencil_str = options['stencil']
q = int(stencil_str[stencil_str.find('Q') + 1:])
pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
options['optimization']['symbolic_field'] = pdfs
vp = [
('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1')
]
lb_method = create_lb_method(**options)
# Kernels
options_without_opt = options.copy()
del options_without_opt['optimization']
update_rules = {}
for name, accessor in accessors.items():
update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options)
update_rule = insert_fast_divisions(update_rule)
update_rule = insert_fast_sqrts(update_rule)
update_rules[name] = update_rule
generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule,
inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
varying_parameters=vp)
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1)
getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=None)
generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments)
generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments)
# communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu')
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu')
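# Note: both pack infos are derived from the odd update rule - in the AA pattern the
# even kernel accesses only cell-local PDFs, so the odd kernel's neighbor reads (pull)
# and neighbor writes (push) alone determine what has to be communicated.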