Commit e33c19be authored by Martin Bauer
Browse files

UniformGridGPU more configurations

parent 2ad061e8
......@@ -13,7 +13,7 @@ waLBerla_python_file_generates(UniformGridGPU.py
UniformGridGPU_Defines.h
)
foreach(config srt trt mrt smagorinsky entropic )
foreach(config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt)
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config}
FILES UniformGridGPU.cpp UniformGridGPU.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
......
......@@ -95,26 +95,32 @@ int main( int argc, char **argv )
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(99.8), field::fzyx);
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
if( initShearFlow ) {
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow");
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
if( timeStepStrategy != "kernelOnlyNoInit")
{
if ( initShearFlow )
{
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
}
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
......@@ -165,7 +171,9 @@ int main( int argc, char **argv )
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
WALBERLA_CHECK(gpuBlockSize[2] == 1);
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1],
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
gpuBlockSize[0], gpuBlockSize[1],
Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
lbKernel.setOuterPriority( streamHighPriority );
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
......@@ -252,7 +260,6 @@ int main( int argc, char **argv )
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
std::function<void()> timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function<void()>( normalTimeStep );
......@@ -260,7 +267,7 @@ int main( int argc, char **argv )
timeStep = std::function<void()>( overlapTimeStep );
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly") {
else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
}
......
......@@ -12,6 +12,7 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
omega_fill = sp.symbols("omega_:10")
compile_time_block_size = False
if compile_time_block_size:
......@@ -40,6 +41,11 @@ options_dict = {
'stencil': 'D3Q19',
'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
},
'mrt_full': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], omega_fill[3], omega_fill[4], omega_fill[5]],
},
'entropic': {
'method': 'mrt3',
'stencil': 'D3Q19',
......@@ -47,6 +53,13 @@ options_dict = {
'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
'entropic': True,
},
'entropic_kbc_n4': {
'method': 'trt-kbc-n4',
'stencil': 'D3Q27',
'compressible': True,
'relaxation_rates': [omega, sp.Symbol("omega_free")],
'entropic': True,
},
'smagorinsky': {
'method': 'srt',
'stencil': 'D3Q19',
......@@ -76,8 +89,19 @@ with CodeGeneration() as ctx:
'optimization': {'cse_global': True,
'cse_pdfs': False}
}
options = options_dict.get(ctx.config, options_dict['srt'])
config_name = ctx.config
noopt = False
if config_name.endswith("_noopt"):
noopt = True
config_name = config_name[:-len("_noopt")]
options = options_dict[config_name]
options.update(common_options)
options = options.copy()
if noopt:
options['optimization']['cse_global'] = False
options['optimization']['cse_pdfs'] = False
stencil_str = options['stencil']
q = int(stencil_str[stencil_str.find('Q')+1:])
......@@ -85,14 +109,22 @@ with CodeGeneration() as ctx:
options['optimization']['symbolic_field'] = pdfs
vp = [
('double', 'omega_0'),
('double', 'omega_1'),
('double', 'omega_2'),
('double', 'omega_3'),
('double', 'omega_4'),
('double', 'omega_5'),
('double', 'omega_6'),
('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1')
('int32_t', 'cudaBlockSize1'),
]
lb_method = create_lb_method(**options)
update_rule = create_lb_update_rule(lb_method=lb_method, **options)
update_rule = insert_fast_divisions(update_rule)
update_rule = insert_fast_sqrts(update_rule)
if not noopt:
update_rule = insert_fast_divisions(update_rule)
update_rule = insert_fast_sqrts(update_rule)
# CPU lattice model - required for macroscopic value computation, VTK output etc.
options_without_opt = options.copy()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment