Commit d48349d8 authored by Dominik Thoennes

Merge branch 'master' into thoennes/add-oneapi-22

parents b7838d4b 03b9f95f
Pipeline #33161 failed with stages in 247 minutes and 23 seconds
@@ -1011,11 +1011,18 @@ endif()
option ( WALBERLA_THREAD_SAFE_LOGGING "Enables/Disables thread-safe logging" ON )
if ( WALBERLA_BUILD_WITH_OPENMP )
if( APPLE AND EXISTS /opt/local/lib/libomp AND EXISTS /opt/local/include/libomp ) # find libomp from MacPorts
set( CMAKE_FRAMEWORK_PATH /opt/local/lib/libomp )
set( CMAKE_INCLUDE_PATH /opt/local/include/libomp )
endif()
find_package( OpenMP )
if (OpenMP_FOUND)
add_flag ( CMAKE_C_FLAGS "${OpenMP_C_FLAGS}" )
add_flag ( CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}" )
list ( APPEND SERVICE_LIBS ${OpenMP_CXX_LIBRARIES} )
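# Note: OpenMP_CXX_INCLUDE_DIRS is typically only populated when omp.h lives outside
# the compiler's default search path (e.g. AppleClang with libomp from MacPorts, as
# hinted above via CMAKE_FRAMEWORK_PATH / CMAKE_INCLUDE_PATH).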
if( OpenMP_CXX_INCLUDE_DIRS )
include_directories( ${OpenMP_CXX_INCLUDE_DIRS} )
endif()
else()
#workarounds
if ( WALBERLA_CXX_COMPILER_IS_NEC )
......
from pystencils.field import fields
from lbmpy.advanced_streaming.utility import get_timesteps, Timestep
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil
from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
......
@@ -86,7 +86,6 @@ using FlagField_T = FlagField< flag_t >;
#if defined(WALBERLA_BUILD_WITH_CUDA)
typedef cuda::GPUField< real_t > GPUField;
#endif
// using CommScheme_T = cuda::communication::UniformGPUScheme<stencil::D2Q9>;
int main(int argc, char** argv)
{
@@ -185,7 +184,7 @@ int main(int argc, char** argv)
auto Comm_velocity_based_distributions =
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
auto generatedPackInfo_velocity_based_distributions =
make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
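// The phase-field pack info is registered with the same GPU communication scheme as the
// velocity-based distributions, so a single exchange per time step updates both fields.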
@@ -193,7 +192,7 @@ int main(int argc, char** argv)
auto Comm_phase_field_distributions =
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
auto generatedPackInfo_phase_field_distributions =
make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
@@ -202,14 +201,14 @@ int main(int argc, char** argv)
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
auto generatedPackInfo_velocity_based_distributions =
make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field);
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
auto generatedPackInfo_phase_field_distributions =
make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field);
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
#endif
......
@@ -5,11 +5,12 @@ from pystencils import AssignmentCollection
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.stencils import get_stencil
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field
from lbmpy_walberla import generate_lb_pack_info
from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
initializer_kernel_hydro_lb, interface_tracking_force, \
hydrodynamic_force, get_collision_assignments_hydro
hydrodynamic_force, get_collision_assignments_hydro, get_collision_assignments_phase
from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
@@ -52,6 +53,7 @@ w_c = 1.0 / (0.5 + (3.0 * M))
u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
# phase-field
C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
C_tmp = fields(f"phase_field_tmp: [{dimensions}D]", layout='fzyx')
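# C_tmp is a temporary write buffer: the phase-field LB step writes the updated phase
# field into it, and it is swapped back with C via field_swaps during sweep generation.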
# phase-field distribution functions
h = fields(f"lb_phase_field({q_phase}): [{dimensions}D]", layout='fzyx')
@@ -88,32 +90,26 @@ h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W)
g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W)]
force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W, fd_stencil=get_stencil("D3Q27"))]
force_model_h = MultiphaseForceModel(force=force_h)
force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force)
force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force,
fd_stencil=get_stencil("D3Q27"))
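# fd_stencil=get_stencil("D3Q27") evaluates the finite-difference gradient stencils on
# the full 27-point neighbourhood, presumably for improved isotropy of the interface forces.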
h_tmp_symbol_list = [h_tmp.center(i) for i, _ in enumerate(stencil_phase)]
sum_h = np.sum(h_tmp_symbol_list[:])
force_model_g = MultiphaseForceModel(force=force_g, rho=density)
####################
# LBM UPDATE RULES #
####################
method_phase.set_force_model(force_model_h)
phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
velocity_input=u,
output={'density': C_tmp},
force_model=force_model_h,
symbolic_fields={"symbolic_field": h,
"symbolic_temporary_field": h_tmp},
kernel_type='stream_pull_collide')
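# output={'density': C_tmp} makes the generated kernel write the updated phase field
# directly, instead of summing the h_tmp populations in a separate assignment.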
phase_field_LB_step = create_lb_update_rule(lb_method=method_phase,
velocity_input=u,
compressible=True,
optimization={"symbolic_field": h,
"symbolic_temporary_field": h_tmp},
kernel_type='stream_pull_collide')
phase_field_LB_step.set_main_assignments_from_dict({**phase_field_LB_step.main_assignments_dict, **{C.center: sum_h}})
phase_field_LB_step = AssignmentCollection(main_assignments=phase_field_LB_step.main_assignments,
subexpressions=phase_field_LB_step.subexpressions)
phase_field_LB_step = sympy_cse(phase_field_LB_step)
# ---------------------------------------------------------------------------------------------------------
@@ -121,18 +117,12 @@ phase_field_LB_step = sympy_cse(phase_field_LB_step)
hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
density=density,
velocity_input=u,
force=force_g,
sub_iterations=1,
force_model=force_model_g,
sub_iterations=2,
symbolic_fields={"symbolic_field": g,
"symbolic_temporary_field": g_tmp},
kernel_type='collide_stream_push')
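# With kernel_type='collide_stream_push' the sweep already performs the streaming,
# so no separate stream-only update rule is needed for the hydrodynamic field.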
# streaming of the hydrodynamic distribution
stream_hydro = create_lb_update_rule(stencil=stencil_hydro,
optimization={"symbolic_field": g,
"symbolic_temporary_field": g_tmp},
kernel_type='stream_pull_only')
###################
# GENERATE SWEEPS #
###################
@@ -161,7 +151,7 @@ with CodeGeneration() as ctx:
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp)],
field_swaps=[(h, h_tmp), (C, C_tmp)],
inner_outer_split=True,
cpu_vectorize_info=cpu_vec)
@@ -171,12 +161,13 @@ with CodeGeneration() as ctx:
cpu_vectorize_info=cpu_vec)
# communication
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
phase_field_LB_step.main_assignments, target='cpu')
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
hydro_LB_step.all_assignments, target='cpu', kind='pull')
generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
hydro_LB_step.all_assignments, target='cpu', kind='push')
generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
streaming_pattern='pull', target='cpu')
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target='cpu')
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='cpu')
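# generate_lb_pack_info places the generated PDF pack infos in namespace lbm (hence the
# lbm::PackInfo_* types in the application code), while generate_pack_info_for_field
# packs the plain phase-field scalar in namespace pystencils.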
ctx.write_file("GenDefines.h", info_header)
@@ -187,7 +178,7 @@ with CodeGeneration() as ctx:
g_updates, target='gpu')
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp)],
field_swaps=[(h, h_tmp), (C, C_tmp)],
inner_outer_split=True,
target='gpu',
gpu_indexing_params=sweep_params,
@@ -200,12 +191,13 @@ with CodeGeneration() as ctx:
gpu_indexing_params=sweep_params,
varying_parameters=vp)
# communication
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
phase_field_LB_step.main_assignments, target='gpu')
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
hydro_LB_step.all_assignments, target='gpu', kind='pull')
generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
hydro_LB_step.all_assignments, target='gpu', kind='push')
generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
streaming_pattern='pull', target='gpu')
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target='gpu')
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='gpu')
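# Mirrors the CPU code path above, with the same pack infos generated for the GPU target.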
ctx.write_file("GenDefines.h", info_header)
......
@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4
entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt
cumulant cumulant_d3q27
srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt)
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
UniformGridGPU_Defines.h
)
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config})
set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach ()
foreach (config srt trt mrt smagorinsky entropic)
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config}
FILE UniformGridGPU_AA.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
UniformGridGPU_AA_Defines.h
)
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config})
set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach ()
foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
foreach(stencil d3q27) # choose from {d3q19 d3q27}
foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
set(config ${stencil}_${streaming_pattern}_${collision_setup})
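# yields e.g. config = d3q27_aa_srt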
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_InfoHeader.h
)
waLBerla_add_executable(NAME UniformGridGPU_${config}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach ()
endforeach()
endforeach()
\ No newline at end of file
#include "blockforest/Initialization.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/math/Random.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "cuda/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/communication/PackInfo.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm/inplace_streaming/TimestepTracker.h"
#include "timeloop/SweepTimeloop.h"
#include "InitShearVelocity.h"
#include <cmath>

using namespace walberla;

using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
const auto Q = LatticeModel_T::Stencil::Q;
using Stencil_T = LatticeModel_T::Stencil;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
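// Shear profile: the x-velocity is set to +xMagnitude in the upper half of the domain
// and -xMagnitude in the lower half, plus a small random perturbation in the z-component.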
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks)
{
auto velField = block.getData<VelocityField_T>( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
velField->get(x, y, z, 0) = xMagnitude;
} else {
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
}
#include "UniformGridGPU_InfoHeader.h"
using FlagField_T = FlagField<uint8_t>;
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
cuda::selectDeviceBasedOnMpiRank();
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto config = *cfg;
logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
BlockDataID pdfFieldCpuID =
field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_t(std::nan("")), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
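// Initializing the PDFs with NaN (rather than zero) makes any cell that the setter
// sweep or communication fails to touch immediately visible in the output.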
// Initialize velocity on cpu
if( initShearFlow ){
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldCpuID);
}
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true);
// Velocity field is copied to the GPU
BlockDataID velFieldGpuID =
cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
if( timeStepStrategy != "kernelOnlyNoInit")
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID);
// Set up initial PDF values
for (auto& block : *blocks)
setterSweep(&block);
Vector3< int > innerOuterSplit =
parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
for (uint_t i = 0; i < 3; ++i)
{
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
}
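// Guard: each block must keep a non-empty inner region after innerOuterSplit cells are
// peeled off both sides, otherwise the inner/outer overlap of compute and communication fails.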
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
Vector3< int32_t > gpuBlockSize =
parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
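// Query the valid CUDA stream priority range; the outer part of the LB sweep is later
// assigned the high-priority stream (see setOuterPriority below).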
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// LB SWEEPS AND BOUNDARY HANDLING ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
using LbSweep = lbm::UniformGridGPU_LbKernel;
using PackInfoEven = lbm::UniformGridGPU_PackInfoEven;
using PackInfoOdd = lbm::UniformGridGPU_PackInfoOdd;
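// In-place (AA-pattern) streaming stores the PDFs differently on even and odd time
// steps, so separate pack infos are generated and selected per step.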
using cuda::communication::UniformGPUScheme;
LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
lbSweep.setOuterPriority(streamHighPriority);
// Boundaries
const FlagUID fluidFlagUID( "Fluid" );
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock( "Boundaries" );
bool disableBoundaries = true;
bool boundaries = false;
if( boundariesConfig )
{
disableBoundaries = false;
boundaries = true;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
}
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
// Communication setup
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
CommunicationSchemeType communicationScheme;
if( communicationSchemeStr == "GPUPackInfo_Baseline")
communicationScheme = GPUPackInfo_Baseline;
else if (communicationSchemeStr == "GPUPackInfo_Streams")
communicationScheme = GPUPackInfo_Streams;
else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
communicationScheme = UniformGPUScheme_Baseline;
else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
for(uint_t i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
// Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0);
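// The tracker toggles between the even and odd variants of kernels and pack infos each
// time step; it starts at 0 to match the even-step initial state noted above.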
WALBERLA_CHECK(gpuBlockSize[2] == 1);
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
gpuBlockSize[0], gpuBlockSize[1],
Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
lbKernel.setOuterPriority( streamHighPriority );
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );