diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp index 68aab5882901317263c04045b69ac0a6de403adc..48f029deedca7e127d170206c35b778920b5d6ee 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp @@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora }; auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block, - real_t inflow_velocity) { - Cell globalCell; - CellInterval domain = SbF->getDomainCellBB(); - real_t h_y = domain.yMax() - domain.yMin(); - real_t h_z = domain.zMax() - domain.zMin(); - SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); + real_t inflow_velocity, const bool constant_inflow = true) { + if (constant_inflow) + { + Vector3< real_t > result(inflow_velocity, 0.0, 0.0); + return result; + } + else + { + Cell globalCell; + CellInterval domain = SbF->getDomainCellBB(); + real_t h_y = real_c(domain.ySize()); + real_t h_z = real_c(domain.zSize()); + SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); - real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5); - real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5); + real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5)); + real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5)); - real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) * - (h_z / 2 + z1); + real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) * + (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1); - Vector3< real_t > result(u, 0.0, 0.0); - return result; + Vector3< real_t > result(u, 0.0, 0.0); + return result; + } }; class AlternatingBeforeFunction @@ -147,6 +155,7 @@ int main(int argc, char** argv) const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05)); const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000)); const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5)); + const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true); const double remainingTimeLoggerFrequency = parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds @@ -204,7 +213,7 @@ int main(int argc, char** argv) auto boundariesConfig = config->getOneBlock("Boundaries"); std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > - velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max); + velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow); #if defined(WALBERLA_BUILD_WITH_CUDA) lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation); @@ -236,10 +245,9 @@ int main(int argc, char** argv) AlternatingBeforeFunction communication(evenComm, oddComm, tracker); // add LBM sweep and communication to time loop - timeloop.add() << BeforeFunction(communication, "communication") - << Sweep(noSlip.getSweep(tracker), "noSlip boundary"); + timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary"); timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary"); - timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary"); + timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary"); timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement") << Sweep(lbSweep.getSweep(tracker), "LB update rule"); diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py index 42b9f33d24cdbc2e060a313033e0b2458eded091..c565baa589f21dc6681bdd40826aef643b0afd2a 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py @@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_ import sympy as sp -stencil = get_stencil("D3Q27") -q = len(stencil) -dim = len(stencil[0]) -streaming_pattern = 'esotwist' -timesteps = get_timesteps(streaming_pattern) - -pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx') -omega = sp.Symbol("omega") -u_max = sp.Symbol("u_max") - -output = { - 'density': density_field, - 'velocity': velocity_field -} - -opt = {'symbolic_field': pdfs, - 'cse_global': False, - 'cse_pdfs': False} - -method_params = {'method': 'cumulant', - 'stencil': stencil, - 'relaxation_rate': omega, - 'galilean_correction': True, - 'field_name': 'pdfs', - 'streaming_pattern': streaming_pattern, - 'output': output, - 'optimization': opt} - -collision_rule = create_lb_collision_rule(**method_params) -lb_method = collision_rule.method - -# getter & setter -setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, - pdfs=pdfs, density=1.0, - streaming_pattern=streaming_pattern, - previous_timestep=timesteps[0]) - -# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True} - -stencil_typedefs = {'Stencil_T': stencil} -field_typedefs = {'PdfField_T': pdfs, - 'VelocityField_T': velocity_field, - 'ScalarField_T': density_field} - with CodeGeneration() as ctx: + data_type = "float64" if ctx.double_accuracy else "float32" + stencil = get_stencil("D3Q27") + q = len(stencil) + dim = len(stencil[0]) + streaming_pattern = 'esotwist' + timesteps = get_timesteps(streaming_pattern) + + pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]", + layout='fzyx') + omega = sp.Symbol("omega") + u_max = sp.Symbol("u_max") + + output = { + 'density': density_field, + 'velocity': velocity_field + } + + opt = {'symbolic_field': pdfs, + 'cse_global': False, + 'cse_pdfs': False, + 'double_precision': True if ctx.double_accuracy else False} + + method_params = {'method': 'cumulant', + 'stencil': stencil, + 'relaxation_rate': omega, + 'galilean_correction': True, + 'field_name': 'pdfs', + 'streaming_pattern': streaming_pattern, + 'output': output, + 'optimization': opt} + + collision_rule = create_lb_collision_rule(**method_params) + lb_method = collision_rule.method + + # getter & setter + setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector, + pdfs=pdfs, density=1.0, + streaming_pattern=streaming_pattern, + previous_timestep=timesteps[0]) + + # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True} + + stencil_typedefs = {'Stencil_T': stencil} + field_typedefs = {'PdfField_T': pdfs, + 'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + if ctx.cuda: target = 'gpu' else: diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py index 6d046735fd3867cb83670e13090d43f0e52166d4..41d38d16218d97a633ccca62c951356b16c2f446 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py @@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity class Scenario: def __init__(self): - self.timesteps = 101 - self.vtkWriteFrequency = 2000 + self.timesteps = 1001 + self.vtkWriteFrequency = 100 self.cells = (384, 128, 128) self.blocks = (1, 1, 1) self.periodic = (0, 0, 0) + self.constant_inflow = True + self.diameter_sphere = min(self.cells) // 2 self.u_max = 0.1 self.reynolds_number = 1000000 @@ -38,7 +40,8 @@ class Scenario: 'omega': self.omega, 'u_max': self.u_max, 'reynolds_number': self.reynolds_number, - 'diameter_sphere': self.diameter_sphere + 'diameter_sphere': self.diameter_sphere, + 'constant_inflow': self.constant_inflow }, 'Boundaries': { 'Border': [ @@ -54,7 +57,7 @@ class Scenario: 'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2), 'radius': self.diameter_sphere // 2, 'flag': 'NoSlip'} - ] + ], }, } diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index b3ef93bb589a84f807ac00464fa84cd141c1eb0c..a3a430204b232de43d07819e1a1647d6e444b999 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" ) waLBerla_link_files_to_builddir( "simulation_setup" ) -foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} - foreach(stencil d3q27) # choose from {d3q19 d3q27} - foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky} +foreach(streaming_pattern pull push aa esotwist) + foreach(stencil d3q19 d3q27) + foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax) set(config ${stencil}_${streaming_pattern}_${collision_setup}) waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} FILE UniformGridGPU.py @@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h UniformGridGPU_UBB.cu UniformGridGPU_UBB.h UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h + UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h UniformGridGPU_InfoHeader.h ) @@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} FILES UniformGridGPU.cpp DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config}) set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) + + # all configs are excluded from all except for pull d3q27. + if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") + set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE) + set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE) + else() + set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE) + set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE) + endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") + endforeach () endforeach() endforeach() \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index bdcffe3ad3a8c1992d7746f4e40fd34fc9325c4a..b12c534a4a1fbe05fb58fd4c4571263c5e6d6ff9 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -1,3 +1,25 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGridGPU.cpp +//! \author Martin Bauer <martin.bauer@fau.de> +//! \author Frederik Hennig <frederik.hennig@fau.de> +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + #include "blockforest/Initialization.h" #include "core/Environment.h" @@ -7,9 +29,9 @@ #include "cuda/AddGPUFieldToStorage.h" #include "cuda/DeviceSelectMPI.h" +#include "cuda/FieldCopy.h" #include "cuda/ParallelStreams.h" #include "cuda/communication/UniformGPUScheme.h" -#include "cuda/FieldCopy.h" #include "cuda/lbm/CombinedInPlaceGpuPackInfo.h" #include "field/AddToStorage.h" @@ -27,14 +49,13 @@ #include "timeloop/SweepTimeloop.h" -#include "InitShearVelocity.h" - #include <cmath> +#include "InitShearVelocity.h" #include "UniformGridGPU_InfoHeader.h" using namespace walberla; -using FlagField_T = FlagField<uint8_t>; +using FlagField_T = FlagField< uint8_t >; int main(int argc, char** argv) { @@ -58,10 +79,10 @@ int main(int argc, char** argv) Vector3< uint_t > cellsPerBlock = config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); // Reading parameters - auto parameters = config->getOneBlock("Parameters"); - const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); - const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); - const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); // Creating fields BlockDataID pdfFieldCpuID = @@ -69,7 +90,8 @@ int main(int argc, char** argv) BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx); // Initialize velocity on cpu - if( initShearFlow ){ + if (initShearFlow) + { WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow") initShearVelocity(blocks, velFieldCpuID); } @@ -91,9 +113,7 @@ int main(int argc, char** argv) for (uint_t i = 0; i < 3; ++i) { if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) - { - WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") - } + { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") } } Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); @@ -117,23 +137,26 @@ int main(int argc, char** argv) LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); lbSweep.setOuterPriority(streamHighPriority); + pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], + gpuBlockSize[2]); + // Boundaries - const FlagUID fluidFlagUID( "Fluid" ); - BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field"); - auto boundariesConfig = config->getBlock( "Boundaries" ); - bool boundaries = false; - if( boundariesConfig ) + const FlagUID fluidFlagUID("Fluid"); + BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); + auto boundariesConfig = config->getBlock("Boundaries"); + bool boundaries = false; + if (boundariesConfig) { boundaries = true; - geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); - geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID ); + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); } lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); - noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); + noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); - ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); + ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); // Initial setup is the post-collision state of an even time step auto tracker = make_shared< lbm::TimestepTracker >(0); @@ -143,7 +166,8 @@ int main(int argc, char** argv) ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI); - auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); + auto packInfo = + make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); comm.addPackInfo(packInfo); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -152,17 +176,17 @@ int main(int argc, char** argv) auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); - auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){ + auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) { noSlip.run(block, t, stream); ubb.run(block, t, stream); }; - auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){ + auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) { noSlip.inner(block, t, stream); ubb.inner(block, t, stream); }; - auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){ + auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) { noSlip.outer(block, t, stream); ubb.outer(block, t, stream); }; @@ -170,13 +194,15 @@ int main(int argc, char** argv) auto simpleOverlapTimeStep = [&]() { // Communicate post-collision values of previous timestep... comm.startCommunication(defaultStream); - for (auto& block : *blocks){ - if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); + for (auto& block : *blocks) + { + if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream); } comm.wait(defaultStream); - for (auto& block : *blocks){ - if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); + for (auto& block : *blocks) + { + if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream); } @@ -185,8 +211,9 @@ int main(int argc, char** argv) auto normalTimeStep = [&]() { comm.communicate(defaultStream); - for (auto& block : *blocks){ - if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); + for (auto& block : *blocks) + { + if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); lbSweep(&block, tracker->getCounterPlusOne(), defaultStream); } @@ -201,6 +228,12 @@ int main(int argc, char** argv) lbSweep(&block, tracker->getCounter(), defaultStream); }; + // Stream only function to test a streaming pattern without executing lbm operations inside + auto StreamOnlyFunc = [&]() { + for (auto& block : *blocks) + StreamOnlyKernel(&block, defaultStream); + }; + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// TIME LOOP SETUP /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -221,6 +254,13 @@ int main(int argc, char** argv) comm.communicate(); timeStep = kernelOnlyFunc; } + else if (timeStepStrategy == "StreamOnly") + { + WALBERLA_LOG_INFO_ON_ROOT( + "Running only streaming kernel without LBM - this makes only sense for benchmarking!") + // Run initial communication once to provide any missing stream-in populations + timeStep = StreamOnlyFunc; + } else { WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', " @@ -239,7 +279,7 @@ int main(int argc, char** argv) vtkOutput->addCellDataWriter(velWriter); vtkOutput->addBeforeFunction([&]() { - cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); + cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); }); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index 7d3c8e7f21e602128564f4ea8e22a119e7080989..b615be83289620e218ce160d57f7588ba0886647 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import create_lb_collision_rule from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from lbmpy.stencils import get_stencil +from lbmpy.updatekernels import create_stream_only_kernel +from lbmpy.fieldaccess import * from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary @@ -44,7 +46,7 @@ options_dict = { }, 'mrt-overrelax': { 'method': 'mrt', - 'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1], + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], }, 'cumulant': { 'method': 'cumulant', @@ -59,7 +61,7 @@ options_dict = { 'entropic': { 'method': 'mrt', 'compressible': True, - 'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free], + 'relaxation_rates': [omega, omega] + [omega_free] * 6, 'entropic': True, }, 'smagorinsky': { @@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs}; optimize = True with CodeGeneration() as ctx: + field_type = "float64" if ctx.double_accuracy else "float32" config_tokens = ctx.config.split('_') assert len(config_tokens) >= 3 @@ -99,7 +102,8 @@ with CodeGeneration() as ctx: q = len(stencil) dim = len(stencil[0]) assert dim == 3, "This app supports only three-dimensional stencils" - pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx') + pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]", + layout='fzyx') common_options = { 'stencil': stencil, @@ -110,7 +114,7 @@ with CodeGeneration() as ctx: 'cse_pdfs': False, 'symbolic_field': pdfs, 'field_layout': 'fzyx', - 'gpu_indexing_params': gpu_indexing_params, + 'gpu_indexing_params': gpu_indexing_params } } @@ -128,6 +132,14 @@ with CodeGeneration() as ctx: ('int32_t', 'cudaBlockSize2') ] + # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM. + # is_inplace is set to False to ensure that the streaming is done with src and dst field. + # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense. + accessor = CollideOnlyInplaceAccessor() + accessor.is_inplace = False + field_swaps_stream_only = [(pdfs, pdfs_tmp)] + stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor) + # LB Sweep collision_rule = create_lb_collision_rule(**options) @@ -148,6 +160,10 @@ with CodeGeneration() as ctx: previous_timestep=Timestep.EVEN) generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu') + # Stream only kernel + generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, + gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target='gpu') + # Boundaries noslip = NoSlip() ubb = UBB((0.05, 0, 0)) diff --git a/python/lbmpy_walberla/walberla_lbm_generation.py b/python/lbmpy_walberla/walberla_lbm_generation.py index 936fcae85ccaff437f97cd28c8e4bf5f74c30cb1..21463f79ca7df3e1fc56c63f6025da7291fe1cd3 100644 --- a/python/lbmpy_walberla/walberla_lbm_generation.py +++ b/python/lbmpy_walberla/walberla_lbm_generation.py @@ -9,7 +9,7 @@ import pystencils as ps from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor from lbmpy.relaxationrates import relaxation_rate_scaling from lbmpy.stencils import get_stencil -from lbmpy.updatekernels import create_lbm_kernel, create_stream_pull_only_kernel +from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel from pystencils import AssignmentCollection, create_kernel from pystencils.astnodes import SympyAssignment from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers @@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as if not stencil_name: raise ValueError("lb_method uses a stencil that is not supported in waLBerla") - communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27" is_float = not generation_context.double_accuracy dtype_string = "float32" if is_float else "float64" @@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field collide_ast.function_name = 'kernel_collide' collide_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] - stream_update_rule = create_stream_pull_only_kernel(lb_method.stencil, None, 'pdfs', 'pdfs_tmp', field_layout, - dtype) + stream_update_rule = create_stream_only_kernel(lb_method.stencil, src_field, dst_field, + accessor=StreamPullTwoFieldsAccessor()) stream_ast = create_kernel(stream_update_rule, **create_kernel_params) stream_ast.function_name = 'kernel_stream' stream_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']