Skip to content
Snippets Groups Projects
Commit ce49cdbf authored by Markus Holzer's avatar Markus Holzer Committed by Helen Schottenhamml
Browse files

Enable single precision for benchmark cases

parent b4b0cb4b
No related merge requests found
...@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora ...@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
}; };
auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block, auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
real_t inflow_velocity) { real_t inflow_velocity, const bool constant_inflow = true) {
Cell globalCell; if (constant_inflow)
CellInterval domain = SbF->getDomainCellBB(); {
real_t h_y = domain.yMax() - domain.yMin(); Vector3< real_t > result(inflow_velocity, 0.0, 0.0);
real_t h_z = domain.zMax() - domain.zMin(); return result;
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); }
else
{
Cell globalCell;
CellInterval domain = SbF->getDomainCellBB();
real_t h_y = real_c(domain.ySize());
real_t h_z = real_c(domain.zSize());
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5); real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5); real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) * real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
(h_z / 2 + z1); (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
Vector3< real_t > result(u, 0.0, 0.0); Vector3< real_t > result(u, 0.0, 0.0);
return result; return result;
}
}; };
class AlternatingBeforeFunction class AlternatingBeforeFunction
...@@ -147,6 +155,7 @@ int main(int argc, char** argv) ...@@ -147,6 +155,7 @@ int main(int argc, char** argv)
const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05)); const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05));
const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000)); const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5)); const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
const double remainingTimeLoggerFrequency = const double remainingTimeLoggerFrequency =
parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
...@@ -204,7 +213,7 @@ int main(int argc, char** argv) ...@@ -204,7 +213,7 @@ int main(int argc, char** argv)
auto boundariesConfig = config->getOneBlock("Boundaries"); auto boundariesConfig = config->getOneBlock("Boundaries");
std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max); velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
#if defined(WALBERLA_BUILD_WITH_CUDA) #if defined(WALBERLA_BUILD_WITH_CUDA)
lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation); lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
...@@ -236,10 +245,9 @@ int main(int argc, char** argv) ...@@ -236,10 +245,9 @@ int main(int argc, char** argv)
AlternatingBeforeFunction communication(evenComm, oddComm, tracker); AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
// add LBM sweep and communication to time loop // add LBM sweep and communication to time loop
timeloop.add() << BeforeFunction(communication, "communication") timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
<< Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary"); timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary"); timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement") timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
<< Sweep(lbSweep.getSweep(tracker), "LB update rule"); << Sweep(lbSweep.getSweep(tracker), "LB update rule");
......
...@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_ ...@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_
import sympy as sp import sympy as sp
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
with CodeGeneration() as ctx: with CodeGeneration() as ctx:
data_type = "float64" if ctx.double_accuracy else "float32"
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False,
'double_precision': True if ctx.double_accuracy else False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
if ctx.cuda: if ctx.cuda:
target = 'gpu' target = 'gpu'
else: else:
......
...@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity ...@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
class Scenario: class Scenario:
def __init__(self): def __init__(self):
self.timesteps = 101 self.timesteps = 1001
self.vtkWriteFrequency = 2000 self.vtkWriteFrequency = 100
self.cells = (384, 128, 128) self.cells = (384, 128, 128)
self.blocks = (1, 1, 1) self.blocks = (1, 1, 1)
self.periodic = (0, 0, 0) self.periodic = (0, 0, 0)
self.constant_inflow = True
self.diameter_sphere = min(self.cells) // 2 self.diameter_sphere = min(self.cells) // 2
self.u_max = 0.1 self.u_max = 0.1
self.reynolds_number = 1000000 self.reynolds_number = 1000000
...@@ -38,7 +40,8 @@ class Scenario: ...@@ -38,7 +40,8 @@ class Scenario:
'omega': self.omega, 'omega': self.omega,
'u_max': self.u_max, 'u_max': self.u_max,
'reynolds_number': self.reynolds_number, 'reynolds_number': self.reynolds_number,
'diameter_sphere': self.diameter_sphere 'diameter_sphere': self.diameter_sphere,
'constant_inflow': self.constant_inflow
}, },
'Boundaries': { 'Boundaries': {
'Border': [ 'Border': [
...@@ -54,7 +57,7 @@ class Scenario: ...@@ -54,7 +57,7 @@ class Scenario:
'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2), 'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
'radius': self.diameter_sphere // 2, 'radius': self.diameter_sphere // 2,
'flag': 'NoSlip'} 'flag': 'NoSlip'}
] ],
}, },
} }
......
...@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" ) ...@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" ) waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} foreach(streaming_pattern pull push aa esotwist)
foreach(stencil d3q27) # choose from {d3q19 d3q27} foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky} foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
set(config ${stencil}_${streaming_pattern}_${collision_setup}) set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py FILE UniformGridGPU.py
...@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} ...@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h
UniformGridGPU_InfoHeader.h UniformGridGPU_InfoHeader.h
) )
...@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} ...@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
FILES UniformGridGPU.cpp FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config}) DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
# all configs are excluded from all except for pull d3q27.
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach () endforeach ()
endforeach() endforeach()
endforeach() endforeach()
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h" #include "blockforest/Initialization.h"
#include "core/Environment.h" #include "core/Environment.h"
...@@ -7,9 +29,9 @@ ...@@ -7,9 +29,9 @@
#include "cuda/AddGPUFieldToStorage.h" #include "cuda/AddGPUFieldToStorage.h"
#include "cuda/DeviceSelectMPI.h" #include "cuda/DeviceSelectMPI.h"
#include "cuda/FieldCopy.h"
#include "cuda/ParallelStreams.h" #include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h" #include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h" #include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include "field/AddToStorage.h" #include "field/AddToStorage.h"
...@@ -27,14 +49,13 @@ ...@@ -27,14 +49,13 @@
#include "timeloop/SweepTimeloop.h" #include "timeloop/SweepTimeloop.h"
#include "InitShearVelocity.h"
#include <cmath> #include <cmath>
#include "InitShearVelocity.h"
#include "UniformGridGPU_InfoHeader.h" #include "UniformGridGPU_InfoHeader.h"
using namespace walberla; using namespace walberla;
using FlagField_T = FlagField<uint8_t>; using FlagField_T = FlagField< uint8_t >;
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
...@@ -58,10 +79,10 @@ int main(int argc, char** argv) ...@@ -58,10 +79,10 @@ int main(int argc, char** argv)
Vector3< uint_t > cellsPerBlock = Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters // Reading parameters
auto parameters = config->getOneBlock("Parameters"); auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields // Creating fields
BlockDataID pdfFieldCpuID = BlockDataID pdfFieldCpuID =
...@@ -69,7 +90,8 @@ int main(int argc, char** argv) ...@@ -69,7 +90,8 @@ int main(int argc, char** argv)
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx); BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
// Initialize velocity on cpu // Initialize velocity on cpu
if( initShearFlow ){ if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow") WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldCpuID); initShearVelocity(blocks, velFieldCpuID);
} }
...@@ -91,9 +113,7 @@ int main(int argc, char** argv) ...@@ -91,9 +113,7 @@ int main(int argc, char** argv)
for (uint_t i = 0; i < 3; ++i) for (uint_t i = 0; i < 3; ++i)
{ {
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{ { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") }
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
} }
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
...@@ -117,23 +137,26 @@ int main(int argc, char** argv) ...@@ -117,23 +137,26 @@ int main(int argc, char** argv)
LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
lbSweep.setOuterPriority(streamHighPriority); lbSweep.setOuterPriority(streamHighPriority);
pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1],
gpuBlockSize[2]);
// Boundaries // Boundaries
const FlagUID fluidFlagUID( "Fluid" ); const FlagUID fluidFlagUID("Fluid");
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field"); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock( "Boundaries" ); auto boundariesConfig = config->getBlock("Boundaries");
bool boundaries = false; bool boundaries = false;
if( boundariesConfig ) if (boundariesConfig)
{ {
boundaries = true; boundaries = true;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID ); geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
} }
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
// Initial setup is the post-collision state of an even time step // Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0); auto tracker = make_shared< lbm::TimestepTracker >(0);
...@@ -143,7 +166,8 @@ int main(int argc, char** argv) ...@@ -143,7 +166,8 @@ int main(int argc, char** argv)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI); UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); auto packInfo =
make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
comm.addPackInfo(packInfo); comm.addPackInfo(packInfo);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -152,17 +176,17 @@ int main(int argc, char** argv) ...@@ -152,17 +176,17 @@ int main(int argc, char** argv)
auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){ auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.run(block, t, stream); noSlip.run(block, t, stream);
ubb.run(block, t, stream); ubb.run(block, t, stream);
}; };
auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){ auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.inner(block, t, stream); noSlip.inner(block, t, stream);
ubb.inner(block, t, stream); ubb.inner(block, t, stream);
}; };
auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){ auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.outer(block, t, stream); noSlip.outer(block, t, stream);
ubb.outer(block, t, stream); ubb.outer(block, t, stream);
}; };
...@@ -170,13 +194,15 @@ int main(int argc, char** argv) ...@@ -170,13 +194,15 @@ int main(int argc, char** argv)
auto simpleOverlapTimeStep = [&]() { auto simpleOverlapTimeStep = [&]() {
// Communicate post-collision values of previous timestep... // Communicate post-collision values of previous timestep...
comm.startCommunication(defaultStream); comm.startCommunication(defaultStream);
for (auto& block : *blocks){ for (auto& block : *blocks)
if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); {
if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream); lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
} }
comm.wait(defaultStream); comm.wait(defaultStream);
for (auto& block : *blocks){ for (auto& block : *blocks)
if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); {
if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream); lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
} }
...@@ -185,8 +211,9 @@ int main(int argc, char** argv) ...@@ -185,8 +211,9 @@ int main(int argc, char** argv)
auto normalTimeStep = [&]() { auto normalTimeStep = [&]() {
comm.communicate(defaultStream); comm.communicate(defaultStream);
for (auto& block : *blocks){ for (auto& block : *blocks)
if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); {
if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
lbSweep(&block, tracker->getCounterPlusOne(), defaultStream); lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
} }
...@@ -201,6 +228,12 @@ int main(int argc, char** argv) ...@@ -201,6 +228,12 @@ int main(int argc, char** argv)
lbSweep(&block, tracker->getCounter(), defaultStream); lbSweep(&block, tracker->getCounter(), defaultStream);
}; };
// Stream only function to test a streaming pattern without executing lbm operations inside
auto StreamOnlyFunc = [&]() {
for (auto& block : *blocks)
StreamOnlyKernel(&block, defaultStream);
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP /// /// TIME LOOP SETUP ///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -221,6 +254,13 @@ int main(int argc, char** argv) ...@@ -221,6 +254,13 @@ int main(int argc, char** argv)
comm.communicate(); comm.communicate();
timeStep = kernelOnlyFunc; timeStep = kernelOnlyFunc;
} }
else if (timeStepStrategy == "StreamOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only streaming kernel without LBM - this makes only sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
timeStep = StreamOnlyFunc;
}
else else
{ {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', " WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
...@@ -239,7 +279,7 @@ int main(int argc, char** argv) ...@@ -239,7 +279,7 @@ int main(int argc, char** argv)
vtkOutput->addCellDataWriter(velWriter); vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() { vtkOutput->addBeforeFunction([&]() {
cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID); cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
}); });
timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
} }
......
...@@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB ...@@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil from lbmpy.stencils import get_stencil
from lbmpy.updatekernels import create_stream_only_kernel
from lbmpy.fieldaccess import *
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
...@@ -44,7 +46,7 @@ options_dict = { ...@@ -44,7 +46,7 @@ options_dict = {
}, },
'mrt-overrelax': { 'mrt-overrelax': {
'method': 'mrt', 'method': 'mrt',
'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1], 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
}, },
'cumulant': { 'cumulant': {
'method': 'cumulant', 'method': 'cumulant',
...@@ -59,7 +61,7 @@ options_dict = { ...@@ -59,7 +61,7 @@ options_dict = {
'entropic': { 'entropic': {
'method': 'mrt', 'method': 'mrt',
'compressible': True, 'compressible': True,
'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free], 'relaxation_rates': [omega, omega] + [omega_free] * 6,
'entropic': True, 'entropic': True,
}, },
'smagorinsky': { 'smagorinsky': {
...@@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs}; ...@@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs};
optimize = True optimize = True
with CodeGeneration() as ctx: with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
config_tokens = ctx.config.split('_') config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3 assert len(config_tokens) >= 3
...@@ -99,7 +102,8 @@ with CodeGeneration() as ctx: ...@@ -99,7 +102,8 @@ with CodeGeneration() as ctx:
q = len(stencil) q = len(stencil)
dim = len(stencil[0]) dim = len(stencil[0])
assert dim == 3, "This app supports only three-dimensional stencils" assert dim == 3, "This app supports only three-dimensional stencils"
pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx') pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]",
layout='fzyx')
common_options = { common_options = {
'stencil': stencil, 'stencil': stencil,
...@@ -110,7 +114,7 @@ with CodeGeneration() as ctx: ...@@ -110,7 +114,7 @@ with CodeGeneration() as ctx:
'cse_pdfs': False, 'cse_pdfs': False,
'symbolic_field': pdfs, 'symbolic_field': pdfs,
'field_layout': 'fzyx', 'field_layout': 'fzyx',
'gpu_indexing_params': gpu_indexing_params, 'gpu_indexing_params': gpu_indexing_params
} }
} }
...@@ -128,6 +132,14 @@ with CodeGeneration() as ctx: ...@@ -128,6 +132,14 @@ with CodeGeneration() as ctx:
('int32_t', 'cudaBlockSize2') ('int32_t', 'cudaBlockSize2')
] ]
# Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
# is_inplace is set to False to ensure that the streaming is done with src and dst field.
# If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
accessor = CollideOnlyInplaceAccessor()
accessor.is_inplace = False
field_swaps_stream_only = [(pdfs, pdfs_tmp)]
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
# LB Sweep # LB Sweep
collision_rule = create_lb_collision_rule(**options) collision_rule = create_lb_collision_rule(**options)
...@@ -148,6 +160,10 @@ with CodeGeneration() as ctx: ...@@ -148,6 +160,10 @@ with CodeGeneration() as ctx:
previous_timestep=Timestep.EVEN) previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu') generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
# Stream only kernel
generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target='gpu')
# Boundaries # Boundaries
noslip = NoSlip() noslip = NoSlip()
ubb = UBB((0.05, 0, 0)) ubb = UBB((0.05, 0, 0))
......
...@@ -9,7 +9,7 @@ import pystencils as ps ...@@ -9,7 +9,7 @@ import pystencils as ps
from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor
from lbmpy.relaxationrates import relaxation_rate_scaling from lbmpy.relaxationrates import relaxation_rate_scaling
from lbmpy.stencils import get_stencil from lbmpy.stencils import get_stencil
from lbmpy.updatekernels import create_lbm_kernel, create_stream_pull_only_kernel from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
from pystencils import AssignmentCollection, create_kernel from pystencils import AssignmentCollection, create_kernel
from pystencils.astnodes import SympyAssignment from pystencils.astnodes import SympyAssignment
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers
...@@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as ...@@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as
if not stencil_name: if not stencil_name:
raise ValueError("lb_method uses a stencil that is not supported in waLBerla") raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27" communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27"
is_float = not generation_context.double_accuracy is_float = not generation_context.double_accuracy
dtype_string = "float32" if is_float else "float64" dtype_string = "float32" if is_float else "float64"
...@@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field ...@@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field
collide_ast.function_name = 'kernel_collide' collide_ast.function_name = 'kernel_collide'
collide_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] collide_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
stream_update_rule = create_stream_pull_only_kernel(lb_method.stencil, None, 'pdfs', 'pdfs_tmp', field_layout, stream_update_rule = create_stream_only_kernel(lb_method.stencil, src_field, dst_field,
dtype) accessor=StreamPullTwoFieldsAccessor())
stream_ast = create_kernel(stream_update_rule, **create_kernel_params) stream_ast = create_kernel(stream_update_rule, **create_kernel_params)
stream_ast.function_name = 'kernel_stream' stream_ast.function_name = 'kernel_stream'
stream_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one'] stream_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment