Commit ce49cdbf authored by Markus Holzer's avatar Markus Holzer Committed by Helen Schottenhamml
Browse files

Enable single precision for benchmark cases

parent b4b0cb4b
...@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora ...@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
}; };
auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block, auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
real_t inflow_velocity) { real_t inflow_velocity, const bool constant_inflow = true) {
Cell globalCell; if (constant_inflow)
CellInterval domain = SbF->getDomainCellBB(); {
real_t h_y = domain.yMax() - domain.yMin(); Vector3< real_t > result(inflow_velocity, 0.0, 0.0);
real_t h_z = domain.zMax() - domain.zMin(); return result;
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos); }
else
{
Cell globalCell;
CellInterval domain = SbF->getDomainCellBB();
real_t h_y = real_c(domain.ySize());
real_t h_z = real_c(domain.zSize());
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5); real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5); real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) * real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
(h_z / 2 + z1); (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
Vector3< real_t > result(u, 0.0, 0.0); Vector3< real_t > result(u, 0.0, 0.0);
return result; return result;
}
}; };
class AlternatingBeforeFunction class AlternatingBeforeFunction
...@@ -147,6 +155,7 @@ int main(int argc, char** argv) ...@@ -147,6 +155,7 @@ int main(int argc, char** argv)
const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05)); const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05));
const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000)); const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5)); const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
const double remainingTimeLoggerFrequency = const double remainingTimeLoggerFrequency =
parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
...@@ -204,7 +213,7 @@ int main(int argc, char** argv) ...@@ -204,7 +213,7 @@ int main(int argc, char** argv)
auto boundariesConfig = config->getOneBlock("Boundaries"); auto boundariesConfig = config->getOneBlock("Boundaries");
std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) > std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max); velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
#if defined(WALBERLA_BUILD_WITH_CUDA) #if defined(WALBERLA_BUILD_WITH_CUDA)
lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation); lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
...@@ -236,10 +245,9 @@ int main(int argc, char** argv) ...@@ -236,10 +245,9 @@ int main(int argc, char** argv)
AlternatingBeforeFunction communication(evenComm, oddComm, tracker); AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
// add LBM sweep and communication to time loop // add LBM sweep and communication to time loop
timeloop.add() << BeforeFunction(communication, "communication") timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
<< Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary"); timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary"); timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement") timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
<< Sweep(lbSweep.getSweep(tracker), "LB update rule"); << Sweep(lbSweep.getSweep(tracker), "LB update rule");
......
...@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_ ...@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_
import sympy as sp import sympy as sp
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
with CodeGeneration() as ctx: with CodeGeneration() as ctx:
data_type = "float64" if ctx.double_accuracy else "float32"
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False,
'double_precision': True if ctx.double_accuracy else False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
if ctx.cuda: if ctx.cuda:
target = 'gpu' target = 'gpu'
else: else:
......
...@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity ...@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
class Scenario: class Scenario:
def __init__(self): def __init__(self):
self.timesteps = 101 self.timesteps = 1001
self.vtkWriteFrequency = 2000 self.vtkWriteFrequency = 100
self.cells = (384, 128, 128) self.cells = (384, 128, 128)
self.blocks = (1, 1, 1) self.blocks = (1, 1, 1)
self.periodic = (0, 0, 0) self.periodic = (0, 0, 0)
self.constant_inflow = True
self.diameter_sphere = min(self.cells) // 2 self.diameter_sphere = min(self.cells) // 2
self.u_max = 0.1 self.u_max = 0.1
self.reynolds_number = 1000000 self.reynolds_number = 1000000
...@@ -38,7 +40,8 @@ class Scenario: ...@@ -38,7 +40,8 @@ class Scenario:
'omega': self.omega, 'omega': self.omega,
'u_max': self.u_max, 'u_max': self.u_max,
'reynolds_number': self.reynolds_number, 'reynolds_number': self.reynolds_number,
'diameter_sphere': self.diameter_sphere 'diameter_sphere': self.diameter_sphere,
'constant_inflow': self.constant_inflow
}, },
'Boundaries': { 'Boundaries': {
'Border': [ 'Border': [
...@@ -54,7 +57,7 @@ class Scenario: ...@@ -54,7 +57,7 @@ class Scenario:
'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2), 'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
'radius': self.diameter_sphere // 2, 'radius': self.diameter_sphere // 2,
'flag': 'NoSlip'} 'flag': 'NoSlip'}
] ],
}, },
} }
......
...@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" ) ...@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" ) waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} foreach(streaming_pattern pull push aa esotwist)
foreach(stencil d3q27) # choose from {d3q19 d3q27} foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky} foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
set(config ${stencil}_${streaming_pattern}_${collision_setup}) set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py FILE UniformGridGPU.py
...@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} ...@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h
UniformGridGPU_InfoHeader.h UniformGridGPU_InfoHeader.h
) )
...@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist} ...@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
FILES UniformGridGPU.cpp FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config}) DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden) set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
# all configs are excluded from all except for pull d3q27.
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach () endforeach ()
endforeach() endforeach()
endforeach() endforeach()
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h" #include "blockforest/Initialization.h"
#include "core/Environment.h" #include "core/Environment.h"
...@@ -7,9 +29,9 @@ ...@@ -7,9 +29,9 @@
#include "cuda/AddGPUFieldToStorage.h" #include "cuda/AddGPUFieldToStorage.h"
#include "cuda/DeviceSelectMPI.h" #include "cuda/DeviceSelectMPI.h"
#include "cuda/FieldCopy.h"
#include "cuda/ParallelStreams.h" #include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h" #include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h" #include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include "field/AddToStorage.h" #include "field/AddToStorage.h"
...@@ -27,14 +49,13 @@ ...@@ -27,14 +49,13 @@
#include "timeloop/SweepTimeloop.h" #include "timeloop/SweepTimeloop.h"
#include "InitShearVelocity.h"
#include <cmath> #include <cmath>
#include "InitShearVelocity.h"
#include "UniformGridGPU_InfoHeader.h" #include "UniformGridGPU_InfoHeader.h"
using namespace walberla; using namespace walberla;
using FlagField_T = FlagField<uint8_t>; using FlagField_T = FlagField< uint8_t >;
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
...@@ -58,10 +79,10 @@ int main(int argc, char** argv) ...@@ -58,10 +79,10 @@ int main(int argc, char** argv)
Vector3< uint_t > cellsPerBlock = Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters // Reading parameters
auto parameters = config->getOneBlock("Parameters"); auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields // Creating fields
BlockDataID pdfFieldCpuID = BlockDataID pdfFieldCpuID =
...@@ -69,7 +90,8 @@ int main(int argc, char** argv) ...@@ -69,7 +90,8 @@ int main(int argc, char** argv)
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx); BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
// Initialize velocity on cpu // Initialize velocity on cpu
if( initShearFlow ){ if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow") WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldCpuID); initShearVelocity(blocks, velFieldCpuID);
} }
...@@ -91,9 +113,7 @@ int main(int argc, char** argv) ...@@ -91,9 +113,7 @@ int main(int argc, char** argv)
for (uint_t i = 0; i < 3; ++i) for (uint_t i = 0; i < 3; ++i)
{ {
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{ { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") }
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
} }
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
...@@ -117,23 +137,26 @@ int main(int argc, char** argv) ...@@ -117,23 +137,26 @@ int main(int argc, char** argv)
LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell); LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
lbSweep.setOuterPriority(streamHighPriority); lbSweep.setOuterPriority(streamHighPriority);
pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1],
gpuBlockSize[2]);
// Boundaries // Boundaries
const FlagUID fluidFlagUID( "Fluid" ); const FlagUID fluidFlagUID("Fluid");
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field"); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock( "Boundaries" ); auto boundariesConfig = config->getBlock("Boundaries");
bool boundaries = false; bool boundaries = false;
if( boundariesConfig ) if (boundariesConfig)
{ {
boundaries = true; boundaries = true;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID ); geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
} }
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
// Initial setup is the post-collision state of an even time step // Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0); auto tracker = make_shared< lbm::TimestepTracker >(0);
...@@ -143,7 +166,8 @@ int main(int argc, char** argv) ...@@ -143,7 +166,8 @@ int main(int argc, char** argv)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI); UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID); auto packInfo =
make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
comm.addPackInfo(packInfo); comm.addPackInfo(packInfo);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -152,17 +176,17 @@ int main(int argc, char** argv) ...@@ -152,17 +176,17 @@ int main(int argc, char** argv)
auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority); auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){ auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.run(block, t, stream); noSlip.run(block, t, stream);
ubb.run(block, t, stream); ubb.run(block, t, stream);
}; };
auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){ auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.inner(block, t, stream); noSlip.inner(block, t, stream);
ubb.inner(block, t, stream); ubb.inner(block, t, stream);
}; };
auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){ auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.outer(block, t, stream); noSlip.outer(block, t, stream);
ubb.outer(block, t, stream); ubb.outer(block, t, stream);
}; };
...@@ -170,13 +194,15 @@ int main(int argc, char** argv) ...@@ -170,13 +194,15 @@ int main(int argc, char** argv)
auto simpleOverlapTimeStep = [&]() { auto simpleOverlapTimeStep = [&]() {
// Communicate post-collision values of previous timestep... // Communicate post-collision values of previous timestep...
comm.startCommunication(defaultStream); comm.startCommunication(defaultStream);
for (auto& block : *blocks){ for (auto& block : *blocks)
if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream); {
if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream); lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
} }
comm.wait(defaultStream); comm.wait(defaultStream);
for (auto& block : *blocks){ for (auto& block : *blocks)
if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream); {
if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream); lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
} }
...@@ -185,8 +211,9 @@ int main(int argc, char** argv) ...@@ -185,8 +211,9 @@ int main(int argc, char** argv)
auto normalTimeStep = [&]() { auto normalTimeStep = [&]() {
comm.communicate(defaultStream); comm.communicate(defaultStream);
for (auto& block : *blocks){ for (auto& block : *blocks)
if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream); {
if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
lbSweep(&block, tracker->getCounterPlusOne(), defaultStream); lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
} }
...@@ -201,6 +228,12 @@ int main(int argc, char** argv) ...@@ -201,6 +228,12 @@ int main(int argc, char** argv)
lbSweep(&block, tracker->getCounter(), defaultStream); lbSweep(&block, tracker->getCounter(), defaultStream);
}; };
// Stream only function to test a streaming pattern without executing lbm operations inside
auto StreamOnlyFunc = [&]() {
for (auto& block : *blocks)
StreamOnlyKernel(&block, defaultStream);
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP /// /// TIME LOOP SETUP ///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -221,6 +254,13 @@ int main(int argc, char** argv) ...@@ -221,6 +254,13 @@ int main(int argc, char** argv)
comm.communicate(); comm.communicate();
timeStep = kernelOnlyFunc; timeStep = kernelOnlyFunc;
} }
else if (timeStepStrategy == "StreamOnly")
{