Commit 7be319e9 authored by Helen Schottenhamml's avatar Helen Schottenhamml
Browse files

Merge branch 'FixFlowAroundSphere' into 'master'

Enable single precision for benchmark cases

Closes #154

See merge request !475
parents b4b0cb4b ce49cdbf
Pipeline #33806 passed with stages
in 407 minutes and 58 seconds
......@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
};
auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
real_t inflow_velocity) {
Cell globalCell;
CellInterval domain = SbF->getDomainCellBB();
real_t h_y = domain.yMax() - domain.yMin();
real_t h_z = domain.zMax() - domain.zMin();
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
real_t inflow_velocity, const bool constant_inflow = true) {
if (constant_inflow)
{
Vector3< real_t > result(inflow_velocity, 0.0, 0.0);
return result;
}
else
{
Cell globalCell;
CellInterval domain = SbF->getDomainCellBB();
real_t h_y = real_c(domain.ySize());
real_t h_z = real_c(domain.zSize());
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5);
real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5);
real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) *
(h_z / 2 + z1);
real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
(h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
Vector3< real_t > result(u, 0.0, 0.0);
return result;
Vector3< real_t > result(u, 0.0, 0.0);
return result;
}
};
class AlternatingBeforeFunction
......@@ -147,6 +155,7 @@ int main(int argc, char** argv)
const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05));
const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
const double remainingTimeLoggerFrequency =
parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
......@@ -204,7 +213,7 @@ int main(int argc, char** argv)
auto boundariesConfig = config->getOneBlock("Boundaries");
std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max);
velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
#if defined(WALBERLA_BUILD_WITH_CUDA)
lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
......@@ -236,10 +245,9 @@ int main(int argc, char** argv)
AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
// add LBM sweep and communication to time loop
timeloop.add() << BeforeFunction(communication, "communication")
<< Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary");
timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
<< Sweep(lbSweep.getSweep(tracker), "LB update rule");
......
......@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_
import sympy as sp
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
with CodeGeneration() as ctx:
data_type = "float64" if ctx.double_accuracy else "float32"
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False,
'double_precision': True if ctx.double_accuracy else False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
if ctx.cuda:
target = 'gpu'
else:
......
......@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
class Scenario:
def __init__(self):
self.timesteps = 101
self.vtkWriteFrequency = 2000
self.timesteps = 1001
self.vtkWriteFrequency = 100
self.cells = (384, 128, 128)
self.blocks = (1, 1, 1)
self.periodic = (0, 0, 0)
self.constant_inflow = True
self.diameter_sphere = min(self.cells) // 2
self.u_max = 0.1
self.reynolds_number = 1000000
......@@ -38,7 +40,8 @@ class Scenario:
'omega': self.omega,
'u_max': self.u_max,
'reynolds_number': self.reynolds_number,
'diameter_sphere': self.diameter_sphere
'diameter_sphere': self.diameter_sphere,
'constant_inflow': self.constant_inflow
},
'Boundaries': {
'Border': [
......@@ -54,7 +57,7 @@ class Scenario:
'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
'radius': self.diameter_sphere // 2,
'flag': 'NoSlip'}
]
],
},
}
......
......@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
foreach(stencil d3q27) # choose from {d3q19 d3q27}
foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
foreach(streaming_pattern pull push aa esotwist)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
FILE UniformGridGPU.py
......@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h
UniformGridGPU_InfoHeader.h
)
......@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
# all configs are excluded from all except for pull d3q27.
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach ()
endforeach()
endforeach()
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "core/Environment.h"
......@@ -7,9 +29,9 @@
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/DeviceSelectMPI.h"
#include "cuda/FieldCopy.h"
#include "cuda/ParallelStreams.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/FieldCopy.h"
#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include "field/AddToStorage.h"
......@@ -27,14 +49,13 @@
#include "timeloop/SweepTimeloop.h"
#include "InitShearVelocity.h"
#include <cmath>
#include "InitShearVelocity.h"
#include "UniformGridGPU_InfoHeader.h"
using namespace walberla;
using FlagField_T = FlagField<uint8_t>;
using FlagField_T = FlagField< uint8_t >;
int main(int argc, char** argv)
{
......@@ -58,10 +79,10 @@ int main(int argc, char** argv)
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldCpuID =
......@@ -69,7 +90,8 @@ int main(int argc, char** argv)
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
// Initialize velocity on cpu
if( initShearFlow ){
if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldCpuID);
}
......@@ -91,9 +113,7 @@ int main(int argc, char** argv)
for (uint_t i = 0; i < 3; ++i)
{
if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
{
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
}
{ WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") }
}
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
......@@ -117,23 +137,26 @@ int main(int argc, char** argv)
LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
lbSweep.setOuterPriority(streamHighPriority);
pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1],
gpuBlockSize[2]);
// Boundaries
const FlagUID fluidFlagUID( "Fluid" );
BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock( "Boundaries" );
bool boundaries = false;
if( boundariesConfig )
const FlagUID fluidFlagUID("Fluid");
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
auto boundariesConfig = config->getBlock("Boundaries");
bool boundaries = false;
if (boundariesConfig)
{
boundaries = true;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
}
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
// Initial setup is the post-collision state of an even time step
auto tracker = make_shared< lbm::TimestepTracker >(0);
......@@ -143,7 +166,8 @@ int main(int argc, char** argv)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
auto packInfo =
make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
comm.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -152,17 +176,17 @@ int main(int argc, char** argv)
auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.run(block, t, stream);
ubb.run(block, t, stream);
};
auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.inner(block, t, stream);
ubb.inner(block, t, stream);
};
auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
noSlip.outer(block, t, stream);
ubb.outer(block, t, stream);
};
......@@ -170,13 +194,15 @@ int main(int argc, char** argv)
auto simpleOverlapTimeStep = [&]() {
// Communicate post-collision values of previous timestep...
comm.startCommunication(defaultStream);
for (auto& block : *blocks){
if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
for (auto& block : *blocks)
{
if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
}
comm.wait(defaultStream);
for (auto& block : *blocks){
if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
for (auto& block : *blocks)
{
if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
}
......@@ -185,8 +211,9 @@ int main(int argc, char** argv)
auto normalTimeStep = [&]() {
comm.communicate(defaultStream);
for (auto& block : *blocks){
if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
for (auto& block : *blocks)
{
if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
}
......@@ -201,6 +228,12 @@ int main(int argc, char** argv)
lbSweep(&block, tracker->getCounter(), defaultStream);
};
// Stream only function to test a streaming pattern without executing lbm operations inside
auto StreamOnlyFunc = [&]() {
for (auto& block : *blocks)
StreamOnlyKernel(&block, defaultStream);
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -221,6 +254,13 @@ int main(int argc, char** argv)
comm.communicate();
timeStep = kernelOnlyFunc;
}
else if (timeStepStrategy == "StreamOnly")
{
WALBERLA_LOG_INFO_ON_ROOT(
"Running only streaming kernel without LBM - this makes only sense for benchmarking!")
// Run initial communication once to provide any missing stream-in populations
timeStep = StreamOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
......@@ -239,7 +279,7 @@ int main(int argc, char** argv)
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
});
timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
......
......@@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil
from lbmpy.updatekernels import create_stream_only_kernel
from lbmpy.fieldaccess import *
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
......@@ -44,7 +46,7 @@ options_dict = {
},
'mrt-overrelax': {
'method': 'mrt',
'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
},
'cumulant': {
'method': 'cumulant',
......@@ -59,7 +61,7 @@ options_dict = {
'entropic': {
'method': 'mrt',
'compressible': True,
'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
'relaxation_rates': [omega, omega] + [omega_free] * 6,
'entropic': True,
},
'smagorinsky': {
......@@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs};
optimize = True
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3
......@@ -99,7 +102,8 @@ with CodeGeneration() as ctx:
q = len(stencil)
dim = len(stencil[0])
assert dim == 3, "This app supports only three-dimensional stencils"
pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx')
pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]",
layout='fzyx')
common_options = {
'stencil': stencil,
......@@ -110,7 +114,7 @@ with CodeGeneration() as ctx:
'cse_pdfs': False,
'symbolic_field': pdfs,
'field_layout': 'fzyx',
'gpu_indexing_params': gpu_indexing_params,
'gpu_indexing_params': gpu_indexing_params
}
}
......@@ -128,6 +132,14 @@ with CodeGeneration() as ctx:
('int32_t', 'cudaBlockSize2')
]
# Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
# is_inplace is set to False to ensure that the streaming is done with src and dst field.
# If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
accessor = CollideOnlyInplaceAccessor()
accessor.is_inplace = False
field_swaps_stream_only = [(pdfs, pdfs_tmp)]
stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
# LB Sweep
collision_rule = create_lb_collision_rule(**options)
......@@ -148,6 +160,10 @@ with CodeGeneration() as ctx:
previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
# Stream only kernel
generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target='gpu')
# Boundaries
noslip = NoSlip()
ubb = UBB((0.05, 0, 0))
......
......@@ -9,7 +9,7 @@ import pystencils as ps
from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor
from lbmpy.relaxationrates import relaxation_rate_scaling
from lbmpy.stencils import get_stencil
from lbmpy.updatekernels import create_lbm_kernel, create_stream_pull_only_kernel
from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
from pystencils import AssignmentCollection, create_kernel
from pystencils.astnodes import SympyAssignment
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers
......@@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as
if not stencil_name:
raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27"
is_float = not generation_context.double_accuracy
dtype_string = "float32" if is_float else "float64"
......@@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field
collide_ast.function_name = 'kernel_collide'
collide_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
stream_update_rule = create_stream_pull_only_kernel(lb_method.stencil, None, 'pdfs', 'pdfs_tmp', field_layout,
dtype)
stream_update_rule = create_stream_only_kernel(lb_method.stencil, src_field, dst_field,
accessor=StreamPullTwoFieldsAccessor())
stream_ast = create_kernel(stream_update_rule, **create_kernel_params)
stream_ast.function_name = 'kernel_stream'
stream_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment