Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Jan Hönig
waLBerla
Commits
ce49cdbf
Commit
ce49cdbf
authored
Aug 17, 2021
by
Markus Holzer
Committed by
Helen Schottenhamml
Aug 17, 2021
Browse files
Enable single precision for benchmark cases
parent
b4b0cb4b
Changes
7
Hide whitespace changes
Inline
Side-by-side
apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
View file @
ce49cdbf
...
...
@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
};
auto
VelocityCallback
=
[](
const
Cell
&
pos
,
const
shared_ptr
<
StructuredBlockForest
>&
SbF
,
IBlock
&
block
,
real_t
inflow_velocity
)
{
Cell
globalCell
;
CellInterval
domain
=
SbF
->
getDomainCellBB
();
real_t
h_y
=
domain
.
yMax
()
-
domain
.
yMin
();
real_t
h_z
=
domain
.
zMax
()
-
domain
.
zMin
();
SbF
->
transformBlockLocalToGlobalCell
(
globalCell
,
block
,
pos
);
real_t
inflow_velocity
,
const
bool
constant_inflow
=
true
)
{
if
(
constant_inflow
)
{
Vector3
<
real_t
>
result
(
inflow_velocity
,
0.0
,
0.0
);
return
result
;
}
else
{
Cell
globalCell
;
CellInterval
domain
=
SbF
->
getDomainCellBB
();
real_t
h_y
=
real_c
(
domain
.
ySize
());
real_t
h_z
=
real_c
(
domain
.
zSize
());
SbF
->
transformBlockLocalToGlobalCell
(
globalCell
,
block
,
pos
);
real_t
y1
=
globalCell
[
1
]
-
(
h_y
/
2.0
+
0.5
);
real_t
z1
=
globalCell
[
2
]
-
(
h_z
/
2.0
+
0.5
);
real_t
y1
=
real_c
(
globalCell
[
1
]
-
(
h_y
/
2.0
-
0.5
)
)
;
real_t
z1
=
real_c
(
globalCell
[
2
]
-
(
h_z
/
2.0
-
0.5
)
)
;
real_t
u
=
(
inflow_velocity
*
16
)
/
(
h_y
*
h_y
*
h_z
*
h_z
)
*
(
h_y
/
2.0
-
y1
)
*
(
h_y
/
2
+
y1
)
*
(
h_z
/
2
-
z1
)
*
(
h_z
/
2
+
z1
);
real_t
u
=
(
inflow_velocity
*
real_c
(
16
)
)
/
(
h_y
*
h_y
*
h_z
*
h_z
)
*
(
h_y
/
real_c
(
2.0
)
-
y1
)
*
(
h_y
/
real_c
(
2.0
)
+
y1
)
*
(
h_z
/
real_c
(
2.0
)
-
z1
)
*
(
h_z
/
real_c
(
2.0
)
+
z1
);
Vector3
<
real_t
>
result
(
u
,
0.0
,
0.0
);
return
result
;
Vector3
<
real_t
>
result
(
u
,
0.0
,
0.0
);
return
result
;
}
};
class
AlternatingBeforeFunction
...
...
@@ -147,6 +155,7 @@ int main(int argc, char** argv)
const
real_t
u_max
=
parameters
.
getParameter
<
real_t
>
(
"u_max"
,
real_t
(
0.05
));
const
real_t
reynolds_number
=
parameters
.
getParameter
<
real_t
>
(
"reynolds_number"
,
real_t
(
1000
));
const
uint_t
diameter_sphere
=
parameters
.
getParameter
<
uint_t
>
(
"diameter_sphere"
,
uint_t
(
5
));
const
bool
constant_inflow
=
parameters
.
getParameter
<
bool
>
(
"constant_inflow"
,
true
);
const
double
remainingTimeLoggerFrequency
=
parameters
.
getParameter
<
double
>
(
"remainingTimeLoggerFrequency"
,
3.0
);
// in seconds
...
...
@@ -204,7 +213,7 @@ int main(int argc, char** argv)
auto
boundariesConfig
=
config
->
getOneBlock
(
"Boundaries"
);
std
::
function
<
Vector3
<
real_t
>
(
const
Cell
&
,
const
shared_ptr
<
StructuredBlockForest
>&
,
IBlock
&
)
>
velocity_initialisation
=
std
::
bind
(
VelocityCallback
,
_1
,
_2
,
_3
,
u_max
);
velocity_initialisation
=
std
::
bind
(
VelocityCallback
,
_1
,
_2
,
_3
,
u_max
,
constant_inflow
);
#if defined(WALBERLA_BUILD_WITH_CUDA)
lbm
::
FlowAroundSphereCodeGen_UBB
ubb
(
blocks
,
pdfFieldIDGPU
,
velocity_initialisation
);
...
...
@@ -236,10 +245,9 @@ int main(int argc, char** argv)
AlternatingBeforeFunction
communication
(
evenComm
,
oddComm
,
tracker
);
// add LBM sweep and communication to time loop
timeloop
.
add
()
<<
BeforeFunction
(
communication
,
"communication"
)
<<
Sweep
(
noSlip
.
getSweep
(
tracker
),
"noSlip boundary"
);
timeloop
.
add
()
<<
BeforeFunction
(
communication
,
"communication"
)
<<
Sweep
(
ubb
.
getSweep
(
tracker
),
"ubb boundary"
);
timeloop
.
add
()
<<
Sweep
(
outflow
.
getSweep
(
tracker
),
"outflow boundary"
);
timeloop
.
add
()
<<
Sweep
(
ubb
.
getSweep
(
tracker
),
"
ubb
boundary"
);
timeloop
.
add
()
<<
Sweep
(
noSlip
.
getSweep
(
tracker
),
"
noSlip
boundary"
);
timeloop
.
add
()
<<
BeforeFunction
(
tracker
->
getAdvancementFunction
(),
"Timestep Advancement"
)
<<
Sweep
(
lbSweep
.
getSweep
(
tracker
),
"LB update rule"
);
...
...
apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
View file @
ce49cdbf
...
...
@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_
import
sympy
as
sp
stencil
=
get_stencil
(
"D3Q27"
)
q
=
len
(
stencil
)
dim
=
len
(
stencil
[
0
])
streaming_pattern
=
'esotwist'
timesteps
=
get_timesteps
(
streaming_pattern
)
pdfs
,
velocity_field
,
density_field
=
fields
(
f
"pdfs(
{
q
}
), velocity(
{
dim
}
), density(1) : double[
{
dim
}
D]"
,
layout
=
'fzyx'
)
omega
=
sp
.
Symbol
(
"omega"
)
u_max
=
sp
.
Symbol
(
"u_max"
)
output
=
{
'density'
:
density_field
,
'velocity'
:
velocity_field
}
opt
=
{
'symbolic_field'
:
pdfs
,
'cse_global'
:
False
,
'cse_pdfs'
:
False
}
method_params
=
{
'method'
:
'cumulant'
,
'stencil'
:
stencil
,
'relaxation_rate'
:
omega
,
'galilean_correction'
:
True
,
'field_name'
:
'pdfs'
,
'streaming_pattern'
:
streaming_pattern
,
'output'
:
output
,
'optimization'
:
opt
}
collision_rule
=
create_lb_collision_rule
(
**
method_params
)
lb_method
=
collision_rule
.
method
# getter & setter
setter_assignments
=
macroscopic_values_setter
(
lb_method
,
velocity
=
velocity_field
.
center_vector
,
pdfs
=
pdfs
,
density
=
1.0
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
timesteps
[
0
])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs
=
{
'Stencil_T'
:
stencil
}
field_typedefs
=
{
'PdfField_T'
:
pdfs
,
'VelocityField_T'
:
velocity_field
,
'ScalarField_T'
:
density_field
}
with
CodeGeneration
()
as
ctx
:
data_type
=
"float64"
if
ctx
.
double_accuracy
else
"float32"
stencil
=
get_stencil
(
"D3Q27"
)
q
=
len
(
stencil
)
dim
=
len
(
stencil
[
0
])
streaming_pattern
=
'esotwist'
timesteps
=
get_timesteps
(
streaming_pattern
)
pdfs
,
velocity_field
,
density_field
=
fields
(
f
"pdfs(
{
q
}
), velocity(
{
dim
}
), density(1) :
{
data_type
}
[
{
dim
}
D]"
,
layout
=
'fzyx'
)
omega
=
sp
.
Symbol
(
"omega"
)
u_max
=
sp
.
Symbol
(
"u_max"
)
output
=
{
'density'
:
density_field
,
'velocity'
:
velocity_field
}
opt
=
{
'symbolic_field'
:
pdfs
,
'cse_global'
:
False
,
'cse_pdfs'
:
False
,
'double_precision'
:
True
if
ctx
.
double_accuracy
else
False
}
method_params
=
{
'method'
:
'cumulant'
,
'stencil'
:
stencil
,
'relaxation_rate'
:
omega
,
'galilean_correction'
:
True
,
'field_name'
:
'pdfs'
,
'streaming_pattern'
:
streaming_pattern
,
'output'
:
output
,
'optimization'
:
opt
}
collision_rule
=
create_lb_collision_rule
(
**
method_params
)
lb_method
=
collision_rule
.
method
# getter & setter
setter_assignments
=
macroscopic_values_setter
(
lb_method
,
velocity
=
velocity_field
.
center_vector
,
pdfs
=
pdfs
,
density
=
1.0
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
timesteps
[
0
])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs
=
{
'Stencil_T'
:
stencil
}
field_typedefs
=
{
'PdfField_T'
:
pdfs
,
'VelocityField_T'
:
velocity_field
,
'ScalarField_T'
:
density_field
}
if
ctx
.
cuda
:
target
=
'gpu'
else
:
...
...
apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
View file @
ce49cdbf
...
...
@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
class
Scenario
:
def
__init__
(
self
):
self
.
timesteps
=
101
self
.
vtkWriteFrequency
=
20
00
self
.
timesteps
=
10
0
1
self
.
vtkWriteFrequency
=
1
00
self
.
cells
=
(
384
,
128
,
128
)
self
.
blocks
=
(
1
,
1
,
1
)
self
.
periodic
=
(
0
,
0
,
0
)
self
.
constant_inflow
=
True
self
.
diameter_sphere
=
min
(
self
.
cells
)
//
2
self
.
u_max
=
0.1
self
.
reynolds_number
=
1000000
...
...
@@ -38,7 +40,8 @@ class Scenario:
'omega'
:
self
.
omega
,
'u_max'
:
self
.
u_max
,
'reynolds_number'
:
self
.
reynolds_number
,
'diameter_sphere'
:
self
.
diameter_sphere
'diameter_sphere'
:
self
.
diameter_sphere
,
'constant_inflow'
:
self
.
constant_inflow
},
'Boundaries'
:
{
'Border'
:
[
...
...
@@ -54,7 +57,7 @@ class Scenario:
'midpoint'
:
(
int
(
0.40
*
self
.
total_cells
[
0
]),
self
.
total_cells
[
1
]
//
2
,
self
.
total_cells
[
2
]
//
2
),
'radius'
:
self
.
diameter_sphere
//
2
,
'flag'
:
'NoSlip'
}
]
]
,
},
}
...
...
apps/benchmarks/UniformGridGPU/CMakeLists.txt
View file @
ce49cdbf
...
...
@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir
(
"simulation_setup"
)
foreach
(
streaming_pattern
aa
)
# choose from {
pull
,
push
,
aa
,
esotwist
}
foreach
(
stencil
d3q27
)
# choose from {
d3q19 d3q27
}
foreach
(
collision_setup srt trt mrt cumulant
)
# choose from {srt trt mrt cumulant entropic smagorinsky}
foreach
(
streaming_pattern pull push aa esotwist
)
foreach
(
stencil d3q19 d3q27
)
foreach
(
collision_setup srt trt mrt cumulant
entropic smagorinsky mrt-overrelax cumulant-overrelax
)
set
(
config
${
stencil
}
_
${
streaming_pattern
}
_
${
collision_setup
}
)
waLBerla_generate_target_from_python
(
NAME UniformGridGPUGenerated_
${
config
}
FILE UniformGridGPU.py
...
...
@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h
UniformGridGPU_InfoHeader.h
)
...
...
@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
FILES UniformGridGPU.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_
${
config
}
)
set_target_properties
(
UniformGridGPU_
${
config
}
PROPERTIES CXX_VISIBILITY_PRESET hidden
)
# all configs are excluded from all except for pull d3q27.
if
(
${
streaming_pattern
}
STREQUAL
"pull"
AND
${
stencil
}
STREQUAL
"d3q27"
)
set_target_properties
(
UniformGridGPUGenerated_
${
config
}
PROPERTIES EXCLUDE_FROM_ALL FALSE
)
set_target_properties
(
UniformGridGPU_
${
config
}
PROPERTIES EXCLUDE_FROM_ALL FALSE
)
else
()
set_target_properties
(
UniformGridGPUGenerated_
${
config
}
PROPERTIES EXCLUDE_FROM_ALL TRUE
)
set_target_properties
(
UniformGridGPU_
${
config
}
PROPERTIES EXCLUDE_FROM_ALL TRUE
)
endif
(
${
streaming_pattern
}
STREQUAL
"pull"
AND
${
stencil
}
STREQUAL
"d3q27"
)
endforeach
()
endforeach
()
endforeach
()
\ No newline at end of file
apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
View file @
ce49cdbf
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include
"blockforest/Initialization.h"
#include
"core/Environment.h"
...
...
@@ -7,9 +29,9 @@
#include
"cuda/AddGPUFieldToStorage.h"
#include
"cuda/DeviceSelectMPI.h"
#include
"cuda/FieldCopy.h"
#include
"cuda/ParallelStreams.h"
#include
"cuda/communication/UniformGPUScheme.h"
#include
"cuda/FieldCopy.h"
#include
"cuda/lbm/CombinedInPlaceGpuPackInfo.h"
#include
"field/AddToStorage.h"
...
...
@@ -27,14 +49,13 @@
#include
"timeloop/SweepTimeloop.h"
#include
"InitShearVelocity.h"
#include
<cmath>
#include
"InitShearVelocity.h"
#include
"UniformGridGPU_InfoHeader.h"
using
namespace
walberla
;
using
FlagField_T
=
FlagField
<
uint8_t
>
;
using
FlagField_T
=
FlagField
<
uint8_t
>
;
int
main
(
int
argc
,
char
**
argv
)
{
...
...
@@ -58,10 +79,10 @@ int main(int argc, char** argv)
Vector3
<
uint_t
>
cellsPerBlock
=
config
->
getBlock
(
"DomainSetup"
).
getParameter
<
Vector3
<
uint_t
>
>
(
"cellsPerBlock"
);
// Reading parameters
auto
parameters
=
config
->
getOneBlock
(
"Parameters"
);
const
real_t
omega
=
parameters
.
getParameter
<
real_t
>
(
"omega"
,
real_c
(
1.4
));
const
uint_t
timesteps
=
parameters
.
getParameter
<
uint_t
>
(
"timesteps"
,
uint_c
(
50
));
const
bool
initShearFlow
=
parameters
.
getParameter
<
bool
>
(
"initShearFlow"
,
true
);
auto
parameters
=
config
->
getOneBlock
(
"Parameters"
);
const
real_t
omega
=
parameters
.
getParameter
<
real_t
>
(
"omega"
,
real_c
(
1.4
));
const
uint_t
timesteps
=
parameters
.
getParameter
<
uint_t
>
(
"timesteps"
,
uint_c
(
50
));
const
bool
initShearFlow
=
parameters
.
getParameter
<
bool
>
(
"initShearFlow"
,
true
);
// Creating fields
BlockDataID
pdfFieldCpuID
=
...
...
@@ -69,7 +90,8 @@ int main(int argc, char** argv)
BlockDataID
velFieldCpuID
=
field
::
addToStorage
<
VelocityField_T
>
(
blocks
,
"vel"
,
real_t
(
0
),
field
::
fzyx
);
// Initialize velocity on cpu
if
(
initShearFlow
){
if
(
initShearFlow
)
{
WALBERLA_LOG_INFO_ON_ROOT
(
"Initializing shear flow"
)
initShearVelocity
(
blocks
,
velFieldCpuID
);
}
...
...
@@ -91,9 +113,7 @@ int main(int argc, char** argv)
for
(
uint_t
i
=
0
;
i
<
3
;
++
i
)
{
if
(
int_c
(
cellsPerBlock
[
i
])
<=
innerOuterSplit
[
i
]
*
2
)
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"innerOuterSplit too large - make it smaller or increase cellsPerBlock"
)
}
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"innerOuterSplit too large - make it smaller or increase cellsPerBlock"
)
}
}
Cell
innerOuterSplitCell
(
innerOuterSplit
[
0
],
innerOuterSplit
[
1
],
innerOuterSplit
[
2
]);
...
...
@@ -117,23 +137,26 @@ int main(int argc, char** argv)
LbSweep
lbSweep
(
pdfFieldGpuID
,
omega
,
gpuBlockSize
[
0
],
gpuBlockSize
[
1
],
gpuBlockSize
[
2
],
innerOuterSplitCell
);
lbSweep
.
setOuterPriority
(
streamHighPriority
);
pystencils
::
UniformGridGPU_StreamOnlyKernel
StreamOnlyKernel
(
pdfFieldGpuID
,
gpuBlockSize
[
0
],
gpuBlockSize
[
1
],
gpuBlockSize
[
2
]);
// Boundaries
const
FlagUID
fluidFlagUID
(
"Fluid"
);
BlockDataID
flagFieldID
=
field
::
addFlagFieldToStorage
<
FlagField_T
>
(
blocks
,
"Boundary Flag Field"
);
auto
boundariesConfig
=
config
->
getBlock
(
"Boundaries"
);
bool
boundaries
=
false
;
if
(
boundariesConfig
)
const
FlagUID
fluidFlagUID
(
"Fluid"
);
BlockDataID
flagFieldID
=
field
::
addFlagFieldToStorage
<
FlagField_T
>
(
blocks
,
"Boundary Flag Field"
);
auto
boundariesConfig
=
config
->
getBlock
(
"Boundaries"
);
bool
boundaries
=
false
;
if
(
boundariesConfig
)
{
boundaries
=
true
;
geometry
::
initBoundaryHandling
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
boundariesConfig
);
geometry
::
setNonBoundaryCellsToDomain
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
fluidFlagUID
);
geometry
::
initBoundaryHandling
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
boundariesConfig
);
geometry
::
setNonBoundaryCellsToDomain
<
FlagField_T
>
(
*
blocks
,
flagFieldID
,
fluidFlagUID
);
}
lbm
::
UniformGridGPU_NoSlip
noSlip
(
blocks
,
pdfFieldGpuID
);
noSlip
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"NoSlip"
),
fluidFlagUID
);
noSlip
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"NoSlip"
),
fluidFlagUID
);
lbm
::
UniformGridGPU_UBB
ubb
(
blocks
,
pdfFieldGpuID
);
ubb
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"UBB"
),
fluidFlagUID
);
ubb
.
fillFromFlagField
<
FlagField_T
>
(
blocks
,
flagFieldID
,
FlagUID
(
"UBB"
),
fluidFlagUID
);
// Initial setup is the post-collision state of an even time step
auto
tracker
=
make_shared
<
lbm
::
TimestepTracker
>
(
0
);
...
...
@@ -143,7 +166,8 @@ int main(int argc, char** argv)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme
<
Stencil_T
>
comm
(
blocks
,
cudaEnabledMPI
);
auto
packInfo
=
make_shared
<
lbm
::
CombinedInPlaceGpuPackInfo
<
PackInfoEven
,
PackInfoOdd
>
>
(
tracker
,
pdfFieldGpuID
);
auto
packInfo
=
make_shared
<
lbm
::
CombinedInPlaceGpuPackInfo
<
PackInfoEven
,
PackInfoOdd
>
>
(
tracker
,
pdfFieldGpuID
);
comm
.
addPackInfo
(
packInfo
);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -152,17 +176,17 @@ int main(int argc, char** argv)
auto
defaultStream
=
cuda
::
StreamRAII
::
newPriorityStream
(
streamLowPriority
);
auto
boundarySweep
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
){
auto
boundarySweep
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
)
{
noSlip
.
run
(
block
,
t
,
stream
);
ubb
.
run
(
block
,
t
,
stream
);
};
auto
boundaryInner
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
){
auto
boundaryInner
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
)
{
noSlip
.
inner
(
block
,
t
,
stream
);
ubb
.
inner
(
block
,
t
,
stream
);
};
auto
boundaryOuter
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
){
auto
boundaryOuter
=
[
&
](
IBlock
*
block
,
uint8_t
t
,
cudaStream_t
stream
)
{
noSlip
.
outer
(
block
,
t
,
stream
);
ubb
.
outer
(
block
,
t
,
stream
);
};
...
...
@@ -170,13 +194,15 @@ int main(int argc, char** argv)
auto
simpleOverlapTimeStep
=
[
&
]()
{
// Communicate post-collision values of previous timestep...
comm
.
startCommunication
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
){
if
(
boundaries
)
boundaryInner
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
{
if
(
boundaries
)
boundaryInner
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
lbSweep
.
inner
(
&
block
,
tracker
->
getCounterPlusOne
(),
defaultStream
);
}
comm
.
wait
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
){
if
(
boundaries
)
boundaryOuter
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
{
if
(
boundaries
)
boundaryOuter
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
lbSweep
.
outer
(
&
block
,
tracker
->
getCounterPlusOne
(),
defaultStream
);
}
...
...
@@ -185,8 +211,9 @@ int main(int argc, char** argv)
auto
normalTimeStep
=
[
&
]()
{
comm
.
communicate
(
defaultStream
);
for
(
auto
&
block
:
*
blocks
){
if
(
boundaries
)
boundarySweep
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
for
(
auto
&
block
:
*
blocks
)
{
if
(
boundaries
)
boundarySweep
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
lbSweep
(
&
block
,
tracker
->
getCounterPlusOne
(),
defaultStream
);
}
...
...
@@ -201,6 +228,12 @@ int main(int argc, char** argv)
lbSweep
(
&
block
,
tracker
->
getCounter
(),
defaultStream
);
};
// Stream only function to test a streaming pattern without executing lbm operations inside
auto
StreamOnlyFunc
=
[
&
]()
{
for
(
auto
&
block
:
*
blocks
)
StreamOnlyKernel
(
&
block
,
defaultStream
);
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -221,6 +254,13 @@ int main(int argc, char** argv)
comm
.
communicate
();
timeStep
=
kernelOnlyFunc
;
}
else
if
(
timeStepStrategy
==
"StreamOnly"
)
{
WALBERLA_LOG_INFO_ON_ROOT
(
"Running only streaming kernel without LBM - this makes only sense for benchmarking!"
)
// Run initial communication once to provide any missing stream-in populations
timeStep
=
StreamOnlyFunc
;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO
(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
...
...
@@ -239,7 +279,7 @@ int main(int argc, char** argv)
vtkOutput
->
addCellDataWriter
(
velWriter
);
vtkOutput
->
addBeforeFunction
([
&
]()
{
cuda
::
fieldCpy
<
VelocityField_T
,
cuda
::
GPUField
<
real_t
>
>
(
blocks
,
velFieldCpuID
,
velFieldGpuID
);
cuda
::
fieldCpy
<
VelocityField_T
,
cuda
::
GPUField
<
real_t
>
>
(
blocks
,
velFieldCpuID
,
velFieldGpuID
);
});
timeLoop
.
addFuncAfterTimeStep
(
vtk
::
writeFiles
(
vtkOutput
),
"VTK Output"
);
}
...
...
apps/benchmarks/UniformGridGPU/UniformGridGPU.py
View file @
ce49cdbf
...
...
@@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB
from
lbmpy.creationfunctions
import
create_lb_collision_rule
from
lbmpy.macroscopic_value_kernels
import
macroscopic_values_setter
from
lbmpy.stencils
import
get_stencil
from
lbmpy.updatekernels
import
create_stream_only_kernel
from
lbmpy.fieldaccess
import
*
from
pystencils_walberla
import
CodeGeneration
,
generate_info_header
,
generate_sweep
from
lbmpy_walberla
import
generate_alternating_lbm_sweep
,
generate_lb_pack_info
,
generate_alternating_lbm_boundary
...
...
@@ -44,7 +46,7 @@ options_dict = {
},
'mrt-overrelax'
:
{
'method'
:
'mrt'
,
'relaxation_rates'
:
[
omega
,
1.3
,
1.4
,
omega
,
1.2
,
1
.
1
],
'relaxation_rates'
:
[
omega
]
+
[
1
+
x
*
1e-2
for
x
in
range
(
1
,
11
)
],
},
'cumulant'
:
{
'method'
:
'cumulant'
,
...
...
@@ -59,7 +61,7 @@ options_dict = {
'entropic'
:
{
'method'
:
'mrt'
,
'compressible'
:
True
,
'relaxation_rates'
:
[
omega
,
omega
,
omega_free
,
omega_free
,
omega_free
]
,
'relaxation_rates'
:
[
omega
,
omega
]
+
[
omega_free
]
*
6
,
'entropic'
:
True
,
},
'smagorinsky'
:
{
...
...
@@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs};
optimize
=
True
with
CodeGeneration
()
as
ctx
:
field_type
=
"float64"
if
ctx
.
double_accuracy
else
"float32"
config_tokens
=
ctx
.
config
.
split
(
'_'
)
assert
len
(
config_tokens
)
>=
3
...
...
@@ -99,7 +102,8 @@ with CodeGeneration() as ctx:
q
=
len
(
stencil
)
dim
=
len
(
stencil
[
0
])
assert
dim
==
3
,
"This app supports only three-dimensional stencils"
pdfs
,
pdfs_tmp
,
velocity_field
=
ps
.
fields
(
f
"pdfs(
{
q
}
), pdfs_tmp(
{
q
}
), velocity(3) : double[3D]"
,
layout
=
'fzyx'
)
pdfs
,
pdfs_tmp
,
velocity_field
=
ps
.
fields
(
f
"pdfs(
{
q
}
), pdfs_tmp(
{
q
}
), velocity(3) :
{
field_type
}
[3D]"
,
layout
=
'fzyx'
)
common_options
=
{
'stencil'
:
stencil
,
...
...
@@ -110,7 +114,7 @@ with CodeGeneration() as ctx:
'cse_pdfs'
:
False
,
'symbolic_field'
:
pdfs
,
'field_layout'
:
'fzyx'
,
'gpu_indexing_params'
:
gpu_indexing_params
,
'gpu_indexing_params'
:
gpu_indexing_params
}
}
...
...
@@ -128,6 +132,14 @@ with CodeGeneration() as ctx:
(
'int32_t'
,
'cudaBlockSize2'
)
]
# Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
# is_inplace is set to False to ensure that the streaming is done with src and dst field.
# If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
accessor
=
CollideOnlyInplaceAccessor
()
accessor
.
is_inplace
=
False
field_swaps_stream_only
=
[(
pdfs
,
pdfs_tmp
)]
stream_only_kernel
=
create_stream_only_kernel
(
stencil
,
pdfs
,
pdfs_tmp
,
accessor
=
accessor
)
# LB Sweep
collision_rule
=
create_lb_collision_rule
(
**
options
)
...
...
@@ -148,6 +160,10 @@ with CodeGeneration() as ctx:
previous_timestep
=
Timestep
.
EVEN
)
generate_sweep
(
ctx
,
'UniformGridGPU_MacroSetter'
,
setter_assignments
,
target
=
'gpu'
)
# Stream only kernel
generate_sweep
(
ctx
,
'UniformGridGPU_StreamOnlyKernel'
,
stream_only_kernel
,
field_swaps
=
field_swaps_stream_only
,
gpu_indexing_params
=
gpu_indexing_params
,
varying_parameters
=
vp
,
target
=
'gpu'
)
# Boundaries
noslip
=
NoSlip
()
ubb
=
UBB
((
0.05
,
0
,
0
))
...
...
python/lbmpy_walberla/walberla_lbm_generation.py
View file @
ce49cdbf
...
...
@@ -9,7 +9,7 @@ import pystencils as ps
from
lbmpy.fieldaccess
import
CollideOnlyInplaceAccessor
,
StreamPullTwoFieldsAccessor
from
lbmpy.relaxationrates
import
relaxation_rate_scaling
from
lbmpy.stencils
import
get_stencil
from
lbmpy.updatekernels
import
create_lbm_kernel
,
create_stream_
pull_
only_kernel
from
lbmpy.updatekernels
import
create_lbm_kernel
,
create_stream_only_kernel
from
pystencils
import
AssignmentCollection
,
create_kernel
from
pystencils.astnodes
import
SympyAssignment
from
pystencils.backends.cbackend
import
CBackend
,
CustomSympyPrinter
,
get_headers
...
...
@@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as
if
not
stencil_name
:
raise
ValueError
(
"lb_method uses a stencil that is not supported in waLBerla"
)
communication_stencil_name
=
stencil_name
if
stencil_name
!=
"D3Q15"
else
"D3Q27"
is_float
=
not
generation_context
.
double_accuracy
dtype_string
=
"float32"
if
is_float
else
"float64"
...
...
@@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field
collide_ast
.
function_name
=
'kernel_collide'
collide_ast
.
assumed_inner_stride_one
=
create_kernel_params
[
'cpu_vectorize_info'
][
'assume_inner_stride_one'
]
stream_update_rule
=
create_stream_
pull_
only_kernel
(
lb_method
.
stencil
,
None
,
'pdfs'
,
'pdfs_tmp'
,
field_layout
,
dtype
)
stream_update_rule
=
create_stream_only_kernel
(
lb_method
.
stencil
,
src_field
,
dst_field
,
accessor
=
StreamPullTwoFieldsAccessor
()
)
stream_ast
=
create_kernel
(
stream_update_rule
,
**
create_kernel_params
)
stream_ast
.
function_name
=
'kernel_stream'
stream_ast
.
assumed_inner_stride_one
=
create_kernel_params
[
'cpu_vectorize_info'
][
'assume_inner_stride_one'
]
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment