Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Frederik Hennig
lbmpy
Commits
ac69e4f1
Commit
ac69e4f1
authored
Oct 19, 2020
by
Frederik Hennig
Browse files
Periodicity Handling for CUDA and OpenCL
parent
9bc0632b
Pipeline
#27376
passed with stage
in 31 minutes and 42 seconds
Changes
5
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
lbmpy/advanced_streaming/boundaryhandling.py
View file @
ac69e4f1
...
...
@@ -47,10 +47,10 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
if
boundary_obj
not
in
self
.
_boundary_object_to_boundary_info
:
sym_index_field
=
Field
.
create_generic
(
'indexField'
,
spatial_dimensions
=
1
,
dtype
=
numpy_data_type_for_boundary_object
(
boundary_obj
,
self
.
dim
))
kernels
=
[
self
.
_create_boundary_kernel
(
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
EVEN
).
compile
(),
self
.
_create_boundary_kernel
(
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
ODD
).
compile
()
]
kernels
=
[
self
.
_create_boundary_kernel
(
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
EVEN
).
compile
(),
self
.
_create_boundary_kernel
(
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
ODD
).
compile
()]
if
flag
is
None
:
flag
=
self
.
flag_interface
.
reserve_next_flag
()
boundary_info
=
self
.
FlexibleBoundaryInfo
(
self
,
boundary_obj
,
flag
,
kernels
)
...
...
@@ -70,7 +70,7 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
prev_timestep
=
self
.
_boundary_handling
.
prev_timestep
if
prev_timestep
is
None
:
raise
Exception
(
"The flexible boundary kernel property was accessed while "
"The flexible boundary kernel property was accessed while "
+
"there was no boundary handling in progress."
)
return
self
.
_kernels
[
prev_timestep
]
...
...
lbmpy/advanced_streaming/communication.py
View file @
ac69e4f1
from
pystencils
import
Field
from
pystencils.slicing
import
shift_slice
,
get_slice_before_ghost_layer
from
lbmpy.advanced_streaming.utility
import
is_inplace
,
get_accessor
,
numeric_index
,
numeric_offsets
,
Timestep
,
get_timesteps
from
pystencils
import
Field
,
Assignment
from
pystencils.slicing
import
shift_slice
,
get_slice_before_ghost_layer
,
normalize_slice
from
lbmpy.advanced_streaming.utility
import
is_inplace
,
get_accessor
,
numeric_index
,
\
numeric_offsets
,
Timestep
,
get_timesteps
from
pystencils.datahandling
import
SerialDataHandling
from
itertools
import
chain
...
...
@@ -98,19 +99,83 @@ def get_communication_slices(
return
slices_per_comm_direction
def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice, domain_size=None,
                             target='gpu', opencl_queue=None, opencl_ctx=None):
    """Compile a device kernel copying a rectangular pdf-array slice onto another,
    non-overlapping slice of the same field.

    Args:
        pdf_field: pystencils Field holding the pdf values.
        src_slice: source slice; its last entry is an integer pdf index.
        dst_slice: destination slice; must address the same pdf index.
        domain_size: spatial shape used to normalize the slices; defaults to the
            field's spatial shape.
        target: 'gpu' (CUDA via pycuda) or 'opencl'.
        opencl_queue, opencl_ctx: forwarded only for the 'opencl' target.

    Returns:
        A compiled python function that performs the copy on the device.
    """
    from pystencils.gpucuda.kernelcreation import create_cuda_kernel

    # The trailing slice entry selects a single pdf index; it has to be the same
    # on both sides so a single assignment per cell suffices.
    pdf_idx = src_slice[-1]
    assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
    assert pdf_idx == dst_slice[-1], "Source and Destination PDF indices must be equal"
    src_slice = src_slice[:-1]
    dst_slice = dst_slice[:-1]

    if domain_size is None:
        domain_size = pdf_field.spatial_shape

    norm_src = normalize_slice(src_slice, domain_size)
    norm_dst = normalize_slice(dst_slice, domain_size)

    def _bounds(s):
        # Uniform (start, stop) view for both slice objects and scalar indices.
        return (s.start, s.stop) if isinstance(s, slice) else (s, s)

    src_bounds = [_bounds(s) for s in norm_src]
    dst_bounds = [_bounds(s) for s in norm_dst]

    # Per-dimension shift from destination to source cells; start- and
    # stop-differences must agree, otherwise the slices differ in extent.
    offset = [a[0] - b[0] for a, b in zip(src_bounds, dst_bounds)]
    assert offset == [a[1] - b[1] for a, b in zip(src_bounds, dst_bounds)], \
        "Slices have to have same size"

    copy_eq = Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
    ast = create_cuda_kernel([copy_eq], iteration_slice=dst_slice, skip_independence_check=True)

    if target == 'gpu':
        from pystencils.gpucuda import make_python_function
        return make_python_function(ast)
    elif target == 'opencl':
        from pystencils.opencl import make_python_function
        return make_python_function(ast, opencl_queue, opencl_ctx)
    else:
        raise ValueError('Invalid target:', target)
class
PeriodicityHandling
:
def
__init__
(
self
,
stencil
,
data_handling
,
pdf_field_name
,
streaming_pattern
=
'pull'
,
ghost_layers
=
1
,
gpu
=
False
):
streaming_pattern
=
'pull'
,
ghost_layers
=
1
,
target
=
'cpu'
,
opencl_queue
=
None
,
opencl_ctx
=
None
,
pycuda_direct_copy
=
True
):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda/opencl:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
- pyopencl does not support copying of non-contiguous sliced arrays, so the usage of compiled
copy kernels is forced on us. On the positive side, compilation of the OpenCL kernels appears
to be about four times faster.
"""
if
not
isinstance
(
data_handling
,
SerialDataHandling
):
raise
ValueError
(
'Only single node data handling is supported!'
)
assert
target
in
[
'cpu'
,
'gpu'
,
'opencl'
]
self
.
stencil
=
stencil
self
.
dh
=
data_handling
self
.
pdf_field_name
=
pdf_field_name
self
.
ghost_layers
=
ghost_layers
periodicity
=
data_handling
.
periodicity
self
.
inplace_pattern
=
is_inplace
(
streaming_pattern
)
self
.
gpu
=
gpu
self
.
target
=
target
self
.
cpu
=
target
==
'cpu'
self
.
opencl_queue
=
opencl_queue
self
.
opencl_ctx
=
opencl_ctx
self
.
pycuda_direct_copy
=
target
==
'gpu'
and
pycuda_direct_copy
def
is_copy_direction
(
direction
):
for
d
,
p
in
zip
(
direction
,
periodicity
):
...
...
@@ -120,46 +185,49 @@ class PeriodicityHandling:
return
True
copy_directions
=
tuple
(
filter
(
is_copy_direction
,
stencil
[
1
:]))
# if self.inplace_pattern:
# self.comm_slices = dict()
# for timestep in [Timestep.EVEN, Timestep.ODD]:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=timestep,
# ghost_layers=ghost_layers)
# self.comm_slices[timestep] = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
# else:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=Timestep.BOTH,
# ghost_layers=ghost_layers)
# self.comm_slices = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
self
.
comm_slices
=
[]
for
timestep
in
get_timesteps
(
streaming_pattern
):
timesteps
=
get_timesteps
(
streaming_pattern
)
for
timestep
in
timesteps
:
slices_per_comm_dir
=
get_communication_slices
(
stencil
=
stencil
,
comm_stencil
=
copy_directions
,
streaming_pattern
=
streaming_pattern
,
prev_timestep
=
timestep
,
ghost_layers
=
ghost_layers
)
comm_stencil
=
copy_directions
,
streaming_pattern
=
streaming_pattern
,
prev_timestep
=
timestep
,
ghost_layers
=
ghost_layers
)
self
.
comm_slices
.
append
(
list
(
chain
.
from_iterable
(
v
for
k
,
v
in
slices_per_comm_dir
.
items
())))
if
target
==
'opencl'
or
(
target
==
'gpu'
and
not
pycuda_direct_copy
):
self
.
device_copy_kernels
=
[]
for
timestep
in
timesteps
:
self
.
device_copy_kernels
.
append
(
self
.
_compile_copy_kernels
(
timestep
))
def __call__(self, prev_timestep=Timestep.BOTH):
    """Apply periodicity by copying ghost-layer slices, dispatching to the
    host or device implementation depending on the configured target."""
    if self.cpu:
        self._periodicity_handling_cpu(prev_timestep)
    else:
        # 'gpu' and 'opencl' targets both go through the device path.
        self._periodicity_handling_gpu(prev_timestep)
def _periodicity_handling_cpu(self, prev_timestep):
    """Copy the periodic communication slices within the host (numpy) pdf array.

    `prev_timestep.idx` selects the slice set precomputed for this timestep
    parity in the constructor.
    """
    pdf_arr = self.dh.cpu_arrays[self.pdf_field_name]
    for src, dst in self.comm_slices[prev_timestep.idx]:
        pdf_arr[dst] = pdf_arr[src]
def _compile_copy_kernels(self, timestep):
    """Compile one device copy kernel per communication slice pair of the
    given timestep parity and return them as a list."""
    pdf_field = self.dh.fields[self.pdf_field_name]
    return [
        periodic_pdf_copy_kernel(
            pdf_field, src, dst,
            target=self.target,
            opencl_queue=self.opencl_queue,
            opencl_ctx=self.opencl_ctx)
        for src, dst in self.comm_slices[timestep.idx]
    ]
def _periodicity_handling_gpu(self, prev_timestep):
    """Copy the periodic communication slices within the device pdf array.

    Uses pycuda's sliced-array assignment when `pycuda_direct_copy` is set;
    otherwise runs the copy kernels compiled in the constructor (always the
    case for the OpenCL target, which cannot copy non-contiguous slices).
    """
    pdf_arr = self.dh.gpu_arrays[self.pdf_field_name]
    if self.pycuda_direct_copy:
        for src, dst in self.comm_slices[prev_timestep.idx]:
            pdf_arr[dst] = pdf_arr[src]
    else:
        kernel_args = {self.pdf_field_name: pdf_arr}
        for kernel in self.device_copy_kernels[prev_timestep.idx]:
            kernel(**kernel_args)
lbmpy/advanced_streaming/utility.py
View file @
ac69e4f1
...
...
@@ -9,6 +9,7 @@ from lbmpy.fieldaccess import PdfFieldAccessor, \
import
pystencils
as
ps
from
enum
import
IntEnum
class
Timestep
(
IntEnum
):
EVEN
=
0
ODD
=
1
...
...
@@ -22,6 +23,7 @@ class Timestep(IntEnum):
"""To use this timestep as an array index"""
return
self
%
2
streaming_patterns
=
[
'push'
,
'pull'
,
'aa'
,
'esotwist'
]
even_accessors
=
{
...
...
@@ -39,7 +41,7 @@ odd_accessors = {
}
def
get_accessor
(
streaming_pattern
:
str
,
timestep
:
Timestep
)
->
PdfFieldAccessor
:
def
get_accessor
(
streaming_pattern
:
str
,
timestep
:
Timestep
)
->
PdfFieldAccessor
:
if
streaming_pattern
not
in
streaming_patterns
:
raise
ValueError
(
"Invalid value of parameter 'streaming_pattern'."
,
streaming_pattern
)
...
...
@@ -56,9 +58,11 @@ def is_inplace(streaming_pattern):
return
streaming_pattern
in
[
'aa'
,
'esotwist'
]
def get_timesteps(streaming_pattern):
    """Return the timestep parities for which kernels must be generated:
    (EVEN, ODD) for in-place patterns, (BOTH,) otherwise."""
    if is_inplace(streaming_pattern):
        return (Timestep.EVEN, Timestep.ODD)
    return (Timestep.BOTH, )
def numeric_offsets(field_access: ps.Field.Access):
    """Return the spatial offsets of a field access as a tuple of plain ints."""
    return tuple(map(int, field_access.offsets))
...
...
lbmpy_tests/advanced_streaming/test_fully_periodic.ipynb
View file @
ac69e4f1
This source diff could not be displayed because it is too large. You can
view the blob
instead.
lbmpy_tests/advanced_streaming/test_fully_periodic_flow.py
View file @
ac69e4f1
...
...
@@ -17,10 +17,33 @@ from numpy.testing import assert_allclose, assert_array_equal
# Results of all parametrized runs, keyed later by configuration.
all_results = dict()

# Only test targets whose backing packages are importable.
targets = ['cpu']

try:
    import pycuda.autoinit  # noqa: F401
except ImportError:
    pass
else:
    targets.append('gpu')

try:
    import pystencils.opencl.autoinit  # noqa: F401
except ImportError:
    pass
else:
    targets.append('opencl')
@
pytest
.
mark
.
parametrize
(
'target'
,
targets
)
@
pytest
.
mark
.
parametrize
(
'stencil'
,
[
'D2Q9'
,
'D3Q19'
,
'D3Q27'
])
@
pytest
.
mark
.
parametrize
(
'streaming_pattern'
,
streaming_patterns
)
def
test_fully_periodic_flow
(
stencil
,
streaming_pattern
):
def
test_fully_periodic_flow
(
target
,
stencil
,
streaming_pattern
):
if
target
==
'opencl'
:
from
pystencils.opencl.opencljit
import
get_global_cl_queue
opencl_queue
=
get_global_cl_queue
()
else
:
opencl_queue
=
None
gpu
=
target
in
[
'gpu'
,
'opencl'
]
# Stencil
stencil
=
get_stencil
(
stencil
)
...
...
@@ -36,7 +59,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
domain_size
=
(
30
,)
*
dim
periodicity
=
(
True
,)
*
dim
dh
=
create_data_handling
(
domain_size
=
domain_size
,
periodicity
=
periodicity
)
dh
=
create_data_handling
(
domain_size
=
domain_size
,
periodicity
=
periodicity
,
default_target
=
target
,
opencl_queue
=
opencl_queue
)
pdfs
=
dh
.
add_array
(
'pdfs'
,
q
)
if
not
inplace
:
...
...
@@ -51,7 +75,7 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
optimization
=
{
'symbolic_field'
:
pdfs
,
'target'
:
'cpu'
'target'
:
target
}
if
not
inplace
:
...
...
@@ -92,17 +116,17 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
setter
=
flexible_macroscopic_values_setter
(
lb_method
,
density
,
velocity
,
pdfs
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
zeroth_timestep
)
setter_kernel
=
create_kernel
(
setter
,
ghost_layers
=
1
).
compile
()
setter_kernel
=
create_kernel
(
setter
,
ghost_layers
=
1
,
target
=
target
).
compile
()
getter_kernels
=
[]
for
t
in
timesteps
:
getter
=
flexible_macroscopic_values_getter
(
lb_method
,
density_field
,
velocity_field
,
pdfs
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
t
)
getter_kernels
.
append
(
create_kernel
(
getter
,
ghost_layers
=
1
).
compile
())
getter_kernels
.
append
(
create_kernel
(
getter
,
ghost_layers
=
1
,
target
=
target
).
compile
())
# Periodicity
periodicity_handler
=
PeriodicityHandling
(
stencil
,
dh
,
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
)
periodicity_handler
=
PeriodicityHandling
(
stencil
,
dh
,
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
,
target
=
target
)
# Initialization and Timestep
current_timestep
=
zeroth_timestep
...
...
@@ -138,7 +162,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
one_step
()
# Evaluation
if
gpu
:
dh
.
to_cpu
(
velocity_field
.
name
)
u
=
dh
.
gather_array
(
velocity_field
.
name
)
# Equal to the steady-state velocity field up to numerical errors
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment