Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Frederik Hennig
lbmpy
Commits
ac69e4f1
Commit
ac69e4f1
authored
Oct 19, 2020
by
Frederik Hennig
Browse files
Periodicity Handling for CUDA and OpenCL
parent
9bc0632b
Pipeline
#27376
passed with stage
in 31 minutes and 42 seconds
Changes
5
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
lbmpy/advanced_streaming/boundaryhandling.py
View file @
ac69e4f1
...
@@ -47,10 +47,10 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
...
@@ -47,10 +47,10 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
if
boundary_obj
not
in
self
.
_boundary_object_to_boundary_info
:
if
boundary_obj
not
in
self
.
_boundary_object_to_boundary_info
:
sym_index_field
=
Field
.
create_generic
(
'indexField'
,
spatial_dimensions
=
1
,
sym_index_field
=
Field
.
create_generic
(
'indexField'
,
spatial_dimensions
=
1
,
dtype
=
numpy_data_type_for_boundary_object
(
boundary_obj
,
self
.
dim
))
dtype
=
numpy_data_type_for_boundary_object
(
boundary_obj
,
self
.
dim
))
kernels
=
[
self
.
_create_boundary_kernel
(
kernels
=
[
self
.
_create_boundary_kernel
(
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
EVEN
).
compile
(),
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
EVEN
).
compile
(),
self
.
_create_boundary_kernel
(
self
.
_create_boundary_kernel
(
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
ODD
).
compile
()
]
self
.
_data_handling
.
fields
[
self
.
_field_name
],
sym_index_field
,
boundary_obj
,
Timestep
.
ODD
).
compile
()]
if
flag
is
None
:
if
flag
is
None
:
flag
=
self
.
flag_interface
.
reserve_next_flag
()
flag
=
self
.
flag_interface
.
reserve_next_flag
()
boundary_info
=
self
.
FlexibleBoundaryInfo
(
self
,
boundary_obj
,
flag
,
kernels
)
boundary_info
=
self
.
FlexibleBoundaryInfo
(
self
,
boundary_obj
,
flag
,
kernels
)
...
@@ -70,7 +70,7 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
...
@@ -70,7 +70,7 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
prev_timestep
=
self
.
_boundary_handling
.
prev_timestep
prev_timestep
=
self
.
_boundary_handling
.
prev_timestep
if
prev_timestep
is
None
:
if
prev_timestep
is
None
:
raise
Exception
(
raise
Exception
(
"The flexible boundary kernel property was accessed while "
"The flexible boundary kernel property was accessed while "
+
"there was no boundary handling in progress."
)
+
"there was no boundary handling in progress."
)
return
self
.
_kernels
[
prev_timestep
]
return
self
.
_kernels
[
prev_timestep
]
...
...
lbmpy/advanced_streaming/communication.py
View file @
ac69e4f1
from
pystencils
import
Field
from
pystencils
import
Field
,
Assignment
from
pystencils.slicing
import
shift_slice
,
get_slice_before_ghost_layer
from
pystencils.slicing
import
shift_slice
,
get_slice_before_ghost_layer
,
normalize_slice
from
lbmpy.advanced_streaming.utility
import
is_inplace
,
get_accessor
,
numeric_index
,
numeric_offsets
,
Timestep
,
get_timesteps
from
lbmpy.advanced_streaming.utility
import
is_inplace
,
get_accessor
,
numeric_index
,
\
numeric_offsets
,
Timestep
,
get_timesteps
from
pystencils.datahandling
import
SerialDataHandling
from
pystencils.datahandling
import
SerialDataHandling
from
itertools
import
chain
from
itertools
import
chain
...
@@ -98,19 +99,83 @@ def get_communication_slices(
...
@@ -98,19 +99,83 @@ def get_communication_slices(
return
slices_per_comm_direction
return
slices_per_comm_direction
def
periodic_pdf_copy_kernel
(
pdf_field
,
src_slice
,
dst_slice
,
domain_size
=
None
,
target
=
'gpu'
,
opencl_queue
=
None
,
opencl_ctx
=
None
):
"""Copies a rectangular array slice onto another non-overlapping array slice"""
from
pystencils.gpucuda.kernelcreation
import
create_cuda_kernel
pdf_idx
=
src_slice
[
-
1
]
assert
isinstance
(
pdf_idx
,
int
),
"PDF index needs to be an integer constant"
assert
pdf_idx
==
dst_slice
[
-
1
],
"Source and Destination PDF indices must be equal"
src_slice
=
src_slice
[:
-
1
]
dst_slice
=
dst_slice
[:
-
1
]
if
domain_size
is
None
:
domain_size
=
pdf_field
.
spatial_shape
normalized_from_slice
=
normalize_slice
(
src_slice
,
domain_size
)
normalized_to_slice
=
normalize_slice
(
dst_slice
,
domain_size
)
def
_start
(
s
):
return
s
.
start
if
isinstance
(
s
,
slice
)
else
s
def
_stop
(
s
):
return
s
.
stop
if
isinstance
(
s
,
slice
)
else
s
offset
=
[
_start
(
s1
)
-
_start
(
s2
)
for
s1
,
s2
in
zip
(
normalized_from_slice
,
normalized_to_slice
)]
assert
offset
==
[
_stop
(
s1
)
-
_stop
(
s2
)
for
s1
,
s2
in
zip
(
normalized_from_slice
,
normalized_to_slice
)],
\
"Slices have to have same size"
copy_eq
=
Assignment
(
pdf_field
(
pdf_idx
),
pdf_field
[
tuple
(
offset
)](
pdf_idx
))
ast
=
create_cuda_kernel
([
copy_eq
],
iteration_slice
=
dst_slice
,
skip_independence_check
=
True
)
if
target
==
'gpu'
:
from
pystencils.gpucuda
import
make_python_function
return
make_python_function
(
ast
)
elif
target
==
'opencl'
:
from
pystencils.opencl
import
make_python_function
return
make_python_function
(
ast
,
opencl_queue
,
opencl_ctx
)
else
:
raise
ValueError
(
'Invalid target:'
,
target
)
class
PeriodicityHandling
:
class
PeriodicityHandling
:
def
__init__
(
self
,
stencil
,
data_handling
,
pdf_field_name
,
def
__init__
(
self
,
stencil
,
data_handling
,
pdf_field_name
,
streaming_pattern
=
'pull'
,
ghost_layers
=
1
,
gpu
=
False
):
streaming_pattern
=
'pull'
,
ghost_layers
=
1
,
target
=
'cpu'
,
opencl_queue
=
None
,
opencl_ctx
=
None
,
pycuda_direct_copy
=
True
):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda/opencl:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
- pyopencl does not support copying of non-contiguous sliced arrays, so the usage of compiled
copy kernels is forced on us. On the positive side, compilation of the OpenCL kernels appears
to be about four times faster.
"""
if
not
isinstance
(
data_handling
,
SerialDataHandling
):
if
not
isinstance
(
data_handling
,
SerialDataHandling
):
raise
ValueError
(
'Only single node data handling is supported!'
)
raise
ValueError
(
'Only single node data handling is supported!'
)
assert
target
in
[
'cpu'
,
'gpu'
,
'opencl'
]
self
.
stencil
=
stencil
self
.
stencil
=
stencil
self
.
dh
=
data_handling
self
.
dh
=
data_handling
self
.
pdf_field_name
=
pdf_field_name
self
.
pdf_field_name
=
pdf_field_name
self
.
ghost_layers
=
ghost_layers
periodicity
=
data_handling
.
periodicity
periodicity
=
data_handling
.
periodicity
self
.
inplace_pattern
=
is_inplace
(
streaming_pattern
)
self
.
inplace_pattern
=
is_inplace
(
streaming_pattern
)
self
.
gpu
=
gpu
self
.
target
=
target
self
.
cpu
=
target
==
'cpu'
self
.
opencl_queue
=
opencl_queue
self
.
opencl_ctx
=
opencl_ctx
self
.
pycuda_direct_copy
=
target
==
'gpu'
and
pycuda_direct_copy
def
is_copy_direction
(
direction
):
def
is_copy_direction
(
direction
):
for
d
,
p
in
zip
(
direction
,
periodicity
):
for
d
,
p
in
zip
(
direction
,
periodicity
):
...
@@ -120,46 +185,49 @@ class PeriodicityHandling:
...
@@ -120,46 +185,49 @@ class PeriodicityHandling:
return
True
return
True
copy_directions
=
tuple
(
filter
(
is_copy_direction
,
stencil
[
1
:]))
copy_directions
=
tuple
(
filter
(
is_copy_direction
,
stencil
[
1
:]))
# if self.inplace_pattern:
# self.comm_slices = dict()
# for timestep in [Timestep.EVEN, Timestep.ODD]:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=timestep,
# ghost_layers=ghost_layers)
# self.comm_slices[timestep] = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
# else:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=Timestep.BOTH,
# ghost_layers=ghost_layers)
# self.comm_slices = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
self
.
comm_slices
=
[]
self
.
comm_slices
=
[]
for
timestep
in
get_timesteps
(
streaming_pattern
):
timesteps
=
get_timesteps
(
streaming_pattern
)
for
timestep
in
timesteps
:
slices_per_comm_dir
=
get_communication_slices
(
stencil
=
stencil
,
slices_per_comm_dir
=
get_communication_slices
(
stencil
=
stencil
,
comm_stencil
=
copy_directions
,
comm_stencil
=
copy_directions
,
streaming_pattern
=
streaming_pattern
,
streaming_pattern
=
streaming_pattern
,
prev_timestep
=
timestep
,
prev_timestep
=
timestep
,
ghost_layers
=
ghost_layers
)
ghost_layers
=
ghost_layers
)
self
.
comm_slices
.
append
(
list
(
chain
.
from_iterable
(
v
for
k
,
v
in
slices_per_comm_dir
.
items
())))
self
.
comm_slices
.
append
(
list
(
chain
.
from_iterable
(
v
for
k
,
v
in
slices_per_comm_dir
.
items
())))
if
target
==
'opencl'
or
(
target
==
'gpu'
and
not
pycuda_direct_copy
):
self
.
device_copy_kernels
=
[]
for
timestep
in
timesteps
:
self
.
device_copy_kernels
.
append
(
self
.
_compile_copy_kernels
(
timestep
))
def
__call__
(
self
,
prev_timestep
=
Timestep
.
BOTH
):
def
__call__
(
self
,
prev_timestep
=
Timestep
.
BOTH
):
if
self
.
gpu
:
if
self
.
cpu
:
self
.
_periodicity_handling_gpu
(
prev_timestep
)
else
:
self
.
_periodicity_handling_cpu
(
prev_timestep
)
self
.
_periodicity_handling_cpu
(
prev_timestep
)
else
:
self
.
_periodicity_handling_gpu
(
prev_timestep
)
def
_periodicity_handling_cpu
(
self
,
prev_timestep
):
def
_periodicity_handling_cpu
(
self
,
prev_timestep
):
arr
=
self
.
dh
.
cpu_arrays
[
self
.
pdf_field_name
]
arr
=
self
.
dh
.
cpu_arrays
[
self
.
pdf_field_name
]
# if prev_timestep == 'both':
# comm_slices = self.comm_slices
# else:
comm_slices
=
self
.
comm_slices
[
prev_timestep
.
idx
]
comm_slices
=
self
.
comm_slices
[
prev_timestep
.
idx
]
for
src
,
dst
in
comm_slices
:
for
src
,
dst
in
comm_slices
:
arr
[
dst
]
=
arr
[
src
]
arr
[
dst
]
=
arr
[
src
]
def
_compile_copy_kernels
(
self
,
timestep
):
pdf_field
=
self
.
dh
.
fields
[
self
.
pdf_field_name
]
kernels
=
[]
for
src
,
dst
in
self
.
comm_slices
[
timestep
.
idx
]:
kernels
.
append
(
periodic_pdf_copy_kernel
(
pdf_field
,
src
,
dst
,
target
=
self
.
target
,
opencl_queue
=
self
.
opencl_queue
,
opencl_ctx
=
self
.
opencl_ctx
))
return
kernels
def
_periodicity_handling_gpu
(
self
,
prev_timestep
):
def
_periodicity_handling_gpu
(
self
,
prev_timestep
):
raise
NotImplementedError
()
arr
=
self
.
dh
.
gpu_arrays
[
self
.
pdf_field_name
]
if
self
.
pycuda_direct_copy
:
for
src
,
dst
in
self
.
comm_slices
[
prev_timestep
.
idx
]:
arr
[
dst
]
=
arr
[
src
]
else
:
kernel_args
=
{
self
.
pdf_field_name
:
arr
}
for
kernel
in
self
.
device_copy_kernels
[
prev_timestep
.
idx
]:
kernel
(
**
kernel_args
)
lbmpy/advanced_streaming/utility.py
View file @
ac69e4f1
...
@@ -9,6 +9,7 @@ from lbmpy.fieldaccess import PdfFieldAccessor, \
...
@@ -9,6 +9,7 @@ from lbmpy.fieldaccess import PdfFieldAccessor, \
import
pystencils
as
ps
import
pystencils
as
ps
from
enum
import
IntEnum
from
enum
import
IntEnum
class
Timestep
(
IntEnum
):
class
Timestep
(
IntEnum
):
EVEN
=
0
EVEN
=
0
ODD
=
1
ODD
=
1
...
@@ -22,6 +23,7 @@ class Timestep(IntEnum):
...
@@ -22,6 +23,7 @@ class Timestep(IntEnum):
"""To use this timestep as an array index"""
"""To use this timestep as an array index"""
return
self
%
2
return
self
%
2
streaming_patterns
=
[
'push'
,
'pull'
,
'aa'
,
'esotwist'
]
streaming_patterns
=
[
'push'
,
'pull'
,
'aa'
,
'esotwist'
]
even_accessors
=
{
even_accessors
=
{
...
@@ -39,7 +41,7 @@ odd_accessors = {
...
@@ -39,7 +41,7 @@ odd_accessors = {
}
}
def
get_accessor
(
streaming_pattern
:
str
,
timestep
:
Timestep
)
->
PdfFieldAccessor
:
def
get_accessor
(
streaming_pattern
:
str
,
timestep
:
Timestep
)
->
PdfFieldAccessor
:
if
streaming_pattern
not
in
streaming_patterns
:
if
streaming_pattern
not
in
streaming_patterns
:
raise
ValueError
(
raise
ValueError
(
"Invalid value of parameter 'streaming_pattern'."
,
streaming_pattern
)
"Invalid value of parameter 'streaming_pattern'."
,
streaming_pattern
)
...
@@ -56,9 +58,11 @@ def is_inplace(streaming_pattern):
...
@@ -56,9 +58,11 @@ def is_inplace(streaming_pattern):
return
streaming_pattern
in
[
'aa'
,
'esotwist'
]
return
streaming_pattern
in
[
'aa'
,
'esotwist'
]
def
get_timesteps
(
streaming_pattern
):
def
get_timesteps
(
streaming_pattern
):
return
(
Timestep
.
EVEN
,
Timestep
.
ODD
)
if
is_inplace
(
streaming_pattern
)
else
(
Timestep
.
BOTH
,
)
return
(
Timestep
.
EVEN
,
Timestep
.
ODD
)
if
is_inplace
(
streaming_pattern
)
else
(
Timestep
.
BOTH
,
)
def
numeric_offsets
(
field_access
:
ps
.
Field
.
Access
):
def
numeric_offsets
(
field_access
:
ps
.
Field
.
Access
):
return
tuple
(
int
(
o
)
for
o
in
field_access
.
offsets
)
return
tuple
(
int
(
o
)
for
o
in
field_access
.
offsets
)
...
...
lbmpy_tests/advanced_streaming/test_fully_periodic.ipynb
View file @
ac69e4f1
This diff is collapsed.
Click to expand it.
lbmpy_tests/advanced_streaming/test_fully_periodic_flow.py
View file @
ac69e4f1
...
@@ -17,10 +17,33 @@ from numpy.testing import assert_allclose, assert_array_equal
...
@@ -17,10 +17,33 @@ from numpy.testing import assert_allclose, assert_array_equal
all_results
=
dict
()
all_results
=
dict
()
targets
=
[
'cpu'
]
try
:
import
pycuda.autoinit
targets
+=
[
'gpu'
]
except
ImportError
:
pass
try
:
import
pystencils.opencl.autoinit
targets
+=
[
'opencl'
]
except
ImportError
:
pass
@
pytest
.
mark
.
parametrize
(
'target'
,
targets
)
@
pytest
.
mark
.
parametrize
(
'stencil'
,
[
'D2Q9'
,
'D3Q19'
,
'D3Q27'
])
@
pytest
.
mark
.
parametrize
(
'stencil'
,
[
'D2Q9'
,
'D3Q19'
,
'D3Q27'
])
@
pytest
.
mark
.
parametrize
(
'streaming_pattern'
,
streaming_patterns
)
@
pytest
.
mark
.
parametrize
(
'streaming_pattern'
,
streaming_patterns
)
def
test_fully_periodic_flow
(
stencil
,
streaming_pattern
):
def
test_fully_periodic_flow
(
target
,
stencil
,
streaming_pattern
):
if
target
==
'opencl'
:
from
pystencils.opencl.opencljit
import
get_global_cl_queue
opencl_queue
=
get_global_cl_queue
()
else
:
opencl_queue
=
None
gpu
=
target
in
[
'gpu'
,
'opencl'
]
# Stencil
# Stencil
stencil
=
get_stencil
(
stencil
)
stencil
=
get_stencil
(
stencil
)
...
@@ -36,7 +59,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
...
@@ -36,7 +59,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
domain_size
=
(
30
,)
*
dim
domain_size
=
(
30
,)
*
dim
periodicity
=
(
True
,)
*
dim
periodicity
=
(
True
,)
*
dim
dh
=
create_data_handling
(
domain_size
=
domain_size
,
periodicity
=
periodicity
)
dh
=
create_data_handling
(
domain_size
=
domain_size
,
periodicity
=
periodicity
,
default_target
=
target
,
opencl_queue
=
opencl_queue
)
pdfs
=
dh
.
add_array
(
'pdfs'
,
q
)
pdfs
=
dh
.
add_array
(
'pdfs'
,
q
)
if
not
inplace
:
if
not
inplace
:
...
@@ -51,7 +75,7 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
...
@@ -51,7 +75,7 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
optimization
=
{
optimization
=
{
'symbolic_field'
:
pdfs
,
'symbolic_field'
:
pdfs
,
'target'
:
'cpu'
'target'
:
target
}
}
if
not
inplace
:
if
not
inplace
:
...
@@ -92,17 +116,17 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
...
@@ -92,17 +116,17 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
setter
=
flexible_macroscopic_values_setter
(
setter
=
flexible_macroscopic_values_setter
(
lb_method
,
density
,
velocity
,
pdfs
,
lb_method
,
density
,
velocity
,
pdfs
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
zeroth_timestep
)
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
zeroth_timestep
)
setter_kernel
=
create_kernel
(
setter
,
ghost_layers
=
1
).
compile
()
setter_kernel
=
create_kernel
(
setter
,
ghost_layers
=
1
,
target
=
target
).
compile
()
getter_kernels
=
[]
getter_kernels
=
[]
for
t
in
timesteps
:
for
t
in
timesteps
:
getter
=
flexible_macroscopic_values_getter
(
getter
=
flexible_macroscopic_values_getter
(
lb_method
,
density_field
,
velocity_field
,
pdfs
,
lb_method
,
density_field
,
velocity_field
,
pdfs
,
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
t
)
streaming_pattern
=
streaming_pattern
,
previous_timestep
=
t
)
getter_kernels
.
append
(
create_kernel
(
getter
,
ghost_layers
=
1
).
compile
())
getter_kernels
.
append
(
create_kernel
(
getter
,
ghost_layers
=
1
,
target
=
target
).
compile
())
# Periodicity
# Periodicity
periodicity_handler
=
PeriodicityHandling
(
stencil
,
dh
,
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
)
periodicity_handler
=
PeriodicityHandling
(
stencil
,
dh
,
pdfs
.
name
,
streaming_pattern
=
streaming_pattern
,
target
=
target
)
# Initialization and Timestep
# Initialization and Timestep
current_timestep
=
zeroth_timestep
current_timestep
=
zeroth_timestep
...
@@ -138,7 +162,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
...
@@ -138,7 +162,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
one_step
()
one_step
()
# Evaluation
# Evaluation
if
gpu
:
dh
.
to_cpu
(
velocity_field
.
name
)
u
=
dh
.
gather_array
(
velocity_field
.
name
)
u
=
dh
.
gather_array
(
velocity_field
.
name
)
# Equal to the steady-state velocity field up to numerical errors
# Equal to the steady-state velocity field up to numerical errors
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment