Commit ac69e4f1 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

Periodicity Handling for CUDA and OpenCL

parent 9bc0632b
Pipeline #27376 passed with stage
in 31 minutes and 42 seconds
......@@ -47,10 +47,10 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
if boundary_obj not in self._boundary_object_to_boundary_info:
sym_index_field = Field.create_generic('indexField', spatial_dimensions=1,
dtype=numpy_data_type_for_boundary_object(boundary_obj, self.dim))
kernels = [ self._create_boundary_kernel(
kernels = [self._create_boundary_kernel(
self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.EVEN).compile(),
self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.ODD).compile() ]
self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.ODD).compile()]
if flag is None:
flag = self.flag_interface.reserve_next_flag()
boundary_info = self.FlexibleBoundaryInfo(self, boundary_obj, flag, kernels)
from pystencils import Field
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, numeric_offsets, Timestep, get_timesteps
from pystencils import Field, Assignment
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
numeric_offsets, Timestep, get_timesteps
from pystencils.datahandling import SerialDataHandling
from itertools import chain
......@@ -98,19 +99,83 @@ def get_communication_slices(
return slices_per_comm_direction
def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
domain_size=None, target='gpu',
opencl_queue=None, opencl_ctx=None):
"""Copies a rectangular array slice onto another non-overlapping array slice"""
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
pdf_idx = src_slice[-1]
assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
assert pdf_idx == dst_slice[-1], "Source and Destination PDF indices must be equal"
src_slice = src_slice[:-1]
dst_slice = dst_slice[:-1]
if domain_size is None:
domain_size = pdf_field.spatial_shape
normalized_from_slice = normalize_slice(src_slice, domain_size)
normalized_to_slice = normalize_slice(dst_slice, domain_size)
def _start(s):
return s.start if isinstance(s, slice) else s
def _stop(s):
return s.stop if isinstance(s, slice) else s
offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
copy_eq = Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
ast = create_cuda_kernel([copy_eq], iteration_slice=dst_slice, skip_independence_check=True)
if target == 'gpu':
from pystencils.gpucuda import make_python_function
return make_python_function(ast)
elif target == 'opencl':
from pystencils.opencl import make_python_function
return make_python_function(ast, opencl_queue, opencl_ctx)
raise ValueError('Invalid target:', target)
class PeriodicityHandling:
def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1, gpu=False):
streaming_pattern='pull', ghost_layers=1, target='cpu',
opencl_queue=None, opencl_ctx=None,
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda/opencl:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
- pyopencl does not support copying of non-contiguous sliced arrays, so the usage of compiled
copy kernels is forced on us. On the positive side, compilation of the OpenCL kernels appears
to be about four times faster.
if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only single node data handling is supported!')
assert target in ['cpu', 'gpu', 'opencl']
self.stencil = stencil
self.dh = data_handling
self.pdf_field_name = pdf_field_name
self.ghost_layers = ghost_layers
periodicity = data_handling.periodicity
self.inplace_pattern = is_inplace(streaming_pattern)
self.gpu = gpu = target
self.cpu = target == 'cpu'
self.opencl_queue = opencl_queue
self.opencl_ctx = opencl_ctx
self.pycuda_direct_copy = target == 'gpu' and pycuda_direct_copy
def is_copy_direction(direction):
for d, p in zip(direction, periodicity):
......@@ -120,25 +185,9 @@ class PeriodicityHandling:
return True
copy_directions = tuple(filter(is_copy_direction, stencil[1:]))
# if self.inplace_pattern:
# self.comm_slices = dict()
# for timestep in [Timestep.EVEN, Timestep.ODD]:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=timestep,
# ghost_layers=ghost_layers)
# self.comm_slices[timestep] = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
# else:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=Timestep.BOTH,
# ghost_layers=ghost_layers)
# self.comm_slices = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
self.comm_slices = []
for timestep in get_timesteps(streaming_pattern):
timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil,
......@@ -146,20 +195,39 @@ class PeriodicityHandling:
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
if target == 'opencl' or (target == 'gpu' and not pycuda_direct_copy):
self.device_copy_kernels = []
for timestep in timesteps:
def __call__(self, prev_timestep=Timestep.BOTH):
if self.gpu:
if self.cpu:
def _periodicity_handling_cpu(self, prev_timestep):
arr = self.dh.cpu_arrays[self.pdf_field_name]
# if prev_timestep == 'both':
# comm_slices = self.comm_slices
# else:
comm_slices = self.comm_slices[prev_timestep.idx]
for src, dst in comm_slices:
arr[dst] = arr[src]
def _compile_copy_kernels(self, timestep):
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
pdf_field, src, dst,,
opencl_queue=self.opencl_queue, opencl_ctx=self.opencl_ctx))
return kernels
def _periodicity_handling_gpu(self, prev_timestep):
raise NotImplementedError()
arr = self.dh.gpu_arrays[self.pdf_field_name]
if self.pycuda_direct_copy:
for src, dst in self.comm_slices[prev_timestep.idx]:
arr[dst] = arr[src]
kernel_args = {self.pdf_field_name: arr}
for kernel in self.device_copy_kernels[prev_timestep.idx]:
......@@ -9,6 +9,7 @@ from lbmpy.fieldaccess import PdfFieldAccessor, \
import pystencils as ps
from enum import IntEnum
class Timestep(IntEnum):
EVEN = 0
ODD = 1
......@@ -22,6 +23,7 @@ class Timestep(IntEnum):
"""To use this timestep as an array index"""
return self % 2
streaming_patterns = ['push', 'pull', 'aa', 'esotwist']
even_accessors = {
......@@ -39,7 +41,7 @@ odd_accessors = {
def get_accessor(streaming_pattern : str, timestep : Timestep) -> PdfFieldAccessor:
def get_accessor(streaming_pattern: str, timestep: Timestep) -> PdfFieldAccessor:
if streaming_pattern not in streaming_patterns:
raise ValueError(
"Invalid value of parameter 'streaming_pattern'.", streaming_pattern)
......@@ -56,9 +58,11 @@ def is_inplace(streaming_pattern):
return streaming_pattern in ['aa', 'esotwist']
def get_timesteps(streaming_pattern):
return (Timestep.EVEN, Timestep.ODD) if is_inplace(streaming_pattern) else (Timestep.BOTH, )
def numeric_offsets(field_access: ps.Field.Access):
return tuple(int(o) for o in field_access.offsets)
......@@ -17,10 +17,33 @@ from numpy.testing import assert_allclose, assert_array_equal
all_results = dict()
targets = ['cpu']
import pycuda.autoinit
targets += ['gpu']
except ImportError:
import pystencils.opencl.autoinit
targets += ['opencl']
except ImportError:
@pytest.mark.parametrize('target', targets)
@pytest.mark.parametrize('stencil', ['D2Q9', 'D3Q19', 'D3Q27'])
@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
def test_fully_periodic_flow(stencil, streaming_pattern):
def test_fully_periodic_flow(target, stencil, streaming_pattern):
if target == 'opencl':
from pystencils.opencl.opencljit import get_global_cl_queue
opencl_queue = get_global_cl_queue()
opencl_queue = None
gpu = target in ['gpu', 'opencl']
# Stencil
stencil = get_stencil(stencil)
......@@ -36,7 +59,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
domain_size = (30,) * dim
periodicity = (True,) * dim
dh = create_data_handling(domain_size=domain_size, periodicity=periodicity)
dh = create_data_handling(domain_size=domain_size, periodicity=periodicity,
default_target=target, opencl_queue=opencl_queue)
pdfs = dh.add_array('pdfs', q)
if not inplace:
......@@ -51,7 +75,7 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
optimization = {
'symbolic_field': pdfs,
'target': 'cpu'
'target': target
if not inplace:
......@@ -92,17 +116,17 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
setter = flexible_macroscopic_values_setter(
lb_method, density, velocity, pdfs,
streaming_pattern=streaming_pattern, previous_timestep=zeroth_timestep)
setter_kernel = create_kernel(setter, ghost_layers=1).compile()
setter_kernel = create_kernel(setter, ghost_layers=1, target=target).compile()
getter_kernels = []
for t in timesteps:
getter = flexible_macroscopic_values_getter(
lb_method, density_field, velocity_field, pdfs,
streaming_pattern=streaming_pattern, previous_timestep=t)
getter_kernels.append(create_kernel(getter, ghost_layers=1).compile())
getter_kernels.append(create_kernel(getter, ghost_layers=1, target=target).compile())
# Periodicity
periodicity_handler = PeriodicityHandling(stencil, dh,, streaming_pattern=streaming_pattern)
periodicity_handler = PeriodicityHandling(stencil, dh,, streaming_pattern=streaming_pattern, target=target)
# Initialization and Timestep
current_timestep = zeroth_timestep
......@@ -138,7 +162,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
# Evaluation
if gpu:
u = dh.gather_array(
# Equal to the steady-state velocity field up to numerical errors
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment