Commit ac69e4f1 authored by Frederik Hennig's avatar Frederik Hennig
Browse files

Periodicity Handling for CUDA and OpenCL

parent 9bc0632b
Pipeline #27376 passed with stage
in 31 minutes and 42 seconds
...@@ -47,10 +47,10 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling): ...@@ -47,10 +47,10 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
if boundary_obj not in self._boundary_object_to_boundary_info: if boundary_obj not in self._boundary_object_to_boundary_info:
sym_index_field = Field.create_generic('indexField', spatial_dimensions=1, sym_index_field = Field.create_generic('indexField', spatial_dimensions=1,
dtype=numpy_data_type_for_boundary_object(boundary_obj, self.dim)) dtype=numpy_data_type_for_boundary_object(boundary_obj, self.dim))
kernels = [ self._create_boundary_kernel( kernels = [self._create_boundary_kernel(
self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.EVEN).compile(), self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.EVEN).compile(),
self._create_boundary_kernel( self._create_boundary_kernel(
self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.ODD).compile() ] self._data_handling.fields[self._field_name], sym_index_field, boundary_obj, Timestep.ODD).compile()]
if flag is None: if flag is None:
flag = self.flag_interface.reserve_next_flag() flag = self.flag_interface.reserve_next_flag()
boundary_info = self.FlexibleBoundaryInfo(self, boundary_obj, flag, kernels) boundary_info = self.FlexibleBoundaryInfo(self, boundary_obj, flag, kernels)
...@@ -70,7 +70,7 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling): ...@@ -70,7 +70,7 @@ class FlexibleLBMBoundaryHandling(BoundaryHandling):
prev_timestep = self._boundary_handling.prev_timestep prev_timestep = self._boundary_handling.prev_timestep
if prev_timestep is None: if prev_timestep is None:
raise Exception( raise Exception(
"The flexible boundary kernel property was accessed while " "The flexible boundary kernel property was accessed while "
+ "there was no boundary handling in progress.") + "there was no boundary handling in progress.")
return self._kernels[prev_timestep] return self._kernels[prev_timestep]
......
from pystencils import Field from pystencils import Field, Assignment
from pystencils.slicing import shift_slice, get_slice_before_ghost_layer from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, numeric_offsets, Timestep, get_timesteps from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
numeric_offsets, Timestep, get_timesteps
from pystencils.datahandling import SerialDataHandling from pystencils.datahandling import SerialDataHandling
from itertools import chain from itertools import chain
...@@ -98,19 +99,83 @@ def get_communication_slices( ...@@ -98,19 +99,83 @@ def get_communication_slices(
return slices_per_comm_direction return slices_per_comm_direction
def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
domain_size=None, target='gpu',
opencl_queue=None, opencl_ctx=None):
"""Copies a rectangular array slice onto another non-overlapping array slice"""
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
pdf_idx = src_slice[-1]
assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
assert pdf_idx == dst_slice[-1], "Source and Destination PDF indices must be equal"
src_slice = src_slice[:-1]
dst_slice = dst_slice[:-1]
if domain_size is None:
domain_size = pdf_field.spatial_shape
normalized_from_slice = normalize_slice(src_slice, domain_size)
normalized_to_slice = normalize_slice(dst_slice, domain_size)
def _start(s):
return s.start if isinstance(s, slice) else s
def _stop(s):
return s.stop if isinstance(s, slice) else s
offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
copy_eq = Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
ast = create_cuda_kernel([copy_eq], iteration_slice=dst_slice, skip_independence_check=True)
if target == 'gpu':
from pystencils.gpucuda import make_python_function
return make_python_function(ast)
elif target == 'opencl':
from pystencils.opencl import make_python_function
return make_python_function(ast, opencl_queue, opencl_ctx)
else:
raise ValueError('Invalid target:', target)
class PeriodicityHandling: class PeriodicityHandling:
def __init__(self, stencil, data_handling, pdf_field_name, def __init__(self, stencil, data_handling, pdf_field_name,
streaming_pattern='pull', ghost_layers=1, gpu=False): streaming_pattern='pull', ghost_layers=1, target='cpu',
opencl_queue=None, opencl_ctx=None,
pycuda_direct_copy=True):
"""
Periodicity Handling for Lattice Boltzmann Streaming.
**On the usage with cuda/opencl:**
- pycuda allows the copying of sliced arrays within device memory using the numpy syntax,
e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
handling. Alternatively, if you set `pycuda_direct_copy=False`, GPU kernels are generated and
compiled. The compiled kernels are almost twice as fast in execution as pycuda array copying,
but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
Choose your weapon depending on your use case.
- pyopencl does not support copying of non-contiguous sliced arrays, so the usage of compiled
copy kernels is forced on us. On the positive side, compilation of the OpenCL kernels appears
to be about four times faster.
"""
if not isinstance(data_handling, SerialDataHandling): if not isinstance(data_handling, SerialDataHandling):
raise ValueError('Only single node data handling is supported!') raise ValueError('Only single node data handling is supported!')
assert target in ['cpu', 'gpu', 'opencl']
self.stencil = stencil self.stencil = stencil
self.dh = data_handling self.dh = data_handling
self.pdf_field_name = pdf_field_name self.pdf_field_name = pdf_field_name
self.ghost_layers = ghost_layers
periodicity = data_handling.periodicity periodicity = data_handling.periodicity
self.inplace_pattern = is_inplace(streaming_pattern) self.inplace_pattern = is_inplace(streaming_pattern)
self.gpu = gpu self.target = target
self.cpu = target == 'cpu'
self.opencl_queue = opencl_queue
self.opencl_ctx = opencl_ctx
self.pycuda_direct_copy = target == 'gpu' and pycuda_direct_copy
def is_copy_direction(direction): def is_copy_direction(direction):
for d, p in zip(direction, periodicity): for d, p in zip(direction, periodicity):
...@@ -120,46 +185,49 @@ class PeriodicityHandling: ...@@ -120,46 +185,49 @@ class PeriodicityHandling:
return True return True
copy_directions = tuple(filter(is_copy_direction, stencil[1:])) copy_directions = tuple(filter(is_copy_direction, stencil[1:]))
# if self.inplace_pattern:
# self.comm_slices = dict()
# for timestep in [Timestep.EVEN, Timestep.ODD]:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=timestep,
# ghost_layers=ghost_layers)
# self.comm_slices[timestep] = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
# else:
# slices_per_comm_dir = get_communication_slices(stencil=stencil,
# comm_stencil=copy_directions,
# streaming_pattern=streaming_pattern,
# prev_timestep=Timestep.BOTH,
# ghost_layers=ghost_layers)
# self.comm_slices = list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
self.comm_slices = [] self.comm_slices = []
for timestep in get_timesteps(streaming_pattern): timesteps = get_timesteps(streaming_pattern)
for timestep in timesteps:
slices_per_comm_dir = get_communication_slices(stencil=stencil, slices_per_comm_dir = get_communication_slices(stencil=stencil,
comm_stencil=copy_directions, comm_stencil=copy_directions,
streaming_pattern=streaming_pattern, streaming_pattern=streaming_pattern,
prev_timestep=timestep, prev_timestep=timestep,
ghost_layers=ghost_layers) ghost_layers=ghost_layers)
self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))) self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
if target == 'opencl' or (target == 'gpu' and not pycuda_direct_copy):
self.device_copy_kernels = []
for timestep in timesteps:
self.device_copy_kernels.append(self._compile_copy_kernels(timestep))
def __call__(self, prev_timestep=Timestep.BOTH): def __call__(self, prev_timestep=Timestep.BOTH):
if self.gpu: if self.cpu:
self._periodicity_handling_gpu(prev_timestep)
else:
self._periodicity_handling_cpu(prev_timestep) self._periodicity_handling_cpu(prev_timestep)
else:
self._periodicity_handling_gpu(prev_timestep)
def _periodicity_handling_cpu(self, prev_timestep): def _periodicity_handling_cpu(self, prev_timestep):
arr = self.dh.cpu_arrays[self.pdf_field_name] arr = self.dh.cpu_arrays[self.pdf_field_name]
# if prev_timestep == 'both':
# comm_slices = self.comm_slices
# else:
comm_slices = self.comm_slices[prev_timestep.idx] comm_slices = self.comm_slices[prev_timestep.idx]
for src, dst in comm_slices: for src, dst in comm_slices:
arr[dst] = arr[src] arr[dst] = arr[src]
def _compile_copy_kernels(self, timestep):
pdf_field = self.dh.fields[self.pdf_field_name]
kernels = []
for src, dst in self.comm_slices[timestep.idx]:
kernels.append(
periodic_pdf_copy_kernel(
pdf_field, src, dst, target=self.target,
opencl_queue=self.opencl_queue, opencl_ctx=self.opencl_ctx))
return kernels
def _periodicity_handling_gpu(self, prev_timestep): def _periodicity_handling_gpu(self, prev_timestep):
raise NotImplementedError() arr = self.dh.gpu_arrays[self.pdf_field_name]
if self.pycuda_direct_copy:
for src, dst in self.comm_slices[prev_timestep.idx]:
arr[dst] = arr[src]
else:
kernel_args = {self.pdf_field_name: arr}
for kernel in self.device_copy_kernels[prev_timestep.idx]:
kernel(**kernel_args)
...@@ -9,6 +9,7 @@ from lbmpy.fieldaccess import PdfFieldAccessor, \ ...@@ -9,6 +9,7 @@ from lbmpy.fieldaccess import PdfFieldAccessor, \
import pystencils as ps import pystencils as ps
from enum import IntEnum from enum import IntEnum
class Timestep(IntEnum): class Timestep(IntEnum):
EVEN = 0 EVEN = 0
ODD = 1 ODD = 1
...@@ -22,6 +23,7 @@ class Timestep(IntEnum): ...@@ -22,6 +23,7 @@ class Timestep(IntEnum):
"""To use this timestep as an array index""" """To use this timestep as an array index"""
return self % 2 return self % 2
streaming_patterns = ['push', 'pull', 'aa', 'esotwist'] streaming_patterns = ['push', 'pull', 'aa', 'esotwist']
even_accessors = { even_accessors = {
...@@ -39,7 +41,7 @@ odd_accessors = { ...@@ -39,7 +41,7 @@ odd_accessors = {
} }
def get_accessor(streaming_pattern : str, timestep : Timestep) -> PdfFieldAccessor: def get_accessor(streaming_pattern: str, timestep: Timestep) -> PdfFieldAccessor:
if streaming_pattern not in streaming_patterns: if streaming_pattern not in streaming_patterns:
raise ValueError( raise ValueError(
"Invalid value of parameter 'streaming_pattern'.", streaming_pattern) "Invalid value of parameter 'streaming_pattern'.", streaming_pattern)
...@@ -56,9 +58,11 @@ def is_inplace(streaming_pattern): ...@@ -56,9 +58,11 @@ def is_inplace(streaming_pattern):
return streaming_pattern in ['aa', 'esotwist'] return streaming_pattern in ['aa', 'esotwist']
def get_timesteps(streaming_pattern): def get_timesteps(streaming_pattern):
return (Timestep.EVEN, Timestep.ODD) if is_inplace(streaming_pattern) else (Timestep.BOTH, ) return (Timestep.EVEN, Timestep.ODD) if is_inplace(streaming_pattern) else (Timestep.BOTH, )
def numeric_offsets(field_access: ps.Field.Access): def numeric_offsets(field_access: ps.Field.Access):
return tuple(int(o) for o in field_access.offsets) return tuple(int(o) for o in field_access.offsets)
......
...@@ -17,10 +17,33 @@ from numpy.testing import assert_allclose, assert_array_equal ...@@ -17,10 +17,33 @@ from numpy.testing import assert_allclose, assert_array_equal
all_results = dict() all_results = dict()
targets = ['cpu']
try:
import pycuda.autoinit
targets += ['gpu']
except ImportError:
pass
try:
import pystencils.opencl.autoinit
targets += ['opencl']
except ImportError:
pass
@pytest.mark.parametrize('target', targets)
@pytest.mark.parametrize('stencil', ['D2Q9', 'D3Q19', 'D3Q27']) @pytest.mark.parametrize('stencil', ['D2Q9', 'D3Q19', 'D3Q27'])
@pytest.mark.parametrize('streaming_pattern', streaming_patterns) @pytest.mark.parametrize('streaming_pattern', streaming_patterns)
def test_fully_periodic_flow(stencil, streaming_pattern): def test_fully_periodic_flow(target, stencil, streaming_pattern):
if target == 'opencl':
from pystencils.opencl.opencljit import get_global_cl_queue
opencl_queue = get_global_cl_queue()
else:
opencl_queue = None
gpu = target in ['gpu', 'opencl']
# Stencil # Stencil
stencil = get_stencil(stencil) stencil = get_stencil(stencil)
...@@ -36,7 +59,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern): ...@@ -36,7 +59,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
domain_size = (30,) * dim domain_size = (30,) * dim
periodicity = (True,) * dim periodicity = (True,) * dim
dh = create_data_handling(domain_size=domain_size, periodicity=periodicity) dh = create_data_handling(domain_size=domain_size, periodicity=periodicity,
default_target=target, opencl_queue=opencl_queue)
pdfs = dh.add_array('pdfs', q) pdfs = dh.add_array('pdfs', q)
if not inplace: if not inplace:
...@@ -51,7 +75,7 @@ def test_fully_periodic_flow(stencil, streaming_pattern): ...@@ -51,7 +75,7 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
optimization = { optimization = {
'symbolic_field': pdfs, 'symbolic_field': pdfs,
'target': 'cpu' 'target': target
} }
if not inplace: if not inplace:
...@@ -92,17 +116,17 @@ def test_fully_periodic_flow(stencil, streaming_pattern): ...@@ -92,17 +116,17 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
setter = flexible_macroscopic_values_setter( setter = flexible_macroscopic_values_setter(
lb_method, density, velocity, pdfs, lb_method, density, velocity, pdfs,
streaming_pattern=streaming_pattern, previous_timestep=zeroth_timestep) streaming_pattern=streaming_pattern, previous_timestep=zeroth_timestep)
setter_kernel = create_kernel(setter, ghost_layers=1).compile() setter_kernel = create_kernel(setter, ghost_layers=1, target=target).compile()
getter_kernels = [] getter_kernels = []
for t in timesteps: for t in timesteps:
getter = flexible_macroscopic_values_getter( getter = flexible_macroscopic_values_getter(
lb_method, density_field, velocity_field, pdfs, lb_method, density_field, velocity_field, pdfs,
streaming_pattern=streaming_pattern, previous_timestep=t) streaming_pattern=streaming_pattern, previous_timestep=t)
getter_kernels.append(create_kernel(getter, ghost_layers=1).compile()) getter_kernels.append(create_kernel(getter, ghost_layers=1, target=target).compile())
# Periodicity # Periodicity
periodicity_handler = PeriodicityHandling(stencil, dh, pdfs.name, streaming_pattern=streaming_pattern) periodicity_handler = PeriodicityHandling(stencil, dh, pdfs.name, streaming_pattern=streaming_pattern, target=target)
# Initialization and Timestep # Initialization and Timestep
current_timestep = zeroth_timestep current_timestep = zeroth_timestep
...@@ -138,7 +162,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern): ...@@ -138,7 +162,8 @@ def test_fully_periodic_flow(stencil, streaming_pattern):
one_step() one_step()
# Evaluation # Evaluation
if gpu:
dh.to_cpu(velocity_field.name)
u = dh.gather_array(velocity_field.name) u = dh.gather_array(velocity_field.name)
# Equal to the steady-state velocity field up to numerical errors # Equal to the steady-state velocity field up to numerical errors
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment