Commit e7a8f3a3 authored by Markus Holzer

Merge branch 'fhennig/fix-gpu-slice-copy' into 'master'

Fix GPU copy kernels in periodicity handling

Closes #45

See merge request pycodegen/lbmpy!182
parents eb626105 d86ea18e
Pipeline #73782 passed with stages in 8 minutes and 54 seconds
import itertools
from pystencils import CreateKernelConfig, Field, Assignment, AssignmentCollection
-from pystencils.slicing import shift_slice, get_slice_before_ghost_layer, normalize_slice
-from lbmpy.advanced_streaming.utility import is_inplace, get_accessor, numeric_index, \
-    Timestep, get_timesteps, numeric_offsets
+from pystencils.slicing import (
+    shift_slice,
+    get_slice_before_ghost_layer,
+    normalize_slice,
+)
+from lbmpy.advanced_streaming.utility import (
+    is_inplace,
+    get_accessor,
+    numeric_index,
+    Timestep,
+    get_timesteps,
+    numeric_offsets,
+)
from pystencils.datahandling import SerialDataHandling
from pystencils.enums import Target
from itertools import chain
@@ -10,22 +20,28 @@ from itertools import chain
class LBMPeriodicityHandling:

-    def __init__(self, stencil, data_handling, pdf_field_name,
-                 streaming_pattern='pull', ghost_layers=1,
-                 cupy_direct_copy=True):
+    def __init__(
+        self,
+        stencil,
+        data_handling,
+        pdf_field_name,
+        streaming_pattern="pull",
+        ghost_layers=1,
+        cupy_direct_copy=True,
+    ):
        """
        Periodicity Handling for Lattice Boltzmann Streaming.

        **On the usage with cuda:**
        - cupy allows the copying of sliced arrays within device memory using the numpy syntax,
        e.g. `dst[:,0] = src[:,-1]`. In this implementation, this is the default for periodicity
        handling. Alternatively, if you set `cupy_direct_copy=False`, GPU kernels are generated and
        compiled. The compiled kernels are almost twice as fast in execution as cupy array copying,
        but especially for large stencils like D3Q27, their compilation can take up to 20 seconds.
        Choose your weapon depending on your use case.
        """
        if not isinstance(data_handling, SerialDataHandling):
-            raise ValueError('Only serial data handling is supported!')
+            raise ValueError("Only serial data handling is supported!")

        self.stencil = stencil
        self.dim = stencil.D
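
The direct-copy path described in the docstring above is ordinary sliced assignment on device arrays. A minimal sketch, assuming cupy is installed (the array shape and slice choices here are illustrative, not taken from the code):

import cupy as cp

# One periodic copy along the x-axis, entirely in device memory:
# the last interior layer is written into the opposite ghost layer.
arr = cp.random.random((6, 6))
arr[0, :] = arr[-2, :]  # ghost layer <- inner layer, no host round-trip
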
@@ -56,12 +72,16 @@ class LBMPeriodicityHandling:
        self.comm_slices = []
        timesteps = get_timesteps(streaming_pattern)
        for timestep in timesteps:
-            slices_per_comm_dir = get_communication_slices(stencil=stencil,
-                                                           comm_stencil=copy_directions,
-                                                           streaming_pattern=streaming_pattern,
-                                                           prev_timestep=timestep,
-                                                           ghost_layers=ghost_layers)
-            self.comm_slices.append(list(chain.from_iterable(v for k, v in slices_per_comm_dir.items())))
+            slices_per_comm_dir = get_communication_slices(
+                stencil=stencil,
+                comm_stencil=copy_directions,
+                streaming_pattern=streaming_pattern,
+                prev_timestep=timestep,
+                ghost_layers=ghost_layers,
+            )
+            self.comm_slices.append(
+                list(chain.from_iterable(v for k, v in slices_per_comm_dir.items()))
+            )

        if self.target == Target.GPU and not cupy_direct_copy:
            self.device_copy_kernels = list()
@@ -81,11 +101,11 @@ class LBMPeriodicityHandling:
            arr[dst] = arr[src]

    def _compile_copy_kernels(self, timestep):
        assert self.target == Target.GPU
        pdf_field = self.dh.fields[self.pdf_field_name]
        kernels = []
        for src, dst in self.comm_slices[timestep.idx]:
-            kernels.append(
-                periodic_pdf_copy_kernel(pdf_field, src, dst, target=self.target))
+            kernels.append(periodic_pdf_gpu_copy_kernel(pdf_field, src, dst))
        return kernels

    def _periodicity_handling_gpu(self, prev_timestep):
@@ -100,7 +120,12 @@ class LBMPeriodicityHandling:
def get_communication_slices(
-        stencil, comm_stencil=None, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1):
+    stencil,
+    comm_stencil=None,
+    streaming_pattern="pull",
+    prev_timestep=Timestep.BOTH,
+    ghost_layers=1,
+):
    """
    Return the source and destination slices for periodicity handling or communication between blocks.
@@ -116,7 +141,9 @@ def get_communication_slices(
    if comm_stencil is None:
        comm_stencil = itertools.product(*([-1, 0, 1] for _ in range(stencil.D)))

-    pdfs = Field.create_generic('pdfs', spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,))
+    pdfs = Field.create_generic(
+        "pdfs", spatial_dimensions=len(stencil[0]), index_shape=(stencil.Q,)
+    )
    write_accesses = get_accessor(streaming_pattern, prev_timestep).write(pdfs, stencil)
    slices_per_comm_direction = dict()
@@ -130,7 +157,9 @@ def get_communication_slices(
            d = stencil.index(streaming_dir)
            write_index = numeric_index(write_accesses[d])[0]

-            origin_slice = get_slice_before_ghost_layer(comm_dir, ghost_layers=ghost_layers, thickness=1)
+            origin_slice = get_slice_before_ghost_layer(
+                comm_dir, ghost_layers=ghost_layers, thickness=1
+            )
            src_slice = _fix_length_one_slices(origin_slice)

            write_offsets = numeric_offsets(write_accesses[d])
@@ -138,13 +167,15 @@ def get_communication_slices(
            # TODO: this is just a hotfix. _trim_slice_in_direction breaks FreeSlip BC with adjacent periodic side
            if streaming_pattern != "pull":
-                src_slice = shift_slice(_trim_slice_in_direction(src_slice, tangential_dir), write_offsets)
+                src_slice = shift_slice(
+                    _trim_slice_in_direction(src_slice, tangential_dir), write_offsets
+                )

            neighbour_transform = _get_neighbour_transform(comm_dir, ghost_layers)
            dst_slice = shift_slice(src_slice, neighbour_transform)

-            src_slice = src_slice + (write_index, )
-            dst_slice = dst_slice + (write_index, )
+            src_slice = src_slice + (write_index,)
+            dst_slice = dst_slice + (write_index,)

            slices_for_dir.append((src_slice, dst_slice))
@@ -152,10 +183,10 @@ def get_communication_slices(

    return slices_per_comm_direction
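
For orientation, a usage sketch mirroring the tests shown later in this commit: the returned dict maps each communication direction to a list of (src, dst) index tuples that can be applied directly to a pdf array.

import numpy as np
from lbmpy.stencils import LBStencil
from lbmpy.enums import Stencil

stencil = LBStencil(Stencil.D2Q9)
arr = np.zeros((4,) * stencil.D + (stencil.Q,))
slices = get_communication_slices(
    stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
)
for comm_dir, pairs in slices.items():
    for src, dst in pairs:
        arr[dst] = arr[src]  # one periodic copy per direction and pdf index
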
-def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
-                             domain_size=None, target=Target.GPU):
-    """Copies a rectangular array slice onto another non-overlapping array slice"""
-    from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+def periodic_pdf_gpu_copy_kernel(pdf_field, src_slice, dst_slice, domain_size=None):
+    """Generate a GPU kernel which copies all values from one slice of a field
+    to another non-overlapping slice."""
+    from pystencils import create_kernel

    pdf_idx = src_slice[-1]
    assert isinstance(pdf_idx, int), "PDF index needs to be an integer constant"
@@ -176,18 +207,28 @@ def periodic_pdf_copy_kernel(pdf_field, src_slice, dst_slice,
    def _stop(s):
        return s.stop if isinstance(s, slice) else s

-    offset = [_start(s1) - _start(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
-    assert offset == [_stop(s1) - _stop(s2) for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
-        "Slices have to have same size"
-
-    copy_eq = AssignmentCollection(main_assignments=[Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))])
-    config = CreateKernelConfig(iteration_slice=dst_slice, skip_independence_check=True)
-    ast = create_cuda_kernel(copy_eq, config=config)
-    if target == Target.GPU:
-        from pystencils.gpucuda import make_python_function
-        return make_python_function(ast)
-    else:
-        raise ValueError('Invalid target:', target)
+    offset = [
+        _start(s1) - _start(s2)
+        for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
+    ]
+    assert offset == [
+        _stop(s1) - _stop(s2)
+        for s1, s2 in zip(normalized_from_slice, normalized_to_slice)
+    ], "Slices have to have same size"
+
+    copy_eq = AssignmentCollection(
+        main_assignments=[
+            Assignment(pdf_field(pdf_idx), pdf_field[tuple(offset)](pdf_idx))
+        ]
+    )
+    config = CreateKernelConfig(
+        iteration_slice=dst_slice,
+        skip_independence_check=True,
+        target=Target.GPU,
+    )
+    ast = create_kernel(copy_eq, config=config)
+    return ast.compile()
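
A worked example of the offset arithmetic above, with hypothetical slice values: one ghost layer, copying the inner layer at the right boundary of a (6, 6) field onto the left ghost layer.

# normalized_from_slice (src): (slice(4, 5), slice(1, 5))
# normalized_to_slice   (dst): (slice(0, 1), slice(1, 5))
# offset = [4 - 0, 1 - 1] = [4, 0]
# The compiled kernel iterates over dst_slice; each destination cell reads the
# value offset cells away, i.e. pdf_field(pdf_idx) <- pdf_field[4, 0](pdf_idx).
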
def _extend_dir(direction):
@@ -196,10 +237,10 @@ def _extend_dir(direction):
    elif direction[0] == 0:
        for d in [-1, 0, 1]:
            for rest in _extend_dir(direction[1:]):
-                yield (d, ) + rest
+                yield (d,) + rest
    else:
        for rest in _extend_dir(direction[1:]):
-            yield (direction[0], ) + rest
+            yield (direction[0],) + rest
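
For intuition, `_extend_dir` expands the zero components of a communication direction into all tangential variants. A hedged check, assuming the base case (elided above) yields the empty tuple for an empty direction:

assert list(_extend_dir((0, 1))) == [(-1, 1), (0, 1), (1, 1)]
assert list(_extend_dir((1, 0))) == [(1, -1), (1, 0), (1, 1)]
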
def _get_neighbour_transform(direction, ghost_layers):
@@ -172,7 +172,7 @@ class PdfsToMomentsByMatrixTransform(AbstractRawMomentTransform):
    # ----------------------------- Private Members -----------------------------

-    @ property
+    @property
    def _default_simplification(self):
        forward_simp = SimplificationStrategy()
        # forward_simp.add(substitute_moments_in_conserved_quantity_equations)
@@ -218,7 +218,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
            self.moment_polynomials)
        self.poly_to_mono_matrix = self.mono_to_poly_matrix.inv()

-    @ property
+    @property
    def absorbs_conserved_quantity_equations(self):
        return True
@@ -414,7 +414,7 @@ class PdfsToMomentsByChimeraTransform(AbstractRawMomentTransform):
    # ----------------------------- Private Members -----------------------------

-    @ property
+    @property
    def _default_simplification(self):
        from lbmpy.methods.momentbased.momentbasedsimplifications import (
            substitute_moments_in_conserved_quantity_equations,
@@ -5,36 +5,52 @@ import numpy as np
from lbmpy.stencils import LBStencil
from pystencils.slicing import get_slice_before_ghost_layer, get_ghost_region_slice
from lbmpy.creationfunctions import create_lb_update_rule, LBMConfig, LBMOptimisation
-from lbmpy.advanced_streaming.communication import get_communication_slices, _fix_length_one_slices, \
-    LBMPeriodicityHandling
+from lbmpy.advanced_streaming.communication import (
+    get_communication_slices,
+    _fix_length_one_slices,
+    LBMPeriodicityHandling,
+    periodic_pdf_gpu_copy_kernel,
+)
from lbmpy.advanced_streaming.utility import streaming_patterns, Timestep
from lbmpy.enums import Stencil

import pytest


-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
-@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
-@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
def test_slices_not_empty(stencil, streaming_pattern, timestep):
    stencil = LBStencil(stencil)
    arr = np.zeros((4,) * stencil.D + (stencil.Q,))
-    slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
-                                      ghost_layers=1)
+    slices = get_communication_slices(
+        stencil,
+        streaming_pattern=streaming_pattern,
+        prev_timestep=timestep,
+        ghost_layers=1,
+    )
    for _, slices_list in slices.items():
        for src, dst in slices_list:
            assert all(s != 0 for s in arr[src].shape)
            assert all(s != 0 for s in arr[dst].shape)


-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
-@pytest.mark.parametrize('streaming_pattern', streaming_patterns)
-@pytest.mark.parametrize('timestep', [Timestep.EVEN, Timestep.ODD])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+@pytest.mark.parametrize("timestep", [Timestep.EVEN, Timestep.ODD])
def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
    stencil = LBStencil(stencil)
    arr = np.zeros((4,) * stencil.D + (stencil.Q,))
-    slices = get_communication_slices(stencil, streaming_pattern=streaming_pattern, prev_timestep=timestep,
-                                      ghost_layers=1)
+    slices = get_communication_slices(
+        stencil,
+        streaming_pattern=streaming_pattern,
+        prev_timestep=timestep,
+        ghost_layers=1,
+    )
    for _, slices_list in slices.items():
        for src, dst in slices_list:
            src_shape = arr[src].shape
@@ -42,12 +58,15 @@ def test_src_dst_same_shape(stencil, streaming_pattern, timestep):
            assert src_shape == dst_shape


-@pytest.mark.parametrize('stencil', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
+@pytest.mark.parametrize(
+    "stencil", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
def test_pull_communication_slices(stencil):
    stencil = LBStencil(stencil)

    slices = get_communication_slices(
-        stencil, streaming_pattern='pull', prev_timestep=Timestep.BOTH, ghost_layers=1)
+        stencil, streaming_pattern="pull", prev_timestep=Timestep.BOTH, ghost_layers=1
+    )
    for i, d in enumerate(stencil):
        if i == 0:
            continue
@@ -58,21 +77,115 @@ def test_pull_communication_slices(stencil):
                dst = s[1][:-1]
                break

-        inner_slice = _fix_length_one_slices(get_slice_before_ghost_layer(d, ghost_layers=1))
+        inner_slice = _fix_length_one_slices(
+            get_slice_before_ghost_layer(d, ghost_layers=1)
+        )
        inv_dir = (-e for e in d)
-        gl_slice = _fix_length_one_slices(get_ghost_region_slice(inv_dir, ghost_layers=1))
+        gl_slice = _fix_length_one_slices(
+            get_ghost_region_slice(inv_dir, ghost_layers=1)
+        )
        assert src == inner_slice
        assert dst == gl_slice
-@pytest.mark.parametrize('stencil_name', [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27])
+@pytest.mark.parametrize("direction", LBStencil(Stencil.D3Q27).stencil_entries)
+@pytest.mark.parametrize("pull", [False, True])
+def test_gpu_comm_kernels(direction: tuple, pull: bool):
+    pytest.importorskip("cupy")
+
+    stencil = LBStencil(Stencil.D3Q27)
+    inv_dir = stencil[stencil.inverse_index(direction)]
+    target = ps.Target.GPU
+
+    domain_size = (4,) * stencil.D
+
+    dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
+
+    field = dh.add_array("field", values_per_cell=2)
+
+    if pull:
+        dst_slice = get_ghost_region_slice(inv_dir)
+        src_slice = get_slice_before_ghost_layer(direction)
+    else:
+        dst_slice = get_slice_before_ghost_layer(direction)
+        src_slice = get_ghost_region_slice(inv_dir)
+
+    src_slice += (1,)
+    dst_slice += (1,)
+
+    kernel = periodic_pdf_gpu_copy_kernel(field, src_slice, dst_slice)
+
+    dh.cpu_arrays[field.name][src_slice] = 42.0
+    dh.all_to_gpu()
+    dh.run_kernel(kernel)
+    dh.all_to_cpu()
+
+    np.testing.assert_equal(dh.cpu_arrays[field.name][dst_slice], 42.0)
+
+
+@pytest.mark.parametrize("stencil", [Stencil.D2Q9, Stencil.D3Q19])
+@pytest.mark.parametrize("streaming_pattern", streaming_patterns)
+def test_direct_copy_and_kernels_equivalence(stencil: Stencil, streaming_pattern: str):
+    pytest.importorskip("cupy")
+
+    target = ps.Target.GPU
+    stencil = LBStencil(stencil)
+    domain_size = (4,) * stencil.D
+
+    dh: ps.datahandling.SerialDataHandling = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )
+
+    pdfs_a = dh.add_array("pdfs_a", values_per_cell=stencil.Q)
+    pdfs_b = dh.add_array("pdfs_b", values_per_cell=stencil.Q)
+
+    dh.fill(pdfs_a.name, 0.0, ghost_layers=True)
+    dh.fill(pdfs_b.name, 0.0, ghost_layers=True)
+
+    for q in range(stencil.Q):
+        sl = ps.make_slice[:4, :4, q] if stencil.D == 2 else ps.make_slice[:4, :4, :4, q]
+        dh.cpu_arrays[pdfs_a.name][sl] = q
+        dh.cpu_arrays[pdfs_b.name][sl] = q
+
+    dh.all_to_gpu()
+
+    direct_copy = LBMPeriodicityHandling(stencil, dh, pdfs_a.name, streaming_pattern, cupy_direct_copy=True)
+    kernels_copy = LBMPeriodicityHandling(stencil, dh, pdfs_b.name, streaming_pattern, cupy_direct_copy=False)
+
+    direct_copy(Timestep.EVEN)
+    kernels_copy(Timestep.EVEN)
+
+    dh.all_to_cpu()
+    np.testing.assert_equal(
+        dh.cpu_arrays[pdfs_a.name],
+        dh.cpu_arrays[pdfs_b.name]
+    )
+
+
+@pytest.mark.parametrize(
+    "stencil_name", [Stencil.D2Q9, Stencil.D3Q15, Stencil.D3Q19, Stencil.D3Q27]
+)
def test_optimised_and_full_communication_equivalence(stencil_name):
    target = ps.Target.CPU
    stencil = LBStencil(stencil_name)
-    domain_size = (4, ) * stencil.D
+    domain_size = (4,) * stencil.D

-    dh = ps.create_data_handling(domain_size, periodicity=(True, ) * stencil.D,
-                                 parallel=False, default_target=target)
+    dh = ps.create_data_handling(
+        domain_size,
+        periodicity=(True,) * stencil.D,
+        parallel=False,
+        default_target=target,
+    )

    pdf = dh.add_array("pdf", values_per_cell=len(stencil), dtype=np.int64)
    dh.fill("pdf", 0, ghost_layers=True)
@@ -82,9 +195,9 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
    gl = dh.ghost_layers_of_field("pdf")

    num = 0
-    for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
-        dh.cpu_arrays['pdf'][idx] = num
-        dh.cpu_arrays['pdf_tmp'][idx] = num
+    for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
+        dh.cpu_arrays["pdf"][idx] = num
+        dh.cpu_arrays["pdf_tmp"][idx] = num
        num += 1

    lbm_config = LBMConfig(stencil=stencil, kernel_type="stream_pull_only")
@@ -95,21 +208,27 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
    ast = ps.create_kernel(ac, config=config)
    stream = ast.compile()

-    full_communication = dh.synchronization_function(pdf.name, target=dh.default_target, optimization={"openmp": True})
+    full_communication = dh.synchronization_function(
+        pdf.name, target=dh.default_target, optimization={"openmp": True}
+    )
    full_communication()

    dh.run_kernel(stream)
    dh.swap("pdf", "pdf_tmp")
-    pdf_full_communication = np.copy(dh.cpu_arrays['pdf'])
+    pdf_full_communication = np.copy(dh.cpu_arrays["pdf"])

    num = 0
-    for idx, x in np.ndenumerate(dh.cpu_arrays['pdf']):
-        dh.cpu_arrays['pdf'][idx] = num
-        dh.cpu_arrays['pdf_tmp'][idx] = num
+    for idx, x in np.ndenumerate(dh.cpu_arrays["pdf"]):
+        dh.cpu_arrays["pdf"][idx] = num
+        dh.cpu_arrays["pdf_tmp"][idx] = num
        num += 1

-    optimised_communication = LBMPeriodicityHandling(stencil=stencil, data_handling=dh, pdf_field_name=pdf.name,
-                                                     streaming_pattern='pull')
+    optimised_communication = LBMPeriodicityHandling(
+        stencil=stencil,
+        data_handling=dh,
+        pdf_field_name=pdf.name,
+        streaming_pattern="pull",
+    )
    optimised_communication()

    dh.run_kernel(stream)
    dh.swap("pdf", "pdf_tmp")
@@ -119,9 +238,14 @@ def test_optimised_and_full_communication_equivalence(stencil_name):
            for j in range(gl, domain_size[1]):
                for k in range(gl, domain_size[2]):
                    for f in range(len(stencil)):
-                        assert dh.cpu_arrays['pdf'][i, j, k, f] == pdf_full_communication[i, j, k, f], print(f)
+                        assert (
+                            dh.cpu_arrays["pdf"][i, j, k, f]
+                            == pdf_full_communication[i, j, k, f]
+                        ), print(f)
    else:
        for i in range(gl, domain_size[0]):
            for j in range(gl, domain_size[1]):
                for f in range(len(stencil)):
-                    assert dh.cpu_arrays['pdf'][i, j, f] == pdf_full_communication[i, j, f]
+                    assert (
+                        dh.cpu_arrays["pdf"][i, j, f] == pdf_full_communication[i, j, f]
+                    )