diff --git a/pystencils/boundaries/boundaryconditions.py b/pystencils/boundaries/boundaryconditions.py index 10870c4cd5cd999b05fbc3dd72cb9de6710c25a6..6628be82082366e61a4b518d2806f92838057e64 100644 --- a/pystencils/boundaries/boundaryconditions.py +++ b/pystencils/boundaries/boundaryconditions.py @@ -1,6 +1,6 @@ -from typing import List, Tuple, Any from pystencils import Assignment from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo +from typing import List, Tuple, Any from pystencils.data_types import create_type diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py index 7e4d19cd8e24aa485a48893d4c992d34078a0f13..dd2b70b89bc8ecf3d8ecb1c187be6442f6aa0efc 100644 --- a/pystencils/datahandling/serial_datahandling.py +++ b/pystencils/datahandling/serial_datahandling.py @@ -250,7 +250,14 @@ class SerialDataHandling(DataHandling): def synchronization_function_gpu(self, names, stencil_name=None, **_): return self.synchronization_function(names, stencil_name, 'gpu') - def synchronization_function(self, names, stencil=None, target=None, **_): + def get_communication_before_sweep(self, names, stencil_name=None, update_rule=None, **_): + return self.synchronization_function(names, stencil_name, self.default_target, update_rule, 'pull') + + def get_communication_after_sweep(self, names, stencil_name=None, update_rule=None, **_): + return self.synchronization_function(names, stencil_name, self.default_target, update_rule, 'push') + + def synchronization_function(self, names, stencil=None, target=None, update_rule=None, + reads_writes="pull", **_): if target is None: target = self.default_target assert target in ('cpu', 'gpu') @@ -265,11 +272,11 @@ class SerialDataHandling(DataHandling): elif (stencil is None and self.dim == 3) or (stencil is not None and stencil.startswith('D3')): directions = itertools.product(*[neighbors] * 3) else: - raise ValueError("Invalid stencil") + directions = itertools.product(*[neighbors] * 1) for direction in directions: use_direction = True - if direction == (0, 0) or direction == (0, 0, 0): + if direction == (0,) or direction == (0, 0) or direction == (0, 0, 0): use_direction = False for component, periodicity in zip(direction, self._periodicity): if not periodicity and component != 0: @@ -290,15 +297,31 @@ class SerialDataHandling(DataHandling): if len(filtered_stencil) > 0: if target == 'cpu': - from pystencils.slicing import get_periodic_boundary_functor - result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls)) + if update_rule is None: + from pystencils.slicing import get_periodic_boundary_functor + result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls)) + else: + from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func + result.append(boundary_func(filtered_stencil, self._domainSize, + index_dimensions=self.fields[name].index_dimensions, + index_dim_shape=values_per_cell, + dtype=self.fields[name].dtype.numpy_dtype, + ghost_layers=gls, + target='cpu', + update_rule=update_rule, + reads_writes=reads_writes)) + else: from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func + result.append(boundary_func(filtered_stencil, self._domainSize, index_dimensions=self.fields[name].index_dimensions, index_dim_shape=values_per_cell, dtype=self.fields[name].dtype.numpy_dtype, - ghost_layers=gls)) + ghost_layers=gls, + target='gpu', + update_rule=update_rule, + reads_writes=reads_writes)) if target == 'cpu': def result_functor(): diff --git a/pystencils/gpucuda/periodicity.py b/pystencils/gpucuda/periodicity.py index 5657d4618623b421c0d3ef25a60774085d85bb32..ba6232322b00871d0a21c7a21e935c12f011f728 100644 --- a/pystencils/gpucuda/periodicity.py +++ b/pystencils/gpucuda/periodicity.py @@ -1,11 +1,15 @@ import numpy as np -from pystencils import Field, Assignment +from pystencils import Field, Assignment, AssignmentCollection from pystencils.slicing import normalize_slice, get_periodic_boundary_src_dst_slices -from pystencils.gpucuda import make_python_function -from pystencils.gpucuda.kernelcreation import create_cuda_kernel +from pystencils.gpucuda import make_python_function as make_python_function_gpu +from pystencils.cpu import make_python_function as make_python_function_cpu +from pystencils import create_kernel +from lbmpy.methods.abstractlbmethod import LbmCollisionRule -def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64): + +def create_copy_kernel(domain_size, from_slice, to_slice, stencil, index_dimensions=0, index_dim_shape=1, + dtype=np.float64, target=None, update_rule_assignments=None, reads_writes="pull"): """Copies a rectangular part of an array to another non-overlapping part""" if index_dimensions not in (0, 1): raise NotImplementedError("Works only for one or zero index coordinates") @@ -18,25 +22,86 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in assert offset == [s1.stop - s2.stop for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \ "Slices have to have same size" - update_eqs = [] - for i in range(index_dim_shape): - eq = Assignment(f(i), f[tuple(offset)](i)) - update_eqs.append(eq) + update_eqs = set() + if update_rule_assignments is None: + for i in range(index_dim_shape): + eq = Assignment(f(i), f[tuple(offset)](i)) + update_eqs.add(eq) + else: + for update_rule_assignment in update_rule_assignments: + if reads_writes == "pull": + field_accesses = update_rule_assignment.rhs.atoms(Field.Access) + else: + field_accesses = update_rule_assignment.lhs.atoms(Field.Access) + + for field_access in field_accesses: + if 0 in stencil: + for stencil_part, off in zip(stencil, field_access.offsets): + if (stencil_part < 0 and off < 0) or (stencil_part > 0 and off > 0): + if reads_writes == "pull": + eq = Assignment(f(field_access.index[0]), f[tuple(offset)](field_access.index[0])) + update_eqs.add(eq) + if reads_writes == "push": + eq = Assignment(f[tuple(offset)](field_access.index[0]), f(field_access.index[0])) + update_eqs.add(eq) + else: + if stencil == field_access.offsets: + if reads_writes == "pull": + eq = Assignment(f(field_access.index[0]), f[tuple(offset)](field_access.index[0])) + update_eqs.add(eq) + if reads_writes == "push": + eq = Assignment(f[tuple(offset)](field_access.index[0]), f(field_access.index[0])) + update_eqs.add(eq) + + kernels = [] + if len(update_eqs) > 0: + ast = create_kernel(list(update_eqs), iteration_slice=to_slice, + skip_independence_check=True, target=target) + if target == 'cpu': + kernels.append(make_python_function_cpu(ast)) + else: + kernels.append(make_python_function_gpu(ast)) - ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True) - return make_python_function(ast) + return kernels def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1, - thickness=None, dtype=float): + thickness=None, dtype=float, target=None, update_rule=None, reads_writes="pull"): + + # sort stencil entry's by number of zeros to make sure, that surfaces, edges and corners are + # communicated in that order. Otherwise values (e.g. corner values) will be overwritten + stencil.sort(key=number_of_zeros, reverse=True) + src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness) kernels = [] - index_dimensions = index_dimensions - for src_slice, dst_slice in src_dst_slice_tuples: - kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)) + + if isinstance(update_rule, Assignment): + update_rule_assignments = [update_rule] + elif isinstance(update_rule, (AssignmentCollection, LbmCollisionRule)): + update_rule_assignments = update_rule.all_assignments + else: + update_rule_assignments = None + + # by default we only get the communication pattern for pull stream + if reads_writes is not "pull" or reads_writes is not "push": + assert "not a valid argument for the definition of read or/and write access" + + for i, (src_slice, dst_slice) in enumerate(src_dst_slice_tuples): + for j in create_copy_kernel(domain_size, src_slice, dst_slice, stencil[i], index_dimensions, + index_dim_shape, dtype, target, update_rule_assignments, reads_writes): + kernels.append(j) def functor(pdfs, **_): for kernel in kernels: kernel(pdfs=pdfs) return functor + + +def number_of_zeros(stencil_entry): + count_zeros = 0 + for tmp in stencil_entry: + if tmp == 0: + count_zeros += 1 + + return count_zeros diff --git a/pystencils/transformations.py b/pystencils/transformations.py index 4490e3e1a8e265549879e30fe7c60da02e510336..0346b7d93744e30f968dcb6567cfd8706b6fdc37 100644 --- a/pystencils/transformations.py +++ b/pystencils/transformations.py @@ -175,12 +175,13 @@ def make_loop_over_domain(body, function_name, iteration_slice=None, ghost_layer if iteration_slice is not None: iteration_slice = normalize_slice(iteration_slice, shape) - - if ghost_layers is None: - required_ghost_layers = max([fa.required_ghost_layers for fa in field_accesses]) - ghost_layers = [(required_ghost_layers, required_ghost_layers)] * len(loop_order) - if isinstance(ghost_layers, int): - ghost_layers = [(ghost_layers, ghost_layers)] * len(loop_order) + ghost_layers = 0 + else: + if ghost_layers is None: + required_ghost_layers = max([fa.required_ghost_layers for fa in field_accesses]) + ghost_layers = [(required_ghost_layers, required_ghost_layers)] * len(loop_order) + if isinstance(ghost_layers, int): + ghost_layers = [(ghost_layers, ghost_layers)] * len(loop_order) current_body = body for i, loop_coordinate in enumerate(reversed(loop_order)): diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py index 3afddb128d2bb649f6bc5454d3ddfcd71d2a61a2..4a682526d0fbeaad813ddb308954508468fc80cb 100644 --- a/pystencils_tests/test_cudagpu.py +++ b/pystencils_tests/test_cudagpu.py @@ -135,7 +135,7 @@ def test_periodicity(): arr_gpu = gpuarray.to_gpu(arr_cpu) periodicity_stencil = [(1, 0), (-1, 0), (1, 1)] - periodic_gpu_kernel = periodic_gpu(periodicity_stencil, (5, 5), 1, 2) + periodic_gpu_kernel = periodic_gpu(periodicity_stencil, (5, 5), 1, 2, target='gpu') periodic_cpu_kernel = periodic_cpu(periodicity_stencil) cpu_result = np.copy(arr_cpu)