diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py index 9e2df4e0a1183d752f560985ab15acee041f2b76..423b115837bda768ec8313abdc50c9a5ac017085 100644 --- a/pystencils/gpucuda/indexing.py +++ b/pystencils/gpucuda/indexing.py @@ -169,7 +169,7 @@ class BlockIndexing(AbstractIndexing): conditions = [c < e for c, e in zip(self.coordinates, end)] for cuda_index, iter_slice in zip(self.cuda_indices, self._iterationSlice): - if iter_slice.step > 1: + if isinstance(iter_slice, slice) and iter_slice.step > 1: conditions.append(sp.Eq(sp.Mod(cuda_index, iter_slice.step), 0)) condition = conditions[0] @@ -177,6 +177,9 @@ class BlockIndexing(AbstractIndexing): condition = sp.And(condition, c) return Block([Conditional(condition, kernel_content)]) + def iteration_space(self, arr_shape): + return _iteration_space(self._iterationSlice, arr_shape) + @staticmethod def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None): """Shrinks the block_size if there are too many registers used per multiprocessor. @@ -284,6 +287,9 @@ class LineIndexing(AbstractIndexing): def symbolic_parameters(self): return set() + def iteration_space(self, arr_shape): + return _iteration_space(self._iterationSlice, arr_shape) + # -------------------------------------- Helper functions -------------------------------------------------------------- @@ -310,6 +316,23 @@ def _get_end_from_slice(iteration_slice, arr_shape): return res +def _get_steps_from_slice(iteration_slice): + res = [] + for slice_component in iteration_slice: + if type(slice_component) is slice: + res.append(slice_component.step) + else: + res.append(1) + return res + + +def _iteration_space(iteration_slice, arr_shape): + starts = _get_start_from_slice(iteration_slice) + ends = _get_end_from_slice(iteration_slice, arr_shape) + steps = _get_steps_from_slice(iteration_slice) + return [slice(start, end, step) for start, end, step in zip(starts, ends, steps)] + + def indexing_creator_from_params(gpu_indexing, gpu_indexing_params): if isinstance(gpu_indexing, str): if gpu_indexing == 'block': diff --git a/pystencils/gpucuda/kernelcreation.py b/pystencils/gpucuda/kernelcreation.py index 611c1e56b4975d3cad12440553032f67b6465c82..96a53138316ac97d4518cc870d7a4dd5f3fed001 100644 --- a/pystencils/gpucuda/kernelcreation.py +++ b/pystencils/gpucuda/kernelcreation.py @@ -13,7 +13,7 @@ from pystencils.node_collection import NodeCollection from pystencils.gpucuda.indexing import indexing_creator_from_params from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.transformations import ( - get_base_buffer_index, get_common_shape, parse_base_pointer_info, + get_base_buffer_index, get_common_field, parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols) @@ -44,7 +44,9 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], field_accesses = {e for e in field_accesses if not e.is_absolute_access} num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field)) - common_shape = get_common_shape(fields_without_buffers) + # common shape and field to from the iteration space + common_field = get_common_field(fields_without_buffers) + common_shape = common_field.spatial_shape if iteration_slice is None: # determine iteration slice from ghost layers @@ -62,7 +64,7 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], iteration_slice.append(slice(ghost_layers[i][0], -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None)) - indexing = indexing_creator(field=list(fields_without_buffers)[0], iteration_slice=iteration_slice) + indexing = indexing_creator(field=common_field, iteration_slice=iteration_slice) coord_mapping = indexing.coordinates cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value) @@ -92,7 +94,8 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection], coord_mapping = {f.name: cell_idx_symbols for f in all_fields} if any(FieldType.is_buffer(f) for f in all_fields): - resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, common_shape), read_only_fields) + iteration_space = indexing.iteration_space(common_shape) + resolve_buffer_accesses(ast, get_base_buffer_index(ast, cell_idx_symbols, iteration_space), read_only_fields) resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info, field_to_fixed_coordinates=coord_mapping) @@ -157,7 +160,7 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape)) function_body = Block(coordinate_symbol_assignments + assignments) - function_body = indexing.guard(function_body, get_common_shape(index_fields)) + function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape) ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function, None, function_name, assignments=assignments) ast.global_variables.update(indexing.index_variables) diff --git a/pystencils/transformations.py b/pystencils/transformations.py index 9f10a5f0a3bd1a1e23587ff58dcb2f3a7f50f6b6..02806d62223c8443cc2bb91740afd2c0ef6bda55 100644 --- a/pystencils/transformations.py +++ b/pystencils/transformations.py @@ -122,9 +122,10 @@ def unify_shape_symbols(body, common_shape, fields): body.subs(substitutions) -def get_common_shape(field_set): - """Takes a set of pystencils Fields and returns their common spatial shape if it exists. Otherwise - ValueError is raised""" +def get_common_field(field_set): + """Takes a set of pystencils Fields, checks if a common spatial shape exists and returns one + representative field, that can be used for shape information etc. in the kernel creation. + If the fields have different shapes ValueError is raised""" nr_of_fixed_shaped_fields = 0 for f in field_set: if f.has_fixed_shape: @@ -142,8 +143,9 @@ def get_common_shape(field_set): if len(shape_set) != 1: raise ValueError("Differently sized field accesses in loop body: " + str(shape_set)) - shape = list(sorted(shape_set, key=lambda e: str(e[0])))[0] - return shape + # Sort the fields by their name to ensure that always the same field is returned + reference_field = list(sorted(field_set, key=lambda e: str(e)))[0] + return reference_field def make_loop_over_domain(body, iteration_slice=None, ghost_layers=None, loop_order=None): @@ -178,13 +180,15 @@ def make_loop_over_domain(body, iteration_slice=None, ghost_layers=None, loop_or if absolut_accesses_only: absolut_access_fields = {e.field for e in body.atoms(Field.Access)} - shape = get_common_shape(absolut_access_fields) + common_field = get_common_field(absolut_access_fields) + common_shape = common_field.spatial_shape else: - shape = get_common_shape(fields) - unify_shape_symbols(body, common_shape=shape, fields=fields) + common_field = get_common_field(fields) + common_shape = common_field.spatial_shape + unify_shape_symbols(body, common_shape=common_shape, fields=fields) if iteration_slice is not None: - iteration_slice = normalize_slice(iteration_slice, shape) + iteration_slice = normalize_slice(iteration_slice, common_shape) if ghost_layers is None: if absolut_accesses_only: @@ -199,7 +203,7 @@ def make_loop_over_domain(body, iteration_slice=None, ghost_layers=None, loop_or for i, loop_coordinate in enumerate(reversed(loop_order)): if iteration_slice is None: begin = ghost_layers[loop_coordinate][0] - end = shape[loop_coordinate] - ghost_layers[loop_coordinate][1] + end = common_shape[loop_coordinate] - ghost_layers[loop_coordinate][1] new_loop = ast.LoopOverCoordinate(current_body, loop_coordinate, begin, end, 1) current_body = ast.Block([new_loop]) else: @@ -351,7 +355,7 @@ def get_base_buffer_index(ast_node, loop_counters=None, loop_iterations=None): ast_node: ast before any field accesses are resolved loop_counters: for CPU kernels: leave to default 'None' (can be determined from loop nodes) for GPU kernels: list of 'loop counters' from inner to outer loop - loop_iterations: number of iterations of each loop from inner to outer, for CPU kernels leave to default + loop_iterations: iteration slice for each loop from inner to outer, for CPU kernels leave to default Returns: base buffer index - required by 'resolve_buffer_accesses' function @@ -363,15 +367,14 @@ def get_base_buffer_index(ast_node, loop_counters=None, loop_iterations=None): assert len(loops) == len(parents_of_innermost_loop) assert all(l1 is l2 for l1, l2 in zip(loops, parents_of_innermost_loop)) - actual_sizes = [int_div((loop.stop - loop.start), loop.step) - if loop.step != 1 else loop.stop - loop.start for loop in loops] + loop_counters = [loop.loop_counter_symbol for loop in loops] + loop_iterations = [slice(loop.start, loop.stop, loop.step) for loop in loops] - actual_steps = [int_div((loop.loop_counter_symbol - loop.start), loop.step) - if loop.step != 1 else loop.loop_counter_symbol - loop.start for loop in loops] + actual_sizes = [int_div((s.stop - s.start), s.step) + if s.step != 1 else s.stop - s.start for s in loop_iterations] - else: - actual_sizes = loop_iterations - actual_steps = loop_counters + actual_steps = [int_div((ctr - s.start), s.step) + if s.step != 1 else ctr - s.start for ctr, s in zip(loop_counters, loop_iterations)] field_accesses = ast_node.atoms(Field.Access) buffer_accesses = {fa for fa in field_accesses if FieldType.is_buffer(fa.field)} diff --git a/pystencils_tests/test_buffer.py b/pystencils_tests/test_buffer.py index b8af6f53f0b25075d55a182a2a93231fc03b60f7..a21d68cc406c604b3566b46a0167a4ba356c59cc 100644 --- a/pystencils_tests/test_buffer.py +++ b/pystencils_tests/test_buffer.py @@ -22,7 +22,7 @@ def _generate_fields(dt=np.uint64, num_directions=1, layout='numpy'): field_layout = layout_string_to_tuple(layout, len(size)) src_arr = create_numpy_array_with_layout(size, field_layout, dtype=dt) - array_data = np.reshape(np.arange(1, int(np.prod(size)+1)), size) + array_data = np.reshape(np.arange(1, int(np.prod(size) + 1)), size) # Use flat iterator to input data into the array src_arr.flat = add_ghost_layers(array_data, index_dimensions=1 if num_directions > 1 else 0).astype(dt).flat dst_arr = np.zeros(src_arr.shape, dtype=dt) @@ -41,7 +41,8 @@ def test_full_scalar_field(): field_type=FieldType.BUFFER, dtype=src_arr.dtype) pack_eqs = [Assignment(buffer.center(), src_field.center())] - pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + pack_code = create_kernel(pack_eqs, config=config) code = ps.get_code_str(pack_code) ps.show_code(pack_code) @@ -49,7 +50,9 @@ def test_full_scalar_field(): pack_kernel(buffer=buffer_arr, src_field=src_arr) unpack_eqs = [Assignment(dst_field.center(), buffer.center())] - unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + + config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() unpack_kernel(dst_field=dst_arr, buffer=buffer_arr) @@ -73,14 +76,18 @@ def test_field_slice(): field_type=FieldType.BUFFER, dtype=src_arr.dtype) pack_eqs = [Assignment(buffer.center(), src_field.center())] - pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + + config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() pack_kernel(buffer=bufferArr, src_field=src_arr[pack_slice]) # Unpack into ghost layer of dst_field in N direction unpack_eqs = [Assignment(dst_field.center(), buffer.center())] - unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + + config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() unpack_kernel(buffer=bufferArr, dst_field=dst_arr[unpack_slice]) @@ -105,7 +112,8 @@ def test_all_cell_values(): eq = Assignment(buffer(idx), src_field(idx)) pack_eqs.append(eq) - pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() pack_kernel(buffer=bufferArr, src_field=src_arr) @@ -115,7 +123,8 @@ def test_all_cell_values(): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() unpack_kernel(buffer=bufferArr, dst_field=dst_arr) @@ -141,7 +150,8 @@ def test_subset_cell_values(): eq = Assignment(buffer(buffer_idx), src_field(cell_idx)) pack_eqs.append(eq) - pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() pack_kernel(buffer=bufferArr, src_field=src_arr) @@ -151,7 +161,8 @@ def test_subset_cell_values(): eq = Assignment(dst_field(cell_idx), buffer(buffer_idx)) unpack_eqs.append(eq) - unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() unpack_kernel(buffer=bufferArr, dst_field=dst_arr) @@ -176,7 +187,8 @@ def test_field_layouts(): eq = Assignment(buffer(idx), src_field(idx)) pack_eqs.append(eq) - pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() pack_kernel(buffer=bufferArr, src_field=src_arr) @@ -186,7 +198,8 @@ def test_field_layouts(): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() unpack_kernel(buffer=bufferArr, dst_field=dst_arr) @@ -202,7 +215,7 @@ def test_iteration_slices(): src_field = Field.create_generic("src_field", spatial_dimensions, index_shape=(num_cell_values,), dtype=dt) dst_field = Field.create_generic("dst_field", spatial_dimensions, index_shape=(num_cell_values,), dtype=dt) buffer = Field.create_generic("buffer", spatial_dimensions=1, index_dimensions=1, - field_type=FieldType.BUFFER, dtype=src_arr.dtype) + field_type=FieldType.BUFFER, dtype=src_arr.dtype) pack_eqs = [] # Since we are packing all cell values for all cells, then @@ -214,13 +227,16 @@ def test_iteration_slices(): dim = src_field.spatial_dimensions # Pack only the leftmost slice, only every second cell - pack_slice = (slice(None, None, 2),) * (dim-1) + (0, ) + pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,) # Fill the entire array with data src_arr[(slice(None, None, 1),) * dim] = np.arange(num_cell_values) dst_arr.fill(0) - pack_code = create_kernel(pack_eqs, iteration_slice=pack_slice, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(iteration_slice=pack_slice, + data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype}) + + pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() pack_kernel(buffer=bufferArr, src_field=src_arr) @@ -230,12 +246,14 @@ def test_iteration_slices(): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - unpack_code = create_kernel(unpack_eqs, iteration_slice=pack_slice, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + config = ps.CreateKernelConfig(iteration_slice=pack_slice, + data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype}) + + unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() unpack_kernel(buffer=bufferArr, dst_field=dst_arr) # Check if only every second entry of the leftmost slice has been copied np.testing.assert_equal(dst_arr[pack_slice], src_arr[pack_slice]) - np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim-1) + (0,)], 0) - np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim-1) + (slice(1,None),)], 0) - + np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim - 1) + (0,)], 0) + np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)], 0) diff --git a/pystencils_tests/test_buffer_gpu.py b/pystencils_tests/test_buffer_gpu.py index dd20acd5d6e4bb080980d7ddab83e38e820f7099..ae0a1548790fda79c50b6c6b6952868755009d44 100644 --- a/pystencils_tests/test_buffer_gpu.py +++ b/pystencils_tests/test_buffer_gpu.py @@ -274,3 +274,59 @@ def test_buffer_indexing(): assert s in src_field_size assert len(spatial_shape_symbols) <= 3 + + +def test_iteration_slices(): + num_cell_values = 19 + dt = np.uint64 + fields = _generate_fields(dt=dt, stencil_directions=num_cell_values) + for (src_arr, gpu_src_arr, gpu_dst_arr, gpu_buffer_arr) in fields: + src_field = Field.create_from_numpy_array("src_field", gpu_src_arr, index_dimensions=1) + dst_field = Field.create_from_numpy_array("dst_field", gpu_src_arr, index_dimensions=1) + buffer = Field.create_generic("buffer", spatial_dimensions=1, index_dimensions=1, + field_type=FieldType.BUFFER, dtype=src_arr.dtype) + + pack_eqs = [] + # Since we are packing all cell values for all cells, then + # the buffer index is equivalent to the field index + for idx in range(num_cell_values): + eq = Assignment(buffer(idx), src_field(idx)) + pack_eqs.append(eq) + + dim = src_field.spatial_dimensions + + # Pack only the leftmost slice, only every second cell + pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,) + + # Fill the entire array with data + src_arr[(slice(None, None, 1),) * dim] = np.arange(num_cell_values) + gpu_src_arr[(slice(None, None, 1),) * dim] = src_arr + gpu_dst_arr.fill(0) + + config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, + data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}) + + pack_code = create_kernel(pack_eqs, config=config) + pack_kernel = pack_code.compile() + pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr) + + unpack_eqs = [] + + for idx in range(num_cell_values): + eq = Assignment(dst_field(idx), buffer(idx)) + unpack_eqs.append(eq) + + config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice, + data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}) + + unpack_code = create_kernel(unpack_eqs, config=config) + unpack_kernel = unpack_code.compile() + unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr) + + dst_arr = gpu_dst_arr.get() + src_arr = gpu_src_arr.get() + + # Check if only every second entry of the leftmost slice has been copied + np.testing.assert_equal(dst_arr[pack_slice], src_arr[pack_slice]) + np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim - 1) + (0,)], 0) + np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)], 0)