diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index 85c6033c513bb42646c76168eb13abc51f9b37ce..58a438717931539a07ffe066ff7f5f8421b1f14a 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -404,7 +404,7 @@ class PragmaBlock(Block): class LoopOverCoordinate(Node): LOOP_COUNTER_NAME_PREFIX = "ctr" - BlOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr" + BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr" def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False): super(LoopOverCoordinate, self).__init__(parent=None) @@ -479,7 +479,7 @@ class LoopOverCoordinate(Node): @staticmethod def get_block_loop_counter_name(coordinate_to_loop_over): - return f"{LoopOverCoordinate.BlOCK_LOOP_COUNTER_NAME_PREFIX}_{coordinate_to_loop_over}" + return f"{LoopOverCoordinate.BLOCK_LOOP_COUNTER_NAME_PREFIX}_{coordinate_to_loop_over}" @property def loop_counter_name(self): diff --git a/pystencils/transformations.py b/pystencils/transformations.py index 5e306f2de168994575562a92156408f128ab447c..100d0c20a2702dabbdb874fda1cc6d0022cfd7ec 100644 --- a/pystencils/transformations.py +++ b/pystencils/transformations.py @@ -1258,7 +1258,8 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: Args: ast_node: kernel function node before vectorization transformation has been applied - block_size: sequence defining block size in x, y, (z) direction + block_size: sequence defining block size in x, y, (z) direction. + If an entry is zero, that direction will not be used for blocking. 
Returns: number of dimensions blocked @@ -1270,8 +1271,10 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: body = ast_node.body coordinates = [] + coordinates_taken_into_account = 0 loop_starts = {} loop_stops = {} + for loop in loops: coord = loop.coordinate_to_loop_over if coord not in coordinates: @@ -1285,6 +1288,9 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: # Create the outer loops that iterate over the blocks outer_loop = None for coord in reversed(coordinates): + if block_size[coord] == 0: + continue + coordinates_taken_into_account += 1 body = ast.Block([outer_loop]) if outer_loop else body outer_loop = ast.LoopOverCoordinate(body, coord, @@ -1298,6 +1304,8 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: # modify the existing loops to only iterate within one block for inner_loop in loops: coord = inner_loop.coordinate_to_loop_over + if block_size[coord] == 0: + continue block_ctr = ast.LoopOverCoordinate.get_block_loop_counter_symbol(coord) loop_range = inner_loop.stop - inner_loop.start if sp.sympify( @@ -1307,7 +1315,7 @@ def loop_blocking(ast_node: ast.KernelFunction, block_size) -> int: stop = sp.Min(inner_loop.stop, block_ctr + block_size[coord]) inner_loop.start = block_ctr inner_loop.stop = stop - return len(coordinates) + return coordinates_taken_into_account def implement_interpolations(ast_node: ast.Node, diff --git a/pystencils_tests/test_blocking.py b/pystencils_tests/test_blocking.py index b2b815b1a317d9f3228e9cc780c66336f378f782..1c564627152fe0f68a05819dec5cdcdc0bf19330 100644 --- a/pystencils_tests/test_blocking.py +++ b/pystencils_tests/test_blocking.py @@ -18,14 +18,20 @@ def check_equivalence(assignments, src_arr): for vectorization in [False, {'assume_inner_stride_one': True}]: with_blocking = ps.create_kernel(assignments, cpu_blocking=(8, 16, 4), cpu_openmp=openmp, cpu_vectorize_info=vectorization).compile() + with_blocking_only_over_y = ps.create_kernel(assignments, 
cpu_blocking=(0, 16, 0), cpu_openmp=openmp, + cpu_vectorize_info=vectorization).compile() without_blocking = ps.create_kernel(assignments).compile() + print(f" openmp {openmp}, vectorization {vectorization}") dst_arr = np.zeros_like(src_arr) + dst2_arr = np.zeros_like(src_arr) ref_arr = np.zeros_like(src_arr) np.copyto(src_arr, np.random.rand(*src_arr.shape)) with_blocking(src=src_arr, dst=dst_arr) + with_blocking_only_over_y(src=src_arr, dst=dst2_arr) without_blocking(src=src_arr, dst=ref_arr) np.testing.assert_almost_equal(ref_arr, dst_arr) + np.testing.assert_almost_equal(ref_arr, dst2_arr) def test_jacobi3d_var_size():