diff --git a/pystencils/cpu/kernelcreation.py b/pystencils/cpu/kernelcreation.py index e58641e021d485f34098bfc920ad43beda10e589..c99332ff0f5f4b181ec6f46ec0cf404c5d85359f 100644 --- a/pystencils/cpu/kernelcreation.py +++ b/pystencils/cpu/kernelcreation.py @@ -76,8 +76,6 @@ def create_kernel(assignments: NodeCollection, base_pointer_spec = config.base_pointer_specification if base_pointer_spec is None: base_pointer_spec = [] - if config.cpu_vectorize_info and config.cpu_vectorize_info.get('nontemporal'): - base_pointer_spec = [['spatialInner0'], ['spatialInner1']] if len(loop_order) >= 2 else [['spatialInner0']] base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order, field.spatial_dimensions, field.index_dimensions) for field in fields_without_buffers} diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index 67cb91221d3212cacb6328bd48291a2bb75a6d24..f39c52d81475f1deba8eeefa98e256f954755b54 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -140,12 +140,18 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem strided, keep_loop_stop, assume_sufficient_line_padding, default_float_type): """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type.""" - vector_width = ast_node.instruction_set['width'] - - all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment) + all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)) inner_loops = [loop for loop in all_loops if loop.is_innermost_loop] zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops} + assert ast_node.instruction_set,\ + "The ast needs to hold information about the instruction_set for the vectorisation" + vector_width = ast_node.instruction_set['width'] + vector_int_width = ast_node.instruction_set['intwidth'] + + load_a = ast_node.instruction_set['loadA'] + load_u = ast_node.instruction_set['loadU'] + for loop_node in inner_loops: loop_range = loop_node.stop - loop_node.start @@ -174,8 +180,18 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem for indexed in loop_node.atoms(sp.Indexed): base, index = indexed.args if loop_counter_symbol in index.atoms(sp.Symbol): + if not isinstance(vector_width, int) or load_a == load_u: + # When the vector width is not known during code generation, we cannot determine whether + # the access is aligned or not. None of the current sizeless vector ISAs (SVE and RISC-V-V) + # have separate load/store instructions for aligned and unaligned, so there is no disadvantage + # to falling back to unaligned here. When new ISAs become available, this may need to be revisited. + + # On sized vector ISAs that do not have separate instructions for aligned and unaligned access, + # alignment does not matter here either + aligned_access = False + else: + aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0 loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms() - aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0 stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index) if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()): successful = False @@ -204,7 +220,6 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem loop_node.step = vector_width loop_node.subs(substitutions) - vector_int_width = ast_node.instruction_set['intwidth'] arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width)) arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)), VectorType(loop_counter_symbol.dtype, vector_int_width)) diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index 46e13c2d7f59bfaa9fa50f5e3d8632da3c1a25ac..db49657557b9eea5fd6e05f51f69f2032ffd4e3a 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -119,11 +119,14 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set cpu_vectorize_info=opt, ghost_layers=gl_kernel) ast = ps.create_kernel(update_rule, config=config) kernel = ast.compile() - if gl_kernel != gl_field: - with pytest.raises(ValueError): - dh.run_kernel(kernel) - else: + if ast.instruction_set['loadA'] == ast.instruction_set['loadU']: dh.run_kernel(kernel) + else: + if gl_kernel != gl_field: + with pytest.raises(ValueError): + dh.run_kernel(kernel) + else: + dh.run_kernel(kernel) @pytest.mark.parametrize('instruction_set', supported_instruction_sets)