Skip to content
Snippets Groups Projects
Commit bb7a3cf2 authored by Michael Kuron's avatar Michael Kuron :mortar_board:
Browse files

Merge branch 'FixAlign' into 'master'

[FIX] Alignment detection

See merge request !351
parents 5d1ce2a5 539b6be3
Branches
Tags
1 merge request !351 [FIX] Alignment detection
Pipeline #55843 passed with stages
in 1 hour, 3 minutes, and 29 seconds
...@@ -76,8 +76,6 @@ def create_kernel(assignments: NodeCollection, ...@@ -76,8 +76,6 @@ def create_kernel(assignments: NodeCollection,
base_pointer_spec = config.base_pointer_specification base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None: if base_pointer_spec is None:
base_pointer_spec = [] base_pointer_spec = []
if config.cpu_vectorize_info and config.cpu_vectorize_info.get('nontemporal'):
base_pointer_spec = [['spatialInner0'], ['spatialInner1']] if len(loop_order) >= 2 else [['spatialInner0']]
base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order, base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order,
field.spatial_dimensions, field.index_dimensions) field.spatial_dimensions, field.index_dimensions)
for field in fields_without_buffers} for field in fields_without_buffers}
......
...@@ -140,12 +140,18 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem ...@@ -140,12 +140,18 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
strided, keep_loop_stop, assume_sufficient_line_padding, strided, keep_loop_stop, assume_sufficient_line_padding,
default_float_type): default_float_type):
"""Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type.""" """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
vector_width = ast_node.instruction_set['width'] all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
inner_loops = [loop for loop in all_loops if loop.is_innermost_loop] inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops} zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
assert ast_node.instruction_set,\
"The ast needs to hold information about the instruction_set for the vectorisation"
vector_width = ast_node.instruction_set['width']
vector_int_width = ast_node.instruction_set['intwidth']
load_a = ast_node.instruction_set['loadA']
load_u = ast_node.instruction_set['loadU']
for loop_node in inner_loops: for loop_node in inner_loops:
loop_range = loop_node.stop - loop_node.start loop_range = loop_node.stop - loop_node.start
...@@ -174,8 +180,18 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem ...@@ -174,8 +180,18 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
for indexed in loop_node.atoms(sp.Indexed): for indexed in loop_node.atoms(sp.Indexed):
base, index = indexed.args base, index = indexed.args
if loop_counter_symbol in index.atoms(sp.Symbol): if loop_counter_symbol in index.atoms(sp.Symbol):
if not isinstance(vector_width, int) or load_a == load_u:
# When the vector width is not known during code generation, we cannot determine whether
# the access is aligned or not. None of the current sizeless vector ISAs (SVE and RISC-V-V)
# have separate load/store instructions for aligned and unaligned, so there is no disadvantage
# to falling back to unaligned here. When new ISAs become available, this may need to be revisited.
# On sized vector ISAs that do not have separate instructions for aligned and unaligned access,
# alignment does not matter here either
aligned_access = False
else:
aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms() loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0
stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index) stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()): if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
successful = False successful = False
...@@ -204,7 +220,6 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem ...@@ -204,7 +220,6 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
loop_node.step = vector_width loop_node.step = vector_width
loop_node.subs(substitutions) loop_node.subs(substitutions)
vector_int_width = ast_node.instruction_set['intwidth']
arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width)) arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)), arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
VectorType(loop_counter_symbol.dtype, vector_int_width)) VectorType(loop_counter_symbol.dtype, vector_int_width))
......
...@@ -119,11 +119,14 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set ...@@ -119,11 +119,14 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
cpu_vectorize_info=opt, ghost_layers=gl_kernel) cpu_vectorize_info=opt, ghost_layers=gl_kernel)
ast = ps.create_kernel(update_rule, config=config) ast = ps.create_kernel(update_rule, config=config)
kernel = ast.compile() kernel = ast.compile()
if gl_kernel != gl_field: if ast.instruction_set['loadA'] == ast.instruction_set['loadU']:
with pytest.raises(ValueError):
dh.run_kernel(kernel)
else:
dh.run_kernel(kernel) dh.run_kernel(kernel)
else:
if gl_kernel != gl_field:
with pytest.raises(ValueError):
dh.run_kernel(kernel)
else:
dh.run_kernel(kernel)
@pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment