Commit 7a1a4415 authored by Michael Kuron
Remove loadA/storeA from ISAs where it is the same as loadU/storeU

parent bb7a3cf2
1 merge request: !354 Remove loadA/storeA from ISAs where it is the same as loadU/storeU
Pipeline #55880 passed with stages in 57 minutes and 49 seconds
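The change relies on dictionary membership: backends whose aligned load/store would expand to the exact same intrinsic as the unaligned one simply stop defining the 'loadA'/'storeA' (and 'maskStoreA') keys, and consumers fall back to the unaligned entry. A minimal sketch of that lookup pattern; pick_store is a hypothetical helper and the template strings only approximate the real instruction-set dictionaries:

def pick_store(vector_is, assume_aligned):
    # Fall back to the unaligned template when the ISA defines no aligned variant.
    if assume_aligned and 'storeA' in vector_is:
        return vector_is['storeA']
    return vector_is['storeU']

neon_like = {'storeU': 'st1[0, 1]'}                              # aligned keys removed by this commit
x86_like = {'storeU': 'storeu[0, 1]', 'storeA': 'store[0, 1]'}   # illustrative ISA that keeps a separate aligned store

assert pick_store(neon_like, assume_aligned=True) == 'st1[0, 1]'
assert pick_store(x86_like, assume_aligned=True) == 'store[0, 1]'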
@@ -35,9 +35,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         'sqrt': 'sqrt[0]',
         'loadU': 'ld1[0]',
-        'loadA': 'ld1[0]',
         'storeU': 'st1[0, 1]',
-        'storeA': 'st1[0, 1]',
         'abs': 'abs[0]',
         '==': f'{cmp}eq[0, 1]',
@@ -123,7 +121,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
         result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
         result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
-        result['maskStoreA'] = result['storeA'].replace(predicate, '{2}')
         result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
         if instruction_set != 'sve':
......
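For the SVE backend, the masked variants are derived from the plain store templates by replacing the predicate placeholder with an explicit mask argument; with 'storeA' gone, only 'maskStoreU' (and 'maskStoreS') are still derived. A rough sketch of that .replace() mechanism with a made-up predicate string and template shape, not the verbatim SVE entries:

predicate = '{pred}'                               # stand-in for the SVE all-true predicate expression
result = {'storeU': f'st1[{predicate}, 0, 1]'}     # illustrative template shape only
result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
assert result['maskStoreU'] == 'st1[{2}, 0, 1]'    # argument slot 2 now carries the mask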
@@ -30,11 +30,8 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
         'sqrt': 'fsqrt_v[0]',
         'loadU': f'le{bits[data_type]}_v[0]',
-        'loadA': f'le{bits[data_type]}_v[0]',
         'storeU': f'se{bits[data_type]}_v[0, 1]',
-        'storeA': f'se{bits[data_type]}_v[0, 1]',
         'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
-        'maskStoreA': f'se{bits[data_type]}_v[2, 0, 1]',
         'loadS': f'lse{bits[data_type]}_v[0, 1]',
         'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
         'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]',
......
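The RISC-V entries above are built from the element width. Assuming the usual bits mapping defined earlier in that file (double maps to 64, float to 32), the templates expand as follows; this is a quick standalone check, not code from the repository:

bits = {'double': 64, 'float': 32}        # assumed to match the mapping defined earlier in the file
data_type = 'double'
assert f'le{bits[data_type]}_v[0]' == 'le64_v[0]'                # unit-stride load (loadU)
assert f'sse{bits[data_type]}_v[0, 2, 1]' == 'sse64_v[0, 2, 1]'  # strided store (storeS)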
@@ -130,7 +130,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
     if nontemporal and 'cachelineZero' in vector_is:
         kernel_ast.use_all_written_field_sizes = True
     strided = 'storeS' in vector_is and 'loadS' in vector_is
-    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU']
+    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
     vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
                                                 strided, keep_loop_stop, assume_sufficient_line_padding,
                                                 default_float_type)
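The extra 'storeA' in vector_is guard keeps this conditional expression from indexing a key that no longer exists. A tiny standalone check with an illustrative instruction set that, like NEON or RVV after this commit, only defines 'storeU':

vector_is = {'storeU': 'st1[0, 1]'}       # no 'storeA' key anymore
assume_aligned = True
key = 'storeA' if assume_aligned and 'storeA' in vector_is else 'storeU'
keep_loop_stop = '{loop_stop}' in vector_is[key]
assert key == 'storeU' and keep_loop_stop is False   # without the guard, vector_is['storeA'] would raise KeyError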
@@ -144,13 +144,10 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
     inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
     zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
-    assert ast_node.instruction_set,\
-        "The ast needs to hold information about the instruction_set for the vectorisation"
-    vector_width = ast_node.instruction_set['width']
-    vector_int_width = ast_node.instruction_set['intwidth']
-    load_a = ast_node.instruction_set['loadA']
-    load_u = ast_node.instruction_set['loadU']
+    vector_is = ast_node.instruction_set
+    assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
+    vector_width = vector_is['width']
+    vector_int_width = vector_is['intwidth']
     for loop_node in inner_loops:
         loop_range = loop_node.stop - loop_node.start
@@ -180,16 +177,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
         for indexed in loop_node.atoms(sp.Indexed):
             base, index = indexed.args
             if loop_counter_symbol in index.atoms(sp.Symbol):
-                if not isinstance(vector_width, int) or load_a == load_u:
-                    # When the vector width is not known during code generation, we cannot determine whether
-                    # the access is aligned or not. None of the current sizeless vector ISAs (SVE and RISC-V-V)
-                    # have separate load/store instructions for aligned and unaligned, so there is no disadvantage
-                    # to falling back to unaligned here. When new ISAs become available, this may need to be revisited.
-                    # On sized vector ISAs that do not have separate instructions for aligned and unaligned access,
-                    # alignment does not matter here either
+                if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
+                    # don't need to generate the alignment check when there are no aligned load/store instructions
                     aligned_access = False
                 else:
+                    if not isinstance(vector_width, int):
+                        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
+                                                  'vector ISAs')
                     aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
                 loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
                 stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
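Pulled out of its context, the new decision reads: if the ISA has no aligned access instructions at all, skip the alignment bookkeeping entirely; otherwise the vector width must be a compile-time integer for the modulo test to be evaluable. A self-contained sketch of that logic (simplified: the offset is a plain integer here, whereas pystencils works on a SymPy index expression):

def decide_aligned(vector_is, vector_width, offset_at_loop_start):
    if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
        return False    # no aligned load/store instructions, so no alignment check is generated
    if not isinstance(vector_width, int):
        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
                                  'vector ISAs')
    return offset_at_loop_start % vector_width == 0

assert decide_aligned({'loadU': 'ld1[0]'}, vector_width='svcntd()', offset_at_loop_start=3) is False
assert decide_aligned({'loadA': 'load[0]', 'storeA': 'store[0, 1]'}, vector_width=4, offset_at_loop_start=8) is True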
@@ -238,7 +232,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
             substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
             rng._symbols_defined = set(new_result_symbols)
         fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
-        insert_vector_casts(loop_node, ast_node.instruction_set, default_float_type)
+        insert_vector_casts(loop_node, vector_is, default_float_type)
 
 
 def mask_conditionals(loop_body):
......
@@ -119,14 +119,11 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
                                    cpu_vectorize_info=opt, ghost_layers=gl_kernel)
     ast = ps.create_kernel(update_rule, config=config)
     kernel = ast.compile()
-    if ast.instruction_set['loadA'] == ast.instruction_set['loadU']:
-        dh.run_kernel(kernel)
-    else:
-        if gl_kernel != gl_field:
-            with pytest.raises(ValueError):
-                dh.run_kernel(kernel)
-        else:
-            dh.run_kernel(kernel)
+    if ('loadA' in ast.instruction_set or 'storeA' in ast.instruction_set) and gl_kernel != gl_field:
+        with pytest.raises(ValueError):
+            dh.run_kernel(kernel)
+    else:
+        dh.run_kernel(kernel)
 
 
 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
......