diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index 7996d6d3f87f980f8ba348a618712291618e1816..4d609a114e22b450662ed4787bbc9e41b8ad2c12 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -125,21 +125,22 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', # TODO: future work allow mixed precision fields default_float_type = 'double' if float_size == 8 else 'float' vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set) - vector_width = vector_is['width'] kernel_ast.instruction_set = vector_is strided = 'storeS' in vector_is and 'loadS' in vector_is keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU'] - vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal, + vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal, strided, keep_loop_stop, assume_sufficient_line_padding, default_float_type) # is in vectorize_inner_loops_and_adapt_load_stores.. insert_vector_casts(kernel_ast, default_float_type) -def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields, +def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields, strided, keep_loop_stop, assume_sufficient_line_padding, default_float_type): """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type.""" + vector_width = ast_node.instruction_set['width'] + all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment) inner_loops = [n for n in all_loops if n.is_innermost_loop] zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops} @@ -219,7 +220,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)}) rng._symbols_defined = set(new_result_symbols) fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase)) - insert_vector_casts(loop_node, default_float_type, vector_width) + insert_vector_casts(loop_node, ast_node.instruction_set, default_float_type) def mask_conditionals(loop_body): @@ -248,7 +249,7 @@ def mask_conditionals(loop_body): visit_node(loop_body, mask=True) -def insert_vector_casts(ast_node, default_float_type='double', vector_width=4): +def insert_vector_casts(ast_node, instruction_set, default_float_type='double'): """Inserts necessary casts from scalar values to vector values.""" handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt, vec_any, vec_all, DivFunc, @@ -262,8 +263,8 @@ def insert_vector_casts(ast_node, default_float_type='double', vector_width=4): arg = visit_expr(expr.args[0]) assert cast_type in [BasicType('float32'), BasicType('float64')],\ f'Vectorization cannot vectorize type {cast_type}' - return expr.func(arg, VectorType(cast_type, vector_width)) - elif expr.func is sp.Abs and 'abs' not in ast_node.instruction_set: + return expr.func(arg, VectorType(cast_type, instruction_set['width'])) + elif expr.func is sp.Abs and 'abs' not in instruction_set: new_arg = visit_expr(expr.args[0], default_type) base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \ else get_type_of_expression(expr.args[0]) diff --git a/pystencils/typing/leaf_typing.py b/pystencils/typing/leaf_typing.py index aa23de65d8ba9329cedb36acf047ac81e5f414f1..6ccd864e347611d50fab3a45ee7848da4dd3817b 100644 --- a/pystencils/typing/leaf_typing.py +++ b/pystencils/typing/leaf_typing.py @@ -16,6 +16,7 @@ from sympy.logic.boolalg import BooleanAtom from pystencils import astnodes as ast from pystencils.functions import DivFunc, AddressOf +from pystencils.cpu.vectorization import vec_all, vec_any from pystencils.field import Field from pystencils.typing.types import BasicType, create_type, PointerType from pystencils.typing.utilities import get_type_of_expression, collate_types @@ -170,6 +171,8 @@ class TypeAdder: new_value = value if value_type == collated_type else CastFunc(value, collated_type) return expr.func(new_access, condition, new_value), collated_type + elif isinstance(expr, (vec_any, vec_all)): + return expr, bool_type elif isinstance(expr, BooleanFunction): args_types = [self.figure_out_type(a) for a in expr.args] new_args = [a if t.dtype_eq(bool_type) else BooleanCastFunc(a, bool_type) for a, t in args_types] diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py index 1a962d00f8cb92c5f2bf6619307ce17777190c4b..2d8193a467c99b4a1825af99584aedb32f29ee06 100644 --- a/pystencils_tests/test_conditional_vec.py +++ b/pystencils_tests/test_conditional_vec.py @@ -3,10 +3,11 @@ import sympy as sp import pytest import pystencils as ps -from pystencils.astnodes import Block, Conditional +from pystencils.astnodes import Block, Conditional, SympyAssignment from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.enums import Target from pystencils.cpu.vectorization import vec_all, vec_any +from pystencils.node_collection import NodeCollection supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else [] @@ -24,12 +25,12 @@ def test_vec_any(instruction_set, dtype): data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [ - ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)), - Conditional(vec_any(data.center() > 0.0), Block([ - ps.Assignment(data.center(), 2.0) - ])) + SympyAssignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)), + Conditional(vec_any(data.center() > 0.0), Block([SympyAssignment(data.center(), 2.0)])) ] - ast = ps.create_kernel(c, target=ps.Target.CPU, + + assignmets = NodeCollection(c) + ast = ps.create_kernel(assignments=assignmets, target=ps.Target.CPU, cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) @@ -52,12 +53,9 @@ def test_vec_all(instruction_set, dtype): data_arr[3:9, 1:3 * width - 1] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) - c = [ - Conditional(vec_all(data.center() > 0.0), Block([ - ps.Assignment(data.center(), 2.0) - ])) - ] - ast = ps.create_kernel(c, target=Target.CPU, + c = [Conditional(vec_all(data.center() > 0.0), Block([SympyAssignment(data.center(), 2.0)]))] + assignmets = NodeCollection(c) + ast = ps.create_kernel(assignmets, target=Target.CPU, cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) @@ -101,12 +99,10 @@ def test_vec_maskstore(instruction_set, dtype): data_arr[3:-3, 3:-3] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) - c = [ - Conditional(data.center() < 1.0, Block([ - ps.Assignment(data.center(), 2.0) - ])) - ] - ast = ps.create_kernel(c, target=Target.CPU, + c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))] + + assignmets = NodeCollection(c) + ast = ps.create_kernel(assignmets, target=Target.CPU, cpu_vectorize_info={'instruction_set': instruction_set}) kernel = ast.compile() kernel(data=data_arr) diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 55070e5475ce59118f41d104c2db9caa28c029b3..8b685c28ae439352ae2860d36694d48dd68e4aa1 100644 --- a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -15,20 +15,10 @@ from pystencils.transformations import replace_inner_stride_with_one supported_instruction_sets = get_supported_instruction_sets() if supported_instruction_sets: instruction_set = supported_instruction_sets[-1] - instructions = get_vector_instruction_set(instruction_set=instruction_set) else: instruction_set = None -# CI: -# FAILED pystencils_tests/test_vectorization.py::test_vectorised_pow - NotImple... -# FAILED pystencils_tests/test_vectorization.py::test_inplace_update - NotImple... -# FAILED pystencils_tests/test_vectorization.py::test_vectorised_fast_approximations -# test_issue40 - -# Jan: -# test_vectorised_pow -# test_issue40 # TODO: Skip tests if no instruction set is available and check all codes if they are really vectorised ! def test_vector_type_propagation(instruction_set=instruction_set): @@ -113,6 +103,7 @@ def test_inplace_update(instruction_set=instruction_set): def test_vectorization_fixed_size(instruction_set=instruction_set): + instructions = get_vector_instruction_set(instruction_set=instruction_set) configurations = [] # Fixed size - multiple of four arr = np.ones((20 + 2, 24 + 2)) * 5.0 @@ -135,7 +126,7 @@ def test_vectorization_fixed_size(instruction_set=instruction_set): code = ps.get_code_str(ast) add_instruction = instructions["+"][:instructions["+"].find("(")] assert add_instruction in code - print(code) + # print(code) func = ast.compile() dst = np.zeros_like(arr) @@ -264,6 +255,7 @@ def test_vectorised_pow(instruction_set=instruction_set): ast = ps.create_kernel(as1) vectorize(ast, instruction_set=instruction_set) + print(ast) ast.compile() ast = ps.create_kernel(as2) diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index 13bb412f0ba031599a124500f485eafb8386c1ef..3a56970667a445158c78af1ecc136cc686bc1c59 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -124,7 +124,7 @@ def test_cacheline_size(instruction_set): assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2" -# test_vectorization is not parametrized because it is supposed to run without pytest, so we parametrize it here +# TODO move to vectorise @pytest.mark.parametrize('instruction_set', sorted(set(supported_instruction_sets) - {test_vectorization.instruction_set})) @pytest.mark.parametrize('function',