diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index ef0bcc6d758fdb67fc42b708038882360a9eee65..f3ed2711c844c0bbd8c75100e9a7d1a2d506b7b7 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -448,7 +448,7 @@ class LoopOverCoordinate(Node): def new_loop_with_different_body(self, new_body): result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, self.step, self.is_block_loop) - result.prefix_lines = [l for l in self.prefix_lines] + result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines] return result def subs(self, subs_dict): diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 486bd126025612914804a1011aaecc10e2912f59..de9cb0d31b3be9903ac758991a0fca345f17a4a7 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -444,7 +444,8 @@ class CustomSympyPrinter(CCodePrinter): def _print_Pow(self, expr): """Don't use std::pow function, for small integer exponents, write as multiplication""" if isinstance(expr.exp, sp.Integer) and (-8 < expr.exp < 8): - raise NotImplementedError("This pow should be simplified already?") + raise ValueError(f"This expression: {expr} contains a pow function that should be simplified already with " + f"a sequence of multiplications") return super(CustomSympyPrinter, self)._print_Pow(expr) # TODO don't print ones in sp.Mul @@ -767,7 +768,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): return processed def _print_Pow(self, expr): - result = self._scalarFallback('_print_Pow', expr) + try: + result = self._scalarFallback('_print_Pow', expr) + except ValueError: + result = None if result: return result @@ -799,7 +803,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): # noinspection PyProtectedMember from sympy.core.mul import _keep_coeff - result = self._scalarFallback('_print_Mul', expr) + if not inside_add: + result = self._scalarFallback('_print_Mul', expr) + else: + result = None if result: return result diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index b3236a3c5cab7925116431ae343e20ca0fea0f1f..3141eb4009579d4028a1a70a488a78f051519cdb 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -9,7 +9,6 @@ import pystencils.astnodes as ast from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types, get_type_of_expression, VectorMemoryAccess) -from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt from pystencils.functions import DivFunc from pystencils.field import Field from pystencils.integer_functions import modulo_ceil, modulo_floor @@ -133,7 +132,6 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal, strided, keep_loop_stop, assume_sufficient_line_padding, default_float_type) - # is in vectorize_inner_loops_and_adapt_load_stores.. insert_vector_casts(kernel_ast, default_float_type) def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields, @@ -143,8 +141,8 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem vector_width = ast_node.instruction_set['width'] all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment) - inner_loops = [n for n in all_loops if n.is_innermost_loop] - zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops} + inner_loops = [loop for loop in all_loops if loop.is_innermost_loop] + zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops} for loop_node in inner_loops: loop_range = loop_node.stop - loop_node.start @@ -158,7 +156,8 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem loop_node.stop = new_stop else: cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start - loop_nodes = [l for l in cut_loop(loop_node, [cutting_point]).args if isinstance(l, ast.LoopOverCoordinate)] + loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args + if isinstance(loop, ast.LoopOverCoordinate)] assert len(loop_nodes) in (0, 1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width if len(loop_nodes) == 0: continue @@ -179,8 +178,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem successful = False break typed_symbol = base.label - assert type(typed_symbol.dtype) is PointerType, \ - f"Type of access is {typed_symbol.dtype}, {indexed}" + assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}" vec_type = VectorType(typed_symbol.dtype.base_type, vector_width) use_aligned_access = aligned_access and assume_aligned @@ -254,8 +252,7 @@ def mask_conditionals(loop_body): def insert_vector_casts(ast_node, instruction_set, default_float_type='double'): """Inserts necessary casts from scalar values to vector values.""" - handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt, vec_any, vec_all, DivFunc, - sp.UnevaluatedExpr, sp.Abs) + handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.UnevaluatedExpr, sp.Abs) def visit_expr(expr, default_type='double'): # TODO Vectorization Revamp: get rid of default_type if isinstance(expr, VectorMemoryAccess): diff --git a/pystencils/transformations.py b/pystencils/transformations.py index 43beefd25f79eebe8b78c40d46186e69ae195d64..06005cbf709fdfeb74a14dc141f0256aee6039cb 100644 --- a/pystencils/transformations.py +++ b/pystencils/transformations.py @@ -13,6 +13,7 @@ from pystencils.typing import (CastFunc, PointerType, StructType, TypedSymbol, g ReinterpretCastFunc, get_next_parent_of_type, parents_of_type) from pystencils.field import Field, FieldType from pystencils.typing import FieldPointerSymbol +from pystencils.sympyextensions import fast_subs from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.slicing import normalize_slice from pystencils.integer_functions import int_div @@ -650,11 +651,11 @@ def split_inner_loop(ast_node: ast.Node, symbol_groups): and which no symbol in a symbol group depends on, are not updated! """ all_loops = ast_node.atoms(ast.LoopOverCoordinate) - inner_loop = [l for l in all_loops if l.is_innermost_loop] + inner_loop = [loop for loop in all_loops if loop.is_innermost_loop] assert len(inner_loop) == 1, "Error in AST: multiple innermost loops. Was split transformation already called?" inner_loop = inner_loop[0] assert type(inner_loop.body) is ast.Block - outer_loop = [l for l in all_loops if l.is_outermost_loop] + outer_loop = [loop for loop in all_loops if loop.is_outermost_loop] assert len(outer_loop) == 1, "Error in AST, multiple outermost loops." outer_loop = outer_loop[0] @@ -688,8 +689,8 @@ def split_inner_loop(ast_node: ast.Node, symbol_groups): assignment_group = [] for assignment in inner_loop.body.args: if assignment.lhs in symbols_resolved: - new_rhs = assignment.rhs.subs( - symbols_with_temporary_array.items()) + # use fast_subs here because it checks if multiplications should be evaluated or not + new_rhs = fast_subs(assignment.rhs, symbols_with_temporary_array) if not isinstance(assignment.lhs, Field.Access) and assignment.lhs in symbol_group: assert type(assignment.lhs) is TypedSymbol new_ts = TypedSymbol(assignment.lhs.name, PointerType(assignment.lhs.dtype)) diff --git a/pystencils_tests/test_transformations.py b/pystencils_tests/test_transformations.py index 3ede70a85cac1ad1b10c46a90dab62390002f8a2..d6e6888b5027a4d403b1158d1204be97cc35455f 100644 --- a/pystencils_tests/test_transformations.py +++ b/pystencils_tests/test_transformations.py @@ -1,5 +1,7 @@ +import sympy as sp + import pystencils as ps -from pystencils import TypedSymbol +from pystencils import fields, TypedSymbol from pystencils.astnodes import LoopOverCoordinate, SympyAssignment from pystencils.typing import create_type from pystencils.transformations import filtered_tree_iteration, get_loop_hierarchy, get_loop_counter_symbol_hierarchy @@ -10,8 +12,8 @@ def test_loop_information(): update_rule = ps.Assignment(g[0, 0], f[0, 0]) ast = ps.create_kernel(update_rule) - inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment) - if l.is_innermost_loop] + inner_loops = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment) + if loop.is_innermost_loop] loop_order = [] for i in get_loop_hierarchy(inner_loops[0].args[0]): @@ -23,3 +25,58 @@ def test_loop_information(): assert loop_symbols == [TypedSymbol("ctr_1", create_type("int"), nonnegative=True), TypedSymbol("ctr_0", create_type("int"), nonnegative=True)] + + +def test_split_optimisation(): + src, dst = fields(f"src(9), dst(9): [2D]", layout='fzyx') + + stencil = ((0, 0), (0, 1), (0, -1), (-1, 0), (1, 0), (-1, 1), (1, 1), (-1, -1), (1, -1)) + w = [sp.Rational(4, 9)] + w += [sp.Rational(1, 9)] * 4 + w += [sp.Rational(1, 36)] * 4 + cs = sp.Rational(1, 3) + + subexpressions = [] + main_assignements = [] + + rho = sp.symbols("rho") + velo = sp.symbols("u_:2") + + density = 0 + velocity_x = 0 + velocity_y = 0 + for d in stencil: + density += src[d] + velocity_x += d[0] * src[d] + velocity_y += d[1] * src[d] + + subexpressions.append(ps.Assignment(rho, density)) + subexpressions.append(ps.Assignment(velo[0], velocity_x)) + subexpressions.append(ps.Assignment(velo[1], velocity_y)) + + for i, d in enumerate(stencil): + u_d = velo[0] * d[0] + velo[1] * d[1] + u_2 = velo[0] * velo[0] + velo[1] * velo[1] + + expr = w[i] * rho * (1 + u_d / cs + u_d ** 2 / (2 * cs ** 2) - u_2 / (2 * cs)) + + main_assignements.append(ps.Assignment(dst.center_vector[i], expr)) + + ac = ps.AssignmentCollection(main_assignments=main_assignements, subexpressions=subexpressions) + + simplification_hint = {'density': rho, + 'velocity': (velo[0], velo[1]), + 'split_groups': [[rho, velo[0], velo[1], dst.center_vector[0]], + [dst.center_vector[1], dst.center_vector[2]], + [dst.center_vector[3], dst.center_vector[4]], + [dst.center_vector[5], dst.center_vector[6]], + [dst.center_vector[7], dst.center_vector[8]]]} + + ac.simplification_hints = simplification_hint + ast = ps.create_kernel(ac) + + code = ps.get_code_str(ast) + # after the split optimisation the two for loops are split into 6 + assert code.count("for") == 6 + + print(code) diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 19f266b12b55ce66f971e0634e11f7178c6f70bc..f526341ec587b508f6c28f8dd2596125366f8ecc 100644 --- a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -8,7 +8,6 @@ import sympy as sp import pystencils as ps from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.cpu.vectorization import vectorize -from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions from pystencils.enums import Target from pystencils.transformations import replace_inner_stride_with_one @@ -19,7 +18,6 @@ else: instruction_set = None - # TODO: Skip tests if no instruction set is available and check all codes if they are really vectorised ! def test_vector_type_propagation(instruction_set=instruction_set): a, b, c, d, e = sp.symbols("a b c d e") @@ -276,35 +274,6 @@ def test_vectorised_pow(instruction_set=instruction_set): ast.compile() -def test_vectorised_fast_approximations(instruction_set=instruction_set): - # fast_approximations are a gpu thing - arr = np.zeros((24, 24)) - f, g = ps.fields(f=arr, g=arr) - - expr = sp.sqrt(f[0, 0] + f[1, 0]) - assignment = ps.Assignment(g[0, 0], insert_fast_sqrts(expr)) - ast = ps.create_kernel(assignment) - vectorize(ast, instruction_set=instruction_set) - - with pytest.raises(Exception): - ast.compile() - - expr = f[0, 0] / f[1, 0] - assignment = ps.Assignment(g[0, 0], insert_fast_divisions(expr)) - ast = ps.create_kernel(assignment) - vectorize(ast, instruction_set=instruction_set) - - with pytest.raises(Exception): - ast.compile() - - assignment = ps.Assignment(sp.Symbol("tmp"), 3 / sp.sqrt(f[0, 0] + f[1, 0])) - ast = ps.create_kernel(insert_fast_sqrts(assignment)) - vectorize(ast, instruction_set=instruction_set) - - with pytest.raises(Exception): - ast.compile() - - def test_issue40(*_): """https://i10git.cs.fau.de/pycodegen/pystencils/-/issues/40""" opt = {'instruction_set': "avx512", 'assume_aligned': False, diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index 367250dda361c2d08c3f614741c8566b545423ae..49152a42021405677e493a9a365a51e00285b5ab 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -120,7 +120,8 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set update_rule = ps.Assignment(dst[0, 0], src[0, 0]) opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True} - config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, ghost_layers=gl_kernel) + config = pystencils.config.CreateKernelConfig(target=dh.default_target, + cpu_vectorize_info=opt, ghost_layers=gl_kernel) ast = ps.create_kernel(update_rule, config=config) kernel = ast.compile() if gl_kernel != gl_field: