diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py
index ef0bcc6d758fdb67fc42b708038882360a9eee65..f3ed2711c844c0bbd8c75100e9a7d1a2d506b7b7 100644
--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -448,7 +448,7 @@ class LoopOverCoordinate(Node):
     def new_loop_with_different_body(self, new_body):
         result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
                                     self.step, self.is_block_loop)
-        result.prefix_lines = [l for l in self.prefix_lines]
+        result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines]
         return result
 
     def subs(self, subs_dict):
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 486bd126025612914804a1011aaecc10e2912f59..de9cb0d31b3be9903ac758991a0fca345f17a4a7 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -444,7 +444,8 @@ class CustomSympyPrinter(CCodePrinter):
     def _print_Pow(self, expr):
         """Don't use std::pow function, for small integer exponents, write as multiplication"""
         if isinstance(expr.exp, sp.Integer) and (-8 < expr.exp < 8):
-            raise NotImplementedError("This pow should be simplified already?")
+            raise ValueError(f"This expression: {expr} contains a pow function that should be simplified already with "
+                             f"a sequence of multiplications")
         return super(CustomSympyPrinter, self)._print_Pow(expr)
 
     # TODO don't print ones in sp.Mul
@@ -767,7 +768,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
         return processed
 
     def _print_Pow(self, expr):
-        result = self._scalarFallback('_print_Pow', expr)
+        try:
+            result = self._scalarFallback('_print_Pow', expr)
+        except ValueError:
+            result = None
         if result:
             return result
 
@@ -799,7 +803,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
         # noinspection PyProtectedMember
         from sympy.core.mul import _keep_coeff
 
-        result = self._scalarFallback('_print_Mul', expr)
+        if not inside_add:
+            result = self._scalarFallback('_print_Mul', expr)
+        else:
+            result = None
         if result:
             return result
 
diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py
index b3236a3c5cab7925116431ae343e20ca0fea0f1f..3141eb4009579d4028a1a70a488a78f051519cdb 100644
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -9,7 +9,6 @@ import pystencils.astnodes as ast
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types,
                                get_type_of_expression, VectorMemoryAccess)
-from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
 from pystencils.functions import DivFunc
 from pystencils.field import Field
 from pystencils.integer_functions import modulo_ceil, modulo_floor
@@ -133,7 +132,6 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
     vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
                                                 strided, keep_loop_stop, assume_sufficient_line_padding,
                                                 default_float_type)
-    # is in vectorize_inner_loops_and_adapt_load_stores.. insert_vector_casts(kernel_ast, default_float_type)
 
 
 def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields,
@@ -143,8 +141,8 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
     vector_width = ast_node.instruction_set['width']
 
     all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
-    inner_loops = [n for n in all_loops if n.is_innermost_loop]
-    zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops}
+    inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
+    zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
 
     for loop_node in inner_loops:
         loop_range = loop_node.stop - loop_node.start
@@ -158,7 +156,8 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
             loop_node.stop = new_stop
         else:
             cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-            loop_nodes = [l for l in cut_loop(loop_node, [cutting_point]).args if isinstance(l, ast.LoopOverCoordinate)]
+            loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
+                          if isinstance(loop, ast.LoopOverCoordinate)]
             assert len(loop_nodes) in (0, 1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
             if len(loop_nodes) == 0:
                 continue
@@ -179,8 +178,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem
                     successful = False
                     break
                 typed_symbol = base.label
-                assert type(typed_symbol.dtype) is PointerType, \
-                    f"Type of access is {typed_symbol.dtype}, {indexed}"
+                assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}"
 
                 vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
                 use_aligned_access = aligned_access and assume_aligned
@@ -254,8 +252,7 @@ def mask_conditionals(loop_body):
 def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
     """Inserts necessary casts from scalar values to vector values."""
 
-    handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt, vec_any, vec_all, DivFunc,
-                         sp.UnevaluatedExpr, sp.Abs)
+    handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.UnevaluatedExpr, sp.Abs)
 
     def visit_expr(expr, default_type='double'):  # TODO Vectorization Revamp: get rid of default_type
         if isinstance(expr, VectorMemoryAccess):
diff --git a/pystencils/transformations.py b/pystencils/transformations.py
index 43beefd25f79eebe8b78c40d46186e69ae195d64..06005cbf709fdfeb74a14dc141f0256aee6039cb 100644
--- a/pystencils/transformations.py
+++ b/pystencils/transformations.py
@@ -13,6 +13,7 @@ from pystencils.typing import (CastFunc, PointerType, StructType, TypedSymbol, g
                                ReinterpretCastFunc, get_next_parent_of_type, parents_of_type)
 from pystencils.field import Field, FieldType
 from pystencils.typing import FieldPointerSymbol
+from pystencils.sympyextensions import fast_subs
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.slicing import normalize_slice
 from pystencils.integer_functions import int_div
@@ -650,11 +651,11 @@ def split_inner_loop(ast_node: ast.Node, symbol_groups):
                        and which no symbol in a symbol group depends on, are not updated!
     """
     all_loops = ast_node.atoms(ast.LoopOverCoordinate)
-    inner_loop = [l for l in all_loops if l.is_innermost_loop]
+    inner_loop = [loop for loop in all_loops if loop.is_innermost_loop]
     assert len(inner_loop) == 1, "Error in AST: multiple innermost loops. Was split transformation already called?"
     inner_loop = inner_loop[0]
     assert type(inner_loop.body) is ast.Block
-    outer_loop = [l for l in all_loops if l.is_outermost_loop]
+    outer_loop = [loop for loop in all_loops if loop.is_outermost_loop]
     assert len(outer_loop) == 1, "Error in AST, multiple outermost loops."
     outer_loop = outer_loop[0]
 
@@ -688,8 +689,8 @@ def split_inner_loop(ast_node: ast.Node, symbol_groups):
         assignment_group = []
         for assignment in inner_loop.body.args:
             if assignment.lhs in symbols_resolved:
-                new_rhs = assignment.rhs.subs(
-                    symbols_with_temporary_array.items())
+                # use fast_subs here because it checks if multiplications should be evaluated or not
+                new_rhs = fast_subs(assignment.rhs, symbols_with_temporary_array)
                 if not isinstance(assignment.lhs, Field.Access) and assignment.lhs in symbol_group:
                     assert type(assignment.lhs) is TypedSymbol
                     new_ts = TypedSymbol(assignment.lhs.name, PointerType(assignment.lhs.dtype))
diff --git a/pystencils_tests/test_transformations.py b/pystencils_tests/test_transformations.py
index 3ede70a85cac1ad1b10c46a90dab62390002f8a2..d6e6888b5027a4d403b1158d1204be97cc35455f 100644
--- a/pystencils_tests/test_transformations.py
+++ b/pystencils_tests/test_transformations.py
@@ -1,5 +1,7 @@
+import sympy as sp
+
 import pystencils as ps
-from pystencils import TypedSymbol
+from pystencils import fields, TypedSymbol
 from pystencils.astnodes import LoopOverCoordinate, SympyAssignment
 from pystencils.typing import create_type
 from pystencils.transformations import filtered_tree_iteration, get_loop_hierarchy, get_loop_counter_symbol_hierarchy
@@ -10,8 +12,8 @@ def test_loop_information():
     update_rule = ps.Assignment(g[0, 0], f[0, 0])
 
     ast = ps.create_kernel(update_rule)
-    inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
-                   if l.is_innermost_loop]
+    inner_loops = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
+                   if loop.is_innermost_loop]
 
     loop_order = []
     for i in get_loop_hierarchy(inner_loops[0].args[0]):
@@ -23,3 +25,58 @@ def test_loop_information():
 
     assert loop_symbols == [TypedSymbol("ctr_1", create_type("int"), nonnegative=True),
                             TypedSymbol("ctr_0", create_type("int"), nonnegative=True)]
+
+
+def test_split_optimisation():
+    src, dst = fields(f"src(9), dst(9): [2D]", layout='fzyx')
+
+    stencil = ((0, 0), (0, 1), (0, -1), (-1, 0), (1, 0), (-1, 1), (1, 1), (-1, -1), (1, -1))
+    w = [sp.Rational(4, 9)]
+    w += [sp.Rational(1, 9)] * 4
+    w += [sp.Rational(1, 36)] * 4
+    cs = sp.Rational(1, 3)
+
+    subexpressions = []
+    main_assignements = []
+
+    rho = sp.symbols("rho")
+    velo = sp.symbols("u_:2")
+
+    density = 0
+    velocity_x = 0
+    velocity_y = 0
+    for d in stencil:
+        density += src[d]
+        velocity_x += d[0] * src[d]
+        velocity_y += d[1] * src[d]
+
+    subexpressions.append(ps.Assignment(rho, density))
+    subexpressions.append(ps.Assignment(velo[0], velocity_x))
+    subexpressions.append(ps.Assignment(velo[1], velocity_y))
+
+    for i, d in enumerate(stencil):
+        u_d = velo[0] * d[0] + velo[1] * d[1]
+        u_2 = velo[0] * velo[0] + velo[1] * velo[1]
+
+        expr = w[i] * rho * (1 + u_d / cs + u_d ** 2 / (2 * cs ** 2) - u_2 / (2 * cs))
+
+        main_assignements.append(ps.Assignment(dst.center_vector[i], expr))
+
+    ac = ps.AssignmentCollection(main_assignments=main_assignements, subexpressions=subexpressions)
+
+    simplification_hint = {'density': rho,
+                           'velocity': (velo[0], velo[1]),
+                           'split_groups': [[rho, velo[0], velo[1], dst.center_vector[0]],
+                                            [dst.center_vector[1], dst.center_vector[2]],
+                                            [dst.center_vector[3], dst.center_vector[4]],
+                                            [dst.center_vector[5], dst.center_vector[6]],
+                                            [dst.center_vector[7], dst.center_vector[8]]]}
+
+    ac.simplification_hints = simplification_hint
+    ast = ps.create_kernel(ac)
+
+    code = ps.get_code_str(ast)
+    # after the split optimisation the two for loops are split into 6
+    assert code.count("for") == 6
+
+    print(code)
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 19f266b12b55ce66f971e0634e11f7178c6f70bc..f526341ec587b508f6c28f8dd2596125366f8ecc 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -8,7 +8,6 @@ import sympy as sp
 import pystencils as ps
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.cpu.vectorization import vectorize
-from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
 from pystencils.enums import Target
 from pystencils.transformations import replace_inner_stride_with_one
 
@@ -19,7 +18,6 @@ else:
     instruction_set = None
 
 
-
 # TODO: Skip tests if no instruction set is available and check all codes if they are really vectorised !
 def test_vector_type_propagation(instruction_set=instruction_set):
     a, b, c, d, e = sp.symbols("a b c d e")
@@ -276,35 +274,6 @@ def test_vectorised_pow(instruction_set=instruction_set):
     ast.compile()
 
 
-def test_vectorised_fast_approximations(instruction_set=instruction_set):
-    # fast_approximations are a gpu thing
-    arr = np.zeros((24, 24))
-    f, g = ps.fields(f=arr, g=arr)
-
-    expr = sp.sqrt(f[0, 0] + f[1, 0])
-    assignment = ps.Assignment(g[0, 0], insert_fast_sqrts(expr))
-    ast = ps.create_kernel(assignment)
-    vectorize(ast, instruction_set=instruction_set)
-
-    with pytest.raises(Exception):
-        ast.compile()
-
-    expr = f[0, 0] / f[1, 0]
-    assignment = ps.Assignment(g[0, 0], insert_fast_divisions(expr))
-    ast = ps.create_kernel(assignment)
-    vectorize(ast, instruction_set=instruction_set)
-
-    with pytest.raises(Exception):
-        ast.compile()
-
-    assignment = ps.Assignment(sp.Symbol("tmp"), 3 / sp.sqrt(f[0, 0] + f[1, 0]))
-    ast = ps.create_kernel(insert_fast_sqrts(assignment))
-    vectorize(ast, instruction_set=instruction_set)
-
-    with pytest.raises(Exception):
-        ast.compile()
-
-
 def test_issue40(*_):
     """https://i10git.cs.fau.de/pycodegen/pystencils/-/issues/40"""
     opt = {'instruction_set': "avx512", 'assume_aligned': False,
diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py
index 367250dda361c2d08c3f614741c8566b545423ae..49152a42021405677e493a9a365a51e00285b5ab 100644
--- a/pystencils_tests/test_vectorization_specific.py
+++ b/pystencils_tests/test_vectorization_specific.py
@@ -120,7 +120,8 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
     update_rule = ps.Assignment(dst[0, 0], src[0, 0])
     opt = {'instruction_set': instruction_set, 'assume_aligned': True,
            'nontemporal': True, 'assume_inner_stride_one': True}
-    config = pystencils.config.CreateKernelConfig(target=dh.default_target, cpu_vectorize_info=opt, ghost_layers=gl_kernel)
+    config = pystencils.config.CreateKernelConfig(target=dh.default_target,
+                                                  cpu_vectorize_info=opt, ghost_layers=gl_kernel)
     ast = ps.create_kernel(update_rule, config=config)
     kernel = ast.compile()
     if gl_kernel != gl_field: