diff --git a/pystencils/alignedarray.py b/pystencils/alignedarray.py index 26c3aa5ba90798b7e21b221951d5b75b36fdeaea..067a26d58370460beb1852add9ae46b9709b0595 100644 --- a/pystencils/alignedarray.py +++ b/pystencils/alignedarray.py @@ -1,5 +1,4 @@ import numpy as np -from pystencils.typing import numpy_name_to_c def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True): @@ -21,26 +20,25 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size, get_vector_instruction_set) - type_name = numpy_name_to_c(np.dtype(dtype).name) instruction_sets = get_supported_instruction_sets() if instruction_sets is None: byte_alignment = 64 elif byte_alignment == 'cacheline': cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] if all([s is None for s in cacheline_sizes]): - widths = [get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize + widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize for is_name in instruction_sets - if type(get_vector_instruction_set(type_name, is_name)['width']) is int] + if type(get_vector_instruction_set(dtype, is_name)['width']) is int] byte_alignment = 64 if all([s is None for s in widths]) else max(widths) else: byte_alignment = max([s for s in cacheline_sizes if s is not None]) - elif not any([type(get_vector_instruction_set(type_name, is_name)['width']) is int + elif not any([type(get_vector_instruction_set(dtype, is_name)['width']) is int for is_name in instruction_sets]): byte_alignment = 64 else: - byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize + byte_alignment = max([get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize for is_name in instruction_sets - if type(get_vector_instruction_set(type_name, is_name)['width']) is int]) + if type(get_vector_instruction_set(dtype, is_name)['width']) is int]) if (not align_inner_coordinate) or (not hasattr(shape, '__len__')): size = np.prod(shape) d = np.dtype(dtype) @@ -78,7 +76,7 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o return tmp -def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True): +def aligned_zeros(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True): arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset, order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate) x = np.zeros((), arr.dtype) @@ -86,7 +84,7 @@ def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order= return arr -def aligned_ones(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True): +def aligned_ones(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True): arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset, order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate) x = np.ones((), arr.dtype) diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index e06c298987650d507d6c8f8aa21ffc09001efda3..7665f9dfc3ad32ac8ea9ead994af46e48b6610fa 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -629,6 +629,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): def _print_CastFunc(self, expr): arg, data_type = expr.args if type(data_type) is VectorType: + base_type = data_type.base_type # vector_memory_access is a cast_func itself so it should't be directly inside a cast_func assert not isinstance(arg, VectorMemoryAccess) if isinstance(arg, sp.Tuple): @@ -648,19 +649,18 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): elif isinstance(arg, TypedSymbol): return self._typed_vectorized_symbol(arg, data_type) elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \ - and data_type == BasicType('float32'): + and base_type == BasicType('float32'): raise NotImplementedError('Vectorizer is not tested for trigonometric functions yet') # known = self.known_functions[arg.__class__.__name__.lower()] # code = self._print(arg) # return code.replace(known, f"{known}f") - elif isinstance(arg, sp.Pow) and data_type == BasicType('float32'): - raise NotImplementedError('Vectorizer cannot print casted aka. not double pow') - # known = ['sqrt', 'cbrt', 'pow'] - # code = self._print(arg) - # for k in known: - # if k in code: - # return code.replace(k, f'{k}f') - # raise ValueError(f"{code} doesn't give {known=} function back.") + elif isinstance(arg, sp.Pow): + if base_type == BasicType('float32') or base_type == BasicType('float64'): + return self._print_Pow(arg) + else: + raise NotImplementedError('Integer Pow is not implemented') + elif isinstance(arg, sp.UnevaluatedExpr): + return self._print(arg.args[0]) else: raise NotImplementedError('Vectorizer cannot cast between different datatypes') # to_type = self.instruction_set['suffix'][data_type.base_type.c_name] @@ -770,6 +770,8 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): return processed def _print_Pow(self, expr): + # Due to loop cutting sp.Mul is evaluated again. + try: result = self._scalarFallback('_print_Pow', expr) except ValueError: @@ -778,26 +780,21 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): return result one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs) + root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs) if isinstance(expr.exp, CastFunc) and expr.exp.args[0].is_number: exp = expr.exp.args[0] else: exp = expr.exp + # TODO the printer should not have any intelligence like this. + # TODO To remove all of these cases the vectoriser needs to be reworked. See loop cutting if exp.is_integer and exp.is_number and 0 < exp < 8: - return "(" + self._print(sp.Mul(*[expr.base] * exp, evaluate=False)) + ")" - elif exp == -1: - one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs) - return self.instruction_set['/'].format(one, self._print(expr.base), **self._kwargs) + return self._print(sp.Mul(*[expr.base] * exp, evaluate=False)) elif exp == 0.5: - return self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs) + return root elif exp == -0.5: - root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs) return self.instruction_set['/'].format(one, root, **self._kwargs) - elif exp.is_integer and exp.is_number and - 8 < exp < 0: - return self.instruction_set['/'].format(one, - self._print(sp.Mul(*[expr.base] * (-exp), evaluate=False)), - **self._kwargs) else: raise ValueError("Generic exponential not supported: " + str(expr)) diff --git a/pystencils/backends/simd_instruction_sets.py b/pystencils/backends/simd_instruction_sets.py index f2df619639ac8fe2352f4d61bd37a6841defe406..6af548048c6784d65aa9a1537c953d56ce03e346 100644 --- a/pystencils/backends/simd_instruction_sets.py +++ b/pystencils/backends/simd_instruction_sets.py @@ -2,22 +2,34 @@ import math import os import platform from ctypes import CDLL +from warnings import warn + +import numpy as np from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv +from pystencils.typing import numpy_name_to_c def get_vector_instruction_set(data_type='double', instruction_set='avx'): + if data_type == 'float': + warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. " + f"For more information please take numpy.dtype as a reference. This input will not be supported in future " + f"releases") + data_type = 'float64' + + type_name = numpy_name_to_c(np.dtype(data_type).name) + if instruction_set in ['neon'] or instruction_set.startswith('sve'): - return get_vector_instruction_set_arm(data_type, instruction_set) + return get_vector_instruction_set_arm(type_name, instruction_set) elif instruction_set in ['vsx']: - return get_vector_instruction_set_ppc(data_type, instruction_set) + return get_vector_instruction_set_ppc(type_name, instruction_set) elif instruction_set in ['rvv']: - return get_vector_instruction_set_riscv(data_type, instruction_set) + return get_vector_instruction_set_riscv(type_name, instruction_set) else: - return get_vector_instruction_set_x86(data_type, instruction_set) + return get_vector_instruction_set_x86(type_name, instruction_set) _cache = None diff --git a/pystencils/cpu/vectorization.py b/pystencils/cpu/vectorization.py index 3adcbc31b0840ac5e4973c3c06cce850217e5890..420e57c709e1697885ea686fe2596ad4c417b5ff 100644 --- a/pystencils/cpu/vectorization.py +++ b/pystencils/cpu/vectorization.py @@ -123,7 +123,7 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', "to differently typed floating point fields") float_size = field_float_dtypes.pop().numpy_dtype.itemsize assert float_size in (8, 4) - default_float_type = 'double' if float_size == 8 else 'float' + default_float_type = 'float64' if float_size == 8 else 'float32' vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set) kernel_ast.instruction_set = vector_is @@ -158,6 +158,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontem loop_node.stop = new_stop else: cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start + # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args if isinstance(loop, ast.LoopOverCoordinate)] assert len(loop_nodes) in (0, 1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width @@ -254,7 +255,7 @@ def mask_conditionals(loop_body): def insert_vector_casts(ast_node, instruction_set, default_float_type='double'): """Inserts necessary casts from scalar values to vector values.""" - handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.UnevaluatedExpr, sp.Abs) + handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs) def visit_expr(expr, default_type='double'): # TODO Vectorization Revamp: get rid of default_type if isinstance(expr, VectorMemoryAccess): @@ -296,6 +297,26 @@ def insert_vector_casts(ast_node, instruction_set, default_float_type='double'): CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a for a, t in zip(new_args, arg_types)] return expr.func(*casted_args) + elif expr.func is sp.UnevaluatedExpr: + assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow" + # TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should + # TODO be fixed for real at some point. + if expr.args[0].is_Pow: + base = expr.args[0].base + exp = expr.args[0].exp + expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False)) + + new_args = [visit_expr(a, default_type) for a in expr.args[0].args] + arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args] + + target_type = collate_types(arg_types) + if not any(type(t) is VectorType for t in arg_types): + target_type = VectorType(target_type, instruction_set['width']) + + casted_args = [ + CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a + for a, t in zip(new_args, arg_types)] + return expr.func(expr.args[0].func(*casted_args, evaluate=False)) elif expr.func is sp.Pow: new_arg = visit_expr(expr.args[0], default_type) return expr.func(new_arg, expr.args[1]) diff --git a/pystencils/node_collection.py b/pystencils/node_collection.py index 804a74e207bc79756a286feab2d14c59ff123713..227e1a10deb391cc8782a4678060164b754bd44b 100644 --- a/pystencils/node_collection.py +++ b/pystencils/node_collection.py @@ -43,14 +43,15 @@ class NodeCollection: def evaluate_terms(self): evaluate_constant_terms = ReplaceOptim( lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer, - lambda p: p.evalf()) + lambda p: p.evalf() + ) evaluate_pow = ReplaceOptim( lambda e: e.is_Pow and e.exp.is_Integer and abs(e.exp) <= 8, - lambda p: ( - sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else - DivFunc(sp.Integer(1), sp.Mul(*([p.base] * -p.exp), evaluate=False)) - )) + lambda p: sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else + (DivFunc(sp.Integer(1), p.base) if p.exp == -1 else + DivFunc(sp.Integer(1), sp.UnevaluatedExpr(sp.Mul(*([p.base] * -p.exp), evaluate=False)))) + ) sympy_optimisations = [evaluate_constant_terms, evaluate_pow] if self.is_Nodes: @@ -65,6 +66,7 @@ class NodeCollection: return optimize(node, sympy_optimisations) else: raise NotImplementedError(f'{node} {type(node)} has no valid visitor') + self.all_assignments = [visitor(assignment) for assignment in self.all_assignments] else: self.all_assignments = [Assignment(a.lhs, optimize(a.rhs, sympy_optimisations)) diff --git a/pystencils/transformations.py b/pystencils/transformations.py index 09c2e0976ddf14e2745a796e060a79e011d8ca67..1a690409d2696af5aafa8461a38ee20eb7a655c1 100644 --- a/pystencils/transformations.py +++ b/pystencils/transformations.py @@ -721,7 +721,8 @@ def cut_loop(loop_node, cutting_points): One loop is transformed into len(cuttingPoints)+1 new loops that range from old_begin to cutting_points[1], ..., cutting_points[-1] to old_end - Modifies the ast in place + Modifies the ast in place. Note Issue #5783 of SymPy. Deepcopy will evaluate mul + https://github.com/sympy/sympy/issues/5783 Returns: list of new loop nodes diff --git a/pystencils_tests/test_conditional_vec.py b/pystencils_tests/test_conditional_vec.py index 6cb60006d05f6e5afa5b562173b0e66c9c689920..ebb25d50d58b2a1ba0c8a5b7a5e089f1e6a7ea5d 100644 --- a/pystencils_tests/test_conditional_vec.py +++ b/pystencils_tests/test_conditional_vec.py @@ -13,13 +13,13 @@ supported_instruction_sets = get_supported_instruction_sets() if get_supported_i @pytest.mark.parametrize('instruction_set', supported_instruction_sets) -@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_any(instruction_set, dtype): if instruction_set in ['sve', 'rvv']: width = 4 # we don't know the actual value else: width = get_vector_instruction_set(dtype, instruction_set)['width'] - data_arr = np.zeros((4 * width, 4 * width), dtype=np.float64 if dtype == 'double' else np.float32) + data_arr = np.zeros((4 * width, 4 * width), dtype=dtype) data_arr[3:9, 1:3 * width - 1] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) @@ -42,13 +42,13 @@ def test_vec_any(instruction_set, dtype): @pytest.mark.parametrize('instruction_set', supported_instruction_sets) -@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_all(instruction_set, dtype): if instruction_set in ['sve', 'rvv']: width = 1000 # we don't know the actual value, need something guaranteed larger than vector else: width = get_vector_instruction_set(dtype, instruction_set)['width'] - data_arr = np.zeros((4 * width, 4 * width), dtype=np.float64 if dtype == 'double' else np.float32) + data_arr = np.zeros((4 * width, 4 * width), dtype=dtype) data_arr[3:9, 1:3 * width - 1] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) @@ -95,7 +95,7 @@ def test_boolean_before_loop(): @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float32', 'float64')) def test_vec_maskstore(instruction_set, dtype): - data_arr = np.zeros((16, 16), dtype=np.float64 if dtype == 'float64' else np.float32) + data_arr = np.zeros((16, 16), dtype=dtype) data_arr[3:-3, 3:-3] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py index 7b97b7b0a8ae99a98993b8807744931a97a6ee78..3718770b9384e0f01dd899c3afed298208fc95a9 100644 --- a/pystencils_tests/test_vectorization.py +++ b/pystencils_tests/test_vectorization.py @@ -261,7 +261,6 @@ def test_vectorised_pow(instruction_set=instruction_set): ast = ps.create_kernel(as1) vectorize(ast, instruction_set=instruction_set) - print(ast) ast.compile() ast = ps.create_kernel(as2) @@ -299,4 +298,3 @@ def test_issue40(*_): code = ps.get_code_str(ast) assert 'epi32' not in code - diff --git a/pystencils_tests/test_vectorization_specific.py b/pystencils_tests/test_vectorization_specific.py index 49152a42021405677e493a9a365a51e00285b5ab..c6a3bf2210727921f58fd06608c0366132860a2d 100644 --- a/pystencils_tests/test_vectorization_specific.py +++ b/pystencils_tests/test_vectorization_specific.py @@ -37,13 +37,13 @@ def test_vectorisation_varying_arch(instruction_set): np.testing.assert_equal(arr, 2) -@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) def test_vectorized_abs(instruction_set, dtype): """Some instructions sets have abs, some don't. Furthermore, the special treatment of unary minus makes this data type-sensitive too. """ - arr = np.ones((2 ** 2 + 2, 2 ** 3 + 2), dtype=np.float64 if dtype == 'double' else np.float32) + arr = np.ones((2 ** 2 + 2, 2 ** 3 + 2), dtype=dtype) arr[-3:, :] = -1 f, g = ps.fields(f=arr, g=arr) @@ -58,42 +58,39 @@ def test_vectorized_abs(instruction_set, dtype): np.testing.assert_equal(np.sum(dst[1:-1, 1:-1]), 2 ** 2 * 2 ** 3) -@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) def test_strided(instruction_set, dtype): - type_string = "float64" if dtype == 'double' else "float32" - - f, g = ps.fields(f"f, g : {type_string}[2D]") + f, g = ps.fields(f"f, g : {dtype}[2D]") update_rule = [ps.Assignment(g[0, 0], f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)] - if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) and instruction_set not in ['avx512', - 'rvv'] and not instruction_set.startswith( - 'sve'): + if 'storeS' not in get_vector_instruction_set(dtype, instruction_set) \ + and instruction_set not in ['avx512', 'rvv'] and not instruction_set.startswith('sve'): with pytest.warns(UserWarning) as warn: config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, - default_number_float=type_string) + default_number_float=dtype) ast = ps.create_kernel(update_rule, config=config) assert 'Could not vectorize loop' in warn[0].message.args[0] else: with pytest.warns(None) as warn: config = pystencils.config.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set}, - default_number_float=type_string) + default_number_float=dtype) ast = ps.create_kernel(update_rule, config=config) assert len(warn) == 0 # ps.show_code(ast) func = ast.compile() - ref_config = pystencils.config.CreateKernelConfig(default_number_float=type_string) + ref_config = pystencils.config.CreateKernelConfig(default_number_float=dtype) ref_func = ps.create_kernel(update_rule, config=ref_config).compile() # For some reason other array creations fail on the emulated ppc pipeline size = (25, 19) - arr = np.zeros(size).astype(type_string) + arr = np.zeros(size).astype(dtype) for i in range(size[0]): for j in range(size[1]): arr[i, j] = i * j - dst = np.zeros_like(arr, dtype=type_string) - ref = np.zeros_like(arr, dtype=type_string) + dst = np.zeros_like(arr, dtype=dtype) + ref = np.zeros_like(arr, dtype=dtype) func(g=dst, f=arr) ref_func(g=ref, f=arr) @@ -101,15 +98,13 @@ def test_strided(instruction_set, dtype): # print("dst: ", dst) # print("np array: ", arr) - np.testing.assert_almost_equal(dst[1:-1, 1:-1], ref[1:-1, 1:-1], 13 if dtype == 'double' else 5) + np.testing.assert_almost_equal(dst[1:-1, 1:-1], ref[1:-1, 1:-1], 13 if dtype == 'float64' else 5) -@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('gl_field, gl_kernel', [(1, 0), (0, 1), (1, 1)]) def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set, dtype): - dtype = np.float64 if dtype == 'double' else np.float32 - domain_size = (128, 128) dh = ps.create_data_handling(domain_size, periodicity=(True, True), default_target=Target.CPU) src = dh.add_array("src", values_per_cell=1, dtype=dtype, ghost_layers=gl_field, alignment=True) @@ -153,18 +148,151 @@ def test_vectorization_other(instruction_set, function): test_vectorization.__dict__[function](instruction_set) -@pytest.mark.parametrize('dtype', ('float', 'double')) +@pytest.mark.parametrize('dtype', ('float32', 'float64')) @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('field_layout', ('fzyx', 'zyxf')) def test_square_root(dtype, instruction_set, field_layout): config = pystencils.config.CreateKernelConfig(data_type=dtype, + default_number_float=dtype, cpu_vectorize_info={'instruction_set': instruction_set, - 'assume_inner_stride_one': True, - 'assume_aligned': False, 'nontemporal': False}) + 'assume_inner_stride_one': True, + 'assume_aligned': False, + 'nontemporal': False}) src_field = ps.Field.create_generic('pdfs', 2, dtype, index_dimensions=1, layout=field_layout, index_shape=(9,)) eq = [ps.Assignment(sp.Symbol("xi"), sum(src_field.center_vector)), ps.Assignment(sp.Symbol("xi_2"), sp.Symbol("xi") * sp.sqrt(src_field.center))] - ps.create_kernel(eq, config=config).compile() + ast = ps.create_kernel(eq, config=config) + ast.compile() + code = ps.get_code_str(ast) + print(code) + + +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('padding', (True, False)) +def test_square_root_2(dtype, instruction_set, padding): + x, y = sp.symbols("x y") + src = ps.fields(f"src: {dtype}[2D]", layout='fzyx') + + up = ps.Assignment(src[0, 0], 1 / x + sp.sqrt(y * 0.52 + x ** 2)) + + cpu_vec = {'instruction_set': instruction_set, 'assume_inner_stride_one': True, + 'assume_sufficient_line_padding': padding, + 'assume_aligned': True} + + config = ps.CreateKernelConfig(data_type=dtype, default_number_float=dtype, cpu_vectorize_info=cpu_vec) + ast = ps.create_kernel(up, config=config) + ast.compile() + + code = ps.get_code_str(ast) + print(code) + + +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('padding', (True, False)) +def test_pow(dtype, instruction_set, padding): + config = pystencils.config.CreateKernelConfig(data_type=dtype, + default_number_float=dtype, + cpu_vectorize_info={'instruction_set': instruction_set, + 'assume_inner_stride_one': True, + 'assume_sufficient_line_padding': padding, + 'assume_aligned': False, 'nontemporal': False}) + + src_field = ps.Field.create_generic('pdfs', 2, dtype, index_dimensions=1, layout='fzyx', index_shape=(9,)) + + eq = [ps.Assignment(sp.Symbol("xi"), sum(src_field.center_vector)), + ps.Assignment(sp.Symbol("xi_2"), sp.Symbol("xi") * sp.Pow(src_field.center, 0.5))] + + ast = ps.create_kernel(eq, config=config) + ast.compile() + code = ps.get_code_str(ast) + + print(code) + + +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +@pytest.mark.parametrize('padding', (True, False)) +def test_issue62(dtype, instruction_set, padding): + opt = {'instruction_set': instruction_set, 'assume_aligned': True, + 'assume_inner_stride_one': True, + 'assume_sufficient_line_padding': padding} + + dx = sp.Symbol("dx") + dy = sp.Symbol("dy") + src, dst, rhs = ps.fields(f"src, src_tmp, rhs: {dtype}[2D]", layout='fzyx') + + up = ps.Assignment(src[0, 0], ((dy ** 2 * (src[1, 0] + src[-1, 0])) + + (dx ** 2 * (src[0, 1] + src[0, -1])) + - (rhs[0, 0] * dx ** 2 * dy ** 2)) / (2 * (dx ** 2 + dy ** 2))) + + config = ps.CreateKernelConfig(data_type=dtype, + default_number_float=dtype, + cpu_vectorize_info=opt) + + ast = ps.create_kernel(up, config=config) + ast.compile() + code = ps.get_code_str(ast) + + print(code) + + assert 'pow' not in code + + +@pytest.mark.parametrize('dtype', ('float32', 'float64')) +@pytest.mark.parametrize('instruction_set', supported_instruction_sets) +def test_div_and_unevaluated_expr(dtype, instruction_set): + opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'assume_inner_stride_one': True, + 'assume_sufficient_line_padding': False} + + x, y, z = sp.symbols("x y z") + rhs = (-4 * x ** 4 * y ** 2 * z ** 2 + (x ** 4 * y ** 2 / 3) + (x ** 4 * z ** 2 / 3)) / x ** 3 + + src = ps.fields(f"src: {dtype}[2D]", layout='fzyx') + + up = ps.Assignment(src[0, 0], rhs) + + config = ps.CreateKernelConfig(data_type=dtype, + default_number_float=dtype, + cpu_vectorize_info=opt) + + ast = ps.create_kernel(up, config=config) + code = ps.get_code_str(ast) + # print(code) + + ast.compile() + + assert 'pow' not in code + + +# TODO this test case needs a complete rework of the vectoriser. The reason is that the vectoriser does not +# TODO vectorise symbols at the moment because they could be strides or field sizes, thus involved in pointer arithmetic +# TODO This means that the vectoriser only works if fields are involved on the rhs. +# @pytest.mark.parametrize('dtype', ('float32', 'float64')) +# @pytest.mark.parametrize('instruction_set', supported_instruction_sets) +# def test_vectorised_symbols(dtype, instruction_set): +# opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'assume_inner_stride_one': True, +# 'assume_sufficient_line_padding': False} +# +# x, y, z = sp.symbols("x y z") +# rhs = 1 / x ** 2 * (x + y) +# +# src = ps.fields(f"src: {dtype}[2D]", layout='fzyx') +# +# up = ps.Assignment(src[0, 0], rhs) +# +# config = ps.CreateKernelConfig(data_type=dtype, +# default_number_float=dtype, +# cpu_vectorize_info=opt) +# +# ast = ps.create_kernel(up, config=config) +# code = ps.get_code_str(ast) +# print(code) +# +# ast.compile() +# +# assert 'pow' not in code