Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 315 additions and 17 deletions
...@@ -22,6 +22,8 @@ if get_compiler_config()['os'] == 'windows': ...@@ -22,6 +22,8 @@ if get_compiler_config()['os'] == 'windows':
instruction_sets.remove('avx') instruction_sets.remove('avx')
if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower(): if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower():
instruction_sets.remove('avx512') instruction_sets.remove('avx512')
if 'avx512vl' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower():
instruction_sets.remove('avx512vl')
@pytest.mark.parametrize('target, rng', ((Target.CPU, 'philox'), (Target.CPU, 'aesni'), (Target.GPU, 'philox'))) @pytest.mark.parametrize('target, rng', ((Target.CPU, 'philox'), (Target.CPU, 'aesni'), (Target.GPU, 'philox')))
...@@ -29,8 +31,8 @@ if get_compiler_config()['os'] == 'windows': ...@@ -29,8 +31,8 @@ if get_compiler_config()['os'] == 'windows':
@pytest.mark.parametrize('dtype', ('float', 'double')) @pytest.mark.parametrize('dtype', ('float', 'double'))
def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None): def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
if target == Target.GPU: if target == Target.GPU:
pytest.importorskip('pycuda') pytest.importorskip('cupy')
if instruction_sets and {'neon', 'sve', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni': if instruction_sets and {'neon', 'sve', 'sve2', 'sme', 'vsx', 'rvv'}.intersection(instruction_sets) and rng == 'aesni':
pytest.xfail('AES not yet implemented for this architecture') pytest.xfail('AES not yet implemented for this architecture')
if rng == 'aesni' and len(keys) == 2: if rng == 'aesni' and len(keys) == 2:
keys *= 2 keys *= 2
...@@ -120,7 +122,7 @@ def test_rng_offsets(kind, vectorized): ...@@ -120,7 +122,7 @@ def test_rng_offsets(kind, vectorized):
@pytest.mark.parametrize('rng', ('philox', 'aesni')) @pytest.mark.parametrize('rng', ('philox', 'aesni'))
@pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double'))) @pytest.mark.parametrize('precision,dtype', (('float', 'float'), ('double', 'double')))
def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None): def test_rng_vectorized(target, rng, precision, dtype, t=130, offsets=(1, 3), keys=(0, 0), offset_values=None):
if (target in ['neon', 'vsx', 'rvv'] or target.startswith('sve')) and rng == 'aesni': if (target in ['neon', 'vsx', 'rvv', 'sme'] or target.startswith('sve')) and rng == 'aesni':
pytest.xfail('AES not yet implemented for this architecture') pytest.xfail('AES not yet implemented for this architecture')
cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target} cpu_vectorize_info = {'assume_inner_stride_one': True, 'assume_aligned': True, 'instruction_set': target}
......
...@@ -4,14 +4,14 @@ import pytest ...@@ -4,14 +4,14 @@ import pytest
import pystencils.config import pystencils.config
import sympy as sp import sympy as sp
import pystencils as ps import pystencils as ps
import numpy as np
from pystencils import Assignment, AssignmentCollection, fields
from pystencils.simp import subexpression_substitution_in_main_assignments from pystencils.simp import subexpression_substitution_in_main_assignments
from pystencils.simp import add_subexpressions_for_divisions from pystencils.simp import add_subexpressions_for_divisions
from pystencils.simp import add_subexpressions_for_sums from pystencils.simp import add_subexpressions_for_sums
from pystencils.simp import add_subexpressions_for_field_reads from pystencils.simp import add_subexpressions_for_field_reads
from pystencils.simp.simplifications import add_subexpressions_for_constants from pystencils.simp.simplifications import add_subexpressions_for_constants
from pystencils import Assignment, AssignmentCollection, fields from pystencils.typing import BasicType, TypedSymbol
a, b, c, d, x, y, z = sp.symbols("a b c d x y z") a, b, c, d, x, y, z = sp.symbols("a b c d x y z")
s0, s1, s2, s3 = sp.symbols("s_:4") s0, s1, s2, s3 = sp.symbols("s_:4")
...@@ -133,14 +133,28 @@ def test_add_subexpressions_for_sums(): ...@@ -133,14 +133,28 @@ def test_add_subexpressions_for_sums():
def test_add_subexpressions_for_field_reads(): def test_add_subexpressions_for_field_reads():
s, v = fields("s(5), v(5): double[2D]") s, v = fields("s(5), v(5): double[2D]")
subexpressions = [] subexpressions = []
main = [
Assignment(s[0, 0](0), 3 * v[0, 0](0)), main = [Assignment(s[0, 0](0), 3 * v[0, 0](0)),
Assignment(s[0, 0](1), 10 * v[0, 0](1)) Assignment(s[0, 0](1), 10 * v[0, 0](1))]
]
ac = AssignmentCollection(main, subexpressions) ac = AssignmentCollection(main, subexpressions)
assert len(ac.subexpressions) == 0 assert len(ac.subexpressions) == 0
ac = add_subexpressions_for_field_reads(ac) ac2 = add_subexpressions_for_field_reads(ac)
assert len(ac.subexpressions) == 2 assert len(ac2.subexpressions) == 2
ac3 = add_subexpressions_for_field_reads(ac, data_type="float32")
assert len(ac3.subexpressions) == 2
assert isinstance(ac3.subexpressions[0].lhs, TypedSymbol)
assert ac3.subexpressions[0].lhs.dtype == BasicType("float32")
# added check for early out of add_subexpressions_for_field_reads is no fields appear on the rhs (See #92)
main = [Assignment(s[0, 0](0), 3.0),
Assignment(s[0, 0](1), 4.0)]
ac4 = AssignmentCollection(main, subexpressions)
assert len(ac4.subexpressions) == 0
ac5 = add_subexpressions_for_field_reads(ac4)
assert ac5 is not None
assert ac4 is ac5
@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU)) @pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
...@@ -148,7 +162,7 @@ def test_add_subexpressions_for_field_reads(): ...@@ -148,7 +162,7 @@ def test_add_subexpressions_for_field_reads():
@pytest.mark.skipif((vs.major, vs.minor, vs.micro) == (3, 8, 2), reason="does not work on python 3.8.2 for some reason") @pytest.mark.skipif((vs.major, vs.minor, vs.micro) == (3, 8, 2), reason="does not work on python 3.8.2 for some reason")
def test_sympy_optimizations(target, dtype): def test_sympy_optimizations(target, dtype):
if target == ps.Target.GPU: if target == ps.Target.GPU:
pytest.importorskip("pycuda") pytest.importorskip("cupy")
src, dst = ps.fields(f'src, dst: {dtype}[2d]') src, dst = ps.fields(f'src, dst: {dtype}[2d]')
assignments = ps.AssignmentCollection({ assignments = ps.AssignmentCollection({
...@@ -172,7 +186,7 @@ def test_sympy_optimizations(target, dtype): ...@@ -172,7 +186,7 @@ def test_sympy_optimizations(target, dtype):
@pytest.mark.skipif((vs.major, vs.minor, vs.micro) == (3, 8, 2), reason="does not work on python 3.8.2 for some reason") @pytest.mark.skipif((vs.major, vs.minor, vs.micro) == (3, 8, 2), reason="does not work on python 3.8.2 for some reason")
def test_evaluate_constant_terms(target, simplification): def test_evaluate_constant_terms(target, simplification):
if target == ps.Target.GPU: if target == ps.Target.GPU:
pytest.importorskip("pycuda") pytest.importorskip("cupy")
src, dst = ps.fields('src, dst: float32[2d]') src, dst = ps.fields('src, dst: float32[2d]')
# cos of a number will always be simplified # cos of a number will always be simplified
......
import numpy as np
import sympy as sp
import pytest
from pystencils import (
Assignment,
Field,
TypedSymbol,
create_kernel,
make_slice,
Target,
create_data_handling,
)
from pystencils.simp import sympy_cse_on_assignment_list
@pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
def test_sliced_iteration(target):
if target == Target.GPU:
pytest.importorskip("cupy")
size = (4, 4)
dh = create_data_handling(size, default_target=target, default_ghost_layers=0)
src_field = dh.add_array("src", 1)
dst_field = dh.add_array("dst", 1)
dh.fill(src_field.name, 1.0, ghost_layers=True)
dh.fill(dst_field.name, 0.0, ghost_layers=True)
a, b = sp.symbols("a b")
update_rule = Assignment(
dst_field[0, 0],
(
a * src_field[0, 1]
+ a * src_field[0, -1]
+ b * src_field[1, 0]
+ b * src_field[-1, 0]
)
/ 4,
)
s = make_slice[1:3, 1]
kernel = create_kernel(
sympy_cse_on_assignment_list([update_rule]), iteration_slice=s, target=target
).compile()
if target == Target.GPU:
dh.all_to_gpu()
dh.run_kernel(kernel, a=1.0, b=1.0)
if target == Target.GPU:
dh.all_to_cpu()
expected_result = np.zeros(size)
expected_result[1:3, 1] = 1
np.testing.assert_almost_equal(dh.gather_array(dst_field.name), expected_result)
@pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
def test_symbols_in_slice(target):
if target == Target.GPU:
pytest.xfail("Iteration slices including arbitrary symbols are currently broken on GPU")
size = (4, 4)
dh = create_data_handling(size, default_target=target, default_ghost_layers=0)
src_field = dh.add_array("src", 1)
dst_field = dh.add_array("dst", 1)
dh.fill(src_field.name, 1.0, ghost_layers=True)
dh.fill(dst_field.name, 0.0, ghost_layers=True)
a, b = sp.symbols("a b")
update_rule = Assignment(
dst_field[0, 0],
(
a * src_field[0, 1]
+ a * src_field[0, -1]
+ b * src_field[1, 0]
+ b * src_field[-1, 0]
)
/ 4,
)
x_end = TypedSymbol("x_end", "int")
s = make_slice[1:x_end, 1]
x_end_value = size[1] - 1
kernel = create_kernel(
sympy_cse_on_assignment_list([update_rule]), iteration_slice=s, target=target
).compile()
if target == Target.GPU:
dh.all_to_gpu()
dh.run_kernel(kernel, a=1.0, b=1.0, x_end=x_end_value)
if target == Target.GPU:
dh.all_to_cpu()
expected_result = np.zeros(size)
expected_result[1:x_end_value, 1] = 1
np.testing.assert_almost_equal(dh.gather_array(dst_field.name), expected_result)
...@@ -15,6 +15,7 @@ from pystencils.sympyextensions import scalar_product ...@@ -15,6 +15,7 @@ from pystencils.sympyextensions import scalar_product
from pystencils.sympyextensions import kronecker_delta from pystencils.sympyextensions import kronecker_delta
from pystencils import Assignment from pystencils import Assignment
from pystencils.functions import DivFunc
from pystencils.fast_approximation import (fast_division, fast_inv_sqrt, fast_sqrt, from pystencils.fast_approximation import (fast_division, fast_inv_sqrt, fast_sqrt,
insert_fast_divisions, insert_fast_sqrts) insert_fast_divisions, insert_fast_sqrts)
...@@ -163,6 +164,30 @@ def test_count_operations(): ...@@ -163,6 +164,30 @@ def test_count_operations():
assert ops['divs'] == 1 assert ops['divs'] == 1
assert ops['sqrts'] == 1 assert ops['sqrts'] == 1
expr = DivFunc(x, y)
ops = count_operations(expr, only_type=None)
assert ops['divs'] == 1
expr = DivFunc(x + z, y + z)
ops = count_operations(expr, only_type=None)
assert ops['adds'] == 2
assert ops['divs'] == 1
expr = sp.UnevaluatedExpr(sp.Mul(*[x]*100, evaluate=False))
ops = count_operations(expr, only_type=None)
assert ops['muls'] == 99
expr = DivFunc(1, sp.UnevaluatedExpr(sp.Mul(*[x]*100, evaluate=False)))
ops = count_operations(expr, only_type=None)
assert ops['divs'] == 1
assert ops['muls'] == 99
expr = DivFunc(y + z, sp.UnevaluatedExpr(sp.Mul(*[x]*100, evaluate=False)))
ops = count_operations(expr, only_type=None)
assert ops['adds'] == 1
assert ops['divs'] == 1
assert ops['muls'] == 99
def test_common_denominator(): def test_common_denominator():
x = sympy.symbols('x') x = sympy.symbols('x')
......
import sympy as sp
import numpy as np
import pystencils as ps
from pystencils import fields, TypedSymbol
from pystencils.astnodes import LoopOverCoordinate, SympyAssignment
from pystencils.typing import create_type
from pystencils.transformations import (
filtered_tree_iteration, get_loop_hierarchy, get_loop_counter_symbol_hierarchy,
iterate_loops_by_depth, split_inner_loop, loop_blocking
)
from pystencils.cpu import add_pragmas
def test_loop_information():
f, g = ps.fields("f, g: double[2D]")
update_rule = ps.Assignment(g[0, 0], f[0, 0])
ast = ps.create_kernel(update_rule)
inner_loops = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if loop.is_innermost_loop]
loop_order = []
for i in get_loop_hierarchy(inner_loops[0].args[0]):
loop_order.append(i)
assert loop_order == [0, 1]
loop_symbols = get_loop_counter_symbol_hierarchy(inner_loops[0].args[0])
assert loop_symbols == [TypedSymbol("ctr_1", create_type("int"), nonnegative=True),
TypedSymbol("ctr_0", create_type("int"), nonnegative=True)]
def test_iterate_loops_by_depth():
f, g = ps.fields("f, g: double[3D]", layout="fzyx")
x = ps.TypedSymbol('x', np.float64)
subs = [ps.Assignment(x, f[0, 0, 0])]
mains = [ps.Assignment(g[0, 0, 0], x)]
ac = ps.AssignmentCollection(mains, subexpressions=subs)
config = ps.CreateKernelConfig(cpu_blocking=(0, 16, 0))
ast = ps.create_kernel(ac, config=config)
split_inner_loop(ast, [[x], [g[0,0,0]]])
loops = list(iterate_loops_by_depth(ast, 0))
assert len(loops) == 1
assert loops[0].loop_counter_symbol.name == "_blockctr_1"
loops = list(iterate_loops_by_depth(ast, 1))
assert len(loops) == 1
assert loops[0].loop_counter_symbol.name == "ctr_2"
loops = list(iterate_loops_by_depth(ast, 2))
assert len(loops) == 1
assert loops[0].loop_counter_symbol.name == "ctr_1"
loops = list(iterate_loops_by_depth(ast, 3))
assert len(loops) == 2
assert loops[0].loop_counter_symbol.name == "ctr_0"
assert loops[1].loop_counter_symbol.name == "ctr_0"
innermost = list(iterate_loops_by_depth(ast, -1))
assert loops == innermost
def test_split_optimisation():
src, dst = fields(f"src(9), dst(9): [2D]", layout='fzyx')
stencil = ((0, 0), (0, 1), (0, -1), (-1, 0), (1, 0), (-1, 1), (1, 1), (-1, -1), (1, -1))
w = [sp.Rational(4, 9)]
w += [sp.Rational(1, 9)] * 4
w += [sp.Rational(1, 36)] * 4
cs = sp.Rational(1, 3)
subexpressions = []
main_assignements = []
rho = sp.symbols("rho")
velo = sp.symbols("u_:2")
density = 0
velocity_x = 0
velocity_y = 0
for d in stencil:
density += src[d]
velocity_x += d[0] * src[d]
velocity_y += d[1] * src[d]
subexpressions.append(ps.Assignment(rho, density))
subexpressions.append(ps.Assignment(velo[0], velocity_x))
subexpressions.append(ps.Assignment(velo[1], velocity_y))
for i, d in enumerate(stencil):
u_d = velo[0] * d[0] + velo[1] * d[1]
u_2 = velo[0] * velo[0] + velo[1] * velo[1]
expr = w[i] * rho * (1 + u_d / cs + u_d ** 2 / (2 * cs ** 2) - u_2 / (2 * cs))
main_assignements.append(ps.Assignment(dst.center_vector[i], expr))
ac = ps.AssignmentCollection(main_assignments=main_assignements, subexpressions=subexpressions)
simplification_hint = {'density': rho,
'velocity': (velo[0], velo[1]),
'split_groups': [[rho, velo[0], velo[1], dst.center_vector[0]],
[dst.center_vector[1], dst.center_vector[2]],
[dst.center_vector[3], dst.center_vector[4]],
[dst.center_vector[5], dst.center_vector[6]],
[dst.center_vector[7], dst.center_vector[8]]]}
ac.simplification_hints = simplification_hint
ast = ps.create_kernel(ac)
code = ps.get_code_str(ast)
# after the split optimisation the two for loops are split into 6
assert code.count("for") == 6
print(code)
def test_pragmas():
f, g = ps.fields("f, g: double[3D]", layout="fzyx")
x = ps.TypedSymbol('x', np.float64)
subs = [ps.Assignment(x, f[0, 0, 0])]
mains = [ps.Assignment(g[0, 0, 0], x)]
ac = ps.AssignmentCollection(mains, subexpressions=subs)
def prepend_omp_pragmas(ast):
add_pragmas(ast, ["#pragma omp for schedule(dynamic)"], nesting_depth=0)
add_pragmas(ast, ["#pragma omp simd simdlen(8)"], nesting_depth=-1)
ast_passes = [prepend_omp_pragmas]
config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_prepend_optimizations=ast_passes)
ast = ps.create_kernel(ac, config=config)
code = ps.get_code_str(ast)
assert code.find("#pragma omp for schedule(dynamic)") != -1
assert code.find("#pragma omp simd simdlen(8)") != -1
loops = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)]
innermost = list(filter(lambda n: n.is_innermost_loop, loops))
assert innermost[0].prefix_lines == ["#pragma omp simd simdlen(8)"]
outermost = list(filter(lambda n: n.is_outermost_loop, loops))
assert outermost[0].prefix_lines == ["#pragma omp for schedule(dynamic)"]
...@@ -26,6 +26,8 @@ def test_type_interference(): ...@@ -26,6 +26,8 @@ def test_type_interference():
assert 'const uint16_t f' in code assert 'const uint16_t f' in code
assert 'const int64_t e' in code assert 'const int64_t e' in code
assert 'const float d = ((float)(b)) + ((float)(c)) + ((float)(e)) + _data_x_00_10[_stride_x_2*ctr_2];' in code assert 'const float d = ((float)(b)) + ((float)(c)) + ((float)(e)) + ' \
assert '_data_x_00_10[_stride_x_2*ctr_2] = ((float)(b)) + ((float)(c)) + _data_x_00_10[_stride_x_2*ctr_2];' in code '_data_x[_stride_x_0*ctr_0 + _stride_x_1*ctr_1 + _stride_x_2*ctr_2];' in code
assert '_data_x[_stride_x_0*ctr_0 + _stride_x_1*ctr_1 + _stride_x_2*ctr_2] = (' \
'(float)(b)) + ((float)(c)) + _data_x[_stride_x_0*ctr_0 + _stride_x_1*ctr_1 + _stride_x_2*ctr_2];' in code
assert 'const double g = a + ((double)(b)) + ((double)(d));' in code assert 'const double g = a + ((double)(b)) + ((double)(d));' in code
...@@ -185,9 +185,11 @@ def test_integer_comparision(dtype): ...@@ -185,9 +185,11 @@ def test_integer_comparision(dtype):
# There should be an explicit cast for the integer zero to the type of the field on the rhs # There should be an explicit cast for the integer zero to the type of the field on the rhs
if dtype == 'float64': if dtype == 'float64':
t = "_data_f_00[_stride_f_1*ctr_1] = ((((dir) == (1))) ? (0.0): (_data_f_00[_stride_f_1*ctr_1]));" t = "_data_f[_stride_f_0*ctr_0 + _stride_f_1*ctr_1] = " \
"((((dir) == (1))) ? (0.0): (_data_f[_stride_f_0*ctr_0 + _stride_f_1*ctr_1]));"
else: else:
t = "_data_f_00[_stride_f_1*ctr_1] = ((((dir) == (1))) ? (0.0f): (_data_f_00[_stride_f_1*ctr_1]));" t = "_data_f[_stride_f_0*ctr_0 + _stride_f_1*ctr_1] = " \
"((((dir) == (1))) ? (0.0f): (_data_f[_stride_f_0*ctr_0 + _stride_f_1*ctr_1]));"
assert t in code assert t in code
......