Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 0 additions and 2287 deletions
from sympy import Symbol, Dummy
from pystencils import Field, Assignment
import random
import copy
def get_usage(atoms):
reg_usage = {}
for atom in atoms:
reg_usage[atom.lhs] = 0
for atom in atoms:
for arg in atom.rhs.atoms():
if isinstance(arg, Symbol) and not isinstance(arg, Field.Access):
if arg in reg_usage:
reg_usage[arg] += 1
else:
print(str(arg) + " is unsatisfied")
return reg_usage
def get_definitions(eqs):
definitions = {}
for eq in eqs:
definitions[eq.lhs] = eq
return definitions
def get_roots(eqs):
roots = []
for eq in eqs:
if isinstance(eq.lhs, Field.Access):
roots.append(eq.lhs)
if not roots:
roots.append(eqs[-1].lhs)
return roots
def merge_field_accesses(eqs):
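    """Replaces every distinct field access appearing on a right-hand side by a fresh
    Dummy symbol and prepends one load assignment per access, so that each field value
    is read only once."""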
field_accesses = {}
for eq in eqs:
for arg in eq.rhs.atoms():
if isinstance(arg, Field.Access) and arg not in field_accesses:
field_accesses[arg] = Dummy()
for i in range(0, len(eqs)):
for f, s in field_accesses.items():
if f in eqs[i].atoms():
eqs[i] = eqs[i].subs(f, s)
for f, s in field_accesses.items():
eqs.insert(0, Assignment(s, f))
return eqs
def refuse_eqs(input_eqs, max_depth=0, max_usage=1):
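    """Inlines ("re-fuses") cheap subexpressions back into their consumers.

    Definitions that are used at most `max_usage` times are inlined recursively (the
    recursion depth is bounded by `max_depth`) into assignments whose left-hand side is
    a field access or is used more than once. Assignments that become unused in the
    process are dropped afterwards."""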
eqs = copy.copy(input_eqs)
usages = get_usage(eqs)
definitions = get_definitions(eqs)
def inline_trivially_schedulable(sym, depth):
if sym not in usages or usages[sym] > max_usage or depth > max_depth:
return sym
rhs = definitions[sym].rhs
if len(rhs.args) == 0:
return rhs
return rhs.func(*[inline_trivially_schedulable(arg, depth + 1) for arg in rhs.args])
for idx, eq in enumerate(eqs):
if usages[eq.lhs] > 1 or isinstance(eq.lhs, Field.Access):
if not isinstance(eq.rhs, Symbol):
eqs[idx] = Assignment(eq.lhs,
eq.rhs.func(*[inline_trivially_schedulable(arg, 0) for arg in eq.rhs.args]))
count = 0
while (len(eqs) != count):
count = len(eqs)
usages = get_usage(eqs)
eqs = [eq for eq in eqs if usages[eq.lhs] > 0 or isinstance(eq.lhs, Field.Access)]
return eqs
def schedule_eqs(eqs, candidate_count=20):
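    """Reorders the assignments to reduce the maximum number of simultaneously live values.

    Beam search over reverse scheduling orders: starting from the root assignments
    (writes to field accesses), a definition becomes schedulable once all of its
    consumers have been scheduled. Per level only the `candidate_count` states with
    the fewest live values are kept, and the schedule with the smallest peak count of
    live values is returned."""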
if candidate_count == 0:
return eqs
definitions = get_definitions(eqs)
definition_atoms = {}
for sym, definition in definitions.items():
definition_atoms[sym] = list(definition.rhs.atoms(Symbol))
roots = get_roots(eqs)
initial_usages = get_usage(eqs)
level = 0
current_level_set = set([frozenset(roots)])
current_usages = {frozenset(roots): {u: 0 for u in roots}}
current_schedules = {frozenset(roots): (0, [])}
max_regs = 0
while len(current_level_set) > 0:
new_usages = dict()
new_schedules = dict()
new_level_set = set()
min_regs = min([len(current_usages[dec_set]) for dec_set in current_level_set])
max_regs = max(max_regs, min_regs)
candidates = [(dec_set, len(current_usages[dec_set])) for dec_set in current_level_set]
random.shuffle(candidates)
candidates.sort(key=lambda d: d[1])
for dec_set, regs in candidates[:candidate_count]:
for dec in dec_set:
new_dec_set = set(dec_set)
new_dec_set.remove(dec)
usage = dict(current_usages[dec_set])
usage.pop(dec)
atoms = definition_atoms[dec]
for arg in atoms:
if not isinstance(arg, Field.Access):
argu = usage.get(arg, initial_usages[arg]) - 1
if argu == 0:
new_dec_set.add(arg)
usage[arg] = argu
frozen_new_dec_set = frozenset(new_dec_set)
schedule = current_schedules[dec_set]
max_reg_count = max(len(usage), schedule[0])
if frozen_new_dec_set not in new_schedules or max_reg_count < new_schedules[frozen_new_dec_set][0]:
new_schedule = list(schedule[1])
new_schedule.append(definitions[dec])
new_schedules[frozen_new_dec_set] = (max_reg_count, new_schedule)
if len(frozen_new_dec_set) > 0:
new_level_set.add(frozen_new_dec_set)
new_usages[frozen_new_dec_set] = usage
current_schedules = new_schedules
current_usages = new_usages
current_level_set = new_level_set
level += 1
schedule = current_schedules[frozenset()]
schedule[1].reverse()
    return schedule[1]
def liveness_opt_transformation(eqs):
return refuse_eqs(merge_field_accesses(schedule_eqs(eqs, 3)), 1, 3)
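

# Usage sketch (illustrative only): the fields `dst`, `src` and the symbols `a`, `b`
# below are hypothetical and merely show how the transformation is applied to a plain
# list of assignments.
if __name__ == '__main__':
    import sympy as sp
    import pystencils as ps

    dst, src = ps.fields("dst, src: double[2D]")
    a, b = sp.symbols("a b")
    example_eqs = [Assignment(a, src[0, 1] + src[0, -1]),
                   Assignment(b, src[1, 0] + src[-1, 0]),
                   Assignment(dst[0, 0], a + b)]
    for assignment in liveness_opt_transformation(example_eqs):
        print(assignment)
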
import sympy as sp
from typing import Callable, List
from pystencils import Field
from pystencils.assignment import Assignment
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.sympyextensions import subs_additive
AC = AssignmentCollection
def sympy_cse(ac: AC) -> AC:
"""Searches for common subexpressions inside the equation collection.
Searches is done in both the existing subexpressions as well as the assignments themselves.
It uses the sympy subexpression detection to do this. Return a new equation collection
with the additional subexpressions found
"""
symbol_gen = ac.subexpression_symbol_generator
replacements, new_eq = sp.cse(ac.subexpressions + ac.main_assignments,
symbols=symbol_gen)
replacement_eqs = [Assignment(*r) for r in replacements]
modified_subexpressions = new_eq[:len(ac.subexpressions)]
modified_update_equations = new_eq[len(ac.subexpressions):]
new_subexpressions = replacement_eqs + modified_subexpressions
topologically_sorted_pairs = sp.cse_main.reps_toposort([[e.lhs, e.rhs] for e in new_subexpressions])
new_subexpressions = [Assignment(a[0], a[1]) for a in topologically_sorted_pairs]
return ac.copy(modified_update_equations, new_subexpressions)
def sympy_cse_on_assignment_list(assignments: List[Assignment]) -> List[Assignment]:
"""Extracts common subexpressions from a list of assignments."""
ec = AC([], assignments)
return sympy_cse(ec).all_assignments
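
# Usage sketch (illustrative; the symbols x, y, a, b are hypothetical):
#
#     x, y, a, b = sp.symbols("x y a b")
#     sympy_cse_on_assignment_list([Assignment(a, 2 * x * y + 1),
#                                   Assignment(b, 2 * x * y - 1)])
#
# The shared product 2*x*y is extracted into a subexpression assignment and both
# original assignments are rewritten in terms of the new subexpression symbol.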
def subexpression_substitution_in_existing_subexpressions(ac: AC) -> AC:
"""Goes through the subexpressions list and replaces the term in the following subexpressions."""
result = []
for outer_ctr, s in enumerate(ac.subexpressions):
new_rhs = s.rhs
for inner_ctr in range(outer_ctr):
sub_expr = ac.subexpressions[inner_ctr]
new_rhs = subs_additive(new_rhs, sub_expr.lhs, sub_expr.rhs, required_match_replacement=1.0)
new_rhs = new_rhs.subs(sub_expr.rhs, sub_expr.lhs)
result.append(Assignment(s.lhs, new_rhs))
return ac.copy(ac.main_assignments, result)
def subexpression_substitution_in_main_assignments(ac: AC) -> AC:
"""Replaces already existing subexpressions in the equations of the assignment_collection."""
result = []
for s in ac.main_assignments:
new_rhs = s.rhs
for sub_expr in ac.subexpressions:
new_rhs = subs_additive(new_rhs, sub_expr.lhs, sub_expr.rhs, required_match_replacement=1.0)
result.append(Assignment(s.lhs, new_rhs))
return ac.copy(result)
def add_subexpressions_for_divisions(ac: AC) -> AC:
r"""Introduces subexpressions for all divisions which have no constant in the denominator.
For example :math:`\frac{1}{x}` is replaced while :math:`\frac{1}{3}` is not replaced.
"""
divisors = set()
def search_divisors(term):
if term.func == sp.Pow:
if term.exp.is_integer and term.exp.is_number and term.exp < 0:
divisors.add(term)
else:
for a in term.args:
search_divisors(a)
for eq in ac.all_assignments:
search_divisors(eq.rhs)
new_symbol_gen = ac.subexpression_symbol_generator
substitutions = {divisor: new_symbol for new_symbol, divisor in zip(new_symbol_gen, divisors)}
return ac.new_with_substitutions(substitutions, True)
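
# Usage sketch (illustrative; x, y, z are hypothetical symbols):
#
#     x, y, z = sp.symbols("x y z")
#     add_subexpressions_for_divisions(AC([Assignment(z, 1 / x + y / x)], []))
#
# A subexpression is introduced for x**(-1) and both occurrences of 1/x reuse it,
# while purely numeric denominators such as 1/3 are left untouched.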
def add_subexpressions_for_field_reads(ac: AC, subexpressions=True, main_assignments=True) -> AC:
r"""Substitutes field accesses on rhs of assignments with subexpressions
Can change semantics of the update rule (which is the goal of this transformation)
This is useful if a field should be update in place - all values are loaded before into subexpression variables,
then the new values are computed and written to the same field in-place.
"""
field_reads = set()
if subexpressions:
for assignment in ac.subexpressions:
field_reads.update(assignment.rhs.atoms(Field.Access))
if main_assignments:
for assignment in ac.main_assignments:
field_reads.update(assignment.rhs.atoms(Field.Access))
substitutions = {fa: sp.Dummy() for fa in field_reads}
return ac.new_with_substitutions(substitutions, add_substitutions_as_subexpressions=True, substitute_on_lhs=False)
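
# Usage sketch (illustrative; `f` is a hypothetical field that is updated in place):
#
#     f = Field.create_generic('f', spatial_dimensions=2)
#     ac = AC([Assignment(f[0, 0], (f[0, 1] + f[0, -1]) / 2)], [])
#     add_subexpressions_for_field_reads(ac)
#
# The reads f[0, 1] and f[0, -1] are loaded into subexpression variables first, so the
# write to f[0, 0] can safely target the same field.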
def apply_to_all_assignments(operation: Callable[[sp.Expr], sp.Expr]) -> Callable[[AC], AC]:
"""Applies sympy expand operation to all equations in collection."""
def f(assignment_collection: AC) -> AC:
result = [Assignment(eq.lhs, operation(eq.rhs)) for eq in assignment_collection.main_assignments]
return assignment_collection.copy(result)
f.__name__ = operation.__name__
return f
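
# Usage sketch (illustrative): sympy's expand is the typical operation passed in here;
# `some_assignment_collection` is a hypothetical AssignmentCollection.
#
#     expand_all = apply_to_all_assignments(sp.expand)
#     expanded_ac = expand_all(some_assignment_collection)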
def apply_on_all_subexpressions(operation: Callable[[sp.Expr], sp.Expr]) -> Callable[[AC], AC]:
"""Applies the given operation on all subexpressions of the AC."""
def f(ac: AC) -> AC:
result = [Assignment(eq.lhs, operation(eq.rhs)) for eq in ac.subexpressions]
return ac.copy(ac.main_assignments, result)
f.__name__ = operation.__name__
return f
# Disable the gmpy backend until this bug is resolved, since it breaks serialization with joblib
# See https://github.com/sympy/sympy/pull/13530
import os
import warnings
os.environ['MPMATH_NOGMPY'] = '1'
try:
import mpmath.libmp
# In case the user has imported sympy first, then pystencils
if mpmath.libmp.BACKEND == 'gmpy':
warnings.warn("You are using the gmpy backend. You might encounter an error 'argument is not an mpz sympy'. "
"This is due to a known bug in sympy/gmpy library. "
"To prevent this, import pystencils first then sympy or set the environment variable "
"MPMATH_NOGMPY=1")
except ImportError:
pass
__all__ = []
# FIXME
# FIXME performance counters might be wrong. This will only affect the Benchmark model
# FIXME bandwidth measurements need validation
# FIXME
kerncraft version: 0.7.2
model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
model type: Intel Core Skylake SP
sockets: 2
cores per socket: 4
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 4
clock: 3.6 GHz
FLOPs per cycle:
SP:
total: 64
FMA: 64
ADD: 32
MUL: 32
DP:
total: 32
FMA: 32
ADD: 16
MUL: 16
micro-architecture: SKX
compiler:
!!omap
- icc: -O3 -fno-alias -xCORE-AVX512
- clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
- gcc: -O3 -march=skylake-avx512
cacheline size: 64 B
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
memory hierarchy:
- level: L1
performance counter metrics:
accesses: MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L2_TRANS_L1D_WB:PMC[0-3]
cache per group:
sets: 64
ways: 8
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: L2
store_to: L2
size per group: 32.00 kB
groups: 8
cores per group: 1
threads per group: 2
- level: L2
non-overlap upstream throughput: [64 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 1024
ways: 16
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: null # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
victims_to: L3 # all victims, modified or not are passed onto L3
store_to: L3
size per group: 1.00 MB
groups: 8
cores per group: 1
threads per group: 2
- level: L3
non-overlap upstream throughput: [16 B/cy, 'full-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
# FIXME not all misses in L2 lead to loads from L3, only the hits do
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 16896
      # TODO is actually something else, but necessary to get to 16.5 MB
ways: 16
# TODO is actually 11, but pycachesim only supports powers of two
cl_size: 64
replacement_policy: 'LRU'
write_allocate: False
write_back: True
size per group: 16.50 MB
groups: 2
cores per group: 4
threads per group: 8
- level: MEM
cores per group: 4
threads per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
penalty cycles per read stream: 0
size per group:
benchmarks:
kernels:
load:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 0
bytes: 0.00 B
FLOPs per iteration: 0
copy:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
update:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
triad:
read streams:
streams: 3
bytes: 24.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
daxpy:
read streams:
streams: 2
bytes: 16.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
measurements:
L1:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 42.98 GB/s
- 85.08 GB/s
- 127.45 GB/s
- 169.92 GB/s
copy:
- 56.07 GB/s
- 111.50 GB/s
- 164.90 GB/s
- 221.50 GB/s
update:
- 56.54 GB/s
- 112.25 GB/s
- 168.50 GB/s
- 224.75 GB/s
triad:
- 45.90 GB/s
- 89.81 GB/s
- 127.29 GB/s
- 169.57 GB/s
daxpy:
- 36.62 GB/s
- 71.30 GB/s
- 103.52 GB/s
- 135.26 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 10.56 kB
- 10.56 kB
- 10.56 kB
- 10.56 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 49.61 GB/s
- 98.80 GB/s
- 147.98 GB/s
- 198.22 GB/s
copy:
- 55.98 GB/s
- 111.56 GB/s
- 167.08 GB/s
- 220.42 GB/s
update:
- 56.53 GB/s
- 112.72 GB/s
- 168.95 GB/s
- 225.31 GB/s
triad:
- 54.01 GB/s
- 104.58 GB/s
- 153.02 GB/s
- 200.93 GB/s
daxpy:
- 41.11 GB/s
- 80.28 GB/s
- 115.71 GB/s
- 152.81 GB/s
L2:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 27.15 GB/s
- 54.09 GB/s
- 80.61 GB/s
- 106.41 GB/s
copy:
- 43.53 GB/s
- 90.07 GB/s
- 127.73 GB/s
- 171.81 GB/s
update:
- 50.38 GB/s
- 98.47 GB/s
- 147.91 GB/s
- 197.20 GB/s
triad:
- 43.38 GB/s
- 83.72 GB/s
- 124.83 GB/s
- 166.04 GB/s
daxpy:
- 36.29 GB/s
- 71.29 GB/s
- 103.33 GB/s
- 136.48 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 330.00 kB
- 330.00 kB
- 330.00 kB
- 330.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 35.29 GB/s
- 70.28 GB/s
- 104.67 GB/s
- 139.63 GB/s
copy:
- 42.23 GB/s
- 83.70 GB/s
- 124.33 GB/s
- 167.50 GB/s
update:
- 50.09 GB/s
- 99.77 GB/s
- 149.87 GB/s
- 198.82 GB/s
triad:
- 52.38 GB/s
- 100.00 GB/s
- 147.40 GB/s
- 193.31 GB/s
daxpy:
- 41.14 GB/s
- 80.22 GB/s
- 116.23 GB/s
- 155.08 GB/s
L3:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 22.40 GB/s
- 44.77 GB/s
- 65.71 GB/s
- 89.26 GB/s
copy:
- 25.32 GB/s
- 49.70 GB/s
- 72.89 GB/s
- 98.62 GB/s
update:
- 41.24 GB/s
- 81.14 GB/s
- 122.22 GB/s
- 166.44 GB/s
triad:
- 25.61 GB/s
- 50.02 GB/s
- 73.23 GB/s
- 98.95 GB/s
daxpy:
- 32.07 GB/s
- 62.65 GB/s
- 89.91 GB/s
- 120.65 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 5.28 MB
- 2.64 MB
- 1.76 MB
- 1.32 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 26.18 GB/s
- 51.85 GB/s
- 75.82 GB/s
- 101.39 GB/s
copy:
- 26.22 GB/s
- 51.83 GB/s
- 76.40 GB/s
- 102.84 GB/s
update:
- 43.51 GB/s
- 86.75 GB/s
- 129.86 GB/s
- 174.54 GB/s
triad:
- 26.39 GB/s
- 51.80 GB/s
- 76.27 GB/s
- 102.66 GB/s
daxpy:
- 37.43 GB/s
- 73.16 GB/s
- 106.53 GB/s
- 142.76 GB/s
MEM:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 12.03 GB/s
- 24.38 GB/s
- 34.83 GB/s
- 45.05 GB/s
copy:
- 12.32 GB/s
- 24.40 GB/s
- 32.82 GB/s
- 37.00 GB/s
update:
- 20.83 GB/s
- 40.25 GB/s
- 48.81 GB/s
- 54.84 GB/s
triad:
- 11.64 GB/s
- 23.17 GB/s
- 34.78 GB/s
- 42.97 GB/s
daxpy:
- 17.69 GB/s
- 34.02 GB/s
- 48.12 GB/s
- 55.73 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 120.00 MB
- 60.00 MB
- 40.00 MB
- 30.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 15.33 GB/s
- 28.32 GB/s
- 41.34 GB/s
- 53.02 GB/s
copy:
- 13.96 GB/s
- 26.61 GB/s
- 34.39 GB/s
- 38.96 GB/s
update:
- 26.47 GB/s
- 47.82 GB/s
- 56.70 GB/s
- 62.78 GB/s
triad:
- 14.42 GB/s
- 26.66 GB/s
- 36.94 GB/s
- 44.01 GB/s
daxpy:
- 20.96 GB/s
- 39.12 GB/s
- 51.55 GB/s
- 58.37 GB/s
import os
import math
import time
import numpy as np
import sympy as sp
from influxdb import InfluxDBClient
from git import Repo
from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
from kerncraft.machinemodel import MachineModel
from kerncraft.prefixedunit import PrefixedUnit
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
from pystencils import Field, Assignment, create_kernel
def outputBenchmark(analysis):
output = {}
keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
copies = {key: analysis[key] for key in keys}
output.update(copies)
for cache, metrics in analysis['data transfers'].items():
for metric_name, metric_value in metrics.items():
fixed = metric_value.with_prefix('')
output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
for level, value in analysis['ECM'].items():
output['Phenomenological ECM ' + level + ' cy/CL'] = value
return output
def outputECM(analysis):
output = {}
keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
copies = {key: analysis[key] for key in keys}
output.update(copies)
if 'memory bandwidth kernel' in analysis:
output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
for key, value in analysis['cycles']:
output[key] = value
return output
def outputRoofline(analysis):
output = {}
keys = ['min performance']#'bottleneck level'
copies = {key: analysis[key] for key in keys}
output.update(copies)
# TODO save bottleneck information (compute it here)
#fixed = analysis['max_flops'].with_prefix('G')
#output['max GFlop/s'] = fixed.value
#if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
#else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def outputRooflineIACA(analysis):
output = {}
keys = ['min performance'] #'bottleneck level'
copies = {key: analysis[key] for key in keys}
#output.update(copies)
# TODO save bottleneck information (compute it here)
#fixed = analysis['max_flops'].with_prefix('G')
#output['max GFlop/s'] = fixed.value
#if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
#else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def reportAnalysis(ast, models, machine, tags, fields=None):
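    """Runs the given kerncraft models on the kernel AST and writes the results,
    together with the current git commit hash, to the InfluxDB instance that backs
    the Grafana dashboard."""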
kernel = PyStencilsKerncraftKernel(ast, machine)
client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
'roggan', 'pystencils')
repo = Repo(search_parent_directories=True)
commit = repo.head.commit
point_time = int(time.time())
for model in models:
benchmark = model(kernel, machine, KerncraftParameters())
benchmark.analyze()
analysis = benchmark.results
if model is Benchmark:
output = outputBenchmark(analysis)
elif model is ECM:
output = outputECM(analysis)
elif model is Roofline:
output = outputRoofline(analysis)
elif model is RooflineIACA:
output = outputRooflineIACA(analysis)
else:
raise ValueError('No valid model for analysis given!')
if fields is not None:
output.update(fields)
output['commit'] = commit.hexsha
json_body = [
{
'measurement': model.__name__,
'tags': tags,
'time': point_time,
'fields': output
}
]
client.write_points(json_body, time_precision='s')
def main():
size = [20, 200, 200]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + \
a[-1, 0, 0] + a[1, 0, 0] + \
a[0, 0, -1] + a[0, 0, 1]
updateRule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([updateRule])
INPUT_FOLDER = "./"
machineFilePath = os.path.join(INPUT_FOLDER, "SkylakeSP_Gold-5122_allinclusive.yaml")
machine = MachineModel(path_to_yaml=machineFilePath)
tags = {
'host': os.uname()[1],
'project': 'pystencils',
'kernel': 'jacobi_3D ' + str(size)
}
reportAnalysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
if __name__ == '__main__':
main()
while False:
main()
time.sleep(3600)
import sympy as sp
import numpy as np
from pystencils import Field, Assignment, create_kernel
def measure():
size = [30, 50, 3]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
updateRule = Assignment(b[0, 0], s * rhs)
print(updateRule)
ast = create_kernel([updateRule])
# benchmark = generate_benchmark(ast)
# main = benchmark[0]
# kernel = benchmark[1]
# with open('src/main.cpp', 'w') as file:
# file.write(main)
# with open('src/kernel.cpp', 'w') as file:
# file.write(kernel)
func = ast.compile({'omega': 2/3})
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.kerncraft_coupling import BenchmarkAnalysis
from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECMData
machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
machine = MachineModel(path_to_yaml=machineFilePath)
benchmark = BenchmarkAnalysis(ast, machine)
    # TODO what do I want to do with the benchmark?
kernel = PyStencilsKerncraftKernel(ast)
model = ECMData(kernel, machine, KerncraftParameters())
model.analyze()
model.report()
if __name__ == "__main__":
    measure()
/*
* Copyright (2008-2009) Intel Corporation All Rights Reserved.
* The source code contained or described herein and all documents
* related to the source code ("Material") are owned by Intel Corporation
* or its suppliers or licensors. Title to the Material remains with
* Intel Corporation or its suppliers and licensors. The Material
* contains trade secrets and proprietary and confidential information
* of Intel or its suppliers and licensors. The Material is protected
* by worldwide copyright and trade secret laws and treaty provisions.
* No part of the Material may be used, copied, reproduced, modified,
* published, uploaded, posted, transmitted, distributed, or disclosed
* in any way without Intel(R)s prior express written permission.
*
* No license under any patent, copyright, trade secret or other
* intellectual property right is granted to or conferred upon you by
* disclosure or delivery of the Materials, either expressly, by implication,
* inducement, estoppel or otherwise. Any license under such intellectual
* property rights must be express and approved by Intel in writing.
*/
#if defined (__GNUC__)
#define IACA_SSC_MARK( MARK_ID ) \
__asm__ __volatile__ ( \
"\n\t movl $"#MARK_ID", %%ebx" \
"\n\t .byte 0x64, 0x67, 0x90" \
: : : "memory" );
#else
#define IACA_SSC_MARK(x) {__asm mov ebx, x\
__asm _emit 0x64 \
__asm _emit 0x67 \
__asm _emit 0x90 }
#endif
#define IACA_START {IACA_SSC_MARK(111)}
#define IACA_END {IACA_SSC_MARK(222)}
#ifdef _WIN64
#include <intrin.h>
#define IACA_VC64_START __writegsbyte(111, 111);
#define IACA_VC64_END __writegsbyte(222, 222);
#endif
/**************** asm *****************
;START_MARKER
mov ebx, 111
db 0x64, 0x67, 0x90
;END_MARKER
mov ebx, 222
db 0x64, 0x67, 0x90
**************************************/
#include "iacaMarks.h"
int main(int argc, char * argv[]){
int a = 0;
for(int i = 0; i < argc+100000; i++){
IACA_START
a += i;
}
IACA_END
return a;
}
double a[30][50][3];
double b[30][50][3];
double s;
for(int j=1; j<30-1; ++j)
for(int i=1; i<50-1; ++i)
b[j][i] = ( a[j][i-1] + a[j][i+1]
+ a[j-1][i] + a[j+1][i]) * s;
double a[M][N][N];
double b[M][N][N];
double s;
for(int k=1; k<M-1; ++k)
for(int j=1; j<N-1; ++j)
for(int i=1; i<N-1; ++i)
b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
+ a[k][j-1][i] + a[k][j+1][i]
+ a[k-1][j][i] + a[k+1][j][i]) * s;
kerncraft version: 0.7.3
clock: 2.7 GHz
cores per socket: 8
cores per NUMA domain: 8
NUMA domains per socket: 1
model type: Intel Core SandyBridge EP processor
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
sockets: 2
threads per core: 2
cacheline size: 64 B
compiler:
!!omap
- icc: -O3 -xAVX -fno-alias -qopenmp
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
micro-architecture: SNB
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
write-allocate: True
memory hierarchy:
- level: L1
cache per group: {
'sets': 64, 'ways': 8, 'cl_size': 64, # 32 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L2', 'store_to': 'L2'}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
- level: L2
cache per group: {
'sets': 512, 'ways': 8, 'cl_size': 64, # 256 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L3', 'store_to': 'L3'}
cores per group: 1
threads per group: 2
groups: 16
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {
'sets': 20480, 'ways': 16, 'cl_size': 64, # 20 MB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True}
cores per group: 8
threads per group: 16
groups: 2
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
size per group: null
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [81.98 GB/s, 163.75 GB/s, 245.62 GB/s, 327.69 GB/s, 409.41 GB/s, 489.83
GB/s, 571.67 GB/s, 653.50 GB/s]
daxpy: [71.55 GB/s, 143.01 GB/s, 214.86 GB/s, 286.26 GB/s, 355.60 GB/s,
426.71 GB/s, 497.45 GB/s, 568.97 GB/s]
load: [61.92 GB/s, 122.79 GB/s, 183.01 GB/s, 244.30 GB/s, 306.76 GB/s, 368.46
GB/s, 427.41 GB/s, 490.88 GB/s]
triad: [81.61 GB/s, 163.25 GB/s, 244.92 GB/s, 326.65 GB/s, 406.69 GB/s,
487.76 GB/s, 569.10 GB/s, 650.39 GB/s]
update: [84.03 GB/s, 168.02 GB/s, 252.10 GB/s, 335.94 GB/s, 419.90 GB/s,
503.88 GB/s, 587.86 GB/s, 671.88 GB/s]
size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
16.00 kB, 16.00 kB]
size per thread: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00
kB, 16.00 kB, 16.00 kB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
kB, 128.00 kB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [79.53 GB/s, 158.70 GB/s, 238.20 GB/s, 317.62 GB/s, 397.09 GB/s, 476.33
GB/s, 555.69 GB/s, 634.96 GB/s]
daxpy: [70.94 GB/s, 141.90 GB/s, 212.97 GB/s, 283.91 GB/s, 354.93 GB/s,
425.85 GB/s, 496.74 GB/s, 567.40 GB/s]
load: [57.01 GB/s, 114.11 GB/s, 171.11 GB/s, 228.13 GB/s, 285.15 GB/s, 342.11
GB/s, 399.11 GB/s, 456.11 GB/s]
triad: [79.48 GB/s, 159.03 GB/s, 238.53 GB/s, 318.04 GB/s, 392.11 GB/s,
477.10 GB/s, 538.36 GB/s, 636.02 GB/s]
update: [82.75 GB/s, 165.55 GB/s, 248.50 GB/s, 331.32 GB/s, 414.06 GB/s,
496.82 GB/s, 579.83 GB/s, 662.36 GB/s]
size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
16.00 kB, 16.00 kB]
size per thread: [8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00
kB, 8.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
kB, 128.00 kB]
L2:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [41.28 GB/s, 81.96 GB/s, 120.28 GB/s, 160.70 GB/s, 203.22 GB/s, 239.97
GB/s, 271.13 GB/s, 307.01 GB/s]
daxpy: [48.85 GB/s, 98.62 GB/s, 143.29 GB/s, 197.76 GB/s, 230.58 GB/s, 284.98
GB/s, 334.22 GB/s, 385.72 GB/s]
load: [38.51 GB/s, 76.67 GB/s, 114.73 GB/s, 152.90 GB/s, 188.69 GB/s, 223.64
GB/s, 265.21 GB/s, 289.41 GB/s]
triad: [40.92 GB/s, 83.49 GB/s, 124.48 GB/s, 165.24 GB/s, 206.74 GB/s, 237.90
GB/s, 274.96 GB/s, 329.09 GB/s]
update: [50.37 GB/s, 100.05 GB/s, 145.43 GB/s, 196.82 GB/s, 244.07 GB/s,
301.62 GB/s, 336.88 GB/s, 403.78 GB/s]
size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
size per thread: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
kB, 0.90 MB, 1.02 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [42.17 GB/s, 83.47 GB/s, 124.57 GB/s, 163.78 GB/s, 202.56 GB/s, 242.80
GB/s, 276.95 GB/s, 311.36 GB/s]
daxpy: [50.87 GB/s, 98.72 GB/s, 152.12 GB/s, 193.48 GB/s, 251.36 GB/s, 301.72
GB/s, 352.55 GB/s, 365.28 GB/s]
load: [39.62 GB/s, 79.03 GB/s, 118.03 GB/s, 157.85 GB/s, 196.48 GB/s, 237.44
GB/s, 276.81 GB/s, 309.71 GB/s]
triad: [44.80 GB/s, 88.35 GB/s, 125.13 GB/s, 169.94 GB/s, 209.60 GB/s, 260.15
GB/s, 300.75 GB/s, 333.08 GB/s]
update: [49.80 GB/s, 100.70 GB/s, 150.56 GB/s, 196.44 GB/s, 251.90 GB/s,
280.93 GB/s, 352.74 GB/s, 399.27 GB/s]
size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
size per thread: [64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00
kB, 64.00 kB, 64.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
kB, 0.90 MB, 1.02 MB]
L3:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.21 GB/s, 46.01 GB/s, 67.96 GB/s, 90.17 GB/s, 111.47 GB/s, 133.14
GB/s, 153.84 GB/s, 174.92 GB/s]
daxpy: [30.35 GB/s, 60.32 GB/s, 90.00 GB/s, 119.71 GB/s, 148.87 GB/s, 178.39
GB/s, 207.10 GB/s, 236.25 GB/s]
load: [23.35 GB/s, 46.52 GB/s, 69.57 GB/s, 92.60 GB/s, 115.77 GB/s, 138.89
GB/s, 161.82 GB/s, 184.11 GB/s]
triad: [25.18 GB/s, 50.08 GB/s, 74.33 GB/s, 98.78 GB/s, 122.66 GB/s, 146.78
GB/s, 170.52 GB/s, 194.47 GB/s]
update: [32.67 GB/s, 64.65 GB/s, 95.98 GB/s, 127.29 GB/s, 157.67 GB/s, 188.22
GB/s, 217.41 GB/s, 246.99 GB/s]
size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
size per thread: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
10.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.83 GB/s, 47.25 GB/s, 69.84 GB/s, 92.61 GB/s, 114.31 GB/s, 136.48
GB/s, 157.55 GB/s, 178.99 GB/s]
daxpy: [31.52 GB/s, 62.72 GB/s, 93.43 GB/s, 124.29 GB/s, 154.55 GB/s, 185.18
GB/s, 215.10 GB/s, 245.24 GB/s]
load: [27.63 GB/s, 54.93 GB/s, 81.57 GB/s, 108.63 GB/s, 134.91 GB/s, 161.72
GB/s, 188.15 GB/s, 214.94 GB/s]
triad: [25.90 GB/s, 51.76 GB/s, 76.73 GB/s, 102.29 GB/s, 126.17 GB/s, 152.10
GB/s, 176.71 GB/s, 200.64 GB/s]
update: [34.10 GB/s, 67.67 GB/s, 100.62 GB/s, 133.50 GB/s, 165.61 GB/s,
197.74 GB/s, 228.73 GB/s, 259.05 GB/s]
size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
size per thread: [625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00
kB, 625.00 kB, 625.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
10.00 MB]
MEM:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [11.60 GB/s, 21.29 GB/s, 25.94 GB/s, 27.28 GB/s, 27.47 GB/s, 27.36
GB/s, 27.21 GB/s, 27.12 GB/s]
daxpy: [17.33 GB/s, 31.89 GB/s, 38.65 GB/s, 40.50 GB/s, 40.81 GB/s, 40.62
GB/s, 40.59 GB/s, 40.26 GB/s]
load: [12.01 GB/s, 23.04 GB/s, 32.79 GB/s, 40.21 GB/s, 43.39 GB/s, 44.14
GB/s, 44.42 GB/s, 44.40 GB/s]
triad: [12.73 GB/s, 24.27 GB/s, 30.43 GB/s, 31.46 GB/s, 31.77 GB/s, 31.74
GB/s, 31.65 GB/s, 31.52 GB/s]
update: [18.91 GB/s, 32.43 GB/s, 37.28 GB/s, 39.98 GB/s, 40.99 GB/s, 40.92
GB/s, 40.61 GB/s, 40.34 GB/s]
size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
size per thread: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
        total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [10.92 GB/s, 20.62 GB/s, 25.34 GB/s, 26.22 GB/s, 26.32 GB/s, 26.31
GB/s, 26.22 GB/s, 26.16 GB/s]
daxpy: [17.15 GB/s, 31.96 GB/s, 38.12 GB/s, 39.19 GB/s, 39.38 GB/s, 39.16
GB/s, 39.06 GB/s, 38.87 GB/s]
load: [13.49 GB/s, 25.92 GB/s, 36.16 GB/s, 41.56 GB/s, 43.34 GB/s, 43.40
GB/s, 43.01 GB/s, 42.66 GB/s]
triad: [12.38 GB/s, 23.17 GB/s, 28.69 GB/s, 29.98 GB/s, 30.50 GB/s, 30.59
GB/s, 30.75 GB/s, 30.70 GB/s]
update: [19.67 GB/s, 34.93 GB/s, 39.93 GB/s, 40.79 GB/s, 40.43 GB/s, 40.03
GB/s, 39.62 GB/s, 39.33 GB/s]
size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
size per thread: [20.00 MB, 10.00 MB, 6.67 MB, 5.00 MB, 4.00 MB, 3.33 MB,
2.86 MB, 2.50 MB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
        total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
import sympy as sp
from pystencils import Assignment, AssignmentCollection
from pystencils.simp.assignment_collection import SymbolGen
def test_assignment_collection():
x, y, z, t = sp.symbols("x y z t")
symbol_gen = SymbolGen("a")
ac = AssignmentCollection([Assignment(z, x + y)],
[], subexpression_symbol_generator=symbol_gen)
lhs = ac.add_subexpression(t)
assert lhs == sp.Symbol("a_0")
ac.subexpressions.append(Assignment(t, 3))
ac.topological_sort(sort_main_assignments=False, sort_subexpressions=True)
assert ac.subexpressions[0].lhs == t
assert ac.new_with_inserted_subexpression(sp.Symbol("not_defined")) == ac
ac_inserted = ac.new_with_inserted_subexpression(t)
ac_inserted2 = ac.new_without_subexpressions({lhs})
assert all(a == b for a, b in zip(ac_inserted.all_assignments, ac_inserted2.all_assignments))
print(ac_inserted)
assert ac_inserted.subexpressions[0] == Assignment(lhs, 3)
assert 'a_0' in str(ac_inserted)
assert '<table' in ac_inserted._repr_html_()
%% Cell type:markdown id: tags:
# pystencils - LLVM generation
The generation of LLVM code is similar to, but not identical to, the C++ version. For the generation itself the Python module ``llvmlite`` is used, which provides the necessary support and bindings for LLVM. In order to generate LLVM from the AST, the AST needs to be transformed to support type conversions. This is the biggest difference to the C++ version: C++ does not need that step since the language itself handles the casts.
In this example a simple weighted Jacobi kernel is generated, so the focus remains on the LLVM generation itself.
%% Cell type:code id: tags:
``` python
import sympy as sp
import numpy as np
import ctypes
from pystencils import Field, Assignment
from pystencils import create_kernel
from pystencils.display_utils import to_dot
sp.init_printing()
```
%% Cell type:markdown id: tags:
The numpy arrays (with initial values) are used to create *Field*s for the update rule. Later those arrays are used for the computation itself.
%% Cell type:code id: tags:
``` python
src_arr = np.zeros((30, 20))
src_arr[0,:] = 1.0
src_arr[:,0] = 1.0
dst_arr = src_arr.copy()
src_field = Field.create_from_numpy_array('src', src_arr)
dst_field = Field.create_from_numpy_array('dst', dst_arr)
```
%% Cell type:markdown id: tags:
Using the *Field* objects and the additional *Symbol* $\omega$ for the weight, the update rule is specified as a *sympy* equation.
%% Cell type:code id: tags:
``` python
omega = sp.symbols("omega")
update_rule = Assignment(dst_field[0,0], omega * (src_field[0,1] + src_field[0,-1] + src_field[1,0] + src_field[-1,0]) / 4
+ (1.-omega)*src_field[0,0])
update_rule
```
%% Output
dst_C := src_C⋅(-ω + 1.0) + ω⋅(src_E + src_N + src_S + src_W)/4
%% Cell type:markdown id: tags:
With this update rule an abstract syntax tree (AST) can be created. This AST can be used to print the LLVM code. The creation follows the same routines as the C++ version. However, at the end there are two more steps: in order to generate LLVM, type casting and pointer arithmetic have to be introduced explicitly (which C++ does for you).
%% Cell type:code id: tags:
``` python
ast = create_kernel([update_rule], target='llvm')
print(str(ast))
```
%% Output
KernelFunction kernel([<double * RESTRICT fd_dst>, <double * RESTRICT const fd_src>, <double omega>])
Block for(ctr_0=1; ctr_0<29; ctr_0+=1)
Block fd_dst_C ← pointer_arithmetic_func(fd_dst, 20*ctr_0)
fd_src_C ← pointer_arithmetic_func(fd_src, 20*ctr_0)
fd_src_E ← pointer_arithmetic_func(fd_src, 20*ctr_0 + 20)
fd_src_W ← pointer_arithmetic_func(fd_src, 20*ctr_0 - 20)
for(ctr_1=1; ctr_1<19; ctr_1+=1)
Block fd_dst_C[ctr_1] ← omega*(fd_src_C[ctr_1 + 1] + fd_src_C[ctr_1 - 1] + fd_src_E[ctr_1] + fd_src_W[ctr_1])/4 + (omega*cast_func(-1, double) + 1.0)*fd_src_C[ctr_1]
%% Cell type:markdown id: tags:
It is possible to examine the AST further.
%% Cell type:code id: tags:
``` python
to_dot(ast)
```
%% Output
<graphviz.files.Source at 0x7f2d14f276a0>
%% Cell type:markdown id: tags:
With the transformed AST it is now possible to generate and compile it into LLVM. Notice that unlike the C++ version, no files are written to the hard drive (although that is possible).
There are multiple ways to generate and compile the AST. The simplest one is similar to the C++ version: using the ``compile()`` function of the generated AST.
You can also manually create a python function with ``make_python_function``.
Another option is obtaining the jit itself with ``generate_and_jit``.
The function ``generate_and_jit`` first generates and then compiles the AST.
If even more control is needed, it is possible to use the functions ``generateLLVM`` and ``compileLLVM`` to achieve the same. For full control, instead of calling ``compileLLVM``, the jit object itself can be created and the functions needed for compilation can be run manually (``parse``, (``optimize``,) ``compile``).
%% Cell type:code id: tags:
``` python
kernel = ast.compile()
#kernel = make_python_function(ast)
# Or alternatively
#jit = generate_and_jit(ast)
# Call: jit('kernel', src_arr, dst_arr)
```
%% Cell type:markdown id: tags:
The compiled function(s) can now be used. Either call the function (with arguments, if not given before) or call the jit object with the function's name and its arguments. Here, numpy arrays are automatically adjusted with ctypes.
The available functions and their arguments can be inspected as well.
**All of the information the jit object has comes from the module that was parsed. If you parse a second module and do not run the compilation step, the module and the compiled code no longer match, which leads to false information.**
%% Cell type:code id: tags:
``` python
#jit.print_functions()
```
%% Cell type:code id: tags:
``` python
for i in range(100):
kernel(src=src_arr, dst=dst_arr, omega=2/3)
src_arr, dst_arr = dst_arr, src_arr
```
%% Cell type:markdown id: tags:
The output is drawn with matplotlib.
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
from matplotlib import cm
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(dst_arr, cmap=cm.jet)
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
```
import numpy as np
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
from pystencils_tests.test_datahandling import access_and_gather, synchronization, kernel_execution_jacobi, vtk_output, \
reduction
import waLBerla as wlb
def test_access_and_gather():
block_size = (4, 7, 1)
num_blocks = (3, 2, 1)
cells = tuple(a * b for a, b in zip(block_size, num_blocks))
blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False,
periodic=(1, 1, 1))
dh = ParallelDataHandling(blocks, default_ghost_layers=2)
access_and_gather(dh, cells)
synchronization(dh, test_gpu=False)
if hasattr(wlb, 'cuda'):
synchronization(dh, test_gpu=True)
def test_gpu():
if not hasattr(wlb, 'cuda'):
print("Skip GPU tests because walberla was built without CUDA")
return
block_size = (4, 7, 1)
num_blocks = (3, 2, 1)
blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks, default_ghost_layers=2)
dh.add_array('v', values_per_cell=3, dtype=np.int64, ghost_layers=2, gpu=True)
for b in dh.iterate():
b['v'].fill(42)
dh.all_to_gpu()
for b in dh.iterate():
b['v'].fill(0)
dh.to_cpu('v')
for b in dh.iterate():
np.testing.assert_equal(b['v'], 42)
def test_kernel():
for gpu in (True, False):
if gpu and not hasattr(wlb, 'cuda'):
print("Skipping CUDA tests because walberla was built without GPU support")
continue
# 3D
blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks)
kernel_execution_jacobi(dh, test_gpu=gpu)
reduction(dh)
# 2D
blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 1), cellsPerBlock=(3, 2, 1), oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks, dim=2)
kernel_execution_jacobi(dh, test_gpu=gpu)
reduction(dh)
def test_vtk_output():
blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
dh = ParallelDataHandling(blocks)
vtk_output(dh)
%% Cell type:code id: tags:
``` python
from lbmpy.session import *
from lbmpy.phasefield.scenarios import *
import pystencils as ps
from pystencils.fd.derivation import *
```
%% Cell type:markdown id: tags:
# 2D standard stencils
%% Cell type:code id: tags:
``` python
stencil = [(-1, 0), (1, 0), (0, -1), (0, 1), (0, 0)]
standard_2d_00 = FiniteDifferenceStencilDerivation((0,0), stencil)
f = ps.fields("f: [2D]")
standard_2d_00_res = standard_2d_00.get_stencil()
res = standard_2d_00_res.apply(f.center)
expected = f[-1, 0] - 2 * f[0, 0] + f[1, 0]
assert res == expected
```
%% Cell type:code id: tags:
``` python
assert standard_2d_00_res.accuracy == 2
assert not standard_2d_00_res.is_isotropic
standard_2d_00_res
```
%% Output
Finite difference stencil of accuracy 2, isotropic error: False
%% Cell type:code id: tags:
``` python
standard_2d_00.get_stencil().as_matrix()
```
%% Output
$$\left[\begin{matrix}0 & 0 & 0\\1 & -2 & 1\\0 & 0 & 0\end{matrix}\right]$$
%% Cell type:markdown id: tags:
# 2D isotropic stencils
## second x-derivative
%% Cell type:code id: tags:
``` python
stencil = get_stencil("D2Q9")
isotropic_2d_00 = FiniteDifferenceStencilDerivation((0,0), stencil)
isotropic_2d_00_res = isotropic_2d_00.get_stencil(isotropic=True)
assert isotropic_2d_00_res.is_isotropic
assert isotropic_2d_00_res.accuracy == 2
isotropic_2d_00_res
```
%% Output
Finite difference stencil of accuracy 2, isotropic error: True
%% Cell type:code id: tags:
``` python
isotropic_2d_00_res.as_matrix()
```
%% Output
$$\left[\begin{matrix}\frac{1}{12} & - \frac{1}{6} & \frac{1}{12}\\\frac{5}{6} & - \frac{5}{3} & \frac{5}{6}\\\frac{1}{12} & - \frac{1}{6} & \frac{1}{12}\end{matrix}\right]$$
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(2,2))
isotropic_2d_00_res.visualize()
```
%% Output
%% Cell type:code id: tags:
``` python
expected_result = sp.Matrix([[1, -2, 1], [10, -20, 10], [1, -2, 1]]) / 12
assert expected_result == isotropic_2d_00_res.as_matrix()
```
%% Cell type:markdown id: tags:
## Isotropic laplacian
%% Cell type:code id: tags:
``` python
isotropic_2d_11 = FiniteDifferenceStencilDerivation((1,1), stencil)
isotropic_2d_11_res = isotropic_2d_11.get_stencil(isotropic=True)
iso_laplacian = isotropic_2d_00_res.as_matrix() + isotropic_2d_11_res.as_matrix()
iso_laplacian
```
%% Output
$$\left[\begin{matrix}\frac{1}{6} & \frac{2}{3} & \frac{1}{6}\\\frac{2}{3} & - \frac{10}{3} & \frac{2}{3}\\\frac{1}{6} & \frac{2}{3} & \frac{1}{6}\end{matrix}\right]$$
%% Cell type:code id: tags:
``` python
expected_result = sp.Matrix([[1, 4, 1], [4, -20, 4], [1, 4, 1]]) / 6
assert iso_laplacian == expected_result
```
import pytest
import numpy as np
import sympy as sp
import pystencils as ps
from pystencils import Field, FieldType
from pystencils.field import layout_string_to_tuple
def test_field_basic():
f = Field.create_generic('f', spatial_dimensions=2)
assert FieldType.is_generic(f)
assert f['E'] == f[1, 0]
assert f['N'] == f[0, 1]
assert '_' in f.center._latex('dummy')
f = Field.create_fixed_size('f', (10, 10), strides=(80, 8), dtype=np.float64)
assert f.spatial_strides == (10, 1)
assert f.index_strides == ()
assert f.center_vector == sp.Matrix([f.center])
f = Field.create_fixed_size('f', (8, 8, 2, 2), index_dimensions=2)
assert f.center_vector == sp.Matrix([[f(0, 0), f(0, 1)],
[f(1, 0), f(1, 1)]])
field_access = f[1, 1]
assert field_access.nr_of_coordinates == 2
assert field_access.offset_name == 'NE'
neighbor = field_access.neighbor(coord_id=0, offset=-2)
assert neighbor.offsets == (-1, 1)
assert '_' in neighbor._latex('dummy')
f = Field.create_generic('f', spatial_dimensions=5, index_dimensions=2)
field_access = f[1, -1, 2, -3, 0](1, 0)
assert field_access.offsets == (1, -1, 2, -3, 0)
assert field_access.index == (1, 0)
def test_error_handling():
struct_dtype = np.dtype([('a', np.int32), ('b', np.float64), ('c', np.uint32)])
Field.create_generic('f', spatial_dimensions=2, index_dimensions=0, dtype=struct_dtype)
with pytest.raises(ValueError) as e:
Field.create_generic('f', spatial_dimensions=2, index_dimensions=1, dtype=struct_dtype)
assert 'index dimension' in str(e)
arr = np.array([[1, 2.0, 3], [1, 2.0, 3]], dtype=struct_dtype)
Field.create_from_numpy_array('f', arr, index_dimensions=0)
with pytest.raises(ValueError) as e:
Field.create_from_numpy_array('f', arr, index_dimensions=1)
assert 'Structured arrays' in str(e)
arr = np.zeros([3, 3, 3])
Field.create_from_numpy_array('f', arr, index_dimensions=2)
with pytest.raises(ValueError) as e:
Field.create_from_numpy_array('f', arr, index_dimensions=3)
assert 'Too many' in str(e)
Field.create_fixed_size('f', (3, 2, 4), index_dimensions=0, dtype=struct_dtype, layout='reverse_numpy')
with pytest.raises(ValueError) as e:
Field.create_fixed_size('f', (3, 2, 4), index_dimensions=1, dtype=struct_dtype, layout='reverse_numpy')
assert 'Structured arrays' in str(e)
f = Field.create_fixed_size('f', (10, 10))
with pytest.raises(ValueError) as e:
f[1]
assert 'Wrong number of spatial indices' in str(e)
f = Field.create_generic('f', spatial_dimensions=2, index_shape=(3,))
with pytest.raises(ValueError) as e:
f(3)
assert 'out of bounds' in str(e)
f = Field.create_fixed_size('f', (10, 10, 3, 4), index_dimensions=2)
with pytest.raises(ValueError) as e:
f(3, 0)
assert 'out of bounds' in str(e)
with pytest.raises(ValueError) as e:
f(1, 0)(1, 0)
assert 'Indexing an already indexed' in str(e)
with pytest.raises(ValueError) as e:
f(1)
assert 'Wrong number of indices' in str(e)
with pytest.raises(ValueError) as e:
Field.create_generic('f', spatial_dimensions=2, layout='wrong')
assert 'Unknown layout descriptor' in str(e)
assert layout_string_to_tuple('fzyx', dim=4) == (3, 2, 1, 0)
with pytest.raises(ValueError) as e:
layout_string_to_tuple('wrong', dim=4)
assert 'Unknown layout descriptor' in str(e)
def test_decorator_scoping():
dst = ps.fields('dst : double[2D]')
def f1():
a = sp.Symbol("a")
def f2():
b = sp.Symbol("b")
@ps.kernel
def decorated_func():
dst[0, 0] @= a + b
return decorated_func
return f2
    assert f1()() == [ps.Assignment(dst[0, 0], sp.Symbol("a") + sp.Symbol("b"))]
def test_string_creation():
x, y, z = ps.fields(' x(4), y(3,5) z : double[ 3, 47]')
assert x.index_shape == (4,)
assert y.index_shape == (3, 5)
assert z.spatial_shape == (3, 47)
import sympy as sp
import pystencils as ps
from pystencils.stencils import stencil_coefficients
from pystencils.fd.spatial import fd_stencils_standard, fd_stencils_isotropic, discretize_spatial
from pystencils.fd import diff
def test_spatial_2d_unit_sum():
f = ps.fields("f: double[2D]")
h = sp.symbols("h")
terms = [diff(f, 0),
diff(f, 1),
diff(f, 0, 1),
diff(f, 0, 0),
diff(f, 1, 1),
diff(f, 0, 0) + diff(f, 1, 1)]
schemes = [fd_stencils_standard, fd_stencils_isotropic]
for term in terms:
for scheme in schemes:
discretized = discretize_spatial(term, dx=h, stencil=scheme)
_, coefficients = stencil_coefficients(discretized)
assert sum(coefficients) == 0
def test_spatial_1d_unit_sum():
f = ps.fields("f: double[1D]")
h = sp.symbols("h")
terms = [diff(f, 0),
diff(f, 0, 0)]
schemes = [fd_stencils_standard, fd_stencils_isotropic]
for term in terms:
for scheme in schemes:
discretized = discretize_spatial(term, dx=h, stencil=scheme)
_, coefficients = stencil_coefficients(discretized)
assert sum(coefficients) == 0
import numpy as np
from pystencils import Field, Assignment
from pystencils.cpu import create_indexed_kernel, make_python_function
def test_indexed_kernel():
arr = np.zeros((3, 4))
dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
index_arr = np.zeros((3,), dtype=dtype)
index_arr[0] = (0, 2, 3.0)
index_arr[1] = (1, 3, 42.0)
index_arr[2] = (2, 1, 5.0)
indexed_field = Field.create_from_numpy_array('index', index_arr)
normal_field = Field.create_from_numpy_array('f', arr)
update_rule = Assignment(normal_field[0, 0], indexed_field('value'))
ast = create_indexed_kernel([update_rule], [indexed_field])
kernel = make_python_function(ast)
kernel(f=arr, index=index_arr)
for i in range(index_arr.shape[0]):
np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)
def test_indexed_cuda_kernel():
try:
import pycuda
except ImportError:
pycuda = None
if pycuda:
from pystencils.gpucuda import make_python_function
import pycuda.gpuarray as gpuarray
from pystencils.gpucuda.kernelcreation import created_indexed_cuda_kernel
arr = np.zeros((3, 4))
dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
index_arr = np.zeros((3,), dtype=dtype)
index_arr[0] = (0, 2, 3.0)
index_arr[1] = (1, 3, 42.0)
index_arr[2] = (2, 1, 5.0)
indexed_field = Field.create_from_numpy_array('index', index_arr)
normal_field = Field.create_from_numpy_array('f', arr)
update_rule = Assignment(normal_field[0, 0], indexed_field('value'))
ast = created_indexed_cuda_kernel([update_rule], [indexed_field])
kernel = make_python_function(ast)
gpu_arr = gpuarray.to_gpu(arr)
gpu_index_arr = gpuarray.to_gpu(index_arr)
kernel(f=gpu_arr, index=gpu_index_arr)
gpu_arr.get(arr)
for i in range(index_arr.shape[0]):
np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)
else:
print("Did not run test on GPU since no pycuda is available")
import numpy as np
from pystencils import Assignment, Field
from pystencils.llvm import create_kernel, make_python_function
from pystencils.llvm.llvmjit import generate_and_jit
def test_jacobi_fixed_field_size():
size = (30, 20)
src_field_llvm = np.random.rand(*size)
src_field_py = np.copy(src_field_llvm)
dst_field_llvm = np.zeros(size)
dst_field_py = np.zeros(size)
f = Field.create_from_numpy_array("f", src_field_llvm)
d = Field.create_from_numpy_array("d", dst_field_llvm)
jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
ast = create_kernel([jacobi])
for x in range(1, size[0] - 1):
for y in range(1, size[1] - 1):
dst_field_py[x, y] = 0.25 * (src_field_py[x - 1, y] + src_field_py[x + 1, y] +
src_field_py[x, y - 1] + src_field_py[x, y + 1])
jit = generate_and_jit(ast)
jit('kernel', dst_field_llvm, src_field_llvm)
error = np.sum(np.abs(dst_field_py - dst_field_llvm))
np.testing.assert_almost_equal(error, 0.0)
def test_jacobi_variable_field_size():
size = (3, 3, 3)
f = Field.create_generic("f", 3)
d = Field.create_generic("d", 3)
jacobi = Assignment(d[0, 0, 0], (f[1, 0, 0] + f[-1, 0, 0] + f[0, 1, 0] + f[0, -1, 0]) / 4)
ast = create_kernel([jacobi])
src_field_llvm = np.random.rand(*size)
src_field_py = np.copy(src_field_llvm)
dst_field_llvm = np.zeros(size)
dst_field_py = np.zeros(size)
for x in range(1, size[0] - 1):
for y in range(1, size[1] - 1):
for z in range(1, size[2] - 1):
dst_field_py[x, y, z] = 0.25 * (src_field_py[x - 1, y, z] + src_field_py[x + 1, y, z] +
src_field_py[x, y - 1, z] + src_field_py[x, y + 1, z])
kernel = make_python_function(ast, {'f': src_field_llvm, 'd': dst_field_llvm})
kernel()
error = np.sum(np.abs(dst_field_py - dst_field_llvm))
np.testing.assert_almost_equal(error, 0.0)
import os
import numpy as np
import sympy as sp
from pystencils import Field, Assignment
from pystencils.kerncraft_coupling import PyStencilsKerncraftKernel, KerncraftParameters
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.cpu import create_kernel
import pytest
from kerncraft.models import ECMData, ECM, Benchmark
from kerncraft.machinemodel import MachineModel
from kerncraft.kernel import KernelCode
SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__))
INPUT_FOLDER = os.path.join(SCRIPT_FOLDER, "kerncraft_inputs")
@pytest.mark.kernkraft
def test_compilation():
kernel_file_path = os.path.join(INPUT_FOLDER, "2d-5pt.c")
with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
reference_kernel.as_code('likwid')
size = [30, 50, 3]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
update_rule = Assignment(b[0, 0], s * rhs)
ast = create_kernel([update_rule])
mine = generate_benchmark(ast, likwid=False)
print(mine)
@pytest.mark.kernkraft
def analysis(kernel, model='ecmdata'):
machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml")
machine = MachineModel(path_to_yaml=machine_file_path)
if model == 'ecmdata':
model = ECMData(kernel, machine, KerncraftParameters())
elif model == 'ecm':
model = ECM(kernel, machine, KerncraftParameters())
# model.analyze()
# model.plot()
elif model == 'benchmark':
model = Benchmark(kernel, machine, KerncraftParameters())
else:
model = ECM(kernel, machine, KerncraftParameters())
model.analyze()
return model
@pytest.mark.kernkraft
def test_3d_7pt_iaca():
# Make sure you use the intel compiler
size = [20, 200, 200]
kernel_file_path = os.path.join(INPUT_FOLDER, "3d-7pt.c")
machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml")
machine = MachineModel(path_to_yaml=machine_file_path)
with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path)
reference_kernel.set_constant('M', size[0])
reference_kernel.set_constant('N', size[1])
assert size[1] == size[2]
analysis(reference_kernel, model='ecm')
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast, machine)
analysis(k, model='ecm')
assert reference_kernel._flops == k._flops
# assert reference.results['cl throughput'] == analysis.results['cl throughput']
@pytest.mark.kernkraft
def test_2d_5pt():
size = [30, 50, 3]
kernel_file_path = os.path.join(INPUT_FOLDER, "2d-5pt.c")
with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
reference = analysis(reference_kernel)
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
update_rule = Assignment(b[0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast)
result = analysis(k)
for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
assert e1 == e2
@pytest.mark.kernkraft
def test_3d_7pt():
size = [30, 50, 50]
kernel_file_path = os.path.join(INPUT_FOLDER, "3d-7pt.c")
with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
reference_kernel.set_constant('M', size[0])
reference_kernel.set_constant('N', size[1])
assert size[1] == size[2]
reference = analysis(reference_kernel)
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast)
result = analysis(k)
for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
assert e1 == e2