ef496986 · ef496986 · ef496986 · ef496986 · ef496986 · ef496986
--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
-import ctypes
-
-import numpy as np
-import sympy as sp
-from sympy.core.cache import cacheit
-from sympy.logic.boolalg import Boolean
-
-from pystencils.cache import memorycache
-from pystencils.utils import all_equal
-
-try:
-    import llvmlite.ir as ir
-except ImportError as e:
-    ir = None
-    _ir_importerror = e
-
-
-# noinspection PyPep8Naming
-class address_of(sp.Function):
-    is_Atom = True
-
-    def __new__(cls, arg):
-        obj = sp.Function.__new__(cls, arg)
-        return obj
-
-    @property
-    def canonical(self):
-        if hasattr(self.args[0], 'canonical'):
-            return self.args[0].canonical
-        else:
-            raise NotImplementedError()
-
-    @property
-    def is_commutative(self):
-        return self.args[0].is_commutative
-
-    @property
-    def dtype(self):
-        if hasattr(self.args[0], 'dtype'):
-            return PointerType(self.args[0].dtype, restrict=True)
-        else:
-            return PointerType('void', restrict=True)
-
-
-# noinspection PyPep8Naming
-class cast_func(sp.Function):
-    is_Atom = True
-
-    def __new__(cls, *args, **kwargs):
-        if len(args) != 2:
-            pass
-        expr, dtype, *other_args = args
-        if not isinstance(dtype, Type):
-            dtype = create_type(dtype)
-        # to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well
-        # however, a cast_function should only be a boolean if its argument is a boolean, otherwise this leads
-        # to problems when for example comparing cast_func's for equality
-        #
-        # lhs = bitwise_and(a, cast_func(1, 'int'))
-        # rhs = cast_func(0, 'int')
-        # print( sp.Ne(lhs, rhs) ) # would give true if all cast_funcs are booleans
-        # -> thus a separate class boolean_cast_func is introduced
-        if isinstance(expr, Boolean):
-            cls = boolean_cast_func
-        return sp.Function.__new__(cls, expr, dtype, *other_args, **kwargs)
-
-    @property
-    def canonical(self):
-        if hasattr(self.args[0], 'canonical'):
-            return self.args[0].canonical
-        else:
-            raise NotImplementedError()
-
-    @property
-    def is_commutative(self):
-        return self.args[0].is_commutative
-
-    def _eval_evalf(self, *args, **kwargs):
-        return self.args[0].evalf()
-
-    @property
-    def dtype(self):
-        return self.args[1]
-
-
-# noinspection PyPep8Naming
-class boolean_cast_func(cast_func, Boolean):
-    pass
-
-
-# noinspection PyPep8Naming
-class vector_memory_access(cast_func):
-    nargs = (4,)
-
-
-# noinspection PyPep8Naming
-class reinterpret_cast_func(cast_func):
-    pass
-
-
-# noinspection PyPep8Naming
-class pointer_arithmetic_func(sp.Function, Boolean):
-    @property
-    def canonical(self):
-        if hasattr(self.args[0], 'canonical'):
-            return self.args[0].canonical
-        else:
-            raise NotImplementedError()
-
-
-class TypedSymbol(sp.Symbol):
-    def __new__(cls, *args, **kwds):
-        obj = TypedSymbol.__xnew_cached_(cls, *args, **kwds)
-        return obj
-
-    def __new_stage2__(cls, name, dtype, *args, **kwargs):
-        obj = super(TypedSymbol, cls).__xnew__(cls, name, *args, **kwargs)
-        try:
-            obj._dtype = create_type(dtype)
-        except (TypeError, ValueError):
-            # on error keep the string
-            obj._dtype = dtype
-        return obj
-
-    __xnew__ = staticmethod(__new_stage2__)
-    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
-
-    @property
-    def dtype(self):
-        return self._dtype
-
-    def _hashable_content(self):
-        return super()._hashable_content(), hash(self._dtype)
-
-    def __getnewargs__(self):
-        return self.name, self.dtype
-
-    # For reference: Numpy type hierarchy https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html
-    @property
-    def is_integer(self):
-        if hasattr(self.dtype, 'numpy_dtype'):
-            return np.issubdtype(self.dtype.numpy_dtype, np.integer) or super().is_integer
-        else:
-            return super().is_integer
-
-    @property
-    def is_negative(self):
-        if hasattr(self.dtype, 'numpy_dtype'):
-            if np.issubdtype(self.dtype.numpy_dtype, np.unsignedinteger):
-                return False
-
-        return super().is_negative
-
-    @property
-    def is_nonnegative(self):
-        if self.is_negative is False:
-            return True
-        else:
-            return super().is_nonnegative
-
-    @property
-    def is_real(self):
-        if hasattr(self.dtype, 'numpy_dtype'):
-            return np.issubdtype(self.dtype.numpy_dtype, np.integer) or \
-                np.issubdtype(self.dtype.numpy_dtype, np.floating) or \
-                super().is_real
-        else:
-            return super().is_real
-
-
-def create_type(specification):
-    """Creates a subclass of Type according to a string or an object of subclass Type.
-
-    Args:
-        specification: Type object, or a string
-
-    Returns:
-        Type object, or a new Type object parsed from the string
-    """
-    if isinstance(specification, Type):
-        return specification
-    else:
-        numpy_dtype = np.dtype(specification)
-        if numpy_dtype.fields is None:
-            return BasicType(numpy_dtype, const=False)
-        else:
-            return StructType(numpy_dtype, const=False)
-
-
-@memorycache(maxsize=64)
-def create_composite_type_from_string(specification):
-    """Creates a new Type object from a c-like string specification.
-
-    Args:
-        specification: Specification string
-
-    Returns:
-        Type object
-    """
-    specification = specification.lower().split()
-    parts = []
-    current = []
-    for s in specification:
-        if s == '*':
-            parts.append(current)
-            current = [s]
-        else:
-            current.append(s)
-    if len(current) > 0:
-        parts.append(current)
-        # Parse native part
-    base_part = parts.pop(0)
-    const = False
-    if 'const' in base_part:
-        const = True
-        base_part.remove('const')
-    assert len(base_part) == 1
-    if base_part[0][-1] == "*":
-        base_part[0] = base_part[0][:-1]
-        parts.append('*')
-    current_type = BasicType(np.dtype(base_part[0]), const)
-    # Parse pointer parts
-    for part in parts:
-        restrict = False
-        const = False
-        if 'restrict' in part:
-            restrict = True
-            part.remove('restrict')
-        if 'const' in part:
-            const = True
-            part.remove("const")
-        assert len(part) == 1 and part[0] == '*'
-        current_type = PointerType(current_type, const, restrict)
-    return current_type
-
-
-def get_base_type(data_type):
-    while data_type.base_type is not None:
-        data_type = data_type.base_type
-    return data_type
-
-
-def to_ctypes(data_type):
-    """
-    Transforms a given Type into ctypes
-    :param data_type: Subclass of Type
-    :return: ctypes type object
-    """
-    if isinstance(data_type, PointerType):
-        return ctypes.POINTER(to_ctypes(data_type.base_type))
-    elif isinstance(data_type, StructType):
-        return ctypes.POINTER(ctypes.c_uint8)
-    else:
-        return to_ctypes.map[data_type.numpy_dtype]
-
-
-to_ctypes.map = {
-    np.dtype(np.int8): ctypes.c_int8,
-    np.dtype(np.int16): ctypes.c_int16,
-    np.dtype(np.int32): ctypes.c_int32,
-    np.dtype(np.int64): ctypes.c_int64,
-
-    np.dtype(np.uint8): ctypes.c_uint8,
-    np.dtype(np.uint16): ctypes.c_uint16,
-    np.dtype(np.uint32): ctypes.c_uint32,
-    np.dtype(np.uint64): ctypes.c_uint64,
-
-    np.dtype(np.float32): ctypes.c_float,
-    np.dtype(np.float64): ctypes.c_double,
-}
-
-
-def ctypes_from_llvm(data_type):
-    if not ir:
-        raise _ir_importerror
-    if isinstance(data_type, ir.PointerType):
-        ctype = ctypes_from_llvm(data_type.pointee)
-        if ctype is None:
-            return ctypes.c_void_p
-        else:
-            return ctypes.POINTER(ctype)
-    elif isinstance(data_type, ir.IntType):
-        if data_type.width == 8:
-            return ctypes.c_int8
-        elif data_type.width == 16:
-            return ctypes.c_int16
-        elif data_type.width == 32:
-            return ctypes.c_int32
-        elif data_type.width == 64:
-            return ctypes.c_int64
-        else:
-            raise ValueError("Int width %d is not supported" % data_type.width)
-    elif isinstance(data_type, ir.FloatType):
-        return ctypes.c_float
-    elif isinstance(data_type, ir.DoubleType):
-        return ctypes.c_double
-    elif isinstance(data_type, ir.VoidType):
-        return None  # Void type is not supported by ctypes
-    else:
-        raise NotImplementedError('Data type %s of %s is not supported yet' % (type(data_type), data_type))
-
-
-def to_llvm_type(data_type):
-    """
-    Transforms a given type into ctypes
-    :param data_type: Subclass of Type
-    :return: llvmlite type object
-    """
-    if not ir:
-        raise _ir_importerror
-    if isinstance(data_type, PointerType):
-        return to_llvm_type(data_type.base_type).as_pointer()
-    else:
-        return to_llvm_type.map[data_type.numpy_dtype]
-
-
-if ir:
-    to_llvm_type.map = {
-        np.dtype(np.int8): ir.IntType(8),
-        np.dtype(np.int16): ir.IntType(16),
-        np.dtype(np.int32): ir.IntType(32),
-        np.dtype(np.int64): ir.IntType(64),
-
-        np.dtype(np.uint8): ir.IntType(8),
-        np.dtype(np.uint16): ir.IntType(16),
-        np.dtype(np.uint32): ir.IntType(32),
-        np.dtype(np.uint64): ir.IntType(64),
-
-        np.dtype(np.float32): ir.FloatType(),
-        np.dtype(np.float64): ir.DoubleType(),
-    }
-
-
-def peel_off_type(dtype, type_to_peel_off):
-    while type(dtype) is type_to_peel_off:
-        dtype = dtype.base_type
-    return dtype
-
-
-def collate_types(types):
-    """
-    Takes a sequence of types and returns their "common type" e.g. (float, double, float) -> double
-    Uses the collation rules from numpy.
-    """
-
-    # Pointer arithmetic case i.e. pointer + integer is allowed
-    if any(type(t) is PointerType for t in types):
-        pointer_type = None
-        for t in types:
-            if type(t) is PointerType:
-                if pointer_type is not None:
-                    raise ValueError("Cannot collate the combination of two pointer types")
-                pointer_type = t
-            elif type(t) is BasicType:
-                if not (t.is_int() or t.is_uint()):
-                    raise ValueError("Invalid pointer arithmetic")
-            else:
-                raise ValueError("Invalid pointer arithmetic")
-        return pointer_type
-
-    # peel of vector types, if at least one vector type occurred the result will also be the vector type
-    vector_type = [t for t in types if type(t) is VectorType]
-    if not all_equal(t.width for t in vector_type):
-        raise ValueError("Collation failed because of vector types with different width")
-    types = [peel_off_type(t, VectorType) for t in types]
-
-    # now we should have a list of basic types - struct types are not yet supported
-    assert all(type(t) is BasicType for t in types)
-
-    if any(t.is_float() for t in types):
-        types = tuple(t for t in types if t.is_float())
-    # use numpy collation -> create type from numpy type -> and, put vector type around if necessary
-    result_numpy_type = np.result_type(*(t.numpy_dtype for t in types))
-    result = BasicType(result_numpy_type)
-    if vector_type:
-        result = VectorType(result, vector_type[0].width)
-    return result
-
-
-@memorycache(maxsize=2048)
-def get_type_of_expression(expr, default_float_type='double', default_int_type='int'):
-    from pystencils.astnodes import ResolvedFieldAccess
-    from pystencils.cpu.vectorization import vec_all, vec_any
-
-    expr = sp.sympify(expr)
-    if isinstance(expr, sp.Integer):
-        return create_type(default_int_type)
-    elif isinstance(expr, sp.Rational) or isinstance(expr, sp.Float):
-        return create_type(default_float_type)
-    elif isinstance(expr, ResolvedFieldAccess):
-        return expr.field.dtype
-    elif isinstance(expr, TypedSymbol):
-        return expr.dtype
-    elif isinstance(expr, sp.Symbol):
-        raise ValueError("All symbols inside this expression have to be typed! ", str(expr))
-    elif isinstance(expr, cast_func):
-        return expr.args[1]
-    elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
-        return create_type("bool")
-    elif hasattr(expr, 'func') and expr.func == sp.Piecewise:
-        collated_result_type = collate_types(tuple(get_type_of_expression(a[0]) for a in expr.args))
-        collated_condition_type = collate_types(tuple(get_type_of_expression(a[1]) for a in expr.args))
-        if type(collated_condition_type) is VectorType and type(collated_result_type) is not VectorType:
-            collated_result_type = VectorType(collated_result_type, width=collated_condition_type.width)
-        return collated_result_type
-    elif isinstance(expr, sp.Indexed):
-        typed_symbol = expr.base.label
-        return typed_symbol.dtype.base_type
-    elif isinstance(expr, sp.boolalg.Boolean) or isinstance(expr, sp.boolalg.BooleanFunction):
-        # if any arg is of vector type return a vector boolean, else return a normal scalar boolean
-        result = create_type("bool")
-        vec_args = [get_type_of_expression(a) for a in expr.args if isinstance(get_type_of_expression(a), VectorType)]
-        if vec_args:
-            result = VectorType(result, width=vec_args[0].width)
-        return result
-    elif isinstance(expr, sp.Pow):
-        return get_type_of_expression(expr.args[0])
-    elif isinstance(expr, sp.Expr):
-        expr: sp.Expr
-        if expr.args:
-            types = tuple(get_type_of_expression(a) for a in expr.args)
-            return collate_types(types)
-        else:
-            if expr.is_integer:
-                return create_type(default_int_type)
-            else:
-                return create_type(default_float_type)
-
-    raise NotImplementedError("Could not determine type for", expr, type(expr))
-
-
-class Type(sp.Basic):
-    is_Atom = True
-
-    def __new__(cls, *args, **kwargs):
-        return sp.Basic.__new__(cls)
-
-    def _sympystr(self, *args, **kwargs):
-        return str(self)
-
-
-class BasicType(Type):
-    @staticmethod
-    def numpy_name_to_c(name):
-        if name == 'float64':
-            return 'double'
-        elif name == 'float32':
-            return 'float'
-        elif name.startswith('int'):
-            width = int(name[len("int"):])
-            return "int%d_t" % (width,)
-        elif name.startswith('uint'):
-            width = int(name[len("uint"):])
-            return "uint%d_t" % (width,)
-        elif name == 'bool':
-            return 'bool'
-        else:
-            raise NotImplementedError("Can map numpy to C name for %s" % (name,))
-
-    def __init__(self, dtype, const=False):
-        self.const = const
-        if isinstance(dtype, Type):
-            self._dtype = dtype.numpy_dtype
-        else:
-            self._dtype = np.dtype(dtype)
-        assert self._dtype.fields is None, "Tried to initialize NativeType with a structured type"
-        assert self._dtype.hasobject is False
-        assert self._dtype.subdtype is None
-
-    def __getnewargs__(self):
-        return self.numpy_dtype, self.const
-
-    @property
-    def base_type(self):
-        return None
-
-    @property
-    def numpy_dtype(self):
-        return self._dtype
-
-    @property
-    def item_size(self):
-        return 1
-
-    def is_int(self):
-        return self.numpy_dtype in np.sctypes['int'] or self.numpy_dtype in np.sctypes['uint']
-
-    def is_float(self):
-        return self.numpy_dtype in np.sctypes['float']
-
-    def is_uint(self):
-        return self.numpy_dtype in np.sctypes['uint']
-
-    def is_complex(self):
-        return self.numpy_dtype in np.sctypes['complex']
-
-    def is_other(self):
-        return self.numpy_dtype in np.sctypes['others']
-
-    @property
-    def base_name(self):
-        return BasicType.numpy_name_to_c(str(self._dtype))
-
-    def __str__(self):
-        result = BasicType.numpy_name_to_c(str(self._dtype))
-        if self.const:
-            result += " const"
-        return result
-
-    def __repr__(self):
-        return str(self)
-
-    def __eq__(self, other):
-        if not isinstance(other, BasicType):
-            return False
-        else:
-            return (self.numpy_dtype, self.const) == (other.numpy_dtype, other.const)
-
-    def __hash__(self):
-        return hash(str(self))
-
-
-class VectorType(Type):
-    instruction_set = None
-
-    def __init__(self, base_type, width=4):
-        self._base_type = base_type
-        self.width = width
-
-    @property
-    def base_type(self):
-        return self._base_type
-
-    @property
-    def item_size(self):
-        return self.width * self.base_type.item_size
-
-    def __eq__(self, other):
-        if not isinstance(other, VectorType):
-            return False
-        else:
-            return (self.base_type, self.width) == (other.base_type, other.width)
-
-    def __str__(self):
-        if self.instruction_set is None:
-            return "%s[%d]" % (self.base_type, self.width)
-        else:
-            if self.base_type == create_type("int64"):
-                return self.instruction_set['int']
-            elif self.base_type == create_type("float64"):
-                return self.instruction_set['double']
-            elif self.base_type == create_type("float32"):
-                return self.instruction_set['float']
-            elif self.base_type == create_type("bool"):
-                return self.instruction_set['bool']
-            else:
-                raise NotImplementedError()
-
-    def __hash__(self):
-        return hash((self.base_type, self.width))
-
-    def __getnewargs__(self):
-        return self._base_type, self.width
-
-
-class PointerType(Type):
-    def __init__(self, base_type, const=False, restrict=True):
-        self._base_type = base_type
-        self.const = const
-        self.restrict = restrict
-
-    def __getnewargs__(self):
-        return self.base_type, self.const, self.restrict
-
-    @property
-    def alias(self):
-        return not self.restrict
-
-    @property
-    def base_type(self):
-        return self._base_type
-
-    @property
-    def item_size(self):
-        return self.base_type.item_size
-
-    def __eq__(self, other):
-        if not isinstance(other, PointerType):
-            return False
-        else:
-            return (self.base_type, self.const, self.restrict) == (other.base_type, other.const, other.restrict)
-
-    def __str__(self):
-        components = [str(self.base_type), '*']
-        if self.restrict:
-            components.append('RESTRICT')
-        if self.const:
-            components.append("const")
-        return " ".join(components)
-
-    def __repr__(self):
-        return str(self)
-
-    def __hash__(self):
-        return hash((self._base_type, self.const, self.restrict))
-
-
-class StructType:
-    def __init__(self, numpy_type, const=False):
-        self.const = const
-        self._dtype = np.dtype(numpy_type)
-
-    def __getnewargs__(self):
-        return self.numpy_dtype, self.const
-
-    @property
-    def base_type(self):
-        return None
-
-    @property
-    def numpy_dtype(self):
-        return self._dtype
-
-    @property
-    def item_size(self):
-        return self.numpy_dtype.itemsize
-
-    def get_element_offset(self, element_name):
-        return self.numpy_dtype.fields[element_name][1]
-
-    def get_element_type(self, element_name):
-        np_element_type = self.numpy_dtype.fields[element_name][0]
-        return BasicType(np_element_type, self.const)
-
-    def has_element(self, element_name):
-        return element_name in self.numpy_dtype.fields
-
-    def __eq__(self, other):
-        if not isinstance(other, StructType):
-            return False
-        else:
-            return (self.numpy_dtype, self.const) == (other.numpy_dtype, other.const)
-
-    def __str__(self):
-        # structs are handled byte-wise
-        result = "uint8_t"
-        if self.const:
-            result += " const"
-        return result
-
-    def __repr__(self):
-        return str(self)
-
-    def __hash__(self):
-        return hash((self.numpy_dtype, self.const))
--- a/pystencils/include/PyStencilsField.h
+++ b/pystencils/include/PyStencilsField.h
-#pragma once
-
-extern "C++" {
-#ifdef __CUDA_ARCH__
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  DTYPE_T shape[DIMENSION];
-  DTYPE_T stride[DIMENSION];
-};
-#else
-#include <array>
-
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  std::array<DTYPE_T, DIMENSION> shape;
-  std::array<DTYPE_T, DIMENSION> stride;
-};
-#endif
-}
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
-#if !defined(__AES__) || !defined(__SSE2__)
-#error AES-NI and SSE2 need to be enabled
-#endif
-
-#include <emmintrin.h> // SSE2
-#include <wmmintrin.h> // AES
-#ifdef __AVX512VL__
-#include <immintrin.h> // AVX*
-#endif
-#include <cstdint>
-
-#define QUALIFIERS inline
-#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
-#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
-
-typedef std::uint32_t uint32;
-typedef std::uint64_t uint64;
-
-QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
-    __m128i x = _mm_xor_si128(k, in);
-    x = _mm_aesenc_si128(x, k);     // 1
-    x = _mm_aesenc_si128(x, k);     // 2
-    x = _mm_aesenc_si128(x, k);     // 3
-    x = _mm_aesenc_si128(x, k);     // 4
-    x = _mm_aesenc_si128(x, k);     // 5
-    x = _mm_aesenc_si128(x, k);     // 6
-    x = _mm_aesenc_si128(x, k);     // 7
-    x = _mm_aesenc_si128(x, k);     // 8
-    x = _mm_aesenc_si128(x, k);     // 9
-    x = _mm_aesenclast_si128(x, k); // 10
-    return x;
-}
-
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    uint64 r[2];
-    _mm_storeu_si128((__m128i*)r, x);
-    return _mm_set_pd((double)r[1], (double)r[0]);
-#endif
-}
-
-
-QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
-                              double & rnd1, double & rnd2)
-{
-    // pack input and call AES
-    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
-    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
-    c128 = aesni1xm128i(c128, k128);
-
-    // convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
-    __m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
-    __m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
-    y = _mm_srli_si128(y, 4);
-
-    // calculate z = x ^ y << (53 - 32))
-    __m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
-    z = _mm_xor_si128(x, z);
-
-    // convert uint64 to double
-    __m128d rs = _my_cvtepu64_pd(z);
-    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
-    rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE));
-    rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
-
-    // store result
-    double rr[2];
-    _mm_storeu_pd(rr, rs);
-    rnd1 = rr[0];
-    rnd2 = rr[1];
-}
-
-
-QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
-                             float & rnd1, float & rnd2, float & rnd3, float & rnd4)
-{
-    // pack input and call AES
-    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
-    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
-    c128 = aesni1xm128i(c128, k128);
-
-    // convert uint32 to float
-    __m128 rs = _my_cvtepu32_ps(c128);
-    // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
-    rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT));
-    rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
-
-    // store result
-    float r[4];
-    _mm_storeu_ps(r, rs);
-    rnd1 = r[0];
-    rnd2 = r[1];
-    rnd3 = r[2];
-    rnd4 = r[3];
-}
\ No newline at end of file
--- a/pystencils/include/opencl_stdint.h
+++ b/pystencils/include/opencl_stdint.h
-#ifndef OPENCL_STDINT
-#define OPENCL_STDINT
-
-typedef unsigned int      uint;
-typedef unsigned int      uint_t;
-
-typedef signed char       int8_t;
-typedef signed short      int16_t;
-typedef signed int        int32_t;
-typedef signed long int   int64_t;
-typedef unsigned char     uint8_t;
-typedef unsigned short    uint16_t;
-typedef unsigned int      uint32_t;
-typedef unsigned long int uint64_t;
-
-#endif
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
-#include <cstdint>
-
-#ifndef __CUDA_ARCH__
-#define QUALIFIERS inline
-#else
-#define QUALIFIERS static __forceinline__ __device__
-#endif
-
-#define PHILOX_W32_0   (0x9E3779B9)
-#define PHILOX_W32_1   (0xBB67AE85)
-#define PHILOX_M4x32_0 (0xD2511F53)
-#define PHILOX_M4x32_1 (0xCD9E8D57)
-#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
-#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
-
-typedef std::uint32_t uint32;
-typedef std::uint64_t uint64;
-
-
-QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
-{
-#ifndef __CUDA_ARCH__
-    // host code
-    uint64 product = ((uint64)a) * ((uint64)b);
-    *hip = product >> 32;
-    return (uint32)product;
-#else
-    // device code
-    *hip = __umulhi(a,b);
-    return a*b;
-#endif
-}
-
-QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
-{
-    uint32 hi0;
-    uint32 hi1;
-    uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
-    uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
-
-    ctr[0] = hi1^ctr[1]^key[0];
-    ctr[1] = lo1;
-    ctr[2] = hi0^ctr[3]^key[1];
-    ctr[3] = lo0;
-}
-
-QUALIFIERS void _philox4x32bumpkey(uint32* key)
-{
-    key[0] += PHILOX_W32_0;
-    key[1] += PHILOX_W32_1;
-}
-
-QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
-{
-    uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
-    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
-}
-
-
-QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1, double & rnd1, double & rnd2)
-{
-    uint32 key[2] = {key0, key1};
-    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _philox4x32round(ctr, key);                           // 1
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
-
-    rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
-    rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
-}
-
-
-
-QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
-                              float & rnd1, float & rnd2, float & rnd3, float & rnd4)
-{
-    uint32 key[2] = {key0, key1};
-    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _philox4x32round(ctr, key);                           // 1
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
-
-    rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-}
\ No newline at end of file
--- a/pystencils/kerncraft_coupling/__init__.py
+++ b/pystencils/kerncraft_coupling/__init__.py
-from .generate_benchmark import generate_benchmark, run_c_benchmark
-from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
-
-__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
-import os
-import subprocess
-
-from jinja2 import Template
-
-from pystencils.astnodes import PragmaBlock
-from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
-from pystencils.data_types import get_base_type
-from pystencils.include import get_pystencils_include_path
-from pystencils.sympyextensions import prod
-
-benchmark_template = Template("""
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-
-{{ includes }}
-
-{%- if likwid %}
-#include <likwid.h>
-{%- endif %}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-
-
-{{kernel_code}}
-
-
-int main(int argc, char **argv)
-{
-  {%- if likwid %}
-  likwid_markerInit();
-  {%- endif %}
-
-  {%- for field_name, dataType, size in fields %}
-
-  // Initialization {{field_name}}
-  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
-  for (unsigned long long i = 0; i < {{size}}; ++i)
-    {{field_name}}[i] = 0.23;
-
-  if(var_false)
-    dummy({{field_name}});
-
-  {%- endfor %}
-
-
-
-  {%- for constantName, dataType in constants %}
-
-  // Constant {{constantName}}
-  {{dataType}} {{constantName}};
-  {{constantName}} = 0.23;
-  if(var_false)
-      dummy(& {{constantName}});
-
-  {%- endfor %}
-
-  {%- if likwid and openmp %}
-  #pragma omp parallel
-  {
-  likwid_markerRegisterRegion("loop");
-  #pragma omp barrier
-  {%- elif likwid %}
-  likwid_markerRegisterRegion("loop");
-  {%- endif %}
-
-  for(int warmup = 1; warmup >= 0; --warmup) {
-    int repeat = 2;
-    if(warmup == 0) {
-      repeat = atoi(argv[1]);
-      {%- if likwid %}
-      likwid_markerStartRegion("loop");
-      {%- endif %}
-    }
-    
-    {%- if timing %}
-    double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
-    timing(&wcStartTime, &cpuStartTime);
-    {%- endif %}
-    
-    for (; repeat > 0; --repeat)
-    {
-      {{kernelName}}({{call_argument_list}});
-
-      // Dummy calls
-      {%- for field_name, dataType, size in fields %}
-      if(var_false) dummy((void*){{field_name}});
-      {%- endfor %}
-      {%- for constantName, dataType in constants %}
-      if(var_false) dummy((void*)&{{constantName}});
-      {%- endfor %}
-    }
-    {%- if timing %}
-    timing(&wcEndTime, &cpuEndTime);
-    if( warmup == 0)
-        printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
-    {%- endif %}
-
-  }
-
-  {%- if likwid %}
-  likwid_markerStopRegion("loop");
-  {%- if openmp %}
-  }
-  {%- endif %}
-  {%- endif %}
-
-  {%- if likwid %}
-  likwid_markerClose();
-  {%- endif %}
-}
-""")
-
-
-def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
-    """Return C code of a benchmark program for the given kernel.
-
-    Args:
-        ast: the pystencils AST object as returned by create_kernel
-        likwid: if True likwid markers are added to the code
-        openmp: relevant only if likwid=True, to generated correct likwid initialization code
-        timing: add timing output to the code, prints time per iteration to stdout
-
-    Returns:
-        C code as string
-    """
-    accessed_fields = {f.name: f for f in ast.fields_accessed}
-    constants = []
-    fields = []
-    call_parameters = []
-    for p in ast.get_parameters():
-        if not p.is_field_parameter:
-            constants.append((p.symbol.name, str(p.symbol.dtype)))
-            call_parameters.append(p.symbol.name)
-        else:
-            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
-            field = accessed_fields[p.field_name]
-            dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
-            call_parameters.append(p.field_name)
-
-    header_list = get_headers(ast)
-    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-    # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
-    # when likwid and openmp are enabled
-    if likwid and openmp:
-        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
-            ast.body.args[0].pragma_line = ''
-
-    args = {
-        'likwid': likwid,
-        'openmp': openmp,
-        'kernel_code': generate_c(ast, dialect='c'),
-        'kernelName': ast.function_name,
-        'fields': fields,
-        'constants': constants,
-        'call_argument_list': ",".join(call_parameters),
-        'includes': includes,
-        'timing': timing,
-    }
-    return benchmark_template.render(**args)
-
-
-def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
-    """Runs the given kernel with outer loop in C
-
-    Args:
-        ast:
-        inner_iterations: timings are recorded around this many iterations
-        outer_iterations: number of timings recorded
-
-    Returns:
-        list of times per iterations for each outer iteration
-    """
-    import kerncraft
-
-    benchmark_code = generate_benchmark(ast, timing=True)
-    with open('bench.c', 'w') as f:
-        f.write(benchmark_code)
-
-    kerncraft_path = os.path.dirname(kerncraft.__file__)
-
-    extra_flags = ['-I' + get_pystencils_include_path(),
-                   '-I' + os.path.join(kerncraft_path, 'headers')]
-
-    compiler_config = get_compiler_config()
-    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
-    compile_cmd += [*extra_flags,
-                    os.path.join(kerncraft_path, 'headers', 'timing.c'),
-                    os.path.join(kerncraft_path, 'headers', 'dummy.c'),
-                    'bench.c',
-                    '-o', 'bench',
-                    ]
-    run_compile_step(compile_cmd)
-
-    results = []
-    for _ in range(outer_iterations):
-        benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
-        results.append(benchmark_time)
-    return results
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
-import warnings
-from collections import defaultdict
-from tempfile import TemporaryDirectory
-from typing import Optional
-
-import kerncraft
-import sympy as sp
-from kerncraft.kerncraft import KernelCode
-from kerncraft.machinemodel import MachineModel
-
-from pystencils.astnodes import (
-    KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
-from pystencils.field import get_layout_from_strides
-from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
-from pystencils.sympyextensions import count_operations_in_ast
-from pystencils.transformations import filtered_tree_iteration
-from pystencils.utils import DotDict
-
-
-class PyStencilsKerncraftKernel(KernelCode):
-    """
-    Implementation of kerncraft's kernel interface for pystencils CPU kernels.
-    Analyses a list of equations assuming they will be executed on a CPU
-    """
-    LIKWID_BASE = '/usr/local/likwid'
-
-    def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
-                 assumed_layout='SoA', debug_print=False, filename=None):
-        """Create a kerncraft kernel using a pystencils AST
-
-        Args:
-            ast: pystencils ast
-            machine: kerncraft machine model - specify this if kernel needs to be compiled
-            assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
-                    coordinates is not known. In this case either a structures of array (SoA) or
-                    array of structures (AoS) layout is assumed
-        """
-        kerncraft.kernel.Kernel.__init__(self, machine)
-
-        # Initialize state
-        self.asm_block = None
-        self._filename = filename
-
-        self.kernel_ast = ast
-        self.temporary_dir = TemporaryDirectory()
-        self._keep_intermediates = debug_print
-
-        # Loops
-        inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
-                       if l.is_innermost_loop]
-        if len(inner_loops) == 0:
-            raise ValueError("No loop found in pystencils AST")
-        else:
-            if len(inner_loops) > 1:
-                warnings.warn("pystencils AST contains multiple inner loops. "
-                              "Only one can be analyzed - choosing first one")
-            inner_loop = inner_loops[0]
-
-        self._loop_stack = []
-        cur_node = inner_loop
-        while cur_node is not None:
-            if isinstance(cur_node, LoopOverCoordinate):
-                loop_counter_sym = cur_node.loop_counter_symbol
-                loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
-                # If the correct step were to be provided, all access within that step length will
-                # also need to be passed to kerncraft: cur_node.step)
-                self._loop_stack.append(loop_info)
-            cur_node = cur_node.parent
-        self._loop_stack = list(reversed(self._loop_stack))
-
-        # Data sources & destinations
-        self.sources = defaultdict(list)
-        self.destinations = defaultdict(list)
-
-        def get_layout_tuple(f):
-            if f.has_fixed_shape:
-                return get_layout_from_strides(f.strides)
-            else:
-                layout_list = list(f.layout)
-                for _ in range(f.index_dimensions):
-                    layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
-                return layout_list
-
-        reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
-        for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
-            for fa in accesses:
-                coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
-                         for i, off in enumerate(fa.offsets)]
-                coord += list(fa.idx_coordinate_values)
-                layout = get_layout_tuple(fa.field)
-                permuted_coord = [sp.sympify(coord[i]) for i in layout]
-                target_dict[fa.field.name].append(permuted_coord)
-
-        # Variables (arrays)
-        fields_accessed = ast.fields_accessed
-        for field in fields_accessed:
-            layout = get_layout_tuple(field)
-            permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
-
-        # Scalars may be safely ignored
-        # for param in ast.get_parameters():
-        #     if not param.is_field_parameter:
-        #         # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
-        #         self.sources[param.symbol.name] = [None]
-
-        # data type
-        self.datatype = list(self.variables.values())[0][0]
-
-        # flops
-        operation_count = count_operations_in_ast(inner_loop)
-        self._flops = {
-            '+': operation_count['adds'],
-            '*': operation_count['muls'],
-            '/': operation_count['divs'],
-        }
-        for k in [k for k, v in self._flops.items() if v == 0]:
-            del self._flops[k]
-        self.check()
-
-        if debug_print:
-            from pprint import pprint
-            print("-----------------------------  Loop Stack --------------------------")
-            pprint(self._loop_stack)
-            print("-----------------------------  Sources -----------------------------")
-            pprint(self.sources)
-            print("-----------------------------  Destinations ------------------------")
-            pprint(self.destinations)
-            print("-----------------------------  FLOPS -------------------------------")
-            pprint(self._flops)
-
-    def as_code(self, type_='iaca', openmp=False, as_filename=False):
-        """
-        Generate and return compilable source code.
-
-        Args:
-            type_: can be iaca or likwid.
-            openmp: if true, openmp code will be generated
-            as_filename:
-        """
-        code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
-        if as_filename:
-            fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
-                                                                machine_and_compiler_dependent=False)
-            if not already_available:
-                fp.write(code)
-            return fp.name
-        else:
-            return code
-
-
-class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
-        super(KerncraftParameters, self).__init__(**kwargs)
-        self['asm_block'] = 'auto'
-        self['asm_increment'] = 0
-        self['cores'] = 1
-        self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
-        self['pointer_increment'] = 'auto'
-        self['iterations'] = 10
-        self['unit'] = 'cy/CL'
-        self['ignore_warnings'] = True
-
-
-# ------------------------------------------- Helper functions ---------------------------------------------------------
-
-
-def search_resolved_field_accesses_in_ast(ast):
-    def visit(node, reads, writes):
-        if not isinstance(node, SympyAssignment):
-            for a in node.args:
-                visit(a, reads, writes)
-            return
-
-        for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
-            accesses.update(expr.atoms(ResolvedFieldAccess))
-
-    read_accesses = set()
-    write_accesses = set()
-    visit(ast, read_accesses, write_accesses)
-    return read_accesses, write_accesses
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
-import itertools
-from types import MappingProxyType
-
-import sympy as sp
-
-from pystencils.assignment import Assignment
-from pystencils.astnodes import Block, Conditional, LoopOverCoordinate, SympyAssignment
-from pystencils.cpu.vectorization import vectorize
-from pystencils.gpucuda.indexing import indexing_creator_from_params
-from pystencils.simp.assignment_collection import AssignmentCollection
-from pystencils.transformations import (
-    loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)
-
-
-def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
-                  skip_independence_check=False,
-                  cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
-                  gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
-    """
-    Creates abstract syntax tree (AST) of kernel, using a list of update equations.
-
-    Args:
-        assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
-        target: 'cpu', 'llvm' or 'gpu'
-        data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
-                  to type
-        iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
-                         part of the field is iterated over
-        ghost_layers: if left to default, the number of necessary ghost layers is determined automatically
-                     a single integer specifies the ghost layer count at all borders, can also be a sequence of
-                     pairs ``[(x_lower_gl, x_upper_gl), .... ]``
-        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
-                                 periodicity kernel, that access the field outside the iteration bounds. Use with care!
-        cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
-        cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
-                            for documentation of these parameters see vectorize function. Example:
-                            '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
-        cpu_blocking: a tuple of block sizes or None if no blocking should be applied
-        gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
-        gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
-                             e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
-
-    Returns:
-        abstract syntax tree (AST) object, that can either be printed as source code with `show_code` or
-        can be compiled with through its 'compile()' member
-
-    Example:
-        >>> import pystencils as ps
-        >>> import numpy as np
-        >>> s, d = ps.fields('s, d: [2D]')
-        >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
-        >>> ast = ps.create_kernel(assignment, target='cpu', cpu_openmp=True)
-        >>> kernel = ast.compile()
-        >>> d_arr = np.zeros([5, 5])
-        >>> kernel(d=d_arr, s=np.ones([5, 5]))
-        >>> d_arr
-        array([[0., 0., 0., 0., 0.],
-               [0., 4., 4., 4., 0.],
-               [0., 4., 4., 4., 0.],
-               [0., 4., 4., 4., 0.],
-               [0., 0., 0., 0., 0.]])
-    """
-    # ----  Normalizing parameters
-    split_groups = ()
-    if isinstance(assignments, AssignmentCollection):
-        if 'split_groups' in assignments.simplification_hints:
-            split_groups = assignments.simplification_hints['split_groups']
-        assignments = assignments.all_assignments
-    if isinstance(assignments, Assignment):
-        assignments = [assignments]
-
-    # ----  Creating ast
-    if target == 'cpu':
-        from pystencils.cpu import create_kernel
-        from pystencils.cpu import add_openmp
-        ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
-                            iteration_slice=iteration_slice, ghost_layers=ghost_layers,
-                            skip_independence_check=skip_independence_check)
-        omp_collapse = None
-        if cpu_blocking:
-            omp_collapse = loop_blocking(ast, cpu_blocking)
-        if cpu_openmp:
-            add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse)
-        if cpu_vectorize_info:
-            if cpu_vectorize_info is True:
-                vectorize(ast)
-            elif isinstance(cpu_vectorize_info, dict):
-                vectorize(ast, **cpu_vectorize_info)
-            else:
-                raise ValueError("Invalid value for cpu_vectorize_info")
-        return ast
-    elif target == 'llvm':
-        from pystencils.llvm import create_kernel
-        ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
-                            iteration_slice=iteration_slice, ghost_layers=ghost_layers)
-        return ast
-    elif target == 'gpu':
-        from pystencils.gpucuda import create_cuda_kernel
-        ast = create_cuda_kernel(assignments, type_info=data_type,
-                                 indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
-                                 iteration_slice=iteration_slice, ghost_layers=ghost_layers,
-                                 skip_independence_check=skip_independence_check)
-        return ast
-    else:
-        raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
-
-
-def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="double", coordinate_names=('x', 'y', 'z'),
-                          cpu_openmp=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
-    """
-    Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
-    coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
-
-    The coordinates are stored in a separated index_field, which is a one dimensional array with struct data type.
-    This struct has to contain fields named 'x', 'y' and for 3D fields ('z'). These names are configurable with the
-    'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for
-    example boundary parameters.
-
-    index_fields: list of index fields, i.e. 1D fields with struct data type
-    coordinate_names: name of the coordinate fields in the struct data type
-
-    Example:
-        >>> import pystencils as ps
-        >>> import numpy as np
-        >>>
-        >>> # Index field stores the indices of the cell to visit together with optional values
-        >>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
-        >>> index_arr = np.array([(1, 1, 0.1), (2, 2, 0.2), (3, 3, 0.3)], dtype=index_arr_dtype)
-        >>> idx_field = ps.fields(idx=index_arr)
-        >>>
-        >>> # Additional values  stored in index field can be accessed in the kernel as well
-        >>> s, d = ps.fields('s, d: [2D]')
-        >>> assignment = ps.Assignment(d[0,0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
-        >>> ast = create_indexed_kernel(assignment, [idx_field], coordinate_names=('x', 'y'))
-        >>> kernel = ast.compile()
-        >>> d_arr = np.zeros([5, 5])
-        >>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
-        >>> d_arr
-        array([[0. , 0. , 0. , 0. , 0. ],
-               [0. , 4.1, 0. , 0. , 0. ],
-               [0. , 0. , 4.2, 0. , 0. ],
-               [0. , 0. , 0. , 4.3, 0. ],
-               [0. , 0. , 0. , 0. , 0. ]])
-    """
-    if isinstance(assignments, Assignment):
-        assignments = [assignments]
-    elif isinstance(assignments, AssignmentCollection):
-        assignments = assignments.all_assignments
-    if target == 'cpu':
-        from pystencils.cpu import create_indexed_kernel
-        from pystencils.cpu import add_openmp
-        ast = create_indexed_kernel(assignments, index_fields=index_fields, type_info=data_type,
-                                    coordinate_names=coordinate_names)
-        if cpu_openmp:
-            add_openmp(ast, num_threads=cpu_openmp)
-        return ast
-    elif target == 'llvm':
-        raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
-    elif target == 'gpu':
-        from pystencils.gpucuda import created_indexed_cuda_kernel
-        idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
-        ast = created_indexed_cuda_kernel(assignments, index_fields, type_info=data_type,
-                                          coordinate_names=coordinate_names, indexing_creator=idx_creator)
-        return ast
-    else:
-        raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
-
-
-def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu',
-                            gpu_exclusive_conditions=False, **kwargs):
-    """Kernel that updates a staggered field.
-
-    .. image:: /img/staggered_grid.svg
-
-    Args:
-        staggered_field: field where the first index coordinate defines the location of the staggered value
-                can have 1 or 2 index coordinates, in case of two index coordinates at every staggered location
-                a vector is stored, expressions parameter has to be a sequence of sequences then
-                where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right cell
-                boundary and ``f[0,0](1)`` the southern cell boundary etc.
-        expressions: sequence of expressions of length dim, defining how the west, southern, (bottom) cell boundary
-                     should be updated.
-        subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
-        target: 'cpu' or 'gpu'
-        gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths
-        kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed
-
-    Returns:
-        AST, see `create_kernel`
-    """
-    assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
-    assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
-    dim = staggered_field.spatial_dimensions
-
-    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
-    conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
-    assert len(expressions) == dim
-    if staggered_field.index_dimensions == 2:
-        assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
-            "If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
-            "same length."
-
-    final_assignments = []
-    last_conditional = None
-
-    def add(condition, dimensions, as_else_block=False):
-        nonlocal last_conditional
-        if staggered_field.index_dimensions == 1:
-            assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions]
-            a_coll = AssignmentCollection(assignments, list(subexpressions))
-            a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions])
-        elif staggered_field.index_dimensions == 2:
-            assert staggered_field.has_fixed_index_shape
-            assignments = [Assignment(staggered_field(d, i), expr)
-                           for d in dimensions
-                           for i, expr in enumerate(expressions[d])]
-            a_coll = AssignmentCollection(assignments, list(subexpressions))
-            a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])
-                                          for d in dimensions])
-        sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
-        if as_else_block and last_conditional:
-            new_cond = Conditional(condition, Block(sp_assignments))
-            last_conditional.false_block = Block([new_cond])
-            last_conditional = new_cond
-        else:
-            last_conditional = Conditional(condition, Block(sp_assignments))
-            final_assignments.append(last_conditional)
-
-    if target == 'cpu' or not gpu_exclusive_conditions:
-        for d in range(dim):
-            cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
-            add(cond, [d])
-    elif target == 'gpu':
-        full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)]
-        for include in itertools.product(*[[1, 0]] * dim):
-            case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)])
-            dimensions_to_include = [i for i in range(dim) if include[i]]
-            if dimensions_to_include:
-                add(case_conditions, dimensions_to_include, True)
-
-    ghost_layers = [(1, 0)] * dim
-
-    blocking = kwargs.get('cpu_blocking', None)
-    if blocking:
-        del kwargs['cpu_blocking']
-
-    cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
-    if cpu_vectorize_info:
-        del kwargs['cpu_vectorize_info']
-    openmp = kwargs.get('cpu_openmp', None)
-    if openmp:
-        del kwargs['cpu_openmp']
-
-    ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
-
-    if target == 'cpu':
-        remove_conditionals_in_staggered_kernel(ast)
-        move_constants_before_loop(ast)
-        omp_collapse = None
-        if blocking:
-            omp_collapse = loop_blocking(ast, blocking)
-        if openmp:
-            from pystencils.cpu import add_openmp
-            add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
-        if cpu_vectorize_info is True:
-            vectorize(ast)
-        elif isinstance(cpu_vectorize_info, dict):
-            vectorize(ast, **cpu_vectorize_info)
-    return ast
--- a/pystencils/llvm/__init__.py
+++ b/pystencils/llvm/__init__.py
-from .kernelcreation import create_kernel
-from .llvmjit import make_python_function
-
-__all__ = ['create_kernel', 'make_python_function']
--- a/pystencils/llvm/control_flow.py
+++ b/pystencils/llvm/control_flow.py
-import llvmlite.ir as ir
-
-
-class Loop(object):
-    def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
-        self.builder = builder
-        self.start_val = start_val
-        self.stop_val = stop_val
-        self.step_val = step_val
-        self.loop_name = loop_name
-        self.phi_name = phi_name
-
-    def __enter__(self):
-        self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
-                                                        self.phi_name)
-        return phi
-
-    def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
-        # TODO size of int??? unisgned???
-        integer = ir.IntType(64)
-
-        # Loop block
-        pre_loop_bb = self.builder.block
-        loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
-
-        self.builder.branch(loop_bb)
-
-        # Insert an explicit fall through from the current block to loop_bb
-        self.builder.position_at_start(loop_bb)
-
-        # Add phi
-        phi = self.builder.phi(integer, name=phi_name)
-        phi.add_incoming(start_val, pre_loop_bb)
-
-        loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
-        self.builder.position_at_start(loop_end_bb)
-
-        next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
-        cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
-
-        after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
-
-        self.builder.cbranch(cond, loop_bb, after_bb)
-        phi.add_incoming(next_var, loop_end_bb)
-
-        self.builder.position_at_end(loop_bb)
-
-        return loop_end_bb, after_bb, phi
-
-    def __exit__(self, exc_type, exc, exc_tb):
-        self.builder.branch(self.loop_end)
-        self.builder.position_at_end(self.after)
--- a/pystencils/llvm/kernelcreation.py
+++ b/pystencils/llvm/kernelcreation.py
-from pystencils.llvm.llvmjit import make_python_function
-from pystencils.transformations import insert_casts
-
-
-def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
-                  iteration_slice=None, ghost_layers=None):
-    """
-    Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
-
-    Loops are created according to the field accesses in the equations.
-
-    Args:
-    assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
-                 Defining the update rules of the kernel
-    function_name: name of the generated function - only important if generated code is written out
-                   type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
-                   be of type 'double' except symbols which occur on the left hand side of equations where the
-                   right hand side is a sympy Boolean which are assumed to be 'bool' .
-    split_groups: Specification on how to split up inner loop into multiple loops. For details see
-                  transformation :func:`pystencils.transformation.split_inner_loop`
-    iteration_slice: if not None, iteration is done only over this slice of the field
-    ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
-                 if None, the number of ghost layers is determined automatically and assumed to be equal for a
-                 all dimensions
-
-    :return: :class:`pystencils.ast.KernelFunction` node
-    """
-    from pystencils.cpu import create_kernel
-    code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
-    code.body = insert_casts(code.body)
-    code._compile_function = make_python_function
-    code._backend = 'llvm'
-    return code
--- a/pystencils/llvm/llvm.py
+++ b/pystencils/llvm/llvm.py
-import functools
-
-import llvmlite.ir as ir
-import sympy as sp
-from sympy import Indexed, S
-from sympy.printing.printer import Printer
-
-from pystencils.assignment import Assignment
-from pystencils.data_types import (
-    collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
-    to_llvm_type)
-from pystencils.llvm.control_flow import Loop
-
-
-def generate_llvm(ast_node, module=None, builder=None):
-    """Prints the ast as llvm code."""
-    if module is None:
-        module = ir.Module()
-    if builder is None:
-        builder = ir.IRBuilder()
-    printer = LLVMPrinter(module, builder)
-    return printer._print(ast_node)
-
-
-# noinspection PyPep8Naming
-class LLVMPrinter(Printer):
-    """Convert expressions to LLVM IR"""
-
-    def __init__(self, module, builder, fn=None, *args, **kwargs):
-        self.func_arg_map = kwargs.pop("func_arg_map", {})
-        super(LLVMPrinter, self).__init__(*args, **kwargs)
-        self.fp_type = ir.DoubleType()
-        self.fp_pointer = self.fp_type.as_pointer()
-        self.integer = ir.IntType(64)
-        self.integer_pointer = self.integer.as_pointer()
-        self.void = ir.VoidType()
-        self.module = module
-        self.builder = builder
-        self.fn = fn
-        self.ext_fn = {}  # keep track of wrappers to external functions
-        self.tmp_var = {}
-
-    def _add_tmp_var(self, name, value):
-        self.tmp_var[name] = value
-
-    def _remove_tmp_var(self, name):
-        del self.tmp_var[name]
-
-    def _print_Number(self, n):
-        if get_type_of_expression(n) == create_type("int"):
-            return ir.Constant(self.integer, int(n))
-        elif get_type_of_expression(n) == create_type("double"):
-            return ir.Constant(self.fp_type, float(n))
-        else:
-            raise NotImplementedError("Numbers can only have int and double", n)
-
-    def _print_Float(self, expr):
-        return ir.Constant(self.fp_type, float(expr))
-
-    def _print_Integer(self, expr):
-        return ir.Constant(self.integer, int(expr))
-
-    def _print_int(self, i):
-        return ir.Constant(self.integer, i)
-
-    def _print_Symbol(self, s):
-        val = self.tmp_var.get(s)
-        if not val:
-            # look up parameter with name s
-            val = self.func_arg_map.get(s.name)
-        if not val:
-            raise LookupError("Symbol not found: %s" % s)
-        return val
-
-    def _print_Pow(self, expr):
-        base0 = self._print(expr.base)
-        if expr.exp == S.NegativeOne:
-            return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
-        if expr.exp == S.Half:
-            fn = self.ext_fn.get("sqrt")
-            if not fn:
-                fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
-                fn = ir.Function(self.module, fn_type, "sqrt")
-                self.ext_fn["sqrt"] = fn
-            return self.builder.call(fn, [base0], "sqrt")
-        if expr.exp == 2:
-            return self.builder.fmul(base0, base0)
-        elif expr.exp == 3:
-            return self.builder.fmul(self.builder.fmul(base0, base0), base0)
-
-        exp0 = self._print(expr.exp)
-        fn = self.ext_fn.get("pow")
-        if not fn:
-            fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
-            fn = ir.Function(self.module, fn_type, "pow")
-            self.ext_fn["pow"] = fn
-        return self.builder.call(fn, [base0, exp0], "pow")
-
-    def _print_Mul(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        if get_type_of_expression(expr) == create_type('double'):
-            mul = self.builder.fmul
-        else:  # int TODO unsigned/signed
-            mul = self.builder.mul
-        for node in nodes[1:]:
-            e = mul(e, node)
-        return e
-
-    def _print_Add(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        if get_type_of_expression(expr) == create_type('double'):
-            add = self.builder.fadd
-        else:  # int TODO unsigned/signed
-            add = self.builder.add
-        for node in nodes[1:]:
-            e = add(e, node)
-        return e
-
-    def _print_Or(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        for node in nodes[1:]:
-            e = self.builder.or_(e, node)
-        return e
-
-    def _print_And(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        for node in nodes[1:]:
-            e = self.builder.and_(e, node)
-        return e
-
-    def _print_StrictLessThan(self, expr):
-        return self._comparison('<', expr)
-
-    def _print_LessThan(self, expr):
-        return self._comparison('<=', expr)
-
-    def _print_StrictGreaterThan(self, expr):
-        return self._comparison('>', expr)
-
-    def _print_GreaterThan(self, expr):
-        return self._comparison('>=', expr)
-
-    def _print_Unequality(self, expr):
-        return self._comparison('!=', expr)
-
-    def _print_Equality(self, expr):
-        return self._comparison('==', expr)
-
-    def _comparison(self, cmpop, expr):
-        if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
-            comparison = self.builder.fcmp_unordered
-        else:
-            comparison = self.builder.icmp_signed
-        return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
-
-    def _print_KernelFunction(self, func):
-        # KernelFunction does not posses a return type
-        return_type = self.void
-        parameter_type = []
-        parameters = func.get_parameters()
-        for parameter in parameters:
-            parameter_type.append(to_llvm_type(parameter.symbol.dtype))
-        func_type = ir.FunctionType(return_type, tuple(parameter_type))
-        name = func.function_name
-        fn = ir.Function(self.module, func_type, name)
-        self.ext_fn[name] = fn
-
-        # set proper names to arguments
-        for i, arg in enumerate(fn.args):
-            arg.name = parameters[i].symbol.name
-            self.func_arg_map[parameters[i].symbol.name] = arg
-
-        # func.attributes.add("inlinehint")
-        # func.attributes.add("argmemonly")
-        block = fn.append_basic_block(name="entry")
-        self.builder = ir.IRBuilder(block)  # TODO use goto_block instead
-        self._print(func.body)
-        self.builder.ret_void()
-        self.fn = fn
-        return fn
-
-    def _print_Block(self, block):
-        for node in block.args:
-            self._print(node)
-
-    def _print_LoopOverCoordinate(self, loop):
-        with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
-                  loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
-            self._add_tmp_var(loop.loop_counter_symbol, i)
-            self._print(loop.body)
-            self._remove_tmp_var(loop.loop_counter_symbol)
-
-    def _print_SympyAssignment(self, assignment):
-        expr = self._print(assignment.rhs)
-        lhs = assignment.lhs
-        if isinstance(lhs, Indexed):
-            ptr = self._print(lhs.base.label)
-            index = self._print(lhs.args[1])
-            gep = self.builder.gep(ptr, [index])
-            return self.builder.store(expr, gep)
-        self.func_arg_map[assignment.lhs.name] = expr
-        return expr
-
-    def _print_boolean_cast_func(self, conversion):
-        return self._print_cast_func(conversion)
-
-    def _print_cast_func(self, conversion):
-        node = self._print(conversion.args[0])
-        to_dtype = get_type_of_expression(conversion)
-        from_dtype = get_type_of_expression(conversion.args[0])
-        if from_dtype == to_dtype:
-            return self._print(conversion.args[0])
-
-        # (From, to)
-        decision = {
-            (create_composite_type_from_string("int16"),
-             create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
-            (create_composite_type_from_string("int16"),
-             create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
-            (create_composite_type_from_string("double"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
-            (create_composite_type_from_string("double *"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
-                                                                               node, self.fp_pointer),
-            (create_composite_type_from_string("double * restrict"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
-                                                                                        self.fp_pointer),
-            (create_composite_type_from_string("double * restrict const"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
-                                                                          self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
-                                                                                              node, self.fp_pointer),
-        }
-        # TODO float, TEST: const, restrict
-        # TODO bitcast, addrspacecast
-        # TODO unsigned/signed fills
-        # print([x for x in decision.keys()])
-        # print("Types:")
-        # print([(type(x), type(y)) for (x, y) in decision.keys()])
-        # print("Cast:")
-        # print((from_dtype, to_dtype))
-        return decision[(from_dtype, to_dtype)]()
-
-    def _print_pointer_arithmetic_func(self, pointer):
-        ptr = self._print(pointer.args[0])
-        index = self._print(pointer.args[1])
-        return self.builder.gep(ptr, [index])
-
-    def _print_Indexed(self, indexed):
-        ptr = self._print(indexed.base.label)
-        index = self._print(indexed.args[1])
-        gep = self.builder.gep(ptr, [index])
-        return self.builder.load(gep, name=indexed.base.label.name)
-
-    def _print_Piecewise(self, piece):
-        if not piece.args[-1].cond:
-            # We need the last conditional to be a True, otherwise the resulting
-            # function may not return a result.
-            raise ValueError("All Piecewise expressions must contain an "
-                             "(expr, True) statement to be used as a default "
-                             "condition. Without one, the generated "
-                             "expression may not evaluate to anything under "
-                             "some condition.")
-        if piece.has(Assignment):
-            raise NotImplementedError('The llvm-backend does not support assignments'
-                                      'in the Piecewise function. It is questionable'
-                                      'whether to implement it. So far there is no'
-                                      'use-case to test it.')
-        else:
-            phi_data = []
-            after_block = self.builder.append_basic_block()
-            for (expr, condition) in piece.args:
-                if condition == sp.sympify(True):  # Don't use 'is' use '=='!
-                    phi_data.append((self._print(expr), self.builder.block))
-                    self.builder.branch(after_block)
-                    self.builder.position_at_end(after_block)
-                else:
-                    cond = self._print(condition)
-                    true_block = self.builder.append_basic_block()
-                    false_block = self.builder.append_basic_block()
-                    self.builder.cbranch(cond, true_block, false_block)
-                    self.builder.position_at_end(true_block)
-                    phi_data.append((self._print(expr), true_block))
-                    self.builder.branch(after_block)
-                    self.builder.position_at_end(false_block)
-
-            phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
-            for (val, block) in phi_data:
-                phi.add_incoming(val, block)
-            return phi
-
-    # Should have a list of math library functions to validate this.
-    # TODO function calls to libs
-    def _print_Function(self, expr):
-        name = expr.func.__name__
-        e0 = self._print(expr.args[0])
-        fn = self.ext_fn.get(name)
-        if not fn:
-            fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
-            fn = ir.Function(self.module, fn_type, name)
-            self.ext_fn[name] = fn
-        return self.builder.call(fn, [e0], name)
-
-    def empty_printer(self, expr):
-        try:
-            import inspect
-            mro = inspect.getmro(expr)
-        except AttributeError:
-            mro = "None"
-        raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
-                        % (expr, type(expr), mro))
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
-import ctypes as ct
-
-import llvmlite.binding as llvm
-import llvmlite.ir as ir
-import numpy as np
-
-from pystencils.data_types import create_composite_type_from_string
-from pystencils.field import FieldType
-
-from ..data_types import StructType, ctypes_from_llvm, to_ctypes
-from .llvm import generate_llvm
-
-
-def build_ctypes_argument_list(parameter_specification, argument_dict):
-    argument_dict = {k: v for k, v in argument_dict.items()}
-    ct_arguments = []
-    array_shapes = set()
-    index_arr_shapes = set()
-
-    for param in parameter_specification:
-        if param.is_field_parameter:
-            try:
-                field_arr = argument_dict[param.field_name]
-            except KeyError:
-                raise KeyError("Missing field parameter for kernel call " + param.field_name)
-
-            symbolic_field = param.fields[0]
-            if param.is_field_pointer:
-                ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
-                if symbolic_field.has_fixed_shape:
-                    symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
-                    if isinstance(symbolic_field.dtype, StructType):
-                        symbolic_field_shape = symbolic_field_shape[:-1]
-                    if symbolic_field_shape != field_arr.shape:
-                        raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
-                                         (param.field_name, str(field_arr.shape), str(symbolic_field.shape)))
-                if symbolic_field.has_fixed_shape:
-                    symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
-                    if isinstance(symbolic_field.dtype, StructType):
-                        symbolic_field_strides = symbolic_field_strides[:-1]
-                    if symbolic_field_strides != field_arr.strides:
-                        raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
-                                         (param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
-
-                if FieldType.is_indexed(symbolic_field):
-                    index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
-                elif FieldType.is_generic(symbolic_field):
-                    array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
-
-            elif param.is_field_shape:
-                data_type = to_ctypes(param.symbol.dtype)
-                ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
-            elif param.is_field_stride:
-                data_type = to_ctypes(param.symbol.dtype)
-                assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
-                item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
-                ct_arguments.append(data_type(item_stride))
-            else:
-                assert False
-        else:
-            try:
-                value = argument_dict[param.symbol.name]
-            except KeyError:
-                raise KeyError("Missing parameter for kernel call " + param.symbol.name)
-            expected_type = to_ctypes(param.symbol.dtype)
-            ct_arguments.append(expected_type(value))
-
-    if len(array_shapes) > 1:
-        raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
-    if len(index_arr_shapes) > 1:
-        raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
-
-    return ct_arguments
-
-
-def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
-    parameters = kernel_function_node.get_parameters()
-
-    cache = {}
-    cache_values = []
-
-    def wrapper(**kwargs):
-        key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
-                         for k, v in kwargs.items()))
-        try:
-            args = cache[key]
-            func(*args)
-        except KeyError:
-            full_arguments = argument_dict.copy()
-            full_arguments.update(kwargs)
-            args = build_ctypes_argument_list(parameters, full_arguments)
-            cache[key] = args
-            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(*args)
-    wrapper.ast = kernel_function_node
-    wrapper.parameters = kernel_function_node.get_parameters()
-    return wrapper
-
-
-def generate_and_jit(ast):
-    gen = generate_llvm(ast)
-    if isinstance(gen, ir.Module):
-        return compile_llvm(gen)
-    else:
-        return compile_llvm(gen.module)
-
-
-def make_python_function(ast, argument_dict={}, func=None):
-    if func is None:
-        jit = generate_and_jit(ast)
-        func = jit.get_function_ptr(ast.function_name)
-    try:
-        args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
-    except KeyError:
-        # not all parameters specified yet
-        return make_python_function_incomplete_params(ast, argument_dict, func)
-    return lambda: func(*args)
-
-
-def compile_llvm(module):
-    jit = Jit()
-    jit.parse(module)
-    jit.optimize()
-    jit.compile()
-    return jit
-
-
-class Jit(object):
-    def __init__(self):
-        llvm.initialize()
-        llvm.initialize_all_targets()
-        llvm.initialize_native_target()
-        llvm.initialize_native_asmprinter()
-
-        self.module = None
-        self._llvmmod = llvm.parse_assembly("")
-        self.target = llvm.Target.from_default_triple()
-        self.cpu = llvm.get_host_cpu_name()
-        self.cpu_features = llvm.get_host_cpu_features()
-        self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
-                                                                opt=2)
-        llvm.check_jit_execution()
-        self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
-        self.ee.finalize_object()
-        self.fptr = None
-
-    @property
-    def llvmmod(self):
-        return self._llvmmod
-
-    @llvmmod.setter
-    def llvmmod(self, mod):
-        self.ee.remove_module(self.llvmmod)
-        self.ee.add_module(mod)
-        self.ee.finalize_object()
-        self.compile()
-        self._llvmmod = mod
-
-    def parse(self, module):
-        self.module = module
-        llvmmod = llvm.parse_assembly(str(module))
-        llvmmod.verify()
-        llvmmod.triple = self.target.triple
-        llvmmod.name = 'module'
-        self.llvmmod = llvmmod
-
-    def write_ll(self, file):
-        with open(file, 'w') as f:
-            f.write(str(self.llvmmod))
-
-    def write_assembly(self, file):
-        with open(file, 'w') as f:
-            f.write(self.target_machine.emit_assembly(self.llvmmod))
-
-    def write_object_file(self, file):
-        with open(file, 'wb') as f:
-            f.write(self.target_machine.emit_object(self.llvmmod))
-
-    def optimize(self):
-        pmb = llvm.create_pass_manager_builder()
-        pmb.opt_level = 2
-        pmb.disable_unit_at_a_time = False
-        pmb.loop_vectorize = True
-        pmb.slp_vectorize = True
-        # TODO possible to pass for functions
-        pm = llvm.create_module_pass_manager()
-        pm.add_instruction_combining_pass()
-        pm.add_function_attrs_pass()
-        pm.add_constant_merge_pass()
-        pm.add_licm_pass()
-        pmb.populate(pm)
-        pm.run(self.llvmmod)
-
-    def compile(self):
-        fptr = {}
-        for func in self.module.functions:
-            if not func.is_declaration:
-                return_type = None
-                if func.ftype.return_type != ir.VoidType():
-                    return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
-                args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
-                function_address = self.ee.get_function_address(func.name)
-                fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
-        self.fptr = fptr
-
-    def __call__(self, func, *args, **kwargs):
-        target_function = next(f for f in self.module.functions if f.name == func)
-        arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
-
-        transformed_args = []
-        for i, arg in enumerate(args):
-            if isinstance(arg, np.ndarray):
-                transformed_args.append(arg.ctypes.data_as(arg_types[i]))
-            else:
-                transformed_args.append(arg)
-
-        self.fptr[func](*transformed_args)
-
-    def print_functions(self):
-        for f in self.module.functions:
-            print(f.ftype.return_type, f.name, f.args)
-
-    def get_function_ptr(self, name):
-        fptr = self.fptr[name]
-        fptr.jit = self
-        return fptr
--- a/pystencils/opencl/opencljit.py
+++ b/pystencils/opencl/opencljit.py
-import numpy as np
-
-from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
-from pystencils.include import get_pystencils_include_path
-
-USE_FAST_MATH = True
-
-
-def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
-    """
-    Creates a **OpenCL** kernel function from an abstract syntax tree which
-    was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
-    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
-
-    Args:
-        opencl_queue: a valid :class:`pyopencl.CommandQueue`
-        opencl_ctx: a valid :class:`pyopencl.Context`
-        kernel_function_node: the abstract syntax tree
-        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
-                       returned kernel functor.
-
-    Returns:
-        compiled kernel as Python function
-    """
-    import pyopencl as cl
-    assert opencl_ctx, "No valid OpenCL context"
-    assert opencl_queue, "No valid OpenCL queue"
-
-    if argument_dict is None:
-        argument_dict = {}
-
-    # Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
-    kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
-    header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
-    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-    code = includes + "\n"
-    code += "#define FUNC_PREFIX __kernel\n"
-    code += "#define RESTRICT restrict\n\n"
-    code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
-    options = []
-    if USE_FAST_MATH:
-        options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only")
-    options.append("-I \"" + get_pystencils_include_path() + "\"")
-    mod = cl.Program(opencl_ctx, code).build(options=options)
-    func = getattr(mod, kernel_function_node.function_name)
-
-    parameters = kernel_function_node.get_parameters()
-
-    cache = {}
-    cache_values = []
-
-    def wrapper(**kwargs):
-        key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
-                         for k, v in kwargs.items()))
-        try:
-            args, block_and_thread_numbers = cache[key]
-            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
-        except KeyError:
-            full_arguments = argument_dict.copy()
-            full_arguments.update(kwargs)
-            shape = _check_arguments(parameters, full_arguments)
-
-            indexing = kernel_function_node.indexing
-            block_and_thread_numbers = indexing.call_parameters(shape)
-            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-            block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
-                                                                                  block_and_thread_numbers['grid']))
-
-            args = _build_numpy_argument_list(parameters, full_arguments)
-            args = [a.data if hasattr(a, 'data') else a for a in args]
-            cache[key] = (args, block_and_thread_numbers)
-            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
-
-    wrapper.ast = kernel_function_node
-    wrapper.parameters = kernel_function_node.get_parameters()
-    return wrapper
--- a/pystencils/rng.py
+++ b/pystencils/rng.py
-import numpy as np
-import sympy as sp
-
-from pystencils import TypedSymbol
-from pystencils.astnodes import LoopOverCoordinate
-from pystencils.backends.cbackend import CustomCodeNode
-
-
-def _get_rng_template(name, data_type, num_vars):
-    if data_type is np.float32:
-        c_type = "float"
-    elif data_type is np.float64:
-        c_type = "double"
-    template = "\n"
-    for i in range(num_vars):
-        template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
-    template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
-        .format(name, c_type, num_vars, *tuple(range(num_vars)))
-    return template
-
-
-def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
-    parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
-                                for i in range(dim)] + [0] * (3 - dim) + list(keys)
-
-    if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
-        return template.format(parameters=', '.join(str(p) for p in parameters),
-                               result_symbols=result_symbols)
-    else:
-        raise NotImplementedError("Not yet implemented for this backend")
-
-
-class RNGBase(CustomCodeNode):
-
-    def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
-        if keys is None:
-            keys = (0,) * self._num_keys
-        if len(keys) != self._num_keys:
-            raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
-        if len(offsets) != 3:
-            raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
-        self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
-        symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
-        super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
-        self._time_step = time_step
-        self._offsets = offsets
-        self.headers = ['"{}_rand.h"'.format(self._name)]
-        self.keys = tuple(keys)
-        self._args = sp.sympify((dim, time_step, keys))
-        self._dim = dim
-
-    @property
-    def args(self):
-        return self._args
-
-    @property
-    def undefined_symbols(self):
-        result = {a for a in (self._time_step, *self._offsets, *self.keys) if isinstance(a, sp.Symbol)}
-        loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
-                         for i in range(self._dim)]
-        result.update(loop_counters)
-        return result
-
-    def fast_subs(self, _):
-        return self  # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
-
-    def get_code(self, dialect, vector_instruction_set):
-        template = _get_rng_template(self._name, self._data_type, self._num_vars)
-        return _get_rng_code(template, dialect, vector_instruction_set,
-                             self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
-
-    def __repr__(self):
-        return (", ".join(['{}'] * self._num_vars) + " <- {}RNG").format(*self.result_symbols, self._name.capitalize())
-
-
-class PhiloxTwoDoubles(RNGBase):
-    _name = "philox"
-    _data_type = np.float64
-    _num_vars = 2
-    _num_keys = 2
-
-
-class PhiloxFourFloats(RNGBase):
-    _name = "philox"
-    _data_type = np.float32
-    _num_vars = 4
-    _num_keys = 2
-
-
-class AESNITwoDoubles(RNGBase):
-    _name = "aesni"
-    _data_type = np.float64
-    _num_vars = 2
-    _num_keys = 4
-
-
-class AESNIFourFloats(RNGBase):
-    _name = "aesni"
-    _data_type = np.float32
-    _num_vars = 4
-    _num_keys = 4
-
-
-def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
-    counter = 0
-    while True:
-        node = rng_node(*args, keys=(counter, seed), **kwargs)
-        inserted = False
-        for symbol in node.result_symbols:
-            if not inserted:
-                assignment_list.insert(0, node)
-                inserted = True
-            yield symbol
--- a/pystencils/sympy_gmpy_bug_workaround.py
+++ b/pystencils/sympy_gmpy_bug_workaround.py
-# Disable gmpy backend until this bug is resolved if joblib serialize
-# See https://github.com/sympy/sympy/pull/13530
-import os
-import warnings
-
-os.environ['MPMATH_NOGMPY'] = '1'
-try:
-    import mpmath.libmp
-    # In case the user has imported sympy first, then pystencils
-    if mpmath.libmp.BACKEND == 'gmpy':
-        warnings.warn("You are using the gmpy backend. You might encounter an error 'argument is not an mpz sympy'. "
-                      "This is due to a known bug in sympy/gmpy library. "
-                      "To prevent this, import pystencils first then sympy or set the environment variable "
-                      "MPMATH_NOGMPY=1")
-except ImportError:
-    pass
-
-__all__ = []
--- a/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
+++ b/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
-# FIXME
-# FIXME performance counters might be wrong. This will only affect the Benchmark model
-# FIXME bandwidth measurements need validation
-# FIXME
-
-kerncraft version: 0.7.2
-model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
-model type: Intel Core Skylake SP
-sockets: 2
-cores per socket: 4
-threads per core: 2
-NUMA domains per socket: 1
-cores per NUMA domain: 4
-clock: 3.6 GHz
-FLOPs per cycle:
-  SP:
-    total: 64
-    FMA: 64
-    ADD: 32
-    MUL: 32
-  DP:
-    total: 32
-    FMA: 32
-    ADD: 16
-    MUL: 16
-micro-architecture: SKX
-compiler:
-  !!omap
-  - icc: -O3 -fno-alias -xCORE-AVX512
-  - clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
-  - gcc: -O3 -march=skylake-avx512
-cacheline size: 64 B
-overlapping model:
-  ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
-  performance counter metric:
-          Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
-non-overlapping model:
-  ports: ["2D", "3D"]
-  performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
-memory hierarchy:
- level: L1
-  performance counter metrics:
-    accesses:  MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
-    misses: L1D_REPLACEMENT:PMC[0-3]
-    evicts: L2_TRANS_L1D_WB:PMC[0-3]
-  cache per group:
-    sets: 64
-    ways: 8
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: True
-    write_back: True
-    load_from: L2
-    store_to: L2
-  size per group: 32.00 kB
-  groups: 8
-  cores per group: 1
-  threads per group: 2
- level: L2
-  non-overlap upstream throughput: [64 B/cy, 'half-duplex']
-  performance counter metrics:
-    accesses: L1D_REPLACEMENT:PMC[0-3]
-    misses: L2_LINES_IN_ALL:PMC[0-3]
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-  cache per group:
-    sets: 1024
-    ways: 16
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: True
-    write_back: True
-    load_from: null  # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
-    victims_to: L3  # all victims, modified or not are passed onto L3
-    store_to: L3
-  size per group: 1.00 MB
-  groups: 8
-  cores per group: 1
-  threads per group: 2
- level: L3
-  non-overlap upstream throughput: [16 B/cy, 'full-duplex']
-  performance counter metrics:
-    accesses: L2_LINES_IN_ALL:PMC[0-3]
-    # FIXME not all misses in L2 lead to loads from L3, only the hits do
-    misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
-             CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
-             CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
-             CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
-             CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
-             CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-  cache per group:
-    sets: 16896
-    # TODO is actuall something else, but necessary to get to 16.5 MB
-    ways: 16
-    # TODO is actually 11, but pycachesim only supports powers of two
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: False
-    write_back: True
-  size per group: 16.50 MB
-  groups: 2
-  cores per group: 4
-  threads per group: 8
- level: MEM
-  cores per group: 4
-  threads per group: 8
-  non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
-  penalty cycles per read stream: 0
-  size per group:
-benchmarks:
-  kernels:
-    load:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 0
-        bytes: 0.00 B
-      FLOPs per iteration: 0
-    copy:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 0
-    update:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 1
-        bytes: 8.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 0
-    triad:
-      read streams:
-        streams: 3
-        bytes: 24.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 2
-    daxpy:
-      read streams:
-        streams: 2
-        bytes: 16.00 B
-      read+write streams:
-        streams: 1
-        bytes: 8.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 2
-  measurements:
-    L1:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        size per thread:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        total size:
-        - 21.12 kB
-        - 42.24 kB
-        - 63.36 kB
-        - 84.48 kB
-        results:
-          load:
-          - 42.98 GB/s
-          - 85.08 GB/s
-          - 127.45 GB/s
-          - 169.92 GB/s
-          copy:
-          - 56.07 GB/s
-          - 111.50 GB/s
-          - 164.90 GB/s
-          - 221.50 GB/s
-          update:
-          - 56.54 GB/s
-          - 112.25 GB/s
-          - 168.50 GB/s
-          - 224.75 GB/s
-          triad:
-          - 45.90 GB/s
-          - 89.81 GB/s
-          - 127.29 GB/s
-          - 169.57 GB/s
-          daxpy:
-          - 36.62 GB/s
-          - 71.30 GB/s
-          - 103.52 GB/s
-          - 135.26 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        size per thread:
-        - 10.56 kB
-        - 10.56 kB
-        - 10.56 kB
-        - 10.56 kB
-        total size:
-        - 21.12 kB
-        - 42.24 kB
-        - 63.36 kB
-        - 84.48 kB
-        results:
-          load:
-          - 49.61 GB/s
-          - 98.80 GB/s
-          - 147.98 GB/s
-          - 198.22 GB/s
-          copy:
-          - 55.98 GB/s
-          - 111.56 GB/s
-          - 167.08 GB/s
-          - 220.42 GB/s
-          update:
-          - 56.53 GB/s
-          - 112.72 GB/s
-          - 168.95 GB/s
-          - 225.31 GB/s
-          triad:
-          - 54.01 GB/s
-          - 104.58 GB/s
-          - 153.02 GB/s
-          - 200.93 GB/s
-          daxpy:
-          - 41.11 GB/s
-          - 80.28 GB/s
-          - 115.71 GB/s
-          - 152.81 GB/s
-    L2:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        size per thread:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        total size:
-        - 660.00 kB
-        - 1.32 MB
-        - 1.98 MB
-        - 2.64 MB
-        results:
-          load:
-          - 27.15 GB/s
-          - 54.09 GB/s
-          - 80.61 GB/s
-          - 106.41 GB/s
-          copy:
-          - 43.53 GB/s
-          - 90.07 GB/s
-          - 127.73 GB/s
-          - 171.81 GB/s
-          update:
-          - 50.38 GB/s
-          - 98.47 GB/s
-          - 147.91 GB/s
-          - 197.20 GB/s
-          triad:
-          - 43.38 GB/s
-          - 83.72 GB/s
-          - 124.83 GB/s
-          - 166.04 GB/s
-          daxpy:
-          - 36.29 GB/s
-          - 71.29 GB/s
-          - 103.33 GB/s
-          - 136.48 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        size per thread:
-        - 330.00 kB
-        - 330.00 kB
-        - 330.00 kB
-        - 330.00 kB
-        total size:
-        - 660.00 kB
-        - 1.32 MB
-        - 1.98 MB
-        - 2.64 MB
-        results:
-          load:
-          - 35.29 GB/s
-          - 70.28 GB/s
-          - 104.67 GB/s
-          - 139.63 GB/s
-          copy:
-          - 42.23 GB/s
-          - 83.70 GB/s
-          - 124.33 GB/s
-          - 167.50 GB/s
-          update:
-          - 50.09 GB/s
-          - 99.77 GB/s
-          - 149.87 GB/s
-          - 198.82 GB/s
-          triad:
-          - 52.38 GB/s
-          - 100.00 GB/s
-          - 147.40 GB/s
-          - 193.31 GB/s
-          daxpy:
-          - 41.14 GB/s
-          - 80.22 GB/s
-          - 116.23 GB/s
-          - 155.08 GB/s
-    L3:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        size per thread:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        total size:
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        results:
-          load:
-          - 22.40 GB/s
-          - 44.77 GB/s
-          - 65.71 GB/s
-          - 89.26 GB/s
-          copy:
-          - 25.32 GB/s
-          - 49.70 GB/s
-          - 72.89 GB/s
-          - 98.62 GB/s
-          update:
-          - 41.24 GB/s
-          - 81.14 GB/s
-          - 122.22 GB/s
-          - 166.44 GB/s
-          triad:
-          - 25.61 GB/s
-          - 50.02 GB/s
-          - 73.23 GB/s
-          - 98.95 GB/s
-          daxpy:
-          - 32.07 GB/s
-          - 62.65 GB/s
-          - 89.91 GB/s
-          - 120.65 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        size per thread:
-        - 5.28 MB
-        - 2.64 MB
-        - 1.76 MB
-        - 1.32 MB
-        total size:
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        results:
-          load:
-          - 26.18 GB/s
-          - 51.85 GB/s
-          - 75.82 GB/s
-          - 101.39 GB/s
-          copy:
-          - 26.22 GB/s
-          - 51.83 GB/s
-          - 76.40 GB/s
-          - 102.84 GB/s
-          update:
-          - 43.51 GB/s
-          - 86.75 GB/s
-          - 129.86 GB/s
-          - 174.54 GB/s
-          triad:
-          - 26.39 GB/s
-          - 51.80 GB/s
-          - 76.27 GB/s
-          - 102.66 GB/s
-          daxpy:
-          - 37.43 GB/s
-          - 73.16 GB/s
-          - 106.53 GB/s
-          - 142.76 GB/s
-    MEM:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        size per thread:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        total size:
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        results:
-          load:
-          - 12.03 GB/s
-          - 24.38 GB/s
-          - 34.83 GB/s
-          - 45.05 GB/s
-          copy:
-          - 12.32 GB/s
-          - 24.40 GB/s
-          - 32.82 GB/s
-          - 37.00 GB/s
-          update:
-          - 20.83 GB/s
-          - 40.25 GB/s
-          - 48.81 GB/s
-          - 54.84 GB/s
-          triad:
-          - 11.64 GB/s
-          - 23.17 GB/s
-          - 34.78 GB/s
-          - 42.97 GB/s
-          daxpy:
-          - 17.69 GB/s
-          - 34.02 GB/s
-          - 48.12 GB/s
-          - 55.73 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        size per thread:
-        - 120.00 MB
-        - 60.00 MB
-        - 40.00 MB
-        - 30.00 MB
-        total size:
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        results:
-          load:
-          - 15.33 GB/s
-          - 28.32 GB/s
-          - 41.34 GB/s
-          - 53.02 GB/s
-          copy:
-          - 13.96 GB/s
-          - 26.61 GB/s
-          - 34.39 GB/s
-          - 38.96 GB/s
-          update:
-          - 26.47 GB/s
-          - 47.82 GB/s
-          - 56.70 GB/s
-          - 62.78 GB/s
-          triad:
-          - 14.42 GB/s
-          - 26.66 GB/s
-          - 36.94 GB/s
-          - 44.01 GB/s
-          daxpy:
-          - 20.96 GB/s
-          - 39.12 GB/s
-          - 51.55 GB/s
-          - 58.37 GB/s
--- a/pystencils_tests/benchmark/benchmark.py
+++ b/pystencils_tests/benchmark/benchmark.py
-import math
-import os
-import time
-
-import numpy as np
-import sympy as sp
-from git import Repo
-from influxdb import InfluxDBClient
-from kerncraft.machinemodel import MachineModel
-from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
-from kerncraft.prefixedunit import PrefixedUnit
-
-from pystencils import Assignment, Field, create_kernel
-from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
-
-
-def output_benchmark(analysis):
-    output = {}
-    keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
-            'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
-            'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-
-    for cache, metrics in analysis['data transfers'].items():
-        for metric_name, metric_value in metrics.items():
-            fixed = metric_value.with_prefix('')
-            output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
-
-    for level, value in analysis['ECM'].items():
-        output['Phenomenological ECM ' + level + ' cy/CL'] = value
-    return output
-
-
-def output_ecm(analysis):
-    output = {}
-    keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-
-    if 'memory bandwidth kernel' in analysis:
-        output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
-               analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
-
-    output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
-
-    for key, value in analysis['cycles']:
-        output[key] = value
-    return output
-
-
-def output_roofline(analysis):
-    output = {}
-    keys = ['min performance']  # 'bottleneck level'
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-    # TODO save bottleneck information (compute it here)
-
-    # fixed = analysis['max_flops'].with_prefix('G')
-    # output['max GFlop/s'] = fixed.value
-
-    # if analysis['min performance'] > max_flops:
-    #    # CPU bound
-    #    print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
-    #    print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
-    # else:
-    # Memory bound
-    bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
-    output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
-    output['bottleneck level'] = bottleneck['level']
-    output['bottleneck bw kernel'] = bottleneck['bw kernel']
-    output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
-
-    for i, level in enumerate(analysis['mem bottlenecks']):
-        if level is None:
-            continue
-        for key, value in level.items():
-            if isinstance(value, PrefixedUnit):
-                fixed = value.with_prefix('G')
-                output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
-                    fixed.value, float) and math.isinf(fixed.value) else fixed.value
-            else:
-                output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
-                    value) else value
-    return output
-
-
-def output_roofline_iaca(analysis):
-    output = {}
-    keys = ['min performance']  # 'bottleneck level'
-    copies = {key: analysis[key] for key in keys}
-    # output.update(copies)
-    # TODO save bottleneck information (compute it here)
-
-    # fixed = analysis['max_flops'].with_prefix('G')
-    # output['max GFlop/s'] = fixed.value
-
-    # if analysis['min performance'] > max_flops:
-    #    # CPU bound
-    #    print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
-    #    print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
-    # else:
-    # Memory bound
-    bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
-    output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
-    output['bottleneck level'] = bottleneck['level']
-    output['bottleneck bw kernel'] = bottleneck['bw kernel']
-    output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
-
-    for i, level in enumerate(analysis['mem bottlenecks']):
-        if level is None:
-            continue
-        for key, value in level.items():
-            if isinstance(value, PrefixedUnit):
-                fixed = value.with_prefix('G')
-                output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
-                    fixed.value, float) and math.isinf(fixed.value) else fixed.value
-            else:
-                output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
-                    value) else value
-    return output
-
-
-def report_analysis(ast, models, machine, tags, fields=None):
-    kernel = PyStencilsKerncraftKernel(ast, machine)
-    client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
-                            'roggan', 'pystencils')
-    repo = Repo(search_parent_directories=True)
-    commit = repo.head.commit
-    point_time = int(time.time())
-
-    for model in models:
-        benchmark = model(kernel, machine, KerncraftParameters())
-        benchmark.analyze()
-        analysis = benchmark.results
-        if model is Benchmark:
-            output = output_benchmark(analysis)
-        elif model is ECM:
-            output = output_ecm(analysis)
-        elif model is Roofline:
-            output = output_roofline(analysis)
-        elif model is RooflineIACA:
-            output = output_roofline_iaca(analysis)
-        else:
-            raise ValueError('No valid model for analysis given!')
-
-        if fields is not None:
-            output.update(fields)
-
-        output['commit'] = commit.hexsha
-
-        json_body = [
-            {
-                'measurement': model.__name__,
-                'tags': tags,
-                'time': point_time,
-                'fields': output
-            }
-        ]
-        client.write_points(json_body, time_precision='s')
-
-
-def main():
-    size = [20, 200, 200]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
-    s = sp.Symbol("s")
-    rhs = a[0, -1, 0] + a[0, 1, 0] + \
-          a[-1, 0, 0] + a[1, 0, 0] + \
-          a[0, 0, -1] + a[0, 0, 1]
-
-    update_rule = Assignment(b[0, 0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    input_folder = "./"
-    machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
-    machine = MachineModel(path_to_yaml=machine_file_path)
-    tags = {
-        'host': os.uname()[1],
-        'project': 'pystencils',
-        'kernel': 'jacobi_3D ' + str(size)
-    }
-
-    report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
-
-
-if __name__ == '__main__':
-    main()
--- a/pystencils_tests/benchmark/generate.py
+++ b/pystencils_tests/benchmark/generate.py
-import numpy as np
-import sympy as sp
-
-from pystencils import Assignment, Field, create_kernel
-
-
-def meassure():
-    size = [30, 50, 3]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
-    s = sp.Symbol("s")
-    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
-    updateRule = Assignment(b[0, 0], s * rhs)
-    print(updateRule)
-
-    ast = create_kernel([updateRule])
-
-    # benchmark = generate_benchmark(ast)
-    # main = benchmark[0]
-    # kernel = benchmark[1]
-    # with open('src/main.cpp', 'w') as file:
-    #     file.write(main)
-    # with open('src/kernel.cpp', 'w') as file:
-    #     file.write(kernel)
-
-    func = ast.compile({'omega': 2/3})
-
-    from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
-    from pystencils.kerncraft_coupling import BenchmarkAnalysis
-    from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
-    from kerncraft.machinemodel import MachineModel
-    from kerncraft.models import ECMData
-
-
-    machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
-    machine = MachineModel(path_to_yaml=machineFilePath)
-
-
-    benchmark = BenchmarkAnalysis(ast, machine)
-    #TODO what do i want to do with benchmark?
-
-    kernel = PyStencilsKerncraftKernel(ast)
-    model = ECMData(kernel, machine, KerncraftParameters())
-    model.analyze()
-    model.report()
-
-
-if __name__ == "__main__":
-    meassure()
No results found