Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 114 additions and 4053 deletions
@@ -10,13 +10,27 @@ AssignmentCollection
:members:
SimplificationStrategy
======================
.. autoclass:: pystencils.simp.SimplificationStrategy
:members:
Simplifications
===============
.. automodule:: pystencils.simp
:members:
.. automodule:: pystencils.simp.simplifications
:members:
Subexpression insertion
=======================
Subexpression insertion inlines subexpressions whose presence does not reduce the number of FLOPs.
For example, a constant value kept as a subexpression introduces a new variable in the generated code,
which occupies a register slot. An alias of a single variable, on the other hand, can simply be
inserted into all assignments that use it.
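For illustration, inlining an alias by hand looks like this (a minimal sketch
in plain SymPy; the symbol names are made up):

.. code-block:: python

    import sympy as sp

    x, y, s = sp.symbols("x y s")
    assignments = [sp.Eq(y, s + s**2)]

    # s is a plain alias for x, so inserting it everywhere adds no FLOPs
    # and frees the register slot that s would otherwise occupy
    inlined = [eq.subs(s, x) for eq in assignments]  # [Eq(y, x**2 + x)]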
.. automodule:: pystencils.simp.subexpression_insertion
:members:
[project]
name = "pystencils"
description = "Speeding up stencil computations on CPUs and GPUs"
dynamic = ["version"]
readme = "README.md"
authors = [
{ name = "Martin Bauer" },
{ name = "Jan Hönig " },
{ name = "Markus Holzer" },
{ name = "Frederik Hennig" },
{ email = "cs10-codegen@fau.de" },
]
license = { file = "COPYING.txt" }
requires-python = ">=3.10"
dependencies = ["sympy>=1.9,<=1.12.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml", "fasteners"]
classifiers = [
"Development Status :: 4 - Beta",
"Framework :: Jupyter",
"Topic :: Software Development :: Code Generators",
"Topic :: Scientific/Engineering :: Physics",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
]
[project.urls]
"Bug Tracker" = "https://i10git.cs.fau.de/pycodegen/pystencils/-/issues"
"Documentation" = "https://pycodegen.pages.i10git.cs.fau.de/pystencils/"
"Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils"
[project.optional-dependencies]
gpu = ['cupy']
alltrafos = ['islpy', 'py-cpuinfo']
bench_db = ['blitzdb', 'pymongo', 'pandas']
interactive = [
'matplotlib',
'ipy_table',
'imageio',
'jupyter',
'pyevtk',
'rich',
'graphviz',
]
use_cython = [
'Cython'
]
doc = [
'sphinx',
'sphinx_rtd_theme',
'nbsphinx',
'sphinxcontrib-bibtex',
'sphinx_autodoc_typehints',
'pandoc',
]
tests = [
'pytest',
'pytest-cov',
'pytest-html',
'ansi2html',
'pytest-xdist',
'flake8',
'nbformat',
'nbconvert',
'ipython',
'matplotlib',
'py-cpuinfo',
'randomgen>=1.18',
]
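# The groups above are standard pip extras; a usage sketch (extras names as
# defined in [project.optional-dependencies]):
#   pip install pystencils[interactive]
#   pip install "pystencils[gpu,tests]"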
[build-system]
requires = [
"setuptools>=61",
"versioneer[toml]>=0.29",
# 'Cython'
]
build-backend = "setuptools.build_meta"
[tool.setuptools.package-data]
pystencils = [
"include/*.h",
"boundaries/createindexlistcython.pyx"
]
[tool.setuptools.packages.find]
where = ["src"]
include = ["pystencils", "pystencils.*"]
namespaces = false
[tool.versioneer]
# See the docstring in versioneer.py for instructions. Note that you must
# re-run 'versioneer.py setup' after changing this section, and commit the
# resulting files.
VCS = "git"
style = "pep440"
versionfile_source = "src/pystencils/_version.py"
versionfile_build = "pystencils/_version.py"
tag_prefix = "release/"
parentdir_prefix = "pystencils-"
"""
Provides tools for generation of auto-differentiable operations.
See https://github.com/theHamsta/pystencils_autodiff
Installation:
.. code-block:: bash
pip install pystencils-autodiff
"""
raise NotImplementedError('pystencils-autodiff is not installed. Run `pip install pystencils-autodiff`')
__prof_trigger
printf
__syncthreads
__syncthreads_count
__syncthreads_and
__syncthreads_or
__syncwarp
__threadfence
__threadfence_block
__threadfence_system
atomicAdd
atomicSub
atomicExch
atomicMin
atomicMax
atomicInc
atomicDec
atomicAnd
atomicOr
atomicXor
atomicCAS
__all_sync
__any_sync
__ballot_sync
__activemask
__shfl_sync
__shfl_up_sync
__shfl_down_sync
__shfl_xor_sync
__match_any_sync
__match_all_sync
__isGlobal
__isShared
__isConstant
__isLocal
tex1Dfetch
tex1D
tex2D
tex3D
sqrtf
rsqrtf
cbrtf
rcbrtf
hypotf
rhypotf
norm3df
rnorm3df
norm4df
rnorm4df
normf
rnormf
expf
exp2f
exp10f
expm1f
logf
log2f
log10f
log1pf
sinf
cosf
tanf
sincosf
sinpif
cospif
sincospif
asinf
acosf
atanf
atan2f
sinhf
coshf
tanhf
asinhf
acoshf
atanhf
powf
erff
erfcf
erfinvf
erfcinvf
erfcxf
normcdff
normcdfinvf
lgammaf
tgammaf
fmaf
frexpf
ldexpf
scalbnf
scalblnf
logbf
ilogbf
j0f
j1f
jnf
y0f
y1f
ynf
cyl_bessel_i0f
cyl_bessel_i1f
fmodf
remainderf
remquof
modff
fdimf
truncf
roundf
rintf
nearbyintf
ceilf
floorf
lrintf
lroundf
llrintf
llroundf
sqrt
rsqrt
cbrt
rcbrt
hypot
rhypot
norm3d
rnorm3d
norm4d
rnorm4d
norm
rnorm
exp
exp2
exp10
expm1
log
log2
log10
log1p
sin
cos
tan
sincos
sinpi
cospi
sincospi
asin
acos
atan
atan2
sinh
cosh
tanh
asinh
acosh
atanh
pow
erf
erfc
erfinv
erfcinv
erfcx
normcdf
normcdfinv
lgamma
tgamma
fma
frexp
ldexp
scalbn
scalbln
logb
ilogb
j0
j1
jn
y0
y1
yn
cyl_bessel_i0
cyl_bessel_i1
fmod
remainder
remquo
mod
fdim
trunc
round
rint
nearbyint
ceil
floor
lrint
lround
llrint
llround
__fdividef
__sinf
__cosf
__tanf
__sincosf
__logf
__log2f
__log10f
__expf
__exp10f
__powf
__fadd_rn
__fsub_rn
__fmul_rn
__fmaf_rn
__frcp_rn
__fsqrt_rn
__frsqrt_rn
__fdiv_rn
__fadd_rz
__fsub_rz
__fmul_rz
__fmaf_rz
__frcp_rz
__fsqrt_rz
__frsqrt_rz
__fdiv_rz
__fadd_ru
__fsub_ru
__fmul_ru
__fmaf_ru
__frcp_ru
__fsqrt_ru
__frsqrt_ru
__fdiv_ru
__fadd_rd
__fsub_rd
__fmul_rd
__fmaf_rd
__frcp_rd
__fsqrt_rd
__frsqrt_rd
__fdiv_rd
__dadd_rn
__dsub_rn
__dmul_rn
__fma_rn
__ddiv_rn
__drcp_rn
__dsqrt_rn
__dadd_rz
__dsub_rz
__dmul_rz
__fma_rz
__ddiv_rz
__drcp_rz
__dsqrt_rz
__dadd_ru
__dsub_ru
__dmul_ru
__fma_ru
__ddiv_ru
__drcp_ru
__dsqrt_ru
__dadd_rd
__dsub_rd
__dmul_rd
__fma_rd
__ddiv_rd
__drcp_rd
__dsqrt_rd
acos
acosh
acospi
asin
asinh
asinpi
atan
atan2
atanh
atanpi
atan2pi
cbrt
ceil
copysign
cos
cosh
cospi
erfc
erf
exp
exp2
exp10
expm1
fabs
fdim
floor
fma
fmax
fmin
fmod
fract
frexp
hypot
ilogb
ldexp
lgamma
lgamma_r
log
log2
log10
log1p
logb
mad
maxmag
minmag
modf
nextafter
pow
pown
powr
remquo
rint
rootn
round
rsqrt
sin
sincos
sinh
sinpi
sqrt
tan
tanh
tanpi
tgamma
trunc
half_cos
half_divide
half_exp
half_exp2
half_exp10
half_log
half_log2
half_log10
half_powr
half_recip
half_rsqrt
half_sin
half_sqrt
half_tan
native_cos
native_divide
native_exp
native_exp2
native_exp10
native_log
native_log2
native_log10
native_powr
native_recip
native_rsqrt
native_sin
native_sqrt
native_tan
from os.path import dirname, join
import pystencils.data_types
from pystencils.astnodes import Node
from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
from pystencils.backends.cuda_backend import CudaBackend, CudaSympyPrinter
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
with open(join(dirname(__file__), 'opencl1.1_known_functions.txt')) as f:
lines = f.readlines()
OPENCL_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l.strip()}
def generate_opencl(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
"""Prints an abstract syntax tree node (made for `Target` 'GPU') as OpenCL code. # TODO Backend instead of Target?
Args:
ast_node: ast representation of kernel
signature_only: generate signature without function body
custom_backend: use own custom printer for code generation
with_globals: enable usage of global variables
Returns:
OpenCL code for the ast node and its descendants
"""
return generate_c(ast_node, signature_only, dialect=Backend.OPENCL,
custom_backend=custom_backend, with_globals=with_globals)
class OpenClBackend(CudaBackend):
def __init__(self,
sympy_printer=None,
signature_only=False):
if not sympy_printer:
sympy_printer = OpenClSympyPrinter()
super().__init__(sympy_printer, signature_only)
self._dialect = Backend.OPENCL
def _print_Type(self, node):
code = super()._print_Type(node)
if isinstance(node, pystencils.data_types.PointerType):
return "__global " + code
else:
return code
def _print_ThreadBlockSynchronization(self, node):
raise NotImplementedError()
def _print_TextureDeclaration(self, node):
raise NotImplementedError()
class OpenClSympyPrinter(CudaSympyPrinter):
language = "OpenCL"
DIMENSION_MAPPING = {
'x': '0',
'y': '1',
'z': '2'
}
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'get_group_id',
'threadIdx': 'get_local_id',
'blockDim': 'get_local_size',
'gridDim': 'get_num_groups'
}
def __init__(self):
CustomSympyPrinter.__init__(self)
self.known_functions = OPENCL_KNOWN_FUNCTIONS
def _print_Type(self, node):
code = super()._print_Type(node)
if isinstance(node, pystencils.data_types.PointerType):
return "__global " + code
else:
return code
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
dimension = self.DIMENSION_MAPPING[dimension]
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
return f"(int64_t) {function_name}({dimension})"
def _print_TextureAccess(self, node):
raise NotImplementedError()
# For math functions, OpenCL is more similar to the C++ printer CustomSympyPrinter
# since built-in math functions are generic.
# In CUDA, you have to differentiate between `sin` and `sinf`
try:
_print_math_func = CustomSympyPrinter._print_math_func
except AttributeError:
pass
_print_Pow = CustomSympyPrinter._print_Pow
def _print_Function(self, expr):
if isinstance(expr, fast_division):
return "native_divide(%s, %s)" % tuple(self._print(a) for a in expr.args)
elif isinstance(expr, fast_sqrt):
    # fast_sqrt / fast_inv_sqrt carry exactly one argument
    return f"native_sqrt({self._print(expr.args[0])})"
elif isinstance(expr, fast_inv_sqrt):
    return f"native_rsqrt({self._print(expr.args[0])})"
return CustomSympyPrinter._print_Function(self, expr)
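# Usage sketch: generating OpenCL code for a simple Jacobi-style update
# (hedged example; `ps.fields`/`ps.create_kernel` are the regular pystencils
# entry points, and this era of pystencils selected the GPU target via the
# string 'gpu' -- newer versions use the Target enum):
#
#   >>> import pystencils as ps
#   >>> src, dst = ps.fields("src, dst: float32[2D]")
#   >>> update = ps.Assignment(dst.center,
#   ...                        (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
#   >>> ast = ps.create_kernel(update, target='gpu')
#   >>> opencl_code = generate_opencl(ast)  # OpenCL C source as a string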
import ctypes
from collections import defaultdict
from functools import partial
import numpy as np
import sympy as sp
import sympy.codegen.ast
from sympy.core.cache import cacheit
from sympy.logic.boolalg import Boolean, BooleanFunction
import pystencils
from pystencils.cache import memorycache, memorycache_if_hashable
from pystencils.utils import all_equal
try:
import llvmlite.ir as ir
except ImportError as e:
ir = None
_ir_importerror = e
def typed_symbols(names, dtype, *args):
symbols = sp.symbols(names, *args)
if isinstance(symbols, tuple):
return tuple(TypedSymbol(str(s), dtype) for s in symbols)
else:
return TypedSymbol(str(symbols), dtype)
def type_all_numbers(expr, dtype):
substitutions = {a: cast_func(a, dtype) for a in expr.atoms(sp.Number)}
return expr.subs(substitutions)
def matrix_symbols(names, dtype, rows, cols):
if isinstance(names, str):
names = names.replace(' ', '').split(',')
matrices = []
for n in names:
symbols = typed_symbols(f"{n}:{rows * cols}", dtype)
matrices.append(sp.Matrix(rows, cols, lambda i, j: symbols[i * cols + j]))
return tuple(matrices)
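# Example (doctest-style sketch; ``TypedSymbol`` and ``create_type`` are
# defined further below in this module):
#
#   >>> x, y = typed_symbols("x, y", "float64")
#   >>> x.dtype
#   double
#   >>> A, = matrix_symbols("A", "float64", 2, 2)
#   >>> A.shape
#   (2, 2)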
def assumptions_from_dtype(dtype):
"""Derives SymPy assumptions from :class:`BasicType` or a Numpy dtype
Args:
dtype (BasicType, np.dtype): a Numpy data type
Returns:
A dict of SymPy assumptions
"""
if hasattr(dtype, 'numpy_dtype'):
dtype = dtype.numpy_dtype
assumptions = dict()
try:
if np.issubdtype(dtype, np.integer):
assumptions.update({'integer': True})
if np.issubdtype(dtype, np.unsignedinteger):
assumptions.update({'negative': False})
if np.issubdtype(dtype, np.integer) or \
np.issubdtype(dtype, np.floating):
assumptions.update({'real': True})
except Exception:
pass
return assumptions
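# Example (doctest-style sketch; the dict ordering follows the checks above):
#
#   >>> assumptions_from_dtype(np.uint16)
#   {'integer': True, 'negative': False, 'real': True}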
# noinspection PyPep8Naming
class address_of(sp.Function):
is_Atom = True
def __new__(cls, arg):
obj = sp.Function.__new__(cls, arg)
return obj
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
@property
def dtype(self):
if hasattr(self.args[0], 'dtype'):
return PointerType(self.args[0].dtype, restrict=True)
else:
return PointerType('void', restrict=True)
# noinspection PyPep8Naming
class cast_func(sp.Function):
is_Atom = True
def __new__(cls, *args, **kwargs):
expr, dtype, *other_args = args
if not isinstance(dtype, Type):
dtype = create_type(dtype)
# to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well
# however, a cast_function should only be a boolean if its argument is a boolean, otherwise this leads
# to problems when for example comparing cast_func's for equality
#
# lhs = bitwise_and(a, cast_func(1, 'int'))
# rhs = cast_func(0, 'int')
# print( sp.Ne(lhs, rhs) ) # would give true if all cast_funcs are booleans
# -> thus a separate class boolean_cast_func is introduced
if isinstance(expr, Boolean) and (not isinstance(expr, TypedSymbol) or expr.dtype == BasicType(bool)):
cls = boolean_cast_func
return sp.Function.__new__(cls, expr, dtype, *other_args, **kwargs)
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
def _eval_evalf(self, *args, **kwargs):
return self.args[0].evalf()
@property
def dtype(self):
return self.args[1]
@property
def is_integer(self):
"""
Uses Numpy type hierarchy to determine :func:`sympy.Expr.is_integer` predicate
For reference: Numpy type hierarchy https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html
"""
if hasattr(self.dtype, 'numpy_dtype'):
return np.issubdtype(self.dtype.numpy_dtype, np.integer) or super().is_integer
else:
return super().is_integer
@property
def is_negative(self):
"""
See :func:`.TypedSymbol.is_integer`
"""
if hasattr(self.dtype, 'numpy_dtype'):
if np.issubdtype(self.dtype.numpy_dtype, np.unsignedinteger):
return False
return super().is_negative
@property
def is_nonnegative(self):
"""
See :func:`.TypedSymbol.is_integer`
"""
if self.is_negative is False:
return True
else:
return super().is_nonnegative
@property
def is_real(self):
"""
See :func:`.TypedSymbol.is_integer`
"""
if hasattr(self.dtype, 'numpy_dtype'):
return np.issubdtype(self.dtype.numpy_dtype, np.integer) or \
np.issubdtype(self.dtype.numpy_dtype, np.floating) or \
super().is_real
else:
return super().is_real
# noinspection PyPep8Naming
class boolean_cast_func(cast_func, Boolean):
pass
# noinspection PyPep8Naming
class vector_memory_access(cast_func):
# Arguments are: read/write expression, type, aligned, nontemporal, mask (or none), stride
nargs = (6,)
# noinspection PyPep8Naming
class reinterpret_cast_func(cast_func):
pass
# noinspection PyPep8Naming
class pointer_arithmetic_func(sp.Function, Boolean):
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
class TypedSymbol(sp.Symbol):
def __new__(cls, *args, **kwds):
obj = TypedSymbol.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, name, dtype, **kwargs):
assumptions = assumptions_from_dtype(dtype)
assumptions.update(kwargs)
obj = super(TypedSymbol, cls).__xnew__(cls, name, **assumptions)
try:
obj._dtype = create_type(dtype)
except (TypeError, ValueError):
# on error keep the string
obj._dtype = dtype
return obj
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
@property
def dtype(self):
return self._dtype
def _hashable_content(self):
return super()._hashable_content(), hash(self._dtype)
def __getnewargs__(self):
return self.name, self.dtype
def __getnewargs_ex__(self):
return (self.name, self.dtype), self.assumptions0
@property
def canonical(self):
return self
@property
def reversed(self):
return self
@property
def headers(self):
headers = []
try:
if np.issubdtype(self.dtype.numpy_dtype, np.complexfloating):
headers.append('"cuda_complex.hpp"')
except Exception:
pass
try:
if np.issubdtype(self.dtype.base_type.numpy_dtype, np.complexfloating):
headers.append('"cuda_complex.hpp"')
except Exception:
pass
return headers
def create_type(specification):
"""Creates a subclass of Type according to a string or an object of subclass Type.
Args:
specification: Type object, or a string
Returns:
Type object, or a new Type object parsed from the string
"""
if isinstance(specification, Type):
return specification
else:
numpy_dtype = np.dtype(specification)
if numpy_dtype.fields is None:
return BasicType(numpy_dtype, const=False)
else:
return StructType(numpy_dtype, const=False)
@memorycache(maxsize=64)
def create_composite_type_from_string(specification):
"""Creates a new Type object from a c-like string specification.
Args:
specification: Specification string
Returns:
Type object
"""
specification = specification.lower().split()
parts = []
current = []
for s in specification:
if s == '*':
parts.append(current)
current = [s]
else:
current.append(s)
if len(current) > 0:
parts.append(current)
# Parse native part
base_part = parts.pop(0)
const = False
if 'const' in base_part:
const = True
base_part.remove('const')
assert len(base_part) == 1
if base_part[0][-1] == "*":
base_part[0] = base_part[0][:-1]
parts.append('*')
current_type = BasicType(np.dtype(base_part[0]), const)
# Parse pointer parts
for part in parts:
restrict = False
const = False
if 'restrict' in part:
restrict = True
part.remove('restrict')
if 'const' in part:
const = True
part.remove("const")
assert len(part) == 1 and part[0] == '*'
current_type = PointerType(current_type, const, restrict)
return current_type
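# Example (doctest-style sketch):
#
#   >>> create_composite_type_from_string("const double * restrict")
#   double const * RESTRICT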
def get_base_type(data_type):
while data_type.base_type is not None:
data_type = data_type.base_type
return data_type
def to_ctypes(data_type):
"""
Transforms a given Type into ctypes
:param data_type: Subclass of Type
:return: ctypes type object
"""
if isinstance(data_type, PointerType):
return ctypes.POINTER(to_ctypes(data_type.base_type))
elif isinstance(data_type, StructType):
return ctypes.POINTER(ctypes.c_uint8)
else:
return to_ctypes.map[data_type.numpy_dtype]
to_ctypes.map = {
np.dtype(np.int8): ctypes.c_int8,
np.dtype(np.int16): ctypes.c_int16,
np.dtype(np.int32): ctypes.c_int32,
np.dtype(np.int64): ctypes.c_int64,
np.dtype(np.uint8): ctypes.c_uint8,
np.dtype(np.uint16): ctypes.c_uint16,
np.dtype(np.uint32): ctypes.c_uint32,
np.dtype(np.uint64): ctypes.c_uint64,
np.dtype(np.float32): ctypes.c_float,
np.dtype(np.float64): ctypes.c_double,
}
def ctypes_from_llvm(data_type):
if not ir:
raise _ir_importerror
if isinstance(data_type, ir.PointerType):
ctype = ctypes_from_llvm(data_type.pointee)
if ctype is None:
return ctypes.c_void_p
else:
return ctypes.POINTER(ctype)
elif isinstance(data_type, ir.IntType):
if data_type.width == 8:
return ctypes.c_int8
elif data_type.width == 16:
return ctypes.c_int16
elif data_type.width == 32:
return ctypes.c_int32
elif data_type.width == 64:
return ctypes.c_int64
else:
raise ValueError("Int width %d is not supported" % data_type.width)
elif isinstance(data_type, ir.FloatType):
return ctypes.c_float
elif isinstance(data_type, ir.DoubleType):
return ctypes.c_double
elif isinstance(data_type, ir.VoidType):
return None # Void type is not supported by ctypes
else:
raise NotImplementedError(f'Data type {type(data_type)} of {data_type} is not supported yet')
def to_llvm_type(data_type, nvvm_target=False):
"""
Transforms a given type into ctypes
:param data_type: Subclass of Type
:return: llvmlite type object
"""
if not ir:
raise _ir_importerror
if isinstance(data_type, PointerType):
return to_llvm_type(data_type.base_type).as_pointer(1 if nvvm_target else 0)
else:
return to_llvm_type.map[data_type.numpy_dtype]
if ir:
to_llvm_type.map = {
np.dtype(np.int8): ir.IntType(8),
np.dtype(np.int16): ir.IntType(16),
np.dtype(np.int32): ir.IntType(32),
np.dtype(np.int64): ir.IntType(64),
np.dtype(np.uint8): ir.IntType(8),
np.dtype(np.uint16): ir.IntType(16),
np.dtype(np.uint32): ir.IntType(32),
np.dtype(np.uint64): ir.IntType(64),
np.dtype(np.float32): ir.FloatType(),
np.dtype(np.float64): ir.DoubleType(),
}
def peel_off_type(dtype, type_to_peel_off):
while type(dtype) is type_to_peel_off:
dtype = dtype.base_type
return dtype
def collate_types(types,
forbid_collation_to_complex=False,
forbid_collation_to_float=False,
default_float_type='float64',
default_int_type='int64'):
"""
Takes a sequence of types and returns their "common type" e.g. (float, double, float) -> double
Uses the collation rules from numpy.
"""
if forbid_collation_to_complex:
types = [t for t in types if not np.issubdtype(t.numpy_dtype, np.complexfloating)]
if not types:
return create_type(default_float_type)
if forbid_collation_to_float:
types = [t for t in types if not np.issubdtype(t.numpy_dtype, np.floating)]
if not types:
return create_type(default_int_type)
# Pointer arithmetic case i.e. pointer + integer is allowed
if any(type(t) is PointerType for t in types):
pointer_type = None
for t in types:
if type(t) is PointerType:
if pointer_type is not None:
raise ValueError("Cannot collate the combination of two pointer types")
pointer_type = t
elif type(t) is BasicType:
if not (t.is_int() or t.is_uint()):
raise ValueError("Invalid pointer arithmetic")
else:
raise ValueError("Invalid pointer arithmetic")
return pointer_type
# peel off vector types; if at least one vector type occurred, the result will also be a vector type
vector_type = [t for t in types if type(t) is VectorType]
if not all_equal(t.width for t in vector_type):
raise ValueError("Collation failed because of vector types with different width")
types = [peel_off_type(t, VectorType) for t in types]
# now we should have a list of basic types - struct types are not yet supported
assert all(type(t) is BasicType for t in types)
if any(t.is_float() for t in types):
types = tuple(t for t in types if t.is_float())
# use numpy collation -> create type from numpy type -> and, put vector type around if necessary
result_numpy_type = np.result_type(*(t.numpy_dtype for t in types))
result = BasicType(result_numpy_type)
if vector_type:
result = VectorType(result, vector_type[0].width)
return result
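# Example (doctest-style sketch, following numpy's promotion rules):
#
#   >>> collate_types([create_type("float32"), create_type("float64")])
#   double
#   >>> collate_types([create_type("int32"), create_type("float32")])
#   float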
@memorycache_if_hashable(maxsize=2048)
def get_type_of_expression(expr,
default_float_type='double',
default_int_type='int',
symbol_type_dict=None):
from pystencils.astnodes import ResolvedFieldAccess
from pystencils.cpu.vectorization import vec_all, vec_any
if default_float_type == 'float':
default_float_type = 'float32'
if not symbol_type_dict:
symbol_type_dict = defaultdict(lambda: create_type('double'))
get_type = partial(get_type_of_expression,
default_float_type=default_float_type,
default_int_type=default_int_type,
symbol_type_dict=symbol_type_dict)
expr = sp.sympify(expr)
if isinstance(expr, sp.Integer):
return create_type(default_int_type)
elif expr.is_real is False:
return create_type((np.zeros((1,), default_float_type) * 1j).dtype)
elif isinstance(expr, sp.Rational) or isinstance(expr, sp.Float):
return create_type(default_float_type)
elif isinstance(expr, ResolvedFieldAccess):
return expr.field.dtype
elif isinstance(expr, pystencils.field.Field.AbstractAccess):
return expr.field.dtype
elif isinstance(expr, TypedSymbol):
return expr.dtype
elif isinstance(expr, sp.Symbol):
if symbol_type_dict:
return symbol_type_dict[expr.name]
else:
raise ValueError("All symbols inside this expression have to be typed! ", str(expr))
elif isinstance(expr, cast_func):
return expr.args[1]
elif isinstance(expr, (vec_any, vec_all)):
return create_type("bool")
elif hasattr(expr, 'func') and expr.func == sp.Piecewise:
collated_result_type = collate_types(tuple(get_type(a[0]) for a in expr.args))
collated_condition_type = collate_types(tuple(get_type(a[1]) for a in expr.args))
if type(collated_condition_type) is VectorType and type(collated_result_type) is not VectorType:
collated_result_type = VectorType(collated_result_type, width=collated_condition_type.width)
return collated_result_type
elif isinstance(expr, sp.Indexed):
typed_symbol = expr.base.label
return typed_symbol.dtype.base_type
elif isinstance(expr, (Boolean, BooleanFunction)):
# if any arg is of vector type return a vector boolean, else return a normal scalar boolean
result = create_type("bool")
vec_args = [get_type(a) for a in expr.args if isinstance(get_type(a), VectorType)]
if vec_args:
result = VectorType(result, width=vec_args[0].width)
return result
elif isinstance(expr, sp.Pow):
base_type = get_type(expr.args[0])
if expr.exp.is_integer:
return base_type
else:
return collate_types([create_type(default_float_type), base_type])
elif isinstance(expr, (sp.Sum, sp.Product)):
return get_type(expr.args[0])
elif isinstance(expr, sp.Expr):
expr: sp.Expr
if expr.args:
types = tuple(get_type(a) for a in expr.args)
# collate_types checks numpy_dtype in the special cases
if any(not hasattr(t, 'numpy_dtype') for t in types):
forbid_collation_to_complex = False
forbid_collation_to_float = False
else:
forbid_collation_to_complex = expr.is_real is True
forbid_collation_to_float = expr.is_integer is True
return collate_types(
types,
forbid_collation_to_complex=forbid_collation_to_complex,
forbid_collation_to_float=forbid_collation_to_float,
default_float_type=default_float_type,
default_int_type=default_int_type)
else:
if expr.is_integer:
return create_type(default_int_type)
else:
return create_type(default_float_type)
raise NotImplementedError("Could not determine type for", expr, type(expr))
sympy_version = sp.__version__.split('.')
if int(sympy_version[0]) * 100 + int(sympy_version[1]) >= 109:
# __setstate__ would bypass the constructor, so we remove it
sp.Number.__getstate__ = sp.Basic.__getstate__
del sp.Basic.__getstate__
class FunctorWithStoredKwargs:
def __init__(self, func, **kwargs):
self.func = func
self.kwargs = kwargs
def __call__(self, *args):
return self.func(*args, **self.kwargs)
# __reduce_ex__ would strip kwargs, so we override it
def basic_reduce_ex(self, protocol):
if hasattr(self, '__getnewargs_ex__'):
args, kwargs = self.__getnewargs_ex__()
else:
args, kwargs = self.__getnewargs__(), {}
if hasattr(self, '__getstate__'):
state = self.__getstate__()
else:
state = None
return FunctorWithStoredKwargs(type(self), **kwargs), args, state
sp.Number.__reduce_ex__ = sp.Basic.__reduce_ex__
sp.Basic.__reduce_ex__ = basic_reduce_ex
class Type(sp.Atom):
def __new__(cls, *args, **kwargs):
return sp.Basic.__new__(cls)
def _sympystr(self, *args, **kwargs):
return str(self)
class BasicType(Type):
@staticmethod
def numpy_name_to_c(name):
if name == 'float64':
return 'double'
elif name == 'float32':
return 'float'
elif name == 'complex64':
return 'ComplexFloat'
elif name == 'complex128':
return 'ComplexDouble'
elif name.startswith('int'):
width = int(name[len("int"):])
return f"int{width}_t"
elif name.startswith('uint'):
width = int(name[len("uint"):])
return f"uint{width}_t"
elif name == 'bool':
return 'bool'
else:
raise NotImplementedError(f"Can map numpy to C name for {name}")
def __init__(self, dtype, const=False):
self.const = const
if isinstance(dtype, Type):
self._dtype = dtype.numpy_dtype
else:
self._dtype = np.dtype(dtype)
assert self._dtype.fields is None, "Tried to initialize BasicType with a structured type"
assert self._dtype.hasobject is False
assert self._dtype.subdtype is None
def __getnewargs__(self):
return self.numpy_dtype, self.const
def __getnewargs_ex__(self):
return (self.numpy_dtype, self.const), {}
@property
def base_type(self):
return None
@property
def numpy_dtype(self):
return self._dtype
@property
def sympy_dtype(self):
return getattr(sympy.codegen.ast, str(self.numpy_dtype))
@property
def item_size(self):
return 1
def is_int(self):
return self.numpy_dtype in np.sctypes['int'] or self.numpy_dtype in np.sctypes['uint']
def is_float(self):
return self.numpy_dtype in np.sctypes['float']
def is_uint(self):
return self.numpy_dtype in np.sctypes['uint']
def is_complex(self):
return self.numpy_dtype in np.sctypes['complex']
def is_other(self):
return self.numpy_dtype in np.sctypes['others']
@property
def base_name(self):
return BasicType.numpy_name_to_c(str(self._dtype))
def __str__(self):
result = BasicType.numpy_name_to_c(str(self._dtype))
if self.const:
result += " const"
return result
def __repr__(self):
return str(self)
def __eq__(self, other):
if not isinstance(other, BasicType):
return False
else:
return (self.numpy_dtype, self.const) == (other.numpy_dtype, other.const)
def __hash__(self):
return hash(str(self))
class VectorType(Type):
instruction_set = None
def __init__(self, base_type, width=4):
self._base_type = base_type
self.width = width
@property
def base_type(self):
return self._base_type
@property
def item_size(self):
return self.width * self.base_type.item_size
def __eq__(self, other):
if not isinstance(other, VectorType):
return False
else:
return (self.base_type, self.width) == (other.base_type, other.width)
def __str__(self):
if self.instruction_set is None:
return f"{self.base_type}[{self.width}]"
else:
if self.base_type == create_type("int64") or self.base_type == create_type("int32"):
return self.instruction_set['int']
elif self.base_type == create_type("float64"):
return self.instruction_set['double']
elif self.base_type == create_type("float32"):
return self.instruction_set['float']
elif self.base_type == create_type("bool"):
return self.instruction_set['bool']
else:
raise NotImplementedError()
def __hash__(self):
return hash((self.base_type, self.width))
def __getnewargs__(self):
return self._base_type, self.width
def __getnewargs_ex__(self):
return (self._base_type, self.width), {}
class PointerType(Type):
def __init__(self, base_type, const=False, restrict=True):
self._base_type = base_type
self.const = const
self.restrict = restrict
def __getnewargs__(self):
return self.base_type, self.const, self.restrict
def __getnewargs_ex__(self):
return (self.base_type, self.const, self.restrict), {}
@property
def alias(self):
return not self.restrict
@property
def base_type(self):
return self._base_type
@property
def item_size(self):
return self.base_type.item_size
def __eq__(self, other):
if not isinstance(other, PointerType):
return False
else:
return (self.base_type, self.const, self.restrict) == (other.base_type, other.const, other.restrict)
def __str__(self):
components = [str(self.base_type), '*']
if self.restrict:
components.append('RESTRICT')
if self.const:
components.append("const")
return " ".join(components)
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self._base_type, self.const, self.restrict))
class StructType:
def __init__(self, numpy_type, const=False):
self.const = const
self._dtype = np.dtype(numpy_type)
def __getnewargs__(self):
return self.numpy_dtype, self.const
def __getnewargs_ex__(self):
return (self.numpy_dtype, self.const), {}
@property
def base_type(self):
return None
@property
def numpy_dtype(self):
return self._dtype
@property
def item_size(self):
return self.numpy_dtype.itemsize
def get_element_offset(self, element_name):
return self.numpy_dtype.fields[element_name][1]
def get_element_type(self, element_name):
np_element_type = self.numpy_dtype.fields[element_name][0]
return BasicType(np_element_type, self.const)
def has_element(self, element_name):
return element_name in self.numpy_dtype.fields
def __eq__(self, other):
if not isinstance(other, StructType):
return False
else:
return (self.numpy_dtype, self.const) == (other.numpy_dtype, other.const)
def __str__(self):
# structs are handled byte-wise
result = "uint8_t"
if self.const:
result += " const"
return result
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self.numpy_dtype, self.const))
class TypedImaginaryUnit(TypedSymbol):
def __new__(cls, *args, **kwds):
obj = TypedImaginaryUnit.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, dtype):
obj = super(TypedImaginaryUnit, cls).__xnew__(cls,
"_i",
dtype,
imaginary=True)
return obj
headers = ['"cuda_complex.hpp"']
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.dtype,)
def __getnewargs_ex__(self):
return (self.dtype,), {}
try:
import pycuda.gpuarray as gpuarray
except ImportError:
gpuarray = None
import numpy as np
import pystencils
class PyCudaArrayHandler:
def __init__(self):
import pycuda.autoinit # NOQA
def zeros(self, shape, dtype=np.float64, order='C'):
cpu_array = np.zeros(shape=shape, dtype=dtype, order=order)
return self.to_gpu(cpu_array)
def ones(self, shape, dtype=np.float64, order='C'):
cpu_array = np.ones(shape=shape, dtype=dtype, order=order)
return self.to_gpu(cpu_array)
def empty(self, shape, dtype=np.float64, layout=None):
if layout:
cpu_array = pystencils.field.create_numpy_array_with_layout(shape=shape, dtype=dtype, layout=layout)
return self.to_gpu(cpu_array)
else:
return gpuarray.empty(shape, dtype)
@staticmethod
def to_gpu(array):
return gpuarray.to_gpu(array)
@staticmethod
def upload(array, numpy_array):
array.set(numpy_array)
@staticmethod
def download(array, numpy_array):
array.get(numpy_array)
def randn(self, shape, dtype=np.float64):
cpu_array = np.random.randn(*shape).astype(dtype)
return self.to_gpu(cpu_array)
from_numpy = to_gpu
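# Usage sketch (requires a working PyCuda installation; shape and dtype are
# arbitrary):
#
#   >>> handler = PyCudaArrayHandler()
#   >>> gpu_arr = handler.zeros((16, 16), dtype=np.float32)
#   >>> host = np.empty((16, 16), dtype=np.float32)
#   >>> handler.download(gpu_arr, host)  # host now contains the zeros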
class PyCudaNotAvailableHandler:
def __getattribute__(self, name):
raise NotImplementedError("Unable to initiaize PyCuda! "
"Try to run `import pycuda.autoinit` to check whether PyCuda is working correctly!")
try:
import pyopencl.array as gpuarray
except ImportError:
gpuarray = None
import numpy as np
import pystencils
class PyOpenClArrayHandler:
def __init__(self, queue):
if not queue:
from pystencils.opencl.opencljit import get_global_cl_queue
queue = get_global_cl_queue()
assert queue, "OpenCL queue missing!\n" \
"Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
self.queue = queue
def zeros(self, shape, dtype=np.float64, order='C'):
cpu_array = np.zeros(shape=shape, dtype=dtype, order=order)
return self.to_gpu(cpu_array)
def ones(self, shape, dtype=np.float64, order='C'):
cpu_array = np.ones(shape=shape, dtype=dtype, order=order)
return self.to_gpu(cpu_array)
def empty(self, shape, dtype=np.float64, layout=None):
if layout:
cpu_array = pystencils.field.create_numpy_array_with_layout(shape=shape, dtype=dtype, layout=layout)
return self.to_gpu(cpu_array)
else:
return gpuarray.empty(self.queue, shape, dtype)
def to_gpu(self, array):
return gpuarray.to_device(self.queue, array)
def upload(self, gpuarray, numpy_array):
gpuarray.set(numpy_array, self.queue)
def download(self, gpuarray, numpy_array):
gpuarray.get(self.queue, numpy_array)
def randn(self, shape, dtype=np.float64):
cpu_array = np.random.randn(*shape).astype(dtype)
return self.from_numpy(cpu_array)
from_numpy = to_gpu
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
from typing import Union
import numpy as np
try:
import pycuda.driver as cuda
from pycuda import gpuarray
import pycuda
except Exception:
pass
def ndarray_to_tex(tex_ref, # type: Union[cuda.TextureReference, cuda.SurfaceReference]
ndarray,
address_mode=None,
filter_mode=None,
use_normalized_coordinates=False,
read_as_integer=False):
if isinstance(address_mode, str):
address_mode = getattr(pycuda.driver.address_mode, address_mode.upper())
if address_mode is None:
address_mode = cuda.address_mode.BORDER
if filter_mode is None:
filter_mode = cuda.filter_mode.LINEAR
if isinstance(ndarray, np.ndarray):
cu_array = cuda.np_to_array(ndarray, 'C')
elif isinstance(ndarray, gpuarray.GPUArray):
cu_array = cuda.gpuarray_to_array(ndarray, 'C')
else:
raise TypeError(
'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')
tex_ref.set_array(cu_array)
tex_ref.set_address_mode(0, address_mode)
if ndarray.ndim >= 2:
tex_ref.set_address_mode(1, address_mode)
if ndarray.ndim >= 3:
tex_ref.set_address_mode(2, address_mode)
tex_ref.set_filter_mode(filter_mode)
if not use_normalized_coordinates:
tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES)
if not read_as_integer:
tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER)
#pragma once
extern "C++" {
#ifdef __CUDA_ARCH__
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
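// Usage sketch (hypothetical device kernel; the signature mirrors how a
// two-dimensional field could be passed by value):
//
//   template <typename T>
//   __global__ void scale(PyStencilsField<T, 2> f, T factor) {
//     // linear index of (i, j) is i * f.stride[0] + j * f.stride[1];
//     // bounds come from f.shape[0] / f.shape[1], data access via f.data
//   }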
// An implementation of C++ std::complex for use on CUDA devices.
// Written by John C. Travers <jtravs@gmail.com> (2012)
//
// Missing:
// - long double support (not supported on CUDA)
// - some integral pow functions (due to lack of C++11 support on CUDA)
//
// Heavily derived from the LLVM libcpp project (svn revision 147853).
// Based on libcxx/include/complex.
// The git history contains the complete change history from the original.
// The modifications are licensed as per the original LLVM license below.
//
// -*- C++ -*-
//===--------------------------- complex ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
extern "C++" {
#ifndef CUDA_COMPLEX_HPP
#define CUDA_COMPLEX_HPP
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
/*
complex synopsis
template<class T>
class complex
{
public:
typedef T value_type;
complex(const T& re = T(), const T& im = T());
complex(const complex&);
template<class X> complex(const complex<X>&);
T real() const;
T imag() const;
void real(T);
void imag(T);
complex<T>& operator= (const T&);
complex<T>& operator+=(const T&);
complex<T>& operator-=(const T&);
complex<T>& operator*=(const T&);
complex<T>& operator/=(const T&);
complex& operator=(const complex&);
template<class X> complex<T>& operator= (const complex<X>&);
template<class X> complex<T>& operator+=(const complex<X>&);
template<class X> complex<T>& operator-=(const complex<X>&);
template<class X> complex<T>& operator*=(const complex<X>&);
template<class X> complex<T>& operator/=(const complex<X>&);
};
template<>
class complex<float>
{
public:
typedef float value_type;
constexpr complex(float re = 0.0f, float im = 0.0f);
explicit constexpr complex(const complex<double>&);
constexpr float real() const;
void real(float);
constexpr float imag() const;
void imag(float);
complex<float>& operator= (float);
complex<float>& operator+=(float);
complex<float>& operator-=(float);
complex<float>& operator*=(float);
complex<float>& operator/=(float);
complex<float>& operator=(const complex<float>&);
template<class X> complex<float>& operator= (const complex<X>&);
template<class X> complex<float>& operator+=(const complex<X>&);
template<class X> complex<float>& operator-=(const complex<X>&);
template<class X> complex<float>& operator*=(const complex<X>&);
template<class X> complex<float>& operator/=(const complex<X>&);
};
template<>
class complex<double>
{
public:
typedef double value_type;
constexpr complex(double re = 0.0, double im = 0.0);
constexpr complex(const complex<float>&);
constexpr double real() const;
void real(double);
constexpr double imag() const;
void imag(double);
complex<double>& operator= (double);
complex<double>& operator+=(double);
complex<double>& operator-=(double);
complex<double>& operator*=(double);
complex<double>& operator/=(double);
complex<double>& operator=(const complex<double>&);
template<class X> complex<double>& operator= (const complex<X>&);
template<class X> complex<double>& operator+=(const complex<X>&);
template<class X> complex<double>& operator-=(const complex<X>&);
template<class X> complex<double>& operator*=(const complex<X>&);
template<class X> complex<double>& operator/=(const complex<X>&);
};
// 26.3.6 operators:
template<class T> complex<T> operator+(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&, const T&);
template<class T> complex<T> operator+(const T&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const T&);
template<class T> complex<T> operator-(const T&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const T&);
template<class T> complex<T> operator*(const T&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const T&);
template<class T> complex<T> operator/(const T&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&);
template<class T> bool operator==(const complex<T>&, const complex<T>&);
template<class T> bool operator==(const complex<T>&, const T&);
template<class T> bool operator==(const T&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const T&);
template<class T> bool operator!=(const T&, const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>&, complex<T>&);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>&, const complex<T>&);
// 26.3.7 values:
template<class T> T real(const complex<T>&);
double real(double);
template<Integral T> double real(T);
float real(float);
template<class T> T imag(const complex<T>&);
double imag(double);
template<Integral T> double imag(T);
float imag(float);
template<class T> T abs(const complex<T>&);
template<class T> T arg(const complex<T>&);
double arg(double);
template<Integral T> double arg(T);
float arg(float);
template<class T> T norm(const complex<T>&);
double norm(double);
template<Integral T> double norm(T);
float norm(float);
template<class T> complex<T> conj(const complex<T>&);
complex<double> conj(double);
template<Integral T> complex<double> conj(T);
complex<float> conj(float);
template<class T> complex<T> proj(const complex<T>&);
complex<double> proj(double);
template<Integral T> complex<double> proj(T);
complex<float> proj(float);
template<class T> complex<T> polar(const T&, const T& = 0);
// 26.3.8 transcendentals:
template<class T> complex<T> acos(const complex<T>&);
template<class T> complex<T> asin(const complex<T>&);
template<class T> complex<T> atan(const complex<T>&);
template<class T> complex<T> acosh(const complex<T>&);
template<class T> complex<T> asinh(const complex<T>&);
template<class T> complex<T> atanh(const complex<T>&);
template<class T> complex<T> cos (const complex<T>&);
template<class T> complex<T> cosh (const complex<T>&);
template<class T> complex<T> exp (const complex<T>&);
template<class T> complex<T> log (const complex<T>&);
template<class T> complex<T> log10(const complex<T>&);
template<class T> complex<T> pow(const complex<T>&, const T&);
template<class T> complex<T> pow(const complex<T>&, const complex<T>&);
template<class T> complex<T> pow(const T&, const complex<T>&);
template<class T> complex<T> sin (const complex<T>&);
template<class T> complex<T> sinh (const complex<T>&);
template<class T> complex<T> sqrt (const complex<T>&);
template<class T> complex<T> tan (const complex<T>&);
template<class T> complex<T> tanh (const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>& is, complex<T>& x);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>& o, const complex<T>& x);
*/
#include <math.h>
#include <sstream>
template <class _Tp> class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp> &__z, const complex<_Tp> &__w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp> &__x, const complex<_Tp> &__y);
template <class _Tp> class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
CUDA_CALLABLE_MEMBER
complex(const value_type &__re = value_type(),
const value_type &__im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex(const complex<_Xp> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
CUDA_CALLABLE_MEMBER value_type real() const { return __re_; }
CUDA_CALLABLE_MEMBER value_type imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(const value_type &__re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(const value_type &__re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(const value_type &__re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(const value_type &__re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(const value_type &__re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double>;
template <> class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f,
float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit /*constexpr*/ complex(const complex<double> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER float real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(float __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(float __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0,
double __im = 0.0)
: __re_(__re), __im_(__im) {}
/*constexpr*/ complex(const complex<float> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER double real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(double __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(double __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
// constexpr
inline CUDA_CALLABLE_MEMBER complex<float>::complex(const complex<double> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// constexpr
inline CUDA_CALLABLE_MEMBER complex<double>::complex(const complex<float> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
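// Multiplication follows C99 Annex G: if the naive product comes out as
// (NaN, NaN), infinite or NaN operands are patched up (inf -> signed 1,
// NaN -> signed 0) and the result is recomputed as a properly signed infinity.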
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
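// Division prescales the divisor by a power of two (logb/scalbn) so that
// c*c + d*d can neither overflow nor underflow, then applies Annex G-style
// fix-ups for infinite or NaN operands.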
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <>
CUDA_CALLABLE_MEMBER complex<float> operator/(const complex<float> &__z,
const complex<float> &__w) {
int __ilogbw = 0;
float __a = __z.real();
float __b = __z.imag();
float __c = __w.real();
float __d = __w.imag();
float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbnf(__c, -__ilogbw);
__d = scalbnf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw);
float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) {
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = copysignf(INFINITY, __c) * __a;
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = copysignf(INFINITY, __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysignf(isinf(__a) ? float(1) : float(0), __a);
__b = copysignf(isinf(__b) ? float(1) : float(0), __b);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = INFINITY * (__a * __c + __b * __d);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = INFINITY * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysignf(isinf(__c) ? float(1) : float(0), __c);
__d = copysignf(isinf(__d) ? float(1) : float(0), __d);
__x = float(0) * (__a * __c + __b * __d);
__y = float(0) * (__b * __c - __a * __d);
}
}
return complex<float>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__x,
const _Tp &__y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x) {
return __x;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const _Tp &__y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const _Tp &__x,
const complex<_Tp> &__y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const _Tp &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const _Tp &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
// 26.3.7 values:
// real
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp real(const complex<_Tp> &__c) {
return __c.real();
}
inline CUDA_CALLABLE_MEMBER double real(double __re) { return __re; }
inline CUDA_CALLABLE_MEMBER float real(float __re) { return __re; }
// imag
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp imag(const complex<_Tp> &__c) {
return __c.imag();
}
inline CUDA_CALLABLE_MEMBER double imag(double __re) { return 0; }
inline CUDA_CALLABLE_MEMBER float imag(float __re) { return 0; }
// abs
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp abs(const complex<_Tp> &__c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp arg(const complex<_Tp> &__c) {
return atan2(__c.imag(), __c.real());
}
inline CUDA_CALLABLE_MEMBER double arg(double __re) { return atan2(0., __re); }
inline CUDA_CALLABLE_MEMBER float arg(float __re) { return atan2f(0.F, __re); }
// norm
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp norm(const complex<_Tp> &__c) {
if (isinf(__c.real()))
return fabs(__c.real());
if (isinf(__c.imag()))
return fabs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
inline CUDA_CALLABLE_MEMBER double norm(double __re) { return __re * __re; }
inline CUDA_CALLABLE_MEMBER float norm(float __re) { return __re * __re; }
// conj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> conj(const complex<_Tp> &__c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
inline CUDA_CALLABLE_MEMBER complex<double> conj(double __re) {
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> conj(float __re) {
return complex<float>(__re);
}
// proj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> proj(const complex<_Tp> &__c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
inline CUDA_CALLABLE_MEMBER complex<double> proj(double __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> proj(float __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<float>(__re);
}
// polar
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> polar(const _Tp &__rho,
const _Tp &__theta = _Tp(0)) {
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log(const complex<_Tp> &__x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log10(const complex<_Tp> &__x) {
return log(__x) / log(_Tp(10));
}
// sqrt
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sqrt(const complex<_Tp> &__x) {
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
if (__x.real() > _Tp(0))
return complex<_Tp>(__x.real(), isnan(__x.imag())
? __x.imag()
: copysign(_Tp(0), __x.imag()));
return complex<_Tp>(isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> exp(const complex<_Tp> &__x) {
_Tp __i = __x.imag();
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
} else if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return exp(__y * log(__x));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const _Tp &__y) {
return pow(__x, complex<_Tp>(__y));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const _Tp &__x,
const complex<_Tp> &__y) {
return pow(complex<_Tp>(__x), __y);
}
// asinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asinh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(copysign(__x.imag(), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1)));
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// acosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acosh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
if (isinf(__x.imag()))
if (__x.real() > 0)
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
else
return complex<_Tp>(-__x.real(),
copysign(__pi * _Tp(0.75), __x.imag()));
if (__x.real() < 0)
return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), __x.real());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
return complex<_Tp>(copysign(__z.real(), _Tp(0)),
copysign(__z.imag(), __x.imag()));
}
// atanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atanh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.imag())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (isnan(__x.imag())) {
if (isinf(__x.real()) || __x.real() == 0)
return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
return complex<_Tp>(__x.imag(), __x.imag());
}
if (isnan(__x.real())) {
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.real())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
copysign(_Tp(0), __x.imag()));
}
complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// sinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sinh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.imag() == 0 && !isfinite(__x.real()))
return __x;
return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
cosh(__x.real()) * sin(__x.imag()));
}
// cosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> cosh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(fabs(__x.real()), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(_Tp(NAN), __x.real());
if (__x.real() == 0 && __x.imag() == 0)
return complex<_Tp>(_Tp(1), __x.imag());
if (__x.imag() == 0 && !isfinite(__x.real()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
sinh(__x.real()) * sin(__x.imag()));
}
// tanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tanh(const complex<_Tp> &__x) {
if (isinf(__x.real())) {
if (!isfinite(__x.imag()))
return complex<_Tp>(_Tp(1), _Tp(0));
return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
}
if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __2r(_Tp(2) * __x.real());
_Tp __2i(_Tp(2) * __x.imag());
_Tp __d(cosh(__2r) + cos(__2i));
return complex<_Tp>(sinh(__2r) / __d, sin(__2i) / __d);
}
// asin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asin(const complex<_Tp> &__x) {
complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// acos
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acos(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (isinf(__x.imag())) {
if (__x.real() < _Tp(0))
return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
}
if (__x.real() < _Tp(0))
return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), -__x.imag());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
if (__x.real() == 0)
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
if (signbit(__x.imag()))
return complex<_Tp>(fabs(__z.imag()), fabs(__z.real()));
return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real()));
}
// atan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atan(const complex<_Tp> &__x) {
complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// sin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sin(const complex<_Tp> &__x) {
complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// cos
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> cos(const complex<_Tp> &__x) {
return cosh(complex<_Tp>(-__x.imag(), __x.real()));
}
// tan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tan(const complex<_Tp> &__x) {
complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
template <class _Tp, class _CharT, class _Traits>
std::basic_istream<_CharT, _Traits> &
operator>>(std::basic_istream<_CharT, _Traits> &__is, complex<_Tp> &__x) {
if (__is.good()) {
ws(__is);
if (__is.peek() == _CharT('(')) {
__is.get();
_Tp __r;
__is >> __r;
if (!__is.fail()) {
ws(__is);
_CharT __c = __is.peek();
if (__c == _CharT(',')) {
__is.get();
_Tp __i;
__is >> __i;
if (!__is.fail()) {
ws(__is);
__c = __is.peek();
if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, __i);
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, _Tp(0));
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else {
_Tp __r;
__is >> __r;
if (!__is.fail())
__x = complex<_Tp>(__r, _Tp(0));
else
__is.setstate(std::ios_base::failbit);
}
} else
__is.setstate(std::ios_base::failbit);
return __is;
}
template <class _Tp, class _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits> &
operator<<(std::basic_ostream<_CharT, _Traits> &__os, const complex<_Tp> &__x) {
std::basic_ostringstream<_CharT, _Traits> __s;
__s.flags(__os.flags());
__s.imbue(__os.getloc());
__s.precision(__os.precision());
__s << '(' << __x.real() << ',' << __x.imag() << ')';
return __os << __s.str();
}
//} // close namespace cuda_complex
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) - scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(scalar - real(complexNumber), imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const complex<U> &complexNumber,
const V scalar) -> complex<U> {
return complex<U>(real(complexNumber) / scalar, imag(complexNumber) / scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const V scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(scalar, 0) / complexNumber;
}
using ComplexDouble = complex<double>;
using ComplexFloat = complex<float>;
#endif // CUDA_COMPLEX_HPP
}
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import hashlib
import itertools
from enum import Enum
from typing import Set
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.astnodes import Node
from pystencils.data_types import TypedSymbol, cast_func, create_type
try:
import pycuda.driver
except Exception:
pass
_hash = hashlib.md5
class InterpolationMode(str, Enum):
NEAREST_NEIGHBOR = "nearest_neighbour"
NN = NEAREST_NEIGHBOR
LINEAR = "linear"
CUBIC_SPLINE = "cubic_spline"
class _InterpolationSymbol(TypedSymbol):
def __new__(cls, name, field, interpolator):
obj = cls.__xnew_cached_(cls, name, field, interpolator)
return obj
def __new_stage2__(cls, name, field, interpolator):
obj = super().__xnew__(cls, name, 'dummy_symbol_carrying_field' + field.name)
obj.field = field
obj.interpolator = interpolator
return obj
def __getnewargs__(self):
return self.name, self.field, self.interpolator
def __getnewargs_ex__(self):
return (self.name, self.field, self.interpolator), {}
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
class Interpolator(object):
"""
Implements non-integer accesses on fields using linear interpolation.
On GPU, this interpolator can be implemented by a :class:`.TextureCachedField` for hardware acceleration.
Address modes are different boundary handlings. The possible choices are the same as for CUDA textures:
**CLAMP**
The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M.
**BORDER**
The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0 and for k >= M.
To describe the last two address modes, we have to consider normalized coordinates,
so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1.
**WRAP**
The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M.
In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p.
**MIRROR**
The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2.
In other words, c[l / M] = c[k / M] for any l and k such that (l + k) mod (2 * M - 2) = 0.
Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures
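Example (a hedged sketch; ``f`` is assumed to be a scalar 2D pystencils field and ``x``, ``y`` arbitrary sympy expressions):
.. code-block:: python

    interpolator = LinearInterpolator(f, address_mode='CLAMP')
    access = interpolator.at((x, y))   # non-integer coordinates are allowed
    expr = access.implementation_with_stencils()   # software fallback as sympy expression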
"""
required_global_declarations = []
def __init__(self,
parent_field,
interpolation_mode: InterpolationMode,
address_mode='BORDER',
use_normalized_coordinates=False,
allow_textures=True):
super().__init__()
self.field = parent_field
self.field.field_type = pystencils.field.FieldType.CUSTOM
self.address_mode = address_mode
self.use_normalized_coordinates = use_normalized_coordinates
self.interpolation_mode = interpolation_mode
self.hash_str = hashlib.md5(
f'{self.field}_{address_mode}_{self.field.dtype}_{interpolation_mode}'.encode()).hexdigest()
self.symbol = _InterpolationSymbol(str(self), parent_field, self)
self.allow_textures = allow_textures
@property
def ndim(self):
return self.field.ndim
@property
def _hashable_contents(self):
return (str(self.address_mode),
str(type(self)),
self.hash_str,
self.use_normalized_coordinates)
def at(self, offset):
return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])
def __getitem__(self, offset):
return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])
def __str__(self):
return f'{self.field.name}_interpolator_{self.reproducible_hash}'
def __repr__(self):
return self.__str__()
def __hash__(self):
return hash(self._hashable_contents)
def __eq__(self, other):
return hash(self) == hash(other)
@property
def reproducible_hash(self):
return _hash(str(self._hashable_contents).encode()).hexdigest()
class LinearInterpolator(Interpolator):
def __init__(self,
parent_field: pystencils.Field,
address_mode='BORDER',
use_normalized_coordinates=False):
super().__init__(parent_field,
InterpolationMode.LINEAR,
address_mode,
use_normalized_coordinates)
class NearestNeightborInterpolator(Interpolator):
def __init__(self,
parent_field: pystencils.Field,
address_mode='BORDER',
use_normalized_coordinates=False):
super().__init__(parent_field,
InterpolationMode.NN,
address_mode,
use_normalized_coordinates)
class InterpolatorAccess(TypedSymbol):
def __new__(cls, field, *offsets):
obj = InterpolatorAccess.__xnew_cached_(cls, field, *offsets)
return obj
def __new_stage2__(cls, symbol, *offsets):
assert offsets is not None
obj = super().__xnew__(cls, '%s_interpolator_%s' %
(symbol.field.name, _hash(str(tuple(offsets)).encode()).hexdigest()),
symbol.field.dtype)
obj.offsets = offsets
obj.symbol = symbol
obj.field = symbol.field
obj.interpolator = symbol.interpolator
return obj
def _hashable_content(self):
return super()._hashable_content() + (self.symbol, self.field, tuple(self.offsets), self.symbol.interpolator)
def __str__(self):
return f"{self.field.name}_interpolator({', '.join(str(o) for o in self.offsets)})"
def __repr__(self):
return self.__str__()
def _latex(self, printer, *_):
n = self.field.latex_name if self.field.latex_name else self.field.name
foo = ", ".join(str(printer.doprint(o)) for o in self.offsets)
return f'{n}_{{interpolator}}\\left({foo}\\right)'
@property
def ndim(self):
return len(self.offsets)
@property
def is_texture(self):
return isinstance(self.interpolator, TextureCachedField)
def atoms(self, *types):
if self.offsets:
offsets = set(o for o in self.offsets if isinstance(o, types))
if isinstance(self, *types):
offsets.update([self])
for o in self.offsets:
if hasattr(o, 'atoms'):
offsets.update(set(o.atoms(*types)))
return offsets
else:
return set()
def neighbor(self, coord_id, offset):
offset_list = list(self.offsets)
offset_list[coord_id] += offset
return self.interpolator.at(tuple(offset_list))
@property
def free_symbols(self):
symbols = set()
if self.offsets is not None:
for o in self.offsets:
if hasattr(o, 'free_symbols'):
symbols.update(set(o.free_symbols))
# if hasattr(o, 'atoms'):
# symbols.update(set(o.atoms(sp.Symbol)))
return symbols
@property
def required_global_declarations(self):
required_global_declarations = self.symbol.interpolator.required_global_declarations
if required_global_declarations:
required_global_declarations[0]._symbols_defined.add(self)
return required_global_declarations
@property
def args(self):
return [self.symbol, *self.offsets]
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return {self}
@property
def interpolation_mode(self):
return self.interpolator.interpolation_mode
@property
def _diff_interpolation_vec(self):
return sp.Matrix([DiffInterpolatorAccess(self.symbol, i, *self.offsets)
for i in range(len(self.offsets))])
def diff(self, *symbols, **kwargs):
if symbols == (self,):
return 1
rtn = self._diff_interpolation_vec.T * sp.Matrix(self.offsets).diff(*symbols, **kwargs)
if rtn.shape == (1, 1):
rtn = rtn[0, 0]
return rtn
def implementation_with_stencils(self):
field = self.field
default_int_type = create_type('int64')
use_textures = isinstance(self.interpolator, TextureCachedField)
if use_textures:
def absolute_access(x, _):
return self.symbol.interpolator.at((o for o in x))
else:
absolute_access = field.absolute_access
sum = [0, ] * (field.shape[0] if field.index_dimensions else 1)
offsets = self.offsets
rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1)
for channel_idx in range(field.shape[0] if field.index_dimensions else 1):
if self.interpolation_mode == InterpolationMode.NN:
if use_textures:
sum[channel_idx] = self
else:
sum[channel_idx] = absolute_access([sp.floor(i + 0.5) for i in offsets], channel_idx)
elif self.interpolation_mode == InterpolationMode.LINEAR:
# TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/
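# Each corner of the enclosing cell (one floor/ceil combination per axis)
# contributes with the multi-linear weight prod(1 - |corner_i - offset_i|).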
for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions):
weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
index = [f(offset) for (f, offset) in zip(c, offsets)]
# Hardware boundary handling on GPU
if use_textures:
sum[channel_idx] += \
weight * absolute_access(index, channel_idx if field.index_dimensions else ())
# else boundary handling using software
elif str(self.interpolator.address_mode).lower() == 'border':
is_inside_field = sp.And(
*itertools.chain([i >= 0 for i in index],
[idx < field.shape[dim] for (dim, idx) in enumerate(index)]))
index = [cast_func(i, default_int_type) for i in index]
sum[channel_idx] += sp.Piecewise(
(weight * absolute_access(index, channel_idx if field.index_dimensions else ()),
is_inside_field),
(sp.simplify(0), True)
)
elif str(self.interpolator.address_mode).lower() == 'clamp':
index = [sp.Min(sp.Max(0, cast_func(i, default_int_type)), field.spatial_shape[dim] - 1)
for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
elif str(self.interpolator.address_mode).lower() == 'wrap':
index = [sp.Mod(cast_func(i, default_int_type), field.shape[dim] - 1)
for (dim, i) in enumerate(index)]
index = [cast_func(sp.Piecewise((i, i > 0),
(sp.Abs(cast_func(field.shape[dim] - 1 + i, default_int_type)),
True)), default_int_type)
for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
# sum[channel_idx] = 0
elif str(self.interpolator.address_mode).lower() == 'mirror':
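# Mirror boundary handling: fold the index with a triangle wave of period
# 2*M - 2 so that out-of-range indices are reflected back into [0, M-1].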
def triangle_fun(x, half_period):
saw_tooth = cast_func(sp.Abs(cast_func(x, 'int32')), 'int32') % (
cast_func(2 * half_period, create_type('int32')))
return sp.Piecewise((saw_tooth, saw_tooth < half_period),
(2 * half_period - 1 - saw_tooth, True))
index = [cast_func(triangle_fun(i, field.shape[dim]),
default_int_type) for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
else:
raise NotImplementedError()
elif self.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
raise NotImplementedError("only works with HW interpolation for float32")
sum = [sp.factor(s) for s in sum]
if field.index_dimensions:
return sp.Matrix(sum)
else:
return sum[0]
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.symbol, *self.offsets)
def __getnewargs_ex__(self):
return (self.symbol, *self.offsets), {}
class DiffInterpolatorAccess(InterpolatorAccess):
def __new__(cls, symbol, diff_coordinate_idx, *offsets):
if symbol.interpolator.interpolation_mode == InterpolationMode.LINEAR:
from pystencils.fd import Diff, Discretization2ndOrder
return Discretization2ndOrder(1)(Diff(symbol.interpolator.at(offsets), diff_coordinate_idx))
obj = DiffInterpolatorAccess.__xnew_cached_(cls, symbol, diff_coordinate_idx, *offsets)
return obj
def __new_stage2__(self, symbol: sp.Symbol, diff_coordinate_idx, *offsets):
assert offsets is not None
obj = super().__xnew__(self, symbol, *offsets)
obj.diff_coordinate_idx = diff_coordinate_idx
return obj
def __hash__(self):
return hash((self.symbol, self.field, self.diff_coordinate_idx, tuple(self.offsets), self.interpolator))
def __str__(self):
return '%s_diff%i_interpolator(%s)' % (self.field.name, self.diff_coordinate_idx,
', '.join(str(o) for o in self.offsets))
def __repr__(self):
return str(self)
@property
def args(self):
return [self.symbol, self.diff_coordinate_idx, *self.offsets]
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return {self}
@property
def interpolation_mode(self):
return self.interpolator.interpolation_mode
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.symbol, self.diff_coordinate_idx, *self.offsets)
def __getnewargs_ex__(self):
return (self.symbol, self.diff_coordinate_idx, *self.offsets), {}
##########################################################################################
# GPU-specific fast specializations (for precision, GPUs can also use the above nodes/symbols) #
##########################################################################################
class TextureCachedField(Interpolator):
def __init__(self, parent_field,
address_mode=None,
filter_mode=None,
interpolation_mode: InterpolationMode = InterpolationMode.LINEAR,
use_normalized_coordinates=False,
read_as_integer=False
):
# Resolve defaults before they are passed on to the base class
if address_mode is None:
address_mode = 'border'
if filter_mode is None:
filter_mode = pycuda.driver.filter_mode.LINEAR
super().__init__(parent_field, interpolation_mode, address_mode, use_normalized_coordinates)
self.read_as_integer = read_as_integer
self.required_global_declarations = [TextureDeclaration(self)]
@property
def ndim(self):
return self.field.ndim
@classmethod
def from_interpolator(cls, interpolator: LinearInterpolator):
if (isinstance(interpolator, cls)
or (hasattr(interpolator, 'allow_textures') and not interpolator.allow_textures)):
return interpolator
obj = cls(interpolator.field, interpolator.address_mode, interpolation_mode=interpolator.interpolation_mode)
return obj
def __str__(self):
return f'{self.field.name}_texture_{self.reproducible_hash}'
def __repr__(self):
return self.__str__()
@property
def reproducible_hash(self):
return _hash(str(self._hashable_contents).encode()).hexdigest()
class TextureDeclaration(Node):
"""
A global declaration of a texture. Visible both for device and host code.
.. code:: cpp
// This Node represents the following global declaration
texture<float, cudaTextureType2D, cudaReadModeElementType> x_texture_5acc9fced7b0dc3e;
__device__ kernel(...) {
// kernel accesses x_texture_5acc9fced7b0dc3e with tex2d(...)
}
__host__ launch_kernel(...) {
// Host needs to bind the texture
cudaBindTexture(0, x_texture_5acc9fced7b0dc3e, buffer, N*sizeof(float));
}
This has been deprecated by CUDA in favor of :class:`.TextureObject`.
But texture objects are not yet supported by PyCUDA (https://github.com/inducer/pycuda/pull/174)
"""
def __init__(self, parent_texture):
self.texture = parent_texture
self._symbols_defined = {self.texture.symbol}
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return self._symbols_defined
@property
def args(self) -> Set[sp.Symbol]:
return set()
@property
def headers(self):
headers = ['"pycuda-helpers.hpp"']
if self.texture.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
headers.append('"cubicTex%iD.cu"' % self.texture.ndim)
return headers
def __str__(self):
from pystencils.backends.cuda_backend import CudaBackend
return CudaBackend()(self)
def __repr__(self):
return str(self)
class TextureObject(TextureDeclaration):
"""
A CUDA texture object. In contrast to :class:`.TextureDeclaration` it is not declared globally but
passed as a function argument to the kernel call.
Like :class:`.TextureDeclaration` it defines :class:`.TextureAccess` symbols.
Only the printed representation differs.
"""
pass
def dtype_supports_textures(dtype):
"""
Returns whether CUDA natively supports texture fetches with this numpy dtype.
The maximum word size for a texture fetch is four bytes.
Using the following trick, larger dtypes can also be fetched:
https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
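A hedged sketch of the expected results:
.. code-block:: python

    import numpy as np
    dtype_supports_textures(np.dtype('float32'))   # True: 4-byte fetch
    dtype_supports_textures(np.dtype('float64'))   # False: 8 bytes, needs the fetch trick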
"""
if hasattr(dtype, 'numpy_dtype'):
dtype = dtype.numpy_dtype
if isinstance(dtype, type):
return dtype().itemsize <= 4
return dtype.itemsize <= 4
from .generate_benchmark import generate_benchmark, run_c_benchmark
from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
import subprocess
import warnings
import tempfile
from pathlib import Path
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.enums import Backend
from pystencils.include import get_pystencils_include_path
from pystencils.integer_functions import modulo_ceil
from pystencils.sympyextensions import prod
import numpy as np
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
Args:
ast: the pystencils AST object as returned by create_kernel
likwid: if True likwid markers are added to the code
openmp: relevant only if likwid=True, to generate correct likwid initialization code
timing: add timing output to the code, prints time per iteration to stdout
Returns:
C code as string
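Example (a minimal sketch; the fixed-size copy kernel is illustrative):
.. code-block:: python

    import pystencils as ps
    src, dst = ps.fields('src, dst: double[20, 20]')
    ast = ps.create_kernel([ps.Assignment(dst.center, src.center)])
    print(generate_benchmark(ast, timing=True))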
"""
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
np_dtype = get_base_type(p.symbol.dtype).numpy_dtype
size_data_type = np_dtype.itemsize
dim0_size = field.shape[-1]
dim1_size = np.prod(field.shape[:-1])
elements = prod(field.shape)
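# For vectorized kernels, pad the innermost dimension so that every row
# starts SIMD-aligned; the offset additionally accounts for ghost layers.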
if ast.instruction_set:
align = ast.instruction_set['width'] * size_data_type
padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size
padding_bytes = padding_elements * size_data_type
ghost_layers = max(max(ast.ghost_layers))
size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type
assert align % np_dtype.itemsize == 0
offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type
fields.append((p.field_name, dtype, elements, size, offset, align))
call_parameters.append(p.field_name)
else:
size = elements * size_data_type
fields.append((p.field_name, dtype, elements, size, 0, 0))
call_parameters.append(p.field_name)
header_list = get_headers(ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled
if likwid and openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
jinja_context = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect=Backend.C),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
'includes': includes,
'timing': timing,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
return env.get_template('benchmark.c').render(**jinja_context)
def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
"""Runs the given kernel with outer loop in C
Args:
ast: pystencils ast which is used to compile the benchmark file
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
path: path where the benchmark file is stored. If None a tmp folder is created
Returns:
list of times per iteration for each outer iteration
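Example (a hedged sketch, reusing the ``ast`` from :func:`generate_benchmark`):
.. code-block:: python

    times = run_c_benchmark(ast, inner_iterations=100, outer_iterations=3)
    print(min(times))   # best time per iteration in seconds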
"""
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
if path is None:
path = tempfile.mkdtemp()
if isinstance(path, str):
path = Path(path)
with open(path / 'bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = Path(kerncraft.__file__).parent
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + str(kerncraft_path / 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
str(kerncraft_path / 'headers' / 'timing.c'),
str(kerncraft_path / 'headers' / 'dummy.c'),
str(path / 'bench.c'),
'-o', str(path / 'bench'),
]
run_compile_step(compile_cmd)
time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
benchmark_time_limit = 20
if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
f"{benchmark_time_limit} seconds for this kernel")
warnings.warn(warn)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
import warnings
import fcntl
from collections import defaultdict
from tempfile import TemporaryDirectory
import textwrap
import itertools
import string
from jinja2 import Environment, PackageLoader, StrictUndefined, Template
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.kernel import symbol_pos_int
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import \
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.enums import Backend
from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
from pystencils.cpu.kernelcreation import add_openmp
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyzes a pystencils kernel AST, assuming it will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: MachineModel,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structure-of-arrays (SoA) or
an array-of-structures (AoS) layout is assumed
debug_print: print debug information
filename: used for caching
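Example (a hedged sketch; the machine file name is hypothetical):
.. code-block:: python

    from kerncraft.machinemodel import MachineModel
    machine = MachineModel('SkylakeSP_Gold-6148.yml')  # hypothetical machine file
    kernel = PyStencilsKerncraftKernel(ast, machine, assumed_layout='SoA')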
"""
super(KernelCode, self).__init__(machine=machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if loop.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name,
sp.Integer(cur_node.start),
sp.Integer(cur_node.stop),
sp.Integer(1))
# If the correct step size were provided, all accesses within that step length
# would also need to be passed to kerncraft: cur_node.step
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
# Variables (arrays) and Constants (scalar sizes)
const_names_iter = itertools.product(string.ascii_uppercase, repeat=1)
constants_reversed = {}
fields_accessed = self.kernel_ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
# Replace shape dimensions with constant variables (necessary for layer condition
# analysis)
for i, d in enumerate(permuted_shape):
if d not in self.constants.values():
const_symbol = symbol_pos_int(''.join(next(const_names_iter)))
self.set_constant(const_symbol, d)
constants_reversed[d] = const_symbol
permuted_shape[i] = constants_reversed[d]
self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [symbol_pos_int(LoopOverCoordinate.get_loop_counter_name(i)) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def get_kernel_header(self, name='pystencils_kernel'):
file_name = "pystencils_kernel.h"
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
pass
else: # lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect=Backend.C, signature_only=True)
jinja_context = {
'function_signature': function_signature,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.h').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernel'):
"""
Generate and return compilable source code from AST.
Args:
openmp: if true, openmp code will be generated
name: kernel name
"""
filename = 'pystencils_kernel'
if openmp:
filename += '-omp'
filename += '.c'
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
if openmp:
add_openmp(self.kernel_ast)
kernel_code = generate_c(self.kernel_ast, dialect=Backend.C)
jinja_context = {
'includes': includes,
'kernel_code': kernel_code,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
code = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
CODE_TEMPLATE = Template(textwrap.dedent("""
#include <likwid.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include "kerncraft.h"
#include "kernel.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
extern int var_false;
int main(int argc, char **argv) {
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
{%- endfor %}
// Declaring arrays
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
// TODO initialize in parallel context in same order as they are touched
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
{%- endfor %}
likwid_markerInit();
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
// Initializing arrays in same order as touched in kernel loop nest
//INIT_ARRAYS;
// Dummy call
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
likwid_markerStartRegion("loop");
}
for(; repeat > 0; --repeat) {
{{kernelName}}({{call_argument_list}});
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
}
likwid_markerStopRegion("loop");
}
likwid_markerClose();
return 0;
}
"""))
def get_main_code(self, kernel_function_name='kernel'):
"""
Generate and return compilable source code from AST.
:return: tuple of filename and shared lock file pointer
"""
# TODO produce nicer code, including help text and other "comfort features".
assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
"based on a kernel description rather than code."
file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX
# needs update
accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in self.kernel_ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Generate code
code = self.CODE_TEMPLATE.render(
kernelName=self.kernel_ast.function_name,
fields=fields,
constants=constants,
call_argument_list=','.join(call_parameters),
includes=includes)
# Store to file
with open(file_path, 'w') as f:
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__()
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
self['incore_model'] = 'OSACA'
self.update(**kwargs)
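# A hedged usage sketch: individual entries can be overridden while the
# remaining defaults are kept, e.g. KerncraftParameters(cores=4, verbose=1).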
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
#include <assert.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
/* see waLBerla src/field/allocation/AlignedMalloc */
void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
{
// With 0 alignment this function makes no sense
// use normal malloc instead
assert( alignment > 0 );
// Tests if alignment is power of two (assuming alignment>0)
assert( !(alignment & (alignment - 1)) );
assert( offset < alignment );
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
pa=std::malloc( (size+2*alignment-1 )+sizeof(void *));
if(!pa)
return nullptr;
// Find next aligned position, starting at pa+sizeof(void*)-1
ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
ptr=(void*) ( (char*)(ptr) + alignment - offset);
// Store pointer to real allocated chunk just before usable chunk
*((void **)ptr-1)=pa;
assert( ((size_t)ptr+offset) % alignment == 0 );
return ptr;
}
void aligned_free( void *ptr )
{
// assume that pointer to real allocated chunk is stored just before
// chunk that was given to user
if(ptr)
std::free(*((void **)ptr-1));
}
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
// Initialization {{field_name}}
{%- if alignment > 0 %}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
{%- else %}
{{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
{%- endif %}
for (unsigned long long i = 0; i < {{elements}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
{%- if alignment > 0 %}
aligned_free({{field_name}});
{%- else %}
delete[] {{field_name}};
{%- endif %}
{%- endfor %}
}
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
#define FUNC_PREFIX
{{function_signature}}