Compare revisions: showing 0 additions and 3645 deletions.
try:
    import pycuda.gpuarray as gpuarray
except ImportError:
    gpuarray = None

import numpy as np

import pystencils


class PyCudaArrayHandler:

    def __init__(self):
        import pycuda.autoinit  # NOQA

    def zeros(self, shape, dtype=np.float64, order='C'):
        return gpuarray.zeros(shape, dtype, order)

    def ones(self, shape, dtype, order='C'):
        return gpuarray.ones(shape, dtype, order)

    def empty(self, shape, dtype=np.float64, layout=None):
        if layout:
            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
            return self.from_numpy(cpu_array)
        else:
            return gpuarray.empty(shape, dtype)

    def to_gpu(self, array):
        return gpuarray.to_gpu(array)

    def upload(self, gpuarray, numpy_array):
        gpuarray.set(numpy_array)

    def download(self, gpuarray, numpy_array):
        gpuarray.get(numpy_array)

    def randn(self, shape, dtype=np.float64):
        cpu_array = np.random.randn(*shape).astype(dtype)
        return self.from_numpy(cpu_array)
try:
    import pyopencl.array as gpuarray
except ImportError:
    gpuarray = None

import numpy as np

import pystencils


class PyOpenClArrayHandler:

    def __init__(self, queue):
        if not queue:
            from pystencils.opencl.opencljit import get_global_cl_queue
            queue = get_global_cl_queue()
        assert queue, "OpenCL queue missing"
        self.queue = queue

    def zeros(self, shape, dtype=np.float64, order='C'):
        # pyopencl.array.zeros expects the command queue as its first argument
        return gpuarray.zeros(self.queue, shape, dtype, order)

    def ones(self, shape, dtype, order='C'):
        return gpuarray.ones(self.queue, shape, dtype, order)

    def empty(self, shape, dtype=np.float64, layout=None):
        if layout:
            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
            return self.from_numpy(cpu_array)
        else:
            return gpuarray.empty(self.queue, shape, dtype)

    def to_gpu(self, array):
        return gpuarray.to_device(self.queue, array)

    def upload(self, gpuarray, numpy_array):
        gpuarray.set(numpy_array, self.queue)

    def download(self, gpuarray, numpy_array):
        gpuarray.get(self.queue, numpy_array)

    def randn(self, shape, dtype=np.float64):
        cpu_array = np.random.randn(*shape).astype(dtype)
        return self.from_numpy(cpu_array)
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
from os.path import dirname, isdir, join

import numpy as np

try:
    import pycuda.driver as cuda
    from pycuda import gpuarray
except Exception:
    pass


def pow_two_divider(n):
    if n == 0:
        return 0
    divider = 1
    while (n & divider) == 0:
        divider <<= 1
    return divider


def ndarray_to_tex(tex_ref,
                   ndarray,
                   address_mode=None,
                   filter_mode=None,
                   use_normalized_coordinates=False,
                   read_as_integer=False):
    if address_mode is None:
        address_mode = cuda.address_mode.BORDER
    if filter_mode is None:
        filter_mode = cuda.filter_mode.LINEAR

    if isinstance(ndarray, np.ndarray):
        cu_array = cuda.np_to_array(ndarray, 'C')
    elif isinstance(ndarray, gpuarray.GPUArray):
        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
    else:
        raise TypeError(
            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')

    cuda.TextureReference.set_array(tex_ref, cu_array)

    tex_ref.set_address_mode(0, address_mode)
    if ndarray.ndim >= 2:
        tex_ref.set_address_mode(1, address_mode)
    if ndarray.ndim >= 3:
        tex_ref.set_address_mode(2, address_mode)
    tex_ref.set_filter_mode(filter_mode)

    if not use_normalized_coordinates:
        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES)

    if not read_as_integer:
        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER)


def prefilter_for_cubic_bspline(gpuarray):
    import pycuda.autoinit  # NOQA
    from pycuda.compiler import SourceModule

    ndim = gpuarray.ndim
    assert ndim == 2 or ndim == 3, "Only 2d or 3d supported"
    assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \
        "Submodule CubicInterpolationCUDA does not exist"
    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]

    file_name = join(dirname(__file__), "CubicInterpolationCUDA", "code", "cubicPrefilter%iD.cu" % ndim)
    with open(file_name) as file:
        code = file.read()

    mod = SourceModule(code, options=nvcc_options)

    if ndim == 2:
        height, width = gpuarray.shape
        block = min(pow_two_divider(height), 64)
        grid = height // block
        func = mod.get_function('SamplesToCoefficients2DXf')
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=(block, 1, 1),
             grid=(grid, 1, 1))

        block = min(pow_two_divider(width), 64)
        grid = width // block
        func = mod.get_function('SamplesToCoefficients2DYf')
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=(block, 1, 1),
             grid=(grid, 1, 1))
    elif ndim == 3:
        depth, height, width = gpuarray.shape
        dimX = min(min(pow_two_divider(width), pow_two_divider(height)), 64)
        # integer division keeps the CUDA block dimensions integral
        dimY = min(min(pow_two_divider(depth), pow_two_divider(height)), 512 // dimX)
        block = (dimX, dimY, 1)
        dimGridX = (height // block[0], depth // block[1], 1)
        dimGridY = (width // block[0], depth // block[1], 1)
        dimGridZ = (width // block[0], height // block[1], 1)

        func = mod.get_function("SamplesToCoefficients3DXf")
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=block,
             grid=dimGridX)

        func = mod.get_function("SamplesToCoefficients3DYf")
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=block,
             grid=dimGridY)

        func = mod.get_function("SamplesToCoefficients3DZf")
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=block,
             grid=dimGridZ)
#pragma once
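// Host/device view of a pystencils field: a raw data pointer plus per-dimension
// shape and stride (plain arrays in device code, std::array on the host where
// <array> is available).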
extern "C++" {
#ifdef __CUDA_ARCH__
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
#if !defined(__AES__) || !defined(__SSE4_1__)
#error AES-NI and SSE4.1 need to be enabled
#endif
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX512VL__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
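// Counter-based RNG built on AES-NI: the 128-bit counter is whitened with the
// 128-bit key and then passed through ten AES rounds (all reusing the same round
// key); the resulting ciphertext is mapped to uniform floats/doubles below.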
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
__m128i x = _mm_xor_si128(k, in);
x = _mm_aesenc_si128(x, k); // 1
x = _mm_aesenc_si128(x, k); // 2
x = _mm_aesenc_si128(x, k); // 3
x = _mm_aesenc_si128(x, k); // 4
x = _mm_aesenc_si128(x, k); // 5
x = _mm_aesenc_si128(x, k); // 6
x = _mm_aesenc_si128(x, k); // 7
x = _mm_aesenc_si128(x, k); // 8
x = _mm_aesenc_si128(x, k); // 9
x = _mm_aesenclast_si128(x, k); // 10
return x;
}
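// Unsigned 32-bit to float conversion for plain SSE (no _mm_cvtepu32_ps before
// AVX-512VL): convert the value halved by a logical shift, double the result and
// add back the dropped low bit, so the full uint32 range converts without sign errors.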
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
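// Unsigned 64-bit to double conversion for plain SSE: bias the high and low 32-bit
// halves with the constants 2^84 and 2^52 so the integer bits land in the mantissa,
// then recombine. The subtraction/addition order must not be reassociated, hence the
// no-associative-math attribute below.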
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
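// Usage sketch (counter and key words provided by the caller; names are illustrative):
//   double d1, d2;
//   aesni_double2(ctr0, ctr1, ctr2, ctr3, key0, key1, key2, key3, d1, d2);           // two uniform doubles in (0, 1)
//   float f1, f2, f3, f4;
//   aesni_float4(ctr0, ctr1, ctr2, ctr3, key0, key1, key2, key3, f1, f2, f3, f4);    // four uniform floats in (0, 1)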
// An implementation of C++ std::complex for use on CUDA devices.
// Written by John C. Travers <jtravs@gmail.com> (2012)
//
// Missing:
// - long double support (not supported on CUDA)
// - some integral pow functions (due to lack of C++11 support on CUDA)
//
// Heavily derived from the LLVM libcpp project (svn revision 147853).
// Based on libcxx/include/complex.
// The git history contains the complete change history from the original.
// The modifications are licensed as per the original LLVM license below.
//
// -*- C++ -*-
//===--------------------------- complex ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
extern "C++" {
#ifndef CUDA_COMPLEX_HPP
#define CUDA_COMPLEX_HPP
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
/*
complex synopsis
template<class T>
class complex
{
public:
typedef T value_type;
complex(const T& re = T(), const T& im = T());
complex(const complex&);
template<class X> complex(const complex<X>&);
T real() const;
T imag() const;
void real(T);
void imag(T);
complex<T>& operator= (const T&);
complex<T>& operator+=(const T&);
complex<T>& operator-=(const T&);
complex<T>& operator*=(const T&);
complex<T>& operator/=(const T&);
complex& operator=(const complex&);
template<class X> complex<T>& operator= (const complex<X>&);
template<class X> complex<T>& operator+=(const complex<X>&);
template<class X> complex<T>& operator-=(const complex<X>&);
template<class X> complex<T>& operator*=(const complex<X>&);
template<class X> complex<T>& operator/=(const complex<X>&);
};
template<>
class complex<float>
{
public:
typedef float value_type;
constexpr complex(float re = 0.0f, float im = 0.0f);
explicit constexpr complex(const complex<double>&);
constexpr float real() const;
void real(float);
constexpr float imag() const;
void imag(float);
complex<float>& operator= (float);
complex<float>& operator+=(float);
complex<float>& operator-=(float);
complex<float>& operator*=(float);
complex<float>& operator/=(float);
complex<float>& operator=(const complex<float>&);
template<class X> complex<float>& operator= (const complex<X>&);
template<class X> complex<float>& operator+=(const complex<X>&);
template<class X> complex<float>& operator-=(const complex<X>&);
template<class X> complex<float>& operator*=(const complex<X>&);
template<class X> complex<float>& operator/=(const complex<X>&);
};
template<>
class complex<double>
{
public:
typedef double value_type;
constexpr complex(double re = 0.0, double im = 0.0);
constexpr complex(const complex<float>&);
constexpr double real() const;
void real(double);
constexpr double imag() const;
void imag(double);
complex<double>& operator= (double);
complex<double>& operator+=(double);
complex<double>& operator-=(double);
complex<double>& operator*=(double);
complex<double>& operator/=(double);
complex<double>& operator=(const complex<double>&);
template<class X> complex<double>& operator= (const complex<X>&);
template<class X> complex<double>& operator+=(const complex<X>&);
template<class X> complex<double>& operator-=(const complex<X>&);
template<class X> complex<double>& operator*=(const complex<X>&);
template<class X> complex<double>& operator/=(const complex<X>&);
};
// 26.3.6 operators:
template<class T> complex<T> operator+(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&, const T&);
template<class T> complex<T> operator+(const T&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const T&);
template<class T> complex<T> operator-(const T&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const T&);
template<class T> complex<T> operator*(const T&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const T&);
template<class T> complex<T> operator/(const T&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&);
template<class T> bool operator==(const complex<T>&, const complex<T>&);
template<class T> bool operator==(const complex<T>&, const T&);
template<class T> bool operator==(const T&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const T&);
template<class T> bool operator!=(const T&, const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>&, complex<T>&);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>&, const complex<T>&);
// 26.3.7 values:
template<class T> T real(const complex<T>&);
double real(double);
template<Integral T> double real(T);
float real(float);
template<class T> T imag(const complex<T>&);
double imag(double);
template<Integral T> double imag(T);
float imag(float);
template<class T> T abs(const complex<T>&);
template<class T> T arg(const complex<T>&);
double arg(double);
template<Integral T> double arg(T);
float arg(float);
template<class T> T norm(const complex<T>&);
double norm(double);
template<Integral T> double norm(T);
float norm(float);
template<class T> complex<T> conj(const complex<T>&);
complex<double> conj(double);
template<Integral T> complex<double> conj(T);
complex<float> conj(float);
template<class T> complex<T> proj(const complex<T>&);
complex<double> proj(double);
template<Integral T> complex<double> proj(T);
complex<float> proj(float);
template<class T> complex<T> polar(const T&, const T& = 0);
// 26.3.8 transcendentals:
template<class T> complex<T> acos(const complex<T>&);
template<class T> complex<T> asin(const complex<T>&);
template<class T> complex<T> atan(const complex<T>&);
template<class T> complex<T> acosh(const complex<T>&);
template<class T> complex<T> asinh(const complex<T>&);
template<class T> complex<T> atanh(const complex<T>&);
template<class T> complex<T> cos (const complex<T>&);
template<class T> complex<T> cosh (const complex<T>&);
template<class T> complex<T> exp (const complex<T>&);
template<class T> complex<T> log (const complex<T>&);
template<class T> complex<T> log10(const complex<T>&);
template<class T> complex<T> pow(const complex<T>&, const T&);
template<class T> complex<T> pow(const complex<T>&, const complex<T>&);
template<class T> complex<T> pow(const T&, const complex<T>&);
template<class T> complex<T> sin (const complex<T>&);
template<class T> complex<T> sinh (const complex<T>&);
template<class T> complex<T> sqrt (const complex<T>&);
template<class T> complex<T> tan (const complex<T>&);
template<class T> complex<T> tanh (const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>& is, complex<T>& x);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>& o, const complex<T>& x);
*/
#include <math.h>
#include <sstream>
template <class _Tp> class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp> &__z, const complex<_Tp> &__w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp> &__x, const complex<_Tp> &__y);
template <class _Tp> class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
CUDA_CALLABLE_MEMBER
complex(const value_type &__re = value_type(),
const value_type &__im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex(const complex<_Xp> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
CUDA_CALLABLE_MEMBER value_type real() const { return __re_; }
CUDA_CALLABLE_MEMBER value_type imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(const value_type &__re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(const value_type &__re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(const value_type &__re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(const value_type &__re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(const value_type &__re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double>;
template <> class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f,
float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit /*constexpr*/ complex(const complex<double> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER float real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(float __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(float __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0,
double __im = 0.0)
: __re_(__re), __im_(__im) {}
/*constexpr*/ complex(const complex<float> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER double real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(double __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(double __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
// constexpr
inline CUDA_CALLABLE_MEMBER complex<float>::complex(const complex<double> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// constexpr
inline CUDA_CALLABLE_MEMBER complex<double>::complex(const complex<float> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <>
CUDA_CALLABLE_MEMBER complex<float> operator/(const complex<float> &__z,
const complex<float> &__w) {
int __ilogbw = 0;
float __a = __z.real();
float __b = __z.imag();
float __c = __w.real();
float __d = __w.imag();
float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbnf(__c, -__ilogbw);
__d = scalbnf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw);
float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) {
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = copysignf(INFINITY, __c) * __a;
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = copysignf(INFINITY, __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysignf(isinf(__a) ? float(1) : float(0), __a);
__b = copysignf(isinf(__b) ? float(1) : float(0), __b);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = INFINITY * (__a * __c + __b * __d);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = INFINITY * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysignf(isinf(__c) ? float(1) : float(0), __c);
__d = copysignf(isinf(__d) ? float(1) : float(0), __d);
__x = float(0) * (__a * __c + __b * __d);
__y = float(0) * (__b * __c - __a * __d);
}
}
return complex<float>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__x,
const _Tp &__y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x) {
return __x;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const _Tp &__y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const _Tp &__x,
const complex<_Tp> &__y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const _Tp &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const _Tp &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
// 26.3.7 values:
// real
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp real(const complex<_Tp> &__c) {
return __c.real();
}
inline CUDA_CALLABLE_MEMBER double real(double __re) { return __re; }
inline CUDA_CALLABLE_MEMBER float real(float __re) { return __re; }
// imag
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp imag(const complex<_Tp> &__c) {
return __c.imag();
}
inline CUDA_CALLABLE_MEMBER double imag(double __re) { return 0; }
inline CUDA_CALLABLE_MEMBER float imag(float __re) { return 0; }
// abs
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp abs(const complex<_Tp> &__c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp arg(const complex<_Tp> &__c) {
return atan2(__c.imag(), __c.real());
}
inline CUDA_CALLABLE_MEMBER double arg(double __re) { return atan2(0., __re); }
inline CUDA_CALLABLE_MEMBER float arg(float __re) { return atan2f(0.F, __re); }
// norm
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp norm(const complex<_Tp> &__c) {
if (isinf(__c.real()))
return fabs(__c.real());
if (isinf(__c.imag()))
return fabs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
inline CUDA_CALLABLE_MEMBER double norm(double __re) { return __re * __re; }
inline CUDA_CALLABLE_MEMBER float norm(float __re) { return __re * __re; }
// conj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> conj(const complex<_Tp> &__c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
inline CUDA_CALLABLE_MEMBER complex<double> conj(double __re) {
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> conj(float __re) {
return complex<float>(__re);
}
// proj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> proj(const complex<_Tp> &__c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
inline CUDA_CALLABLE_MEMBER complex<double> proj(double __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> proj(float __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<float>(__re);
}
// polar
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> polar(const _Tp &__rho,
const _Tp &__theta = _Tp(0)) {
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log(const complex<_Tp> &__x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log10(const complex<_Tp> &__x) {
return log(__x) / log(_Tp(10));
}
// sqrt
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sqrt(const complex<_Tp> &__x) {
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
if (__x.real() > _Tp(0))
return complex<_Tp>(__x.real(), isnan(__x.imag())
? __x.imag()
: copysign(_Tp(0), __x.imag()));
return complex<_Tp>(isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> exp(const complex<_Tp> &__x) {
_Tp __i = __x.imag();
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
} else if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return exp(__y * log(__x));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const _Tp &__y) {
return pow(__x, complex<_Tp>(__y));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const _Tp &__x,
const complex<_Tp> &__y) {
return pow(complex<_Tp>(__x), __y);
}
// asinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asinh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(copysign(__x.imag(), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1)));
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// acosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acosh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
if (isinf(__x.imag()))
if (__x.real() > 0)
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
else
return complex<_Tp>(-__x.real(),
copysign(__pi * _Tp(0.75), __x.imag()));
if (__x.real() < 0)
return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), __x.real());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
return complex<_Tp>(copysign(__z.real(), _Tp(0)),
copysign(__z.imag(), __x.imag()));
}
// atanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atanh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.imag())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (isnan(__x.imag())) {
if (isinf(__x.real()) || __x.real() == 0)
return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
return complex<_Tp>(__x.imag(), __x.imag());
}
if (isnan(__x.real())) {
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.real())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
copysign(_Tp(0), __x.imag()));
}
complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// sinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sinh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.imag() == 0 && !isfinite(__x.real()))
return __x;
return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
cosh(__x.real()) * sin(__x.imag()));
}
// cosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> cosh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(fabs(__x.real()), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(_Tp(NAN), __x.real());
if (__x.real() == 0 && __x.imag() == 0)
return complex<_Tp>(_Tp(1), __x.imag());
if (__x.imag() == 0 && !isfinite(__x.real()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
sinh(__x.real()) * sin(__x.imag()));
}
// tanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tanh(const complex<_Tp> &__x) {
if (isinf(__x.real())) {
if (!isfinite(__x.imag()))
return complex<_Tp>(_Tp(1), _Tp(0));
return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
}
if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __2r(_Tp(2) * __x.real());
_Tp __2i(_Tp(2) * __x.imag());
_Tp __d(cosh(__2r) + cos(__2i));
return complex<_Tp>(sinh(__2r) / __d, sin(__2i) / __d);
}
// asin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asin(const complex<_Tp> &__x) {
complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// acos
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acos(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (isinf(__x.imag())) {
if (__x.real() < _Tp(0))
return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
}
if (__x.real() < _Tp(0))
return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), -__x.imag());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
if (__x.real() == 0)
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
if (signbit(__x.imag()))
return complex<_Tp>(fabs(__z.imag()), fabs(__z.real()));
return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real()));
}
// atan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atan(const complex<_Tp> &__x) {
complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// sin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sin(const complex<_Tp> &__x) {
complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// cos
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> cos(const complex<_Tp> &__x) {
return cosh(complex<_Tp>(-__x.imag(), __x.real()));
}
// tan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tan(const complex<_Tp> &__x) {
complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
template <class _Tp, class _CharT, class _Traits>
std::basic_istream<_CharT, _Traits> &
operator>>(std::basic_istream<_CharT, _Traits> &__is, complex<_Tp> &__x) {
if (__is.good()) {
ws(__is);
if (__is.peek() == _CharT('(')) {
__is.get();
_Tp __r;
__is >> __r;
if (!__is.fail()) {
ws(__is);
_CharT __c = __is.peek();
if (__c == _CharT(',')) {
__is.get();
_Tp __i;
__is >> __i;
if (!__is.fail()) {
ws(__is);
__c = __is.peek();
if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, __i);
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, _Tp(0));
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else {
_Tp __r;
__is >> __r;
if (!__is.fail())
__x = complex<_Tp>(__r, _Tp(0));
else
__is.setstate(std::ios_base::failbit);
}
} else
__is.setstate(std::ios_base::failbit);
return __is;
}
template <class _Tp, class _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits> &
operator<<(std::basic_ostream<_CharT, _Traits> &__os, const complex<_Tp> &__x) {
std::basic_ostringstream<_CharT, _Traits> __s;
__s.flags(__os.flags());
__s.imbue(__os.getloc());
__s.precision(__os.precision());
__s << '(' << __x.real() << ',' << __x.imag() << ')';
return __os << __s.str();
}
//} // close namespace cuda_complex
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>{real(complexNumber) * scalar, imag(complexNumber) * scalar};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{real(complexNumber) * scalar, imag(complexNumber) * scalar};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>{real(complexNumber) + scalar, imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{real(complexNumber) + scalar, imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>{real(complexNumber) - scalar, imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{scalar - real(complexNumber), imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const complex<U> &complexNumber,
const V scalar) -> complex<U> {
return complex<U>{real(complexNumber) / scalar, imag(complexNumber) / scalar};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const V scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{scalar, 0} / complexNumber;
}
using ComplexDouble = complex<double>;
using ComplexFloat = complex<float>;
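// Usage sketch (illustrative variable names), valid in host and device code:
//   ComplexFloat a(1.0f, 2.0f), b(0.5f, -1.0f);
//   ComplexFloat c = a * b + conj(a);
//   float magnitude = abs(c);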
#endif // CUDA_COMPLEX_HPP
}
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
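// OpenCL C has no <cstdint>; these typedefs provide fixed-width names based on the
// integer sizes the OpenCL C specification guarantees (char: 8, short: 16, int: 32,
// long: 64 bits).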
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
#include <cstdint>
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
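// Philox 4x32-10 counter-based RNG (Salmon et al., "Parallel Random Numbers: As Easy
// as 1, 2, 3"): ten rounds of 32x32 -> 64-bit multiplies, with the two key words bumped
// by the Weyl constants between rounds, scramble a 128-bit counter into four 32-bit
// outputs, which are then mapped to uniform doubles or floats.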
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
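// Usage sketch (counter and key words provided by the caller; names are illustrative):
//   float r1, r2, r3, r4;
//   philox_float4(ctr0, ctr1, ctr2, ctr3, key0, key1, r1, r2, r3, r4);   // four uniform floats in (0, 1)
//   double d1, d2;
//   philox_double2(ctr0, ctr1, ctr2, ctr3, key0, key1, d1, d2);          // two uniform doubles in (0, 1)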
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import hashlib
import itertools
from enum import Enum
from typing import Set

import sympy as sp
from sympy.core.cache import cacheit

import pystencils
from pystencils.astnodes import Node
from pystencils.data_types import TypedSymbol, cast_func, create_type

try:
    import pycuda.driver
except Exception:
    pass

_hash = hashlib.md5


class InterpolationMode(str, Enum):
    NEAREST_NEIGHBOR = "nearest_neighbour"
    NN = NEAREST_NEIGHBOR
    LINEAR = "linear"
    CUBIC_SPLINE = "cubic_spline"
class Interpolator(object):
    """
    Implements non-integer accesses on fields using linear interpolation.

    On GPU, this interpolator can be implemented by a :class:`.TextureCachedField` for hardware acceleration.

    Address modes describe how accesses outside the field are handled; the possible choices mirror those
    of CUDA textures:

    **CLAMP**

    The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M.

    **BORDER**

    The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0 and for k >= M.

    Now, to describe the last two address modes, we are forced to consider normalized coordinates,
    so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1.

    **WRAP**

    The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M.
    In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p.

    **MIRROR**

    The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2.
    In other words, c[l / M] = c[k / M] for any l and k such that (l + k) mod (2 * M - 2) = 0.

    Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures
    """
    required_global_declarations = []

    def __init__(self,
                 parent_field,
                 interpolation_mode: InterpolationMode,
                 address_mode='BORDER',
                 use_normalized_coordinates=False,
                 allow_textures=True):
        super().__init__()

        self.field = parent_field.new_field_with_different_name(parent_field.name)
        self.field.field_type = pystencils.field.FieldType.CUSTOM
        self.address_mode = address_mode
        self.use_normalized_coordinates = use_normalized_coordinates
        hash_str = "%x" % abs(hash(self.field) + hash(address_mode))
        self.symbol = TypedSymbol('dummy_symbol_carrying_field' + self.field.name + hash_str,
                                  'dummy_symbol_carrying_field' + self.field.name + hash_str)
        self.symbol.field = self.field
        self.symbol.interpolator = self
        self.allow_textures = allow_textures
        self.interpolation_mode = interpolation_mode

    @property
    def _hashable_contents(self):
        return (str(self.address_mode),
                str(type(self)),
                self.symbol,
                self.address_mode,
                self.use_normalized_coordinates)

    def at(self, offset):
        return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])

    def __getitem__(self, offset):
        return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])

    def __str__(self):
        return '%s_interpolator_%s' % (self.field.name, self.reproducible_hash)

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self._hashable_contents)

    @property
    def reproducible_hash(self):
        return _hash(str(self._hashable_contents).encode()).hexdigest()
class LinearInterpolator(Interpolator):

    def __init__(self,
                 parent_field: pystencils.Field,
                 address_mode='BORDER',
                 use_normalized_coordinates=False):
        super().__init__(parent_field,
                         InterpolationMode.LINEAR,
                         address_mode,
                         use_normalized_coordinates)


class NearestNeightborInterpolator(Interpolator):

    def __init__(self,
                 parent_field: pystencils.Field,
                 address_mode='BORDER',
                 use_normalized_coordinates=False):
        super().__init__(parent_field,
                         InterpolationMode.NN,
                         address_mode,
                         use_normalized_coordinates)
class InterpolatorAccess(TypedSymbol):
    def __new__(cls, field, offsets, *args, **kwargs):
        obj = TextureAccess.__xnew_cached_(cls, field, offsets, *args, **kwargs)
        return obj

    def __new_stage2__(self, symbol, *offsets):
        assert offsets is not None
        obj = super().__xnew__(self, '%s_interpolator_%x' %
                               (symbol.field.name, abs(hash(tuple(offsets)))), symbol.field.dtype)
        obj.offsets = offsets
        obj.symbol = symbol
        obj.field = symbol.field
        obj.interpolator = symbol.interpolator
        return obj

    def __hash__(self):
        return hash((self.symbol, self.field, tuple(self.offsets), self.interpolator))

    def __str__(self):
        return '%s_interpolator(%s)' % (self.field.name, ','.join(str(o) for o in self.offsets))

    def __repr__(self):
        return self.__str__()

    def atoms(self, *types):
        if self.offsets:
            offsets = set(o for o in self.offsets if isinstance(o, types))
            if isinstance(self, *types):
                offsets.update([self])
            for o in self.offsets:
                if hasattr(o, 'atoms'):
                    offsets.update(set(o.atoms(*types)))
            return offsets
        else:
            return set()

    @property
    def free_symbols(self):
        symbols = set()
        if self.offsets is not None:
            for o in self.offsets:
                if hasattr(o, 'free_symbols'):
                    symbols.update(set(o.free_symbols))
                # if hasattr(o, 'atoms'):
                #     symbols.update(set(o.atoms(sp.Symbol)))
        return symbols

    @property
    def args(self):
        return [self.symbol, *self.offsets]

    @property
    def symbols_defined(self) -> Set[sp.Symbol]:
        return {self}

    @property
    def interpolation_mode(self):
        return self.interpolator.interpolation_mode

    def implementation_with_stencils(self):
        field = self.field

        default_int_type = create_type('int64')
        use_textures = isinstance(self, TextureAccess)
        if use_textures:
            def absolute_access(x, _):
                return self.texture.at((o for o in x))
        else:
            absolute_access = field.absolute_access

        sum = [0, ] * (field.shape[0] if field.index_dimensions else 1)

        offsets = self.offsets
        rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1)

        for channel_idx in range(field.shape[0] if field.index_dimensions else 1):
            if self.interpolation_mode == InterpolationMode.NN:
                if use_textures:
                    sum[channel_idx] = self
                else:
                    sum[channel_idx] = absolute_access([sp.floor(i + 0.5) for i in offsets], channel_idx)

            elif self.interpolation_mode == InterpolationMode.LINEAR:
                # TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/
                for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions):
                    weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
                    index = [f(offset) for (f, offset) in zip(c, offsets)]
                    # Hardware boundary handling on GPU
                    if use_textures:
                        weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
                        sum[channel_idx] += \
                            weight * absolute_access(index, channel_idx if field.index_dimensions else ())
                    # else boundary handling using software
                    elif str(self.interpolator.address_mode).lower() == 'border':
                        is_inside_field = sp.And(
                            *itertools.chain([i >= 0 for i in index],
                                             [idx < field.shape[dim] for (dim, idx) in enumerate(index)]))
                        index = [cast_func(i, default_int_type) for i in index]
                        sum[channel_idx] += sp.Piecewise(
                            (weight * absolute_access(index, channel_idx if field.index_dimensions else ()),
                             is_inside_field),
                            (sp.simplify(0), True)
                        )
                    elif str(self.interpolator.address_mode).lower() == 'clamp':
                        index = [sp.Min(sp.Max(0, cast_func(i, default_int_type)), field.spatial_shape[dim] - 1)
                                 for (dim, i) in enumerate(index)]
                        sum[channel_idx] += weight * \
                            absolute_access(index, channel_idx if field.index_dimensions else ())
                    elif str(self.interpolator.address_mode).lower() == 'wrap':
                        index = [sp.Mod(cast_func(i, default_int_type), field.shape[dim] - 1)
                                 for (dim, i) in enumerate(index)]
                        index = [cast_func(sp.Piecewise((i, i > 0),
                                                        (sp.Abs(cast_func(field.shape[dim] - 1 + i, default_int_type)),
                                                         True)), default_int_type)
                                 for (dim, i) in enumerate(index)]
                        sum[channel_idx] += weight * \
                            absolute_access(index, channel_idx if field.index_dimensions else ())
                        # sum[channel_idx] = 0
                    elif str(self.interpolator.address_mode).lower() == 'mirror':
                        def triangle_fun(x, half_period):
                            saw_tooth = sp.Abs(cast_func(x, default_int_type)) % (
                                cast_func(2 * half_period, create_type('int32')))
                            return sp.Piecewise((saw_tooth, saw_tooth < half_period),
                                                (2 * half_period - 1 - saw_tooth, True))
                        index = [cast_func(triangle_fun(i, field.shape[dim]),
                                           default_int_type) for (dim, i) in enumerate(index)]
                        sum[channel_idx] += weight * \
                            absolute_access(index, channel_idx if field.index_dimensions else ())
                    else:
                        raise NotImplementedError()
            elif self.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
                raise NotImplementedError("only works with HW interpolation for float32")

        sum = [sp.factor(s) for s in sum]

        if field.index_dimensions:
            return sp.Matrix(sum)
        else:
            return sum[0]

    # noinspection SpellCheckingInspection
    __xnew__ = staticmethod(__new_stage2__)
    # noinspection SpellCheckingInspection
    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
##########################################################################################
# GPU-specific fast specializations (for precision, GPUs can also use the above nodes/symbols) #
##########################################################################################
class TextureCachedField:

    def __init__(self, parent_field,
                 address_mode=None,
                 filter_mode=None,
                 interpolation_mode: InterpolationMode = InterpolationMode.LINEAR,
                 use_normalized_coordinates=False,
                 read_as_integer=False
                 ):
        if isinstance(address_mode, str):
            address_mode = getattr(pycuda.driver.address_mode, address_mode.upper())

        if address_mode is None:
            address_mode = pycuda.driver.address_mode.BORDER
        if filter_mode is None:
            filter_mode = pycuda.driver.filter_mode.LINEAR

        # self, field_name, field_type, dtype, layout, shape, strides
        self.field = parent_field
        self.address_mode = address_mode
        self.filter_mode = filter_mode
        self.read_as_integer = read_as_integer
        self.use_normalized_coordinates = use_normalized_coordinates
        self.interpolation_mode = interpolation_mode
        self.symbol = TypedSymbol(str(self), self.field.dtype.numpy_dtype)
        self.symbol.interpolator = self
        self.symbol.field = self.field
        self.required_global_declarations = [TextureDeclaration(self)]

        # assert str(self.field.dtype) != 'double', "CUDA does not support double textures!"
        # assert dtype_supports_textures(self.field.dtype), "CUDA only supports texture types with 32 bits or less"

    @classmethod
    def from_interpolator(cls, interpolator: LinearInterpolator):
        if (isinstance(interpolator, cls)
                or (hasattr(interpolator, 'allow_textures') and not interpolator.allow_textures)):
            return interpolator
        obj = cls(interpolator.field, interpolator.address_mode, interpolation_mode=interpolator.interpolation_mode)
        return obj

    def at(self, offset):
        return TextureAccess(self.symbol, *offset)

    def __getitem__(self, offset):
        return TextureAccess(self.symbol, *offset)

    def __str__(self):
        return '%s_texture_%s' % (self.field.name, self.reproducible_hash)

    def __repr__(self):
        return self.__str__()

    @property
    def _hashable_contents(self):
        return (type(self),
                self.address_mode,
                self.filter_mode,
                self.read_as_integer,
                self.interpolation_mode,
                self.use_normalized_coordinates)

    def __hash__(self):
        return hash(self._hashable_contents)

    @property
    def reproducible_hash(self):
        return _hash(str(self._hashable_contents).encode()).hexdigest()
class TextureAccess(InterpolatorAccess):
def __new__(cls, texture_symbol, offsets, *args, **kwargs):
obj = TextureAccess.__xnew_cached_(cls, texture_symbol, offsets, *args, **kwargs)
return obj
def __new_stage2__(self, symbol, *offsets):
obj = super().__xnew__(self, symbol, *offsets)
obj.required_global_declarations = symbol.interpolator.required_global_declarations
obj.required_global_declarations[0]._symbols_defined.add(obj)
return obj
def __str__(self):
return '%s_texture(%s)' % (self.interpolator.field.name, ','.join(str(o) for o in self.offsets))
@property
def texture(self):
return self.interpolator
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
class TextureDeclaration(Node):
"""
A global declaration of a texture. Visible both for device and host code.
.. code:: cpp
// This Node represents the following global declaration
texture<float, cudaTextureType2D, cudaReadModeElementType> x_texture_5acc9fced7b0dc3e;
__device__ kernel(...) {
// kernel accesses x_texture_5acc9fced7b0dc3e with tex2d(...)
}
__host__ launch_kernel(...) {
// Host needs to bind the texture
cudaBindTexture(0, x_texture_5acc9fced7b0dc3e, buffer, N*sizeof(float));
}
This has been deprecated by CUDA in favor of :class:`.TextureObject`.
But texture objects are not yet supported by PyCUDA (https://github.com/inducer/pycuda/pull/174)
"""
def __init__(self, parent_texture):
self.texture = parent_texture
self._symbols_defined = {self.texture.symbol}
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return self._symbols_defined
@property
def args(self) -> Set[sp.Symbol]:
return set()
@property
def headers(self):
return ['"pycuda-helpers.hpp"']
def __str__(self):
from pystencils.backends.cuda_backend import CudaBackend
return CudaBackend()(self)
class TextureObject(TextureDeclaration):
"""
A CUDA texture object. In contrast to :class:`.TextureDeclaration` it is not declared globally but
used as a function argument for the kernel call.
Like :class:`.TextureDeclaration` it defines :class:`.TextureAccess` symbols.
Just the printing representation is a bit different.
"""
pass
def dtype_supports_textures(dtype):
"""
Returns whether CUDA natively supports texture fetches with this numpy dtype.
The maximum word size for a texture fetch is four bytes.
With the helper functions in the following header, larger dtypes can also be fetched:
https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
"""
if hasattr(dtype, 'numpy_dtype'):
dtype = dtype.numpy_dtype
if isinstance(dtype, type):
return dtype().itemsize <= 4
return dtype.itemsize <= 4
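# Hedged usage sketch (not part of the original module): it shows how dtype_supports_textures()
# could be used to decide whether a field qualifies for texture caching. The field definition via
# pystencils.fields() is an assumption about the surrounding pystencils API.
def _example_texture_dtype_check():
    import numpy as np
    import pystencils as ps

    f = ps.fields('f: float32[2D]')
    assert dtype_supports_textures(f.dtype)                   # 4-byte float32 is natively supported
    assert not dtype_supports_textures(np.dtype('float64'))   # 8-byte doubles need the fetch helper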
from .generate_benchmark import generate_benchmark, run_c_benchmark
from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
import os
import subprocess
from jinja2 import Template
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
Args:
ast: the pystencils AST object as returned by create_kernel
likwid: if True, likwid markers are added to the code
openmp: relevant only if likwid=True; needed to generate correct likwid initialization code
timing: add timing output to the code that prints the time per iteration to stdout
Returns:
C code as string
"""
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled
if likwid and openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
args = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
'includes': includes,
'timing': timing,
}
return benchmark_template.render(**args)
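# Hedged usage sketch: dump the generated benchmark driver for a trivial copy kernel to a C file.
# The fixed field size is needed because the benchmark only supports kernels with a fixed loop size;
# pystencils.fields / create_kernel / Assignment are assumptions about the surrounding pystencils API.
def _example_generate_benchmark():
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[256,256]')
    ast = ps.create_kernel([ps.Assignment(dst.center, 2 * src.center)])
    with open('bench_copy.c', 'w') as f:
        f.write(generate_benchmark(ast, likwid=False, timing=True))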
def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
"""Runs the given kernel with outer loop in C
Args:
ast: the pystencils AST object as returned by create_kernel
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
Returns:
list of times per iteration, one for each outer iteration
"""
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
with open('bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = os.path.dirname(kerncraft.__file__)
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + os.path.join(kerncraft_path, 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c',
'-o', 'bench',
]
run_compile_step(compile_cmd)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
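# Hedged usage sketch: compile and run the C benchmark driver for a fixed-size kernel and report the
# best measured time per iteration. Assumes a configured C compiler and an installed kerncraft package.
def _example_run_c_benchmark():
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[256,256]')
    ast = ps.create_kernel([ps.Assignment(dst.center, 2 * src.center)])
    times = run_c_benchmark(ast, inner_iterations=100, outer_iterations=3)
    print('best time per iteration: %e s' % min(times))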
import warnings
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
import kerncraft
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.field import get_layout_from_strides
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structure-of-arrays (SoA) or
array-of-structures (AoS) layout is assumed
"""
kerncraft.kernel.Kernel.__init__(self, machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
# If the correct step were provided, all accesses within that step length would
# also need to be passed to kerncraft (cur_node.step)
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def as_code(self, type_='iaca', openmp=False, as_filename=False):
"""
Generate and return compilable source code.
Args:
type_: either 'iaca' or 'likwid'
openmp: if True, OpenMP code will be generated
as_filename: if True, the code is written to an intermediate file and its file name is returned
"""
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename:
fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
machine_and_compiler_dependent=False)
if not already_available:
fp.write(code)
return fp.name
else:
return code
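# Hedged usage sketch: wrap a pystencils AST for kerncraft and emit compilable LIKWID benchmark
# code. 'machine.yml' is a hypothetical kerncraft machine description file; the Jacobi-like update
# and the fixed field size are illustrative only.
def _example_kerncraft_kernel():
    import pystencils as ps
    from kerncraft.machinemodel import MachineModel

    src, dst = ps.fields('src, dst: double[1024,1024]')
    update = ps.Assignment(dst[0, 0], 0.25 * (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]))
    ast = ps.create_kernel([update])
    machine = MachineModel(path_to_yaml='machine.yml')  # hypothetical machine file
    kernel = PyStencilsKerncraftKernel(ast, machine=machine)
    print(kernel.as_code('likwid'))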
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
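# Hedged sketch: split the resolved field accesses of a kernel AST into read and write sets.
# The kernel definition via the public pystencils API is an illustrative assumption.
def _example_field_access_search():
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[2D]')
    ast = ps.create_kernel([ps.Assignment(dst.center, src[1, 0] + src[-1, 0])])
    reads, writes = search_resolved_field_accesses_in_ast(ast)
    print(len(reads), 'read accesses,', len(writes), 'write accesses')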
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
# TODO size of int??? unsigned???
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
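# Hedged sketch: emit a trivial counting loop with the Loop helper. Note that start, stop and step
# must already be llvmlite values (the default integer step of 1 would not work with builder.add),
# so constants are created explicitly here.
def _example_llvm_loop():
    import llvmlite.ir as ir

    i64 = ir.IntType(64)
    module = ir.Module(name='loop_demo')
    fn = ir.Function(module, ir.FunctionType(ir.VoidType(), []), name='count_to_ten')
    builder = ir.IRBuilder(fn.append_basic_block('entry'))
    with Loop(builder, i64(0), i64(10), i64(1), loop_name='i') as counter:
        pass  # the loop body would be emitted here; `counter` is the phi node holding the index
    builder.ret_void()
    print(module)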
from pystencils.llvm.llvmjit import make_python_function
from pystencils.transformations import insert_casts
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None, target='cpu'):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
Args:
assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
function_name: name of the generated function - only important if generated code is written out
type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean, which are assumed to be 'bool'.
split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
iteration_slice: if not None, iteration is done only over this slice of the field
ghost_layers: a sequence of pairs for each coordinate with lower and upper number of ghost layers.
If None, the number of ghost layers is determined automatically and assumed to be equal
for all dimensions
:return: :class:`pystencils.ast.KernelFunction` node
"""
if target == 'cpu':
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code._backend = 'llvm'
elif target == 'gpu':
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
code = create_cuda_kernel(assignments,
function_name,
type_info,
iteration_slice=iteration_slice,
ghost_layers=ghost_layers)
code._backend = 'llvm_gpu'
else:
raise NotImplementedError()
code.body = insert_casts(code.body)
code._compile_function = make_python_function
return code
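# Hedged usage sketch: build and JIT-compile a simple update rule through the LLVM backend and run
# it on numpy arrays. Field names, shapes and the doubling rule are illustrative only.
def _example_llvm_kernel():
    import numpy as np
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[2D]')
    ast = create_kernel([ps.Assignment(dst.center, 2 * src.center)], target='cpu')
    kernel = make_python_function(ast)
    a, b = np.ones((16, 16)), np.zeros((16, 16))
    kernel(src=a, dst=b)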
import functools
import llvmlite.ir as ir
import llvmlite.llvmpy.core as lc
import sympy as sp
from sympy import Indexed, S
from sympy.printing.printer import Printer
from pystencils.assignment import Assignment
from pystencils.data_types import (
collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
to_llvm_type)
from pystencils.llvm.control_flow import Loop
# From Numba
def set_cuda_kernel(lfunc):
from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
m = lfunc.module
ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
md = MetaData.get(m, ops)
nmd = m.get_or_insert_named_metadata('nvvm.annotations')
nmd.add(md)
# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
m.add_named_metadata('nvvmir.version', md_ver)
# From Numba
def _call_sreg(builder, name):
module = builder.module
fnty = lc.Type.function(lc.Type.int(), ())
fn = module.get_or_insert_function(fnty, name=name)
return builder.call(fn, ())
def generate_llvm(ast_node, module=None, builder=None, target='cpu'):
"""Prints the ast as llvm code."""
if module is None:
module = lc.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder, target=target)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, target='cpu', *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
self.target = target
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError("Symbol not found: %s" % s)
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
# KernelFunction does not possess a return type
return_type = self.void
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == 'gpu'))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = parameters[i].symbol.name
self.func_arg_map[parameters[i].symbol.name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
if self.target == 'gpu':
set_cuda_kernel(fn)
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_boolean_cast_func(self, conversion):
return self._print_cast_func(conversion)
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int32"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
if piece.has(Assignment):
raise NotImplementedError('The llvm-backend does not support assignments '
'in the Piecewise function. It is questionable '
'whether to implement it. So far there is no '
'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == 'gpu'))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
def _print_Conditional(self, node):
cond = self._print(node.condition_expr)
with self.builder.if_else(cond) as (then, otherwise):
with then:
self._print(node.true_block) # emit instructions for when the predicate is true
with otherwise:
self._print(node.false_block) # emit instructions for when the predicate is false
# No return!
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
mro = inspect.getmro(expr)
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
# from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
}
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
name = f"{function_name}.{dimension}"
return self.builder.zext(_call_sreg(self.builder, name), self.integer)
import ctypes as ct
import subprocess
from functools import partial
from itertools import chain
from os.path import exists, join
import llvmlite.binding as llvm
import llvmlite.ir as ir
import numpy as np
from pystencils.data_types import create_composite_type_from_string
from pystencils.field import FieldType
from ..data_types import StructType, ctypes_from_llvm, to_ctypes
from .llvm import generate_llvm
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if param.is_field_parameter:
try:
field_arr = argument_dict[param.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + param.field_name)
symbolic_field = param.fields[0]
if param.is_field_pointer:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(param.field_name, str(field_arr.shape), str(symbolic_field.shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif param.is_field_shape:
data_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
elif param.is_field_stride:
data_type = to_ctypes(param.symbol.dtype)
assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
ct_arguments.append(data_type(item_stride))
else:
assert False
else:
try:
value = argument_dict[param.symbol.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + param.symbol.name)
expected_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(expected_type(value))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
def generate_and_jit(ast):
target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
gen = generate_llvm(ast, target=target)
if isinstance(gen, ir.Module):
return compile_llvm(gen, target, ast)
else:
return compile_llvm(gen.module, target, ast)
def make_python_function(ast, argument_dict={}, func=None):
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module, target='cpu', ast=None):
jit = CudaJit(ast) if target == "gpu" else Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
# Following code more or less from numba
class CudaJit(Jit):
CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
64: 'nvptx64-nvidia-cuda'}
MACHINE_BITS = tuple.__itemsize__ * 8
data_layout = {
32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
default_data_layout = data_layout[MACHINE_BITS]
def __init__(self, ast):
# super().__init__()
# self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
self._data_layout = self.data_layout[self.MACHINE_BITS]
# self._target_data = llvm.create_target_data(self._data_layout)
self.indexing = ast.indexing
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = False
pmb.slp_vectorize = False
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
pm.run(self.llvmmod)
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def parse(self, module):
llvmmod = module
llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
llvmmod.data_layout = self.default_data_layout
llvmmod.verify()
llvmmod.name = 'module'
self._llvmmod = llvm.parse_assembly(str(llvmmod))
def compile(self):
from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
import hashlib
compiler_cache = get_cache_config()['object_cache']
ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
ptx_file = ir_file.replace('.ll', '.ptx')
try:
from pycuda.driver import Context
arch = "sm_%d%d" % Context.get_device().compute_capability()
except Exception:
arch = "sm_35"
if not exists(ptx_file):
self.write_ll(ir_file)
if 'llc' in get_compiler_config():
llc_command = get_compiler_config()['llc']
else:
llc_command = get_llc_command() or 'llc'
subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
# cubin_file = ir_file.replace('.ll', '.cubin')
# if not exists(cubin_file):
# subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
import pycuda.driver
cuda_module = pycuda.driver.module_from_file(ptx_file) # also works: cubin_file
self.cuda_module = cuda_module
def __call__(self, func, *args, **kwargs):
shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
block_and_thread_numbers = self.indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
def get_function_ptr(self, name):
return partial(self.__call__, name)
"""
Default Sympy optimizations applied in pystencils kernels using :func:`sympy.codegen.rewriting.optimize`.
See :func:`sympy.codegen.rewriting.optimize`.
"""
import itertools
from pystencils import Assignment
from pystencils.astnodes import SympyAssignment
try:
from sympy.codegen.rewriting import optims_c99, optimize
from sympy.codegen.rewriting import ReplaceOptim
HAS_REWRITING = True
# Evaluates all constant terms
evaluate_constant_terms = ReplaceOptim(
lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
lambda p: p.evalf()
)
optims_pystencils_cpu = [evaluate_constant_terms] + list(optims_c99)
optims_pystencils_gpu = [evaluate_constant_terms] + list(optims_c99)
except ImportError:
from warnings import warn
warn("Could not import ReplaceOptim, optims_c99, optimize from sympy.codegen.rewriting."
"Please update your sympy installation!")
optims_c99 = []
optims_pystencils_cpu = []
optims_pystencils_gpu = []
HAS_REWRITING = False
def optimize_assignments(assignments, optimizations):
if HAS_REWRITING:
assignments = [Assignment(a.lhs, optimize(a.rhs, optimizations))
if hasattr(a, 'lhs')
else a for a in assignments]
assignments_nodes = [a.atoms(SympyAssignment) for a in assignments]
for a in itertools.chain.from_iterable(assignments_nodes):
a.optimize(optimizations)
return assignments
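# Hedged sketch: apply the default CPU optimization set to a plain list of assignments. The symbols
# and the constant term are illustrative; evaluate_constant_terms is meant to turn constant terms
# such as the rational 1/3 into floating point literals.
def _example_optimize_assignments():
    import sympy as sp

    x, y = sp.symbols('x y')
    optimized = optimize_assignments([Assignment(y, 2 * x + sp.Rational(1, 3))],
                                     optims_pystencils_cpu)
    print(optimized)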
import numpy as np
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
USE_FAST_MATH = True
_global_cl_ctx = None
_global_cl_queue = None
def get_global_cl_queue():
return _global_cl_queue
def get_global_cl_ctx():
return _global_cl_ctx
def init_globally(device_index=0):
import pyopencl as cl
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = cl.create_some_context(device_index)
_global_cl_queue = cl.CommandQueue(_global_cl_ctx)
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
"""
Creates an **OpenCL** kernel function from an abstract syntax tree which
was created for ``target='gpu'``, e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
Args:
opencl_queue: a valid :class:`pyopencl.CommandQueue`
opencl_ctx: a valid :class:`pyopencl.Context`
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
Returns:
compiled kernel as Python function
"""
import pyopencl as cl
if not opencl_ctx:
opencl_ctx = _global_cl_ctx
if not opencl_queue:
opencl_queue = _global_cl_queue
assert opencl_ctx, "No valid OpenCL context"
assert opencl_queue, "No valid OpenCL queue"
if argument_dict is None:
argument_dict = {}
# check if double precision is supported and required
if any([d.double_fp_config == 0 for d in opencl_ctx.devices]):
for param in kernel_function_node.get_parameters():
if param.symbol.dtype.base_type:
if param.symbol.dtype.base_type.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
else:
if param.symbol.dtype.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
# Changing the kernel name is necessary since compilation with the default name "kernel" is not possible (OpenCL keyword!)
kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __kernel\n"
code += "#define RESTRICT restrict\n\n"
code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
options = []
if USE_FAST_MATH:
options.append("-cl-unsafe-math-optimizations")
options.append("-cl-mad-enable")
options.append("-cl-fast-relaxed-math")
options.append("-cl-finite-math-only")
options.append("-I")
options.append(get_pystencils_include_path())
mod = cl.Program(opencl_ctx, code).build(options=options)
func = getattr(mod, kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
block_and_thread_numbers['grid']))
args = _build_numpy_argument_list(parameters, full_arguments)
args = [a.data if hasattr(a, 'data') else a for a in args]
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
return wrapper
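# Hedged usage sketch: compile a GPU AST with the globally initialized OpenCL context/queue and
# launch it on pyopencl arrays. Field names, shapes and the float32 dtype are illustrative; the AST
# is produced by the CUDA kernel creation path as described in the docstring above.
def _example_opencl_kernel():
    import numpy as np
    import pyopencl.array as cl_array
    import pystencils as ps
    from pystencils.gpucuda.kernelcreation import create_cuda_kernel

    init_globally()
    src, dst = ps.fields('src, dst: float32[2D]')
    ast = create_cuda_kernel([ps.Assignment(dst.center, 2 * src.center)])
    kernel = make_python_function(ast, get_global_cl_queue(), get_global_cl_ctx())
    a = cl_array.to_device(get_global_cl_queue(), np.ones((16, 16), dtype=np.float32))
    b = cl_array.zeros_like(a)
    kernel(src=a, dst=b)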
import numpy as np
import sympy as sp
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
def _get_rng_template(name, data_type, num_vars):
if data_type is np.float32:
c_type = "float"
elif data_type is np.float64:
c_type = "double"
template = "\n"
for i in range(num_vars):
template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars)))
return template
def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
for i in range(dim)] + [0] * (3 - dim) + list(keys)
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return template.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
class RNGBase(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
if keys is None:
keys = (0,) * self._num_keys
if len(keys) != self._num_keys:
raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
if len(offsets) != 3:
raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self._offsets = offsets
self.headers = ['"{}_rand.h"'.format(self._name)]
self.keys = tuple(keys)
self._args = sp.sympify((dim, time_step, keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in (self._time_step, *self._offsets, *self.keys) if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def fast_subs(self, _):
return self # nothing to replace inside this node - substitution would destroy the intermediate "dummy" symbols by re-creating them
def get_code(self, dialect, vector_instruction_set):
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set,
self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
def __repr__(self):
return (", ".join(['{}'] * self._num_vars) + " \\leftarrow {}RNG").format(*self.result_symbols,
self._name.capitalize())
class PhiloxTwoDoubles(RNGBase):
_name = "philox"
_data_type = np.float64
_num_vars = 2
_num_keys = 2
class PhiloxFourFloats(RNGBase):
_name = "philox"
_data_type = np.float32
_num_vars = 4
_num_keys = 2
class AESNITwoDoubles(RNGBase):
_name = "aesni"
_data_type = np.float64
_num_vars = 2
_num_keys = 4
class AESNIFourFloats(RNGBase):
_name = "aesni"
_data_type = np.float32
_num_vars = 4
_num_keys = 4
def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
counter = 0
while True:
node = rng_node(*args, keys=(counter, seed), **kwargs)
inserted = False
for symbol in node.result_symbols:
if not inserted:
assignment_list.insert(0, node)
inserted = True
yield symbol
counter += 1
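# Hedged usage sketch: draw two Philox random numbers per cell for a 2D kernel. The generator
# inserts the RNG node into the assignment list when the first of its symbols is requested;
# Assignment is imported from pystencils for illustration only.
def _example_random_symbol():
    import sympy as sp
    from pystencils import Assignment

    assignments = []
    rng = random_symbol(assignments, dim=2, rng_node=PhiloxTwoDoubles)
    r1, r2 = next(rng), next(rng)
    assignments.append(Assignment(sp.Symbol('noise'), r1 - r2))
    print(assignments)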