try:
    import pycuda.gpuarray as gpuarray
except ImportError:
    gpuarray = None

import numpy as np

import pystencils


class PyCudaArrayHandler:

    def __init__(self):
        import pycuda.autoinit  # NOQA

    def zeros(self, shape, dtype=np.float64, order='C'):
        cpu_array = np.zeros(shape=shape, dtype=dtype, order=order)
        return self.to_gpu(cpu_array)

    def ones(self, shape, dtype=np.float64, order='C'):
        cpu_array = np.ones(shape=shape, dtype=dtype, order=order)
        return self.to_gpu(cpu_array)

    def empty(self, shape, dtype=np.float64, layout=None):
        if layout:
            cpu_array = pystencils.field.create_numpy_array_with_layout(shape=shape, dtype=dtype, layout=layout)
            return self.to_gpu(cpu_array)
        else:
            return gpuarray.empty(shape, dtype)

    @staticmethod
    def to_gpu(array):
        return gpuarray.to_gpu(array)

    @staticmethod
    def upload(array, numpy_array):
        array.set(numpy_array)

    @staticmethod
    def download(array, numpy_array):
        array.get(numpy_array)

    def randn(self, shape, dtype=np.float64):
        cpu_array = np.random.randn(*shape).astype(dtype)
        return self.to_gpu(cpu_array)

    from_numpy = to_gpu


class PyCudaNotAvailableHandler:
    def __getattribute__(self, name):
        raise NotImplementedError("Unable to initialize PyCuda! "
                                  "Try to run `import pycuda.autoinit` to check whether PyCuda is working correctly!")
#pragma once
extern "C++" {
#ifdef __CUDA_ARCH__
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
// An implementation of C++ std::complex for use on CUDA devices.
// Written by John C. Travers <jtravs@gmail.com> (2012)
//
// Missing:
// - long double support (not supported on CUDA)
// - some integral pow functions (due to lack of C++11 support on CUDA)
//
// Heavily derived from the LLVM libcpp project (svn revision 147853).
// Based on libcxx/include/complex.
// The git history contains the complete change history from the original.
// The modifications are licensed as per the original LLVM license below.
//
// -*- C++ -*-
//===--------------------------- complex ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
extern "C++" {
#ifndef CUDA_COMPLEX_HPP
#define CUDA_COMPLEX_HPP
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
/*
complex synopsis
template<class T>
class complex
{
public:
typedef T value_type;
complex(const T& re = T(), const T& im = T());
complex(const complex&);
template<class X> complex(const complex<X>&);
T real() const;
T imag() const;
void real(T);
void imag(T);
complex<T>& operator= (const T&);
complex<T>& operator+=(const T&);
complex<T>& operator-=(const T&);
complex<T>& operator*=(const T&);
complex<T>& operator/=(const T&);
complex& operator=(const complex&);
template<class X> complex<T>& operator= (const complex<X>&);
template<class X> complex<T>& operator+=(const complex<X>&);
template<class X> complex<T>& operator-=(const complex<X>&);
template<class X> complex<T>& operator*=(const complex<X>&);
template<class X> complex<T>& operator/=(const complex<X>&);
};
template<>
class complex<float>
{
public:
typedef float value_type;
constexpr complex(float re = 0.0f, float im = 0.0f);
explicit constexpr complex(const complex<double>&);
constexpr float real() const;
void real(float);
constexpr float imag() const;
void imag(float);
complex<float>& operator= (float);
complex<float>& operator+=(float);
complex<float>& operator-=(float);
complex<float>& operator*=(float);
complex<float>& operator/=(float);
complex<float>& operator=(const complex<float>&);
template<class X> complex<float>& operator= (const complex<X>&);
template<class X> complex<float>& operator+=(const complex<X>&);
template<class X> complex<float>& operator-=(const complex<X>&);
template<class X> complex<float>& operator*=(const complex<X>&);
template<class X> complex<float>& operator/=(const complex<X>&);
};
template<>
class complex<double>
{
public:
typedef double value_type;
constexpr complex(double re = 0.0, double im = 0.0);
constexpr complex(const complex<float>&);
constexpr double real() const;
void real(double);
constexpr double imag() const;
void imag(double);
complex<double>& operator= (double);
complex<double>& operator+=(double);
complex<double>& operator-=(double);
complex<double>& operator*=(double);
complex<double>& operator/=(double);
complex<double>& operator=(const complex<double>&);
template<class X> complex<double>& operator= (const complex<X>&);
template<class X> complex<double>& operator+=(const complex<X>&);
template<class X> complex<double>& operator-=(const complex<X>&);
template<class X> complex<double>& operator*=(const complex<X>&);
template<class X> complex<double>& operator/=(const complex<X>&);
};
// 26.3.6 operators:
template<class T> complex<T> operator+(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&, const T&);
template<class T> complex<T> operator+(const T&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const T&);
template<class T> complex<T> operator-(const T&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const T&);
template<class T> complex<T> operator*(const T&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const T&);
template<class T> complex<T> operator/(const T&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&);
template<class T> bool operator==(const complex<T>&, const complex<T>&);
template<class T> bool operator==(const complex<T>&, const T&);
template<class T> bool operator==(const T&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const T&);
template<class T> bool operator!=(const T&, const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>&, complex<T>&);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>&, const complex<T>&);
// 26.3.7 values:
template<class T> T real(const complex<T>&);
double real(double);
template<Integral T> double real(T);
float real(float);
template<class T> T imag(const complex<T>&);
double imag(double);
template<Integral T> double imag(T);
float imag(float);
template<class T> T abs(const complex<T>&);
template<class T> T arg(const complex<T>&);
double arg(double);
template<Integral T> double arg(T);
float arg(float);
template<class T> T norm(const complex<T>&);
double norm(double);
template<Integral T> double norm(T);
float norm(float);
template<class T> complex<T> conj(const complex<T>&);
complex<double> conj(double);
template<Integral T> complex<double> conj(T);
complex<float> conj(float);
template<class T> complex<T> proj(const complex<T>&);
complex<double> proj(double);
template<Integral T> complex<double> proj(T);
complex<float> proj(float);
template<class T> complex<T> polar(const T&, const T& = 0);
// 26.3.8 transcendentals:
template<class T> complex<T> acos(const complex<T>&);
template<class T> complex<T> asin(const complex<T>&);
template<class T> complex<T> atan(const complex<T>&);
template<class T> complex<T> acosh(const complex<T>&);
template<class T> complex<T> asinh(const complex<T>&);
template<class T> complex<T> atanh(const complex<T>&);
template<class T> complex<T> cos (const complex<T>&);
template<class T> complex<T> cosh (const complex<T>&);
template<class T> complex<T> exp (const complex<T>&);
template<class T> complex<T> log (const complex<T>&);
template<class T> complex<T> log10(const complex<T>&);
template<class T> complex<T> pow(const complex<T>&, const T&);
template<class T> complex<T> pow(const complex<T>&, const complex<T>&);
template<class T> complex<T> pow(const T&, const complex<T>&);
template<class T> complex<T> sin (const complex<T>&);
template<class T> complex<T> sinh (const complex<T>&);
template<class T> complex<T> sqrt (const complex<T>&);
template<class T> complex<T> tan (const complex<T>&);
template<class T> complex<T> tanh (const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>& is, complex<T>& x);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>& o, const complex<T>& x);
*/
#include <math.h>
#include <sstream>
template <class _Tp> class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp> &__z, const complex<_Tp> &__w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp> &__x, const complex<_Tp> &__y);
template <class _Tp> class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
CUDA_CALLABLE_MEMBER
complex(const value_type &__re = value_type(),
const value_type &__im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex(const complex<_Xp> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
CUDA_CALLABLE_MEMBER value_type real() const { return __re_; }
CUDA_CALLABLE_MEMBER value_type imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(const value_type &__re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(const value_type &__re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(const value_type &__re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(const value_type &__re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(const value_type &__re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double>;
template <> class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f,
float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit /*constexpr*/ complex(const complex<double> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER float real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(float __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(float __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0,
double __im = 0.0)
: __re_(__re), __im_(__im) {}
/*constexpr*/ complex(const complex<float> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER double real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(double __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(double __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
// constexpr
inline CUDA_CALLABLE_MEMBER complex<float>::complex(const complex<double> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// constexpr
inline CUDA_CALLABLE_MEMBER complex<double>::complex(const complex<float> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
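// Both components of the naive product are NaN: retry with the operands
// classified (C99 Annex G style), so that a product involving an infinite
// operand still comes out infinite rather than NaN.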
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
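// Scale the divisor towards 1 by a power of two so that __c * __c + __d * __d
// below can neither overflow nor underflow; the quotient is rescaled by the
// same exponent via scalbn afterwards.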
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <>
CUDA_CALLABLE_MEMBER complex<float> operator/(const complex<float> &__z,
const complex<float> &__w) {
int __ilogbw = 0;
float __a = __z.real();
float __b = __z.imag();
float __c = __w.real();
float __d = __w.imag();
float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbnf(__c, -__ilogbw);
__d = scalbnf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw);
float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) {
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = copysignf(INFINITY, __c) * __a;
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = copysignf(INFINITY, __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysignf(isinf(__a) ? float(1) : float(0), __a);
__b = copysignf(isinf(__b) ? float(1) : float(0), __b);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = INFINITY * (__a * __c + __b * __d);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = INFINITY * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysignf(isinf(__c) ? float(1) : float(0), __c);
__d = copysignf(isinf(__d) ? float(1) : float(0), __d);
__x = float(0) * (__a * __c + __b * __d);
__y = float(0) * (__b * __c - __a * __d);
}
}
return complex<float>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__x,
const _Tp &__y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x) {
return __x;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const _Tp &__y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const _Tp &__x,
const complex<_Tp> &__y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const _Tp &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const _Tp &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
// 26.3.7 values:
// real
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp real(const complex<_Tp> &__c) {
return __c.real();
}
inline CUDA_CALLABLE_MEMBER double real(double __re) { return __re; }
inline CUDA_CALLABLE_MEMBER float real(float __re) { return __re; }
// imag
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp imag(const complex<_Tp> &__c) {
return __c.imag();
}
inline CUDA_CALLABLE_MEMBER double imag(double __re) { return 0; }
inline CUDA_CALLABLE_MEMBER float imag(float __re) { return 0; }
// abs
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp abs(const complex<_Tp> &__c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp arg(const complex<_Tp> &__c) {
return atan2(__c.imag(), __c.real());
}
inline CUDA_CALLABLE_MEMBER double arg(double __re) { return atan2(0., __re); }
inline CUDA_CALLABLE_MEMBER float arg(float __re) { return atan2f(0.F, __re); }
// norm
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp norm(const complex<_Tp> &__c) {
if (isinf(__c.real()))
return fabs(__c.real());
if (isinf(__c.imag()))
return fabs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
inline CUDA_CALLABLE_MEMBER double norm(double __re) { return __re * __re; }
inline CUDA_CALLABLE_MEMBER float norm(float __re) { return __re * __re; }
// conj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> conj(const complex<_Tp> &__c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
inline CUDA_CALLABLE_MEMBER complex<double> conj(double __re) {
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> conj(float __re) {
return complex<float>(__re);
}
// proj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> proj(const complex<_Tp> &__c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
inline CUDA_CALLABLE_MEMBER complex<double> proj(double __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> proj(float __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<float>(__re);
}
// polar
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> polar(const _Tp &__rho,
const _Tp &__theta = _Tp(0)) {
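// Edge cases: a NaN or negative magnitude yields (NaN, NaN); a non-finite
// angle produces a finite component only when __rho is infinite.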
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log(const complex<_Tp> &__x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log10(const complex<_Tp> &__x) {
return log(__x) / log(_Tp(10));
}
// sqrt
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sqrt(const complex<_Tp> &__x) {
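// Non-finite inputs are handled explicitly below; the general case uses the
// polar form sqrt(|x|) * exp(i * arg(x) / 2).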
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
if (__x.real() > _Tp(0))
return complex<_Tp>(__x.real(), isnan(__x.imag())
? __x.imag()
: copysign(_Tp(0), __x.imag()));
return complex<_Tp>(isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> exp(const complex<_Tp> &__x) {
_Tp __i = __x.imag();
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
} else if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return exp(__y * log(__x));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const _Tp &__y) {
return pow(__x, complex<_Tp>(__y));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const _Tp &__x,
const complex<_Tp> &__y) {
return pow(complex<_Tp>(__x), __y);
}
// asinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asinh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(copysign(__x.imag(), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1)));
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// acosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acosh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
if (isinf(__x.imag()))
if (__x.real() > 0)
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
else
return complex<_Tp>(-__x.real(),
copysign(__pi * _Tp(0.75), __x.imag()));
if (__x.real() < 0)
return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), __x.real());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
return complex<_Tp>(copysign(__z.real(), _Tp(0)),
copysign(__z.imag(), __x.imag()));
}
// atanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atanh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.imag())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (isnan(__x.imag())) {
if (isinf(__x.real()) || __x.real() == 0)
return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
return complex<_Tp>(__x.imag(), __x.imag());
}
if (isnan(__x.real())) {
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.real())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
copysign(_Tp(0), __x.imag()));
}
complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// sinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sinh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.imag() == 0 && !isfinite(__x.real()))
return __x;
return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
cosh(__x.real()) * sin(__x.imag()));
}
// cosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> cosh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(fabs(__x.real()), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(_Tp(NAN), __x.real());
if (__x.real() == 0 && __x.imag() == 0)
return complex<_Tp>(_Tp(1), __x.imag());
if (__x.imag() == 0 && !isfinite(__x.real()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
sinh(__x.real()) * sin(__x.imag()));
}
// tanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tanh(const complex<_Tp> &__x) {
if (isinf(__x.real())) {
if (!isfinite(__x.imag()))
return complex<_Tp>(_Tp(1), _Tp(0));
return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
}
if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __2r(_Tp(2) * __x.real());
_Tp __2i(_Tp(2) * __x.imag());
_Tp __d(cosh(__2r) + cos(__2i));
return complex<_Tp>(sinh(__2r) / __d, sin(__2i) / __d);
}
// asin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asin(const complex<_Tp> &__x) {
complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// acos
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acos(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (isinf(__x.imag())) {
if (__x.real() < _Tp(0))
return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
}
if (__x.real() < _Tp(0))
return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), -__x.imag());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
if (__x.real() == 0)
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
if (signbit(__x.imag()))
return complex<_Tp>(fabs(__z.imag()), fabs(__z.real()));
return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real()));
}
// atan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atan(const complex<_Tp> &__x) {
complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// sin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sin(const complex<_Tp> &__x) {
complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// cos
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> cos(const complex<_Tp> &__x) {
return cosh(complex<_Tp>(-__x.imag(), __x.real()));
}
// tan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tan(const complex<_Tp> &__x) {
complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
template <class _Tp, class _CharT, class _Traits>
std::basic_istream<_CharT, _Traits> &
operator>>(std::basic_istream<_CharT, _Traits> &__is, complex<_Tp> &__x) {
if (__is.good()) {
ws(__is);
if (__is.peek() == _CharT('(')) {
__is.get();
_Tp __r;
__is >> __r;
if (!__is.fail()) {
ws(__is);
_CharT __c = __is.peek();
if (__c == _CharT(',')) {
__is.get();
_Tp __i;
__is >> __i;
if (!__is.fail()) {
ws(__is);
__c = __is.peek();
if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, __i);
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, _Tp(0));
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else {
_Tp __r;
__is >> __r;
if (!__is.fail())
__x = complex<_Tp>(__r, _Tp(0));
else
__is.setstate(std::ios_base::failbit);
}
} else
__is.setstate(std::ios_base::failbit);
return __is;
}
template <class _Tp, class _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits> &
operator<<(std::basic_ostream<_CharT, _Traits> &__os, const complex<_Tp> &__x) {
std::basic_ostringstream<_CharT, _Traits> __s;
__s.flags(__os.flags());
__s.imbue(__os.getloc());
__s.precision(__os.precision());
__s << '(' << __x.real() << ',' << __x.imag() << ')';
return __os << __s.str();
}
//} // close namespace cuda_complex
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) - scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(scalar - real(complexNumber), imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const complex<U> &complexNumber,
const V scalar) -> complex<U> {
return complex<U>(real(complexNumber) / scalar, imag(complexNumber) / scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const V scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(scalar, 0) / complexNumber;
}
using ComplexDouble = complex<double>;
using ComplexFloat = complex<float>;
#endif // CUDA_COMPLEX_HPP
}
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
"""
Test of pystencils.data_types.address_of
"""
import sympy as sp
import pystencils
from pystencils.data_types import PointerType, address_of, cast_func, create_type
from pystencils.simp.simplifications import sympy_cse
def test_address_of():
x, y = pystencils.fields('x,y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType(create_type('int64')))
assert address_of(x[0, 0]).canonical() == x[0, 0]
assert address_of(x[0, 0]).dtype == PointerType(x[0, 0].dtype, restrict=True)
assert address_of(sp.Symbol("a")).dtype == PointerType('void', restrict=True)
assignments = pystencils.AssignmentCollection({
s: address_of(x[0, 0]),
y[0, 0]: cast_func(s, create_type('int64'))
}, {})
ast = pystencils.create_kernel(assignments)
pystencils.show_code(ast)
assignments = pystencils.AssignmentCollection({
y[0, 0]: cast_func(address_of(x[0, 0]), create_type('int64'))
}, {})
ast = pystencils.create_kernel(assignments)
pystencils.show_code(ast)
def test_address_of_with_cse():
x, y = pystencils.fields('x,y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType(create_type('int64')))
assignments = pystencils.AssignmentCollection({
y[0, 0]: cast_func(address_of(x[0, 0]), create_type('int64')) + s,
x[0, 0]: cast_func(address_of(x[0, 0]), create_type('int64')) + 1
}, {})
ast = pystencils.create_kernel(assignments)
pystencils.show_code(ast)
assignments_cse = sympy_cse(assignments)
ast = pystencils.create_kernel(assignments_cse)
pystencils.show_code(ast)
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import itertools

import numpy as np
import pytest
import sympy
from sympy.functions import im, re

import pystencils
from pystencils import AssignmentCollection
from pystencils.data_types import TypedSymbol, create_type

X, Y = pystencils.fields('x, y: complex64[2d]')
A, B = pystencils.fields('a, b: float32[2d]')
S1, S2, T = sympy.symbols('S1, S2, T')

TEST_ASSIGNMENTS = [
    AssignmentCollection({X[0, 0]: 1j}),
    AssignmentCollection({
        S1: re(Y.center),
        S2: im(Y.center),
        X[0, 0]: 2j * S1 + S2
    }),
    AssignmentCollection({
        A.center: re(Y.center),
        B.center: im(Y.center),
    }),
    AssignmentCollection({
        Y.center: re(Y.center) + X.center + 2j,
    }),
    AssignmentCollection({
        T: 2 + 4j,
        Y.center: X.center / T,
    })
]

SCALAR_DTYPES = ['float32', 'float64']


@pytest.mark.parametrize("assignment, scalar_dtypes",
                         itertools.product(TEST_ASSIGNMENTS, (np.float32,)))
@pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
def test_complex_numbers(assignment, scalar_dtypes, target):
    ast = pystencils.create_kernel(assignment,
                                   target=target,
                                   data_type=scalar_dtypes)
    code = pystencils.get_code_str(ast)
    print(code)
    assert "Not supported" not in code

    if target == pystencils.Target.GPU:
        pytest.importorskip('pycuda')

    kernel = ast.compile()
    assert kernel is not None


X, Y = pystencils.fields('x, y: complex128[2d]')
A, B = pystencils.fields('a, b: float64[2d]')
S1, S2 = sympy.symbols('S1, S2')
T128 = TypedSymbol('ts', create_type('complex128'))

TEST_ASSIGNMENTS = [
    AssignmentCollection({X[0, 0]: 1j}),
    AssignmentCollection({
        S1: re(Y.center),
        S2: im(Y.center),
        X[0, 0]: 2j * S1 + S2
    }),
    AssignmentCollection({
        A.center: re(Y.center),
        B.center: im(Y.center),
    }),
    AssignmentCollection({
        Y.center: re(Y.center) + X.center + 2j,
    }),
    AssignmentCollection({
        T128: 2 + 4j,
        Y.center: X.center / T128,
    })
]

SCALAR_DTYPES = ['float64']


@pytest.mark.parametrize("assignment", TEST_ASSIGNMENTS)
@pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
def test_complex_numbers_64(assignment, target):
    ast = pystencils.create_kernel(assignment,
                                   target=target,
                                   data_type='double')
    code = pystencils.get_code_str(ast)
    print(code)
    assert "Not supported" not in code

    if target == pystencils.Target.GPU:
        pytest.importorskip('pycuda')

    kernel = ast.compile()
    assert kernel is not None


@pytest.mark.parametrize('dtype', (np.float32, np.float64))
@pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
@pytest.mark.parametrize('with_complex_argument', ('with_complex_argument', False))
def test_complex_execution(dtype, target, with_complex_argument):
    complex_dtype = f'complex{64 if dtype == np.float32 else 128}'
    x, y = pystencils.fields(f'x, y: {complex_dtype}[2d]')

    x_arr = np.zeros((20, 30), complex_dtype)
    y_arr = np.zeros((20, 30), complex_dtype)

    if with_complex_argument:
        a = pystencils.TypedSymbol('a', create_type(complex_dtype))
    else:
        a = (2j + 1)

    assignments = AssignmentCollection({
        y.center: x.center + a
    })

    if target == pystencils.Target.GPU:
        pytest.importorskip('pycuda')
        from pycuda.gpuarray import zeros
        x_arr = zeros((20, 30), complex_dtype)
        y_arr = zeros((20, 30), complex_dtype)

    kernel = pystencils.create_kernel(assignments, target=target, data_type=dtype).compile()

    if with_complex_argument:
        kernel(x=x_arr, y=y_arr, a=2j + 1)
    else:
        kernel(x=x_arr, y=y_arr)

    if target == pystencils.Target.GPU:
        y_arr = y_arr.get()

    assert np.allclose(y_arr, 2j + 1)
import pytest

import pystencils as ps
import numpy as np


# This test aims to trigger deprecation warnings. Thus the warnings should not be displayed in the warning summary.
def test_create_kernel_backwards_compatibility():
    size = (30, 20)

    src_field_string = np.random.rand(*size)
    src_field_enum = np.copy(src_field_string)
    src_field_config = np.copy(src_field_string)

    dst_field_string = np.zeros(size)
    dst_field_enum = np.zeros(size)
    dst_field_config = np.zeros(size)

    f = ps.Field.create_from_numpy_array("f", src_field_enum)
    d = ps.Field.create_from_numpy_array("d", dst_field_enum)

    jacobi = ps.Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)

    ast_enum = ps.create_kernel(jacobi, target=ps.Target.CPU).compile()
    with pytest.warns(DeprecationWarning):
        ast_string = ps.create_kernel(jacobi, target='cpu').compile()
    # noinspection PyTypeChecker
    with pytest.warns(DeprecationWarning):
        ast_config = ps.create_kernel(jacobi, config=ps.CreateKernelConfig(target='cpu')).compile()

    ast_enum(f=src_field_enum, d=dst_field_enum)
    ast_string(f=src_field_string, d=dst_field_string)
    ast_config(f=src_field_config, d=dst_field_config)

    error = np.sum(np.abs(dst_field_enum - dst_field_string))
    np.testing.assert_almost_equal(error, 0.0)
    error = np.sum(np.abs(dst_field_enum - dst_field_config))
    np.testing.assert_almost_equal(error, 0.0)
import sympy
import pytest

import pystencils
from pystencils.astnodes import get_dummy_symbol
from pystencils.backends.cuda_backend import CudaSympyPrinter
from pystencils.data_types import address_of
from pystencils.enums import Target


def test_cuda_known_functions():
    printer = CudaSympyPrinter()
    print(printer.known_functions)

    x, y = pystencils.fields('x,y: float32 [2d]')
    assignments = pystencils.AssignmentCollection({
        get_dummy_symbol(): sympy.Function('atomicAdd')(address_of(y.center()), 2),
        y.center(): sympy.Function('rsqrtf')(x[0, 0])
    })

    ast = pystencils.create_kernel(assignments, target=Target.GPU)
    pytest.importorskip('pycuda')
    pystencils.show_code(ast)
    kernel = ast.compile()
    assert kernel is not None


def test_cuda_but_not_c():
    x, y = pystencils.fields('x,y: float32 [2d]')
    assignments = pystencils.AssignmentCollection({
        get_dummy_symbol(): sympy.Function('atomicAdd')(address_of(y.center()), 2),
        y.center(): sympy.Function('rsqrtf')(x[0, 0])
    })

    ast = pystencils.create_kernel(assignments, target=Target.CPU)
    pystencils.show_code(ast)


def test_cuda_unknown():
    x, y = pystencils.fields('x,y: float32 [2d]')
    assignments = pystencils.AssignmentCollection({
        get_dummy_symbol(): sympy.Function('wtf')(address_of(y.center()), 2),
    })

    ast = pystencils.create_kernel(assignments, target=Target.GPU)
    pystencils.show_code(ast)
%% Cell type:code id: tags:
``` python
import pytest
pytest.importorskip('graphviz')
```
%% Cell type:code id: tags:
``` python
from pystencils.session import *
from pystencils.astnodes import Block, Conditional
```
%% Cell type:code id: tags:
``` python
src, dst = ps.fields("src, dst: double[2D]", layout='c')
true_block = Block([ps.Assignment(dst[0, 0], src[-1, 0])])
false_block = Block([ps.Assignment(dst[0, 0], src[1, 0])])
ur = [true_block, Conditional(dst.center() > 0.0, true_block, false_block)]
ast = ps.create_kernel(ur)
```
%% Cell type:code id: tags:
``` python
ps.to_dot(ast, graph_style={'size': "9.5,12.5"})
```
%% Output
<graphviz.files.Source at 0x7f62452c4110>
import numpy as np

from pystencils import Assignment, Field
from pystencils.cpu import create_indexed_kernel, make_python_function


def test_indexed_kernel():
    arr = np.zeros((3, 4))
    dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
    index_arr = np.zeros((3,), dtype=dtype)
    index_arr[0] = (0, 2, 3.0)
    index_arr[1] = (1, 3, 42.0)
    index_arr[2] = (2, 1, 5.0)

    indexed_field = Field.create_from_numpy_array('index', index_arr)
    normal_field = Field.create_from_numpy_array('f', arr)
    update_rule = Assignment(normal_field[0, 0], indexed_field('value'))

    ast = create_indexed_kernel([update_rule], [indexed_field])
    kernel = make_python_function(ast)
    kernel(f=arr, index=index_arr)

    for i in range(index_arr.shape[0]):
        np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)


def test_indexed_cuda_kernel():
    try:
        import pycuda
    except ImportError:
        pycuda = None

    if pycuda:
        from pystencils.gpucuda import make_python_function
        import pycuda.gpuarray as gpuarray
        from pystencils.gpucuda.kernelcreation import created_indexed_cuda_kernel

        arr = np.zeros((3, 4))
        dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
        index_arr = np.zeros((3,), dtype=dtype)
        index_arr[0] = (0, 2, 3.0)
        index_arr[1] = (1, 3, 42.0)
        index_arr[2] = (2, 1, 5.0)

        indexed_field = Field.create_from_numpy_array('index', index_arr)
        normal_field = Field.create_from_numpy_array('f', arr)
        update_rule = Assignment(normal_field[0, 0], indexed_field('value'))

        ast = created_indexed_cuda_kernel([update_rule], [indexed_field])
        kernel = make_python_function(ast)

        gpu_arr = gpuarray.to_gpu(arr)
        gpu_index_arr = gpuarray.to_gpu(index_arr)
        kernel(f=gpu_arr, index=gpu_index_arr)
        gpu_arr.get(arr)
        for i in range(index_arr.shape[0]):
            np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)
    else:
        print("Did not run test on GPU since no pycuda is available")
%% Cell type:code id: tags:
``` python
from pystencils.session import *
```
%% Cell type:code id: tags:
``` python
dh = ps.create_data_handling(domain_size=(256, 256), periodicity=True)
c_field = dh.add_array('c')
dh.fill("c", 0.0, ghost_layers=True)
```
%% Cell type:code id: tags:
``` python
for x in range(129):
    for y in range(258):
        dh.cpu_arrays['c'][x, y] = 1.0
```
%% Cell type:code id: tags:
``` python
plt.scalar_field(dh.cpu_arrays["c"])
```
%% Output
<matplotlib.image.AxesImage at 0x7fcb7d253710>
%% Cell type:code id: tags:
``` python
ur = ps.Assignment(c_field[0, 0], c_field[1, 0])
ast = ps.create_kernel(ur, target=dh.default_target, cpu_openmp=True)
kernel = ast.compile()
```
%% Cell type:code id: tags:
``` python
c_sync = dh.synchronization_function_cpu(['c'])
```
%% Cell type:code id: tags:
``` python
def timeloop(steps=10):
    for i in range(steps):
        c_sync()
        dh.run_kernel(kernel)
    return dh.gather_array('c')
```
%% Cell type:code id: tags:
``` python
ps.jupyter.set_display_mode('video')
```
%% Cell type:code id: tags:
``` python
ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
ps.jupyter.display_animation(ani)
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
ps.jupyter.set_display_mode('image_update')
```
%% Cell type:code id: tags:
``` python
ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
ps.jupyter.display_animation(ani)
```
%% Output
%% Cell type:code id: tags:
``` python
def grid_update_function(image):
    for i in range(40):
        c_sync()
        dh.run_kernel(kernel)
    return dh.gather_array('c')
```
%% Cell type:code id: tags:
``` python
animation = ps.jupyter.make_imshow_animation(dh.cpu_arrays["c"], grid_update_function, frames=300)
```
%% Output
%% Cell type:code id: tags:
``` python
ps.jupyter.set_display_mode("video")
ps.jupyter.set_display_mode("window")
ps.jupyter.set_display_mode("image_update")
ps.jupyter.activate_ipython()
```
from collections import defaultdict

import numpy as np
import pytest
from sympy.abc import x, y

from pystencils import Assignment, create_kernel, fields, CreateKernelConfig
from pystencils.transformations import adjust_c_single_precision_type


@pytest.mark.parametrize("data_type", ("float", "double"))
def test_single_precision(data_type):
    dtype = f"float{64 if data_type == 'double' else 32}"
    s = fields(f"s: {dtype}[1D]")
    assignments = [Assignment(x, y), Assignment(s[0], x)]
    ast = create_kernel(assignments, config=CreateKernelConfig(data_type=data_type))
    assert ast.body.args[0].lhs.dtype.numpy_dtype == np.dtype(dtype)
    assert ast.body.args[0].rhs.dtype.numpy_dtype == np.dtype(dtype)
    assert ast.body.args[1].body.args[0].rhs.dtype.numpy_dtype == np.dtype(dtype)


def test_adjustment_dict():
    d = dict({"x": "float", "y": "double"})
    adjust_c_single_precision_type(d)
    assert np.dtype(d["x"]) == np.dtype("float32")
    assert np.dtype(d["y"]) == np.dtype("float64")


def test_adjustment_default_dict():
    dd = defaultdict(lambda: "float")
    dd["x"]  # touch the key so the default factory creates it before the adjustment
    adjust_c_single_precision_type(dd)
    dd["y"]  # keys created after the adjustment must be covered as well
    assert np.dtype(dd["x"]) == np.dtype("float32")
    assert np.dtype(dd["y"]) == np.dtype("float32")
    assert np.dtype(dd["z"]) == np.dtype("float32")
%% Cell type:code id: tags:
``` python
import pytest
pytest.importorskip('pycuda')
```
%% Cell type:code id: tags:
``` python
from pystencils.session import *
sp.init_printing()
frac = sp.Rational
```
%% Cell type:markdown id: tags:
# Phase-field simulation of dendritic solidification in 3D
This notebook tests the model presented in the dendritic growth tutorial in 3D.
%% Cell type:code id: tags:
``` python
target = ps.Target.GPU
gpu = target == ps.Target.GPU
domain_size = (25, 25, 25) if 'is_test_run' in globals() else (300, 300, 300)
dh = ps.create_data_handling(domain_size=domain_size, periodicity=True, default_target=target)
φ_field = dh.add_array('phi', latex_name='φ')
φ_delta_field = dh.add_array('phidelta', latex_name='φ_D')
t_field = dh.add_array('T')
```
%% Cell type:code id: tags:
``` python
ε, m, δ, j, θzero, α, γ, Teq, κ, τ = sp.symbols("ε m δ j θ_0 α γ T_eq κ τ")
εb = sp.Symbol("\\bar{\\epsilon}")
discretize = ps.fd.Discretization2ndOrder(dx=0.03, dt=1e-5)
φ = φ_field.center
T = t_field.center
d = ps.fd.Diff
def f(φ, m):
    return φ**4 / 4 - (frac(1, 2) - m / 3) * φ**3 + (frac(1, 4) - m / 2) * φ**2
bulk_free_energy_density = f(φ, m)
interface_free_energy_density = ε ** 2 / 2 * (d(φ, 0) ** 2 + d(φ, 1) ** 2 + d(φ, 2) ** 2)
```
%% Cell type:markdown id: tags:
Here comes the major change that has to be made for the 3D model: $\epsilon$ depends on the interface normal, which cannot be computed simply as atan() as in the 2D case.
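
In symbols, the next cell computes the normalized interface normal $n = \nabla φ / \|\nabla φ\|$ and from it the anisotropic interface width

$$σ(n) = δ \sum_{i=0}^{2} n_i^4, \qquad ε(n) = \bar{\epsilon}\,\bigl(1 + σ(n)\bigr)$$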
%% Cell type:code id: tags:
``` python
n = sp.Matrix([d(φ, i) for i in range(3)])
nLen = sp.sqrt(sum(n_i**2 for n_i in n))
n = n / nLen
nVal = sum(n_i**4 for n_i in n)
σ = δ * nVal
εVal = εb * (1 + σ)
εVal
```
%% Output
$\displaystyle \bar{\epsilon} \left(δ \left(\frac{{\partial_{0} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{1} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{2} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}}\right) + 1\right)$
%% Cell type:code id: tags:
``` python
def m_func(temperature):
    return (α / sp.pi) * sp.atan(γ * (Teq - temperature))
```
%% Cell type:code id: tags:
``` python
substitutions = {m: m_func(T),
                 ε: εVal}
fe_i = interface_free_energy_density.subs(substitutions)
fe_b = bulk_free_energy_density.subs(substitutions)
μ_if = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_i, φ), functions=[φ])
μ_b = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_b, φ), functions=[φ])
```
%% Cell type:code id: tags:
``` python
dF_dφ = μ_b + sp.Piecewise((μ_if, nLen**2 > 1e-10), (0, True))
```
%% Cell type:code id: tags:
``` python
parameters = {
    τ: 0.0003,
    κ: 1.8,
    εb: 0.01,
    δ: 0.3,
    γ: 10,
    j: 6,
    α: 0.9,
    Teq: 1.0,
    θzero: 0.2,
    sp.pi: sp.pi.evalf()
}
parameters
```
%% Output
$\displaystyle \left\{ \pi : 3.14159265358979, \ T_{eq} : 1.0, \ \bar{\epsilon} : 0.01, \ j : 6, \ α : 0.9, \ γ : 10, \ δ : 0.3, \ θ_{0} : 0.2, \ κ : 1.8, \ τ : 0.0003\right\}$
%% Cell type:code id: tags:
``` python
dφ_dt = -dF_dφ / τ

assignments = [
    ps.Assignment(φ_delta_field.center, discretize(dφ_dt.subs(parameters))),
]

φEqs = ps.simp.sympy_cse_on_assignment_list(assignments)
φEqs.append(ps.Assignment(φ, discretize(ps.fd.transient(φ) - φ_delta_field.center)))

temperatureEvolution = -ps.fd.transient(T) + ps.fd.diffusion(T, 1) + κ * φ_delta_field.center
temperatureEqs = [
    ps.Assignment(T, discretize(temperatureEvolution.subs(parameters)))
]
```
%% Cell type:code id: tags:
``` python
temperatureEqs
```
%% Output
$\displaystyle \left[ {{T}_{(0,0,0)}} \leftarrow 0.0111111111111111 {{T}_{(-1,0,0)}} + 0.0111111111111111 {{T}_{(0,-1,0)}} + 0.0111111111111111 {{T}_{(0,0,-1)}} + 0.933333333333333 {{T}_{(0,0,0)}} + 0.0111111111111111 {{T}_{(0,0,1)}} + 0.0111111111111111 {{T}_{(0,1,0)}} + 0.0111111111111111 {{T}_{(1,0,0)}} + 1.8 \cdot 10^{-5} {{φ_D}_{(0,0,0)}}\right]$
%% Cell type:code id: tags:
``` python
φ_kernel = ps.create_kernel(φEqs, cpu_openmp=4, target=target).compile()
temperatureKernel = ps.create_kernel(temperatureEqs, cpu_openmp=4, target=target).compile()
```
%% Cell type:code id: tags:
``` python
def time_loop(steps):
    φ_sync = dh.synchronization_function(['phi'], target=target)
    temperature_sync = dh.synchronization_function(['T'], target=target)
    dh.all_to_gpu()
    for t in range(steps):
        φ_sync()
        dh.run_kernel(φ_kernel)
        temperature_sync()
        dh.run_kernel(temperatureKernel)
    dh.all_to_cpu()


def init(nucleus_size=np.sqrt(5)):
    for b in dh.iterate():
        x, y, z = b.cell_index_arrays
        x, y, z = x - b.shape[0] // 2, y - b.shape[1] // 2, z - b.shape[2] // 2
        b['phi'].fill(0)
        b['phi'][(x ** 2 + y ** 2 + z ** 2) < nucleus_size ** 2] = 1.0
        b['T'].fill(0.0)


def plot(slice_obj=ps.make_slice[:, :, 0.5]):
    plt.subplot(1, 3, 1)
    plt.scalar_field(dh.gather_array('phi', slice_obj).squeeze())
    plt.title("φ")
    plt.colorbar()
    plt.subplot(1, 3, 2)
    plt.title("T")
    plt.scalar_field(dh.gather_array('T', slice_obj).squeeze())
    plt.colorbar()
    plt.subplot(1, 3, 3)
    plt.title("∂φ")
    plt.scalar_field(dh.gather_array('phidelta', slice_obj).squeeze())
    plt.colorbar()
```
%% Cell type:code id: tags:
``` python
init()
plot()
print(dh)
```
%% Output
Name| Inner (min/max)| WithGl (min/max)
----------------------------------------------------
T| ( 0, 0)| ( 0, 0)
phi| ( 0, 1)| ( 0, 1)
phidelta| ( 0, 0)| ( 0, 0)
%% Cell type:code id: tags:
``` python
if 'is_test_run' in globals():
    time_loop(2)
    assert np.isfinite(dh.max('phi'))
    assert np.isfinite(dh.max('T'))
    assert np.isfinite(dh.max('phidelta'))
else:
    from time import perf_counter

    vtk_writer = dh.create_vtk_writer('dentritic_growth_large', ['phi'])
    last = perf_counter()
    for i in range(300):
        time_loop(100)
        vtk_writer(i)
        print("Step ", i, perf_counter() - last, dh.max('phi'))
        last = perf_counter()
```
import pytest

import pystencils
from sympy import oo


@pytest.mark.parametrize('type', ('float32', 'float64', 'int64'))
@pytest.mark.parametrize('negative', (False, 'Negative'))
@pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
def test_print_infinity(type, negative, target):
    x = pystencils.fields(f'x: {type}[1d]')
    if negative:
        assignment = pystencils.Assignment(x.center, -oo)
    else:
        assignment = pystencils.Assignment(x.center, oo)
    ast = pystencils.create_kernel(assignment, data_type=type, target=target)

    if target == pystencils.Target.GPU:
        pytest.importorskip('pycuda')

    kernel = ast.compile()
    print(kernel.code)
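

# Added companion check (a sketch, not part of the original test file): besides
# inspecting the generated code, the CPU kernel can be executed to verify that
# the written values really are -inf. Only APIs already used above are assumed.
def test_negative_infinity_is_written_to_the_field():
    import numpy as np
    x = pystencils.fields('x: float64[1d]')
    kernel = pystencils.create_kernel(pystencils.Assignment(x.center, -oo)).compile()
    arr = np.empty(8)
    kernel(x=arr)
    assert np.all(np.isneginf(arr))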
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import pytest
import pystencils
from pystencils.backends.cbackend import CBackend
class UnsupportedNode(pystencils.astnodes.Node):
    def __init__(self):
        super().__init__()


def test_print_unsupported_node():
    with pytest.raises(NotImplementedError, match='CBackend does not support node of type UnsupportedNode'):
        CBackend()(UnsupportedNode())
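

# Added sketch, assuming CBackend dispatches to ``_print_<ClassName>`` methods
# in the style of SymPy printers (which is what produces the error message
# tested above); a subclass can then supply support for the custom node:
class ExtendedBackend(CBackend):
    def _print_UnsupportedNode(self, node):
        return ";  /* custom node lowered to a no-op */"


def test_print_custom_node_with_extended_backend():
    assert 'no-op' in ExtendedBackend()(UnsupportedNode())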
import numpy as np
import sympy as sp
from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
from pystencils.simp import sympy_cse_on_assignment_list
def test_sliced_iteration():
    size = (4, 4)
    src_arr = np.ones(size)
    dst_arr = np.zeros_like(src_arr)
    src_field = Field.create_from_numpy_array('src', src_arr)
    dst_field = Field.create_from_numpy_array('dst', dst_arr)

    a, b = sp.symbols("a b")
    update_rule = Assignment(dst_field[0, 0],
                             (a * src_field[0, 1] + a * src_field[0, -1] +
                              b * src_field[1, 0] + b * src_field[-1, 0]) / 4)

    x_end = TypedSymbol("x_end", "int")
    s = make_slice[1:x_end, 1]
    x_end_value = size[1] - 1
    kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile()

    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)

    expected_result = np.zeros(size)
    expected_result[1:x_end_value, 1] = 1
    np.testing.assert_almost_equal(expected_result, dst_arr)
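

# Added usage note (a sketch): make_slice simply turns subscript syntax into an
# ordinary tuple of slice objects, which create_kernel accepts as
# iteration_slice; typed symbols such as x_end are resolved at call time.
def test_make_slice_returns_plain_slice_tuple():
    s = make_slice[1:-1, 1]
    assert s == (slice(1, -1, None), 1)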
%% Cell type:code id: tags:
``` python
import pytest
pytest.importorskip('waLBerla')
```
%% Cell type:code id: tags:
``` python
from pystencils.session import *
from time import perf_counter
from statistics import median
from functools import partial
```
%% Cell type:markdown id: tags:
## Benchmark for Python call overhead
%% Cell type:code id: tags:
``` python
inner_repeats = 100
outer_repeats = 5
sizes = [2**i for i in range(1, 8)]
sizes
```
%% Output
$\displaystyle \left[ 2, \ 4, \ 8, \ 16, \ 32, \ 64, \ 128\right]$
%% Cell type:code id: tags:
``` python
def benchmark_pure(domain_size, extract_first=False):
    src = np.zeros(domain_size)
    dst = np.zeros_like(src)
    f_src, f_dst = ps.fields("src, dst", src=src, dst=dst)
    kernel = ps.create_kernel(ps.Assignment(f_dst.center, f_src.center)).compile()
    if extract_first:
        # bypass the Python convenience wrapper and call the compiled function directly
        kernel = kernel.kernel
    start = perf_counter()
    for i in range(inner_repeats):
        kernel(src=src, dst=dst)
        src, dst = dst, src
    end = perf_counter()
    return (end - start) / inner_repeats


def benchmark_datahandling(domain_size, parallel=False):
    dh = ps.create_data_handling(domain_size, parallel=parallel)
    f_src = dh.add_array('src')
    f_dst = dh.add_array('dst')
    kernel = ps.create_kernel(ps.Assignment(f_dst.center, f_src.center)).compile()
    start = perf_counter()
    for i in range(inner_repeats):
        dh.run_kernel(kernel)
        dh.swap('src', 'dst')
    end = perf_counter()
    return (end - start) / inner_repeats


name_to_func = {
    'pure_extract': partial(benchmark_pure, extract_first=True),
    'pure_no_extract': partial(benchmark_pure, extract_first=False),
    'dh_serial': partial(benchmark_datahandling, parallel=False),
    'dh_parallel': partial(benchmark_datahandling, parallel=True),
}
```
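%% Cell type:markdown id: tags:
The four variants differ only in how much Python sits between the timing loop and the compiled kernel: `pure_extract` calls the raw compiled function, `pure_no_extract` goes through the pystencils wrapper, and the `dh_*` variants add the serial or waLBerla-parallel data-handling layer on top. A single spot check per variant (an added sketch; absolute numbers depend on the machine):
%% Cell type:code id: tags:
``` python
for name, func in name_to_func.items():
    print(f"{name}: {func((32, 32)):.2e} s per call")
```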
%% Cell type:code id: tags:
``` python
result = {'block_size': [],
          'name': [],
          'time': []}

for bs in sizes:
    print("Computing size ", bs)
    for name, func in name_to_func.items():
        for i in range(outer_repeats):
            time = func((bs, bs))
            result['block_size'].append(bs)
            result['name'].append(name)
            result['time'].append(time)
```
%% Output
Computing size 2
Computing size 4
Computing size 8
Computing size 16
Computing size 32
Computing size 64
Computing size 128
%% Cell type:code id: tags:
``` python
if 'is_test_run' not in globals():
    import pandas as pd
    import seaborn as sns

    data = pd.DataFrame.from_dict(result)
    plt.subplot(1, 2, 1)
    sns.barplot(x='block_size', y='time', hue='name', data=data, alpha=0.6)
    plt.yscale('log')
    plt.subplot(1, 2, 2)
    sns.barplot(x='block_size', y='time', hue='name', data=data, alpha=0.6)
```
%% Output
(two side-by-side bar plots of mean call time per block size and variant; the left panel uses a logarithmic y-axis)
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import pytest
import numpy as np
import sympy as sp
import sympy.abc
import pystencils as ps
from pystencils.data_types import create_type
@pytest.mark.parametrize('default_assignment_simplifications', [False, True])
def test_sum(default_assignment_simplifications):
    sum = sp.Sum(sp.abc.k, (sp.abc.k, 1, 100))
    expanded_sum = sum.doit()

    print(sum)
    print(expanded_sum)

    x = ps.fields('x: float32[1d]')

    assignments = ps.AssignmentCollection({x.center(): sum})

    config = ps.CreateKernelConfig(default_assignment_simplifications=default_assignment_simplifications)
    ast = ps.create_kernel(assignments, config=config)
    code = ps.get_code_str(ast)
    kernel = ast.compile()

    print(code)
    if default_assignment_simplifications is False:
        assert 'double sum' in code

    array = np.zeros((10,), np.float32)
    kernel(x=array)
    assert np.allclose(array, int(expanded_sum) * np.ones_like(array))

@pytest.mark.parametrize('default_assignment_simplifications', [False, True])
def test_sum_use_float(default_assignment_simplifications):
    sum = sp.Sum(sp.abc.k, (sp.abc.k, 1, 100))
    expanded_sum = sum.doit()

    print(sum)
    print(expanded_sum)

    x = ps.fields('x: float32[1d]')

    assignments = ps.AssignmentCollection({x.center(): sum})

    config = ps.CreateKernelConfig(default_assignment_simplifications=default_assignment_simplifications,
                                   data_type=create_type('float32'))
    ast = ps.create_kernel(assignments, config=config)
    code = ps.get_code_str(ast)
    kernel = ast.compile()

    print(code)
    if default_assignment_simplifications is False:
        assert 'float sum' in code

    array = np.zeros((10,), np.float32)
    kernel(x=array)
    assert np.allclose(array, int(expanded_sum) * np.ones_like(array))

@pytest.mark.parametrize('default_assignment_simplifications', [False, True])
def test_product(default_assignment_simplifications):
    k = ps.TypedSymbol('k', create_type('int64'))

    sum = sp.Product(k, (k, 1, 10))
    expanded_sum = sum.doit()

    print(sum)
    print(expanded_sum)

    x = ps.fields('x: int64[1d]')

    assignments = ps.AssignmentCollection({x.center(): sum})

    config = ps.CreateKernelConfig(default_assignment_simplifications=default_assignment_simplifications)
    ast = ps.create_kernel(assignments, config=config)
    code = ps.get_code_str(ast)
    kernel = ast.compile()

    print(code)
    if default_assignment_simplifications is False:
        assert 'int64_t product' in code

    array = np.zeros((10,), np.int64)
    kernel(x=array)
    assert np.allclose(array, int(expanded_sum) * np.ones_like(array))

def test_prod_var_limit():
    k = ps.TypedSymbol('k', create_type('int64'))
    limit = ps.TypedSymbol('limit', create_type('int64'))

    sum = sp.Sum(k, (k, 1, limit))
    expanded_sum = sum.replace(limit, 100).doit()

    print(sum)
    print(expanded_sum)

    x = ps.fields('x: int64[1d]')

    assignments = ps.AssignmentCollection({x.center(): sum})

    ast = ps.create_kernel(assignments)
    ps.show_code(ast)
    kernel = ast.compile()

    array = np.zeros((10,), np.int64)
    kernel(x=array, limit=100)
    assert np.allclose(array, int(expanded_sum) * np.ones_like(array))
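

# Added usage note (a sketch): despite its name, test_prod_var_limit exercises a
# Sum whose upper limit is a runtime kernel parameter, so one compiled kernel
# can be reused with different limits; Sum(k, (k, 1, 10)) == 55.
def test_sum_var_limit_reused():
    k = ps.TypedSymbol('k', create_type('int64'))
    limit = ps.TypedSymbol('limit', create_type('int64'))
    x = ps.fields('x: int64[1d]')
    assignments = ps.AssignmentCollection({x.center(): sp.Sum(k, (k, 1, limit))})
    kernel = ps.create_kernel(assignments).compile()
    array = np.zeros((10,), np.int64)
    kernel(x=array, limit=10)
    assert np.allclose(array, 55)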
import pystencils as ps
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate, SympyAssignment
from pystencils.data_types import create_type
from pystencils.transformations import filtered_tree_iteration, get_loop_hierarchy, get_loop_counter_symbol_hierarchy
def test_loop_information():
    f, g = ps.fields("f, g: double[2D]")
    update_rule = ps.Assignment(g[0, 0], f[0, 0])

    ast = ps.create_kernel(update_rule)
    inner_loops = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
                   if loop.is_innermost_loop]

    # get_loop_hierarchy reports the loop coordinates outermost first ...
    loop_order = list(get_loop_hierarchy(inner_loops[0].args[0]))
    assert loop_order == [0, 1]

    # ... whereas get_loop_counter_symbol_hierarchy reports the counter
    # symbols innermost first.
    loop_symbols = get_loop_counter_symbol_hierarchy(inner_loops[0].args[0])
    assert loop_symbols == [TypedSymbol("ctr_1", create_type("int"), nonnegative=True),
                            TypedSymbol("ctr_0", create_type("int"), nonnegative=True)]
from sympy.abc import a, b, c, d, e, f
import pystencils
from pystencils.data_types import cast_func, create_type
def test_type_interference():
    x = pystencils.fields('x: float32[3d]')

    assignments = pystencils.AssignmentCollection({
        a: cast_func(10, create_type('float64')),  # explicit cast -> double
        b: cast_func(10, create_type('uint16')),   # explicit cast -> uint16_t
        e: 11,                                     # integer literal -> int64_t
        c: b,
        f: c + b,                                  # uint16_t + uint16_t stays uint16_t
        d: c + b + x.center + e,
        x.center: c + b + x.center
    })

    ast = pystencils.create_kernel(assignments)
    code = str(pystencils.get_code_str(ast))
    # check the declarations that type inference produced in the generated C code
    assert 'double a' in code
    assert 'uint16_t b' in code
    assert 'uint16_t f' in code
    assert 'int64_t e' in code