ea943334 · ea943334 · ea943334 · ea943334 · ea943334 · ea943334
--- a/pystencils/include/cuda_complex.hpp
+++ b/pystencils/include/cuda_complex.hpp
-// An implementation of C++ std::complex for use on CUDA devices.
-// Written by John C. Travers <jtravs@gmail.com> (2012)
-//
-// Missing:
-//  - long double support (not supported on CUDA)
-//  - some integral pow functions (due to lack of C++11 support on CUDA)
-//
-// Heavily derived from the LLVM libcpp project (svn revision 147853).
-// Based on libcxx/include/complex.
-// The git history contains the complete change history from the original.
-// The modifications are licensed as per the original LLVM license below.
-//
-// -*- C++ -*-
-//===--------------------------- complex ----------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-extern "C++" {
-#ifndef CUDA_COMPLEX_HPP
-#define CUDA_COMPLEX_HPP
-#ifdef __CUDACC__
-#define CUDA_CALLABLE_MEMBER __host__ __device__
-#else
-#define CUDA_CALLABLE_MEMBER
-#endif
-/*
-    complex synopsis
-template<class T>
-class complex
-{
-public:
-    typedef T value_type;
-    complex(const T& re = T(), const T& im = T());
-    complex(const complex&);
-    template<class X> complex(const complex<X>&);
-    T real() const;
-    T imag() const;
-    void real(T);
-    void imag(T);
-    complex<T>& operator= (const T&);
-    complex<T>& operator+=(const T&);
-    complex<T>& operator-=(const T&);
-    complex<T>& operator*=(const T&);
-    complex<T>& operator/=(const T&);
-    complex& operator=(const complex&);
-    template<class X> complex<T>& operator= (const complex<X>&);
-    template<class X> complex<T>& operator+=(const complex<X>&);
-    template<class X> complex<T>& operator-=(const complex<X>&);
-    template<class X> complex<T>& operator*=(const complex<X>&);
-    template<class X> complex<T>& operator/=(const complex<X>&);
-};
-template<>
-class complex<float>
-{
-public:
-    typedef float value_type;
-    constexpr complex(float re = 0.0f, float im = 0.0f);
-    explicit constexpr complex(const complex<double>&);
-    constexpr float real() const;
-    void real(float);
-    constexpr float imag() const;
-    void imag(float);
-    complex<float>& operator= (float);
-    complex<float>& operator+=(float);
-    complex<float>& operator-=(float);
-    complex<float>& operator*=(float);
-    complex<float>& operator/=(float);
-    complex<float>& operator=(const complex<float>&);
-    template<class X> complex<float>& operator= (const complex<X>&);
-    template<class X> complex<float>& operator+=(const complex<X>&);
-    template<class X> complex<float>& operator-=(const complex<X>&);
-    template<class X> complex<float>& operator*=(const complex<X>&);
-    template<class X> complex<float>& operator/=(const complex<X>&);
-};
-template<>
-class complex<double>
-{
-public:
-    typedef double value_type;
-    constexpr complex(double re = 0.0, double im = 0.0);
-    constexpr complex(const complex<float>&);
-    constexpr double real() const;
-    void real(double);
-    constexpr double imag() const;
-    void imag(double);
-    complex<double>& operator= (double);
-    complex<double>& operator+=(double);
-    complex<double>& operator-=(double);
-    complex<double>& operator*=(double);
-    complex<double>& operator/=(double);
-    complex<double>& operator=(const complex<double>&);
-    template<class X> complex<double>& operator= (const complex<X>&);
-    template<class X> complex<double>& operator+=(const complex<X>&);
-    template<class X> complex<double>& operator-=(const complex<X>&);
-    template<class X> complex<double>& operator*=(const complex<X>&);
-    template<class X> complex<double>& operator/=(const complex<X>&);
-};
-// 26.3.6 operators:
-template<class T> complex<T> operator+(const complex<T>&, const complex<T>&);
-template<class T> complex<T> operator+(const complex<T>&, const T&);
-template<class T> complex<T> operator+(const T&, const complex<T>&);
-template<class T> complex<T> operator-(const complex<T>&, const complex<T>&);
-template<class T> complex<T> operator-(const complex<T>&, const T&);
-template<class T> complex<T> operator-(const T&, const complex<T>&);
-template<class T> complex<T> operator*(const complex<T>&, const complex<T>&);
-template<class T> complex<T> operator*(const complex<T>&, const T&);
-template<class T> complex<T> operator*(const T&, const complex<T>&);
-template<class T> complex<T> operator/(const complex<T>&, const complex<T>&);
-template<class T> complex<T> operator/(const complex<T>&, const T&);
-template<class T> complex<T> operator/(const T&, const complex<T>&);
-template<class T> complex<T> operator+(const complex<T>&);
-template<class T> complex<T> operator-(const complex<T>&);
-template<class T> bool operator==(const complex<T>&, const complex<T>&);
-template<class T> bool operator==(const complex<T>&, const T&);
-template<class T> bool operator==(const T&, const complex<T>&);
-template<class T> bool operator!=(const complex<T>&, const complex<T>&);
-template<class T> bool operator!=(const complex<T>&, const T&);
-template<class T> bool operator!=(const T&, const complex<T>&);
-template<class T, class charT, class traits>
-  basic_istream<charT, traits>&
-  operator>>(basic_istream<charT, traits>&, complex<T>&);
-template<class T, class charT, class traits>
-  basic_ostream<charT, traits>&
-  operator<<(basic_ostream<charT, traits>&, const complex<T>&);
-// 26.3.7 values:
-template<class T>              T real(const complex<T>&);
-                          double real(double);
-template<Integral T>      double real(T);
-                          float  real(float);
-template<class T>              T imag(const complex<T>&);
-                          double imag(double);
-template<Integral T>      double imag(T);
-                          float  imag(float);
-template<class T> T abs(const complex<T>&);
-template<class T>              T arg(const complex<T>&);
-                          double arg(double);
-template<Integral T>      double arg(T);
-                          float  arg(float);
-template<class T>              T norm(const complex<T>&);
-                          double norm(double);
-template<Integral T>      double norm(T);
-                          float  norm(float);
-template<class T>      complex<T>           conj(const complex<T>&);
-                       complex<double>      conj(double);
-template<Integral T>   complex<double>      conj(T);
-                       complex<float>       conj(float);
-template<class T>    complex<T>           proj(const complex<T>&);
-                     complex<double>      proj(double);
-template<Integral T> complex<double>      proj(T);
-                     complex<float>       proj(float);
-template<class T> complex<T> polar(const T&, const T& = 0);
-// 26.3.8 transcendentals:
-template<class T> complex<T> acos(const complex<T>&);
-template<class T> complex<T> asin(const complex<T>&);
-template<class T> complex<T> atan(const complex<T>&);
-template<class T> complex<T> acosh(const complex<T>&);
-template<class T> complex<T> asinh(const complex<T>&);
-template<class T> complex<T> atanh(const complex<T>&);
-template<class T> complex<T> cos (const complex<T>&);
-template<class T> complex<T> cosh (const complex<T>&);
-template<class T> complex<T> exp (const complex<T>&);
-template<class T> complex<T> log (const complex<T>&);
-template<class T> complex<T> log10(const complex<T>&);
-template<class T> complex<T> pow(const complex<T>&, const T&);
-template<class T> complex<T> pow(const complex<T>&, const complex<T>&);
-template<class T> complex<T> pow(const T&, const complex<T>&);
-template<class T> complex<T> sin (const complex<T>&);
-template<class T> complex<T> sinh (const complex<T>&);
-template<class T> complex<T> sqrt (const complex<T>&);
-template<class T> complex<T> tan (const complex<T>&);
-template<class T> complex<T> tanh (const complex<T>&);
-template<class T, class charT, class traits>
-  basic_istream<charT, traits>&
-  operator>>(basic_istream<charT, traits>& is, complex<T>& x);
-template<class T, class charT, class traits>
-  basic_ostream<charT, traits>&
-  operator<<(basic_ostream<charT, traits>& o, const complex<T>& x);
-*/
-#include <math.h>
-#include <sstream>
-template <class _Tp> class complex;
-template <class _Tp>
-complex<_Tp> operator*(const complex<_Tp> &__z, const complex<_Tp> &__w);
-template <class _Tp>
-complex<_Tp> operator/(const complex<_Tp> &__x, const complex<_Tp> &__y);
-template <class _Tp> class complex {
-public:
-  typedef _Tp value_type;
-private:
-  value_type __re_;
-  value_type __im_;
-public:
-  CUDA_CALLABLE_MEMBER
-  complex(const value_type &__re = value_type(),
-          const value_type &__im = value_type())
-      : __re_(__re), __im_(__im) {}
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex(const complex<_Xp> &__c)
-      : __re_(__c.real()), __im_(__c.imag()) {}
-  CUDA_CALLABLE_MEMBER value_type real() const { return __re_; }
-  CUDA_CALLABLE_MEMBER value_type imag() const { return __im_; }
-  CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
-  CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
-  CUDA_CALLABLE_MEMBER complex &operator=(const value_type &__re) {
-    __re_ = __re;
-    __im_ = value_type();
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator+=(const value_type &__re) {
-    __re_ += __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator-=(const value_type &__re) {
-    __re_ -= __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator*=(const value_type &__re) {
-    __re_ *= __re;
-    __im_ *= __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator/=(const value_type &__re) {
-    __re_ /= __re;
-    __im_ /= __re;
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
-    __re_ = __c.real();
-    __im_ = __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
-    __re_ += __c.real();
-    __im_ += __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
-    __re_ -= __c.real();
-    __im_ -= __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
-    *this = *this * __c;
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
-    *this = *this / __c;
-    return *this;
-  }
-};
-template <> class complex<double>;
-template <> class complex<float> {
-  float __re_;
-  float __im_;
-public:
-  typedef float value_type;
-  /*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f,
-                                             float __im = 0.0f)
-      : __re_(__re), __im_(__im) {}
-  explicit /*constexpr*/ complex(const complex<double> &__c);
-  /*constexpr*/ CUDA_CALLABLE_MEMBER float real() const { return __re_; }
-  /*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const { return __im_; }
-  CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
-  CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
-  CUDA_CALLABLE_MEMBER complex &operator=(float __re) {
-    __re_ = __re;
-    __im_ = value_type();
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator+=(float __re) {
-    __re_ += __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator-=(float __re) {
-    __re_ -= __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator*=(float __re) {
-    __re_ *= __re;
-    __im_ *= __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator/=(float __re) {
-    __re_ /= __re;
-    __im_ /= __re;
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
-    __re_ = __c.real();
-    __im_ = __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
-    __re_ += __c.real();
-    __im_ += __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
-    __re_ -= __c.real();
-    __im_ -= __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
-    *this = *this * __c;
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
-    *this = *this / __c;
-    return *this;
-  }
-};
-template <> class complex<double> {
-  double __re_;
-  double __im_;
-public:
-  typedef double value_type;
-  /*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0,
-                                             double __im = 0.0)
-      : __re_(__re), __im_(__im) {}
-  /*constexpr*/ complex(const complex<float> &__c);
-  /*constexpr*/ CUDA_CALLABLE_MEMBER double real() const { return __re_; }
-  /*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const { return __im_; }
-  CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
-  CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
-  CUDA_CALLABLE_MEMBER complex &operator=(double __re) {
-    __re_ = __re;
-    __im_ = value_type();
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator+=(double __re) {
-    __re_ += __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator-=(double __re) {
-    __re_ -= __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator*=(double __re) {
-    __re_ *= __re;
-    __im_ *= __re;
-    return *this;
-  }
-  CUDA_CALLABLE_MEMBER complex &operator/=(double __re) {
-    __re_ /= __re;
-    __im_ /= __re;
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
-    __re_ = __c.real();
-    __im_ = __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
-    __re_ += __c.real();
-    __im_ += __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
-    __re_ -= __c.real();
-    __im_ -= __c.imag();
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
-    *this = *this * __c;
-    return *this;
-  }
-  template <class _Xp>
-  CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
-    *this = *this / __c;
-    return *this;
-  }
-};
-// constexpr
-inline CUDA_CALLABLE_MEMBER complex<float>::complex(const complex<double> &__c)
-    : __re_(__c.real()), __im_(__c.imag()) {}
-// constexpr
-inline CUDA_CALLABLE_MEMBER complex<double>::complex(const complex<float> &__c)
-    : __re_(__c.real()), __im_(__c.imag()) {}
-// 26.3.6 operators:
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
-                                                   const complex<_Tp> &__y) {
-  complex<_Tp> __t(__x);
-  __t += __y;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
-                                                   const _Tp &__y) {
-  complex<_Tp> __t(__x);
-  __t += __y;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const _Tp &__x,
-                                                   const complex<_Tp> &__y) {
-  complex<_Tp> __t(__y);
-  __t += __x;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
-                                                   const complex<_Tp> &__y) {
-  complex<_Tp> __t(__x);
-  __t -= __y;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
-                                                   const _Tp &__y) {
-  complex<_Tp> __t(__x);
-  __t -= __y;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const _Tp &__x,
-                                                   const complex<_Tp> &__y) {
-  complex<_Tp> __t(-__y);
-  __t += __x;
-  return __t;
-}
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__z,
-                                            const complex<_Tp> &__w) {
-  _Tp __a = __z.real();
-  _Tp __b = __z.imag();
-  _Tp __c = __w.real();
-  _Tp __d = __w.imag();
-  _Tp __ac = __a * __c;
-  _Tp __bd = __b * __d;
-  _Tp __ad = __a * __d;
-  _Tp __bc = __b * __c;
-  _Tp __x = __ac - __bd;
-  _Tp __y = __ad + __bc;
-  if (isnan(__x) && isnan(__y)) {
-    bool __recalc = false;
-    if (isinf(__a) || isinf(__b)) {
-      __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
-      __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
-      if (isnan(__c))
-        __c = copysign(_Tp(0), __c);
-      if (isnan(__d))
-        __d = copysign(_Tp(0), __d);
-      __recalc = true;
-    }
-    if (isinf(__c) || isinf(__d)) {
-      __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
-      __d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
-      if (isnan(__a))
-        __a = copysign(_Tp(0), __a);
-      if (isnan(__b))
-        __b = copysign(_Tp(0), __b);
-      __recalc = true;
-    }
-    if (!__recalc &&
-        (isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
-      if (isnan(__a))
-        __a = copysign(_Tp(0), __a);
-      if (isnan(__b))
-        __b = copysign(_Tp(0), __b);
-      if (isnan(__c))
-        __c = copysign(_Tp(0), __c);
-      if (isnan(__d))
-        __d = copysign(_Tp(0), __d);
-      __recalc = true;
-    }
-    if (__recalc) {
-      __x = _Tp(INFINITY) * (__a * __c - __b * __d);
-      __y = _Tp(INFINITY) * (__a * __d + __b * __c);
-    }
-  }
-  return complex<_Tp>(__x, __y);
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__x,
-                                                   const _Tp &__y) {
-  complex<_Tp> __t(__x);
-  __t *= __y;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const _Tp &__x,
-                                                   const complex<_Tp> &__y) {
-  complex<_Tp> __t(__y);
-  __t *= __x;
-  return __t;
-}
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__z,
-                                            const complex<_Tp> &__w) {
-  int __ilogbw = 0;
-  _Tp __a = __z.real();
-  _Tp __b = __z.imag();
-  _Tp __c = __w.real();
-  _Tp __d = __w.imag();
-  _Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
-  if (isfinite(__logbw)) {
-    __ilogbw = static_cast<int>(__logbw);
-    __c = scalbn(__c, -__ilogbw);
-    __d = scalbn(__d, -__ilogbw);
-  }
-  _Tp __denom = __c * __c + __d * __d;
-  _Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
-  _Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
-  if (isnan(__x) && isnan(__y)) {
-    if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
-      __x = copysign(_Tp(INFINITY), __c) * __a;
-      __y = copysign(_Tp(INFINITY), __c) * __b;
-    } else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
-      __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
-      __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
-      __x = _Tp(INFINITY) * (__a * __c + __b * __d);
-      __y = _Tp(INFINITY) * (__b * __c - __a * __d);
-    } else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) &&
-               isfinite(__b)) {
-      __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
-      __d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
-      __x = _Tp(0) * (__a * __c + __b * __d);
-      __y = _Tp(0) * (__b * __c - __a * __d);
-    }
-  }
-  return complex<_Tp>(__x, __y);
-}
-template <>
-CUDA_CALLABLE_MEMBER complex<float> operator/(const complex<float> &__z,
-                                              const complex<float> &__w) {
-  int __ilogbw = 0;
-  float __a = __z.real();
-  float __b = __z.imag();
-  float __c = __w.real();
-  float __d = __w.imag();
-  float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d)));
-  if (isfinite(__logbw)) {
-    __ilogbw = static_cast<int>(__logbw);
-    __c = scalbnf(__c, -__ilogbw);
-    __d = scalbnf(__d, -__ilogbw);
-  }
-  float __denom = __c * __c + __d * __d;
-  float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw);
-  float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw);
-  if (isnan(__x) && isnan(__y)) {
-    if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) {
-#pragma warning(suppress : 4756) // Ignore INFINITY related warning
-      __x = copysignf(INFINITY, __c) * __a;
-#pragma warning(suppress : 4756) // Ignore INFINITY related warning
-      __y = copysignf(INFINITY, __c) * __b;
-    } else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
-      __a = copysignf(isinf(__a) ? float(1) : float(0), __a);
-      __b = copysignf(isinf(__b) ? float(1) : float(0), __b);
-#pragma warning(suppress : 4756) // Ignore INFINITY related warning
-      __x = INFINITY * (__a * __c + __b * __d);
-#pragma warning(suppress : 4756) // Ignore INFINITY related warning
-      __y = INFINITY * (__b * __c - __a * __d);
-    } else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) &&
-               isfinite(__b)) {
-      __c = copysignf(isinf(__c) ? float(1) : float(0), __c);
-      __d = copysignf(isinf(__d) ? float(1) : float(0), __d);
-      __x = float(0) * (__a * __c + __b * __d);
-      __y = float(0) * (__b * __c - __a * __d);
-    }
-  }
-  return complex<float>(__x, __y);
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__x,
-                                                   const _Tp &__y) {
-  return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const _Tp &__x,
-                                                   const complex<_Tp> &__y) {
-  complex<_Tp> __t(__x);
-  __t /= __y;
-  return __t;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x) {
-  return __x;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x) {
-  return complex<_Tp>(-__x.real(), -__x.imag());
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
-                                            const complex<_Tp> &__y) {
-  return __x.real() == __y.real() && __x.imag() == __y.imag();
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
-                                            const _Tp &__y) {
-  return __x.real() == __y && __x.imag() == 0;
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER bool operator==(const _Tp &__x,
-                                            const complex<_Tp> &__y) {
-  return __x == __y.real() && 0 == __y.imag();
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
-                                            const complex<_Tp> &__y) {
-  return !(__x == __y);
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
-                                            const _Tp &__y) {
-  return !(__x == __y);
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER bool operator!=(const _Tp &__x,
-                                            const complex<_Tp> &__y) {
-  return !(__x == __y);
-}
-// 26.3.7 values:
-// real
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER _Tp real(const complex<_Tp> &__c) {
-  return __c.real();
-}
-inline CUDA_CALLABLE_MEMBER double real(double __re) { return __re; }
-inline CUDA_CALLABLE_MEMBER float real(float __re) { return __re; }
-// imag
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER _Tp imag(const complex<_Tp> &__c) {
-  return __c.imag();
-}
-inline CUDA_CALLABLE_MEMBER double imag(double __re) { return 0; }
-inline CUDA_CALLABLE_MEMBER float imag(float __re) { return 0; }
-// abs
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER _Tp abs(const complex<_Tp> &__c) {
-  return hypot(__c.real(), __c.imag());
-}
-// arg
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER _Tp arg(const complex<_Tp> &__c) {
-  return atan2(__c.imag(), __c.real());
-}
-inline CUDA_CALLABLE_MEMBER double arg(double __re) { return atan2(0., __re); }
-inline CUDA_CALLABLE_MEMBER float arg(float __re) { return atan2f(0.F, __re); }
-// norm
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER _Tp norm(const complex<_Tp> &__c) {
-  if (isinf(__c.real()))
-    return fabs(__c.real());
-  if (isinf(__c.imag()))
-    return fabs(__c.imag());
-  return __c.real() * __c.real() + __c.imag() * __c.imag();
-}
-inline CUDA_CALLABLE_MEMBER double norm(double __re) { return __re * __re; }
-inline CUDA_CALLABLE_MEMBER float norm(float __re) { return __re * __re; }
-// conj
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> conj(const complex<_Tp> &__c) {
-  return complex<_Tp>(__c.real(), -__c.imag());
-}
-inline CUDA_CALLABLE_MEMBER complex<double> conj(double __re) {
-  return complex<double>(__re);
-}
-inline CUDA_CALLABLE_MEMBER complex<float> conj(float __re) {
-  return complex<float>(__re);
-}
-// proj
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> proj(const complex<_Tp> &__c) {
-  complex<_Tp> __r = __c;
-  if (isinf(__c.real()) || isinf(__c.imag()))
-    __r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
-  return __r;
-}
-inline CUDA_CALLABLE_MEMBER complex<double> proj(double __re) {
-  if (isinf(__re))
-    __re = fabs(__re);
-  return complex<double>(__re);
-}
-inline CUDA_CALLABLE_MEMBER complex<float> proj(float __re) {
-  if (isinf(__re))
-    __re = fabs(__re);
-  return complex<float>(__re);
-}
-// polar
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> polar(const _Tp &__rho,
-                                        const _Tp &__theta = _Tp(0)) {
-  if (isnan(__rho) || signbit(__rho))
-    return complex<_Tp>(_Tp(NAN), _Tp(NAN));
-  if (isnan(__theta)) {
-    if (isinf(__rho))
-      return complex<_Tp>(__rho, __theta);
-    return complex<_Tp>(__theta, __theta);
-  }
-  if (isinf(__theta)) {
-    if (isinf(__rho))
-      return complex<_Tp>(__rho, _Tp(NAN));
-    return complex<_Tp>(_Tp(NAN), _Tp(NAN));
-  }
-  _Tp __x = __rho * cos(__theta);
-  if (isnan(__x))
-    __x = 0;
-  _Tp __y = __rho * sin(__theta);
-  if (isnan(__y))
-    __y = 0;
-  return complex<_Tp>(__x, __y);
-}
-// log
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> log(const complex<_Tp> &__x) {
-  return complex<_Tp>(log(abs(__x)), arg(__x));
-}
-// log10
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> log10(const complex<_Tp> &__x) {
-  return log(__x) / log(_Tp(10));
-}
-// sqrt
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> sqrt(const complex<_Tp> &__x) {
-  if (isinf(__x.imag()))
-    return complex<_Tp>(_Tp(INFINITY), __x.imag());
-  if (isinf(__x.real())) {
-    if (__x.real() > _Tp(0))
-      return complex<_Tp>(__x.real(), isnan(__x.imag())
-                                          ? __x.imag()
-                                          : copysign(_Tp(0), __x.imag()));
-    return complex<_Tp>(isnan(__x.imag()) ? __x.imag() : _Tp(0),
-                        copysign(__x.real(), __x.imag()));
-  }
-  return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
-}
-// exp
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> exp(const complex<_Tp> &__x) {
-  _Tp __i = __x.imag();
-  if (isinf(__x.real())) {
-    if (__x.real() < _Tp(0)) {
-      if (!isfinite(__i))
-        __i = _Tp(1);
-    } else if (__i == 0 || !isfinite(__i)) {
-      if (isinf(__i))
-        __i = _Tp(NAN);
-      return complex<_Tp>(__x.real(), __i);
-    }
-  } else if (isnan(__x.real()) && __x.imag() == 0)
-    return __x;
-  _Tp __e = exp(__x.real());
-  return complex<_Tp>(__e * cos(__i), __e * sin(__i));
-}
-// pow
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
-                                             const complex<_Tp> &__y) {
-  return exp(__y * log(__x));
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
-                                             const _Tp &__y) {
-  return pow(__x, complex<_Tp>(__y));
-}
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const _Tp &__x,
-                                             const complex<_Tp> &__y) {
-  return pow(complex<_Tp>(__x), __y);
-}
-// asinh
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> asinh(const complex<_Tp> &__x) {
-  const _Tp __pi(atan2(+0., -0.));
-  if (isinf(__x.real())) {
-    if (isnan(__x.imag()))
-      return __x;
-    if (isinf(__x.imag()))
-      return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
-    return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
-  }
-  if (isnan(__x.real())) {
-    if (isinf(__x.imag()))
-      return complex<_Tp>(__x.imag(), __x.real());
-    if (__x.imag() == 0)
-      return __x;
-    return complex<_Tp>(__x.real(), __x.real());
-  }
-  if (isinf(__x.imag()))
-    return complex<_Tp>(copysign(__x.imag(), __x.real()),
-                        copysign(__pi / _Tp(2), __x.imag()));
-  complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1)));
-  return complex<_Tp>(copysign(__z.real(), __x.real()),
-                      copysign(__z.imag(), __x.imag()));
-}
-// acosh
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> acosh(const complex<_Tp> &__x) {
-  const _Tp __pi(atan2(+0., -0.));
-  if (isinf(__x.real())) {
-    if (isnan(__x.imag()))
-      return complex<_Tp>(fabs(__x.real()), __x.imag());
-    if (isinf(__x.imag()))
-      if (__x.real() > 0)
-        return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
-      else
-        return complex<_Tp>(-__x.real(),
-                            copysign(__pi * _Tp(0.75), __x.imag()));
-    if (__x.real() < 0)
-      return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
-    return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
-  }
-  if (isnan(__x.real())) {
-    if (isinf(__x.imag()))
-      return complex<_Tp>(fabs(__x.imag()), __x.real());
-    return complex<_Tp>(__x.real(), __x.real());
-  }
-  if (isinf(__x.imag()))
-    return complex<_Tp>(fabs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
-  complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
-  return complex<_Tp>(copysign(__z.real(), _Tp(0)),
-                      copysign(__z.imag(), __x.imag()));
-}
-// atanh
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> atanh(const complex<_Tp> &__x) {
-  const _Tp __pi(atan2(+0., -0.));
-  if (isinf(__x.imag())) {
-    return complex<_Tp>(copysign(_Tp(0), __x.real()),
-                        copysign(__pi / _Tp(2), __x.imag()));
-  }
-  if (isnan(__x.imag())) {
-    if (isinf(__x.real()) || __x.real() == 0)
-      return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
-    return complex<_Tp>(__x.imag(), __x.imag());
-  }
-  if (isnan(__x.real())) {
-    return complex<_Tp>(__x.real(), __x.real());
-  }
-  if (isinf(__x.real())) {
-    return complex<_Tp>(copysign(_Tp(0), __x.real()),
-                        copysign(__pi / _Tp(2), __x.imag()));
-  }
-  if (fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
-    return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
-                        copysign(_Tp(0), __x.imag()));
-  }
-  complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
-  return complex<_Tp>(copysign(__z.real(), __x.real()),
-                      copysign(__z.imag(), __x.imag()));
-}
-// sinh
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> sinh(const complex<_Tp> &__x) {
-  if (isinf(__x.real()) && !isfinite(__x.imag()))
-    return complex<_Tp>(__x.real(), _Tp(NAN));
-  if (__x.real() == 0 && !isfinite(__x.imag()))
-    return complex<_Tp>(__x.real(), _Tp(NAN));
-  if (__x.imag() == 0 && !isfinite(__x.real()))
-    return __x;
-  return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
-                      cosh(__x.real()) * sin(__x.imag()));
-}
-// cosh
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> cosh(const complex<_Tp> &__x) {
-  if (isinf(__x.real()) && !isfinite(__x.imag()))
-    return complex<_Tp>(fabs(__x.real()), _Tp(NAN));
-  if (__x.real() == 0 && !isfinite(__x.imag()))
-    return complex<_Tp>(_Tp(NAN), __x.real());
-  if (__x.real() == 0 && __x.imag() == 0)
-    return complex<_Tp>(_Tp(1), __x.imag());
-  if (__x.imag() == 0 && !isfinite(__x.real()))
-    return complex<_Tp>(fabs(__x.real()), __x.imag());
-  return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
-                      sinh(__x.real()) * sin(__x.imag()));
-}
-// tanh
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> tanh(const complex<_Tp> &__x) {
-  if (isinf(__x.real())) {
-    if (!isfinite(__x.imag()))
-      return complex<_Tp>(_Tp(1), _Tp(0));
-    return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
-  }
-  if (isnan(__x.real()) && __x.imag() == 0)
-    return __x;
-  _Tp __2r(_Tp(2) * __x.real());
-  _Tp __2i(_Tp(2) * __x.imag());
-  _Tp __d(cosh(__2r) + cos(__2i));
-  return complex<_Tp>(sinh(__2r) / __d, sin(__2i) / __d);
-}
-// asin
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> asin(const complex<_Tp> &__x) {
-  complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
-  return complex<_Tp>(__z.imag(), -__z.real());
-}
-// acos
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> acos(const complex<_Tp> &__x) {
-  const _Tp __pi(atan2(+0., -0.));
-  if (isinf(__x.real())) {
-    if (isnan(__x.imag()))
-      return complex<_Tp>(__x.imag(), __x.real());
-    if (isinf(__x.imag())) {
-      if (__x.real() < _Tp(0))
-        return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
-      return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
-    }
-    if (__x.real() < _Tp(0))
-      return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
-    return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
-  }
-  if (isnan(__x.real())) {
-    if (isinf(__x.imag()))
-      return complex<_Tp>(__x.real(), -__x.imag());
-    return complex<_Tp>(__x.real(), __x.real());
-  }
-  if (isinf(__x.imag()))
-    return complex<_Tp>(__pi / _Tp(2), -__x.imag());
-  if (__x.real() == 0)
-    return complex<_Tp>(__pi / _Tp(2), -__x.imag());
-  complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
-  if (signbit(__x.imag()))
-    return complex<_Tp>(fabs(__z.imag()), fabs(__z.real()));
-  return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real()));
-}
-// atan
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> atan(const complex<_Tp> &__x) {
-  complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
-  return complex<_Tp>(__z.imag(), -__z.real());
-}
-// sin
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> sin(const complex<_Tp> &__x) {
-  complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
-  return complex<_Tp>(__z.imag(), -__z.real());
-}
-// cos
-template <class _Tp>
-inline CUDA_CALLABLE_MEMBER complex<_Tp> cos(const complex<_Tp> &__x) {
-  return cosh(complex<_Tp>(-__x.imag(), __x.real()));
-}
-// tan
-template <class _Tp>
-CUDA_CALLABLE_MEMBER complex<_Tp> tan(const complex<_Tp> &__x) {
-  complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
-  return complex<_Tp>(__z.imag(), -__z.real());
-}
-template <class _Tp, class _CharT, class _Traits>
-std::basic_istream<_CharT, _Traits> &
-operator>>(std::basic_istream<_CharT, _Traits> &__is, complex<_Tp> &__x) {
-  if (__is.good()) {
-    ws(__is);
-    if (__is.peek() == _CharT('(')) {
-      __is.get();
-      _Tp __r;
-      __is >> __r;
-      if (!__is.fail()) {
-        ws(__is);
-        _CharT __c = __is.peek();
-        if (__c == _CharT(',')) {
-          __is.get();
-          _Tp __i;
-          __is >> __i;
-          if (!__is.fail()) {
-            ws(__is);
-            __c = __is.peek();
-            if (__c == _CharT(')')) {
-              __is.get();
-              __x = complex<_Tp>(__r, __i);
-            } else
-              __is.setstate(std::ios_base::failbit);
-          } else
-            __is.setstate(std::ios_base::failbit);
-        } else if (__c == _CharT(')')) {
-          __is.get();
-          __x = complex<_Tp>(__r, _Tp(0));
-        } else
-          __is.setstate(std::ios_base::failbit);
-      } else
-        __is.setstate(std::ios_base::failbit);
-    } else {
-      _Tp __r;
-      __is >> __r;
-      if (!__is.fail())
-        __x = complex<_Tp>(__r, _Tp(0));
-      else
-        __is.setstate(std::ios_base::failbit);
-    }
-  } else
-    __is.setstate(std::ios_base::failbit);
-  return __is;
-}
-template <class _Tp, class _CharT, class _Traits>
-std::basic_ostream<_CharT, _Traits> &
-operator<<(std::basic_ostream<_CharT, _Traits> &__os, const complex<_Tp> &__x) {
-  std::basic_ostringstream<_CharT, _Traits> __s;
-  __s.flags(__os.flags());
-  __s.imbue(__os.getloc());
-  __s.precision(__os.precision());
-  __s << '(' << __x.real() << ',' << __x.imag() << ')';
-  return __os << __s.str();
-}
-//} // close namespace cuda_complex
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator*(const complex<U> &complexNumber,
-                                    const V &scalar) -> complex<U> {
-  return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator*(const V &scalar,
-                                    const complex<U> &complexNumber)
-    -> complex<U> {
-  return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator+(const complex<U> &complexNumber,
-                                    const V &scalar) -> complex<U> {
-  return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator+(const V &scalar,
-                                    const complex<U> &complexNumber)
-    -> complex<U> {
-  return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator-(const complex<U> &complexNumber,
-                                    const V &scalar) -> complex<U> {
-  return complex<U>(real(complexNumber) - scalar, imag(complexNumber));
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator-(const V &scalar,
-                                    const complex<U> &complexNumber)
-    -> complex<U> {
-  return complex<U>(scalar - real(complexNumber), imag(complexNumber));
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator/(const complex<U> &complexNumber,
-                                    const V scalar) -> complex<U> {
-  return complex<U>(real(complexNumber) / scalar, imag(complexNumber) / scalar);
-}
-template <class U, class V>
-CUDA_CALLABLE_MEMBER auto operator/(const V scalar,
-                                    const complex<U> &complexNumber)
-    -> complex<U> {
-  return complex<U>(scalar, 0) / complexNumber;
-}
-using ComplexDouble = complex<double>;
-using ComplexFloat = complex<float>;
-#endif // CUDA_COMPLEX_HPP
-}
--- a/pystencils/include/opencl_stdint.h
+++ b/pystencils/include/opencl_stdint.h
-#ifndef OPENCL_STDINT
-#define OPENCL_STDINT
-typedef unsigned int      uint_t;
-typedef signed char       int8_t;
-typedef signed short      int16_t;
-typedef signed int        int32_t;
-typedef signed long int   int64_t;
-typedef unsigned char     uint8_t;
-typedef unsigned short    uint16_t;
-typedef unsigned int      uint32_t;
-typedef unsigned long int uint64_t;
-#endif
--- a/pystencils/node_collection.py
+++ b/pystencils/node_collection.py
-from typing import List, Union
-import sympy
-import sympy as sp
-from sympy.codegen import Assignment
-from sympy.codegen.rewriting import ReplaceOptim, optimize
-from pystencils.astnodes import Block, Node, SympyAssignment
-from pystencils.backends.cbackend import CustomCodeNode
-from pystencils.functions import DivFunc
-from pystencils.simp import AssignmentCollection
-class NodeCollection:
-    def __init__(self, assignments: List[Union[Node, Assignment]]):
-        self.all_assignments = assignments
-        if all((isinstance(a, Assignment) for a in assignments)):
-            self.is_Nodes = False
-            self.is_Assignments = True
-        elif all((isinstance(n, Node) for n in assignments)):
-            self.is_Nodes = True
-            self.is_Assignments = False
-        else:
-            raise ValueError(f'The list "{assignments}" is mixed. Pass either a list of "pystencils.Assignments" '
-                             f'or a list of "pystencils.astnodes.Node')
-        self.simplification_hints = {}
-    @staticmethod
-    def from_assignment_collection(assignment_collection: AssignmentCollection):
-        nodes = list()
-        for assignemt in assignment_collection.all_assignments:
-            if isinstance(assignemt, Assignment):
-                nodes.append(SympyAssignment(assignemt.lhs, assignemt.rhs))
-            elif isinstance(assignemt, Node):
-                nodes.append(assignemt)
-            else:
-                raise ValueError(f"Unknown node in the AssignmentCollection: {assignemt}")
-        return NodeCollection(nodes)
-    def evaluate_terms(self):
-        evaluate_constant_terms = ReplaceOptim(
-            lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
-            lambda p: p.evalf())
-        evaluate_pow = ReplaceOptim(
-            lambda e: e.is_Pow and e.exp.is_Integer and abs(e.exp) <= 8,
-            lambda p: (
-                sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else
-                DivFunc(sp.Integer(1), sp.Mul(*([p.base] * -p.exp), evaluate=False))
-            ))
-        sympy_optimisations = [evaluate_constant_terms, evaluate_pow]
-        if self.is_Nodes:
-            def visitor(node):
-                if isinstance(node, CustomCodeNode):
-                    return node
-                elif isinstance(node, Block):
-                    return node.func([visitor(child) for child in node.args])
-                elif isinstance(node, Node):
-                    return node.func(*[visitor(child) for child in node.args])
-                elif isinstance(node, sympy.Basic):
-                    return optimize(node, sympy_optimisations)
-                else:
-                    raise NotImplementedError(f'{node} {type(node)} has no valid visitor')
-            self.all_assignments = [visitor(assignment) for assignment in self.all_assignments]
-        else:
-            self.all_assignments = [Assignment(a.lhs, optimize(a.rhs, sympy_optimisations))
-                                    if hasattr(a, 'lhs')
-                                    else a for a in self.all_assignments]
--- a/pystencils_tests/test_create_kernel_backwards_compability.py
+++ b/pystencils_tests/test_create_kernel_backwards_compability.py
-import pytest
-import pystencils as ps
-import numpy as np
-# This test aims to trigger deprication warnings. Thus the warnings should not be displayed in the warning summary.
-import pystencils.config
-def test_create_kernel_backwards_compatibility():
-    size = (30, 20)
-    src_field_string = np.random.rand(*size)
-    src_field_enum = np.copy(src_field_string)
-    src_field_config = np.copy(src_field_string)
-    dst_field_string = np.zeros(size)
-    dst_field_enum = np.zeros(size)
-    dst_field_config = np.zeros(size)
-    f = ps.Field.create_from_numpy_array("f", src_field_enum)
-    d = ps.Field.create_from_numpy_array("d", dst_field_enum)
-    jacobi = ps.Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
-    ast_enum = ps.create_kernel(jacobi, target=ps.Target.CPU).compile()
-    with pytest.warns(DeprecationWarning):
-        ast_string = ps.create_kernel(jacobi, target='cpu').compile()
-    # noinspection PyTypeChecker
-    with pytest.warns(DeprecationWarning):
-        ast_config = ps.create_kernel(jacobi, config=pystencils.config.CreateKernelConfig(target='cpu')).compile()
-    ast_enum(f=src_field_enum, d=dst_field_enum)
-    ast_string(f=src_field_string, d=dst_field_string)
-    ast_config(f=src_field_config, d=dst_field_config)
-    error = np.sum(np.abs(dst_field_enum - dst_field_string))
-    np.testing.assert_almost_equal(error, 0.0)
-    error = np.sum(np.abs(dst_field_enum - dst_field_config))
-    np.testing.assert_almost_equal(error, 0.0)
--- a/pystencils_tests/test_indexed_kernels.py
+++ b/pystencils_tests/test_indexed_kernels.py
-import numpy as np
-import pystencils as ps
-from pystencils import Assignment, Field, CreateKernelConfig, create_kernel, Target
-def test_indexed_kernel():
-    arr = np.zeros((3, 4))
-    dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
-    index_arr = np.zeros((3,), dtype=dtype)
-    index_arr[0] = (0, 2, 3.0)
-    index_arr[1] = (1, 3, 42.0)
-    index_arr[2] = (2, 1, 5.0)
-    indexed_field = Field.create_from_numpy_array('index', index_arr)
-    normal_field = Field.create_from_numpy_array('f', arr)
-    update_rule = Assignment(normal_field[0, 0], indexed_field('value'))
-    config = CreateKernelConfig(index_fields=[indexed_field])
-    ast = create_kernel([update_rule], config=config)
-    kernel = ast.compile()
-    kernel(f=arr, index=index_arr)
-    code = ps.get_code_str(kernel)
-    for i in range(index_arr.shape[0]):
-        np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)
-def test_indexed_cuda_kernel():
-    try:
-        import pycuda
-    except ImportError:
-        pycuda = None
-    if pycuda:
-        import pycuda.gpuarray as gpuarray
-        arr = np.zeros((3, 4))
-        dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
-        index_arr = np.zeros((3,), dtype=dtype)
-        index_arr[0] = (0, 2, 3.0)
-        index_arr[1] = (1, 3, 42.0)
-        index_arr[2] = (2, 1, 5.0)
-        indexed_field = Field.create_from_numpy_array('index', index_arr)
-        normal_field = Field.create_from_numpy_array('f', arr)
-        update_rule = Assignment(normal_field[0, 0], indexed_field('value'))
-        config = CreateKernelConfig(target=Target.GPU, index_fields=[indexed_field])
-        ast = create_kernel([update_rule], config=config)
-        kernel = ast.compile()
-        gpu_arr = gpuarray.to_gpu(arr)
-        gpu_index_arr = gpuarray.to_gpu(index_arr)
-        kernel(f=gpu_arr, index=gpu_index_arr)
-        gpu_arr.get(arr)
-        for i in range(index_arr.shape[0]):
-            np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)
-    else:
-        print("Did not run test on GPU since no pycuda is available")
--- a/pystencils_tests/test_jupyter_extensions.ipynb
+++ b/pystencils_tests/test_jupyter_extensions.ipynb
-%% Cell type:code id: tags:
-``` python
-from pystencils.session import *
-```
-%% Cell type:code id: tags:
-``` python
-dh = ps.create_data_handling(domain_size=(256, 256), periodicity=True)
-c_field = dh.add_array('c')
-dh.fill("c", 0.0, ghost_layers=True)
-```
-%% Cell type:code id: tags:
-``` python
-for x in range(129):
-    for y in range(258):
-        dh.cpu_arrays['c'][x, y] = 1.0
-```
-%% Cell type:code id: tags:
-``` python
-plt.scalar_field(dh.cpu_arrays["c"])
-```
-%% Output
-    <matplotlib.image.AxesImage at 0x7fcb7d253710>
-%% Cell type:code id: tags:
-``` python
-ur = ps.Assignment(c_field[0, 0], c_field[1, 0])
-ast = ps.create_kernel(ur, target=dh.default_target, cpu_openmp=True)
-kernel = ast.compile()
-```
-%% Cell type:code id: tags:
-``` python
-c_sync = dh.synchronization_function_cpu(['c'])
-```
-%% Cell type:code id: tags:
-``` python
-def timeloop(steps=10):
-    for i in range(steps):
-        c_sync()
-        dh.run_kernel(kernel)
-    return dh.gather_array('c')
-```
-%% Cell type:code id: tags:
-``` python
-ps.jupyter.set_display_mode('video')
-```
-%% Cell type:code id: tags:
-``` python
-ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
-ps.jupyter.display_animation(ani)
-```
-%% Output
-    <IPython.core.display.HTML object>
-%% Cell type:code id: tags:
-``` python
-ps.jupyter.set_display_mode('image_update')
-```
-%% Cell type:code id: tags:
-``` python
-ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
-ps.jupyter.display_animation(ani)
-```
-%% Output
-%% Cell type:code id: tags:
-``` python
-def grid_update_function(image):
-    for i in range(40):
-        c_sync()
-        dh.run_kernel(kernel)
-    return dh.gather_array('c')
-```
-%% Cell type:code id: tags:
-``` python
-animation = ps.jupyter.make_imshow_animation(dh.cpu_arrays["c"], grid_update_function, frames=300)
-```
-%% Output
-%% Cell type:code id: tags:
-``` python
-ps.jupyter.set_display_mode("video")
-ps.jupyter.set_display_mode("window")
-ps.jupyter.set_display_mode("image_update")
-ps.jupyter.activate_ipython()
-```
-%% Cell type:code id: tags:
-``` python
-from pystencils.session import *
-```
-%% Cell type:code id: tags:
-``` python
-dh = ps.create_data_handling(domain_size=(256, 256), periodicity=True)
-c_field = dh.add_array('c')
-dh.fill("c", 0.0, ghost_layers=True)
-```
-%% Cell type:code id: tags:
-``` python
-for x in range(129):
-    for y in range(258):
-        dh.cpu_arrays['c'][x, y] = 1.0
-```
-%% Cell type:code id: tags:
-``` python
-plt.scalar_field(dh.cpu_arrays["c"])
-```
-%% Output
-    <matplotlib.image.AxesImage at 0x7fcb7d253710>
-%% Cell type:code id: tags:
-``` python
-ur = ps.Assignment(c_field[0, 0], c_field[1, 0])
-ast = ps.create_kernel(ur, target=dh.default_target, cpu_openmp=True)
-kernel = ast.compile()
-```
-%% Cell type:code id: tags:
-``` python
-c_sync = dh.synchronization_function_cpu(['c'])
-```
-%% Cell type:code id: tags:
-``` python
-def timeloop(steps=10):
-    for i in range(steps):
-        c_sync()
-        dh.run_kernel(kernel)
-    return dh.gather_array('c')
-```
-%% Cell type:code id: tags:
-``` python
-ps.jupyter.set_display_mode('video')
-```
-%% Cell type:code id: tags:
-``` python
-ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
-ps.jupyter.display_animation(ani)
-```
-%% Output
-    <IPython.core.display.HTML object>
-%% Cell type:code id: tags:
-``` python
-ps.jupyter.set_display_mode('image_update')
-```
-%% Cell type:code id: tags:
-``` python
-ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
-ps.jupyter.display_animation(ani)
-```
-%% Output
-%% Cell type:code id: tags:
-``` python
-def grid_update_function(image):
-    for i in range(40):
-        c_sync()
-        dh.run_kernel(kernel)
-    return dh.gather_array('c')
-```
-%% Cell type:code id: tags:
-``` python
-animation = ps.jupyter.make_imshow_animation(dh.cpu_arrays["c"], grid_update_function, frames=300)
-```
-%% Output
-%% Cell type:code id: tags:
-``` python
-ps.jupyter.set_display_mode("video")
-ps.jupyter.set_display_mode("window")
-ps.jupyter.set_display_mode("image_update")
-ps.jupyter.activate_ipython()
-```
--- a/pystencils_tests/test_phasefield_dentritic_3D.ipynb
+++ b/pystencils_tests/test_phasefield_dentritic_3D.ipynb
-%% Cell type:code id: tags:
-``` python
-import pytest
-pytest.importorskip('pycuda')
-```
-%% Cell type:code id: tags:
-``` python
-from pystencils.session import *
-sp.init_printing()
-frac = sp.Rational
-```
-%% Cell type:markdown id: tags:
-# Phase-field simulation of dentritic solidification in 3D
-This notebook tests the model presented in the dentritic growth tutorial in 3D.
-%% Cell type:code id: tags:
-``` python
-target = ps.Target.GPU
-gpu = target == ps.Target.GPU
-domain_size = (25, 25, 25) if 'is_test_run' in globals() else (300, 300, 300)
-dh = ps.create_data_handling(domain_size=domain_size, periodicity=True, default_target=target)
-φ_field = dh.add_array('phi', latex_name='φ')
-φ_delta_field = dh.add_array('phidelta', latex_name='φ_D')
-t_field = dh.add_array('T')
-```
-%% Cell type:code id: tags:
-``` python
-ε, m, δ, j, θzero, α, γ, Teq, κ, τ = sp.symbols("ε m δ j θ_0 α γ T_eq κ τ")
-εb = sp.Symbol("\\bar{\\epsilon}")
-discretize = ps.fd.Discretization2ndOrder(dx=0.03, dt=1e-5)
-φ = φ_field.center
-T = t_field.center
-d = ps.fd.Diff
-def f(φ, m):
-    return φ**4 / 4 - (frac(1, 2) - m/3) * φ**3 + (frac(1,4)-m/2)*φ**2
-bulk_free_energy_density = f(φ, m)
-interface_free_energy_density = ε ** 2 / 2 * (d(φ, 0) ** 2 + d(φ, 1) ** 2 + d(φ, 2) ** 2)
-```
-%% Cell type:markdown id: tags:
-Here comes the major change, that has to be made for the 3D model: $\epsilon$ depends on the interface normal, which can not be computed simply as atan() as in the 2D case
-%% Cell type:code id: tags:
-``` python
-n = sp.Matrix([d(φ, i) for i in range(3)])
-nLen = sp.sqrt(sum(n_i**2 for n_i in n))
-n = n / nLen
-nVal = sum(n_i**4 for n_i in n)
-σ = δ * nVal
-εVal = εb * (1 + σ)
-εVal
-```
-%% Output
-    $\displaystyle \bar{\epsilon} \left(δ \left(\frac{{\partial_{0} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{1} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{2} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}}\right) + 1\right)$
-                   ⎛  ⎛                            4
-                   ⎜  ⎜                 D(φ[0,0,0])
-    \bar{\epsilon}⋅⎜δ⋅⎜───────────────────────────────────────────── + ───────────
-                   ⎜  ⎜                                            2
-                   ⎜  ⎜⎛           2              2              2⎞    ⎛
-                   ⎝  ⎝⎝D(φ[0,0,0])  + D(φ[0,0,0])  + D(φ[0,0,0]) ⎠    ⎝D(φ[0,0,0]
-                     4                                               4
-          D(φ[0,0,0])                                     D(φ[0,0,0])
-    ────────────────────────────────── + ─────────────────────────────────────────
-                                     2
-     2              2              2⎞    ⎛           2              2
-    )  + D(φ[0,0,0])  + D(φ[0,0,0]) ⎠    ⎝D(φ[0,0,0])  + D(φ[0,0,0])  + D(φ[0,0,0]
-        ⎞    ⎞
-        ⎟    ⎟
-    ────⎟ + 1⎟
-       2⎟    ⎟
-     2⎞ ⎟    ⎟
-    ) ⎠ ⎠    ⎠
-%% Cell type:code id: tags:
-``` python
-def m_func(temperature):
-    return (α / sp.pi) * sp.atan(γ * (Teq - temperature))
-```
-%% Cell type:code id: tags:
-``` python
-substitutions = {m: m_func(T),
-                 ε: εVal}
-fe_i = interface_free_energy_density.subs(substitutions)
-fe_b = bulk_free_energy_density.subs(substitutions)
-μ_if = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_i, φ), functions=[φ])
-μ_b = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_b, φ), functions=[φ])
-```
-%% Cell type:code id: tags:
-``` python
-dF_dφ = μ_b + sp.Piecewise((μ_if, nLen**2 > 1e-10), (0, True))
-```
-%% Cell type:code id: tags:
-``` python
-parameters = {
-    τ: 0.0003,
-    κ: 1.8,
-    εb: 0.01,
-    δ: 0.3,
-    γ: 10,
-    j: 6,
-    α: 0.9,
-    Teq: 1.0,
-    θzero: 0.2,
-    sp.pi: sp.pi.evalf()
-}
-parameters
-```
-%% Output
-    $\displaystyle \left\{ \pi : 3.14159265358979, \  T_{eq} : 1.0, \  \bar{\epsilon} : 0.01, \  j : 6, \  α : 0.9, \  γ : 10, \  δ : 0.3, \  θ_{0} : 0.2, \  κ : 1.8, \  τ : 0.0003\right\}$
-    {π: 3.14159265358979, T_eq: 1.0, \bar{\epsilon}: 0.01, j: 6, α: 0.9, γ: 10, δ:
-     0.3, θ₀: 0.2, κ: 1.8, τ: 0.0003}
-%% Cell type:code id: tags:
-``` python
-dφ_dt = - dF_dφ / τ
-assignments = [
-    ps.Assignment(φ_delta_field.center, discretize(dφ_dt.subs(parameters))),
-]
-φEqs = ps.simp.sympy_cse_on_assignment_list(assignments)
-φEqs.append(ps.Assignment(φ, discretize(ps.fd.transient(φ) - φ_delta_field.center)))
-temperatureEvolution = -ps.fd.transient(T) + ps.fd.diffusion(T, 1) + κ * φ_delta_field.center
-temperatureEqs = [
-    ps.Assignment(T, discretize(temperatureEvolution.subs(parameters)))
-]
-```
-%% Cell type:code id: tags:
-``` python
-temperatureEqs
-```
-%% Output
-    $\displaystyle \left[ {{T}_{(0,0,0)}} \leftarrow 0.0111111111111111 {{T}_{(-1,0,0)}} + 0.0111111111111111 {{T}_{(0,-1,0)}} + 0.0111111111111111 {{T}_{(0,0,-1)}} + 0.933333333333333 {{T}_{(0,0,0)}} + 0.0111111111111111 {{T}_{(0,0,1)}} + 0.0111111111111111 {{T}_{(0,1,0)}} + 0.0111111111111111 {{T}_{(1,0,0)}} + 1.8 \cdot 10^{-5} {{φ_D}_{(0,0,0)}}\right]$
-    [T_C := 0.0111111111111111⋅T_W + 0.0111111111111111⋅T_S + 0.0111111111111111⋅T
-    _B + 0.933333333333333⋅T_C + 0.0111111111111111⋅T_T + 0.0111111111111111⋅T_N +
-     0.0111111111111111⋅T_E + 1.8e-5⋅phidelta_C]
-%% Cell type:code id: tags:
-``` python
-φ_kernel = ps.create_kernel(φEqs, cpu_openmp=4, target=target).compile()
-temperatureKernel = ps.create_kernel(temperatureEqs, cpu_openmp=4, target=target).compile()
-```
-%% Cell type:code id: tags:
-``` python
-def time_loop(steps):
-    φ_sync = dh.synchronization_function(['phi'], target=target)
-    temperature_sync = dh.synchronization_function(['T'], target=target)
-    dh.all_to_gpu()
-    for t in range(steps):
-        φ_sync()
-        dh.run_kernel(φ_kernel)
-        temperature_sync()
-        dh.run_kernel(temperatureKernel)
-    dh.all_to_cpu()
-def init(nucleus_size=np.sqrt(5)):
-    for b in dh.iterate():
-        x, y, z = b.cell_index_arrays
-        x, y, z = x - b.shape[0] // 2, y - b.shape[1] // 2, z - b.shape[2] // 2
-        b['phi'].fill(0)
-        b['phi'][(x ** 2 + y ** 2 + z ** 2) < nucleus_size ** 2] = 1.0
-        b['T'].fill(0.0)
-def plot(slice_obj=ps.make_slice[:, :, 0.5]):
-    plt.subplot(1, 3, 1)
-    plt.scalar_field(dh.gather_array('phi', slice_obj).squeeze())
-    plt.title("φ")
-    plt.colorbar()
-    plt.subplot(1, 3, 2)
-    plt.title("T")
-    plt.scalar_field(dh.gather_array('T', slice_obj).squeeze())
-    plt.colorbar()
-    plt.subplot(1, 3, 3)
-    plt.title("∂φ")
-    plt.scalar_field(dh.gather_array('phidelta', slice_obj).squeeze())
-    plt.colorbar()
-```
-%% Cell type:code id: tags:
-``` python
-init()
-plot()
-print(dh)
-```
-%% Output
-        Name|      Inner (min/max)|     WithGl (min/max)
-    ----------------------------------------------------
-           T|            (  0,  0)|            (  0,  0)
-         phi|            (  0,  1)|            (  0,  1)
-    phidelta|            (  0,  0)|            (  0,  0)
-%% Cell type:code id: tags:
-``` python
-if 'is_test_run' in globals():
-    time_loop(2)
-    assert np.isfinite(dh.max('phi'))
-    assert np.isfinite(dh.max('T'))
-    assert np.isfinite(dh.max('phidelta'))
-else:
-    from time import perf_counter
-    vtk_writer = dh.create_vtk_writer('dentritic_growth_large', ['phi'])
-    last = perf_counter()
-    for i in range(300):
-        time_loop(100)
-        vtk_writer(i)
-        print("Step ", i, perf_counter() - last, dh.max('phi'))
-        last = perf_counter()
-```
-%% Cell type:code id: tags:
-``` python
-import pytest
-pytest.importorskip('pycuda')
-```
-%% Cell type:code id: tags:
-``` python
-from pystencils.session import *
-sp.init_printing()
-frac = sp.Rational
-```
-%% Cell type:markdown id: tags:
-# Phase-field simulation of dentritic solidification in 3D
-This notebook tests the model presented in the dentritic growth tutorial in 3D.
-%% Cell type:code id: tags:
-``` python
-target = ps.Target.GPU
-gpu = target == ps.Target.GPU
-domain_size = (25, 25, 25) if 'is_test_run' in globals() else (300, 300, 300)
-dh = ps.create_data_handling(domain_size=domain_size, periodicity=True, default_target=target)
-φ_field = dh.add_array('phi', latex_name='φ')
-φ_delta_field = dh.add_array('phidelta', latex_name='φ_D')
-t_field = dh.add_array('T')
-```
-%% Cell type:code id: tags:
-``` python
-ε, m, δ, j, θzero, α, γ, Teq, κ, τ = sp.symbols("ε m δ j θ_0 α γ T_eq κ τ")
-εb = sp.Symbol("\\bar{\\epsilon}")
-discretize = ps.fd.Discretization2ndOrder(dx=0.03, dt=1e-5)
-φ = φ_field.center
-T = t_field.center
-d = ps.fd.Diff
-def f(φ, m):
-    return φ**4 / 4 - (frac(1, 2) - m/3) * φ**3 + (frac(1,4)-m/2)*φ**2
-bulk_free_energy_density = f(φ, m)
-interface_free_energy_density = ε ** 2 / 2 * (d(φ, 0) ** 2 + d(φ, 1) ** 2 + d(φ, 2) ** 2)
-```
-%% Cell type:markdown id: tags:
-Here comes the major change, that has to be made for the 3D model: $\epsilon$ depends on the interface normal, which can not be computed simply as atan() as in the 2D case
-%% Cell type:code id: tags:
-``` python
-n = sp.Matrix([d(φ, i) for i in range(3)])
-nLen = sp.sqrt(sum(n_i**2 for n_i in n))
-n = n / nLen
-nVal = sum(n_i**4 for n_i in n)
-σ = δ * nVal
-εVal = εb * (1 + σ)
-εVal
-```
-%% Output
-    $\displaystyle \bar{\epsilon} \left(δ \left(\frac{{\partial_{0} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{1} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{2} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}}\right) + 1\right)$
-                   ⎛  ⎛                            4
-                   ⎜  ⎜                 D(φ[0,0,0])
-    \bar{\epsilon}⋅⎜δ⋅⎜───────────────────────────────────────────── + ───────────
-                   ⎜  ⎜                                            2
-                   ⎜  ⎜⎛           2              2              2⎞    ⎛
-                   ⎝  ⎝⎝D(φ[0,0,0])  + D(φ[0,0,0])  + D(φ[0,0,0]) ⎠    ⎝D(φ[0,0,0]
-                     4                                               4
-          D(φ[0,0,0])                                     D(φ[0,0,0])
-    ────────────────────────────────── + ─────────────────────────────────────────
-                                     2
-     2              2              2⎞    ⎛           2              2
-    )  + D(φ[0,0,0])  + D(φ[0,0,0]) ⎠    ⎝D(φ[0,0,0])  + D(φ[0,0,0])  + D(φ[0,0,0]
-        ⎞    ⎞
-        ⎟    ⎟
-    ────⎟ + 1⎟
-       2⎟    ⎟
-     2⎞ ⎟    ⎟
-    ) ⎠ ⎠    ⎠
-%% Cell type:code id: tags:
-``` python
-def m_func(temperature):
-    return (α / sp.pi) * sp.atan(γ * (Teq - temperature))
-```
-%% Cell type:code id: tags:
-``` python
-substitutions = {m: m_func(T),
-                 ε: εVal}
-fe_i = interface_free_energy_density.subs(substitutions)
-fe_b = bulk_free_energy_density.subs(substitutions)
-μ_if = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_i, φ), functions=[φ])
-μ_b = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_b, φ), functions=[φ])
-```
-%% Cell type:code id: tags:
-``` python
-dF_dφ = μ_b + sp.Piecewise((μ_if, nLen**2 > 1e-10), (0, True))
-```
-%% Cell type:code id: tags:
-``` python
-parameters = {
-    τ: 0.0003,
-    κ: 1.8,
-    εb: 0.01,
-    δ: 0.3,
-    γ: 10,
-    j: 6,
-    α: 0.9,
-    Teq: 1.0,
-    θzero: 0.2,
-    sp.pi: sp.pi.evalf()
-}
-parameters
-```
-%% Output
-    $\displaystyle \left\{ \pi : 3.14159265358979, \  T_{eq} : 1.0, \  \bar{\epsilon} : 0.01, \  j : 6, \  α : 0.9, \  γ : 10, \  δ : 0.3, \  θ_{0} : 0.2, \  κ : 1.8, \  τ : 0.0003\right\}$
-    {π: 3.14159265358979, T_eq: 1.0, \bar{\epsilon}: 0.01, j: 6, α: 0.9, γ: 10, δ:
-     0.3, θ₀: 0.2, κ: 1.8, τ: 0.0003}
-%% Cell type:code id: tags:
-``` python
-dφ_dt = - dF_dφ / τ
-assignments = [
-    ps.Assignment(φ_delta_field.center, discretize(dφ_dt.subs(parameters))),
-]
-φEqs = ps.simp.sympy_cse_on_assignment_list(assignments)
-φEqs.append(ps.Assignment(φ, discretize(ps.fd.transient(φ) - φ_delta_field.center)))
-temperatureEvolution = -ps.fd.transient(T) + ps.fd.diffusion(T, 1) + κ * φ_delta_field.center
-temperatureEqs = [
-    ps.Assignment(T, discretize(temperatureEvolution.subs(parameters)))
-]
-```
-%% Cell type:code id: tags:
-``` python
-temperatureEqs
-```
-%% Output
-    $\displaystyle \left[ {{T}_{(0,0,0)}} \leftarrow 0.0111111111111111 {{T}_{(-1,0,0)}} + 0.0111111111111111 {{T}_{(0,-1,0)}} + 0.0111111111111111 {{T}_{(0,0,-1)}} + 0.933333333333333 {{T}_{(0,0,0)}} + 0.0111111111111111 {{T}_{(0,0,1)}} + 0.0111111111111111 {{T}_{(0,1,0)}} + 0.0111111111111111 {{T}_{(1,0,0)}} + 1.8 \cdot 10^{-5} {{φ_D}_{(0,0,0)}}\right]$
-    [T_C := 0.0111111111111111⋅T_W + 0.0111111111111111⋅T_S + 0.0111111111111111⋅T
-    _B + 0.933333333333333⋅T_C + 0.0111111111111111⋅T_T + 0.0111111111111111⋅T_N +
-     0.0111111111111111⋅T_E + 1.8e-5⋅phidelta_C]
-%% Cell type:code id: tags:
-``` python
-φ_kernel = ps.create_kernel(φEqs, cpu_openmp=4, target=target).compile()
-temperatureKernel = ps.create_kernel(temperatureEqs, cpu_openmp=4, target=target).compile()
-```
-%% Cell type:code id: tags:
-``` python
-def time_loop(steps):
-    φ_sync = dh.synchronization_function(['phi'], target=target)
-    temperature_sync = dh.synchronization_function(['T'], target=target)
-    dh.all_to_gpu()
-    for t in range(steps):
-        φ_sync()
-        dh.run_kernel(φ_kernel)
-        temperature_sync()
-        dh.run_kernel(temperatureKernel)
-    dh.all_to_cpu()
-def init(nucleus_size=np.sqrt(5)):
-    for b in dh.iterate():
-        x, y, z = b.cell_index_arrays
-        x, y, z = x - b.shape[0] // 2, y - b.shape[1] // 2, z - b.shape[2] // 2
-        b['phi'].fill(0)
-        b['phi'][(x ** 2 + y ** 2 + z ** 2) < nucleus_size ** 2] = 1.0
-        b['T'].fill(0.0)
-def plot(slice_obj=ps.make_slice[:, :, 0.5]):
-    plt.subplot(1, 3, 1)
-    plt.scalar_field(dh.gather_array('phi', slice_obj).squeeze())
-    plt.title("φ")
-    plt.colorbar()
-    plt.subplot(1, 3, 2)
-    plt.title("T")
-    plt.scalar_field(dh.gather_array('T', slice_obj).squeeze())
-    plt.colorbar()
-    plt.subplot(1, 3, 3)
-    plt.title("∂φ")
-    plt.scalar_field(dh.gather_array('phidelta', slice_obj).squeeze())
-    plt.colorbar()
-```
-%% Cell type:code id: tags:
-``` python
-init()
-plot()
-print(dh)
-```
-%% Output
-        Name|      Inner (min/max)|     WithGl (min/max)
-    ----------------------------------------------------
-           T|            (  0,  0)|            (  0,  0)
-         phi|            (  0,  1)|            (  0,  1)
-    phidelta|            (  0,  0)|            (  0,  0)
-%% Cell type:code id: tags:
-``` python
-if 'is_test_run' in globals():
-    time_loop(2)
-    assert np.isfinite(dh.max('phi'))
-    assert np.isfinite(dh.max('T'))
-    assert np.isfinite(dh.max('phidelta'))
-else:
-    from time import perf_counter
-    vtk_writer = dh.create_vtk_writer('dentritic_growth_large', ['phi'])
-    last = perf_counter()
-    for i in range(300):
-        time_loop(100)
-        vtk_writer(i)
-        print("Step ", i, perf_counter() - last, dh.max('phi'))
-        last = perf_counter()
-```
--- a/pystencils_tests/test_print_infinity.py
+++ b/pystencils_tests/test_print_infinity.py
-import pytest
-import pystencils
-from sympy import oo
-@pytest.mark.parametrize('type', ('float32', 'float64', 'int64'))
-@pytest.mark.parametrize('negative', (False, 'Negative'))
-@pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
-def test_print_infinity(type, negative, target):
-    x = pystencils.fields(f'x:  {type}[1d]')
-    if negative:
-        assignment = pystencils.Assignment(x.center, -oo)
-    else:
-        assignment = pystencils.Assignment(x.center, oo)
-    ast = pystencils.create_kernel(assignment, data_type=type, target=target)
-    if target == pystencils.Target.GPU:
-        pytest.importorskip('pycuda')
-    ast.compile()
-    print(ast.compile().code)
--- a/pystencils_tests/test_print_unsupported_node.py
+++ b/pystencils_tests/test_print_unsupported_node.py
-# -*- coding: utf-8 -*-
-#
-# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
-#
-# Distributed under terms of the GPLv3 license.
-"""
-"""
-import pytest
-import pystencils
-from pystencils.backends.cbackend import CBackend
-class UnsupportedNode(pystencils.astnodes.Node):
-    def __init__(self):
-        super().__init__()
-def test_print_unsupported_node():
-    with pytest.raises(NotImplementedError, match='CBackend does not support node of type UnsupportedNode'):
-        CBackend()(UnsupportedNode())
--- a/pystencils_tests/test_sliced_iteration.py
+++ b/pystencils_tests/test_sliced_iteration.py
-import numpy as np
-import sympy as sp
-from pystencils import Assignment, Field, TypedSymbol, create_kernel, make_slice
-from pystencils.simp import sympy_cse_on_assignment_list
-def test_sliced_iteration():
-    size = (4, 4)
-    src_arr = np.ones(size)
-    dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
-    a, b = sp.symbols("a b")
-    update_rule = Assignment(dst_field[0, 0],
-                             (a * src_field[0, 1] + a * src_field[0, -1] +
-                              b * src_field[1, 0] + b * src_field[-1, 0]) / 4)
-    x_end = TypedSymbol("x_end", "int")
-    s = make_slice[1:x_end, 1]
-    x_end_value = size[1] - 1
-    kernel = create_kernel(sympy_cse_on_assignment_list([update_rule]), iteration_slice=s).compile()
-    kernel(src=src_arr, dst=dst_arr, a=1.0, b=1.0, x_end=x_end_value)
-    expected_result = np.zeros(size)
-    expected_result[1:x_end_value, 1] = 1
-    np.testing.assert_almost_equal(expected_result, dst_arr)
--- a/pystencils_tests/test_transformations.py
+++ b/pystencils_tests/test_transformations.py
-import pystencils as ps
-from pystencils import TypedSymbol
-from pystencils.astnodes import LoopOverCoordinate, SympyAssignment
-from pystencils.typing import create_type
-from pystencils.transformations import filtered_tree_iteration, get_loop_hierarchy, get_loop_counter_symbol_hierarchy
-def test_loop_information():
-    f, g = ps.fields("f, g: double[2D]")
-    update_rule = ps.Assignment(g[0, 0], f[0, 0])
-    ast = ps.create_kernel(update_rule)
-    inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
-                   if l.is_innermost_loop]
-    loop_order = []
-    for i in get_loop_hierarchy(inner_loops[0].args[0]):
-        loop_order.append(i)
-    assert loop_order == [0, 1]
-    loop_symbols = get_loop_counter_symbol_hierarchy(inner_loops[0].args[0])
-    assert loop_symbols == [TypedSymbol("ctr_1", create_type("int"), nonnegative=True),
-                            TypedSymbol("ctr_0", create_type("int"), nonnegative=True)]
--- a/pytest.ini
+++ b/pytest.ini
 [pytest]
+testpaths = src tests doc/notebooks
+pythonpath = src
 python_files = test_*.py *_test.py scenario_*.py
 norecursedirs = *.egg-info .git .cache .ipynb_checkpoints htmlcov
 addopts = --doctest-modules --durations=20  --cov-config pytest.ini
@@ -17,20 +19,21 @@ filterwarnings =
 [run]
 branch = True
-source = pystencils
+source = src/pystencils
-         pystencils_tests
+         tests
 omit = doc/*
-       pystencils_tests/*
+       tests/*
       setup.py
+       quicktest.py
       conftest.py
       versioneer.py
-       pystencils/jupytersetup.py
+       src/pystencils/jupytersetup.py
-       pystencils/cpu/msvc_detection.py
+       src/pystencils/cpu/msvc_detection.py
-       pystencils/sympy_gmpy_bug_workaround.py
+       src/pystencils/sympy_gmpy_bug_workaround.py
-       pystencils/cache.py
+       src/pystencils/cache.py
-       pystencils/pacxx/benchmark.py
+       src/pystencils/pacxx/benchmark.py
-       pystencils/_version.py
+       src/pystencils/_version.py
       venv/
 [report]

--- a/quicktest.py
+++ b/quicktest.py
+#!/usr/bin/env python3
+from contextlib import redirect_stdout
+import io
+from tests.test_quicktests import (
+    test_basic_kernel,
+    test_basic_blocking_staggered,
+    test_basic_vectorization,
+)
+quick_tests = [
+    test_basic_kernel,
+    test_basic_blocking_staggered,
+    test_basic_vectorization,
+]
+if __name__ == "__main__":
+    print("Running pystencils quicktests")
+    for qt in quick_tests:
+        print(f"   -> {qt.__name__}")
+        with redirect_stdout(io.StringIO()):
+            qt()
--- a/setup.cfg
+++ b/setup.cfg
-# See the docstring in versioneer.py for instructions. Note that you must
-# re-run 'versioneer.py setup' after changing this section, and commit the
-# resulting files.
-[versioneer]
-VCS = git
-style = pep440
-versionfile_source = pystencils/_version.py
-versionfile_build = pystencils/_version.py
-tag_prefix = release/
-parentdir_prefix = pystencils-
--- a/setup.py
+++ b/setup.py
-import distutils
+from setuptools import setup, __version__ as setuptools_version
-import io
-import os
-from contextlib import redirect_stdout
-from importlib import import_module
-import setuptools
+if int(setuptools_version.split('.')[0]) < 61:
+    raise Exception(
+        "[ERROR] pystencils requires at least setuptools version 61 to install.\n"
+        "If this error occurs during an installation via pip, it is likely that there is a conflict between "
+        "versions of setuptools installed by pip and the system package manager. "
+        "In this case, it is recommended to install pystencils into a virtual environment instead."
+    )
 import versioneer
-try:
-    import cython  # noqa
-    USE_CYTHON = True
-except ImportError:
-    USE_CYTHON = False
-quick_tests = [
-    'test_quicktests.test_basic_kernel',
-    'test_quicktests.test_basic_blocking_staggered',
-    'test_quicktests.test_basic_vectorization',
-]
-class SimpleTestRunner(distutils.cmd.Command):
-    """A custom command to run selected tests"""
-    description = 'run some quick tests'
-    user_options = []
-    @staticmethod
-    def _run_tests_in_module(test):
-        """Short test runner function - to work also if py.test is not installed."""
-        test = f'pystencils_tests.{test}'
-        mod, function_name = test.rsplit('.', 1)
-        if isinstance(mod, str):
-            mod = import_module(mod)
-        func = getattr(mod, function_name)
-        print(f"   -> {function_name} in {mod.__name__}")
-        with redirect_stdout(io.StringIO()):
-            func()
-    def initialize_options(self):
-        pass
-    def finalize_options(self):
-        pass
-    def run(self):
-        """Run command."""
-        for test in quick_tests:
-            self._run_tests_in_module(test)
-def readme():
-    with open('README.md') as f:
-        return f.read()
-def cython_extensions(*extensions):
-    from distutils.extension import Extension
-    if USE_CYTHON:
-        ext = '.pyx'
-        result = [Extension(e, [os.path.join(*e.split(".")) + ext]) for e in extensions]
-        from Cython.Build import cythonize
-        result = cythonize(result, language_level=3)
-        return result
-    elif all([os.path.exists(os.path.join(*e.split(".")) + '.c') for e in extensions]):
-        ext = '.c'
-        result = [Extension(e, [os.path.join(*e.split(".")) + ext]) for e in extensions]
-        return result
-    else:
-        return None
 def get_cmdclass():
-    cmdclass = {"quicktest": SimpleTestRunner}
+    return versioneer.get_cmdclass()
-    cmdclass.update(versioneer.get_cmdclass())
-    return cmdclass
-setuptools.setup(name='pystencils',
-                 description='Speeding up stencil computations on CPUs and GPUs',
-                 version=versioneer.get_version(),
-                 long_description=readme(),
-                 long_description_content_type="text/markdown",
-                 author='Martin Bauer, Jan Hönig, Markus Holzer',
-                 license='AGPLv3',
-                 author_email='cs10-codegen@fau.de',
-                 url='https://i10git.cs.fau.de/pycodegen/pystencils/',
-                 packages=['pystencils'] + ['pystencils.' + s for s in setuptools.find_packages('pystencils')],
-                 install_requires=['sympy>=1.6,<=1.10', 'numpy>=1.8.0', 'appdirs', 'joblib'],
-                 package_data={'pystencils': ['include/*.h',
-                                              'backends/cuda_known_functions.txt',
-                                              'backends/opencl1.1_known_functions.txt',
-                                              'boundaries/createindexlistcython.c',
-                                              'boundaries/createindexlistcython.pyx']},
-                 ext_modules=cython_extensions("pystencils.boundaries.createindexlistcython"),
-                 classifiers=[
-                     'Development Status :: 4 - Beta',
-                     'Framework :: Jupyter',
-                     'Topic :: Software Development :: Code Generators',
-                     'Topic :: Scientific/Engineering :: Physics',
-                     'Intended Audience :: Developers',
-                     'Intended Audience :: Science/Research',
-                     'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)',
-                 ],
-                 project_urls={
-                     "Bug Tracker": "https://i10git.cs.fau.de/pycodegen/pystencils/issues",
-                     "Documentation": "http://pycodegen.pages.walberla.net/pystencils/",
-                     "Source Code": "https://i10git.cs.fau.de/pycodegen/pystencils",
-                 },
-                 extras_require={
-                     'gpu': ['pycuda'],
-                     'alltrafos': ['islpy', 'py-cpuinfo'],
-                     'bench_db': ['blitzdb', 'pymongo', 'pandas'],
-                     'interactive': ['matplotlib', 'ipy_table', 'imageio', 'jupyter', 'pyevtk', 'rich', 'graphviz'],
-                     'doc': ['sphinx', 'sphinx_rtd_theme', 'nbsphinx',
-                             'sphinxcontrib-bibtex', 'sphinx_autodoc_typehints', 'pandoc'],
-                     'use_cython': ['Cython']
-                 },
-                 tests_require=['pytest',
-                                'pytest-cov',
-                                'pytest-html',
-                                'ansi2html',
-                                'pytest-xdist',
-                                'flake8',
-                                'nbformat',
-                                'nbconvert',
-                                'ipython',
-                                'randomgen>=1.18'],
-                 python_requires=">=3.8",
+setup(
-                 cmdclass=get_cmdclass()
+    version=versioneer.get_version(),
-                 )
+    cmdclass=get_cmdclass(),
+)
--- a/pystencils/__init__.py
+++ b/pystencils/__init__.py
@@ -2,18 +2,19 @@
 from .enums import Backend, Target
 from . import fd
 from . import stencil as stencil
-from .assignment import Assignment, assignment_from_stencil
+from .assignment import Assignment, AddAugmentedAssignment, assignment_from_stencil
-from pystencils.typing.typed_sympy import TypedSymbol
+from .typing.typed_sympy import TypedSymbol
-from .datahandling import create_data_handling
 from .display_utils import get_code_obj, get_code_str, show_code, to_dot
 from .field import Field, FieldType, fields
 from .config import CreateKernelConfig
+from .cache import clear_cache
 from .kernel_decorator import kernel, kernel_config
 from .kernelcreation import create_kernel, create_staggered_kernel
 from .simp import AssignmentCollection
 from .slicing import make_slice
 from .spatial_coordinates import x_, x_staggered, x_staggered_vector, x_vector, y_, y_staggered, z_, z_staggered
 from .sympyextensions import SymbolCreator
+from .datahandling import create_data_handling
 __all__ = ['Field', 'FieldType', 'fields',
           'TypedSymbol',
@@ -23,10 +24,11 @@ __all__ = ['Field', 'FieldType', 'fields',
           'Target', 'Backend',
           'show_code', 'to_dot', 'get_code_obj', 'get_code_str',
           'AssignmentCollection',
-           'Assignment',
+           'Assignment', 'AddAugmentedAssignment',
           'assignment_from_stencil',
           'SymbolCreator',
           'create_data_handling',
+           'clear_cache',
           'kernel', 'kernel_config',
           'x_', 'y_', 'z_',
           'x_staggered', 'y_staggered', 'z_staggered',
@@ -34,7 +36,5 @@ __all__ = ['Field', 'FieldType', 'fields',
           'fd',
           'stencil']
-from ._version import get_versions
+from . import _version
+__version__ = _version.get_versions()['version']
-__version__ = get_versions()['version']
-del get_versions
--- a/pystencils/_version.py
+++ b/pystencils/_version.py
@@ -5,8 +5,9 @@
 # directories (produced by setup.py build) will contain a much shorter file
 # that just contains the computed version number.
-# This file is released into the public domain. Generated by
+# This file is released into the public domain.
-# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer)
+# Generated by versioneer-0.29
+# https://github.com/python-versioneer/python-versioneer
 """Git implementation of _version.py."""
@@ -15,9 +16,11 @@ import os
 import re
 import subprocess
 import sys
+from typing import Any, Callable, Dict, List, Optional, Tuple
+import functools
-def get_keywords():
+def get_keywords() -> Dict[str, str]:
    """Get the keywords needed to look up the version information."""
    # these strings will be replaced by git during git-archive.
    # setup.py/versioneer.py will grep for the variable names, so they must
@@ -33,8 +36,15 @@ def get_keywords():
 class VersioneerConfig:
    """Container for Versioneer configuration parameters."""
+    VCS: str
+    style: str
+    tag_prefix: str
+    parentdir_prefix: str
+    versionfile_source: str
+    verbose: bool
-def get_config():
+def get_config() -> VersioneerConfig:
    """Create, populate and return the VersioneerConfig() object."""
    # these strings are filled in when 'setup.py versioneer' creates
    # _version.py
@@ -43,7 +53,7 @@ def get_config():
    cfg.style = "pep440"
    cfg.tag_prefix = "release/"
    cfg.parentdir_prefix = "pystencils-"
-    cfg.versionfile_source = "pystencils/_version.py"
+    cfg.versionfile_source = "src/pystencils/_version.py"
    cfg.verbose = False
    return cfg
@@ -52,13 +62,13 @@ class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""
-LONG_VERSION_PY = {}
+LONG_VERSION_PY: Dict[str, str] = {}
-HANDLERS = {}
+HANDLERS: Dict[str, Dict[str, Callable]] = {}
-def register_vcs_handler(vcs, method):  # decorator
+def register_vcs_handler(vcs: str, method: str) -> Callable:  # decorator
    """Create decorator to mark a method as the handler of a VCS."""
-    def decorate(f):
+    def decorate(f: Callable) -> Callable:
        """Store f in HANDLERS[vcs][method]."""
        if vcs not in HANDLERS:
            HANDLERS[vcs] = {}
@@ -67,22 +77,35 @@ def register_vcs_handler(vcs, method):  # decorator
    return decorate
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
+def run_command(
-                env=None):
+    commands: List[str],
+    args: List[str],
+    cwd: Optional[str] = None,
+    verbose: bool = False,
+    hide_stderr: bool = False,
+    env: Optional[Dict[str, str]] = None,
+) -> Tuple[Optional[str], Optional[int]]:
    """Call the given command(s)."""
    assert isinstance(commands, list)
-    p = None
+    process = None
-    for c in commands:
+    popen_kwargs: Dict[str, Any] = {}
+    if sys.platform == "win32":
+        # This hides the console window if pythonw.exe is used
+        startupinfo = subprocess.STARTUPINFO()
+        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+        popen_kwargs["startupinfo"] = startupinfo
+    for command in commands:
        try:
-            dispcmd = str([c] + args)
+            dispcmd = str([command] + args)
            # remember shell=False, so use git.cmd on windows, not just git
-            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
+            process = subprocess.Popen([command] + args, cwd=cwd, env=env,
-                                 stdout=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
-                                 stderr=(subprocess.PIPE if hide_stderr
+                                       stderr=(subprocess.PIPE if hide_stderr
-                                         else None))
+                                               else None), **popen_kwargs)
            break
-        except EnvironmentError:
+        except OSError as e:
-            e = sys.exc_info()[1]
            if e.errno == errno.ENOENT:
                continue
            if verbose:
@@ -93,16 +116,20 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None
-    stdout = p.communicate()[0].strip().decode()
+    stdout = process.communicate()[0].strip().decode()
-    if p.returncode != 0:
+    if process.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
-        return None, p.returncode
+        return None, process.returncode
-    return stdout, p.returncode
+    return stdout, process.returncode
-def versions_from_parentdir(parentdir_prefix, root, verbose):
+def versions_from_parentdir(
+    parentdir_prefix: str,
+    root: str,
+    verbose: bool,
+) -> Dict[str, Any]:
    """Try to determine the version from the parent directory name.
    Source tarballs conventionally unpack into a directory that includes both
@@ -111,15 +138,14 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
    """
    rootdirs = []
-    for i in range(3):
+    for _ in range(3):
        dirname = os.path.basename(root)
        if dirname.startswith(parentdir_prefix):
            return {"version": dirname[len(parentdir_prefix):],
                    "full-revisionid": None,
                    "dirty": False, "error": None, "date": None}
-        else:
+        rootdirs.append(root)
-            rootdirs.append(root)
+        root = os.path.dirname(root)  # up a level
-            root = os.path.dirname(root)  # up a level
    if verbose:
        print("Tried directories %s but none started with prefix %s" %
@@ -128,39 +154,42 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
 @register_vcs_handler("git", "get_keywords")
-def git_get_keywords(versionfile_abs):
+def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
    """Extract version information from the given file."""
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
-    keywords = {}
+    keywords: Dict[str, str] = {}
    try:
-        f = open(versionfile_abs, "r")
+        with open(versionfile_abs, "r") as fobj:
-        for line in f.readlines():
+            for line in fobj:
-            if line.strip().startswith("git_refnames ="):
+                if line.strip().startswith("git_refnames ="):
-                mo = re.search(r'=\s*"(.*)"', line)
+                    mo = re.search(r'=\s*"(.*)"', line)
-                if mo:
+                    if mo:
-                    keywords["refnames"] = mo.group(1)
+                        keywords["refnames"] = mo.group(1)
-            if line.strip().startswith("git_full ="):
+                if line.strip().startswith("git_full ="):
-                mo = re.search(r'=\s*"(.*)"', line)
+                    mo = re.search(r'=\s*"(.*)"', line)
-                if mo:
+                    if mo:
-                    keywords["full"] = mo.group(1)
+                        keywords["full"] = mo.group(1)
-            if line.strip().startswith("git_date ="):
+                if line.strip().startswith("git_date ="):
-                mo = re.search(r'=\s*"(.*)"', line)
+                    mo = re.search(r'=\s*"(.*)"', line)
-                if mo:
+                    if mo:
-                    keywords["date"] = mo.group(1)
+                        keywords["date"] = mo.group(1)
-        f.close()
+    except OSError:
-    except EnvironmentError:
        pass
    return keywords
 @register_vcs_handler("git", "keywords")
-def git_versions_from_keywords(keywords, tag_prefix, verbose):
+def git_versions_from_keywords(
+    keywords: Dict[str, str],
+    tag_prefix: str,
+    verbose: bool,
+) -> Dict[str, Any]:
    """Get version information from git keywords."""
-    if not keywords:
+    if "refnames" not in keywords:
-        raise NotThisMethod("no keywords at all, weird")
+        raise NotThisMethod("Short version file found")
    date = keywords.get("date")
    if date is not None:
        # Use only the last line.  Previous lines may contain GPG signature
@@ -179,11 +208,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
-    refs = set([r.strip() for r in refnames.strip("()").split(",")])
+    refs = {r.strip() for r in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
-    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
@@ -192,7 +221,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
-        tags = set([r for r in refs if re.search(r'\d', r)])
+        tags = {r for r in refs if re.search(r'\d', r)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
@@ -201,6 +230,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix):]
+            # Filter out refs that exactly match prefix or that don't start
+            # with a number once the prefix is stripped (mostly a concern
+            # when prefix is '')
+            if not re.match(r'\d', r):
+                continue
            if verbose:
                print("picking %s" % r)
            return {"version": r,
@@ -216,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
 @register_vcs_handler("git", "pieces_from_vcs")
-def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+def git_pieces_from_vcs(
+    tag_prefix: str,
+    root: str,
+    verbose: bool,
+    runner: Callable = run_command
+) -> Dict[str, Any]:
    """Get version from 'git describe' in the root of the source tree.
    This only gets called if the git-archive 'subst' keywords were *not*
@@ -227,8 +266,15 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
-    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
+    # GIT_DIR can interfere with correct operation of Versioneer.
-                          hide_stderr=True)
+    # It may be intended to be passed to the Versioneer-versioned project,
+    # but that should not change where we get our version from.
+    env = os.environ.copy()
+    env.pop("GIT_DIR", None)
+    runner = functools.partial(runner, env=env)
+    _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
+                   hide_stderr=not verbose)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
@@ -236,24 +282,57 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
-    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
+    describe_out, rc = runner(GITS, [
-                                          "--always", "--long",
+        "describe", "--tags", "--dirty", "--always", "--long",
-                                          "--match", "%s*" % tag_prefix],
+        "--match", f"{tag_prefix}[[:digit:]]*"
-                                   cwd=root)
+    ], cwd=root)
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
-    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+    full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()
-    pieces = {}
+    pieces: Dict[str, Any] = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None
+    branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
+                             cwd=root)
+    # --abbrev-ref was added in git-1.6.3
+    if rc != 0 or branch_name is None:
+        raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
+    branch_name = branch_name.strip()
+    if branch_name == "HEAD":
+        # If we aren't exactly on a branch, pick a branch which represents
+        # the current commit. If all else fails, we are on a branchless
+        # commit.
+        branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
+        # --contains was added in git-1.5.4
+        if rc != 0 or branches is None:
+            raise NotThisMethod("'git branch --contains' returned error")
+        branches = branches.split("\n")
+        # Remove the first line if we're running detached
+        if "(" in branches[0]:
+            branches.pop(0)
+        # Strip off the leading "* " from the list of branches.
+        branches = [branch[2:] for branch in branches]
+        if "master" in branches:
+            branch_name = "master"
+        elif not branches:
+            branch_name = None
+        else:
+            # Pick the first branch that is returned. Good or bad.
+            branch_name = branches[0]
+    pieces["branch"] = branch_name
    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out
@@ -270,7 +349,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
        # TAG-NUM-gHEX
        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
        if not mo:
-            # unparseable. Maybe git-describe is misbehaving?
+            # unparsable. Maybe git-describe is misbehaving?
            pieces["error"] = ("unable to parse git-describe output: '%s'"
                               % describe_out)
            return pieces
@@ -295,13 +374,11 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    else:
        # HEX: no tags
        pieces["closest-tag"] = None
-        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
+        out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
-                                    cwd=root)
+        pieces["distance"] = len(out.split())  # total number of commits
-        pieces["distance"] = int(count_out)  # total number of commits
    # commit date: see ISO-8601 comment in git_versions_from_keywords()
-    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
+    date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
-                       cwd=root)[0].strip()
    # Use only the last line.  Previous lines may contain GPG signature
    # information.
    date = date.splitlines()[-1]
@@ -310,14 +387,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    return pieces
-def plus_or_dot(pieces):
+def plus_or_dot(pieces: Dict[str, Any]) -> str:
    """Return a + if we don't already have one, else return a ."""
    if "+" in pieces.get("closest-tag", ""):
        return "."
    return "+"
-def render_pep440(pieces):
+def render_pep440(pieces: Dict[str, Any]) -> str:
    """Build up version string, with post-release "local version identifier".
    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
@@ -342,23 +419,71 @@ def render_pep440(pieces):
    return rendered
-def render_pep440_pre(pieces):
+def render_pep440_branch(pieces: Dict[str, Any]) -> str:
-    """TAG[.post0.devDISTANCE] -- No -dirty.
+    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
+    The ".dev0" means not master branch. Note that .dev0 sorts backwards
+    (a feature branch will appear "older" than the master branch).
    Exceptions:
-    1: no tags. 0.post0.devDISTANCE
+    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            if pieces["branch"] != "master":
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0"
+        if pieces["branch"] != "master":
+            rendered += ".dev0"
+        rendered += "+untagged.%d.g%s" % (pieces["distance"],
+                                          pieces["short"])
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
+    """Split pep440 version string at the post-release segment.
+    Returns the release segments before the post-release and the
+    post-release version number (or -1 if no post-release segment is present).
+    """
+    vc = str.split(ver, ".post")
+    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
+def render_pep440_pre(pieces: Dict[str, Any]) -> str:
+    """TAG[.postN.devDISTANCE] -- No -dirty.
+    Exceptions:
+    1: no tags. 0.post0.devDISTANCE
+    """
+    if pieces["closest-tag"]:
        if pieces["distance"]:
-            rendered += ".post0.dev%d" % pieces["distance"]
+            # update the post release segment
+            tag_version, post_version = pep440_split_post(pieces["closest-tag"])
+            rendered = tag_version
+            if post_version is not None:
+                rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
+            else:
+                rendered += ".post0.dev%d" % (pieces["distance"])
+        else:
+            # no commits, use the tag as the version
+            rendered = pieces["closest-tag"]
    else:
        # exception #1
        rendered = "0.post0.dev%d" % pieces["distance"]
    return rendered
-def render_pep440_post(pieces):
+def render_pep440_post(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX] .
    The ".dev0" means dirty. Note that .dev0 sorts backwards
@@ -385,7 +510,36 @@ def render_pep440_post(pieces):
    return rendered
-def render_pep440_old(pieces):
+def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
+    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
+    The ".dev0" means not master branch.
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["branch"] != "master":
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%s" % pieces["short"]
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["branch"] != "master":
+            rendered += ".dev0"
+        rendered += "+g%s" % pieces["short"]
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]] .
    The ".dev0" means dirty.
@@ -407,7 +561,7 @@ def render_pep440_old(pieces):
    return rendered
-def render_git_describe(pieces):
+def render_git_describe(pieces: Dict[str, Any]) -> str:
    """TAG[-DISTANCE-gHEX][-dirty].
    Like 'git describe --tags --dirty --always'.
@@ -427,7 +581,7 @@ def render_git_describe(pieces):
    return rendered
-def render_git_describe_long(pieces):
+def render_git_describe_long(pieces: Dict[str, Any]) -> str:
    """TAG-DISTANCE-gHEX[-dirty].
    Like 'git describe --tags --dirty --always -long'.
@@ -447,7 +601,7 @@ def render_git_describe_long(pieces):
    return rendered
-def render(pieces, style):
+def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        return {"version": "unknown",
@@ -461,10 +615,14 @@ def render(pieces, style):
    if style == "pep440":
        rendered = render_pep440(pieces)
+    elif style == "pep440-branch":
+        rendered = render_pep440_branch(pieces)
    elif style == "pep440-pre":
        rendered = render_pep440_pre(pieces)
    elif style == "pep440-post":
        rendered = render_pep440_post(pieces)
+    elif style == "pep440-post-branch":
+        rendered = render_pep440_post_branch(pieces)
    elif style == "pep440-old":
        rendered = render_pep440_old(pieces)
    elif style == "git-describe":
@@ -479,7 +637,7 @@ def render(pieces, style):
            "date": pieces.get("date")}
-def get_versions():
+def get_versions() -> Dict[str, Any]:
    """Get version information or return default if unable to do so."""
    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
    # __file__, we can work backwards from there to the root. Some
@@ -500,7 +658,7 @@ def get_versions():
        # versionfile_source is the relative path from the top of the source
        # tree (where the .git directory might live) to this file. Invert
        # this to find the root from __file__.
-        for i in cfg.versionfile_source.split('/'):
+        for _ in cfg.versionfile_source.split('/'):
            root = os.path.dirname(root)
    except NameError:
        return {"version": "0+unknown", "full-revisionid": None,

--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
 import numpy as np
-from pystencils.typing import numpy_name_to_c
 def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
@@ -21,26 +20,26 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
                                                               get_vector_instruction_set)
-        type_name = numpy_name_to_c(np.dtype(dtype).name)
        instruction_sets = get_supported_instruction_sets()
        if instruction_sets is None:
            byte_alignment = 64
        elif byte_alignment == 'cacheline':
            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
-            if all([s is None for s in cacheline_sizes]):
+            if all([s is None for s in cacheline_sizes]) or \
-                widths = [get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+                    max([s for s in cacheline_sizes if s is not None]) > 0x100000:
+                widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                          for is_name in instruction_sets
-                          if type(get_vector_instruction_set(type_name, is_name)['width']) is int]
+                          if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
                byte_alignment = 64 if all([s is None for s in widths]) else max(widths)
            else:
                byte_alignment = max([s for s in cacheline_sizes if s is not None])
-        elif not any([type(get_vector_instruction_set(type_name, is_name)['width']) is int
+        elif not any([type(get_vector_instruction_set(dtype, is_name)['width']) is int
                      for is_name in instruction_sets]):
            byte_alignment = 64
        else:
-            byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+            byte_alignment = max([get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                                  for is_name in instruction_sets
-                                  if type(get_vector_instruction_set(type_name, is_name)['width']) is int])
+                                  if type(get_vector_instruction_set(dtype, is_name)['width']) is int])
    if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
        size = np.prod(shape)
        d = np.dtype(dtype)
@@ -78,7 +77,7 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
        return tmp
-def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True):
+def aligned_zeros(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
                        order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
    x = np.zeros((), arr.dtype)
@@ -86,7 +85,7 @@ def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order=
    return arr
-def aligned_ones(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True):
+def aligned_ones(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
                        order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
    x = np.ones((), arr.dtype)

--- a/pystencils/assignment.py
+++ b/pystencils/assignment.py
 import numpy as np
 import sympy as sp
-from sympy.codegen.ast import Assignment
+from sympy.codegen.ast import Assignment, AugmentedAssignment, AddAugmentedAssignment
 from sympy.printing.latex import LatexPrinter
-__all__ = ['Assignment', 'assignment_from_stencil']
+__all__ = ['Assignment', 'AugmentedAssignment', 'AddAugmentedAssignment', 'assignment_from_stencil']
 def print_assignment_latex(printer, expr):
+    binop = f"{expr.binop}=" if isinstance(expr, AugmentedAssignment) else ''
    """sympy cannot print Assignments as Latex. Thus, this function is added to the sympy Latex printer"""
    printed_lhs = printer.doprint(expr.lhs)
    printed_rhs = printer.doprint(expr.rhs)
-    return fr"{printed_lhs} \leftarrow {printed_rhs}"
+    return fr"{printed_lhs} \leftarrow_{{{binop}}} {printed_rhs}"
 def assignment_str(assignment):
-    return fr"{assignment.lhs} ← {assignment.rhs}"
+    op = f"{assignment.binop}=" if isinstance(assignment, AugmentedAssignment) else '←'
+    return fr"{assignment.lhs} {op} {assignment.rhs}"
 _old_new = sp.codegen.ast.Assignment.__new__
@@ -32,6 +34,9 @@ Assignment.__str__ = assignment_str
 Assignment.__new__ = _Assignment__new__
 LatexPrinter._print_Assignment = print_assignment_latex
+AugmentedAssignment.__str__ = assignment_str
+LatexPrinter._print_AugmentedAssignment = print_assignment_latex
 sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self))

--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -5,12 +5,12 @@ from typing import Any, List, Optional, Sequence, Set, Union
 import sympy as sp
-import pystencils
+from pystencils.assignment import Assignment
-from pystencils.typing.utilities import create_type, get_next_parent_of_type
 from pystencils.enums import Target, Backend
 from pystencils.field import Field
-from pystencils.typing.typed_sympy import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol
 from pystencils.sympyextensions import fast_subs
+from pystencils.typing import (create_type, get_next_parent_of_type,
+                               FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol, CFunction)
 NodeOrExpr = Union['Node', sp.Expr]
@@ -193,6 +193,10 @@ class KernelFunction(Node):
        # function that compiles the node to a Python callable, is set by the backends
        self._compile_function = compile_function
        self.assignments = assignments
+        # If nontemporal stores are activated together with the Neon instruction set it results in cacheline zeroing
+        # For cacheline zeroing the information of the field size for each field is needed. Thus, in this case
+        # all field sizes are kernel parameters and not just the common field size used for the loops
+        self.use_all_written_field_sizes = False
    @property
    def target(self):
@@ -233,7 +237,8 @@ class KernelFunction(Node):
    @property
    def fields_written(self) -> Set[Field]:
        assignments = self.atoms(SympyAssignment)
-        return {a.lhs.field for a in assignments if isinstance(a.lhs, ResolvedFieldAccess)}
+        return set().union(itertools.chain.from_iterable([f.field for f in a.lhs.free_symbols if hasattr(f, 'field')]
+                                                         for a in assignments))
    @property
    def fields_read(self) -> Set[Field]:
@@ -247,6 +252,11 @@ class KernelFunction(Node):
        This function is expensive, cache the result where possible!
        """
        field_map = {f.name: f for f in self.fields_accessed}
+        sizes = set()
+        if self.use_all_written_field_sizes:
+            sizes = set().union(*(a.shape[:a.spatial_dimensions] for a in self.fields_written))
+            sizes = filter(lambda s: isinstance(s, FieldShapeSymbol), sizes)
        def get_fields(symbol):
            if hasattr(symbol, 'field_name'):
@@ -256,9 +266,13 @@ class KernelFunction(Node):
            return ()
        argument_symbols = self._body.undefined_symbols - self.global_variables
+        argument_symbols.update(sizes)
        parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
        if hasattr(self, 'indexing'):
            parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
+        # Exclude paramters of type CFunction. These parameters will result in a C function call that will be handled
+        # by including a respective header file in the compute kernel. Hence, it is not a free parameter.
+        parameters = [p for p in parameters if not isinstance(p.symbol, CFunction)]
        parameters.sort(key=lambda p: p.symbol.name)
        return parameters
@@ -292,7 +306,7 @@ class SkipIteration(Node):
 class Block(Node):
-    def __init__(self, nodes: List[Node]):
+    def __init__(self, nodes: Union[Node, List[Node]]):
        super(Block, self).__init__()
        if not isinstance(nodes, list):
            nodes = [nodes]
@@ -334,14 +348,6 @@ class Block(Node):
        assert self._nodes.count(insert_before) == 1
        idx = self._nodes.index(insert_before)
-        # move all assignment (definitions to the top)
-        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
-            while idx > 0:
-                pn = self._nodes[idx - 1]
-                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
-                    idx -= 1
-                else:
-                    break
        if not if_not_exists or self._nodes[idx] != new_node:
            self._nodes.insert(idx, new_node)
@@ -350,14 +356,6 @@ class Block(Node):
        assert self._nodes.count(insert_after) == 1
        idx = self._nodes.index(insert_after) + 1
-        # move all assignment (definitions to the top)
-        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
-            while idx > 0:
-                pn = self._nodes[idx - 1]
-                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
-                    idx -= 1
-                else:
-                    break
        if not if_not_exists or not (self._nodes[idx - 1] == new_node
                                     or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
            self._nodes.insert(idx, new_node)
@@ -392,7 +390,7 @@ class Block(Node):
    def symbols_defined(self):
        result = set()
        for a in self.args:
-            if isinstance(a, pystencils.Assignment):
+            if isinstance(a, Assignment):
                result.update(a.free_symbols)
            else:
                result.update(a.symbols_defined)
@@ -403,7 +401,7 @@ class Block(Node):
        result = set()
        defined_symbols = set()
        for a in self.args:
-            if isinstance(a, pystencils.Assignment):
+            if isinstance(a, Assignment):
                result.update(a.free_symbols)
                defined_symbols.update({a.lhs})
            else:
@@ -433,7 +431,7 @@ class LoopOverCoordinate(Node):
    LOOP_COUNTER_NAME_PREFIX = "ctr"
    BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
-    def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False):
+    def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False, custom_loop_ctr=None):
        super(LoopOverCoordinate, self).__init__(parent=None)
        self.body = body
        body.parent = self
@@ -444,11 +442,12 @@ class LoopOverCoordinate(Node):
        self.body.parent = self
        self.prefix_lines = []
        self.is_block_loop = is_block_loop
+        self.custom_loop_ctr = custom_loop_ctr
    def new_loop_with_different_body(self, new_body):
        result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
-                                    self.step, self.is_block_loop)
+                                    self.step, self.is_block_loop, self.custom_loop_ctr)
-        result.prefix_lines = [l for l in self.prefix_lines]
+        result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines]
        return result
    def subs(self, subs_dict):
@@ -510,10 +509,13 @@ class LoopOverCoordinate(Node):
    @property
    def loop_counter_name(self):
-        if self.is_block_loop:
+        if self.custom_loop_ctr:
-            return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
+            return self.custom_loop_ctr.name
        else:
-            return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
+            if self.is_block_loop:
+                return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
+            else:
+                return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
    @staticmethod
    def is_loop_counter_symbol(symbol):
@@ -537,10 +539,13 @@ class LoopOverCoordinate(Node):
    @property
    def loop_counter_symbol(self):
-        if self.is_block_loop:
+        if self.custom_loop_ctr:
-            return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
+            return self.custom_loop_ctr
        else:
-            return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
+            if self.is_block_loop:
+                return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
+            else:
+                return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
    @property
    def is_outermost_loop(self):
@@ -566,10 +571,10 @@ class SympyAssignment(Node):
    def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False):
        super(SympyAssignment, self).__init__(parent=None)
        self._lhs_symbol = sp.sympify(lhs_symbol)
-        self.rhs = sp.sympify(rhs_expr)
+        self._rhs = sp.sympify(rhs_expr)
        self._is_const = is_const
        self._is_declaration = self.__is_declaration()
-        self.use_auto = use_auto
+        self._use_auto = use_auto
    def __is_declaration(self):
        from pystencils.typing import CastFunc
@@ -583,15 +588,28 @@ class SympyAssignment(Node):
    def lhs(self):
        return self._lhs_symbol
+    @property
+    def rhs(self):
+        return self._rhs
    @lhs.setter
    def lhs(self, new_value):
        self._lhs_symbol = new_value
        self._is_declaration = self.__is_declaration()
+    @rhs.setter
+    def rhs(self, new_rhs_expr):
+        self._rhs = new_rhs_expr
    def subs(self, subs_dict):
        self.lhs = fast_subs(self.lhs, subs_dict)
        self.rhs = fast_subs(self.rhs, subs_dict)
+    def fast_subs(self, subs_dict, skip=None):
+        self.lhs = fast_subs(self.lhs, subs_dict, skip)
+        self.rhs = fast_subs(self.rhs, subs_dict, skip)
+        return self
    def optimize(self, optimizations):
        try:
            from sympy.codegen.rewriting import optimize
@@ -601,7 +619,7 @@ class SympyAssignment(Node):
    @property
    def args(self):
-        return [self._lhs_symbol, self.rhs, sp.sympify(self._is_const)]
+        return [self._lhs_symbol, self.rhs]
    @property
    def symbols_defined(self):
@@ -619,7 +637,9 @@ class SympyAssignment(Node):
                for i in range(len(symbol.offsets)):
                    loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i))
        result.update(loop_counters)
        result.update(self._lhs_symbol.atoms(sp.Symbol))
        return result
    @property
@@ -630,6 +650,10 @@ class SympyAssignment(Node):
    def is_const(self):
        return self._is_const
+    @property
+    def use_auto(self):
+        return self._use_auto
    def replace(self, child, replacement):
        if child == self.lhs:
            replacement.parent = self
@@ -652,7 +676,7 @@ class SympyAssignment(Node):
        return hash((self.lhs, self.rhs))
    def __eq__(self, other):
-        return type(self) == type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
+        return type(self) is type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
 class ResolvedFieldAccess(sp.Indexed):
No results found