Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 779 additions and 240 deletions
/*
Copyright 2023, Markus Holzer.
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#ifdef __HIPCC_RTC__
typedef __hip_uint8_t uint8_t;
typedef __hip_int8_t int8_t;
typedef __hip_uint16_t uint16_t;
typedef __hip_int16_t int16_t;
#endif
/*
Copyright 2023, Markus Holzer.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
/// Clang version must be 15 or higher for x86 half precision support.
/// GCC version must be 12 or higher for x86 half precision support.
/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
/*
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once #pragma once
#if defined(__SSE2__) || defined(_MSC_VER) #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu32_ps(v); return _mm_cvtepu32_ps(v);
#else #else
__m128i v2 = _mm_srli_epi32(v, 1); __m128i v2 = _mm_srli_epi32(v, 1);
...@@ -28,13 +59,13 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ ...@@ -28,13 +59,13 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
} }
#endif #endif
#if defined(__SSE4_1__) || defined(_MSC_VER) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) #if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math"))) __attribute__((optimize("no-associative-math")))
#endif #endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x) QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu64_pd(x); return _mm_cvtepu64_pd(x);
#elif defined(__clang__) #elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d); return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
...@@ -69,7 +100,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo) ...@@ -69,7 +100,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu32_ps(v); return _mm256_cvtepu32_ps(v);
#else #else
__m256i v2 = _mm256_srli_epi32(v, 1); __m256i v2 = _mm256_srli_epi32(v, 1);
...@@ -80,12 +111,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) ...@@ -80,12 +111,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
#endif #endif
} }
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) #if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math"))) __attribute__((optimize("no-associative-math")))
#endif #endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x) QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu64_pd(x); return _mm256_cvtepu64_pd(x);
#elif defined(__clang__) #elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d); return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
...@@ -99,7 +130,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x) ...@@ -99,7 +130,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
} }
#endif #endif
#ifdef __AVX512F__ #if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a) QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{ {
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3); return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
......
#ifndef __OPENCL_VERSION__ /*
#if defined(__SSE2__) || defined(_MSC_VER) Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2024, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2 #include <emmintrin.h> // SSE2
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
#include <immintrin.h> // AVX* #include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || defined(_MSC_VER) #elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4 #include <smmintrin.h> // SSE4
#ifdef __FMA__ #ifdef __FMA__
#include <immintrin.h> // FMA #include <immintrin.h> // FMA
#endif #endif
#endif #endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON #ifdef __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#ifdef __ARM_FEATURE_SVE #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
#include <arm_sve.h> #include <arm_sve.h>
#endif #endif
...@@ -34,7 +70,7 @@ ...@@ -34,7 +70,7 @@
#endif #endif
#endif #endif
#ifdef __CUDA_ARCH__ #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
#define QUALIFIERS static __forceinline__ __device__ #define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__) #elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline #define QUALIFIERS static inline
...@@ -43,6 +79,12 @@ ...@@ -43,6 +79,12 @@
#include "myintrin.h" #include "myintrin.h"
#endif #endif
#if defined(__ARM_FEATURE_SME)
#define SVE_QUALIFIERS __attribute__((arm_streaming_compatible)) QUALIFIERS
#else
#define SVE_QUALIFIERS QUALIFIERS
#endif
#define PHILOX_W32_0 (0x9E3779B9) #define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85) #define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53) #define PHILOX_M4x32_0 (0xD2511F53)
...@@ -55,7 +97,9 @@ ...@@ -55,7 +97,9 @@
typedef uint32_t uint32; typedef uint32_t uint32;
typedef uint64_t uint64; typedef uint64_t uint64;
#else #else
#ifndef __HIPCC_RTC__
#include <cstdint> #include <cstdint>
#endif
typedef std::uint32_t uint32; typedef std::uint32_t uint32;
typedef std::uint64_t uint64; typedef std::uint64_t uint64;
#endif #endif
...@@ -63,7 +107,7 @@ typedef std::uint64_t uint64; ...@@ -63,7 +107,7 @@ typedef std::uint64_t uint64;
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE) #elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
typedef svfloat32_t svfloat32_st; typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st; typedef svfloat64_t svfloat64_st;
#endif #endif
...@@ -71,7 +115,7 @@ typedef svfloat64_t svfloat64_st; ...@@ -71,7 +115,7 @@ typedef svfloat64_t svfloat64_st;
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{ {
#ifndef __CUDA_ARCH__ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
// host code // host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__)) #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b); *hip = __mulhwu(a,b);
...@@ -182,8 +226,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 ...@@ -182,8 +226,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
#endif #endif
} }
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
#if defined(__SSE4_1__) || defined(_MSC_VER) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{ {
__m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0)); __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
...@@ -665,12 +709,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ...@@ -665,12 +709,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
#ifndef _MSC_VER
QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4) float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{ {
philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
#endif
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
...@@ -695,6 +741,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ...@@ -695,6 +741,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
} }
#ifndef _MSC_VER
QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2) float64x2_t & rnd1, float64x2_t & rnd2)
...@@ -702,10 +749,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ...@@ -702,10 +749,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
} }
#endif #endif
#endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key) SVE_QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
{ {
svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
...@@ -718,14 +766,14 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key) ...@@ -718,14 +766,14 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
ctr = svset4_u32(ctr, 3, lo0); ctr = svset4_u32(ctr, 3, lo0);
} }
QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key) SVE_QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
{ {
key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0))); key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1))); key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
} }
template<bool high> template<bool high>
QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y) SVE_QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
{ {
// convert 32 to 64 bit // convert 32 to 64 bit
if (high) if (high)
...@@ -752,9 +800,9 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y) ...@@ -752,9 +800,9 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
} }
QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3, SVE_QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4) svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{ {
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1)); svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3); svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
...@@ -782,9 +830,9 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, ...@@ -782,9 +830,9 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
} }
QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3, SVE_QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi) svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{ {
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1)); svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3); svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
...@@ -805,9 +853,9 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2 ...@@ -805,9 +853,9 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2
rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3)); rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
} }
QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4) svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{ {
svuint32_t ctr0v = svdup_u32(ctr0); svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2); svuint32_t ctr2v = svdup_u32(ctr2);
...@@ -816,16 +864,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ...@@ -816,16 +864,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4) svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{ {
philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi) svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{ {
svuint32_t ctr0v = svdup_u32(ctr0); svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2); svuint32_t ctr2v = svdup_u32(ctr2);
...@@ -834,9 +882,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ...@@ -834,9 +882,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
} }
QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2) svfloat64_st & rnd1, svfloat64_st & rnd2)
{ {
svuint32_t ctr0v = svdup_u32(ctr0); svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2); svuint32_t ctr2v = svdup_u32(ctr2);
...@@ -846,9 +894,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ...@@ -846,9 +894,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
} }
QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2) svfloat64_st & rnd1, svfloat64_st & rnd2)
{ {
philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
} }
...@@ -1174,7 +1222,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct ...@@ -1174,7 +1222,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct
} }
#endif #endif
#ifdef __AVX512F__ #if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key) QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key)
{ {
__m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0)); __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
......
/*
Copyright 2021, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <altivec.h> #include <altivec.h>
#undef vector #undef vector
#undef bool #undef bool
......
/*
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Zero one cacheline starting at address p.
///
/// Uses the RISC-V Zicboz extension's `cbo.zero` instruction when the
/// compiler targets it; on all other targets this is a deliberate no-op
/// (callers must check cachelineSize() != SIZE_MAX before relying on it).
inline void cachelineZero(void * p) {
#ifdef __riscv_zicboz
   __asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
#else
   (void) p; // suppress unused-parameter warning on non-Zicboz targets
#endif
}
/// Probe the cacheline size in bytes.
///
/// Fills a buffer with 0xff, zeroes one cacheline inside it via
/// cachelineZero(), and measures how many bytes were cleared.
/// Returns SIZE_MAX when the probe fails (nothing was zeroed, bytes
/// before the probe point were zeroed, or the whole probe window was
/// zeroed). Intended to be called once via cachelineSize().
inline size_t _cachelineSize() {
   // 2 MiB probe buffer. Allocated on the heap: the previous stack
   // allocation of this size overflows the default stack on some
   // platforms (e.g. 1 MiB on Windows).
   const size_t max_size = 0x100000;
   uint8_t * data = new uint8_t[2*max_size];

   // fill with ones so zeroed bytes are detectable
   for (size_t i = 0; i < 2*max_size; ++i) {
      data[i] = 0xff;
   }

   // align the probe point to max_size so that a full probe window
   // fits both before and after it
   size_t offset = max_size - ((uintptr_t) data) % max_size;

   // zero a cacheline
   cachelineZero((void*) (data + offset));

   size_t result = SIZE_MAX;
   // valid probe: at least one byte was zeroed at the probe point and
   // nothing before the pointer was touched
   if (data[offset] == 0 && data[offset-1] != 0) {
      // find the first byte that was NOT zeroed; its index is the size.
      // If the loop completes, too much was zeroed -> result stays SIZE_MAX.
      for (size_t size = 1; size < max_size; ++size) {
         if (data[offset + size] != 0) {
            result = size;
            break;
         }
      }
   }

   delete[] data;
   return result;
}
/// Return the cacheline size in bytes, probing it on first use.
///
/// On targets with the RISC-V Zicboz extension the size is measured
/// once and cached; everywhere else the probe is unavailable and
/// SIZE_MAX is returned.
inline size_t cachelineSize() {
#ifdef __riscv_zicboz
   // probe exactly once; later calls reuse the cached value
   static const size_t probed_size = _cachelineSize();
   return probed_size;
#else
   return SIZE_MAX;
#endif
}
# TODO #47: move this into a dedicated functions module
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.data_types import cast_func, collate_types, create_type, get_type_of_expression from pystencils.typing import CastFunc, collate_types, create_type, get_type_of_expression
from pystencils.sympyextensions import is_integer_sequence from pystencils.sympyextensions import is_integer_sequence
...@@ -12,9 +13,9 @@ class IntegerFunctionTwoArgsMixIn(sp.Function): ...@@ -12,9 +13,9 @@ class IntegerFunctionTwoArgsMixIn(sp.Function):
args = [] args = []
for a in (arg1, arg2): for a in (arg1, arg2):
if isinstance(a, sp.Number) or isinstance(a, int): if isinstance(a, sp.Number) or isinstance(a, int):
args.append(cast_func(a, create_type("int"))) args.append(CastFunc(a, create_type("int")))
elif isinstance(a, np.generic): elif isinstance(a, np.generic):
args.append(cast_func(a, a.dtype)) args.append(CastFunc(a, a.dtype))
else: else:
args.append(a) args.append(a)
......
...@@ -4,7 +4,8 @@ import islpy as isl ...@@ -4,7 +4,8 @@ import islpy as isl
import sympy as sp import sympy as sp
import pystencils.astnodes as ast import pystencils.astnodes as ast
from pystencils.transformations import parents_of_type from pystencils.typing import parents_of_type
from pystencils.backends.cbackend import CustomSympyPrinter
def remove_brackets(s): def remove_brackets(s):
...@@ -51,11 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional): ...@@ -51,11 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional):
dofs_in_loops, iteration_set = isl_iteration_set(conditional) dofs_in_loops, iteration_set = isl_iteration_set(conditional)
if dofs_in_condition.issubset(dofs_in_loops): if dofs_in_condition.issubset(dofs_in_loops):
symbol_names = ','.join(dofs_in_loops) symbol_names = ','.join(dofs_in_loops)
condition_str = remove_brackets(str(conditional.condition_expr)) condition_str = CustomSympyPrinter().doprint(conditional.condition_expr)
condition_str = remove_brackets(condition_str)
condition_set = isl.BasicSet(f"{{ [{symbol_names}] : {condition_str} }}") condition_set = isl.BasicSet(f"{{ [{symbol_names}] : {condition_str} }}")
if condition_set.is_empty(): if condition_set.is_empty():
conditional.replace_by_false_block() conditional.replace_by_false_block()
return
intersection = iteration_set.intersect(condition_set) intersection = iteration_set.intersect(condition_set)
if intersection.is_empty(): if intersection.is_empty():
......
File moved
from collections import namedtuple, defaultdict
from typing import Union
import sympy as sp
from sympy.codegen import Assignment
from pystencils.simp import AssignmentCollection
from pystencils import astnodes as ast, TypedSymbol
from pystencils.field import Field
from pystencils.node_collection import NodeCollection
from pystencils.transformations import NestedScopes
# TODO use this in Constraint Checker
# Whitelist of sympy functions that the kernel constraint checker is intended
# to accept inside right-hand-side expressions (not yet enforced, see TODO).
accepted_functions = [
    sp.Pow,
    sp.sqrt,
    sp.log,
    # TODO trigonometric functions (and whatever tests will fail)
]
class KernelConstraintsCheck:
    # TODO: proper specification
    # TODO: More checks :)
    """Checks if the input to create_kernel is valid.

    Test the following conditions:

    - SSA Form for pure symbols:
        - Every pure symbol may occur only once as left-hand-side of an assignment
        - Every pure symbol that is read, may not be written to later
    - Independence / Parallelization condition:
        - a field that is written may only be read at exact the same spatial position

    (Pure symbols are symbols that are not Field.Accesses)
    """
    # Key used to track per-(field, index-component) read/write offsets.
    FieldAndIndex = namedtuple('FieldAndIndex', ['field', 'index'])

    def __init__(self, check_independence_condition=True, check_double_write_condition=True):
        # Nested symbol scopes for the SSA checks (definitions vs. reads).
        self.scopes = NestedScopes()
        # FieldAndIndex -> set of offsets read / written so far.
        self.field_reads = defaultdict(set)
        self.field_writes = defaultdict(set)
        # All fields that occur on any right-hand side.
        self.fields_read = set()
        self.check_independence_condition = check_independence_condition
        self.check_double_write_condition = check_double_write_condition

    def visit(self, obj):
        """Recursively traverse collections, assignments and AST nodes, applying all checks."""
        if isinstance(obj, (AssignmentCollection, NodeCollection)):
            [self.visit(e) for e in obj.all_assignments]
        elif isinstance(obj, list) or isinstance(obj, tuple):
            [self.visit(e) for e in obj]
        elif isinstance(obj, (sp.Eq, ast.SympyAssignment, Assignment)):
            self.process_assignment(obj)
        elif isinstance(obj, ast.Conditional):
            self.scopes.push()
            # Disable double write check inside conditionals
            # would be triggered by e.g. in-kernel boundaries
            old_double_write = self.check_double_write_condition
            old_independence_condition = self.check_independence_condition
            self.check_double_write_condition = False
            self.check_independence_condition = False
            if obj.false_block:
                self.visit(obj.false_block)
            self.process_expression(obj.condition_expr)
            self.process_expression(obj.true_block)
            # Restore the flags that were saved before entering the conditional.
            self.check_double_write_condition = old_double_write
            self.check_independence_condition = old_independence_condition
            self.scopes.pop()
        elif isinstance(obj, ast.Block):
            self.scopes.push()
            [self.visit(e) for e in obj.args]
            self.scopes.pop()
        elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
            # Other AST nodes (e.g. custom code nodes) are accepted unchecked.
            pass
        else:
            raise ValueError(f'Invalid object in kernel {type(obj)}')

    def process_assignment(self, assignment: Union[sp.Eq, ast.SympyAssignment, Assignment]):
        # for checks it is crucial to process rhs before lhs to catch e.g. a = a + 1
        self.process_expression(assignment.rhs)
        self.process_lhs(assignment.lhs)

    def process_expression(self, rhs):
        """Record all reads in ``rhs``, recursing into sub-expressions."""
        # TODO constraint for accepted functions, see TODO above
        self.update_accesses_rhs(rhs)
        if isinstance(rhs, Field.Access):
            self.fields_read.add(rhs.field)
            self.fields_read.update(rhs.indirect_addressing_fields)
        else:
            for arg in rhs.args:
                self.process_expression(arg)

    @property
    def fields_written(self):
        """Return all fields that are written to (i.e. occur as lhs Field.Access)."""
        return set(k.field for k, v in self.field_writes.items() if len(v))

    def process_lhs(self, lhs: Union[Field.Access, TypedSymbol, sp.Symbol]):
        assert isinstance(lhs, sp.Symbol)
        self.update_accesses_lhs(lhs)

    def update_accesses_lhs(self, lhs):
        """Record a write and check the double-write, independence and SSA conditions."""
        if isinstance(lhs, Field.Access):
            fai = self.FieldAndIndex(lhs.field, lhs.index)
            if self.check_double_write_condition and lhs.offsets in self.field_writes[fai]:
                raise ValueError(f"Field {lhs.field.name} is written twice at the same location")

            self.field_writes[fai].add(lhs.offsets)

            if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
                raise ValueError(
                    f"Field {lhs.field.name} is written at two different locations")

            if fai in self.field_reads:
                reads = tuple(self.field_reads[fai])
                if len(reads) > 1 or lhs.offsets != reads[0]:
                    if self.check_independence_condition:
                        raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
                                         f"This means the resulting kernel would not be thread safe")
        elif isinstance(lhs, sp.Symbol):
            # SSA checks: each pure symbol may be defined once and never after being read.
            if self.scopes.is_defined_locally(lhs):
                raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
            if lhs in self.scopes.free_parameters:
                raise ValueError(f"Symbol {lhs.name} is written, after it has been read")
            self.scopes.define_symbol(lhs)

    def update_accesses_rhs(self, rhs):
        """Record a read and check it against previously recorded writes."""
        if isinstance(rhs, Field.Access) and self.check_independence_condition:
            fai = self.FieldAndIndex(rhs.field, rhs.index)
            writes = self.field_writes[fai]
            self.field_reads[fai].add(rhs.offsets)
            for write_offset in writes:
                # Double writes are rejected in update_accesses_lhs, so at most one offset exists.
                assert len(writes) == 1
                if write_offset != rhs.offsets:
                    raise ValueError(f"Violation of loop independence condition. Field "
                                     f"{rhs.field} is read at {rhs.offsets} and written at {write_offset}")
            self.fields_read.add(rhs.field)
        elif isinstance(rhs, sp.Symbol):
            self.scopes.access_symbol(rhs)
...@@ -7,7 +7,7 @@ import sympy as sp ...@@ -7,7 +7,7 @@ import sympy as sp
from pystencils.assignment import Assignment from pystencils.assignment import Assignment
from pystencils.sympyextensions import SymbolCreator from pystencils.sympyextensions import SymbolCreator
from pystencils.kernelcreation import CreateKernelConfig from pystencils.config import CreateKernelConfig
__all__ = ['kernel', 'kernel_config'] __all__ = ['kernel', 'kernel_config']
...@@ -77,10 +77,10 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]: ...@@ -77,10 +77,10 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]:
and updates the function name accordingly. and updates the function name accordingly.
Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
in the result list. Furthermore the meaning of the ternary inline 'if-else' changes meaning to denote a in the result list. Furthermore, the meaning of the ternary inline 'if-else' changes meaning to denote a
sympy Piecewise. sympy Piecewise.
The decorated function may not receive any arguments, with exception of an argument called 's' that specifies The decorated function may not receive any arguments, with exception to an argument called 's' that specifies
a SymbolCreator() a SymbolCreator()
Args: Args:
config: Specify whether to return the list with assignments, or a dictionary containing additional settings config: Specify whether to return the list with assignments, or a dictionary containing additional settings
...@@ -90,14 +90,14 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]: ...@@ -90,14 +90,14 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]:
Examples: Examples:
>>> import pystencils as ps >>> import pystencils as ps
>>> config = ps.CreateKernelConfig() >>> kernel_configuration = ps.CreateKernelConfig()
>>> @kernel_config(config) >>> @kernel_config(kernel_configuration)
... def my_kernel(s): ... def my_kernel(s):
... f, g = ps.fields('f, g: [2D]') ... src, dst = ps.fields('src, dst: [2D]')
... s.neighbors @= f[0,1] + f[1,0] ... s.neighbors @= src[0, 1] + src[1, 0]
... g[0,0] @= s.neighbors + f[0,0] if f[0,0] > 0 else 0 ... dst[0, 0] @= s.neighbors + src[0, 0] if src[0, 0] > 0 else 0
>>> f, g = ps.fields('f, g: [2D]') >>> f, g = ps.fields('src, dst: [2D]')
>>> assert my_kernel['assignments'][0].rhs == f[0,1] + f[1,0] >>> assert my_kernel['assignments'][0].rhs == f[0, 1] + f[1, 0]
""" """
def decorator(func: Callable[..., None]) -> Union[List[Assignment], Dict]: def decorator(func: Callable[..., None]) -> Union[List[Assignment], Dict]:
""" """
......
import itertools import itertools
import warnings import warnings
from dataclasses import dataclass, field from typing import Union, List
from types import MappingProxyType
from typing import Callable, Union, List, Dict, Tuple, Any
import sympy as sp import sympy as sp
from pystencils.config import CreateKernelConfig
from pystencils.assignment import Assignment from pystencils.assignment import Assignment, AddAugmentedAssignment
from pystencils.astnodes import Block, Conditional, LoopOverCoordinate, SympyAssignment from pystencils.astnodes import Node, Block, Conditional, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.vectorization import vectorize from pystencils.cpu.vectorization import vectorize
from pystencils.enums import Target, Backend from pystencils.enums import Target, Backend
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
from pystencils.gpucuda.indexing import indexing_creator_from_params from pystencils.node_collection import NodeCollection
from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.simp.simplifications import apply_sympy_optimisations from pystencils.kernel_contrains_check import KernelConstraintsCheck
from pystencils.simplificationfactory import create_simplification_strategy from pystencils.simplificationfactory import create_simplification_strategy
from pystencils.stencil import direction_string_to_offset, inverse_direction_string from pystencils.stencil import direction_string_to_offset, inverse_direction_string
from pystencils.transformations import ( from pystencils.transformations import (
loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel) loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)
@dataclass def create_kernel(assignments: Union[Assignment, List[Assignment],
class CreateKernelConfig: AddAugmentedAssignment, List[AddAugmentedAssignment],
""" AssignmentCollection, List[Node], NodeCollection],
**Below all parameters for the CreateKernelConfig are explained** *,
"""
target: Target = Target.CPU
"""
All targets are defined in :class:`pystencils.enums.Target`
"""
backend: Backend = None
"""
All backends are defined in :class:`pystencils.enums.Backend`
"""
function_name: str = 'kernel'
"""
Name of the generated function - only important if generated code is written out
"""
data_type: Union[str, dict] = 'double'
"""
Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type
"""
iteration_slice: Tuple = None
"""
Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
"""
ghost_layers: Union[bool, int, List[Tuple[int]]] = None
"""
A single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
If left to default, the number of ghost layers is determined automatically from the assignments.
"""
skip_independence_check: bool = False
"""
Don't check that loop iterations are independent. This is needed e.g. for
periodicity kernel, that access the field outside the iteration bounds. Use with care!
"""
cpu_openmp: Union[bool, int] = False
"""
`True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
of available threads will be chosen.
"""
cpu_vectorize_info: Dict = None
"""
A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
"""
cpu_blocking: Tuple[int] = None
"""
A tuple of block sizes or `None` if no blocking should be applied
"""
omp_single_loop: bool = True
"""
If OpenMP is active: whether multiple outer loops are permitted
"""
gpu_indexing: str = 'block'
"""
Either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
"""
gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
"""
Dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
"""
default_assignment_simplifications: bool = False
"""
If `True` default simplifications are first performed on the Assignments. If problems occur during the
simplification a warning will be thrown.
Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
on the level of the `AssignmentCollection`. In this part, `create_simplification_strategy`
from pystencils.simplificationfactory will be used to apply optimisations like insertion of constants to
remove pressure from the registers. Thus the first part of the optimisations can only be executed if
an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
"""
cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
"""
List of extra optimizations to perform first on the AST.
"""
use_auto_for_assignments: bool = False
"""
If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
"""
index_fields: List[Field] = None
"""
List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
instead of `create_domain_kernel` is used.
"""
coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
"""
Name of the coordinate fields in the struct data type.
"""
def __post_init__(self):
# ---- Legacy parameters
if isinstance(self.target, str):
new_target = Target[self.target.upper()]
warnings.warn(f'Target "{self.target}" as str is deprecated. Use {new_target} instead',
category=DeprecationWarning)
self.target = new_target
# ---- Auto Backend
if not self.backend:
if self.target == Target.CPU:
self.backend = Backend.C
elif self.target == Target.GPU:
self.backend = Backend.CUDA
else:
raise NotImplementedError(f'Target {self.target} has no default backend')
def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCollection, List[Conditional]], *,
config: CreateKernelConfig = None, **kwargs): config: CreateKernelConfig = None, **kwargs):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a list of update equations.
...@@ -171,9 +62,24 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol ...@@ -171,9 +62,24 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol
setattr(config, k, v) setattr(config, k, v)
# ---- Normalizing parameters # ---- Normalizing parameters
if isinstance(assignments, Assignment): if isinstance(assignments, (Assignment, AddAugmentedAssignment)):
assignments = [assignments] assignments = [assignments]
assert assignments, "Assignments must not be empty!" assert assignments, "Assignments must not be empty!"
if isinstance(assignments, list):
assignments = NodeCollection(assignments)
elif isinstance(assignments, AssignmentCollection):
# TODO Markus check and doku
# --- applying first default simplifications
try:
if config.default_assignment_simplifications:
simplification = create_simplification_strategy()
assignments = simplification(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
f"AssignmentCollection due to the following problem :{e}")
simplification_hints = assignments.simplification_hints
assignments = NodeCollection.from_assignment_collection(assignments)
assignments.simplification_hints = simplification_hints
if config.index_fields: if config.index_fields:
return create_indexed_kernel(assignments, config=config) return create_indexed_kernel(assignments, config=config)
...@@ -181,12 +87,15 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol ...@@ -181,12 +87,15 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol
return create_domain_kernel(assignments, config=config) return create_domain_kernel(assignments, config=config)
def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelConfig): def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a NodeCollection.
Note that `create_domain_kernel` is a lower level function which should be accessed by not providing `index_fields`
to create_kernel
Args: Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection` assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration config: CreateKernelConfig which includes the needed configuration
Returns: Returns:
...@@ -196,10 +105,12 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -196,10 +105,12 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
Example: Example:
>>> import pystencils as ps >>> import pystencils as ps
>>> import numpy as np >>> import numpy as np
>>> from pystencils.kernelcreation import create_domain_kernel
>>> from pystencils.node_collection import NodeCollection
>>> s, d = ps.fields('s, d: [2D]') >>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0]) >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True) >>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True)
>>> kernel_ast = ps.kernelcreation.create_domain_kernel([assignment], config=kernel_config) >>> kernel_ast = create_domain_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile() >>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5]) >>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5])) >>> kernel(d=d_arr, s=np.ones([5, 5]))
...@@ -210,38 +121,25 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -210,38 +121,25 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
[0., 4., 4., 4., 0.], [0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]]) [0., 0., 0., 0., 0.]])
""" """
# --- applying first default simplifications # --- eval
try: assignments.evaluate_terms()
if config.default_assignment_simplifications and isinstance(assignments, AssignmentCollection):
simplification = create_simplification_strategy()
assignments = simplification(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
f"AssignmentCollection due to the following problem :{e}")
# ---- Normalizing parameters # FUTURE WORK from here we shouldn't NEED sympy
split_groups = () # --- check constrains
if isinstance(assignments, AssignmentCollection): check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
if 'split_groups' in assignments.simplification_hints: check_double_write_condition=not config.allow_double_writes)
split_groups = assignments.simplification_hints['split_groups']
assignments = assignments.all_assignments check.visit(assignments)
try: assignments.bound_fields = check.fields_written
if config.default_assignment_simplifications: assignments.rhs_fields = check.fields_read
assignments = apply_sympy_optimisations(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default SymPy optimisations to the "
f"Assignments due to the following problem :{e}")
# ---- Creating ast # ---- Creating ast
ast = None ast = None
if config.target == Target.CPU: if config.target == Target.CPU:
if config.backend == Backend.C: if config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_kernel from pystencils.cpu import add_openmp, create_kernel
ast = create_kernel(assignments, function_name=config.function_name, type_info=config.data_type, ast = create_kernel(assignments, config=config)
split_groups=split_groups,
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check)
for optimization in config.cpu_prepend_optimizations: for optimization in config.cpu_prepend_optimizations:
optimization(ast) optimization(ast)
omp_collapse = None omp_collapse = None
...@@ -265,12 +163,8 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -265,12 +163,8 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
raise ValueError("Invalid value for cpu_vectorize_info") raise ValueError("Invalid value for cpu_vectorize_info")
elif config.target == Target.GPU: elif config.target == Target.GPU:
if config.backend == Backend.CUDA: if config.backend == Backend.CUDA:
from pystencils.gpucuda import create_cuda_kernel from pystencils.gpu import create_cuda_kernel
ast = create_cuda_kernel(assignments, function_name=config.function_name, type_info=config.data_type, ast = create_cuda_kernel(assignments, config=config)
indexing_creator=indexing_creator_from_params(config.gpu_indexing,
config.gpu_indexing_params),
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check)
if not ast: if not ast:
raise NotImplementedError( raise NotImplementedError(
...@@ -283,7 +177,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -283,7 +177,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
return ast return ast
def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernelConfig): def create_indexed_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
""" """
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling. coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
...@@ -293,8 +187,11 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -293,8 +187,11 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for 'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for
example boundary parameters. example boundary parameters.
Note that `create_indexed_kernel` is a lower level function which should be accessed by providing `index_fields`
to create_kernel
Args: Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection` assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration config: CreateKernelConfig which includes the needed configuration
Returns: Returns:
...@@ -303,7 +200,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -303,7 +200,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
Example: Example:
>>> import pystencils as ps >>> import pystencils as ps
>>> from pystencils.node_collection import NodeCollection
>>> import numpy as np >>> import numpy as np
>>> from pystencils.kernelcreation import create_indexed_kernel
>>> >>>
>>> # Index field stores the indices of the cell to visit together with optional values >>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)]) >>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
...@@ -314,7 +213,7 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -314,7 +213,7 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
>>> s, d = ps.fields('s, d: [2D]') >>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val')) >>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y')) >>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y'))
>>> kernel_ast = ps.create_indexed_kernel([assignment], config=kernel_config) >>> kernel_ast = create_indexed_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile() >>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5]) >>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr) >>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
...@@ -324,23 +223,30 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -324,23 +223,30 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
[0. , 0. , 4.2, 0. , 0. ], [0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ], [0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]]) [0. , 0. , 0. , 0. , 0. ]])
""" """
# --- eval
assignments.evaluate_terms()
# FUTURE WORK from here we shouldn't NEED sympy
# --- check constrains
check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
check_double_write_condition=not config.allow_double_writes)
check.visit(assignments)
assignments.bound_fields = check.fields_written
assignments.rhs_fields = check.fields_read
ast = None ast = None
if config.target == Target.CPU and config.backend == Backend.C: if config.target == Target.CPU and config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_indexed_kernel from pystencils.cpu import add_openmp, create_indexed_kernel
ast = create_indexed_kernel(assignments, index_fields=config.index_fields, type_info=config.data_type, ast = create_indexed_kernel(assignments, config=config)
coordinate_names=config.coordinate_names)
if config.cpu_openmp: if config.cpu_openmp:
add_openmp(ast, num_threads=config.cpu_openmp) add_openmp(ast, num_threads=config.cpu_openmp)
elif config.target == Target.GPU: elif config.target == Target.GPU:
if config.backend == Backend.CUDA: if config.backend == Backend.CUDA:
from pystencils.gpucuda import created_indexed_cuda_kernel from pystencils.gpu import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params) ast = created_indexed_cuda_kernel(assignments, config=config)
ast = created_indexed_cuda_kernel(assignments,
config.index_fields,
type_info=config.data_type,
coordinate_names=config.coordinate_names,
indexing_creator=idx_creator)
if not ast: if not ast:
raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}') raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}')
...@@ -369,6 +275,7 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -369,6 +275,7 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
Returns: Returns:
AST, see `create_kernel` AST, see `create_kernel`
""" """
# TODO: Add doku like in the other kernels
if 'ghost_layers' in kwargs: if 'ghost_layers' in kwargs:
assert kwargs['ghost_layers'] is None assert kwargs['ghost_layers'] is None
del kwargs['ghost_layers'] del kwargs['ghost_layers']
...@@ -462,11 +369,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -462,11 +369,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
[SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \ [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
[last_conditional] [last_conditional]
if target == Target.CPU: config = CreateKernelConfig(target=target, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
from pystencils.cpu import create_kernel as create_kernel_cpu ast = create_kernel(final_assignments, config=config)
ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
else:
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
return ast return ast
for assignment in assignments: for assignment in assignments:
...@@ -483,6 +387,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -483,6 +387,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
if 'cpu_prepend_optimizations' in kwargs: if 'cpu_prepend_optimizations' in kwargs:
prepend_optimizations += kwargs['cpu_prepend_optimizations'] prepend_optimizations += kwargs['cpu_prepend_optimizations']
del kwargs['cpu_prepend_optimizations'] del kwargs['cpu_prepend_optimizations']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, omp_single_loop=False,
cpu_prepend_optimizations=prepend_optimizations, **kwargs) config = CreateKernelConfig(ghost_layers=ghost_layers, target=target, omp_single_loop=False,
cpu_prepend_optimizations=prepend_optimizations, **kwargs)
ast = create_kernel(final_assignments, config=config)
return ast return ast
from typing import Any, Dict, List, Union, Optional, Set
import sympy
import sympy as sp
from sympy.codegen.rewriting import ReplaceOptim, optimize
from pystencils.assignment import Assignment, AddAugmentedAssignment
import pystencils.astnodes as ast
from pystencils.backends.cbackend import CustomCodeNode
from pystencils.functions import DivFunc
from pystencils.simp import AssignmentCollection
from pystencils.typing import FieldPointerSymbol
class NodeCollection:
    """Normalises a mixed list of assignments and backend AST nodes into a flat
    collection of ``ast`` nodes, and offers sympy-level term optimisations on it.

    Args:
        assignments: list of :class:`Assignment`, augmented assignments, or backend
            AST nodes (``SympyAssignment``, ``Conditional``, ``Block``, ...).
        simplification_hints: optional dict of hints carried over from an
            :class:`AssignmentCollection`; defaults to an empty dict.
        bound_fields: optional set of field symbols bound on the left-hand side.
        rhs_fields: optional set of field symbols appearing on right-hand sides.

    Raises:
        ValueError: if an unsupported object (e.g. a ``LoopOverCoordinate``)
            occurs in ``assignments``.
    """

    def __init__(self, assignments: List[Union[ast.Node, Assignment]],
                 simplification_hints: Optional[Dict[str, Any]] = None,
                 bound_fields: Set[sp.Symbol] = None, rhs_fields: Set[sp.Symbol] = None):

        def visit(obj):
            # Recursively convert the heterogeneous input into backend AST nodes.
            if isinstance(obj, (list, tuple)):
                return [visit(e) for e in obj]
            if isinstance(obj, Assignment):
                if isinstance(obj.lhs, FieldPointerSymbol):
                    # Pointer assignments inherit const-ness from the pointer's type.
                    return ast.SympyAssignment(obj.lhs, obj.rhs, is_const=obj.lhs.dtype.const)
                return ast.SympyAssignment(obj.lhs, obj.rhs)
            elif isinstance(obj, AddAugmentedAssignment):
                # Lower `lhs += rhs` into the explicit form `lhs = lhs + rhs`.
                return ast.SympyAssignment(obj.lhs, obj.lhs + obj.rhs)
            elif isinstance(obj, ast.SympyAssignment):
                return obj
            elif isinstance(obj, ast.Conditional):
                true_block = visit(obj.true_block)
                false_block = None if obj.false_block is None else visit(obj.false_block)
                return ast.Conditional(obj.condition_expr, true_block=true_block, false_block=false_block)
            elif isinstance(obj, ast.Block):
                return ast.Block([visit(e) for e in obj.args])
            elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
                # Any other AST node is passed through unchanged; loops must not
                # appear here because loop construction happens later in the pipeline.
                return obj
            else:
                raise ValueError("Invalid object in the List of Assignments " + str(type(obj)))

        self.all_assignments = visit(assignments)
        self.simplification_hints = simplification_hints if simplification_hints else {}
        # Fix: the fallbacks used to be `{}` (empty *dicts*), contradicting the
        # `Set[sp.Symbol]` annotations; use empty sets so set operations work.
        self.bound_fields = bound_fields if bound_fields else set()
        self.rhs_fields = rhs_fields if rhs_fields else set()

    @staticmethod
    def from_assignment_collection(assignment_collection: AssignmentCollection):
        """Build a NodeCollection from an AssignmentCollection, carrying over hints and field sets."""
        return NodeCollection(assignments=assignment_collection.all_assignments,
                              simplification_hints=assignment_collection.simplification_hints,
                              bound_fields=assignment_collection.bound_fields,
                              rhs_fields=assignment_collection.rhs_fields)

    def evaluate_terms(self):
        """Evaluate constant non-integer subexpressions and unroll small integer powers.

        Powers with |exponent| <= 8 are expanded into explicit multiplications
        (wrapped in ``UnevaluatedExpr`` so sympy does not re-fold them); negative
        exponents become explicit ``DivFunc`` divisions.
        """
        evaluate_constant_terms = ReplaceOptim(
            lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
            lambda p: p.evalf()
        )

        evaluate_pow = ReplaceOptim(
            lambda e: e.is_Pow and e.exp.is_Integer and abs(e.exp) <= 8,
            lambda p: sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else
            (DivFunc(sp.Integer(1), p.base) if p.exp == -1 else
             DivFunc(sp.Integer(1), sp.UnevaluatedExpr(sp.Mul(*([p.base] * -p.exp), evaluate=False))))
        )
        sympy_optimisations = [evaluate_constant_terms, evaluate_pow]

        def visitor(node):
            # CustomCodeNodes carry opaque code; leave them untouched.
            if isinstance(node, CustomCodeNode):
                return node
            elif isinstance(node, ast.Block):
                return node.func([visitor(child) for child in node.args])
            elif isinstance(node, ast.SympyAssignment):
                new_lhs = visitor(node.lhs)
                new_rhs = visitor(node.rhs)
                return node.func(new_lhs, new_rhs, node.is_const, node.use_auto)
            elif isinstance(node, ast.Node):
                return node.func(*[visitor(child) for child in node.args])
            elif isinstance(node, sympy.Basic):
                return optimize(node, sympy_optimisations)
            else:
                raise NotImplementedError(f'{node} {type(node)} has no valid visitor')

        self.all_assignments = [visitor(assignment) for assignment in self.all_assignments]
File moved
...@@ -2,7 +2,7 @@ import copy ...@@ -2,7 +2,7 @@ import copy
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.data_types import TypedSymbol, cast_func from pystencils.typing import TypedSymbol, CastFunc
from pystencils.astnodes import LoopOverCoordinate from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
from pystencils.sympyextensions import fast_subs from pystencils.sympyextensions import fast_subs
...@@ -47,11 +47,11 @@ class RNGBase(CustomCodeNode): ...@@ -47,11 +47,11 @@ class RNGBase(CustomCodeNode):
def get_code(self, dialect, vector_instruction_set, print_arg): def get_code(self, dialect, vector_instruction_set, print_arg):
code = "\n" code = "\n"
for r in self.result_symbols: for r in self.result_symbols:
if vector_instruction_set and not self.args[1].atoms(cast_func): if vector_instruction_set and not self.args[1].atoms(CastFunc):
# this vector RNG has become scalar through substitution # this vector RNG has become scalar through substitution
code += f"{r.dtype} {r.name};\n" code += f"{r.dtype} {r.name};\n"
else: else:
code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \ code += f"{vector_instruction_set[r.dtype.c_name] if vector_instruction_set else r.dtype} " + \
f"{r.name};\n" f"{r.name};\n"
args = [print_arg(a) for a in self.args] + ['' + r.name for r in self.result_symbols] args = [print_arg(a) for a in self.args] + ['' + r.name for r in self.result_symbols]
code += (self._name + "(" + ", ".join(args) + ");\n") code += (self._name + "(" + ", ".join(args) + ");\n")
...@@ -61,6 +61,15 @@ class RNGBase(CustomCodeNode): ...@@ -61,6 +61,15 @@ class RNGBase(CustomCodeNode):
return ", ".join([str(s) for s in self.result_symbols]) + " \\leftarrow " + \ return ", ".join([str(s) for s in self.result_symbols]) + " \\leftarrow " + \
self._name.capitalize() + "_RNG(" + ", ".join([str(a) for a in self.args]) + ")" self._name.capitalize() + "_RNG(" + ", ".join([str(a) for a in self.args]) + ")"
def _hashable_content(self):
return (self._name, *self.result_symbols, *self.args)
def __eq__(self, other):
return type(self) is type(other) and self._hashable_content() == other._hashable_content()
def __hash__(self):
return hash(self._hashable_content())
class PhiloxTwoDoubles(RNGBase): class PhiloxTwoDoubles(RNGBase):
_name = "philox_double2" _name = "philox_double2"
......
import socket import socket
import time import time
from types import MappingProxyType
from typing import Dict, Iterator, Sequence from typing import Dict, Iterator, Sequence
import blitzdb import blitzdb
import six
from blitzdb.backends.file.backend import serializer_classes
from blitzdb.backends.file.utils import JsonEncoder
from pystencils.cpu.cpujit import get_compiler_config from pystencils.cpu.cpujit import get_compiler_config
from pystencils import CreateKernelConfig, Target, Backend, Field
import json
import sympy as sp
from pystencils.typing import BasicType
class PystencilsJsonEncoder(JsonEncoder):
    """JSON encoder that knows how to serialise pystencils and sympy objects
    (kernel configs, numbers, types, targets/backends, fields) for the run database."""

    def default(self, obj):
        if isinstance(obj, CreateKernelConfig):
            return obj.__dict__
        # Bug fix: check sp.Integer *before* sp.Rational. In sympy, Integer is a
        # subclass of Rational, so the float branch used to swallow integers and
        # serialise e.g. Integer(3) as 3.0; the int branch was unreachable.
        if isinstance(obj, sp.Integer):
            return int(obj)
        if isinstance(obj, (sp.Float, sp.Rational)):
            return float(obj)
        if isinstance(obj, (BasicType, MappingProxyType)):
            return str(obj)
        if isinstance(obj, (Target, Backend, sp.Symbol)):
            return obj.name
        if isinstance(obj, Field):
            return f"pystencils.Field(name = {obj.name}, field_type = {obj.field_type.name}, " \
                   f"dtype = {str(obj.dtype)}, layout = {obj.layout}, shape = {obj.shape}, " \
                   f"strides = {obj.strides})"
        # Fall back to the base encoder, which raises TypeError for unknown objects.
        return JsonEncoder.default(self, obj)
class PystencilsJsonSerializer(object):
    """blitzdb serializer that stores documents as UTF-8 JSON using
    :class:`PystencilsJsonEncoder` for pystencils/sympy objects.

    Note: the former ``six.PY3`` branching was dead code — this file already
    requires Python 3 (it uses f-strings), and the Python-2 branches were either
    unreachable or identical — so it has been removed.
    """

    @classmethod
    def serialize(cls, data):
        """Encode ``data`` to UTF-8 JSON bytes; ``bytes`` input is decoded first."""
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        return json.dumps(data, cls=PystencilsJsonEncoder, ensure_ascii=False).encode('utf-8')

    @classmethod
    def deserialize(cls, data):
        """Decode UTF-8 JSON bytes back into Python objects."""
        return json.loads(data.decode('utf-8'))
class Database: class Database:
...@@ -46,7 +96,7 @@ class Database: ...@@ -46,7 +96,7 @@ class Database:
class SimulationResult(blitzdb.Document): class SimulationResult(blitzdb.Document):
pass pass
def __init__(self, file: str) -> None: def __init__(self, file: str, serializer_info: tuple = None) -> None:
if file.startswith("mongo://"): if file.startswith("mongo://"):
from pymongo import MongoClient from pymongo import MongoClient
db_name = file[len("mongo://"):] db_name = file[len("mongo://"):]
...@@ -57,6 +107,10 @@ class Database: ...@@ -57,6 +107,10 @@ class Database:
self.backend.autocommit = True self.backend.autocommit = True
if serializer_info:
serializer_classes.update({serializer_info[0]: serializer_info[1]})
self.backend.load_config({'serializer_class': serializer_info[0]}, True)
def save(self, params: Dict, result: Dict, env: Dict = None, **kwargs) -> None: def save(self, params: Dict, result: Dict, env: Dict = None, **kwargs) -> None:
"""Stores a simulation result in the database. """Stores a simulation result in the database.
...@@ -146,10 +200,15 @@ class Database: ...@@ -146,10 +200,15 @@ class Database:
'cpuCompilerConfig': get_compiler_config(), 'cpuCompilerConfig': get_compiler_config(),
} }
try: try:
from git import Repo, InvalidGitRepositoryError from git import Repo
except ImportError:
return result
try:
from git import InvalidGitRepositoryError
repo = Repo(search_parent_directories=True) repo = Repo(search_parent_directories=True)
result['git_hash'] = str(repo.head.commit) result['git_hash'] = str(repo.head.commit)
except (ImportError, InvalidGitRepositoryError): except InvalidGitRepositoryError:
pass pass
return result return result
......
...@@ -9,6 +9,7 @@ from time import sleep ...@@ -9,6 +9,7 @@ from time import sleep
from typing import Any, Callable, Dict, Optional, Sequence, Tuple from typing import Any, Callable, Dict, Optional, Sequence, Tuple
from pystencils.runhelper import Database from pystencils.runhelper import Database
from pystencils.runhelper.db import PystencilsJsonSerializer
from pystencils.utils import DotDict from pystencils.utils import DotDict
ParameterDict = Dict[str, Any] ParameterDict = Dict[str, Any]
...@@ -54,10 +55,11 @@ class ParameterStudy: ...@@ -54,10 +55,11 @@ class ParameterStudy:
Run = namedtuple("Run", ['parameter_dict', 'weight']) Run = namedtuple("Run", ['parameter_dict', 'weight'])
def __init__(self, run_function: Callable[..., Dict], runs: Sequence = (), def __init__(self, run_function: Callable[..., Dict], runs: Sequence = (),
database_connector: str = './db') -> None: database_connector: str = './db',
serializer_info: tuple = ('pystencils_serializer', PystencilsJsonSerializer)) -> None:
self.runs = list(runs) self.runs = list(runs)
self.run_function = run_function self.run_function = run_function
self.db = Database(database_connector) self.db = Database(database_connector, serializer_info)
def add_run(self, parameter_dict: ParameterDict, weight: int = 1) -> None: def add_run(self, parameter_dict: ParameterDict, weight: int = 1) -> None:
"""Schedule a dictionary of parameters to run in this parameter study. """Schedule a dictionary of parameters to run in this parameter study.
......