Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 882 additions and 233 deletions
-from os.path import dirname, join, realpath
+from os.path import dirname, realpath

 def get_pystencils_include_path():
     return dirname(realpath(__file__))

-def get_pycuda_include_path():
-    import pycuda
-    return join(dirname(realpath(pycuda.__file__)), 'cuda')
+/*
+Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
+Copyright 2019-2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
 #include <emmintrin.h> // SSE2
 #include <wmmintrin.h> // AES
 #ifdef __AVX__
@@ -551,7 +583,7 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr
 #endif

-#ifdef __AVX512F__
+#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
 QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
     alignas(64) std::array<uint32,16> a;
     _mm512_store_si512((__m512i*) a.data(), k512);
...
+/*
+Copyright 2021-2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(_MSC_VER)
+#define __ARM_NEON
+#endif
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -32,10 +67,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
 #endif

 inline void cachelineZero(void * p) {
+#if !defined(_MSC_VER) || defined(__clang__)
     __asm__ volatile("dc zva, %0"::"r"(p):"memory");
+#endif
 }

 inline size_t _cachelineSize() {
+#if !defined(_MSC_VER) || defined(__clang__)
     // check that dc zva is permitted
     uint64_t dczid;
     __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
@@ -72,6 +110,7 @@ inline size_t _cachelineSize() {
             return size;
         }
     }
+#endif

     // too much was zeroed
     return SIZE_MAX;
...
+/*
+Copyright 2023, Markus Holzer.
+Copyright 2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#define POS_INFINITY __int_as_float(0x7f800000)
+#define INFINITY POS_INFINITY
+#define NEG_INFINITY __int_as_float(0xff800000)
+
+#ifdef __HIPCC_RTC__
+typedef __hip_uint8_t uint8_t;
+typedef __hip_int8_t int8_t;
+typedef __hip_uint16_t uint16_t;
+typedef __hip_int16_t int16_t;
+#endif
+/*
+Copyright 2023, Markus Holzer.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/// Half precision support. Experimental. Use carefully.
+///
+/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
+/// On x86 architectures, you can expect the data format to be supported natively only for storage and
+/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
+/// Only bandwidth-bound code may therefore benefit. None of this is guaranteed, and it may change in the future.
+///
+/// Clang version must be 15 or higher for x86 half precision support.
+/// GCC version must be 12 or higher for x86 half precision support.
+/// Support also seems to require SSE, so ensure that the respective instruction sets are enabled.
+///
+/// See
+/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
+/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+/// for more information.
+
+#pragma once
+
+using half = _Float16;
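The storage-versus-compute behaviour described in the comment above can also be observed with NumPy's float16, which works analogously: values live in 16 bits, while arithmetic happens in a wider type and is rounded back. A minimal illustrative sketch (not part of the diff):

    import numpy as np

    x = np.float16(0.1)   # stored in 16 bits; 0.1 is rounded to the nearest fp16 value
    y = np.float16(0.2)
    z = x + y             # computed in a wider type internally, result truncated to fp16
    assert z.dtype == np.float16
    print(float(x), float(y), float(z))  # makes the fp16 rounding error visible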
+/*
+Copyright 2019-2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
 #pragma once

-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
     return _mm_cvtepu32_ps(v);
 #else
     __m128i v2 = _mm_srli_epi32(v, 1);
@@ -28,13 +59,13 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
 }
 #endif

-#if defined(__SSE4_1__) || defined(_MSC_VER)
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
+#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
 QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
     return _mm_cvtepu64_pd(x);
 #elif defined(__clang__)
     return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
@@ -69,7 +100,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
 QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
     return _mm256_cvtepu32_ps(v);
 #else
     __m256i v2 = _mm256_srli_epi32(v, 1);
@@ -80,12 +111,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
 #endif
 }

-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
+#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
 QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
     return _mm256_cvtepu64_pd(x);
 #elif defined(__clang__)
     return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
@@ -99,7 +130,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
 }
 #endif

-#ifdef __AVX512F__
+#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
 QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
 {
     return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
...
-#ifndef __OPENCL_VERSION__
-#if defined(__SSE2__) || defined(_MSC_VER)
+/*
+Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
+Copyright 2019-2024, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
 #include <immintrin.h> // AVX*
-#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <smmintrin.h> // SSE4
 #ifdef __FMA__
 #include <immintrin.h> // FMA
 #endif
 #endif

+#if defined(_MSC_VER) && defined(_M_ARM64)
+#define __ARM_NEON
+#endif
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#ifdef __ARM_FEATURE_SVE
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 #include <arm_sve.h>
 #endif
@@ -34,7 +70,7 @@
 #endif
 #endif

-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
 #define QUALIFIERS static inline
@@ -43,6 +79,12 @@
 #include "myintrin.h"
 #endif

+#if defined(__ARM_FEATURE_SME)
+#define SVE_QUALIFIERS __attribute__((arm_streaming_compatible)) QUALIFIERS
+#else
+#define SVE_QUALIFIERS QUALIFIERS
+#endif
+
 #define PHILOX_W32_0   (0x9E3779B9)
 #define PHILOX_W32_1   (0xBB67AE85)
 #define PHILOX_M4x32_0 (0xD2511F53)
@@ -55,7 +97,9 @@
 typedef uint32_t uint32;
 typedef uint64_t uint64;
 #else
+#ifndef __HIPCC_RTC__
 #include <cstdint>
+#endif
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 #endif
@@ -63,7 +107,7 @@ typedef std::uint64_t uint64;
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
 typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
 typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-#elif defined(__ARM_FEATURE_SVE)
+#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 typedef svfloat32_t svfloat32_st;
 typedef svfloat64_t svfloat64_st;
 #endif
@@ -71,7 +115,7 @@ typedef svfloat64_t svfloat64_st;
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     // host code
 #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
     *hip = __mulhwu(a,b);
@@ -182,8 +226,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 #endif
 }

-#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
     __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
@@ -665,12 +709,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
     philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }

+#ifndef _MSC_VER
 QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
 {
     philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
+#endif

 QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
@@ -695,6 +741,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }

+#ifndef _MSC_VER
 QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                                uint32 key0, uint32 key1,
                                float64x2_t & rnd1, float64x2_t & rnd2)
@@ -702,10 +749,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
+#endif
 #endif

-#if defined(__ARM_FEATURE_SVE)
-QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
+SVE_QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
 {
     svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
     svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
@@ -718,14 +766,14 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
     ctr = svset4_u32(ctr, 3, lo0);
 }

-QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
+SVE_QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
 {
     key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
     key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
 }

 template<bool high>
-QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
+SVE_QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
 {
     // convert 32 to 64 bit
     if (high)
@@ -752,9 +800,9 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
 }

-QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
-                              uint32 key0, uint32 key1,
-                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+SVE_QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
+                                  uint32 key0, uint32 key1,
+                                  svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
     svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
     svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
@@ -782,9 +830,9 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
 }

-QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
-                               uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+SVE_QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
+                                   uint32 key0, uint32 key1,
+                                   svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
 {
     svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
     svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
@@ -805,9 +853,9 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2
     rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
 }

-QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
-                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+SVE_QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                                  uint32 key0, uint32 key1,
+                                  svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
     svuint32_t ctr0v = svdup_u32(ctr0);
     svuint32_t ctr2v = svdup_u32(ctr2);
@@ -816,16 +864,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
     philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }

-QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
-                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+SVE_QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                                  uint32 key0, uint32 key1,
+                                  svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
     philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }

-QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                                   uint32 key0, uint32 key1,
+                                   svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
 {
     svuint32_t ctr0v = svdup_u32(ctr0);
     svuint32_t ctr2v = svdup_u32(ctr2);
@@ -834,9 +882,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
 }

-QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1, svfloat64_st & rnd2)
+SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                                   uint32 key0, uint32 key1,
+                                   svfloat64_st & rnd1, svfloat64_st & rnd2)
 {
     svuint32_t ctr0v = svdup_u32(ctr0);
     svuint32_t ctr2v = svdup_u32(ctr2);
@@ -846,9 +894,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
     philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }

-QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1, svfloat64_st & rnd2)
+SVE_QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
+                                   uint32 key0, uint32 key1,
+                                   svfloat64_st & rnd1, svfloat64_st & rnd2)
 {
     philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
@@ -1174,7 +1222,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct
 }
 #endif

-#ifdef __AVX512F__
+#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
 QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key)
 {
     __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
...
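For orientation: every SIMD variant in this header implements the same Philox-4x32 round function from D. E. Shaw Research's Random123. A scalar Python sketch of one round and the key schedule, using the constants defined above (illustrative only, not part of the diff):

    MASK = 0xFFFFFFFF
    PHILOX_W32_0, PHILOX_W32_1 = 0x9E3779B9, 0xBB67AE85
    PHILOX_M4x32_0, PHILOX_M4x32_1 = 0xD2511F53, 0xCD9E8D57

    def mulhilo32(a, b):
        """Split the 64-bit product a*b into its (high, low) 32-bit halves."""
        prod = (a & MASK) * (b & MASK)
        return (prod >> 32) & MASK, prod & MASK

    def philox4x32_round(ctr, key):
        # one round: multiply counter words 0 and 2, then mix with words 1 and 3
        hi0, lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0])
        hi1, lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2])
        return [hi1 ^ ctr[1] ^ key[0], lo1, hi0 ^ ctr[3] ^ key[1], lo0]

    def philox4x32_bumpkey(key):
        # advance the key by the Weyl constants between rounds
        return [(key[0] + PHILOX_W32_0) & MASK, (key[1] + PHILOX_W32_1) & MASK]

Ten rounds, with the key bumped between rounds, turn one 128-bit counter into four 32-bit random words; the code in this header merely vectorizes that loop over SSE/AVX/AVX-512, NEON, SVE/SME, and the other instruction sets.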
+/*
+Copyright 2021, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
 #include <altivec.h>
 #undef vector
 #undef bool
...
+/*
+Copyright 2023, Michael Kuron.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+inline void cachelineZero(void * p) {
+#ifdef __riscv_zicboz
+    __asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
+#endif
+}
+
+inline size_t _cachelineSize() {
+    // allocate and fill with ones
+    const size_t max_size = 0x100000;
+    uint8_t data[2*max_size];
+    for (size_t i = 0; i < 2*max_size; ++i) {
+        data[i] = 0xff;
+    }
+
+    // find alignment offset
+    size_t offset = max_size - ((uintptr_t) data) % max_size;
+
+    // zero a cacheline
+    cachelineZero((void*) (data + offset));
+
+    // make sure that at least one byte was zeroed
+    if (data[offset] != 0) {
+        return SIZE_MAX;
+    }
+
+    // make sure that nothing was zeroed before the pointer
+    if (data[offset-1] == 0) {
+        return SIZE_MAX;
+    }
+
+    // find the last byte that was zeroed
+    for (size_t size = 1; size < max_size; ++size) {
+        if (data[offset + size] != 0) {
+            return size;
+        }
+    }
+
+    // too much was zeroed
+    return SIZE_MAX;
+}
+
+inline size_t cachelineSize() {
+#ifdef __riscv_zicboz
+    static size_t size = _cachelineSize();
+    return size;
+#else
+    return SIZE_MAX;
+#endif
+}
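The probing trick used by _cachelineSize() above (fill a buffer with 0xff, zero one aligned line with the special instruction, then measure the length of the zero run) can be mimicked in a few lines of Python. Buffer and line sizes here are made up for illustration:

    def probe_zeroed_run(data, offset):
        """Recover how many bytes were zeroed at `offset`, mirroring the C scan."""
        if data[offset] != 0:       # nothing was zeroed
            return None
        if data[offset - 1] == 0:   # zeroing started before the pointer
            return None
        size = 1
        while offset + size < len(data) and data[offset + size] == 0:
            size += 1
        return size

    buf = bytearray(b'\xff' * 256)
    buf[64:128] = bytes(64)         # emulate cbo.zero on a 64-byte line at offset 64
    assert probe_zeroed_run(buf, 64) == 64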
+# TODO #47 move to a module functions
 import numpy as np
 import sympy as sp

-from pystencils.data_types import cast_func, collate_types, create_type, get_type_of_expression
+from pystencils.typing import CastFunc, collate_types, create_type, get_type_of_expression
 from pystencils.sympyextensions import is_integer_sequence
@@ -12,9 +13,9 @@ class IntegerFunctionTwoArgsMixIn(sp.Function):
         args = []
         for a in (arg1, arg2):
             if isinstance(a, sp.Number) or isinstance(a, int):
-                args.append(cast_func(a, create_type("int")))
+                args.append(CastFunc(a, create_type("int")))
             elif isinstance(a, np.generic):
-                args.append(cast_func(a, a.dtype))
+                args.append(CastFunc(a, a.dtype))
             else:
                 args.append(a)
...
@@ -4,7 +4,8 @@ import islpy as isl
 import sympy as sp

 import pystencils.astnodes as ast
-from pystencils.transformations import parents_of_type
+from pystencils.typing import parents_of_type
+from pystencils.backends.cbackend import CustomSympyPrinter

 def remove_brackets(s):
@@ -51,11 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional):
     dofs_in_loops, iteration_set = isl_iteration_set(conditional)
     if dofs_in_condition.issubset(dofs_in_loops):
         symbol_names = ','.join(dofs_in_loops)
-        condition_str = remove_brackets(str(conditional.condition_expr))
+        condition_str = CustomSympyPrinter().doprint(conditional.condition_expr)
+        condition_str = remove_brackets(condition_str)
         condition_set = isl.BasicSet(f"{{ [{symbol_names}] : {condition_str} }}")

         if condition_set.is_empty():
             conditional.replace_by_false_block()
+            return

         intersection = iteration_set.intersect(condition_set)
         if intersection.is_empty():
...
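The function above decides whether a conditional can ever be true inside the surrounding loop nest by intersecting two integer sets. Roughly, the islpy calls are used like this (loop bounds and condition are made-up examples):

    import islpy as isl

    iteration_set = isl.BasicSet("{ [ctr_0] : 0 <= ctr_0 < 10 }")   # loop domain
    condition_set = isl.BasicSet("{ [ctr_0] : ctr_0 >= 10 }")       # the conditional

    assert not condition_set.is_empty()                        # satisfiable on its own...
    assert iteration_set.intersect(condition_set).is_empty()   # ...but never inside the loop

When the intersection is empty, the true-branch is dead code for every loop iteration, so the conditional can be replaced by its false block.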
File moved
+from collections import namedtuple, defaultdict
+from typing import Union
+
+import sympy as sp
+from sympy.codegen import Assignment
+
+from pystencils.simp import AssignmentCollection
+from pystencils import astnodes as ast, TypedSymbol
+from pystencils.field import Field
+from pystencils.node_collection import NodeCollection
+from pystencils.transformations import NestedScopes
+
+# TODO use this in Constraint Checker
+accepted_functions = [
+    sp.Pow,
+    sp.sqrt,
+    sp.log,
+    # TODO trigonometric functions (and whatever tests will fail)
+]
+
+
+class KernelConstraintsCheck:
+    # TODO: proper specification
+    # TODO: More checks :)
+    """Checks if the input to create_kernel is valid.
+
+    Tests the following conditions:
+
+    - SSA form for pure symbols:
+        - Every pure symbol may occur only once as the left-hand side of an assignment
+        - Every pure symbol that is read may not be written to later
+    - Independence / parallelization condition:
+        - A field that is written may only be read at exactly the same spatial position
+
+    (Pure symbols are symbols that are not Field.Accesses)
+    """
+    FieldAndIndex = namedtuple('FieldAndIndex', ['field', 'index'])
+
+    def __init__(self, check_independence_condition=True, check_double_write_condition=True):
+        self.scopes = NestedScopes()
+        self.field_reads = defaultdict(set)
+        self.field_writes = defaultdict(set)
+        self.fields_read = set()
+        self.check_independence_condition = check_independence_condition
+        self.check_double_write_condition = check_double_write_condition
+
+    def visit(self, obj):
+        if isinstance(obj, (AssignmentCollection, NodeCollection)):
+            [self.visit(e) for e in obj.all_assignments]
+        elif isinstance(obj, list) or isinstance(obj, tuple):
+            [self.visit(e) for e in obj]
+        elif isinstance(obj, (sp.Eq, ast.SympyAssignment, Assignment)):
+            self.process_assignment(obj)
+        elif isinstance(obj, ast.Conditional):
+            self.scopes.push()
+            # Disable the double-write check inside conditionals;
+            # it would be triggered by e.g. in-kernel boundaries
+            old_double_write = self.check_double_write_condition
+            old_independence_condition = self.check_independence_condition
+            self.check_double_write_condition = False
+            self.check_independence_condition = False
+            if obj.false_block:
+                self.visit(obj.false_block)
+            self.process_expression(obj.condition_expr)
+            self.process_expression(obj.true_block)
+            self.check_double_write_condition = old_double_write
+            self.check_independence_condition = old_independence_condition
+            self.scopes.pop()
+        elif isinstance(obj, ast.Block):
+            self.scopes.push()
+            [self.visit(e) for e in obj.args]
+            self.scopes.pop()
+        elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
+            pass
+        else:
+            raise ValueError(f'Invalid object in kernel {type(obj)}')
+
+    def process_assignment(self, assignment: Union[sp.Eq, ast.SympyAssignment, Assignment]):
+        # for the checks it is crucial to process the rhs before the lhs, to catch e.g. a = a + 1
+        self.process_expression(assignment.rhs)
+        self.process_lhs(assignment.lhs)
+
+    def process_expression(self, rhs):
+        # TODO constraint for accepted functions, see TODO above
+        self.update_accesses_rhs(rhs)
+        if isinstance(rhs, Field.Access):
+            self.fields_read.add(rhs.field)
+            self.fields_read.update(rhs.indirect_addressing_fields)
+        else:
+            for arg in rhs.args:
+                self.process_expression(arg)
+
+    @property
+    def fields_written(self):
+        """Return all fields that are written to, i.e. appear on the lhs of an assignment."""
+        return set(k.field for k, v in self.field_writes.items() if len(v))
+
+    def process_lhs(self, lhs: Union[Field.Access, TypedSymbol, sp.Symbol]):
+        assert isinstance(lhs, sp.Symbol)
+        self.update_accesses_lhs(lhs)
+
+    def update_accesses_lhs(self, lhs):
+        if isinstance(lhs, Field.Access):
+            fai = self.FieldAndIndex(lhs.field, lhs.index)
+            if self.check_double_write_condition and lhs.offsets in self.field_writes[fai]:
+                raise ValueError(f"Field {lhs.field.name} is written twice at the same location")
+
+            self.field_writes[fai].add(lhs.offsets)
+
+            if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
+                raise ValueError(
+                    f"Field {lhs.field.name} is written at two different locations")
+
+            if fai in self.field_reads:
+                reads = tuple(self.field_reads[fai])
+                if len(reads) > 1 or lhs.offsets != reads[0]:
+                    if self.check_independence_condition:
+                        raise ValueError(f"Field {lhs.field.name} is written at a different location than it was read. "
+                                         f"This means the resulting kernel would not be thread safe")
+        elif isinstance(lhs, sp.Symbol):
+            if self.scopes.is_defined_locally(lhs):
+                raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
+            if lhs in self.scopes.free_parameters:
+                raise ValueError(f"Symbol {lhs.name} is written after it has been read")
+            self.scopes.define_symbol(lhs)
+
+    def update_accesses_rhs(self, rhs):
+        if isinstance(rhs, Field.Access) and self.check_independence_condition:
+            fai = self.FieldAndIndex(rhs.field, rhs.index)
+            writes = self.field_writes[fai]
+            self.field_reads[fai].add(rhs.offsets)
+            for write_offset in writes:
+                assert len(writes) == 1
+                if write_offset != rhs.offsets:
+                    raise ValueError(f"Violation of loop independence condition. Field "
+                                     f"{rhs.field} is read at {rhs.offsets} and written at {write_offset}")
+            self.fields_read.add(rhs.field)
+        elif isinstance(rhs, sp.Symbol):
+            self.scopes.access_symbol(rhs)
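To make the two checked conditions concrete, here is a small illustrative example (field names are made up); the second assignment would make create_kernel raise a ValueError unless skip_independence_check is set:

    import pystencils as ps

    f, g = ps.fields('f, g: [2D]')

    # fine: g is written only at the centre cell, f is only read
    ok = ps.Assignment(g[0, 0], f[0, 1] + f[1, 0])

    # violates the independence condition: f is read at (0, 1) but written at (0, 0),
    # so parallel loop iterations would race on f
    bad = ps.Assignment(f[0, 0], f[0, 1] + 1)

    ps.create_kernel(ok)     # passes the constraint check
    # ps.create_kernel(bad)  # raises ValueError: kernel would not be thread safe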
 import ast
 import inspect
 import textwrap
-from typing import Callable, Union, List, Dict
+from typing import Callable, Union, List, Dict, Tuple

 import sympy as sp

 from pystencils.assignment import Assignment
 from pystencils.sympyextensions import SymbolCreator
+from pystencils.config import CreateKernelConfig

-__all__ = ['kernel']
+__all__ = ['kernel', 'kernel_config']

-def kernel(func: Callable[..., None], return_config: bool = False, **kwargs) -> Union[List[Assignment], Dict]:
-    """Decorator to simplify generation of pystencils Assignments.
-
-    Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
-    in the result list. Furthermore the meaning of the ternary inline 'if-else' changes meaning to denote a
-    sympy Piecewise.
-
-    The decorated function may not receive any arguments, with exception of an argument called 's' that specifies
-    a SymbolCreator()
-        func: the decorated function
-        return_config: Specify whether to return the list with assignments, or a dictionary containing additional settings
-            like func_name
-    Examples:
-        >>> import pystencils as ps
-        >>> @kernel
-        ... def my_kernel(s):
-        ...     f, g = ps.fields('f, g: [2D]')
-        ...     s.neighbors @= f[0,1] + f[1,0]
-        ...     g[0,0] @= s.neighbors + f[0,0] if f[0,0] > 0 else 0
-        >>> f, g = ps.fields('f, g: [2D]')
-        >>> assert my_kernel[0].rhs == f[0,1] + f[1,0]
-    """
+def _kernel(func: Callable[..., None], **kwargs) -> Tuple[List[Assignment], str]:
+    """Convenience function for the kernel decorators, to prevent code duplication.
+
+    Args:
+        func: decorated function
+        **kwargs: kwargs for the function
+    Returns:
+        assignments, function_name
+    """
     source = inspect.getsource(func)
     source = textwrap.dedent(source)
@@ -55,10 +42,74 @@ def kernel(func: Callable[..., None], return_config: bool = False, **kwargs) ->
     if 's' in args and 's' not in kwargs:
         kwargs['s'] = SymbolCreator()

     func(**kwargs)
-    if return_config:
-        return {'assignments': assignments, 'function_name': func.__name__}
-    else:
-        return assignments
+    return assignments, func.__name__
+
+
+def kernel(func: Callable[..., None], **kwargs) -> List[Assignment]:
+    """Decorator to simplify generation of pystencils Assignments.
+
+    Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
+    in the result list. Furthermore, the ternary inline 'if-else' changes its meaning to denote a
+    sympy Piecewise.
+
+    The decorated function may not receive any arguments, with the exception of an argument called 's' that
+    specifies a SymbolCreator()
+
+    Args:
+        func: decorated function
+        **kwargs: kwargs for the function
+
+    Examples:
+        >>> import pystencils as ps
+        >>> @kernel
+        ... def my_kernel(s):
+        ...     f, g = ps.fields('f, g: [2D]')
+        ...     s.neighbors @= f[0,1] + f[1,0]
+        ...     g[0,0] @= s.neighbors + f[0,0] if f[0,0] > 0 else 0
+        >>> f, g = ps.fields('f, g: [2D]')
+        >>> assert my_kernel[0].rhs == f[0,1] + f[1,0]
+    """
+    assignments, _ = _kernel(func, **kwargs)
+    return assignments
+
+
+def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]:
+    """Decorator to simplify generation of pystencils Assignments, which takes a configuration
+    and updates the function name accordingly.
+
+    Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
+    in the result list. Furthermore, the ternary inline 'if-else' changes its meaning to denote a
+    sympy Piecewise.
+
+    The decorated function may not receive any arguments, with the exception of an argument called 's' that
+    specifies a SymbolCreator()
+
+    Args:
+        config: a CreateKernelConfig; its function_name is updated to the name of the decorated function
+
+    Returns:
+        decorator with config
+
+    Examples:
+        >>> import pystencils as ps
+        >>> kernel_configuration = ps.CreateKernelConfig()
+        >>> @kernel_config(kernel_configuration)
+        ... def my_kernel(s):
+        ...     src, dst = ps.fields('src, dst: [2D]')
+        ...     s.neighbors @= src[0, 1] + src[1, 0]
+        ...     dst[0, 0] @= s.neighbors + src[0, 0] if src[0, 0] > 0 else 0
+        >>> f, g = ps.fields('src, dst: [2D]')
+        >>> assert my_kernel['assignments'][0].rhs == f[0, 1] + f[1, 0]
+    """
+    def decorator(func: Callable[..., None]) -> Union[List[Assignment], Dict]:
+        """
+        Args:
+            func: decorated function
+        Returns:
+            Dict for unpacking into create_kernel
+        """
+        assignments, func_name = _kernel(func, **kwargs)
+        config.function_name = func_name
+        return {'assignments': assignments, 'config': config}
+    return decorator


 # noinspection PyMethodMayBeStatic
...
+import functools
 import itertools
 import warnings

-from dataclasses import dataclass, field
-from types import MappingProxyType
-from typing import Callable, Union, List, Dict, Tuple, Any
+from typing import Union, List

 import sympy as sp

-from pystencils.assignment import Assignment
+from pystencils.config import CreateKernelConfig
+from pystencils.assignment import Assignment, AddAugmentedAssignment
-from pystencils.astnodes import Block, Conditional, LoopOverCoordinate, SympyAssignment
+from pystencils.astnodes import Node, Block, Conditional, LoopOverCoordinate, SympyAssignment
 from pystencils.cpu.vectorization import vectorize
 from pystencils.enums import Target, Backend
 from pystencils.field import Field, FieldType
-from pystencils.gpucuda.indexing import indexing_creator_from_params
+from pystencils.node_collection import NodeCollection
 from pystencils.simp.assignment_collection import AssignmentCollection
+from pystencils.kernel_contrains_check import KernelConstraintsCheck
+from pystencils.simplificationfactory import create_simplification_strategy
 from pystencils.stencil import direction_string_to_offset, inverse_direction_string
 from pystencils.transformations import (
     loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)

-@dataclass
-class CreateKernelConfig:
-    """
-    target: One of Target's enums
-    backend: One of Backend's enums
-    function_name: name of the generated function - only important if generated code is written out
-    data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
-        to type
-    iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer
-        part of the field is iterated over
-    ghost_layers: a single integer specifies the ghost layer count at all borders, can also be a sequence of
-        pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
-        If left to default, the number of ghost layers is determined automatically.
-    skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
-        periodicity kernel, that access the field outside the iteration bounds. Use with care!
-    cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
-    cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
-        for documentation of these parameters see vectorize function. Example:
-        '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
-    cpu_blocking: a tuple of block sizes or None if no blocking should be applied
-    omp_single_loop: if OpenMP is active: whether multiple outer loops are permitted
-    gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
-    gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
-        e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
-    use_textures_for_interpolation:
-    cpu_prepend_optimizations: list of extra optimizations to perform first on the AST
-    use_auto_for_assignments:
-    opencl_queue:
-    opencl_ctx:
-    index_fields: list of index fields, i.e. 1D fields with struct data type. If not None, `create_index_kernel`
-        instead of `create_domain_kernel` is used.
-    coordinate_names: name of the coordinate fields in the struct data type
-    """
-    target: Target = Target.CPU
-    backend: Backend = None
-    function_name: str = 'kernel'
-    data_type: Union[str, dict] = 'double'
-    iteration_slice: Tuple = None
-    ghost_layers: Union[bool, int, List[Tuple[int]]] = None
-    skip_independence_check: bool = False
-    cpu_openmp: bool = False
-    cpu_vectorize_info: Dict = None
-    cpu_blocking: Tuple[int] = None
-    omp_single_loop: bool = True
-    gpu_indexing: str = 'block'
-    gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
-    use_textures_for_interpolation: bool = True
-    cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
-    use_auto_for_assignments: bool = False
-    opencl_queue: Any = None
-    opencl_ctx: Any = None
-    index_fields: List[Field] = None
-    coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
-
-    def __post_init__(self):
-        # ---- Legacy parameters
-        if isinstance(self.target, str):
-            new_target = Target[self.target.upper()]
-            warnings.warn(f'Target "{self.target}" as str is deprecated. Use {new_target} instead',
-                          category=DeprecationWarning)
-            self.target = new_target
-        # ---- Auto Backend
-        if not self.backend:
-            if self.target == Target.CPU:
-                self.backend = Backend.C
-            elif self.target == Target.GPU:
-                self.backend = Backend.CUDA
-            elif self.target == Target.OPENCL:
-                self.backend = Backend.OPENCL
-            else:
-                raise NotImplementedError(f'Target {self.target} has no default backend')
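The dataclass removed here lives on as pystencils.config.CreateKernelConfig (see the new import at the top of the file). A typical construction, with parameter values chosen purely for illustration:

    import pystencils as ps

    src, dst = ps.fields('src, dst: [2D]')
    update = ps.Assignment(dst[0, 0],
                           0.25 * (src[0, 1] + src[0, -1] + src[1, 0] + src[-1, 0]))

    config = ps.CreateKernelConfig(
        target=ps.Target.CPU,
        cpu_openmp=True,     # or an integer thread count
        cpu_vectorize_info={'instruction_set': 'avx',
                            'assume_aligned': False,
                            'nontemporal': False},
    )
    kernel = ps.create_kernel(update, config=config).compile()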
def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCollection, List[Conditional]], *,
config: CreateKernelConfig = None, **kwargs): config: CreateKernelConfig = None, **kwargs):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a list of update equations.
...@@ -132,9 +62,24 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol ...@@ -132,9 +62,24 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol
setattr(config, k, v) setattr(config, k, v)
# ---- Normalizing parameters # ---- Normalizing parameters
if isinstance(assignments, Assignment): if isinstance(assignments, (Assignment, AddAugmentedAssignment)):
assignments = [assignments] assignments = [assignments]
assert assignments, "Assignments must not be empty!" assert assignments, "Assignments must not be empty!"
if isinstance(assignments, list):
assignments = NodeCollection(assignments)
elif isinstance(assignments, AssignmentCollection):
# TODO Markus check and doku
# --- applying first default simplifications
try:
if config.default_assignment_simplifications:
simplification = create_simplification_strategy()
assignments = simplification(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
f"AssignmentCollection due to the following problem :{e}")
simplification_hints = assignments.simplification_hints
assignments = NodeCollection.from_assignment_collection(assignments)
assignments.simplification_hints = simplification_hints
if config.index_fields: if config.index_fields:
return create_indexed_kernel(assignments, config=config) return create_indexed_kernel(assignments, config=config)
...@@ -142,12 +87,15 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol ...@@ -142,12 +87,15 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol
return create_domain_kernel(assignments, config=config) return create_domain_kernel(assignments, config=config)
def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelConfig): def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a NodeCollection.
Note that `create_domain_kernel` is a lower level function which shoul be accessed by not providing `index_fields`
to create_kernel
Args: Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection` assignments: `pystencils.node_collection.NodeCollection` containing all assignements and nodes to be processed
config: CreateKernelConfig which includes the needed configuration config: CreateKernelConfig which includes the needed configuration
Returns: Returns:
...@@ -157,10 +105,12 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -157,10 +105,12 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
Example: Example:
>>> import pystencils as ps >>> import pystencils as ps
>>> import numpy as np >>> import numpy as np
>>> from pystencils.kernelcreation import create_domain_kernel
>>> from pystencils.node_collection import NodeCollection
>>> s, d = ps.fields('s, d: [2D]') >>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0]) >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> config = ps.CreateKernelConfig(cpu_openmp=True) >>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True)
>>> kernel_ast = ps.kernelcreation.create_domain_kernel([assignment], config=config) >>> kernel_ast = create_domain_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile() >>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5]) >>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5])) >>> kernel(d=d_arr, s=np.ones([5, 5]))
...@@ -171,22 +121,25 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -171,22 +121,25 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
[0., 4., 4., 4., 0.], [0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]]) [0., 0., 0., 0., 0.]])
""" """
# ---- Normalizing parameters # --- eval
split_groups = () assignments.evaluate_terms()
if isinstance(assignments, AssignmentCollection):
if 'split_groups' in assignments.simplification_hints: # FUTURE WORK from here we shouldn't NEED sympy
split_groups = assignments.simplification_hints['split_groups'] # --- check constraints
assignments = assignments.all_assignments check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
check_double_write_condition=not config.allow_double_writes)
check.visit(assignments)
assignments.bound_fields = check.fields_written
assignments.rhs_fields = check.fields_read
# ---- Creating ast # ---- Creating ast
ast = None ast = None
if config.target == Target.CPU: if config.target == Target.CPU:
if config.backend == Backend.C: if config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_kernel from pystencils.cpu import add_openmp, create_kernel
ast = create_kernel(assignments, function_name=config.function_name, type_info=config.data_type, ast = create_kernel(assignments, config=config)
split_groups=split_groups,
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check)
for optimization in config.cpu_prepend_optimizations: for optimization in config.cpu_prepend_optimizations:
optimization(ast) optimization(ast)
omp_collapse = None omp_collapse = None
...@@ -208,26 +161,10 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -208,26 +161,10 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
raise ValueError("Blocking cannot be combined with cacheline-zeroing") raise ValueError("Blocking cannot be combined with cacheline-zeroing")
else: else:
raise ValueError("Invalid value for cpu_vectorize_info") raise ValueError("Invalid value for cpu_vectorize_info")
elif config.backend == Backend.LLVM: elif config.target == Target.GPU:
from pystencils.llvm import create_kernel if config.backend == Backend.CUDA:
ast = create_kernel(assignments, function_name=config.function_name, type_info=config.data_type, from pystencils.gpu import create_cuda_kernel
split_groups=split_groups, iteration_slice=config.iteration_slice, ast = create_cuda_kernel(assignments, config=config)
ghost_layers=config.ghost_layers)
elif config.target == Target.GPU or config.target == Target.OPENCL:
if config.backend == Backend.CUDA or config.backend == Backend.OPENCL:
from pystencils.gpucuda import create_cuda_kernel
ast = create_cuda_kernel(assignments, function_name=config.function_name, type_info=config.data_type,
indexing_creator=indexing_creator_from_params(config.gpu_indexing,
config.gpu_indexing_params),
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check,
use_textures_for_interpolation=config.use_textures_for_interpolation)
if config.backend == Backend.OPENCL:
from pystencils.opencl.opencljit import make_python_function
ast._backend = config.backend
ast.compile = functools.partial(make_python_function, ast, config.opencl_queue, config.opencl_ctx)
ast._target = config.target
ast._backend = config.backend
if not ast: if not ast:
raise NotImplementedError( raise NotImplementedError(
...@@ -240,7 +177,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -240,7 +177,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
return ast return ast
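Both kernel builders now run `KernelConstraintsCheck` up front, so racy update rules are rejected unless the corresponding config flags opt out. A sketch of the failure mode, assuming the check raises a `ValueError` for a field written at two different offsets:

import pystencils as ps

d, s = ps.fields('d, s: double[2D]')

# Writing the same field at two different offsets is a double write:
racy = [ps.Assignment(d[0, 0], s[0, 0]),
        ps.Assignment(d[1, 0], s[0, 0])]
try:
    ps.create_kernel(racy)
except ValueError as err:
    print('rejected:', err)

# Opting out is possible but shifts responsibility to the caller:
cfg = ps.CreateKernelConfig(allow_double_writes=True, skip_independence_check=True)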
def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernelConfig): def create_indexed_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
""" """
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling. coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
...@@ -250,8 +187,11 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -250,8 +187,11 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
'coordinate_names' parameter. The struct can also have other fields that can be read and written in the kernel, for 'coordinate_names' parameter. The struct can also have other fields that can be read and written in the kernel, for
example boundary parameters. example boundary parameters.
Note that `create_indexed_kernel` is a lower-level function which should be accessed by providing `index_fields`
to `create_kernel`
Args: Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection` assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration config: CreateKernelConfig which includes the needed configuration
Returns: Returns:
...@@ -260,7 +200,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -260,7 +200,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
Example: Example:
>>> import pystencils as ps >>> import pystencils as ps
>>> from pystencils.node_collection import NodeCollection
>>> import numpy as np >>> import numpy as np
>>> from pystencils.kernelcreation import create_indexed_kernel
>>> >>>
>>> # Index field stores the indices of the cell to visit together with optional values >>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)]) >>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
...@@ -269,9 +211,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -269,9 +211,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
>>> >>>
>>> # Additional values stored in index field can be accessed in the kernel as well >>> # Additional values stored in index field can be accessed in the kernel as well
>>> s, d = ps.fields('s, d: [2D]') >>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val')) >>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y')) >>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y'))
>>> kernel_ast = ps.create_indexed_kernel([assignment], config=config) >>> kernel_ast = create_indexed_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile() >>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5]) >>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr) >>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
...@@ -281,30 +223,30 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -281,30 +223,30 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
[0. , 0. , 4.2, 0. , 0. ], [0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ], [0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]]) [0. , 0. , 0. , 0. , 0. ]])
""" """
# --- eval
assignments.evaluate_terms()
# FUTURE WORK from here we shouldn't NEED sympy
# --- check constraints
check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
check_double_write_condition=not config.allow_double_writes)
check.visit(assignments)
assignments.bound_fields = check.fields_written
assignments.rhs_fields = check.fields_read
ast = None ast = None
if config.target == Target.CPU and config.backend == Backend.C: if config.target == Target.CPU and config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_indexed_kernel from pystencils.cpu import add_openmp, create_indexed_kernel
ast = create_indexed_kernel(assignments, index_fields=config.index_fields, type_info=config.data_type, ast = create_indexed_kernel(assignments, config=config)
coordinate_names=config.coordinate_names)
if config.cpu_openmp: if config.cpu_openmp:
add_openmp(ast, num_threads=config.cpu_openmp) add_openmp(ast, num_threads=config.cpu_openmp)
elif config.target == Target.GPU or config.target == Target.OPENCL: elif config.target == Target.GPU:
if config.backend == Backend.CUDA or config.backend == Backend.OPENCL: if config.backend == Backend.CUDA:
from pystencils.gpucuda import created_indexed_cuda_kernel from pystencils.gpu import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params) ast = created_indexed_cuda_kernel(assignments, config=config)
ast = created_indexed_cuda_kernel(assignments,
config.index_fields,
type_info=config.data_type,
coordinate_names=config.coordinate_names,
indexing_creator=idx_creator,
use_textures_for_interpolation=config.use_textures_for_interpolation)
if config.backend == Backend.OPENCL:
from pystencils.opencl.opencljit import make_python_function
ast._backend = config.backend
ast.compile = functools.partial(make_python_function, ast, config.opencl_queue, config.opencl_ctx)
ast._target = config.target
ast._backend = config.backend
if not ast: if not ast:
raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}') raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}')
...@@ -333,7 +275,16 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -333,7 +275,16 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
Returns: Returns:
AST, see `create_kernel` AST, see `create_kernel`
""" """
assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs and 'omp_single_loop' not in kwargs # TODO: Add documentation like in the other kernels
if 'ghost_layers' in kwargs:
assert kwargs['ghost_layers'] is None
del kwargs['ghost_layers']
if 'iteration_slice' in kwargs:
assert kwargs['iteration_slice'] is None
del kwargs['iteration_slice']
if 'omp_single_loop' in kwargs:
assert kwargs['omp_single_loop'] is False
del kwargs['omp_single_loop']
if isinstance(assignments, AssignmentCollection): if isinstance(assignments, AssignmentCollection):
subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments
...@@ -411,7 +362,6 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -411,7 +362,6 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
inner_assignment = [] inner_assignment = []
for assignment in assignments: for assignment in assignments:
direction = stencil[assignment.lhs.index[0]]
inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs)) inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs))
last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]), last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]),
Block(inner_assignment), outer_assignment) Block(inner_assignment), outer_assignment)
...@@ -419,11 +369,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -419,11 +369,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
[SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \ [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
[last_conditional] [last_conditional]
if target == Target.CPU: config = CreateKernelConfig(target=target, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
from pystencils.cpu import create_kernel as create_kernel_cpu ast = create_kernel(final_assignments, config=config)
ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
else:
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
return ast return ast
for assignment in assignments: for assignment in assignments:
...@@ -437,6 +384,11 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -437,6 +384,11 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers]) remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers])
prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional), prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional),
move_constants_before_loop] move_constants_before_loop]
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, omp_single_loop=False, if 'cpu_prepend_optimizations' in kwargs:
cpu_prepend_optimizations=prepend_optimizations, **kwargs) prepend_optimizations += kwargs['cpu_prepend_optimizations']
del kwargs['cpu_prepend_optimizations']
config = CreateKernelConfig(ghost_layers=ghost_layers, target=target, omp_single_loop=False,
cpu_prepend_optimizations=prepend_optimizations, **kwargs)
ast = create_kernel(final_assignments, config=config)
return ast return ast
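For reference, a hedged usage sketch of `create_staggered_kernel` (the staggered-field setup and direction names are assumptions about the staggered-field API, not taken from this diff):

import pystencils as ps

c = ps.fields('c: double[2D]')
flux = ps.fields('flux(2): double[2D]', field_type=ps.FieldType.STAGGERED)

# One assignment per staggered direction; the lhs must be a staggered access.
assignments = [ps.Assignment(flux.staggered_access('W'), c[0, 0] - c[-1, 0]),
               ps.Assignment(flux.staggered_access('S'), c[0, 0] - c[0, -1])]
ast = ps.create_staggered_kernel(assignments, target=ps.Target.CPU)
kernel = ast.compile()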
from typing import Any, Dict, List, Union, Optional, Set
import sympy
import sympy as sp
from sympy.codegen.rewriting import ReplaceOptim, optimize
from pystencils.assignment import Assignment, AddAugmentedAssignment
import pystencils.astnodes as ast
from pystencils.backends.cbackend import CustomCodeNode
from pystencils.functions import DivFunc
from pystencils.simp import AssignmentCollection
from pystencils.typing import FieldPointerSymbol
class NodeCollection:
def __init__(self, assignments: List[Union[ast.Node, Assignment]],
simplification_hints: Optional[Dict[str, Any]] = None,
bound_fields: Set[sp.Symbol] = None, rhs_fields: Set[sp.Symbol] = None):
def visit(obj):
if isinstance(obj, (list, tuple)):
return [visit(e) for e in obj]
if isinstance(obj, Assignment):
if isinstance(obj.lhs, FieldPointerSymbol):
return ast.SympyAssignment(obj.lhs, obj.rhs, is_const=obj.lhs.dtype.const)
return ast.SympyAssignment(obj.lhs, obj.rhs)
elif isinstance(obj, AddAugmentedAssignment):
return ast.SympyAssignment(obj.lhs, obj.lhs + obj.rhs)
elif isinstance(obj, ast.SympyAssignment):
return obj
elif isinstance(obj, ast.Conditional):
true_block = visit(obj.true_block)
false_block = None if obj.false_block is None else visit(obj.false_block)
return ast.Conditional(obj.condition_expr, true_block=true_block, false_block=false_block)
elif isinstance(obj, ast.Block):
return ast.Block([visit(e) for e in obj.args])
elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
return obj
else:
raise ValueError("Invalid object in the List of Assignments " + str(type(obj)))
self.all_assignments = visit(assignments)
self.simplification_hints = simplification_hints if simplification_hints else {}
self.bound_fields = bound_fields if bound_fields else {}
self.rhs_fields = rhs_fields if rhs_fields else {}
@staticmethod
def from_assignment_collection(assignment_collection: AssignmentCollection):
return NodeCollection(assignments=assignment_collection.all_assignments,
simplification_hints=assignment_collection.simplification_hints,
bound_fields=assignment_collection.bound_fields,
rhs_fields=assignment_collection.rhs_fields)
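Both construction paths end up in the same visitor, which wraps plain assignments into `SympyAssignment` nodes and expands augmented assignments. A short sketch of direct construction (symbol and field names are illustrative):

import sympy as sp
import pystencils as ps
from pystencils.node_collection import NodeCollection

x = sp.Symbol('x')
f = ps.fields('f: double[2D]')

# Plain assignments are wrapped into SympyAssignment nodes; an
# AddAugmentedAssignment `f += x` would be expanded to `f = f + x`.
nodes = NodeCollection([ps.Assignment(f[0, 0], x + 1)])
print(nodes.all_assignments)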
def evaluate_terms(self):
evaluate_constant_terms = ReplaceOptim(
lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
lambda p: p.evalf()
)
evaluate_pow = ReplaceOptim(
lambda e: e.is_Pow and e.exp.is_Integer and abs(e.exp) <= 8,
lambda p: sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else
(DivFunc(sp.Integer(1), p.base) if p.exp == -1 else
DivFunc(sp.Integer(1), sp.UnevaluatedExpr(sp.Mul(*([p.base] * -p.exp), evaluate=False))))
)
sympy_optimisations = [evaluate_constant_terms, evaluate_pow]
def visitor(node):
if isinstance(node, CustomCodeNode):
return node
elif isinstance(node, ast.Block):
return node.func([visitor(child) for child in node.args])
elif isinstance(node, ast.SympyAssignment):
new_lhs = visitor(node.lhs)
new_rhs = visitor(node.rhs)
return node.func(new_lhs, new_rhs, node.is_const, node.use_auto)
elif isinstance(node, ast.Node):
return node.func(*[visitor(child) for child in node.args])
elif isinstance(node, sympy.Basic):
return optimize(node, sympy_optimisations)
else:
raise NotImplementedError(f'{node} {type(node)} has no valid visitor')
self.all_assignments = [visitor(assignment) for assignment in self.all_assignments]
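`evaluate_terms` folds non-integer constants and expands small integer powers (|exponent| <= 8) into repeated multiplication, or a `DivFunc` for negative exponents. A sketch of the expected rewriting, under the assumption that the optimizations behave as defined above:

import sympy as sp
import pystencils as ps
from pystencils.node_collection import NodeCollection

x = sp.Symbol('x')
f = ps.fields('f: double[2D]')

nc = NodeCollection([ps.Assignment(f[0, 0], x**3 + x**-2 + sp.pi)])
nc.evaluate_terms()
# Expected (sketch): x**3 -> x*x*x, x**-2 -> DivFunc(1, x*x),
# and pi is evaluated to a floating-point constant.
print(nc.all_assignments[0])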
File moved
...@@ -2,10 +2,9 @@ import copy ...@@ -2,10 +2,9 @@ import copy
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.data_types import TypedSymbol, cast_func from pystencils.typing import TypedSymbol, CastFunc
from pystencils.astnodes import LoopOverCoordinate from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
from pystencils.enums import Backend
from pystencils.sympyextensions import fast_subs from pystencils.sympyextensions import fast_subs
...@@ -48,14 +47,13 @@ class RNGBase(CustomCodeNode): ...@@ -48,14 +47,13 @@ class RNGBase(CustomCodeNode):
def get_code(self, dialect, vector_instruction_set, print_arg): def get_code(self, dialect, vector_instruction_set, print_arg):
code = "\n" code = "\n"
for r in self.result_symbols: for r in self.result_symbols:
if vector_instruction_set and not self.args[1].atoms(cast_func): if vector_instruction_set and not self.args[1].atoms(CastFunc):
# this vector RNG has become scalar through substitution # this vector RNG has become scalar through substitution
code += f"{r.dtype} {r.name};\n" code += f"{r.dtype} {r.name};\n"
else: else:
code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \ code += f"{vector_instruction_set[r.dtype.c_name] if vector_instruction_set else r.dtype} " + \
f"{r.name};\n" f"{r.name};\n"
args = [print_arg(a) for a in self.args] + \ args = [print_arg(a) for a in self.args] + ['' + r.name for r in self.result_symbols]
[('&' if dialect == Backend.OPENCL else '') + r.name for r in self.result_symbols]
code += (self._name + "(" + ", ".join(args) + ");\n") code += (self._name + "(" + ", ".join(args) + ");\n")
return code return code
...@@ -63,6 +61,15 @@ class RNGBase(CustomCodeNode): ...@@ -63,6 +61,15 @@ class RNGBase(CustomCodeNode):
return ", ".join([str(s) for s in self.result_symbols]) + " \\leftarrow " + \ return ", ".join([str(s) for s in self.result_symbols]) + " \\leftarrow " + \
self._name.capitalize() + "_RNG(" + ", ".join([str(a) for a in self.args]) + ")" self._name.capitalize() + "_RNG(" + ", ".join([str(a) for a in self.args]) + ")"
def _hashable_content(self):
return (self._name, *self.result_symbols, *self.args)
def __eq__(self, other):
return type(self) is type(other) and self._hashable_content() == other._hashable_content()
def __hash__(self):
return hash(self._hashable_content())
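The `_hashable_content` addition follows sympy's convention of deriving `__eq__` and `__hash__` from a single tuple, which keeps the two consistent. A generic sketch of the pattern (not the pystencils classes themselves):

class HashableNode:
    def __init__(self, name, *args):
        self._name, self._args = name, args

    def _hashable_content(self):
        # One tuple drives both __eq__ and __hash__,
        # so a == b implies hash(a) == hash(b).
        return (self._name, *self._args)

    def __eq__(self, other):
        return type(self) is type(other) and \
            self._hashable_content() == other._hashable_content()

    def __hash__(self):
        return hash(self._hashable_content())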
class PhiloxTwoDoubles(RNGBase): class PhiloxTwoDoubles(RNGBase):
_name = "philox_double2" _name = "philox_double2"
......