Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 779 additions and 240 deletions
/*
Copyright 2023, Markus Holzer.
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#ifdef __HIPCC_RTC__
typedef __hip_uint8_t uint8_t;
typedef __hip_int8_t int8_t;
typedef __hip_uint16_t uint16_t;
typedef __hip_int16_t int16_t;
#endif
/*
Copyright 2023, Markus Holzer.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
/// Clang version must be 15 or higher for x86 half precision support.
/// GCC version must be 12 or higher for x86 half precision support.
/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
/*
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once #pragma once
#if defined(__SSE2__) || defined(_MSC_VER) #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v) QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu32_ps(v); return _mm_cvtepu32_ps(v);
#else #else
__m128i v2 = _mm_srli_epi32(v, 1); __m128i v2 = _mm_srli_epi32(v, 1);
...@@ -28,13 +59,13 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _ ...@@ -28,13 +59,13 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
} }
#endif #endif
#if defined(__SSE4_1__) || defined(_MSC_VER) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) #if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math"))) __attribute__((optimize("no-associative-math")))
#endif #endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x) QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu64_pd(x); return _mm_cvtepu64_pd(x);
#elif defined(__clang__) #elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d); return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
...@@ -69,7 +100,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo) ...@@ -69,7 +100,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu32_ps(v); return _mm256_cvtepu32_ps(v);
#else #else
__m256i v2 = _mm256_srli_epi32(v, 1); __m256i v2 = _mm256_srli_epi32(v, 1);
...@@ -80,12 +111,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v) ...@@ -80,12 +111,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
#endif #endif
} }
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__) #if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math"))) __attribute__((optimize("no-associative-math")))
#endif #endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x) QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{ {
#ifdef __AVX512VL__ #if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu64_pd(x); return _mm256_cvtepu64_pd(x);
#elif defined(__clang__) #elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d); return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
...@@ -99,7 +130,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x) ...@@ -99,7 +130,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
} }
#endif #endif
#ifdef __AVX512F__ #if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a) QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{ {
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3); return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
......
#ifndef __OPENCL_VERSION__ /*
#if defined(__SSE2__) || defined(_MSC_VER) Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2024, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2 #include <emmintrin.h> // SSE2
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
#include <immintrin.h> // AVX* #include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || defined(_MSC_VER) #elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4 #include <smmintrin.h> // SSE4
#ifdef __FMA__ #ifdef __FMA__
#include <immintrin.h> // FMA #include <immintrin.h> // FMA
#endif #endif
#endif #endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON #ifdef __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#ifdef __ARM_FEATURE_SVE #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
#include <arm_sve.h> #include <arm_sve.h>
#endif #endif
...@@ -34,7 +70,7 @@ ...@@ -34,7 +70,7 @@
#endif #endif
#endif #endif
#ifdef __CUDA_ARCH__ #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
#define QUALIFIERS static __forceinline__ __device__ #define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__) #elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline #define QUALIFIERS static inline
...@@ -43,6 +79,12 @@ ...@@ -43,6 +79,12 @@
#include "myintrin.h" #include "myintrin.h"
#endif #endif
#if defined(__ARM_FEATURE_SME)
#define SVE_QUALIFIERS __attribute__((arm_streaming_compatible)) QUALIFIERS
#else
#define SVE_QUALIFIERS QUALIFIERS
#endif
#define PHILOX_W32_0 (0x9E3779B9) #define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85) #define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53) #define PHILOX_M4x32_0 (0xD2511F53)
...@@ -55,7 +97,9 @@ ...@@ -55,7 +97,9 @@
typedef uint32_t uint32; typedef uint32_t uint32;
typedef uint64_t uint64; typedef uint64_t uint64;
#else #else
#ifndef __HIPCC_RTC__
#include <cstdint> #include <cstdint>
#endif
typedef std::uint32_t uint32; typedef std::uint32_t uint32;
typedef std::uint64_t uint64; typedef std::uint64_t uint64;
#endif #endif
...@@ -63,7 +107,7 @@ typedef std::uint64_t uint64; ...@@ -63,7 +107,7 @@ typedef std::uint64_t uint64;
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE) #elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
typedef svfloat32_t svfloat32_st; typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st; typedef svfloat64_t svfloat64_st;
#endif #endif
...@@ -71,7 +115,7 @@ typedef svfloat64_t svfloat64_st; ...@@ -71,7 +115,7 @@ typedef svfloat64_t svfloat64_st;
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip) QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{ {
#ifndef __CUDA_ARCH__ #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
// host code // host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__)) #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b); *hip = __mulhwu(a,b);
...@@ -182,8 +226,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3 ...@@ -182,8 +226,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
#endif #endif
} }
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) #if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
#if defined(__SSE4_1__) || defined(_MSC_VER) #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key) QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{ {
__m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0)); __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
...@@ -665,12 +709,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ...@@ -665,12 +709,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
#ifndef _MSC_VER
QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4) float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{ {
philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
#endif
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
...@@ -695,6 +741,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ...@@ -695,6 +741,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
} }
#ifndef _MSC_VER
QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3, QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2) float64x2_t & rnd1, float64x2_t & rnd2)
...@@ -702,10 +749,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ...@@ -702,10 +749,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
} }
#endif #endif
#endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key) SVE_QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
{ {
svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0)); svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
...@@ -718,14 +766,14 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key) ...@@ -718,14 +766,14 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
ctr = svset4_u32(ctr, 3, lo0); ctr = svset4_u32(ctr, 3, lo0);
} }
QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key) SVE_QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
{ {
key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0))); key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1))); key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
} }
template<bool high> template<bool high>
QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y) SVE_QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
{ {
// convert 32 to 64 bit // convert 32 to 64 bit
if (high) if (high)
...@@ -752,9 +800,9 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y) ...@@ -752,9 +800,9 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
} }
QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3, SVE_QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4) svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{ {
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1)); svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3); svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
...@@ -782,9 +830,9 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, ...@@ -782,9 +830,9 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
} }
QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3, SVE_QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi) svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{ {
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1)); svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3); svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
...@@ -805,9 +853,9 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2 ...@@ -805,9 +853,9 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2
rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3)); rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
} }
QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4) svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{ {
svuint32_t ctr0v = svdup_u32(ctr0); svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2); svuint32_t ctr2v = svdup_u32(ctr2);
...@@ -816,16 +864,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ...@@ -816,16 +864,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4) svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{ {
philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4); philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
} }
QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi) svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{ {
svuint32_t ctr0v = svdup_u32(ctr0); svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2); svuint32_t ctr2v = svdup_u32(ctr2);
...@@ -834,9 +882,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ...@@ -834,9 +882,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
} }
QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2) svfloat64_st & rnd1, svfloat64_st & rnd2)
{ {
svuint32_t ctr0v = svdup_u32(ctr0); svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2); svuint32_t ctr2v = svdup_u32(ctr2);
...@@ -846,9 +894,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ...@@ -846,9 +894,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore); philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
} }
QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3, SVE_QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2) svfloat64_st & rnd1, svfloat64_st & rnd2)
{ {
philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2); philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
} }
...@@ -1174,7 +1222,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct ...@@ -1174,7 +1222,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct
} }
#endif #endif
#ifdef __AVX512F__ #if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key) QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key)
{ {
__m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0)); __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
......
/*
Copyright 2021, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <altivec.h> #include <altivec.h>
#undef vector #undef vector
#undef bool #undef bool
......
/*
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Zero one cacheline starting at address p.
///
/// Uses the RISC-V Zicboz extension's `cbo.zero` instruction when the
/// compiler targets it; on all other targets this is a deliberate no-op
/// (callers must check cachelineSize() != SIZE_MAX before relying on it).
inline void cachelineZero(void * p) {
#ifdef __riscv_zicboz
   __asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
#else
   (void) p; // suppress unused-parameter warning on non-Zicboz targets
#endif
}
/// Probe the cacheline size in bytes.
///
/// Fills a buffer with 0xff, zeroes one cacheline inside it via
/// cachelineZero(), and measures how many bytes were cleared.
/// Returns SIZE_MAX when the probe fails (nothing was zeroed, bytes
/// before the probe point were zeroed, or the whole probe window was
/// zeroed). Intended to be called once via cachelineSize().
inline size_t _cachelineSize() {
   // 2 MiB probe buffer. Allocated on the heap: the previous stack
   // allocation of this size overflows the default stack on some
   // platforms (e.g. 1 MiB on Windows).
   const size_t max_size = 0x100000;
   uint8_t * data = new uint8_t[2*max_size];

   // fill with ones so zeroed bytes are detectable
   for (size_t i = 0; i < 2*max_size; ++i) {
      data[i] = 0xff;
   }

   // align the probe point to max_size so that a full probe window
   // fits both before and after it
   size_t offset = max_size - ((uintptr_t) data) % max_size;

   // zero a cacheline
   cachelineZero((void*) (data + offset));

   size_t result = SIZE_MAX;
   // valid probe: at least one byte was zeroed at the probe point and
   // nothing before the pointer was touched
   if (data[offset] == 0 && data[offset-1] != 0) {
      // find the first byte that was NOT zeroed; its index is the size.
      // If the loop completes, too much was zeroed -> result stays SIZE_MAX.
      for (size_t size = 1; size < max_size; ++size) {
         if (data[offset + size] != 0) {
            result = size;
            break;
         }
      }
   }

   delete[] data;
   return result;
}
/// Return the cacheline size in bytes, probing it on first use.
///
/// On targets with the RISC-V Zicboz extension the size is measured
/// once and cached; everywhere else the probe is unavailable and
/// SIZE_MAX is returned.
inline size_t cachelineSize() {
#ifdef __riscv_zicboz
   // probe exactly once; later calls reuse the cached value
   static const size_t probed_size = _cachelineSize();
   return probed_size;
#else
   return SIZE_MAX;
#endif
}
# TODO #47: move this into a dedicated functions module
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.data_types import cast_func, collate_types, create_type, get_type_of_expression from pystencils.typing import CastFunc, collate_types, create_type, get_type_of_expression
from pystencils.sympyextensions import is_integer_sequence from pystencils.sympyextensions import is_integer_sequence
...@@ -12,9 +13,9 @@ class IntegerFunctionTwoArgsMixIn(sp.Function): ...@@ -12,9 +13,9 @@ class IntegerFunctionTwoArgsMixIn(sp.Function):
args = [] args = []
for a in (arg1, arg2): for a in (arg1, arg2):
if isinstance(a, sp.Number) or isinstance(a, int): if isinstance(a, sp.Number) or isinstance(a, int):
args.append(cast_func(a, create_type("int"))) args.append(CastFunc(a, create_type("int")))
elif isinstance(a, np.generic): elif isinstance(a, np.generic):
args.append(cast_func(a, a.dtype)) args.append(CastFunc(a, a.dtype))
else: else:
args.append(a) args.append(a)
......
...@@ -4,7 +4,8 @@ import islpy as isl ...@@ -4,7 +4,8 @@ import islpy as isl
import sympy as sp import sympy as sp
import pystencils.astnodes as ast import pystencils.astnodes as ast
from pystencils.transformations import parents_of_type from pystencils.typing import parents_of_type
from pystencils.backends.cbackend import CustomSympyPrinter
def remove_brackets(s): def remove_brackets(s):
...@@ -51,11 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional): ...@@ -51,11 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional):
dofs_in_loops, iteration_set = isl_iteration_set(conditional) dofs_in_loops, iteration_set = isl_iteration_set(conditional)
if dofs_in_condition.issubset(dofs_in_loops): if dofs_in_condition.issubset(dofs_in_loops):
symbol_names = ','.join(dofs_in_loops) symbol_names = ','.join(dofs_in_loops)
condition_str = remove_brackets(str(conditional.condition_expr)) condition_str = CustomSympyPrinter().doprint(conditional.condition_expr)
condition_str = remove_brackets(condition_str)
condition_set = isl.BasicSet(f"{{ [{symbol_names}] : {condition_str} }}") condition_set = isl.BasicSet(f"{{ [{symbol_names}] : {condition_str} }}")
if condition_set.is_empty(): if condition_set.is_empty():
conditional.replace_by_false_block() conditional.replace_by_false_block()
return
intersection = iteration_set.intersect(condition_set) intersection = iteration_set.intersect(condition_set)
if intersection.is_empty(): if intersection.is_empty():
......
File moved
from collections import namedtuple, defaultdict
from typing import Union
import sympy as sp
from sympy.codegen import Assignment
from pystencils.simp import AssignmentCollection
from pystencils import astnodes as ast, TypedSymbol
from pystencils.field import Field
from pystencils.node_collection import NodeCollection
from pystencils.transformations import NestedScopes
# TODO use this in Constraint Checker
# Whitelist of sympy functions that the kernel constraint checker is intended
# to accept inside right-hand-side expressions (not yet enforced, see TODO).
accepted_functions = [
    sp.Pow,
    sp.sqrt,
    sp.log,
    # TODO trigonometric functions (and whatever tests will fail)
]
class KernelConstraintsCheck:
    # TODO: proper specification
    # TODO: More checks :)
    """Checks if the input to create_kernel is valid.

    Test the following conditions:

    - SSA Form for pure symbols:
        - Every pure symbol may occur only once as left-hand-side of an assignment
        - Every pure symbol that is read, may not be written to later
    - Independence / Parallelization condition:
        - a field that is written may only be read at exact the same spatial position

    (Pure symbols are symbols that are not Field.Accesses)
    """
    # Key used to track per-(field, index-component) read/write offsets.
    FieldAndIndex = namedtuple('FieldAndIndex', ['field', 'index'])

    def __init__(self, check_independence_condition=True, check_double_write_condition=True):
        # Nested symbol scopes for the SSA checks (definitions vs. reads).
        self.scopes = NestedScopes()
        # FieldAndIndex -> set of offsets read / written so far.
        self.field_reads = defaultdict(set)
        self.field_writes = defaultdict(set)
        # All fields that occur on any right-hand side.
        self.fields_read = set()
        self.check_independence_condition = check_independence_condition
        self.check_double_write_condition = check_double_write_condition

    def visit(self, obj):
        """Recursively traverse collections, assignments and AST nodes, applying all checks."""
        if isinstance(obj, (AssignmentCollection, NodeCollection)):
            [self.visit(e) for e in obj.all_assignments]
        elif isinstance(obj, list) or isinstance(obj, tuple):
            [self.visit(e) for e in obj]
        elif isinstance(obj, (sp.Eq, ast.SympyAssignment, Assignment)):
            self.process_assignment(obj)
        elif isinstance(obj, ast.Conditional):
            self.scopes.push()
            # Disable double write check inside conditionals
            # would be triggered by e.g. in-kernel boundaries
            old_double_write = self.check_double_write_condition
            old_independence_condition = self.check_independence_condition
            self.check_double_write_condition = False
            self.check_independence_condition = False
            if obj.false_block:
                self.visit(obj.false_block)
            self.process_expression(obj.condition_expr)
            self.process_expression(obj.true_block)
            # Restore the flags that were saved before entering the conditional.
            self.check_double_write_condition = old_double_write
            self.check_independence_condition = old_independence_condition
            self.scopes.pop()
        elif isinstance(obj, ast.Block):
            self.scopes.push()
            [self.visit(e) for e in obj.args]
            self.scopes.pop()
        elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
            # Other AST nodes (e.g. custom code nodes) are accepted unchecked.
            pass
        else:
            raise ValueError(f'Invalid object in kernel {type(obj)}')

    def process_assignment(self, assignment: Union[sp.Eq, ast.SympyAssignment, Assignment]):
        # for checks it is crucial to process rhs before lhs to catch e.g. a = a + 1
        self.process_expression(assignment.rhs)
        self.process_lhs(assignment.lhs)

    def process_expression(self, rhs):
        """Record all reads in ``rhs``, recursing into sub-expressions."""
        # TODO constraint for accepted functions, see TODO above
        self.update_accesses_rhs(rhs)
        if isinstance(rhs, Field.Access):
            self.fields_read.add(rhs.field)
            self.fields_read.update(rhs.indirect_addressing_fields)
        else:
            for arg in rhs.args:
                self.process_expression(arg)

    @property
    def fields_written(self):
        """Return all fields that are written to (i.e. occur as lhs Field.Access)."""
        return set(k.field for k, v in self.field_writes.items() if len(v))

    def process_lhs(self, lhs: Union[Field.Access, TypedSymbol, sp.Symbol]):
        assert isinstance(lhs, sp.Symbol)
        self.update_accesses_lhs(lhs)

    def update_accesses_lhs(self, lhs):
        """Record a write and check the double-write, independence and SSA conditions."""
        if isinstance(lhs, Field.Access):
            fai = self.FieldAndIndex(lhs.field, lhs.index)
            if self.check_double_write_condition and lhs.offsets in self.field_writes[fai]:
                raise ValueError(f"Field {lhs.field.name} is written twice at the same location")

            self.field_writes[fai].add(lhs.offsets)

            if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
                raise ValueError(
                    f"Field {lhs.field.name} is written at two different locations")

            if fai in self.field_reads:
                reads = tuple(self.field_reads[fai])
                if len(reads) > 1 or lhs.offsets != reads[0]:
                    if self.check_independence_condition:
                        raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
                                         f"This means the resulting kernel would not be thread safe")
        elif isinstance(lhs, sp.Symbol):
            # SSA checks: each pure symbol may be defined once and never after being read.
            if self.scopes.is_defined_locally(lhs):
                raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
            if lhs in self.scopes.free_parameters:
                raise ValueError(f"Symbol {lhs.name} is written, after it has been read")
            self.scopes.define_symbol(lhs)

    def update_accesses_rhs(self, rhs):
        """Record a read and check it against previously recorded writes."""
        if isinstance(rhs, Field.Access) and self.check_independence_condition:
            fai = self.FieldAndIndex(rhs.field, rhs.index)
            writes = self.field_writes[fai]
            self.field_reads[fai].add(rhs.offsets)
            for write_offset in writes:
                # Double writes are rejected in update_accesses_lhs, so at most one offset exists.
                assert len(writes) == 1
                if write_offset != rhs.offsets:
                    raise ValueError(f"Violation of loop independence condition. Field "
                                     f"{rhs.field} is read at {rhs.offsets} and written at {write_offset}")
            self.fields_read.add(rhs.field)
        elif isinstance(rhs, sp.Symbol):
            self.scopes.access_symbol(rhs)
...@@ -7,7 +7,7 @@ import sympy as sp ...@@ -7,7 +7,7 @@ import sympy as sp
from pystencils.assignment import Assignment from pystencils.assignment import Assignment
from pystencils.sympyextensions import SymbolCreator from pystencils.sympyextensions import SymbolCreator
from pystencils.kernelcreation import CreateKernelConfig from pystencils.config import CreateKernelConfig
__all__ = ['kernel', 'kernel_config'] __all__ = ['kernel', 'kernel_config']
...@@ -77,10 +77,10 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]: ...@@ -77,10 +77,10 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]:
and updates the function name accordingly. and updates the function name accordingly.
Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
in the result list. Furthermore the meaning of the ternary inline 'if-else' changes meaning to denote a in the result list. Furthermore, the meaning of the ternary inline 'if-else' changes meaning to denote a
sympy Piecewise. sympy Piecewise.
The decorated function may not receive any arguments, with exception of an argument called 's' that specifies The decorated function may not receive any arguments, with exception to an argument called 's' that specifies
a SymbolCreator() a SymbolCreator()
Args: Args:
config: Specify whether to return the list with assignments, or a dictionary containing additional settings config: Specify whether to return the list with assignments, or a dictionary containing additional settings
...@@ -90,14 +90,14 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]: ...@@ -90,14 +90,14 @@ def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]:
Examples: Examples:
>>> import pystencils as ps >>> import pystencils as ps
>>> config = ps.CreateKernelConfig() >>> kernel_configuration = ps.CreateKernelConfig()
>>> @kernel_config(config) >>> @kernel_config(kernel_configuration)
... def my_kernel(s): ... def my_kernel(s):
... f, g = ps.fields('f, g: [2D]') ... src, dst = ps.fields('src, dst: [2D]')
... s.neighbors @= f[0,1] + f[1,0] ... s.neighbors @= src[0, 1] + src[1, 0]
... g[0,0] @= s.neighbors + f[0,0] if f[0,0] > 0 else 0 ... dst[0, 0] @= s.neighbors + src[0, 0] if src[0, 0] > 0 else 0
>>> f, g = ps.fields('f, g: [2D]') >>> f, g = ps.fields('src, dst: [2D]')
>>> assert my_kernel['assignments'][0].rhs == f[0,1] + f[1,0] >>> assert my_kernel['assignments'][0].rhs == f[0, 1] + f[1, 0]
""" """
def decorator(func: Callable[..., None]) -> Union[List[Assignment], Dict]: def decorator(func: Callable[..., None]) -> Union[List[Assignment], Dict]:
""" """
......
import itertools import itertools
import warnings import warnings
from dataclasses import dataclass, field from typing import Union, List
from types import MappingProxyType
from typing import Callable, Union, List, Dict, Tuple, Any
import sympy as sp import sympy as sp
from pystencils.config import CreateKernelConfig
from pystencils.assignment import Assignment from pystencils.assignment import Assignment, AddAugmentedAssignment
from pystencils.astnodes import Block, Conditional, LoopOverCoordinate, SympyAssignment from pystencils.astnodes import Node, Block, Conditional, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.vectorization import vectorize from pystencils.cpu.vectorization import vectorize
from pystencils.enums import Target, Backend from pystencils.enums import Target, Backend
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
from pystencils.gpucuda.indexing import indexing_creator_from_params from pystencils.node_collection import NodeCollection
from pystencils.simp.assignment_collection import AssignmentCollection from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.simp.simplifications import apply_sympy_optimisations from pystencils.kernel_contrains_check import KernelConstraintsCheck
from pystencils.simplificationfactory import create_simplification_strategy from pystencils.simplificationfactory import create_simplification_strategy
from pystencils.stencil import direction_string_to_offset, inverse_direction_string from pystencils.stencil import direction_string_to_offset, inverse_direction_string
from pystencils.transformations import ( from pystencils.transformations import (
loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel) loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)
@dataclass def create_kernel(assignments: Union[Assignment, List[Assignment],
class CreateKernelConfig: AddAugmentedAssignment, List[AddAugmentedAssignment],
""" AssignmentCollection, List[Node], NodeCollection],
**Below all parameters for the CreateKernelConfig are explained** *,
"""
target: Target = Target.CPU
"""
All targets are defined in :class:`pystencils.enums.Target`
"""
backend: Backend = None
"""
All backends are defined in :class:`pystencils.enums.Backend`
"""
function_name: str = 'kernel'
"""
Name of the generated function - only important if generated code is written out
"""
data_type: Union[str, dict] = 'double'
"""
Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type
"""
iteration_slice: Tuple = None
"""
Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
"""
ghost_layers: Union[bool, int, List[Tuple[int]]] = None
"""
A single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
If left to default, the number of ghost layers is determined automatically from the assignments.
"""
skip_independence_check: bool = False
"""
Don't check that loop iterations are independent. This is needed e.g. for
periodicity kernel, that access the field outside the iteration bounds. Use with care!
"""
cpu_openmp: Union[bool, int] = False
"""
`True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
of available threads will be chosen.
"""
cpu_vectorize_info: Dict = None
"""
A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
"""
cpu_blocking: Tuple[int] = None
"""
A tuple of block sizes or `None` if no blocking should be applied
"""
omp_single_loop: bool = True
"""
If OpenMP is active: whether multiple outer loops are permitted
"""
gpu_indexing: str = 'block'
"""
Either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
"""
gpu_indexing_params: MappingProxyType = field(default=MappingProxyType({}))
"""
Dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
"""
default_assignment_simplifications: bool = False
"""
If `True` default simplifications are first performed on the Assignments. If problems occur during the
simplification a warning will be thrown.
Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
on the level of the `AssignmentCollection`. In this part, `create_simplification_strategy`
from pystencils.simplificationfactory will be used to apply optimisations like insertion of constants to
remove pressure from the registers. Thus the first part of the optimisations can only be executed if
an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
"""
cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
"""
List of extra optimizations to perform first on the AST.
"""
use_auto_for_assignments: bool = False
"""
If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
"""
index_fields: List[Field] = None
"""
List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
instead of `create_domain_kernel` is used.
"""
coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
"""
Name of the coordinate fields in the struct data type.
"""
def __post_init__(self):
# ---- Legacy parameters
if isinstance(self.target, str):
new_target = Target[self.target.upper()]
warnings.warn(f'Target "{self.target}" as str is deprecated. Use {new_target} instead',
category=DeprecationWarning)
self.target = new_target
# ---- Auto Backend
if not self.backend:
if self.target == Target.CPU:
self.backend = Backend.C
elif self.target == Target.GPU:
self.backend = Backend.CUDA
else:
raise NotImplementedError(f'Target {self.target} has no default backend')
def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCollection, List[Conditional]], *,
config: CreateKernelConfig = None, **kwargs): config: CreateKernelConfig = None, **kwargs):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a list of update equations.
...@@ -171,9 +62,24 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol ...@@ -171,9 +62,24 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol
setattr(config, k, v) setattr(config, k, v)
# ---- Normalizing parameters # ---- Normalizing parameters
if isinstance(assignments, Assignment): if isinstance(assignments, (Assignment, AddAugmentedAssignment)):
assignments = [assignments] assignments = [assignments]
assert assignments, "Assignments must not be empty!" assert assignments, "Assignments must not be empty!"
if isinstance(assignments, list):
assignments = NodeCollection(assignments)
elif isinstance(assignments, AssignmentCollection):
# TODO Markus check and doku
# --- applying first default simplifications
try:
if config.default_assignment_simplifications:
simplification = create_simplification_strategy()
assignments = simplification(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
f"AssignmentCollection due to the following problem :{e}")
simplification_hints = assignments.simplification_hints
assignments = NodeCollection.from_assignment_collection(assignments)
assignments.simplification_hints = simplification_hints
if config.index_fields: if config.index_fields:
return create_indexed_kernel(assignments, config=config) return create_indexed_kernel(assignments, config=config)
...@@ -181,12 +87,15 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol ...@@ -181,12 +87,15 @@ def create_kernel(assignments: Union[Assignment, List[Assignment], AssignmentCol
return create_domain_kernel(assignments, config=config) return create_domain_kernel(assignments, config=config)
def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelConfig): def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
""" """
Creates abstract syntax tree (AST) of kernel, using a list of update equations. Creates abstract syntax tree (AST) of kernel, using a NodeCollection.
Note that `create_domain_kernel` is a lower level function which should be accessed by not providing `index_fields`
to create_kernel
Args: Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection` assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration config: CreateKernelConfig which includes the needed configuration
Returns: Returns:
...@@ -196,10 +105,12 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -196,10 +105,12 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
Example: Example:
>>> import pystencils as ps >>> import pystencils as ps
>>> import numpy as np >>> import numpy as np
>>> from pystencils.kernelcreation import create_domain_kernel
>>> from pystencils.node_collection import NodeCollection
>>> s, d = ps.fields('s, d: [2D]') >>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0]) >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True) >>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True)
>>> kernel_ast = ps.kernelcreation.create_domain_kernel([assignment], config=kernel_config) >>> kernel_ast = create_domain_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile() >>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5]) >>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5])) >>> kernel(d=d_arr, s=np.ones([5, 5]))
...@@ -210,38 +121,25 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -210,38 +121,25 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
[0., 4., 4., 4., 0.], [0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]]) [0., 0., 0., 0., 0.]])
""" """
# --- applying first default simplifications # --- eval
try: assignments.evaluate_terms()
if config.default_assignment_simplifications and isinstance(assignments, AssignmentCollection):
simplification = create_simplification_strategy()
assignments = simplification(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
f"AssignmentCollection due to the following problem :{e}")
# ---- Normalizing parameters # FUTURE WORK from here we shouldn't NEED sympy
split_groups = () # --- check constrains
if isinstance(assignments, AssignmentCollection): check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
if 'split_groups' in assignments.simplification_hints: check_double_write_condition=not config.allow_double_writes)
split_groups = assignments.simplification_hints['split_groups']
assignments = assignments.all_assignments check.visit(assignments)
try: assignments.bound_fields = check.fields_written
if config.default_assignment_simplifications: assignments.rhs_fields = check.fields_read
assignments = apply_sympy_optimisations(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default SymPy optimisations to the "
f"Assignments due to the following problem :{e}")
# ---- Creating ast # ---- Creating ast
ast = None ast = None
if config.target == Target.CPU: if config.target == Target.CPU:
if config.backend == Backend.C: if config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_kernel from pystencils.cpu import add_openmp, create_kernel
ast = create_kernel(assignments, function_name=config.function_name, type_info=config.data_type, ast = create_kernel(assignments, config=config)
split_groups=split_groups,
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check)
for optimization in config.cpu_prepend_optimizations: for optimization in config.cpu_prepend_optimizations:
optimization(ast) optimization(ast)
omp_collapse = None omp_collapse = None
...@@ -265,12 +163,8 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -265,12 +163,8 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
raise ValueError("Invalid value for cpu_vectorize_info") raise ValueError("Invalid value for cpu_vectorize_info")
elif config.target == Target.GPU: elif config.target == Target.GPU:
if config.backend == Backend.CUDA: if config.backend == Backend.CUDA:
from pystencils.gpucuda import create_cuda_kernel from pystencils.gpu import create_cuda_kernel
ast = create_cuda_kernel(assignments, function_name=config.function_name, type_info=config.data_type, ast = create_cuda_kernel(assignments, config=config)
indexing_creator=indexing_creator_from_params(config.gpu_indexing,
config.gpu_indexing_params),
iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
skip_independence_check=config.skip_independence_check)
if not ast: if not ast:
raise NotImplementedError( raise NotImplementedError(
...@@ -283,7 +177,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC ...@@ -283,7 +177,7 @@ def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelC
return ast return ast
def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernelConfig): def create_indexed_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
""" """
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling. coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
...@@ -293,8 +187,11 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -293,8 +187,11 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for 'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for
example boundary parameters. example boundary parameters.
Note that `create_indexed_kernel` is a lower level function which should be accessed by providing `index_fields`
to create_kernel
Args: Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection` assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration config: CreateKernelConfig which includes the needed configuration
Returns: Returns:
...@@ -303,7 +200,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -303,7 +200,9 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
Example: Example:
>>> import pystencils as ps >>> import pystencils as ps
>>> from pystencils.node_collection import NodeCollection
>>> import numpy as np >>> import numpy as np
>>> from pystencils.kernelcreation import create_indexed_kernel
>>> >>>
>>> # Index field stores the indices of the cell to visit together with optional values >>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)]) >>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
...@@ -314,7 +213,7 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -314,7 +213,7 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
>>> s, d = ps.fields('s, d: [2D]') >>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val')) >>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y')) >>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y'))
>>> kernel_ast = ps.create_indexed_kernel([assignment], config=kernel_config) >>> kernel_ast = create_indexed_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile() >>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5]) >>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr) >>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
...@@ -324,23 +223,30 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel ...@@ -324,23 +223,30 @@ def create_indexed_kernel(assignments: List[Assignment], *, config: CreateKernel
[0. , 0. , 4.2, 0. , 0. ], [0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ], [0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]]) [0. , 0. , 0. , 0. , 0. ]])
""" """
# --- eval
assignments.evaluate_terms()
# FUTURE WORK from here we shouldn't NEED sympy
# --- check constrains
check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
check_double_write_condition=not config.allow_double_writes)
check.visit(assignments)
assignments.bound_fields = check.fields_written
assignments.rhs_fields = check.fields_read
ast = None ast = None
if config.target == Target.CPU and config.backend == Backend.C: if config.target == Target.CPU and config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_indexed_kernel from pystencils.cpu import add_openmp, create_indexed_kernel
ast = create_indexed_kernel(assignments, index_fields=config.index_fields, type_info=config.data_type, ast = create_indexed_kernel(assignments, config=config)
coordinate_names=config.coordinate_names)
if config.cpu_openmp: if config.cpu_openmp:
add_openmp(ast, num_threads=config.cpu_openmp) add_openmp(ast, num_threads=config.cpu_openmp)
elif config.target == Target.GPU: elif config.target == Target.GPU:
if config.backend == Backend.CUDA: if config.backend == Backend.CUDA:
from pystencils.gpucuda import created_indexed_cuda_kernel from pystencils.gpu import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params) ast = created_indexed_cuda_kernel(assignments, config=config)
ast = created_indexed_cuda_kernel(assignments,
config.index_fields,
type_info=config.data_type,
coordinate_names=config.coordinate_names,
indexing_creator=idx_creator)
if not ast: if not ast:
raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}') raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}')
...@@ -369,6 +275,7 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -369,6 +275,7 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
Returns: Returns:
AST, see `create_kernel` AST, see `create_kernel`
""" """
# TODO: Add doku like in the other kernels
if 'ghost_layers' in kwargs: if 'ghost_layers' in kwargs:
assert kwargs['ghost_layers'] is None assert kwargs['ghost_layers'] is None
del kwargs['ghost_layers'] del kwargs['ghost_layers']
...@@ -462,11 +369,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -462,11 +369,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
[SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \ [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
[last_conditional] [last_conditional]
if target == Target.CPU: config = CreateKernelConfig(target=target, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
from pystencils.cpu import create_kernel as create_kernel_cpu ast = create_kernel(final_assignments, config=config)
ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
else:
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
return ast return ast
for assignment in assignments: for assignment in assignments:
...@@ -483,6 +387,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus ...@@ -483,6 +387,8 @@ def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclus
if 'cpu_prepend_optimizations' in kwargs: if 'cpu_prepend_optimizations' in kwargs:
prepend_optimizations += kwargs['cpu_prepend_optimizations'] prepend_optimizations += kwargs['cpu_prepend_optimizations']
del kwargs['cpu_prepend_optimizations'] del kwargs['cpu_prepend_optimizations']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, omp_single_loop=False,
cpu_prepend_optimizations=prepend_optimizations, **kwargs) config = CreateKernelConfig(ghost_layers=ghost_layers, target=target, omp_single_loop=False,
cpu_prepend_optimizations=prepend_optimizations, **kwargs)
ast = create_kernel(final_assignments, config=config)
return ast return ast
from typing import Any, Dict, List, Union, Optional, Set
import sympy
import sympy as sp
from sympy.codegen.rewriting import ReplaceOptim, optimize
from pystencils.assignment import Assignment, AddAugmentedAssignment
import pystencils.astnodes as ast
from pystencils.backends.cbackend import CustomCodeNode
from pystencils.functions import DivFunc
from pystencils.simp import AssignmentCollection
from pystencils.typing import FieldPointerSymbol
class NodeCollection:
    """Normalises a mixed list of assignments and backend AST nodes into a flat
    collection of ``ast`` nodes, and offers sympy-level term optimisations on it.

    Args:
        assignments: list of :class:`Assignment`, augmented assignments, or backend
            AST nodes (``SympyAssignment``, ``Conditional``, ``Block``, ...).
        simplification_hints: optional dict of hints carried over from an
            :class:`AssignmentCollection`; defaults to an empty dict.
        bound_fields: optional set of field symbols bound on the left-hand side.
        rhs_fields: optional set of field symbols appearing on right-hand sides.

    Raises:
        ValueError: if an unsupported object (e.g. a ``LoopOverCoordinate``)
            occurs in ``assignments``.
    """

    def __init__(self, assignments: List[Union[ast.Node, Assignment]],
                 simplification_hints: Optional[Dict[str, Any]] = None,
                 bound_fields: Set[sp.Symbol] = None, rhs_fields: Set[sp.Symbol] = None):

        def visit(obj):
            # Recursively convert the heterogeneous input into backend AST nodes.
            if isinstance(obj, (list, tuple)):
                return [visit(e) for e in obj]
            if isinstance(obj, Assignment):
                if isinstance(obj.lhs, FieldPointerSymbol):
                    # Pointer assignments inherit const-ness from the pointer's type.
                    return ast.SympyAssignment(obj.lhs, obj.rhs, is_const=obj.lhs.dtype.const)
                return ast.SympyAssignment(obj.lhs, obj.rhs)
            elif isinstance(obj, AddAugmentedAssignment):
                # Lower `lhs += rhs` into the explicit form `lhs = lhs + rhs`.
                return ast.SympyAssignment(obj.lhs, obj.lhs + obj.rhs)
            elif isinstance(obj, ast.SympyAssignment):
                return obj
            elif isinstance(obj, ast.Conditional):
                true_block = visit(obj.true_block)
                false_block = None if obj.false_block is None else visit(obj.false_block)
                return ast.Conditional(obj.condition_expr, true_block=true_block, false_block=false_block)
            elif isinstance(obj, ast.Block):
                return ast.Block([visit(e) for e in obj.args])
            elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
                # Any other AST node is passed through unchanged; loops must not
                # appear here because loop construction happens later in the pipeline.
                return obj
            else:
                raise ValueError("Invalid object in the List of Assignments " + str(type(obj)))

        self.all_assignments = visit(assignments)
        self.simplification_hints = simplification_hints if simplification_hints else {}
        # Fix: the fallbacks used to be `{}` (empty *dicts*), contradicting the
        # `Set[sp.Symbol]` annotations; use empty sets so set operations work.
        self.bound_fields = bound_fields if bound_fields else set()
        self.rhs_fields = rhs_fields if rhs_fields else set()

    @staticmethod
    def from_assignment_collection(assignment_collection: AssignmentCollection):
        """Build a NodeCollection from an AssignmentCollection, carrying over hints and field sets."""
        return NodeCollection(assignments=assignment_collection.all_assignments,
                              simplification_hints=assignment_collection.simplification_hints,
                              bound_fields=assignment_collection.bound_fields,
                              rhs_fields=assignment_collection.rhs_fields)

    def evaluate_terms(self):
        """Evaluate constant non-integer subexpressions and unroll small integer powers.

        Powers with |exponent| <= 8 are expanded into explicit multiplications
        (wrapped in ``UnevaluatedExpr`` so sympy does not re-fold them); negative
        exponents become explicit ``DivFunc`` divisions.
        """
        evaluate_constant_terms = ReplaceOptim(
            lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
            lambda p: p.evalf()
        )

        evaluate_pow = ReplaceOptim(
            lambda e: e.is_Pow and e.exp.is_Integer and abs(e.exp) <= 8,
            lambda p: sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else
            (DivFunc(sp.Integer(1), p.base) if p.exp == -1 else
             DivFunc(sp.Integer(1), sp.UnevaluatedExpr(sp.Mul(*([p.base] * -p.exp), evaluate=False))))
        )
        sympy_optimisations = [evaluate_constant_terms, evaluate_pow]

        def visitor(node):
            # CustomCodeNodes carry opaque code; leave them untouched.
            if isinstance(node, CustomCodeNode):
                return node
            elif isinstance(node, ast.Block):
                return node.func([visitor(child) for child in node.args])
            elif isinstance(node, ast.SympyAssignment):
                new_lhs = visitor(node.lhs)
                new_rhs = visitor(node.rhs)
                return node.func(new_lhs, new_rhs, node.is_const, node.use_auto)
            elif isinstance(node, ast.Node):
                return node.func(*[visitor(child) for child in node.args])
            elif isinstance(node, sympy.Basic):
                return optimize(node, sympy_optimisations)
            else:
                raise NotImplementedError(f'{node} {type(node)} has no valid visitor')

        self.all_assignments = [visitor(assignment) for assignment in self.all_assignments]
File moved
...@@ -2,7 +2,7 @@ import copy ...@@ -2,7 +2,7 @@ import copy
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.data_types import TypedSymbol, cast_func from pystencils.typing import TypedSymbol, CastFunc
from pystencils.astnodes import LoopOverCoordinate from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
from pystencils.sympyextensions import fast_subs from pystencils.sympyextensions import fast_subs
...@@ -47,11 +47,11 @@ class RNGBase(CustomCodeNode): ...@@ -47,11 +47,11 @@ class RNGBase(CustomCodeNode):
def get_code(self, dialect, vector_instruction_set, print_arg): def get_code(self, dialect, vector_instruction_set, print_arg):
code = "\n" code = "\n"
for r in self.result_symbols: for r in self.result_symbols:
if vector_instruction_set and not self.args[1].atoms(cast_func): if vector_instruction_set and not self.args[1].atoms(CastFunc):
# this vector RNG has become scalar through substitution # this vector RNG has become scalar through substitution
code += f"{r.dtype} {r.name};\n" code += f"{r.dtype} {r.name};\n"
else: else:
code += f"{vector_instruction_set[r.dtype.base_name] if vector_instruction_set else r.dtype} " + \ code += f"{vector_instruction_set[r.dtype.c_name] if vector_instruction_set else r.dtype} " + \
f"{r.name};\n" f"{r.name};\n"
args = [print_arg(a) for a in self.args] + ['' + r.name for r in self.result_symbols] args = [print_arg(a) for a in self.args] + ['' + r.name for r in self.result_symbols]
code += (self._name + "(" + ", ".join(args) + ");\n") code += (self._name + "(" + ", ".join(args) + ");\n")
...@@ -61,6 +61,15 @@ class RNGBase(CustomCodeNode): ...@@ -61,6 +61,15 @@ class RNGBase(CustomCodeNode):
return ", ".join([str(s) for s in self.result_symbols]) + " \\leftarrow " + \ return ", ".join([str(s) for s in self.result_symbols]) + " \\leftarrow " + \
self._name.capitalize() + "_RNG(" + ", ".join([str(a) for a in self.args]) + ")" self._name.capitalize() + "_RNG(" + ", ".join([str(a) for a in self.args]) + ")"
def _hashable_content(self):
return (self._name, *self.result_symbols, *self.args)
def __eq__(self, other):
return type(self) is type(other) and self._hashable_content() == other._hashable_content()
def __hash__(self):
return hash(self._hashable_content())
class PhiloxTwoDoubles(RNGBase): class PhiloxTwoDoubles(RNGBase):
_name = "philox_double2" _name = "philox_double2"
......
import socket import socket
import time import time
from types import MappingProxyType
from typing import Dict, Iterator, Sequence from typing import Dict, Iterator, Sequence
import blitzdb import blitzdb
import six
from blitzdb.backends.file.backend import serializer_classes
from blitzdb.backends.file.utils import JsonEncoder
from pystencils.cpu.cpujit import get_compiler_config from pystencils.cpu.cpujit import get_compiler_config
from pystencils import CreateKernelConfig, Target, Backend, Field
import json
import sympy as sp
from pystencils.typing import BasicType
class PystencilsJsonEncoder(JsonEncoder):
    """JSON encoder that knows how to serialise pystencils and sympy objects
    (kernel configs, numbers, types, targets/backends, fields) for the run database."""

    def default(self, obj):
        if isinstance(obj, CreateKernelConfig):
            return obj.__dict__
        # Bug fix: check sp.Integer *before* sp.Rational. In sympy, Integer is a
        # subclass of Rational, so the float branch used to swallow integers and
        # serialise e.g. Integer(3) as 3.0; the int branch was unreachable.
        if isinstance(obj, sp.Integer):
            return int(obj)
        if isinstance(obj, (sp.Float, sp.Rational)):
            return float(obj)
        if isinstance(obj, (BasicType, MappingProxyType)):
            return str(obj)
        if isinstance(obj, (Target, Backend, sp.Symbol)):
            return obj.name
        if isinstance(obj, Field):
            return f"pystencils.Field(name = {obj.name}, field_type = {obj.field_type.name}, " \
                   f"dtype = {str(obj.dtype)}, layout = {obj.layout}, shape = {obj.shape}, " \
                   f"strides = {obj.strides})"
        # Fall back to the base encoder, which raises TypeError for unknown objects.
        return JsonEncoder.default(self, obj)
class PystencilsJsonSerializer(object):
    """blitzdb serializer that stores documents as UTF-8 JSON using
    :class:`PystencilsJsonEncoder` for pystencils/sympy objects.

    Note: the former ``six.PY3`` branching was dead code — this file already
    requires Python 3 (it uses f-strings), and the Python-2 branches were either
    unreachable or identical — so it has been removed.
    """

    @classmethod
    def serialize(cls, data):
        """Encode ``data`` to UTF-8 JSON bytes; ``bytes`` input is decoded first."""
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        return json.dumps(data, cls=PystencilsJsonEncoder, ensure_ascii=False).encode('utf-8')

    @classmethod
    def deserialize(cls, data):
        """Decode UTF-8 JSON bytes back into Python objects."""
        return json.loads(data.decode('utf-8'))
class Database: class Database:
...@@ -46,7 +96,7 @@ class Database: ...@@ -46,7 +96,7 @@ class Database:
class SimulationResult(blitzdb.Document): class SimulationResult(blitzdb.Document):
pass pass
def __init__(self, file: str) -> None: def __init__(self, file: str, serializer_info: tuple = None) -> None:
if file.startswith("mongo://"): if file.startswith("mongo://"):
from pymongo import MongoClient from pymongo import MongoClient
db_name = file[len("mongo://"):] db_name = file[len("mongo://"):]
...@@ -57,6 +107,10 @@ class Database: ...@@ -57,6 +107,10 @@ class Database:
self.backend.autocommit = True self.backend.autocommit = True
if serializer_info:
serializer_classes.update({serializer_info[0]: serializer_info[1]})
self.backend.load_config({'serializer_class': serializer_info[0]}, True)
def save(self, params: Dict, result: Dict, env: Dict = None, **kwargs) -> None: def save(self, params: Dict, result: Dict, env: Dict = None, **kwargs) -> None:
"""Stores a simulation result in the database. """Stores a simulation result in the database.
...@@ -146,10 +200,15 @@ class Database: ...@@ -146,10 +200,15 @@ class Database:
'cpuCompilerConfig': get_compiler_config(), 'cpuCompilerConfig': get_compiler_config(),
} }
try: try:
from git import Repo, InvalidGitRepositoryError from git import Repo
except ImportError:
return result
try:
from git import InvalidGitRepositoryError
repo = Repo(search_parent_directories=True) repo = Repo(search_parent_directories=True)
result['git_hash'] = str(repo.head.commit) result['git_hash'] = str(repo.head.commit)
except (ImportError, InvalidGitRepositoryError): except InvalidGitRepositoryError:
pass pass
return result return result
......
...@@ -9,6 +9,7 @@ from time import sleep ...@@ -9,6 +9,7 @@ from time import sleep
from typing import Any, Callable, Dict, Optional, Sequence, Tuple from typing import Any, Callable, Dict, Optional, Sequence, Tuple
from pystencils.runhelper import Database from pystencils.runhelper import Database
from pystencils.runhelper.db import PystencilsJsonSerializer
from pystencils.utils import DotDict from pystencils.utils import DotDict
ParameterDict = Dict[str, Any] ParameterDict = Dict[str, Any]
...@@ -54,10 +55,11 @@ class ParameterStudy: ...@@ -54,10 +55,11 @@ class ParameterStudy:
Run = namedtuple("Run", ['parameter_dict', 'weight']) Run = namedtuple("Run", ['parameter_dict', 'weight'])
def __init__(self, run_function: Callable[..., Dict], runs: Sequence = (), def __init__(self, run_function: Callable[..., Dict], runs: Sequence = (),
database_connector: str = './db') -> None: database_connector: str = './db',
serializer_info: tuple = ('pystencils_serializer', PystencilsJsonSerializer)) -> None:
self.runs = list(runs) self.runs = list(runs)
self.run_function = run_function self.run_function = run_function
self.db = Database(database_connector) self.db = Database(database_connector, serializer_info)
def add_run(self, parameter_dict: ParameterDict, weight: int = 1) -> None: def add_run(self, parameter_dict: ParameterDict, weight: int = 1) -> None:
"""Schedule a dictionary of parameters to run in this parameter study. """Schedule a dictionary of parameters to run in this parameter study.
......