#pragma once
extern "C++" {
#ifdef __CUDA_ARCH__
#include <cstddef> // std::size_t is used below and is not otherwise included in this branch
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
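// Note (an assumption, not stated in the original header): this struct mirrors
// a field passed in from Python -- `data` points to the raw buffer, while
// `shape` and `stride` carry the per-dimension extents and strides that
// generated kernels use for indexing.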
#if !defined(__AES__) || !defined(__SSE4_1__)
#error AES-NI and SSE4.1 need to be enabled
#endif
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX512VL__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
__m128i x = _mm_xor_si128(k, in);
x = _mm_aesenc_si128(x, k); // 1
x = _mm_aesenc_si128(x, k); // 2
x = _mm_aesenc_si128(x, k); // 3
x = _mm_aesenc_si128(x, k); // 4
x = _mm_aesenc_si128(x, k); // 5
x = _mm_aesenc_si128(x, k); // 6
x = _mm_aesenc_si128(x, k); // 7
x = _mm_aesenc_si128(x, k); // 8
x = _mm_aesenc_si128(x, k); // 9
x = _mm_aesenclast_si128(x, k); // 10
return x;
}
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
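// The fallback below is the classic SSE2 bit trick for converting unsigned
// 64-bit integers to double without AVX-512: the high 32 bits are OR-ed into
// the mantissa of 2^84, the low 32 bits are blended into the mantissa of
// 2^52, and subtracting the magic constant 2^84 + 2^52 before re-adding the
// low part reassembles the exact value. The no-associative-math attribute
// keeps GCC from reordering the carefully sequenced additions.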
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
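// --- Usage sketch (illustrative, not part of the original interface) ---
// Both generators are counter-based: the key acts as the seed and stays
// fixed, while each distinct counter value yields an independent set of
// uniform variates in (0, 1). The helper below is a hypothetical example of
// filling a buffer with doubles by walking the counter.
QUALIFIERS void aesni_example_fill(uint32 seed, double *out, uint32 n)
{
    for (uint32 i = 0; 2 * i < n; ++i) {
        double r1, r2;
        aesni_double2(i, 0, 0, 0, seed, 0, 0, 0, r1, r2);
        out[2 * i] = r1;
        if (2 * i + 1 < n)
            out[2 * i + 1] = r2;
    }
}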
// An implementation of C++ std::complex for use on CUDA devices.
// Written by John C. Travers <jtravs@gmail.com> (2012)
//
// Missing:
// - long double support (not supported on CUDA)
// - some integral pow functions (due to lack of C++11 support on CUDA)
//
// Heavily derived from the LLVM libcpp project (svn revision 147853).
// Based on libcxx/include/complex.
// The git history contains the complete change history from the original.
// The modifications are licensed as per the original LLVM license below.
//
// -*- C++ -*-
//===--------------------------- complex ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
extern "C++" {
#ifndef CUDA_COMPLEX_HPP
#define CUDA_COMPLEX_HPP
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
/*
complex synopsis
template<class T>
class complex
{
public:
typedef T value_type;
complex(const T& re = T(), const T& im = T());
complex(const complex&);
template<class X> complex(const complex<X>&);
T real() const;
T imag() const;
void real(T);
void imag(T);
complex<T>& operator= (const T&);
complex<T>& operator+=(const T&);
complex<T>& operator-=(const T&);
complex<T>& operator*=(const T&);
complex<T>& operator/=(const T&);
complex& operator=(const complex&);
template<class X> complex<T>& operator= (const complex<X>&);
template<class X> complex<T>& operator+=(const complex<X>&);
template<class X> complex<T>& operator-=(const complex<X>&);
template<class X> complex<T>& operator*=(const complex<X>&);
template<class X> complex<T>& operator/=(const complex<X>&);
};
template<>
class complex<float>
{
public:
typedef float value_type;
constexpr complex(float re = 0.0f, float im = 0.0f);
explicit constexpr complex(const complex<double>&);
constexpr float real() const;
void real(float);
constexpr float imag() const;
void imag(float);
complex<float>& operator= (float);
complex<float>& operator+=(float);
complex<float>& operator-=(float);
complex<float>& operator*=(float);
complex<float>& operator/=(float);
complex<float>& operator=(const complex<float>&);
template<class X> complex<float>& operator= (const complex<X>&);
template<class X> complex<float>& operator+=(const complex<X>&);
template<class X> complex<float>& operator-=(const complex<X>&);
template<class X> complex<float>& operator*=(const complex<X>&);
template<class X> complex<float>& operator/=(const complex<X>&);
};
template<>
class complex<double>
{
public:
typedef double value_type;
constexpr complex(double re = 0.0, double im = 0.0);
constexpr complex(const complex<float>&);
constexpr double real() const;
void real(double);
constexpr double imag() const;
void imag(double);
complex<double>& operator= (double);
complex<double>& operator+=(double);
complex<double>& operator-=(double);
complex<double>& operator*=(double);
complex<double>& operator/=(double);
complex<double>& operator=(const complex<double>&);
template<class X> complex<double>& operator= (const complex<X>&);
template<class X> complex<double>& operator+=(const complex<X>&);
template<class X> complex<double>& operator-=(const complex<X>&);
template<class X> complex<double>& operator*=(const complex<X>&);
template<class X> complex<double>& operator/=(const complex<X>&);
};
// 26.3.6 operators:
template<class T> complex<T> operator+(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&, const T&);
template<class T> complex<T> operator+(const T&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const T&);
template<class T> complex<T> operator-(const T&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const T&);
template<class T> complex<T> operator*(const T&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const T&);
template<class T> complex<T> operator/(const T&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&);
template<class T> bool operator==(const complex<T>&, const complex<T>&);
template<class T> bool operator==(const complex<T>&, const T&);
template<class T> bool operator==(const T&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const T&);
template<class T> bool operator!=(const T&, const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>&, complex<T>&);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>&, const complex<T>&);
// 26.3.7 values:
template<class T> T real(const complex<T>&);
double real(double);
template<Integral T> double real(T);
float real(float);
template<class T> T imag(const complex<T>&);
double imag(double);
template<Integral T> double imag(T);
float imag(float);
template<class T> T abs(const complex<T>&);
template<class T> T arg(const complex<T>&);
double arg(double);
template<Integral T> double arg(T);
float arg(float);
template<class T> T norm(const complex<T>&);
double norm(double);
template<Integral T> double norm(T);
float norm(float);
template<class T> complex<T> conj(const complex<T>&);
complex<double> conj(double);
template<Integral T> complex<double> conj(T);
complex<float> conj(float);
template<class T> complex<T> proj(const complex<T>&);
complex<double> proj(double);
template<Integral T> complex<double> proj(T);
complex<float> proj(float);
template<class T> complex<T> polar(const T&, const T& = 0);
// 26.3.8 transcendentals:
template<class T> complex<T> acos(const complex<T>&);
template<class T> complex<T> asin(const complex<T>&);
template<class T> complex<T> atan(const complex<T>&);
template<class T> complex<T> acosh(const complex<T>&);
template<class T> complex<T> asinh(const complex<T>&);
template<class T> complex<T> atanh(const complex<T>&);
template<class T> complex<T> cos (const complex<T>&);
template<class T> complex<T> cosh (const complex<T>&);
template<class T> complex<T> exp (const complex<T>&);
template<class T> complex<T> log (const complex<T>&);
template<class T> complex<T> log10(const complex<T>&);
template<class T> complex<T> pow(const complex<T>&, const T&);
template<class T> complex<T> pow(const complex<T>&, const complex<T>&);
template<class T> complex<T> pow(const T&, const complex<T>&);
template<class T> complex<T> sin (const complex<T>&);
template<class T> complex<T> sinh (const complex<T>&);
template<class T> complex<T> sqrt (const complex<T>&);
template<class T> complex<T> tan (const complex<T>&);
template<class T> complex<T> tanh (const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>& is, complex<T>& x);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>& o, const complex<T>& x);
*/
#include <math.h>
#include <sstream>
template <class _Tp> class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp> &__z, const complex<_Tp> &__w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp> &__x, const complex<_Tp> &__y);
template <class _Tp> class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
CUDA_CALLABLE_MEMBER
complex(const value_type &__re = value_type(),
const value_type &__im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex(const complex<_Xp> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
CUDA_CALLABLE_MEMBER value_type real() const { return __re_; }
CUDA_CALLABLE_MEMBER value_type imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(const value_type &__re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(const value_type &__re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(const value_type &__re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(const value_type &__re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(const value_type &__re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double>;
template <> class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f,
float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit /*constexpr*/ complex(const complex<double> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER float real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(float __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(float __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0,
double __im = 0.0)
: __re_(__re), __im_(__im) {}
/*constexpr*/ complex(const complex<float> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER double real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(double __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(double __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
// constexpr
inline CUDA_CALLABLE_MEMBER complex<float>::complex(const complex<double> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// constexpr
inline CUDA_CALLABLE_MEMBER complex<double>::complex(const complex<float> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
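// The multiplication below follows the C99 Annex G recovery rules inherited
// from libc++: when the naive product comes out NaN although an operand was
// infinite, the operands are projected onto signed zeros and ones and the
// result is recomputed as a correctly signed infinity.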
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
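// Division uses the scaled algorithm from libc++: the divisor is rescaled by
// a power of two (logb/scalbn) so that c*c + d*d neither overflows nor
// underflows, and NaN results are patched up with Annex G style special-case
// handling, as in multiplication above.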
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <>
inline CUDA_CALLABLE_MEMBER complex<float> operator/(const complex<float> &__z,
const complex<float> &__w) {
int __ilogbw = 0;
float __a = __z.real();
float __b = __z.imag();
float __c = __w.real();
float __d = __w.imag();
float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbnf(__c, -__ilogbw);
__d = scalbnf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw);
float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) {
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = copysignf(INFINITY, __c) * __a;
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = copysignf(INFINITY, __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysignf(isinf(__a) ? float(1) : float(0), __a);
__b = copysignf(isinf(__b) ? float(1) : float(0), __b);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = INFINITY * (__a * __c + __b * __d);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = INFINITY * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysignf(isinf(__c) ? float(1) : float(0), __c);
__d = copysignf(isinf(__d) ? float(1) : float(0), __d);
__x = float(0) * (__a * __c + __b * __d);
__y = float(0) * (__b * __c - __a * __d);
}
}
return complex<float>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__x,
const _Tp &__y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x) {
return __x;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const _Tp &__y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const _Tp &__x,
const complex<_Tp> &__y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const _Tp &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const _Tp &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
// 26.3.7 values:
// real
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp real(const complex<_Tp> &__c) {
return __c.real();
}
inline CUDA_CALLABLE_MEMBER double real(double __re) { return __re; }
inline CUDA_CALLABLE_MEMBER float real(float __re) { return __re; }
// imag
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp imag(const complex<_Tp> &__c) {
return __c.imag();
}
inline CUDA_CALLABLE_MEMBER double imag(double __re) { return 0; }
inline CUDA_CALLABLE_MEMBER float imag(float __re) { return 0; }
// abs
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp abs(const complex<_Tp> &__c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp arg(const complex<_Tp> &__c) {
return atan2(__c.imag(), __c.real());
}
inline CUDA_CALLABLE_MEMBER double arg(double __re) { return atan2(0., __re); }
inline CUDA_CALLABLE_MEMBER float arg(float __re) { return atan2f(0.F, __re); }
// norm
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp norm(const complex<_Tp> &__c) {
if (isinf(__c.real()))
return fabs(__c.real());
if (isinf(__c.imag()))
return fabs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
inline CUDA_CALLABLE_MEMBER double norm(double __re) { return __re * __re; }
inline CUDA_CALLABLE_MEMBER float norm(float __re) { return __re * __re; }
// conj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> conj(const complex<_Tp> &__c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
inline CUDA_CALLABLE_MEMBER complex<double> conj(double __re) {
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> conj(float __re) {
return complex<float>(__re);
}
// proj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> proj(const complex<_Tp> &__c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
inline CUDA_CALLABLE_MEMBER complex<double> proj(double __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> proj(float __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<float>(__re);
}
// polar
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> polar(const _Tp &__rho,
const _Tp &__theta = _Tp(0)) {
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log(const complex<_Tp> &__x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log10(const complex<_Tp> &__x) {
return log(__x) / log(_Tp(10));
}
// sqrt
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sqrt(const complex<_Tp> &__x) {
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
if (__x.real() > _Tp(0))
return complex<_Tp>(__x.real(), isnan(__x.imag())
? __x.imag()
: copysign(_Tp(0), __x.imag()));
return complex<_Tp>(isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> exp(const complex<_Tp> &__x) {
_Tp __i = __x.imag();
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
} else if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return exp(__y * log(__x));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const _Tp &__y) {
return pow(__x, complex<_Tp>(__y));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const _Tp &__x,
const complex<_Tp> &__y) {
return pow(complex<_Tp>(__x), __y);
}
// asinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asinh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(copysign(__x.imag(), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1)));
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// acosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acosh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
if (isinf(__x.imag()))
if (__x.real() > 0)
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
else
return complex<_Tp>(-__x.real(),
copysign(__pi * _Tp(0.75), __x.imag()));
if (__x.real() < 0)
return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), __x.real());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
return complex<_Tp>(copysign(__z.real(), _Tp(0)),
copysign(__z.imag(), __x.imag()));
}
// atanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atanh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.imag())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (isnan(__x.imag())) {
if (isinf(__x.real()) || __x.real() == 0)
return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
return complex<_Tp>(__x.imag(), __x.imag());
}
if (isnan(__x.real())) {
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.real())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
copysign(_Tp(0), __x.imag()));
}
complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// sinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sinh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.imag() == 0 && !isfinite(__x.real()))
return __x;
return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
cosh(__x.real()) * sin(__x.imag()));
}
// cosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> cosh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(fabs(__x.real()), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(_Tp(NAN), __x.real());
if (__x.real() == 0 && __x.imag() == 0)
return complex<_Tp>(_Tp(1), __x.imag());
if (__x.imag() == 0 && !isfinite(__x.real()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
sinh(__x.real()) * sin(__x.imag()));
}
// tanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tanh(const complex<_Tp> &__x) {
if (isinf(__x.real())) {
if (!isfinite(__x.imag()))
return complex<_Tp>(_Tp(1), _Tp(0));
return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
}
if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __2r(_Tp(2) * __x.real());
_Tp __2i(_Tp(2) * __x.imag());
_Tp __d(cosh(__2r) + cos(__2i));
return complex<_Tp>(sinh(__2r) / __d, sin(__2i) / __d);
}
// asin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asin(const complex<_Tp> &__x) {
complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// acos
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acos(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (isinf(__x.imag())) {
if (__x.real() < _Tp(0))
return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
}
if (__x.real() < _Tp(0))
return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), -__x.imag());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
if (__x.real() == 0)
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
if (signbit(__x.imag()))
return complex<_Tp>(fabs(__z.imag()), fabs(__z.real()));
return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real()));
}
// atan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atan(const complex<_Tp> &__x) {
complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// sin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sin(const complex<_Tp> &__x) {
complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// cos
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> cos(const complex<_Tp> &__x) {
return cosh(complex<_Tp>(-__x.imag(), __x.real()));
}
// tan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tan(const complex<_Tp> &__x) {
complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
template <class _Tp, class _CharT, class _Traits>
std::basic_istream<_CharT, _Traits> &
operator>>(std::basic_istream<_CharT, _Traits> &__is, complex<_Tp> &__x) {
if (__is.good()) {
ws(__is);
if (__is.peek() == _CharT('(')) {
__is.get();
_Tp __r;
__is >> __r;
if (!__is.fail()) {
ws(__is);
_CharT __c = __is.peek();
if (__c == _CharT(',')) {
__is.get();
_Tp __i;
__is >> __i;
if (!__is.fail()) {
ws(__is);
__c = __is.peek();
if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, __i);
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, _Tp(0));
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else {
_Tp __r;
__is >> __r;
if (!__is.fail())
__x = complex<_Tp>(__r, _Tp(0));
else
__is.setstate(std::ios_base::failbit);
}
} else
__is.setstate(std::ios_base::failbit);
return __is;
}
template <class _Tp, class _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits> &
operator<<(std::basic_ostream<_CharT, _Traits> &__os, const complex<_Tp> &__x) {
std::basic_ostringstream<_CharT, _Traits> __s;
__s.flags(__os.flags());
__s.imbue(__os.getloc());
__s.precision(__os.precision());
__s << '(' << __x.real() << ',' << __x.imag() << ')';
return __os << __s.str();
}
//} // close namespace cuda_complex
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(real(complexNumber) * scalar, imag(complexNumber) * scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(real(complexNumber) + scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>(real(complexNumber) - scalar, imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(scalar - real(complexNumber), imag(complexNumber));
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const complex<U> &complexNumber,
const V scalar) -> complex<U> {
return complex<U>(real(complexNumber) / scalar, imag(complexNumber) / scalar);
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const V scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>(scalar, 0) / complexNumber;
}
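// --- Usage sketch (illustrative, not part of the original header) ---
// The class mirrors std::complex but is callable from both host and device
// code, e.g.:
//
//     complex<double> z(1.0, 2.0);
//     complex<double> w = exp(z) * conj(z) / 3.0;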
using ComplexDouble = complex<double>;
using ComplexFloat = complex<float>;
#endif // CUDA_COMPLEX_HPP
}
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
#include <cstdint>
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
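// Philox-4x32-10 (from the Random123 suite): each round multiplies two of the
// four counter words by PHILOX_M4x32_0/1, mixes the high halves of the 64-bit
// products with the remaining words and the round key, and passes the low
// halves through; the key is advanced by the Weyl constants PHILOX_W32_0/1
// between rounds. Ten rounds is the recommended strength.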
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
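// Combines two 32-bit outputs into one 53-bit integer (x supplies the low
// bits, y shifted left by 21 the upper bits; the overlap is XOR-mixed) and
// scales it into the open interval (0, 1). The added TWOPOW53_INV_DOUBLE/2
// keeps 0 out of the range.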
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
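// --- Usage sketch (illustrative, not part of the original interface) ---
// Philox is counter-based: keep the key fixed as the seed and vary the
// counter, e.g. per grid cell and time step, to get reproducible, stateless
// streams. The helper below is a hypothetical example.
QUALIFIERS void philox_example_cell(uint32 t, uint32 x, uint32 y,
                                    uint32 seed0, uint32 seed1,
                                    float &r1, float &r2, float &r3, float &r4)
{
    // one independent 4-tuple of uniforms per (t, x, y), no state carried over
    philox_float4(t, x, y, 0, seed0, seed1, r1, r2, r3, r4);
}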
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import hashlib
import itertools
from enum import Enum
from typing import Set
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.astnodes import Node
from pystencils.data_types import TypedSymbol, cast_func, create_type
try:
import pycuda.driver
except Exception:
pass
_hash = hashlib.md5
class InterpolationMode(str, Enum):
NEAREST_NEIGHBOR = "nearest_neighbour"
NN = NEAREST_NEIGHBOR
LINEAR = "linear"
CUBIC_SPLINE = "cubic_spline"
class _InterpolationSymbol(TypedSymbol):
def __new__(cls, name, field, interpolator):
obj = cls.__xnew_cached_(cls, name, field, interpolator)
return obj
def __new_stage2__(cls, name, field, interpolator):
obj = super().__xnew__(cls, name, 'dummy_symbol_carrying_field' + field.name)
obj.field = field
obj.interpolator = interpolator
return obj
def __getnewargs__(self):
return self.name, self.field, self.interpolator
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
class Interpolator(object):
"""
Implements non-integer accesses on fields using linear interpolation.
On GPU, this interpolator can be implemented by a :class:`.TextureCachedField` for hardware acceleration.
Address modes are the different boundary handlings; the possible choices are the same as for CUDA textures:
**CLAMP**
The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M.
**BORDER**
The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0 and for k >= M.
Now, to describe the last two address modes, we are forced to consider normalized coordinates,
so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1.
**WRAP**
The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M.
In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p.
**MIRROR**
The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2.
In other words, c[l / M] = c[k / M] for any l and k such that (l + k) mod (2 * M - 2) = 0.
Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures
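A hypothetical usage sketch (field name and coordinates are assumptions):
src = pystencils.fields('src: float32[2D]')
interp = LinearInterpolator(src, address_mode='CLAMP')
value = interp.at((0.5, 0.5)) # non-integer access into src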
"""
required_global_declarations = []
def __init__(self,
parent_field,
interpolation_mode: InterpolationMode,
address_mode='BORDER',
use_normalized_coordinates=False,
allow_textures=True):
super().__init__()
self.field = parent_field
self.field.field_type = pystencils.field.FieldType.CUSTOM
self.address_mode = address_mode
self.use_normalized_coordinates = use_normalized_coordinates
self.interpolation_mode = interpolation_mode
self.hash_str = hashlib.md5(
f'{self.field}_{address_mode}_{self.field.dtype}_{interpolation_mode}'.encode()).hexdigest()
self.symbol = _InterpolationSymbol(str(self), parent_field, self)
self.allow_textures = allow_textures
@property
def ndim(self):
return self.field.ndim
@property
def _hashable_contents(self):
return (str(self.address_mode),
str(type(self)),
self.hash_str,
self.use_normalized_coordinates)
def at(self, offset):
return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])
def __getitem__(self, offset):
return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])
def __str__(self):
return f'{self.field.name}_interpolator_{self.reproducible_hash}'
def __repr__(self):
return self.__str__()
def __hash__(self):
return hash(self._hashable_contents)
def __eq__(self, other):
return hash(self) == hash(other)
@property
def reproducible_hash(self):
return _hash(str(self._hashable_contents).encode()).hexdigest()
class LinearInterpolator(Interpolator):
def __init__(self,
parent_field: pystencils.Field,
address_mode='BORDER',
use_normalized_coordinates=False):
super().__init__(parent_field,
InterpolationMode.LINEAR,
address_mode,
use_normalized_coordinates)
class NearestNeightborInterpolator(Interpolator):
def __init__(self,
parent_field: pystencils.Field,
address_mode='BORDER',
use_normalized_coordinates=False):
super().__init__(parent_field,
InterpolationMode.NN,
address_mode,
use_normalized_coordinates)
class InterpolatorAccess(TypedSymbol):
def __new__(cls, field, *offsets, **kwargs):
obj = InterpolatorAccess.__xnew_cached_(cls, field, *offsets, **kwargs)
return obj
def __new_stage2__(cls, symbol, *offsets):
assert offsets is not None
obj = super().__xnew__(cls, '%s_interpolator_%s' %
(symbol.field.name, _hash(str(tuple(offsets)).encode()).hexdigest()),
symbol.field.dtype)
obj.offsets = offsets
obj.symbol = symbol
obj.field = symbol.field
obj.interpolator = symbol.interpolator
return obj
def _hashable_content(self):
return super()._hashable_content() + (self.symbol, self.field, tuple(self.offsets), self.symbol.interpolator)
def __str__(self):
return f"{self.field.name}_interpolator({', '.join(str(o) for o in self.offsets)})"
def __repr__(self):
return self.__str__()
def _latex(self, printer, *_):
n = self.field.latex_name if self.field.latex_name else self.field.name
foo = ", ".join(str(printer.doprint(o)) for o in self.offsets)
return f'{n}_{{interpolator}}\\left({foo}\\right)'
@property
def ndim(self):
return len(self.offsets)
@property
def is_texture(self):
return isinstance(self.interpolator, TextureCachedField)
def atoms(self, *types):
if self.offsets:
offsets = set(o for o in self.offsets if isinstance(o, types))
if isinstance(self, types):
offsets.update([self])
for o in self.offsets:
if hasattr(o, 'atoms'):
offsets.update(set(o.atoms(*types)))
return offsets
else:
return set()
def neighbor(self, coord_id, offset):
offset_list = list(self.offsets)
offset_list[coord_id] += offset
return self.interpolator.at(tuple(offset_list))
@property
def free_symbols(self):
symbols = set()
if self.offsets is not None:
for o in self.offsets:
if hasattr(o, 'free_symbols'):
symbols.update(set(o.free_symbols))
# if hasattr(o, 'atoms'):
# symbols.update(set(o.atoms(sp.Symbol)))
return symbols
@property
def required_global_declarations(self):
required_global_declarations = self.symbol.interpolator.required_global_declarations
if required_global_declarations:
required_global_declarations[0]._symbols_defined.add(self)
return required_global_declarations
@property
def args(self):
return [self.symbol, *self.offsets]
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return {self}
@property
def interpolation_mode(self):
return self.interpolator.interpolation_mode
@property
def _diff_interpolation_vec(self):
return sp.Matrix([DiffInterpolatorAccess(self.symbol, i, *self.offsets)
for i in range(len(self.offsets))])
def diff(self, *symbols, **kwargs):
if symbols == (self,):
return 1
rtn = self._diff_interpolation_vec.T * sp.Matrix(self.offsets).diff(*symbols, **kwargs)
if rtn.shape == (1, 1):
rtn = rtn[0, 0]
return rtn
def implementation_with_stencils(self):
field = self.field
default_int_type = create_type('int64')
use_textures = isinstance(self.interpolator, TextureCachedField)
if use_textures:
def absolute_access(x, _):
return self.symbol.interpolator.at((o for o in x))
else:
absolute_access = field.absolute_access
sum = [0, ] * (field.shape[0] if field.index_dimensions else 1)
offsets = self.offsets
rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1)
for channel_idx in range(field.shape[0] if field.index_dimensions else 1):
if self.interpolation_mode == InterpolationMode.NN:
if use_textures:
sum[channel_idx] = self
else:
sum[channel_idx] = absolute_access([sp.floor(i + 0.5) for i in offsets], channel_idx)
elif self.interpolation_mode == InterpolationMode.LINEAR:
# TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/
for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions):
weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
index = [f(offset) for (f, offset) in zip(c, offsets)]
# Hardware boundary handling on GPU
if use_textures:
weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
sum[channel_idx] += \
weight * absolute_access(index, channel_idx if field.index_dimensions else ())
# else boundary handling using software
elif str(self.interpolator.address_mode).lower() == 'border':
is_inside_field = sp.And(
*itertools.chain([i >= 0 for i in index],
[idx < field.shape[dim] for (dim, idx) in enumerate(index)]))
index = [cast_func(i, default_int_type) for i in index]
sum[channel_idx] += sp.Piecewise(
(weight * absolute_access(index, channel_idx if field.index_dimensions else ()),
is_inside_field),
(sp.simplify(0), True)
)
elif str(self.interpolator.address_mode).lower() == 'clamp':
index = [sp.Min(sp.Max(0, cast_func(i, default_int_type)), field.spatial_shape[dim] - 1)
for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
elif str(self.interpolator.address_mode).lower() == 'wrap':
index = [sp.Mod(cast_func(i, default_int_type), field.shape[dim] - 1)
for (dim, i) in enumerate(index)]
index = [cast_func(sp.Piecewise((i, i > 0),
(sp.Abs(cast_func(field.shape[dim] - 1 + i, default_int_type)),
True)), default_int_type)
for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
# sum[channel_idx] = 0
elif str(self.interpolator.address_mode).lower() == 'mirror':
def triangle_fun(x, half_period):
saw_tooth = cast_func(sp.Abs(cast_func(x, 'int32')), 'int32') % (
cast_func(2 * half_period, create_type('int32')))
return sp.Piecewise((saw_tooth, saw_tooth < half_period),
(2 * half_period - 1 - saw_tooth, True))
index = [cast_func(triangle_fun(i, field.shape[dim]),
default_int_type) for (dim, i) in enumerate(index)]
sum[channel_idx] += weight * \
absolute_access(index, channel_idx if field.index_dimensions else ())
else:
raise NotImplementedError()
elif self.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
raise NotImplementedError("only works with HW interpolation for float32")
sum = [sp.factor(s) for s in sum]
if field.index_dimensions:
return sp.Matrix(sum)
else:
return sum[0]
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.symbol, *self.offsets)
class DiffInterpolatorAccess(InterpolatorAccess):
def __new__(cls, symbol, diff_coordinate_idx, *offsets, **kwargs):
if symbol.interpolator.interpolation_mode == InterpolationMode.LINEAR:
from pystencils.fd import Diff, Discretization2ndOrder
return Discretization2ndOrder(1)(Diff(symbol.interpolator.at(offsets), diff_coordinate_idx))
obj = DiffInterpolatorAccess.__xnew_cached_(cls, symbol, diff_coordinate_idx, *offsets, **kwargs)
return obj
def __new_stage2__(self, symbol: sp.Symbol, diff_coordinate_idx, *offsets):
assert offsets is not None
obj = super().__xnew__(self, symbol, *offsets)
obj.diff_coordinate_idx = diff_coordinate_idx
return obj
def __hash__(self):
return hash((self.symbol, self.field, self.diff_coordinate_idx, tuple(self.offsets), self.interpolator))
def __str__(self):
return '%s_diff%i_interpolator(%s)' % (self.field.name, self.diff_coordinate_idx,
', '.join(str(o) for o in self.offsets))
def __repr__(self):
return str(self)
@property
def args(self):
return [self.symbol, self.diff_coordinate_idx, *self.offsets]
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return {self}
@property
def interpolation_mode(self):
return self.interpolator.interpolation_mode
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return (self.symbol, self.diff_coordinate_idx, *self.offsets)
##########################################################################################
# GPU-specific fast specializations (for precision GPUs can also use above nodes/symbols #
##########################################################################################
class TextureCachedField(Interpolator):
def __init__(self, parent_field,
address_mode=None,
filter_mode=None,
interpolation_mode: InterpolationMode = InterpolationMode.LINEAR,
use_normalized_coordinates=False,
read_as_integer=False
):
# resolve defaults before they are passed on to the base class
if address_mode is None:
address_mode = 'border'
if filter_mode is None:
filter_mode = pycuda.driver.filter_mode.LINEAR
super().__init__(parent_field, interpolation_mode, address_mode, use_normalized_coordinates)
self.read_as_integer = read_as_integer
self.required_global_declarations = [TextureDeclaration(self)]
@property
def ndim(self):
return self.field.ndim
@classmethod
def from_interpolator(cls, interpolator: LinearInterpolator):
if (isinstance(interpolator, cls)
or (hasattr(interpolator, 'allow_textures') and not interpolator.allow_textures)):
return interpolator
obj = cls(interpolator.field, interpolator.address_mode, interpolation_mode=interpolator.interpolation_mode)
return obj
def __str__(self):
return f'{self.field.name}_texture_{self.reproducible_hash}'
def __repr__(self):
return self.__str__()
@property
def reproducible_hash(self):
return _hash(str(self._hashable_contents).encode()).hexdigest()
class TextureDeclaration(Node):
"""
A global declaration of a texture. Visible both for device and host code.
.. code:: cpp
// This Node represents the following global declaration
texture<float, cudaTextureType2D, cudaReadModeElementType> x_texture_5acc9fced7b0dc3e;
__device__ kernel(...) {
// the kernel accesses x_texture_5acc9fced7b0dc3e with tex2d(...)
}
__host__ launch_kernel(...) {
// Host needs to bind the texture
cudaBindTexture(0, x_texture_5acc9fced7b0dc3e, buffer, N*sizeof(float));
}
This has been deprecated by CUDA in favor of :class:`.TextureObject`.
But texture objects are not yet supported by PyCUDA (https://github.com/inducer/pycuda/pull/174)
"""
def __init__(self, parent_texture):
self.texture = parent_texture
self._symbols_defined = {self.texture.symbol}
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return self._symbols_defined
@property
def args(self) -> Set[sp.Symbol]:
return set()
@property
def headers(self):
headers = ['"pycuda-helpers.hpp"']
if self.texture.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
headers.append('"cubicTex%iD.cu"' % self.texture.ndim)
return headers
def __str__(self):
from pystencils.backends.cuda_backend import CudaBackend
return CudaBackend()(self)
def __repr__(self):
return str(self)
class TextureObject(TextureDeclaration):
"""
A CUDA texture object. In contrast to :class:`.TextureDeclaration`, it is not declared globally but
passed as a function argument to the kernel call.
Like :class:`.TextureDeclaration`, it defines :class:`.TextureAccess` symbols;
only the printed representation differs.
"""
pass
def dtype_supports_textures(dtype):
"""
Returns whether CUDA natively supports texture fetches for this numpy dtype.
The maximum word size for a texture fetch is four bytes.
Larger dtypes can still be fetched with this trick:
https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
"""
if hasattr(dtype, 'numpy_dtype'):
dtype = dtype.numpy_dtype
if isinstance(dtype, type):
return dtype().itemsize <= 4
return dtype.itemsize <= 4
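# A small illustration of the rule above (a hedged sketch; the numpy dtypes
# are only examples):
#
#   import numpy as np
#   dtype_supports_textures(np.float32)           # True  (itemsize 4)
#   dtype_supports_textures(np.dtype('float64'))  # False (itemsize 8)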
from .generate_benchmark import generate_benchmark, run_c_benchmark
from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
import subprocess
import warnings
import tempfile
from pathlib import Path
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
Args:
ast: the pystencils AST object as returned by create_kernel
likwid: if True, likwid markers are added to the code
openmp: relevant only if likwid=True; needed to generate correct likwid initialization code
timing: add timing output to the code, prints time per iteration to stdout
Returns:
C code as string
"""
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled
if likwid and openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
jinja_context = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
'includes': includes,
'timing': timing,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
return env.get_template('benchmark.c').render(**jinja_context)
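# A hedged usage sketch (the array shape, field names and update rule are
# illustrative; assumes pystencils' public fields/Assignment/create_kernel API):
#
#   import numpy as np
#   import pystencils as ps
#   arr = np.zeros((30, 30))
#   src, dst = ps.fields(src=arr, dst=arr)  # fixed-shape fields, as required above
#   kernel_ast = ps.create_kernel([ps.Assignment(dst[0, 0], 2 * src[0, 0])])
#   code = generate_benchmark(kernel_ast, timing=True)
#   # `code` is a complete C file whose main() allocates the fields, calls the
#   # kernel and prints the time per iteration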
def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
"""Runs the given kernel with outer loop in C
Args:
ast: pystencils ast which is used to compile the benchmark file
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
path: path where the benchmark file is stored. If None, a temporary folder is created
Returns:
list of times per iteration, one entry for each outer iteration
"""
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
if path is None:
path = tempfile.mkdtemp()
if isinstance(path, str):
path = Path(path)
with open(path / 'bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = Path(kerncraft.__file__).parent
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + str(kerncraft_path / 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
kerncraft_path / 'headers' / 'timing.c',
kerncraft_path / 'headers' / 'dummy.c',
path / 'bench.c',
'-o', path / 'bench',
]
run_compile_step(compile_cmd)
time_pre_estimation_per_iteration = float(subprocess.check_output([str(path / 'bench'), str(10)]))
benchmark_time_limit = 20
if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
f"{benchmark_time_limit} seconds for this kernel")
warnings.warn(warn)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output([str(path / 'bench'), str(inner_iterations)]))
results.append(benchmark_time)
return results
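# A hedged sketch, continuing from the generate_benchmark example above
# (requires a configured C compiler and an installed kerncraft):
#
#   times = run_c_benchmark(kernel_ast, inner_iterations=1000, outer_iterations=3)
#   print(min(times))  # best wall-clock time per iteration, in seconds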
import warnings
import fcntl
from collections import defaultdict
from tempfile import TemporaryDirectory
import textwrap
import itertools
import string
from jinja2 import Environment, PackageLoader, StrictUndefined, Template
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.kernel import symbol_pos_int
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import \
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
from pystencils.cpu.kernelcreation import add_openmp
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: MachineModel,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structure-of-arrays (SoA) or an
array-of-structures (AoS) layout is assumed
debug_print: print debug information
filename: used for caching
"""
super(KernelCode, self).__init__(machine=machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
# A step of 1 is passed here; if the actual step (cur_node.step) were used,
# all accesses within that step length would also have to be passed to kerncraft.
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
# Variables (arrays) and Constants (scalar sizes)
const_names_iter = itertools.product(string.ascii_uppercase, repeat=1)
constants_reversed = {}
fields_accessed = self.kernel_ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
# Replace shape dimensions with constant variables (necessary for layer condition
# analysis)
for i, d in enumerate(permuted_shape):
if d not in self.constants.values():
const_symbol = symbol_pos_int(''.join(next(const_names_iter)))
self.set_constant(const_symbol, d)
constants_reversed[d] = const_symbol
permuted_shape[i] = constants_reversed[d]
self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [symbol_pos_int(LoopOverCoordinate.get_loop_counter_name(i)) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Scalars may be safely ignored
# for param in self.kernel_ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def get_kernel_header(self, name='pystencils_kernel'):
file_name = "pystencils_kernel.h"
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
pass
else: # lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
jinja_context = {
'function_signature': function_signature,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.h').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernel'):
"""
Generate and return compilable source code from AST.
Args:
openmp: if true, openmp code will be generated
name: kernel name
"""
filename = 'pystencils_kernel'
if openmp:
filename += '-omp'
filename += '.c'
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
if openmp:
add_openmp(self.kernel_ast)
kernel_code = generate_c(self.kernel_ast, dialect='c')
jinja_context = {
'includes': includes,
'kernel_code': kernel_code,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
code = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
CODE_TEMPLATE = Template(textwrap.dedent("""
#include <likwid.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include "kerncraft.h"
#include "kernel.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
extern int var_false;
int main(int argc, char **argv) {
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
{%- endfor %}
// Declaring arrays
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
// TODO initialize in parallel context in same order as they are touched
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
{%- endfor %}
likwid_markerInit();
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
// Initializing arrays in same order as touched in kernel loop nest
//INIT_ARRAYS;
// Dummy call
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
likwid_markerStartRegion("loop");
}
for(; repeat > 0; --repeat) {
{{kernelName}}({{call_argument_list}});
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
}
likwid_markerStopRegion("loop");
}
likwid_markerClose();
return 0;
}
"""))
def get_main_code(self, kernel_function_name='kernel'):
"""
Generate and return compilable source code from AST.
:return: tuple of file path and shared lock file pointer
"""
# TODO produce nicer code, including help text and other "comfort features".
assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
"based on a kernel description rather than code."
file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX
# needs update
accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in self.kernel_ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Generate code
code = self.CODE_TEMPLATE.render(
kernelName=self.kernel_ast.function_name,
fields=fields,
constants=constants,
call_argument_list=','.join(call_parameters),
includes=includes)
# Store to file
with open(file_path, 'w') as f:
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
self['incore_model'] = 'OSACA'
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
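# A minimal sketch (hedged; `ast` stands for a pystencils KernelFunction built
# elsewhere): pick the innermost loop, as the constructor above does, and
# collect the resolved field accesses.
#
#   inner = [loop for loop in filtered_tree_iteration(ast, LoopOverCoordinate,
#                                                     stop_type=SympyAssignment)
#            if loop.is_innermost_loop][0]
#   reads, writes = search_resolved_field_accesses_in_ast(inner)
#   print({fa.field.name for fa in reads}, {fa.field.name for fa in writes})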
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
#define FUNC_PREFIX
{{function_signature}}
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
# TODO size of int? unsigned?
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
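# A self-contained sketch of the context manager above (function and block
# names are illustrative; only llvmlite is required): emit IR for a counter
# running from 0 to 9.
#
#   int64 = ir.IntType(64)
#   module = ir.Module(name='loop_demo')
#   fn = ir.Function(module, ir.FunctionType(ir.VoidType(), ()), name='demo')
#   builder = ir.IRBuilder(fn.append_basic_block('entry'))
#   with Loop(builder, ir.Constant(int64, 0), ir.Constant(int64, 10),
#             ir.Constant(int64, 1)) as i:
#       pass  # a real loop body would use `i`, the current counter value
#   builder.ret_void()  # the builder now points at the block after the loop
#   print(module)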
from pystencils.llvm.llvmjit import make_python_function
from pystencils.transformations import insert_casts
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None, target='cpu'):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
Args:
assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
function_name: name of the generated function - only important if generated code is written out
type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean which are assumed to be 'bool' .
split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
iteration_slice: if not None, iteration is done only over this slice of the field
ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
if None, the number of ghost layers is determined automatically and assumed to be equal for
all dimensions
Returns:
:class:`pystencils.ast.KernelFunction` node
"""
if target == 'cpu':
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code._backend = 'llvm'
elif target == 'gpu':
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
code = create_cuda_kernel(assignments,
function_name,
type_info,
iteration_slice=iteration_slice,
ghost_layers=ghost_layers)
code._backend = 'llvm_gpu'
else:
raise NotImplementedError()
code.body = insert_casts(code.body)
code._compile_function = make_python_function
return code
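# A hedged end-to-end sketch (field names and the update rule are illustrative;
# assumes pystencils' public fields/Assignment API):
#
#   import numpy as np
#   import pystencils as ps
#   arr = np.zeros((30, 30))
#   src, dst = ps.fields(src=arr, dst=arr)
#   ast = create_kernel([ps.Assignment(dst[0, 0], 0.25 * src[0, 0])])
#   kernel = make_python_function(ast)  # imported at the top of this module
#   # `kernel` can now be called with numpy arrays for `src` and `dst`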
import functools
import llvmlite.ir as ir
import llvmlite.llvmpy.core as lc
import sympy as sp
from sympy import Indexed, S
from sympy.printing.printer import Printer
from pystencils.assignment import Assignment
from pystencils.data_types import (
collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
to_llvm_type)
from pystencils.llvm.control_flow import Loop
# From Numba
def set_cuda_kernel(lfunc):
from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
m = lfunc.module
ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
md = MetaData.get(m, ops)
nmd = m.get_or_insert_named_metadata('nvvm.annotations')
nmd.add(md)
# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
m.add_named_metadata('nvvmir.version', md_ver)
# From Numba
def _call_sreg(builder, name):
module = builder.module
fnty = lc.Type.function(lc.Type.int(), ())
fn = module.get_or_insert_function(fnty, name=name)
return builder.call(fn, ())
def generate_llvm(ast_node, module=None, builder=None, target='cpu'):
"""Prints the ast as llvm code."""
if module is None:
module = lc.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder, target=target)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, target='cpu', *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
self.target = target
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError(f"Symbol not found: {s}")
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
# KernelFunction does not possess a return type
return_type = self.void
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == 'gpu'))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = parameters[i].symbol.name
self.func_arg_map[parameters[i].symbol.name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
if self.target == 'gpu':
set_cuda_kernel(fn)
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_boolean_cast_func(self, conversion):
return self._print_cast_func(conversion)
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int32"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
if piece.has(Assignment):
raise NotImplementedError('The llvm-backend does not support assignments '
'in the Piecewise function. It is questionable '
'whether to implement it. So far there is no '
'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == 'gpu'))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
def _print_Conditional(self, node):
cond = self._print(node.condition_expr)
with self.builder.if_else(cond) as (then, otherwise):
with then:
self._print(node.true_block) # emit instructions for when the predicate is true
with otherwise:
self._print(node.false_block) # emit instructions for when the predicate is false
# No return!
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
mro = inspect.getmro(expr)
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
# from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
}
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
name = f"{function_name}.{dimension}"
return self.builder.zext(_call_sreg(self.builder, name), self.integer)
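# A minimal sketch (hedged; `ast` stands for a pystencils KernelFunction built
# elsewhere): lower the AST to LLVM IR and inspect the textual module.
# generate_llvm returns the emitted ir.Function; its parent module carries the
# complete IR, as the JIT in llvmjit.py also relies on.
#
#   fn = generate_llvm(ast)  # target defaults to 'cpu'
#   print(fn.module)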
import ctypes as ct
import subprocess
from functools import partial
from itertools import chain
from os.path import exists, join
import llvmlite.binding as llvm
import llvmlite.ir as ir
import numpy as np
from pystencils.data_types import create_composite_type_from_string
from pystencils.field import FieldType
from ..data_types import StructType, ctypes_from_llvm, to_ctypes
from .llvm import generate_llvm
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if param.is_field_parameter:
try:
field_arr = argument_dict[param.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + param.field_name)
symbolic_field = param.fields[0]
if param.is_field_pointer:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(param.field_name, str(field_arr.shape), str(symbolic_field_shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif param.is_field_shape:
data_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
elif param.is_field_stride:
data_type = to_ctypes(param.symbol.dtype)
assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
ct_arguments.append(data_type(item_stride))
else:
assert False
else:
try:
value = argument_dict[param.symbol.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + param.symbol.name)
expected_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(expected_type(value))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
def generate_and_jit(ast):
target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
gen = generate_llvm(ast, target=target)
if isinstance(gen, ir.Module):
return compile_llvm(gen, target, ast)
else:
return compile_llvm(gen.module, target, ast)
def make_python_function(ast, argument_dict={}, func=None):
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module, target='cpu', ast=None):
jit = CudaJit(ast) if target == "gpu" else Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
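# A hedged sketch of the JIT round trip for a CPU kernel, mirroring
# generate_and_jit above (`ast` is a pystencils KernelFunction):
#
#   fn = generate_llvm(ast)
#   jit = compile_llvm(fn.module, target='cpu', ast=ast)
#   kernel_fptr = jit.get_function_ptr(ast.function_name)
#   # kernel_fptr is a ctypes function pointer; it expects ctypes arguments,
#   # e.g. as produced by build_ctypes_argument_list above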
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
# Following code more or less from numba
class CudaJit(Jit):
CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
64: 'nvptx64-nvidia-cuda'}
MACHINE_BITS = tuple.__itemsize__ * 8
data_layout = {
32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
default_data_layout = data_layout[MACHINE_BITS]
def __init__(self, ast):
# super().__init__()
# self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
self._data_layout = self.data_layout[self.MACHINE_BITS]
# self._target_data = llvm.create_target_data(self._data_layout)
self.indexing = ast.indexing
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = False
pmb.slp_vectorize = False
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def parse(self, module):
llvmmod = module
llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
llvmmod.data_layout = self.default_data_layout
llvmmod.verify()
llvmmod.name = 'module'
self._llvmmod = llvm.parse_assembly(str(llvmmod))
def compile(self):
from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
import hashlib
compiler_cache = get_cache_config()['object_cache']
ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
ptx_file = ir_file.replace('.ll', '.ptx')
try:
from pycuda.driver import Context
arch = "sm_%d%d" % Context.get_device().compute_capability()
except Exception:
arch = "sm_35"
if not exists(ptx_file):
self.write_ll(ir_file)
if 'llc' in get_compiler_config():
llc_command = get_compiler_config()['llc']
else:
llc_command = get_llc_command() or 'llc'
subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
# cubin_file = ir_file.replace('.ll', '.cubin')
# if not exists(cubin_file):
# subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
import pycuda.driver
cuda_module = pycuda.driver.module_from_file(ptx_file) # also works: cubin_file
self.cuda_module = cuda_module
def __call__(self, func, *args, **kwargs):
shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
block_and_thread_numbers = self.indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
def get_function_ptr(self, name):
return partial(self.__call__, name)
"""
Default SymPy optimizations applied in pystencils kernels, based on :func:`sympy.codegen.rewriting.optimize`.
"""
import itertools
from pystencils import Assignment
from pystencils.astnodes import SympyAssignment
try:
from sympy.codegen.rewriting import optims_c99, optimize
from sympy.codegen.rewriting import ReplaceOptim
HAS_REWRITING = True
# Evaluates all constant terms
evaluate_constant_terms = ReplaceOptim(
lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
lambda p: p.evalf()
)
optims_pystencils_cpu = [evaluate_constant_terms] + list(optims_c99)
optims_pystencils_gpu = [evaluate_constant_terms] + list(optims_c99)
except ImportError:
from warnings import warn
warn("Could not import ReplaceOptim, optims_c99, optimize from sympy.codegen.rewriting."
"Please update your sympy installation!")
optims_c99 = []
optims_pystencils_cpu = []
optims_pystencils_gpu = []
HAS_REWRITING = False
def optimize_assignments(assignments, optimizations):
if HAS_REWRITING:
assignments = [Assignment(a.lhs, optimize(a.rhs, optimizations))
if hasattr(a, 'lhs')
else a for a in assignments]
assignments_nodes = [a.atoms(SympyAssignment) for a in assignments]
for a in itertools.chain.from_iterable(assignments_nodes):
a.optimize(optimizations)
return assignments
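# A small sketch of the effect of evaluate_constant_terms above (the symbol
# name is illustrative): the constant product 2*pi is folded into a float
# literal.
#
#   import sympy as sp
#   x = sp.Symbol('x')
#   [res] = optimize_assignments([Assignment(x, 2 * sp.pi)], optims_pystencils_cpu)
#   # res.rhs is now approximately 6.283185307, evaluated via evalf()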
def optimize_ast(ast, optimizations):
if HAS_REWRITING:
assignments_nodes = ast.atoms(SympyAssignment)
for a in assignments_nodes:
a.optimize(optimizations)
return ast
"""
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
"""
Automatically initializes an OpenCL context using any available device.
Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
try:
init_globally()
except Exception as e:
import warnings
warnings.warn(str(e))