Compare revisions: showing 0 additions and 3645 deletions.
try:
    import pycuda.gpuarray as gpuarray
except ImportError:
    gpuarray = None

import numpy as np

import pystencils


class PyCudaArrayHandler:

    def __init__(self):
        import pycuda.autoinit  # NOQA

    def zeros(self, shape, dtype=np.float64, order='C'):
        return gpuarray.zeros(shape, dtype, order)

    def ones(self, shape, dtype, order='C'):
        return gpuarray.ones(shape, dtype, order)

    def empty(self, shape, dtype=np.float64, layout=None):
        if layout:
            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
            return self.from_numpy(cpu_array)
        else:
            return gpuarray.empty(shape, dtype)

    def to_gpu(self, array):
        return gpuarray.to_gpu(array)

    def upload(self, gpuarray, numpy_array):
        gpuarray.set(numpy_array)

    def download(self, gpuarray, numpy_array):
        gpuarray.get(numpy_array)

    def randn(self, shape, dtype=np.float64):
        cpu_array = np.random.randn(*shape).astype(dtype)
        return self.from_numpy(cpu_array)
try:
    import pyopencl.array as gpuarray
except ImportError:
    gpuarray = None

import numpy as np

import pystencils


class PyOpenClArrayHandler:

    def __init__(self, queue):
        if not queue:
            from pystencils.opencl.opencljit import get_global_cl_queue
            queue = get_global_cl_queue()
        assert queue, "OpenCL queue missing"
        self.queue = queue

    def zeros(self, shape, dtype=np.float64, order='C'):
        # pyopencl.array.zeros expects the command queue as its first argument
        return gpuarray.zeros(self.queue, shape, dtype, order)

    def ones(self, shape, dtype, order='C'):
        return gpuarray.ones(self.queue, shape, dtype, order)

    def empty(self, shape, dtype=np.float64, layout=None):
        if layout:
            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
            return self.from_numpy(cpu_array)
        else:
            return gpuarray.empty(self.queue, shape, dtype)

    def to_gpu(self, array):
        return gpuarray.to_device(self.queue, array)

    def upload(self, gpuarray, numpy_array):
        gpuarray.set(numpy_array, self.queue)

    def download(self, gpuarray, numpy_array):
        gpuarray.get(self.queue, numpy_array)

    def randn(self, shape, dtype=np.float64):
        cpu_array = np.random.randn(*shape).astype(dtype)
        return self.from_numpy(cpu_array)
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
from os.path import dirname, isdir, join

import numpy as np

try:
    import pycuda.driver as cuda
    from pycuda import gpuarray
except Exception:
    pass


def pow_two_divider(n):
    if n == 0:
        return 0
    divider = 1
    while (n & divider) == 0:
        divider <<= 1
    return divider


def ndarray_to_tex(tex_ref,
                   ndarray,
                   address_mode=None,
                   filter_mode=None,
                   use_normalized_coordinates=False,
                   read_as_integer=False):
    if address_mode is None:
        address_mode = cuda.address_mode.BORDER
    if filter_mode is None:
        filter_mode = cuda.filter_mode.LINEAR

    if isinstance(ndarray, np.ndarray):
        cu_array = cuda.np_to_array(ndarray, 'C')
    elif isinstance(ndarray, gpuarray.GPUArray):
        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
    else:
        raise TypeError(
            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')

    cuda.TextureReference.set_array(tex_ref, cu_array)

    tex_ref.set_address_mode(0, address_mode)
    if ndarray.ndim >= 2:
        tex_ref.set_address_mode(1, address_mode)
    if ndarray.ndim >= 3:
        tex_ref.set_address_mode(2, address_mode)
    tex_ref.set_filter_mode(filter_mode)

    if not use_normalized_coordinates:
        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES)

    if not read_as_integer:
        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER)


def prefilter_for_cubic_bspline(gpuarray):
    import pycuda.autoinit  # NOQA
    from pycuda.compiler import SourceModule

    ndim = gpuarray.ndim
    assert ndim == 2 or ndim == 3, "Only 2d or 3d supported"
    assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \
        "Submodule CubicInterpolationCUDA does not exist"
    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]

    file_name = join(dirname(__file__), "CubicInterpolationCUDA", "code", "cubicPrefilter%iD.cu" % ndim)
    with open(file_name) as file:
        code = file.read()

    mod = SourceModule(code, options=nvcc_options)

    if ndim == 2:
        height, width = gpuarray.shape
        block = min(pow_two_divider(height), 64)
        grid = height // block
        func = mod.get_function('SamplesToCoefficients2DXf')
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=(block, 1, 1),
             grid=(grid, 1, 1))

        block = min(pow_two_divider(width), 64)
        grid = width // block
        func = mod.get_function('SamplesToCoefficients2DYf')
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=(block, 1, 1),
             grid=(grid, 1, 1))
    elif ndim == 3:
        depth, height, width = gpuarray.shape
        dimX = min(min(pow_two_divider(width), pow_two_divider(height)), 64)
        # integer division keeps the CUDA block dimensions integral
        dimY = min(min(pow_two_divider(depth), pow_two_divider(height)), 512 // dimX)
        block = (dimX, dimY, 1)
        dimGridX = (height // block[0], depth // block[1], 1)
        dimGridY = (width // block[0], depth // block[1], 1)
        dimGridZ = (width // block[0], height // block[1], 1)

        func = mod.get_function("SamplesToCoefficients3DXf")
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=block,
             grid=dimGridX)

        func = mod.get_function("SamplesToCoefficients3DYf")
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=block,
             grid=dimGridY)

        func = mod.get_function("SamplesToCoefficients3DZf")
        func(gpuarray, np.uint32(gpuarray.strides[-2]),
             *(np.uint32(r) for r in reversed(gpuarray.shape)),
             block=block,
             grid=dimGridZ)
#pragma once
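// Host/device view of a pystencils field: a raw data pointer plus per-dimension
// shape and stride (plain arrays in device code, std::array on the host where
// <array> is available).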
extern "C++" {
#ifdef __CUDA_ARCH__
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
#if !defined(__AES__) || !defined(__SSE4_1__)
#error AES-NI and SSE4.1 need to be enabled
#endif
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX512VL__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
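// Counter-based RNG built on AES-NI: the 128-bit counter is whitened with the
// 128-bit key and then passed through ten AES rounds (all reusing the same round
// key); the resulting ciphertext is mapped to uniform floats/doubles below.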
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
__m128i x = _mm_xor_si128(k, in);
x = _mm_aesenc_si128(x, k); // 1
x = _mm_aesenc_si128(x, k); // 2
x = _mm_aesenc_si128(x, k); // 3
x = _mm_aesenc_si128(x, k); // 4
x = _mm_aesenc_si128(x, k); // 5
x = _mm_aesenc_si128(x, k); // 6
x = _mm_aesenc_si128(x, k); // 7
x = _mm_aesenc_si128(x, k); // 8
x = _mm_aesenc_si128(x, k); // 9
x = _mm_aesenclast_si128(x, k); // 10
return x;
}
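// Unsigned 32-bit to float conversion for plain SSE (no _mm_cvtepu32_ps before
// AVX-512VL): convert the value halved by a logical shift, double the result and
// add back the dropped low bit, so the full uint32 range converts without sign errors.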
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
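// Unsigned 64-bit to double conversion for plain SSE: bias the high and low 32-bit
// halves with the constants 2^84 and 2^52 so the integer bits land in the mantissa,
// then recombine. The subtraction/addition order must not be reassociated, hence the
// no-associative-math attribute below.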
#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
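// Usage sketch (counter and key words provided by the caller; names are illustrative):
//   double d1, d2;
//   aesni_double2(ctr0, ctr1, ctr2, ctr3, key0, key1, key2, key3, d1, d2);           // two uniform doubles in (0, 1)
//   float f1, f2, f3, f4;
//   aesni_float4(ctr0, ctr1, ctr2, ctr3, key0, key1, key2, key3, f1, f2, f3, f4);    // four uniform floats in (0, 1)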
// An implementation of C++ std::complex for use on CUDA devices.
// Written by John C. Travers <jtravs@gmail.com> (2012)
//
// Missing:
// - long double support (not supported on CUDA)
// - some integral pow functions (due to lack of C++11 support on CUDA)
//
// Heavily derived from the LLVM libcpp project (svn revision 147853).
// Based on libcxx/include/complex.
// The git history contains the complete change history from the original.
// The modifications are licensed as per the original LLVM license below.
//
// -*- C++ -*-
//===--------------------------- complex ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
extern "C++" {
#ifndef CUDA_COMPLEX_HPP
#define CUDA_COMPLEX_HPP
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif
/*
complex synopsis
template<class T>
class complex
{
public:
typedef T value_type;
complex(const T& re = T(), const T& im = T());
complex(const complex&);
template<class X> complex(const complex<X>&);
T real() const;
T imag() const;
void real(T);
void imag(T);
complex<T>& operator= (const T&);
complex<T>& operator+=(const T&);
complex<T>& operator-=(const T&);
complex<T>& operator*=(const T&);
complex<T>& operator/=(const T&);
complex& operator=(const complex&);
template<class X> complex<T>& operator= (const complex<X>&);
template<class X> complex<T>& operator+=(const complex<X>&);
template<class X> complex<T>& operator-=(const complex<X>&);
template<class X> complex<T>& operator*=(const complex<X>&);
template<class X> complex<T>& operator/=(const complex<X>&);
};
template<>
class complex<float>
{
public:
typedef float value_type;
constexpr complex(float re = 0.0f, float im = 0.0f);
explicit constexpr complex(const complex<double>&);
constexpr float real() const;
void real(float);
constexpr float imag() const;
void imag(float);
complex<float>& operator= (float);
complex<float>& operator+=(float);
complex<float>& operator-=(float);
complex<float>& operator*=(float);
complex<float>& operator/=(float);
complex<float>& operator=(const complex<float>&);
template<class X> complex<float>& operator= (const complex<X>&);
template<class X> complex<float>& operator+=(const complex<X>&);
template<class X> complex<float>& operator-=(const complex<X>&);
template<class X> complex<float>& operator*=(const complex<X>&);
template<class X> complex<float>& operator/=(const complex<X>&);
};
template<>
class complex<double>
{
public:
typedef double value_type;
constexpr complex(double re = 0.0, double im = 0.0);
constexpr complex(const complex<float>&);
constexpr double real() const;
void real(double);
constexpr double imag() const;
void imag(double);
complex<double>& operator= (double);
complex<double>& operator+=(double);
complex<double>& operator-=(double);
complex<double>& operator*=(double);
complex<double>& operator/=(double);
complex<double>& operator=(const complex<double>&);
template<class X> complex<double>& operator= (const complex<X>&);
template<class X> complex<double>& operator+=(const complex<X>&);
template<class X> complex<double>& operator-=(const complex<X>&);
template<class X> complex<double>& operator*=(const complex<X>&);
template<class X> complex<double>& operator/=(const complex<X>&);
};
// 26.3.6 operators:
template<class T> complex<T> operator+(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&, const T&);
template<class T> complex<T> operator+(const T&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&, const T&);
template<class T> complex<T> operator-(const T&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator*(const complex<T>&, const T&);
template<class T> complex<T> operator*(const T&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const complex<T>&);
template<class T> complex<T> operator/(const complex<T>&, const T&);
template<class T> complex<T> operator/(const T&, const complex<T>&);
template<class T> complex<T> operator+(const complex<T>&);
template<class T> complex<T> operator-(const complex<T>&);
template<class T> bool operator==(const complex<T>&, const complex<T>&);
template<class T> bool operator==(const complex<T>&, const T&);
template<class T> bool operator==(const T&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const complex<T>&);
template<class T> bool operator!=(const complex<T>&, const T&);
template<class T> bool operator!=(const T&, const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>&, complex<T>&);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>&, const complex<T>&);
// 26.3.7 values:
template<class T> T real(const complex<T>&);
double real(double);
template<Integral T> double real(T);
float real(float);
template<class T> T imag(const complex<T>&);
double imag(double);
template<Integral T> double imag(T);
float imag(float);
template<class T> T abs(const complex<T>&);
template<class T> T arg(const complex<T>&);
double arg(double);
template<Integral T> double arg(T);
float arg(float);
template<class T> T norm(const complex<T>&);
double norm(double);
template<Integral T> double norm(T);
float norm(float);
template<class T> complex<T> conj(const complex<T>&);
complex<double> conj(double);
template<Integral T> complex<double> conj(T);
complex<float> conj(float);
template<class T> complex<T> proj(const complex<T>&);
complex<double> proj(double);
template<Integral T> complex<double> proj(T);
complex<float> proj(float);
template<class T> complex<T> polar(const T&, const T& = 0);
// 26.3.8 transcendentals:
template<class T> complex<T> acos(const complex<T>&);
template<class T> complex<T> asin(const complex<T>&);
template<class T> complex<T> atan(const complex<T>&);
template<class T> complex<T> acosh(const complex<T>&);
template<class T> complex<T> asinh(const complex<T>&);
template<class T> complex<T> atanh(const complex<T>&);
template<class T> complex<T> cos (const complex<T>&);
template<class T> complex<T> cosh (const complex<T>&);
template<class T> complex<T> exp (const complex<T>&);
template<class T> complex<T> log (const complex<T>&);
template<class T> complex<T> log10(const complex<T>&);
template<class T> complex<T> pow(const complex<T>&, const T&);
template<class T> complex<T> pow(const complex<T>&, const complex<T>&);
template<class T> complex<T> pow(const T&, const complex<T>&);
template<class T> complex<T> sin (const complex<T>&);
template<class T> complex<T> sinh (const complex<T>&);
template<class T> complex<T> sqrt (const complex<T>&);
template<class T> complex<T> tan (const complex<T>&);
template<class T> complex<T> tanh (const complex<T>&);
template<class T, class charT, class traits>
basic_istream<charT, traits>&
operator>>(basic_istream<charT, traits>& is, complex<T>& x);
template<class T, class charT, class traits>
basic_ostream<charT, traits>&
operator<<(basic_ostream<charT, traits>& o, const complex<T>& x);
*/
#include <math.h>
#include <sstream>
template <class _Tp> class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp> &__z, const complex<_Tp> &__w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp> &__x, const complex<_Tp> &__y);
template <class _Tp> class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
CUDA_CALLABLE_MEMBER
complex(const value_type &__re = value_type(),
const value_type &__im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex(const complex<_Xp> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
CUDA_CALLABLE_MEMBER value_type real() const { return __re_; }
CUDA_CALLABLE_MEMBER value_type imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(const value_type &__re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(const value_type &__re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(const value_type &__re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(const value_type &__re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(const value_type &__re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double>;
template <> class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(float __re = 0.0f,
float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit /*constexpr*/ complex(const complex<double> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER float real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER float imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(float __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(float __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
template <> class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
/*constexpr*/ CUDA_CALLABLE_MEMBER complex(double __re = 0.0,
double __im = 0.0)
: __re_(__re), __im_(__im) {}
/*constexpr*/ complex(const complex<float> &__c);
/*constexpr*/ CUDA_CALLABLE_MEMBER double real() const { return __re_; }
/*constexpr*/ CUDA_CALLABLE_MEMBER double imag() const { return __im_; }
CUDA_CALLABLE_MEMBER void real(value_type __re) { __re_ = __re; }
CUDA_CALLABLE_MEMBER void imag(value_type __im) { __im_ = __im; }
CUDA_CALLABLE_MEMBER complex &operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator+=(double __re) {
__re_ += __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator-=(double __re) {
__re_ -= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
CUDA_CALLABLE_MEMBER complex &operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator=(const complex<_Xp> &__c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator+=(const complex<_Xp> &__c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator-=(const complex<_Xp> &__c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator*=(const complex<_Xp> &__c) {
*this = *this * __c;
return *this;
}
template <class _Xp>
CUDA_CALLABLE_MEMBER complex &operator/=(const complex<_Xp> &__c) {
*this = *this / __c;
return *this;
}
};
// constexpr
inline CUDA_CALLABLE_MEMBER complex<float>::complex(const complex<double> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// constexpr
inline CUDA_CALLABLE_MEMBER complex<double>::complex(const complex<float> &__c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const complex<_Tp> &__x,
const _Tp &__y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator*(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__z,
const complex<_Tp> &__w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <>
CUDA_CALLABLE_MEMBER complex<float> operator/(const complex<float> &__z,
const complex<float> &__w) {
int __ilogbw = 0;
float __a = __z.real();
float __b = __z.imag();
float __c = __w.real();
float __d = __w.imag();
float __logbw = logbf(fmaxf(fabsf(__c), fabsf(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbnf(__c, -__ilogbw);
__d = scalbnf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float __x = scalbnf((__a * __c + __b * __d) / __denom, -__ilogbw);
float __y = scalbnf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == float(0)) && (!isnan(__a) || !isnan(__b))) {
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = copysignf(INFINITY, __c) * __a;
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = copysignf(INFINITY, __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysignf(isinf(__a) ? float(1) : float(0), __a);
__b = copysignf(isinf(__b) ? float(1) : float(0), __b);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__x = INFINITY * (__a * __c + __b * __d);
#pragma warning(suppress : 4756) // Ignore INFINITY related warning
__y = INFINITY * (__b * __c - __a * __d);
} else if (isinf(__logbw) && __logbw > float(0) && isfinite(__a) &&
isfinite(__b)) {
__c = copysignf(isinf(__c) ? float(1) : float(0), __c);
__d = copysignf(isinf(__d) ? float(1) : float(0), __d);
__x = float(0) * (__a * __c + __b * __d);
__y = float(0) * (__b * __c - __a * __d);
}
}
return complex<float>(__x, __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const complex<_Tp> &__x,
const _Tp &__y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator/(const _Tp &__x,
const complex<_Tp> &__y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator+(const complex<_Tp> &__x) {
return __x;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> operator-(const complex<_Tp> &__x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const complex<_Tp> &__x,
const _Tp &__y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator==(const _Tp &__x,
const complex<_Tp> &__y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const complex<_Tp> &__x,
const _Tp &__y) {
return !(__x == __y);
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER bool operator!=(const _Tp &__x,
const complex<_Tp> &__y) {
return !(__x == __y);
}
// 26.3.7 values:
// real
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp real(const complex<_Tp> &__c) {
return __c.real();
}
inline CUDA_CALLABLE_MEMBER double real(double __re) { return __re; }
inline CUDA_CALLABLE_MEMBER float real(float __re) { return __re; }
// imag
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp imag(const complex<_Tp> &__c) {
return __c.imag();
}
inline CUDA_CALLABLE_MEMBER double imag(double __re) { return 0; }
inline CUDA_CALLABLE_MEMBER float imag(float __re) { return 0; }
// abs
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp abs(const complex<_Tp> &__c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp arg(const complex<_Tp> &__c) {
return atan2(__c.imag(), __c.real());
}
inline CUDA_CALLABLE_MEMBER double arg(double __re) { return atan2(0., __re); }
inline CUDA_CALLABLE_MEMBER float arg(float __re) { return atan2f(0.F, __re); }
// norm
template <class _Tp>
inline CUDA_CALLABLE_MEMBER _Tp norm(const complex<_Tp> &__c) {
if (isinf(__c.real()))
return fabs(__c.real());
if (isinf(__c.imag()))
return fabs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
inline CUDA_CALLABLE_MEMBER double norm(double __re) { return __re * __re; }
inline CUDA_CALLABLE_MEMBER float norm(float __re) { return __re * __re; }
// conj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> conj(const complex<_Tp> &__c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
inline CUDA_CALLABLE_MEMBER complex<double> conj(double __re) {
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> conj(float __re) {
return complex<float>(__re);
}
// proj
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> proj(const complex<_Tp> &__c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
inline CUDA_CALLABLE_MEMBER complex<double> proj(double __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<double>(__re);
}
inline CUDA_CALLABLE_MEMBER complex<float> proj(float __re) {
if (isinf(__re))
__re = fabs(__re);
return complex<float>(__re);
}
// polar
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> polar(const _Tp &__rho,
const _Tp &__theta = _Tp(0)) {
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log(const complex<_Tp> &__x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> log10(const complex<_Tp> &__x) {
return log(__x) / log(_Tp(10));
}
// sqrt
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sqrt(const complex<_Tp> &__x) {
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
if (__x.real() > _Tp(0))
return complex<_Tp>(__x.real(), isnan(__x.imag())
? __x.imag()
: copysign(_Tp(0), __x.imag()));
return complex<_Tp>(isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> exp(const complex<_Tp> &__x) {
_Tp __i = __x.imag();
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
} else if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const complex<_Tp> &__y) {
return exp(__y * log(__x));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const complex<_Tp> &__x,
const _Tp &__y) {
return pow(__x, complex<_Tp>(__y));
}
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> pow(const _Tp &__x,
const complex<_Tp> &__y) {
return pow(complex<_Tp>(__x), __y);
}
// asinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asinh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(copysign(__x.imag(), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) + _Tp(1)));
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// acosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acosh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
if (isinf(__x.imag()))
if (__x.real() > 0)
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
else
return complex<_Tp>(-__x.real(),
copysign(__pi * _Tp(0.75), __x.imag()));
if (__x.real() < 0)
return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), __x.real());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(fabs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
return complex<_Tp>(copysign(__z.real(), _Tp(0)),
copysign(__z.imag(), __x.imag()));
}
// atanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atanh(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.imag())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (isnan(__x.imag())) {
if (isinf(__x.real()) || __x.real() == 0)
return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
return complex<_Tp>(__x.imag(), __x.imag());
}
if (isnan(__x.real())) {
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.real())) {
return complex<_Tp>(copysign(_Tp(0), __x.real()),
copysign(__pi / _Tp(2), __x.imag()));
}
if (fabs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
copysign(_Tp(0), __x.imag()));
}
complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
return complex<_Tp>(copysign(__z.real(), __x.real()),
copysign(__z.imag(), __x.imag()));
}
// sinh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sinh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.imag() == 0 && !isfinite(__x.real()))
return __x;
return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
cosh(__x.real()) * sin(__x.imag()));
}
// cosh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> cosh(const complex<_Tp> &__x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(fabs(__x.real()), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(_Tp(NAN), __x.real());
if (__x.real() == 0 && __x.imag() == 0)
return complex<_Tp>(_Tp(1), __x.imag());
if (__x.imag() == 0 && !isfinite(__x.real()))
return complex<_Tp>(fabs(__x.real()), __x.imag());
return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
sinh(__x.real()) * sin(__x.imag()));
}
// tanh
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tanh(const complex<_Tp> &__x) {
if (isinf(__x.real())) {
if (!isfinite(__x.imag()))
return complex<_Tp>(_Tp(1), _Tp(0));
return complex<_Tp>(_Tp(1), copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
}
if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __2r(_Tp(2) * __x.real());
_Tp __2i(_Tp(2) * __x.imag());
_Tp __d(cosh(__2r) + cos(__2i));
return complex<_Tp>(sinh(__2r) / __d, sin(__2i) / __d);
}
// asin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> asin(const complex<_Tp> &__x) {
complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// acos
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> acos(const complex<_Tp> &__x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (isinf(__x.imag())) {
if (__x.real() < _Tp(0))
return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
}
if (__x.real() < _Tp(0))
return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), -__x.imag());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
if (__x.real() == 0)
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
complex<_Tp> __z = log(__x + sqrt(pow(__x, _Tp(2)) - _Tp(1)));
if (signbit(__x.imag()))
return complex<_Tp>(fabs(__z.imag()), fabs(__z.real()));
return complex<_Tp>(fabs(__z.imag()), -fabs(__z.real()));
}
// atan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> atan(const complex<_Tp> &__x) {
complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// sin
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> sin(const complex<_Tp> &__x) {
complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// cos
template <class _Tp>
inline CUDA_CALLABLE_MEMBER complex<_Tp> cos(const complex<_Tp> &__x) {
return cosh(complex<_Tp>(-__x.imag(), __x.real()));
}
// tan
template <class _Tp>
CUDA_CALLABLE_MEMBER complex<_Tp> tan(const complex<_Tp> &__x) {
complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
template <class _Tp, class _CharT, class _Traits>
std::basic_istream<_CharT, _Traits> &
operator>>(std::basic_istream<_CharT, _Traits> &__is, complex<_Tp> &__x) {
if (__is.good()) {
ws(__is);
if (__is.peek() == _CharT('(')) {
__is.get();
_Tp __r;
__is >> __r;
if (!__is.fail()) {
ws(__is);
_CharT __c = __is.peek();
if (__c == _CharT(',')) {
__is.get();
_Tp __i;
__is >> __i;
if (!__is.fail()) {
ws(__is);
__c = __is.peek();
if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, __i);
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else if (__c == _CharT(')')) {
__is.get();
__x = complex<_Tp>(__r, _Tp(0));
} else
__is.setstate(std::ios_base::failbit);
} else
__is.setstate(std::ios_base::failbit);
} else {
_Tp __r;
__is >> __r;
if (!__is.fail())
__x = complex<_Tp>(__r, _Tp(0));
else
__is.setstate(std::ios_base::failbit);
}
} else
__is.setstate(std::ios_base::failbit);
return __is;
}
template <class _Tp, class _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits> &
operator<<(std::basic_ostream<_CharT, _Traits> &__os, const complex<_Tp> &__x) {
std::basic_ostringstream<_CharT, _Traits> __s;
__s.flags(__os.flags());
__s.imbue(__os.getloc());
__s.precision(__os.precision());
__s << '(' << __x.real() << ',' << __x.imag() << ')';
return __os << __s.str();
}
//} // close namespace cuda_complex
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>{real(complexNumber) * scalar, imag(complexNumber) * scalar};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator*(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{real(complexNumber) * scalar, imag(complexNumber) * scalar};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>{real(complexNumber) + scalar, imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator+(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{real(complexNumber) + scalar, imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const complex<U> &complexNumber,
const V &scalar) -> complex<U> {
return complex<U>{real(complexNumber) - scalar, imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator-(const V &scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{scalar - real(complexNumber), imag(complexNumber)};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const complex<U> &complexNumber,
const V scalar) -> complex<U> {
return complex<U>{real(complexNumber) / scalar, imag(complexNumber) / scalar};
}
template <class U, class V>
CUDA_CALLABLE_MEMBER auto operator/(const V scalar,
const complex<U> &complexNumber)
-> complex<U> {
return complex<U>{scalar, 0} / complexNumber;
}
using ComplexDouble = complex<double>;
using ComplexFloat = complex<float>;
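// Usage sketch (illustrative variable names), valid in host and device code:
//   ComplexFloat a(1.0f, 2.0f), b(0.5f, -1.0f);
//   ComplexFloat c = a * b + conj(a);
//   float magnitude = abs(c);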
#endif // CUDA_COMPLEX_HPP
}
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
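// OpenCL C has no <cstdint>; these typedefs provide fixed-width names based on the
// integer sizes the OpenCL C specification guarantees (char: 8, short: 16, int: 32,
// long: 64 bits).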
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
#include <cstdint>
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
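// Philox 4x32-10 counter-based RNG (Salmon et al., "Parallel Random Numbers: As Easy
// as 1, 2, 3"): ten rounds of 32x32 -> 64-bit multiplies, with the two key words bumped
// by the Weyl constants between rounds, scramble a 128-bit counter into four 32-bit
// outputs, which are then mapped to uniform doubles or floats.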
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
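// Usage sketch (counter and key words provided by the caller; names are illustrative):
//   float r1, r2, r3, r4;
//   philox_float4(ctr0, ctr1, ctr2, ctr3, key0, key1, r1, r2, r3, r4);   // four uniform floats in (0, 1)
//   double d1, d2;
//   philox_double2(ctr0, ctr1, ctr2, ctr3, key0, key1, d1, d2);          // two uniform doubles in (0, 1)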
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import hashlib
import itertools
from enum import Enum
from typing import Set

import sympy as sp
from sympy.core.cache import cacheit

import pystencils
from pystencils.astnodes import Node
from pystencils.data_types import TypedSymbol, cast_func, create_type

try:
    import pycuda.driver
except Exception:
    pass

_hash = hashlib.md5


class InterpolationMode(str, Enum):
    NEAREST_NEIGHBOR = "nearest_neighbour"
    NN = NEAREST_NEIGHBOR
    LINEAR = "linear"
    CUBIC_SPLINE = "cubic_spline"
class Interpolator(object):
    """
    Implements non-integer accesses on fields using linear interpolation.

    On GPU, this interpolator can be implemented by a :class:`.TextureCachedField` for hardware acceleration.

    Address modes describe how accesses outside the field are handled; the possible choices mirror those
    of CUDA textures:

    **CLAMP**

    The signal c[k] is continued outside k=0,...,M-1 so that c[k] = c[0] for k < 0, and c[k] = c[M-1] for k >= M.

    **BORDER**

    The signal c[k] is continued outside k=0,...,M-1 so that c[k] = 0 for k < 0 and for k >= M.

    Now, to describe the last two address modes, we are forced to consider normalized coordinates,
    so that the 1D input signal samples are assumed to be c[k / M], with k=0,...,M-1.

    **WRAP**

    The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to M.
    In other words, c[(k + p * M) / M] = c[k / M] for any (positive, negative or vanishing) integer p.

    **MIRROR**

    The signal c[k / M] is continued outside k=0,...,M-1 so that it is periodic with period equal to 2 * M - 2.
    In other words, c[l / M] = c[k / M] for any l and k such that (l + k) mod (2 * M - 2) = 0.

    Explanations from https://stackoverflow.com/questions/19020963/the-different-addressing-modes-of-cuda-textures
    """
    required_global_declarations = []

    def __init__(self,
                 parent_field,
                 interpolation_mode: InterpolationMode,
                 address_mode='BORDER',
                 use_normalized_coordinates=False,
                 allow_textures=True):
        super().__init__()

        self.field = parent_field.new_field_with_different_name(parent_field.name)
        self.field.field_type = pystencils.field.FieldType.CUSTOM
        self.address_mode = address_mode
        self.use_normalized_coordinates = use_normalized_coordinates
        hash_str = "%x" % abs(hash(self.field) + hash(address_mode))
        self.symbol = TypedSymbol('dummy_symbol_carrying_field' + self.field.name + hash_str,
                                  'dummy_symbol_carrying_field' + self.field.name + hash_str)
        self.symbol.field = self.field
        self.symbol.interpolator = self
        self.allow_textures = allow_textures
        self.interpolation_mode = interpolation_mode

    @property
    def _hashable_contents(self):
        return (str(self.address_mode),
                str(type(self)),
                self.symbol,
                self.address_mode,
                self.use_normalized_coordinates)

    def at(self, offset):
        return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])

    def __getitem__(self, offset):
        return InterpolatorAccess(self.symbol, *[sp.S(o) for o in offset])

    def __str__(self):
        return '%s_interpolator_%s' % (self.field.name, self.reproducible_hash)

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self._hashable_contents)

    @property
    def reproducible_hash(self):
        return _hash(str(self._hashable_contents).encode()).hexdigest()
class LinearInterpolator(Interpolator):

    def __init__(self,
                 parent_field: pystencils.Field,
                 address_mode='BORDER',
                 use_normalized_coordinates=False):
        super().__init__(parent_field,
                         InterpolationMode.LINEAR,
                         address_mode,
                         use_normalized_coordinates)


class NearestNeightborInterpolator(Interpolator):

    def __init__(self,
                 parent_field: pystencils.Field,
                 address_mode='BORDER',
                 use_normalized_coordinates=False):
        super().__init__(parent_field,
                         InterpolationMode.NN,
                         address_mode,
                         use_normalized_coordinates)
class InterpolatorAccess(TypedSymbol):
    def __new__(cls, field, offsets, *args, **kwargs):
        obj = TextureAccess.__xnew_cached_(cls, field, offsets, *args, **kwargs)
        return obj

    def __new_stage2__(self, symbol, *offsets):
        assert offsets is not None
        obj = super().__xnew__(self, '%s_interpolator_%x' %
                               (symbol.field.name, abs(hash(tuple(offsets)))), symbol.field.dtype)
        obj.offsets = offsets
        obj.symbol = symbol
        obj.field = symbol.field
        obj.interpolator = symbol.interpolator
        return obj

    def __hash__(self):
        return hash((self.symbol, self.field, tuple(self.offsets), self.interpolator))

    def __str__(self):
        return '%s_interpolator(%s)' % (self.field.name, ','.join(str(o) for o in self.offsets))

    def __repr__(self):
        return self.__str__()

    def atoms(self, *types):
        if self.offsets:
            offsets = set(o for o in self.offsets if isinstance(o, types))
            if isinstance(self, *types):
                offsets.update([self])
            for o in self.offsets:
                if hasattr(o, 'atoms'):
                    offsets.update(set(o.atoms(*types)))
            return offsets
        else:
            return set()

    @property
    def free_symbols(self):
        symbols = set()
        if self.offsets is not None:
            for o in self.offsets:
                if hasattr(o, 'free_symbols'):
                    symbols.update(set(o.free_symbols))
                # if hasattr(o, 'atoms'):
                #     symbols.update(set(o.atoms(sp.Symbol)))
        return symbols

    @property
    def args(self):
        return [self.symbol, *self.offsets]

    @property
    def symbols_defined(self) -> Set[sp.Symbol]:
        return {self}

    @property
    def interpolation_mode(self):
        return self.interpolator.interpolation_mode

    def implementation_with_stencils(self):
        field = self.field

        default_int_type = create_type('int64')
        use_textures = isinstance(self, TextureAccess)
        if use_textures:
            def absolute_access(x, _):
                return self.texture.at((o for o in x))
        else:
            absolute_access = field.absolute_access

        sum = [0, ] * (field.shape[0] if field.index_dimensions else 1)

        offsets = self.offsets
        rounding_functions = (sp.floor, lambda x: sp.floor(x) + 1)

        for channel_idx in range(field.shape[0] if field.index_dimensions else 1):
            if self.interpolation_mode == InterpolationMode.NN:
                if use_textures:
                    sum[channel_idx] = self
                else:
                    sum[channel_idx] = absolute_access([sp.floor(i + 0.5) for i in offsets], channel_idx)

            elif self.interpolation_mode == InterpolationMode.LINEAR:
                # TODO optimization: implement via lerp: https://devblogs.nvidia.com/lerp-faster-cuda/
                for c in itertools.product(rounding_functions, repeat=field.spatial_dimensions):
                    weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
                    index = [f(offset) for (f, offset) in zip(c, offsets)]
                    # Hardware boundary handling on GPU
                    if use_textures:
                        weight = sp.Mul(*[1 - sp.Abs(f(offset) - offset) for (f, offset) in zip(c, offsets)])
                        sum[channel_idx] += \
                            weight * absolute_access(index, channel_idx if field.index_dimensions else ())
                    # else boundary handling using software
                    elif str(self.interpolator.address_mode).lower() == 'border':
                        is_inside_field = sp.And(
                            *itertools.chain([i >= 0 for i in index],
                                             [idx < field.shape[dim] for (dim, idx) in enumerate(index)]))
                        index = [cast_func(i, default_int_type) for i in index]
                        sum[channel_idx] += sp.Piecewise(
                            (weight * absolute_access(index, channel_idx if field.index_dimensions else ()),
                             is_inside_field),
                            (sp.simplify(0), True)
                        )
                    elif str(self.interpolator.address_mode).lower() == 'clamp':
                        index = [sp.Min(sp.Max(0, cast_func(i, default_int_type)), field.spatial_shape[dim] - 1)
                                 for (dim, i) in enumerate(index)]
                        sum[channel_idx] += weight * \
                            absolute_access(index, channel_idx if field.index_dimensions else ())
                    elif str(self.interpolator.address_mode).lower() == 'wrap':
                        index = [sp.Mod(cast_func(i, default_int_type), field.shape[dim] - 1)
                                 for (dim, i) in enumerate(index)]
                        index = [cast_func(sp.Piecewise((i, i > 0),
                                                        (sp.Abs(cast_func(field.shape[dim] - 1 + i, default_int_type)),
                                                         True)), default_int_type)
                                 for (dim, i) in enumerate(index)]
                        sum[channel_idx] += weight * \
                            absolute_access(index, channel_idx if field.index_dimensions else ())
                        # sum[channel_idx] = 0
                    elif str(self.interpolator.address_mode).lower() == 'mirror':
                        def triangle_fun(x, half_period):
                            saw_tooth = sp.Abs(cast_func(x, default_int_type)) % (
                                cast_func(2 * half_period, create_type('int32')))
                            return sp.Piecewise((saw_tooth, saw_tooth < half_period),
                                                (2 * half_period - 1 - saw_tooth, True))
                        index = [cast_func(triangle_fun(i, field.shape[dim]),
                                           default_int_type) for (dim, i) in enumerate(index)]
                        sum[channel_idx] += weight * \
                            absolute_access(index, channel_idx if field.index_dimensions else ())
                    else:
                        raise NotImplementedError()
            elif self.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
                raise NotImplementedError("only works with HW interpolation for float32")

        sum = [sp.factor(s) for s in sum]

        if field.index_dimensions:
            return sp.Matrix(sum)
        else:
            return sum[0]

    # noinspection SpellCheckingInspection
    __xnew__ = staticmethod(__new_stage2__)
    # noinspection SpellCheckingInspection
    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
##########################################################################################
# GPU-specific fast specializations (for precision, GPUs can also use the above nodes/symbols) #
##########################################################################################
class TextureCachedField:

    def __init__(self, parent_field,
                 address_mode=None,
                 filter_mode=None,
                 interpolation_mode: InterpolationMode = InterpolationMode.LINEAR,
                 use_normalized_coordinates=False,
                 read_as_integer=False
                 ):
        if isinstance(address_mode, str):
            address_mode = getattr(pycuda.driver.address_mode, address_mode.upper())

        if address_mode is None:
            address_mode = pycuda.driver.address_mode.BORDER
        if filter_mode is None:
            filter_mode = pycuda.driver.filter_mode.LINEAR

        # self, field_name, field_type, dtype, layout, shape, strides
        self.field = parent_field
        self.address_mode = address_mode
        self.filter_mode = filter_mode
        self.read_as_integer = read_as_integer
        self.use_normalized_coordinates = use_normalized_coordinates
        self.interpolation_mode = interpolation_mode
        self.symbol = TypedSymbol(str(self), self.field.dtype.numpy_dtype)
        self.symbol.interpolator = self
        self.symbol.field = self.field
        self.required_global_declarations = [TextureDeclaration(self)]

        # assert str(self.field.dtype) != 'double', "CUDA does not support double textures!"
        # assert dtype_supports_textures(self.field.dtype), "CUDA only supports texture types with 32 bits or less"

    @classmethod
    def from_interpolator(cls, interpolator: LinearInterpolator):
        if (isinstance(interpolator, cls)
                or (hasattr(interpolator, 'allow_textures') and not interpolator.allow_textures)):
            return interpolator
        obj = cls(interpolator.field, interpolator.address_mode, interpolation_mode=interpolator.interpolation_mode)
        return obj

    def at(self, offset):
        return TextureAccess(self.symbol, *offset)

    def __getitem__(self, offset):
        return TextureAccess(self.symbol, *offset)

    def __str__(self):
        return '%s_texture_%s' % (self.field.name, self.reproducible_hash)

    def __repr__(self):
        return self.__str__()

    @property
    def _hashable_contents(self):
        return (type(self),
                self.address_mode,
                self.filter_mode,
                self.read_as_integer,
                self.interpolation_mode,
                self.use_normalized_coordinates)

    def __hash__(self):
        return hash(self._hashable_contents)

    @property
    def reproducible_hash(self):
        return _hash(str(self._hashable_contents).encode()).hexdigest()
class TextureAccess(InterpolatorAccess):
def __new__(cls, texture_symbol, offsets, *args, **kwargs):
obj = TextureAccess.__xnew_cached_(cls, texture_symbol, offsets, *args, **kwargs)
return obj
def __new_stage2__(self, symbol, *offsets):
obj = super().__xnew__(self, symbol, *offsets)
obj.required_global_declarations = symbol.interpolator.required_global_declarations
obj.required_global_declarations[0]._symbols_defined.add(obj)
return obj
def __str__(self):
return '%s_texture(%s)' % (self.interpolator.field.name, ','.join(str(o) for o in self.offsets))
@property
def texture(self):
return self.interpolator
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
class TextureDeclaration(Node):
"""
A global declaration of a texture. Visible both for device and host code.
.. code:: cpp
// This Node represents the following global declaration
texture<float, cudaTextureType2D, cudaReadModeElementType> x_texture_5acc9fced7b0dc3e;
__device__ kernel(...) {
// kernel accesses x_texture_5acc9fced7b0dc3e with tex2d(...)
}
__host__ launch_kernel(...) {
// Host needs to bind the texture
cudaBindTexture(0, x_texture_5acc9fced7b0dc3e, buffer, N*sizeof(float));
}
This has been deprecated by CUDA in favor of :class:`.TextureObject`.
But texture objects are not yet supported by PyCUDA (https://github.com/inducer/pycuda/pull/174)
"""
def __init__(self, parent_texture):
self.texture = parent_texture
self._symbols_defined = {self.texture.symbol}
@property
def symbols_defined(self) -> Set[sp.Symbol]:
return self._symbols_defined
@property
def args(self) -> Set[sp.Symbol]:
return set()
@property
def headers(self):
return ['"pycuda-helpers.hpp"']
def __str__(self):
from pystencils.backends.cuda_backend import CudaBackend
return CudaBackend()(self)
class TextureObject(TextureDeclaration):
"""
A CUDA texture object. In contrast to :class:`.TextureDeclaration` it is not declared globally but
used as a function argument for the kernel call.
Like :class:`.TextureDeclaration` it defines :class:`.TextureAccess` symbols.
Just the printing representation is a bit different.
"""
pass
def dtype_supports_textures(dtype):
"""
Returns whether CUDA natively supports texture fetches with this numpy dtype.
The maximum word size for a texture fetch is four bytes.
With the helper functions in the following header, larger dtypes can also be fetched:
https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
"""
if hasattr(dtype, 'numpy_dtype'):
dtype = dtype.numpy_dtype
if isinstance(dtype, type):
return dtype().itemsize <= 4
return dtype.itemsize <= 4
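# Hedged usage sketch (not part of the original module): it shows how dtype_supports_textures()
# could be used to decide whether a field qualifies for texture caching. The field definition via
# pystencils.fields() is an assumption about the surrounding pystencils API.
def _example_texture_dtype_check():
    import numpy as np
    import pystencils as ps

    f = ps.fields('f: float32[2D]')
    assert dtype_supports_textures(f.dtype)                   # 4-byte float32 is natively supported
    assert not dtype_supports_textures(np.dtype('float64'))   # 8-byte doubles need the fetch helper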
from .generate_benchmark import generate_benchmark, run_c_benchmark
from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
import os
import subprocess
from jinja2 import Template
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
Args:
ast: the pystencils AST object as returned by create_kernel
likwid: if True, likwid markers are added to the code
openmp: relevant only if likwid=True; needed to generate correct likwid initialization code
timing: add timing output to the code that prints the time per iteration to stdout
Returns:
C code as string
"""
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled
if likwid and openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
args = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
'includes': includes,
'timing': timing,
}
return benchmark_template.render(**args)
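# Hedged usage sketch: dump the generated benchmark driver for a trivial copy kernel to a C file.
# The fixed field size is needed because the benchmark only supports kernels with a fixed loop size;
# pystencils.fields / create_kernel / Assignment are assumptions about the surrounding pystencils API.
def _example_generate_benchmark():
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[256,256]')
    ast = ps.create_kernel([ps.Assignment(dst.center, 2 * src.center)])
    with open('bench_copy.c', 'w') as f:
        f.write(generate_benchmark(ast, likwid=False, timing=True))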
def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
"""Runs the given kernel with outer loop in C
Args:
ast: the pystencils AST object as returned by create_kernel
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
Returns:
list of times per iteration, one for each outer iteration
"""
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
with open('bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = os.path.dirname(kerncraft.__file__)
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + os.path.join(kerncraft_path, 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c',
'-o', 'bench',
]
run_compile_step(compile_cmd)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
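# Hedged usage sketch: compile and run the C benchmark driver for a fixed-size kernel and report the
# best measured time per iteration. Assumes a configured C compiler and an installed kerncraft package.
def _example_run_c_benchmark():
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[256,256]')
    ast = ps.create_kernel([ps.Assignment(dst.center, 2 * src.center)])
    times = run_c_benchmark(ast, inner_iterations=100, outer_iterations=3)
    print('best time per iteration: %e s' % min(times))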
import warnings
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
import kerncraft
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.field import get_layout_from_strides
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structure-of-arrays (SoA) or
array-of-structures (AoS) layout is assumed
"""
kerncraft.kernel.Kernel.__init__(self, machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
# If the correct step were provided, all accesses within that step length would
# also need to be passed to kerncraft (cur_node.step)
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def as_code(self, type_='iaca', openmp=False, as_filename=False):
"""
Generate and return compilable source code.
Args:
type_: either 'iaca' or 'likwid'
openmp: if True, OpenMP code will be generated
as_filename: if True, the code is written to an intermediate file and its file name is returned
"""
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename:
fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
machine_and_compiler_dependent=False)
if not already_available:
fp.write(code)
return fp.name
else:
return code
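# Hedged usage sketch: wrap a pystencils AST for kerncraft and emit compilable LIKWID benchmark
# code. 'machine.yml' is a hypothetical kerncraft machine description file; the Jacobi-like update
# and the fixed field size are illustrative only.
def _example_kerncraft_kernel():
    import pystencils as ps
    from kerncraft.machinemodel import MachineModel

    src, dst = ps.fields('src, dst: double[1024,1024]')
    update = ps.Assignment(dst[0, 0], 0.25 * (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]))
    ast = ps.create_kernel([update])
    machine = MachineModel(path_to_yaml='machine.yml')  # hypothetical machine file
    kernel = PyStencilsKerncraftKernel(ast, machine=machine)
    print(kernel.as_code('likwid'))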
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
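# Hedged sketch: split the resolved field accesses of a kernel AST into read and write sets.
# The kernel definition via the public pystencils API is an illustrative assumption.
def _example_field_access_search():
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[2D]')
    ast = ps.create_kernel([ps.Assignment(dst.center, src[1, 0] + src[-1, 0])])
    reads, writes = search_resolved_field_accesses_in_ast(ast)
    print(len(reads), 'read accesses,', len(writes), 'write accesses')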
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
# TODO size of int??? unsigned???
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
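# Hedged sketch: emit a trivial counting loop with the Loop helper. Note that start, stop and step
# must already be llvmlite values (the default integer step of 1 would not work with builder.add),
# so constants are created explicitly here.
def _example_llvm_loop():
    import llvmlite.ir as ir

    i64 = ir.IntType(64)
    module = ir.Module(name='loop_demo')
    fn = ir.Function(module, ir.FunctionType(ir.VoidType(), []), name='count_to_ten')
    builder = ir.IRBuilder(fn.append_basic_block('entry'))
    with Loop(builder, i64(0), i64(10), i64(1), loop_name='i') as counter:
        pass  # the loop body would be emitted here; `counter` is the phi node holding the index
    builder.ret_void()
    print(module)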
from pystencils.llvm.llvmjit import make_python_function
from pystencils.transformations import insert_casts
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None, target='cpu'):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
Args:
assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
function_name: name of the generated function - only important if generated code is written out
type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean, which are assumed to be 'bool'.
split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
iteration_slice: if not None, iteration is done only over this slice of the field
ghost_layers: a sequence of pairs for each coordinate with lower and upper number of ghost layers.
If None, the number of ghost layers is determined automatically and assumed to be equal
for all dimensions
:return: :class:`pystencils.ast.KernelFunction` node
"""
if target == 'cpu':
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code._backend = 'llvm'
elif target == 'gpu':
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
code = create_cuda_kernel(assignments,
function_name,
type_info,
iteration_slice=iteration_slice,
ghost_layers=ghost_layers)
code._backend = 'llvm_gpu'
else:
raise NotImplementedError()
code.body = insert_casts(code.body)
code._compile_function = make_python_function
return code
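# Hedged usage sketch: build and JIT-compile a simple update rule through the LLVM backend and run
# it on numpy arrays. Field names, shapes and the doubling rule are illustrative only.
def _example_llvm_kernel():
    import numpy as np
    import pystencils as ps

    src, dst = ps.fields('src, dst: double[2D]')
    ast = create_kernel([ps.Assignment(dst.center, 2 * src.center)], target='cpu')
    kernel = make_python_function(ast)
    a, b = np.ones((16, 16)), np.zeros((16, 16))
    kernel(src=a, dst=b)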
import functools
import llvmlite.ir as ir
import llvmlite.llvmpy.core as lc
import sympy as sp
from sympy import Indexed, S
from sympy.printing.printer import Printer
from pystencils.assignment import Assignment
from pystencils.data_types import (
collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
to_llvm_type)
from pystencils.llvm.control_flow import Loop
# From Numba
def set_cuda_kernel(lfunc):
from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
m = lfunc.module
ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
md = MetaData.get(m, ops)
nmd = m.get_or_insert_named_metadata('nvvm.annotations')
nmd.add(md)
# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
m.add_named_metadata('nvvmir.version', md_ver)
# From Numba
def _call_sreg(builder, name):
module = builder.module
fnty = lc.Type.function(lc.Type.int(), ())
fn = module.get_or_insert_function(fnty, name=name)
return builder.call(fn, ())
def generate_llvm(ast_node, module=None, builder=None, target='cpu'):
"""Prints the ast as llvm code."""
if module is None:
module = lc.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder, target=target)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, target='cpu', *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
self.target = target
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError("Symbol not found: %s" % s)
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
# KernelFunction does not possess a return type
return_type = self.void
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == 'gpu'))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = parameters[i].symbol.name
self.func_arg_map[parameters[i].symbol.name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
if self.target == 'gpu':
set_cuda_kernel(fn)
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_boolean_cast_func(self, conversion):
return self._print_cast_func(conversion)
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int32"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
if piece.has(Assignment):
raise NotImplementedError('The llvm-backend does not support assignments '
'in the Piecewise function. It is questionable '
'whether to implement it. So far there is no '
'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == 'gpu'))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
def _print_Conditional(self, node):
cond = self._print(node.condition_expr)
with self.builder.if_else(cond) as (then, otherwise):
with then:
self._print(node.true_block) # emit instructions for when the predicate is true
with otherwise:
self._print(node.false_block) # emit instructions for when the predicate is false
# No return!
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
mro = inspect.getmro(expr)
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
# from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
}
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
name = f"{function_name}.{dimension}"
return self.builder.zext(_call_sreg(self.builder, name), self.integer)
import ctypes as ct
import subprocess
from functools import partial
from itertools import chain
from os.path import exists, join
import llvmlite.binding as llvm
import llvmlite.ir as ir
import numpy as np
from pystencils.data_types import create_composite_type_from_string
from pystencils.field import FieldType
from ..data_types import StructType, ctypes_from_llvm, to_ctypes
from .llvm import generate_llvm
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if param.is_field_parameter:
try:
field_arr = argument_dict[param.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + param.field_name)
symbolic_field = param.fields[0]
if param.is_field_pointer:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(param.field_name, str(field_arr.shape), str(symbolic_field.shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif param.is_field_shape:
data_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
elif param.is_field_stride:
data_type = to_ctypes(param.symbol.dtype)
assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
ct_arguments.append(data_type(item_stride))
else:
assert False
else:
try:
value = argument_dict[param.symbol.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + param.symbol.name)
expected_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(expected_type(value))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
def generate_and_jit(ast):
target = 'gpu' if ast._backend == 'llvm_gpu' else 'cpu'
gen = generate_llvm(ast, target=target)
if isinstance(gen, ir.Module):
return compile_llvm(gen, target, ast)
else:
return compile_llvm(gen.module, target, ast)
def make_python_function(ast, argument_dict={}, func=None):
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module, target='cpu', ast=None):
jit = CudaJit(ast) if target == "gpu" else Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
# Following code more or less from numba
class CudaJit(Jit):
CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
64: 'nvptx64-nvidia-cuda'}
MACHINE_BITS = tuple.__itemsize__ * 8
data_layout = {
32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
default_data_layout = data_layout[MACHINE_BITS]
def __init__(self, ast):
# super().__init__()
# self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
self._data_layout = self.data_layout[self.MACHINE_BITS]
# self._target_data = llvm.create_target_data(self._data_layout)
self.indexing = ast.indexing
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = False
pmb.slp_vectorize = False
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
pm.run(self.llvmmod)
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def parse(self, module):
llvmmod = module
llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
llvmmod.data_layout = self.default_data_layout
llvmmod.verify()
llvmmod.name = 'module'
self._llvmmod = llvm.parse_assembly(str(llvmmod))
def compile(self):
from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
import hashlib
compiler_cache = get_cache_config()['object_cache']
ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
ptx_file = ir_file.replace('.ll', '.ptx')
try:
from pycuda.driver import Context
arch = "sm_%d%d" % Context.get_device().compute_capability()
except Exception:
arch = "sm_35"
if not exists(ptx_file):
self.write_ll(ir_file)
if 'llc' in get_compiler_config():
llc_command = get_compiler_config()['llc']
else:
llc_command = get_llc_command() or 'llc'
subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
# cubin_file = ir_file.replace('.ll', '.cubin')
# if not exists(cubin_file):
# subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
import pycuda.driver
cuda_module = pycuda.driver.module_from_file(ptx_file) # also works: cubin_file
self.cuda_module = cuda_module
def __call__(self, func, *args, **kwargs):
shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
block_and_thread_numbers = self.indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
def get_function_ptr(self, name):
return partial(self.__call__, name)
"""
Default Sympy optimizations applied in pystencils kernels using :func:`sympy.codegen.rewriting.optimize`.
See :func:`sympy.codegen.rewriting.optimize`.
"""
import itertools
from pystencils import Assignment
from pystencils.astnodes import SympyAssignment
try:
from sympy.codegen.rewriting import optims_c99, optimize
from sympy.codegen.rewriting import ReplaceOptim
HAS_REWRITING = True
# Evaluates all constant terms
evaluate_constant_terms = ReplaceOptim(
lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
lambda p: p.evalf()
)
optims_pystencils_cpu = [evaluate_constant_terms] + list(optims_c99)
optims_pystencils_gpu = [evaluate_constant_terms] + list(optims_c99)
except ImportError:
from warnings import warn
warn("Could not import ReplaceOptim, optims_c99, optimize from sympy.codegen.rewriting."
"Please update your sympy installation!")
optims_c99 = []
optims_pystencils_cpu = []
optims_pystencils_gpu = []
HAS_REWRITING = False
def optimize_assignments(assignments, optimizations):
if HAS_REWRITING:
assignments = [Assignment(a.lhs, optimize(a.rhs, optimizations))
if hasattr(a, 'lhs')
else a for a in assignments]
assignments_nodes = [a.atoms(SympyAssignment) for a in assignments]
for a in itertools.chain.from_iterable(assignments_nodes):
a.optimize(optimizations)
return assignments
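# Hedged sketch: apply the default CPU optimization set to a plain list of assignments. The symbols
# and the constant term are illustrative; evaluate_constant_terms is meant to turn constant terms
# such as the rational 1/3 into floating point literals.
def _example_optimize_assignments():
    import sympy as sp

    x, y = sp.symbols('x y')
    optimized = optimize_assignments([Assignment(y, 2 * x + sp.Rational(1, 3))],
                                     optims_pystencils_cpu)
    print(optimized)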
import numpy as np
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
USE_FAST_MATH = True
_global_cl_ctx = None
_global_cl_queue = None
def get_global_cl_queue():
return _global_cl_queue
def get_global_cl_ctx():
return _global_cl_ctx
def init_globally(device_index=0):
import pyopencl as cl
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = cl.create_some_context(device_index)
_global_cl_queue = cl.CommandQueue(_global_cl_ctx)
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
"""
Creates an **OpenCL** kernel function from an abstract syntax tree which
was created for ``target='gpu'``, e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
Args:
opencl_queue: a valid :class:`pyopencl.CommandQueue`
opencl_ctx: a valid :class:`pyopencl.Context`
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
Returns:
compiled kernel as Python function
"""
import pyopencl as cl
if not opencl_ctx:
opencl_ctx = _global_cl_ctx
if not opencl_queue:
opencl_queue = _global_cl_queue
assert opencl_ctx, "No valid OpenCL context"
assert opencl_queue, "No valid OpenCL queue"
if argument_dict is None:
argument_dict = {}
# check if double precision is supported and required
if any([d.double_fp_config == 0 for d in opencl_ctx.devices]):
for param in kernel_function_node.get_parameters():
if param.symbol.dtype.base_type:
if param.symbol.dtype.base_type.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
else:
if param.symbol.dtype.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
# Changing the kernel name is necessary since compilation with the default name "kernel" is not possible (OpenCL keyword!)
kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __kernel\n"
code += "#define RESTRICT restrict\n\n"
code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
options = []
if USE_FAST_MATH:
options.append("-cl-unsafe-math-optimizations")
options.append("-cl-mad-enable")
options.append("-cl-fast-relaxed-math")
options.append("-cl-finite-math-only")
options.append("-I")
options.append(get_pystencils_include_path())
mod = cl.Program(opencl_ctx, code).build(options=options)
func = getattr(mod, kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
block_and_thread_numbers['grid']))
args = _build_numpy_argument_list(parameters, full_arguments)
args = [a.data if hasattr(a, 'data') else a for a in args]
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
return wrapper
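# Hedged usage sketch: compile a GPU AST with the globally initialized OpenCL context/queue and
# launch it on pyopencl arrays. Field names, shapes and the float32 dtype are illustrative; the AST
# is produced by the CUDA kernel creation path as described in the docstring above.
def _example_opencl_kernel():
    import numpy as np
    import pyopencl.array as cl_array
    import pystencils as ps
    from pystencils.gpucuda.kernelcreation import create_cuda_kernel

    init_globally()
    src, dst = ps.fields('src, dst: float32[2D]')
    ast = create_cuda_kernel([ps.Assignment(dst.center, 2 * src.center)])
    kernel = make_python_function(ast, get_global_cl_queue(), get_global_cl_ctx())
    a = cl_array.to_device(get_global_cl_queue(), np.ones((16, 16), dtype=np.float32))
    b = cl_array.zeros_like(a)
    kernel(src=a, dst=b)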
import numpy as np
import sympy as sp
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
def _get_rng_template(name, data_type, num_vars):
if data_type is np.float32:
c_type = "float"
elif data_type is np.float64:
c_type = "double"
template = "\n"
for i in range(num_vars):
template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars)))
return template
def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
for i in range(dim)] + [0] * (3 - dim) + list(keys)
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return template.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
class RNGBase(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
if keys is None:
keys = (0,) * self._num_keys
if len(keys) != self._num_keys:
raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
if len(offsets) != 3:
raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self._offsets = offsets
self.headers = ['"{}_rand.h"'.format(self._name)]
self.keys = tuple(keys)
self._args = sp.sympify((dim, time_step, keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in (self._time_step, *self._offsets, *self.keys) if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def fast_subs(self, _):
return self # nothing to replace inside this node - substitution would destroy the intermediate "dummy" symbols by re-creating them
def get_code(self, dialect, vector_instruction_set):
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set,
self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
def __repr__(self):
return (", ".join(['{}'] * self._num_vars) + " \\leftarrow {}RNG").format(*self.result_symbols,
self._name.capitalize())
class PhiloxTwoDoubles(RNGBase):
_name = "philox"
_data_type = np.float64
_num_vars = 2
_num_keys = 2
class PhiloxFourFloats(RNGBase):
_name = "philox"
_data_type = np.float32
_num_vars = 4
_num_keys = 2
class AESNITwoDoubles(RNGBase):
_name = "aesni"
_data_type = np.float64
_num_vars = 2
_num_keys = 4
class AESNIFourFloats(RNGBase):
_name = "aesni"
_data_type = np.float32
_num_vars = 4
_num_keys = 4
def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
counter = 0
while True:
node = rng_node(*args, keys=(counter, seed), **kwargs)
inserted = False
for symbol in node.result_symbols:
if not inserted:
assignment_list.insert(0, node)
inserted = True
yield symbol
counter += 1
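# Hedged usage sketch: draw two Philox random numbers per cell for a 2D kernel. The generator
# inserts the RNG node into the assignment list when the first of its symbols is requested;
# Assignment is imported from pystencils for illustration only.
def _example_random_symbol():
    import sympy as sp
    from pystencils import Assignment

    assignments = []
    rng = random_symbol(assignments, dim=2, rng_node=PhiloxTwoDoubles)
    r1, r2 = next(rng), next(rng)
    assignments.append(Assignment(sp.Symbol('noise'), r1 - r2))
    print(assignments)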