Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 0 additions and 3256 deletions
import ctypes
import numpy as np
import sympy as sp
from sympy.core.cache import cacheit
from sympy.logic.boolalg import Boolean
from pystencils.cache import memorycache
from pystencils.utils import all_equal
try:
import llvmlite.ir as ir
except ImportError as e:
ir = None
_ir_importerror = e
# noinspection PyPep8Naming
class address_of(sp.Function):
is_Atom = True
def __new__(cls, arg):
obj = sp.Function.__new__(cls, arg)
return obj
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
@property
def dtype(self):
if hasattr(self.args[0], 'dtype'):
return PointerType(self.args[0].dtype, restrict=True)
else:
return PointerType('void', restrict=True)
# noinspection PyPep8Naming
class cast_func(sp.Function):
is_Atom = True
def __new__(cls, *args, **kwargs):
if len(args) != 2:
pass
expr, dtype, *other_args = args
if not isinstance(dtype, Type):
dtype = create_type(dtype)
# to work in conditions of sp.Piecewise cast_func has to be of type Boolean as well
# however, a cast_function should only be a boolean if its argument is a boolean, otherwise this leads
# to problems when for example comparing cast_func's for equality
#
# lhs = bitwise_and(a, cast_func(1, 'int'))
# rhs = cast_func(0, 'int')
# print( sp.Ne(lhs, rhs) ) # would give true if all cast_funcs are booleans
# -> thus a separate class boolean_cast_func is introduced
if isinstance(expr, Boolean):
cls = boolean_cast_func
return sp.Function.__new__(cls, expr, dtype, *other_args, **kwargs)
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
def _eval_evalf(self, *args, **kwargs):
return self.args[0].evalf()
@property
def dtype(self):
return self.args[1]
# noinspection PyPep8Naming
class boolean_cast_func(cast_func, Boolean):
pass
# noinspection PyPep8Naming
class vector_memory_access(cast_func):
nargs = (4,)
# noinspection PyPep8Naming
class reinterpret_cast_func(cast_func):
pass
# noinspection PyPep8Naming
class pointer_arithmetic_func(sp.Function, Boolean):
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
class TypedSymbol(sp.Symbol):
def __new__(cls, *args, **kwds):
obj = TypedSymbol.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, name, dtype, *args, **kwargs):
obj = super(TypedSymbol, cls).__xnew__(cls, name, *args, **kwargs)
try:
obj._dtype = create_type(dtype)
except (TypeError, ValueError):
# on error keep the string
obj._dtype = dtype
return obj
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
@property
def dtype(self):
return self._dtype
def _hashable_content(self):
return super()._hashable_content(), hash(self._dtype)
def __getnewargs__(self):
return self.name, self.dtype
# For reference: Numpy type hierarchy https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html
@property
def is_integer(self):
if hasattr(self.dtype, 'numpy_dtype'):
return np.issubdtype(self.dtype.numpy_dtype, np.integer) or super().is_integer
else:
return super().is_integer
@property
def is_negative(self):
if hasattr(self.dtype, 'numpy_dtype'):
if np.issubdtype(self.dtype.numpy_dtype, np.unsignedinteger):
return False
return super().is_negative
@property
def is_nonnegative(self):
if self.is_negative is False:
return True
else:
return super().is_nonnegative
@property
def is_real(self):
if hasattr(self.dtype, 'numpy_dtype'):
return np.issubdtype(self.dtype.numpy_dtype, np.integer) or \
np.issubdtype(self.dtype.numpy_dtype, np.floating) or \
super().is_real
else:
return super().is_real
def create_type(specification):
"""Creates a subclass of Type according to a string or an object of subclass Type.
Args:
specification: Type object, or a string
Returns:
Type object, or a new Type object parsed from the string
"""
if isinstance(specification, Type):
return specification
else:
numpy_dtype = np.dtype(specification)
if numpy_dtype.fields is None:
return BasicType(numpy_dtype, const=False)
else:
return StructType(numpy_dtype, const=False)
@memorycache(maxsize=64)
def create_composite_type_from_string(specification):
"""Creates a new Type object from a c-like string specification.
Args:
specification: Specification string
Returns:
Type object
"""
specification = specification.lower().split()
parts = []
current = []
for s in specification:
if s == '*':
parts.append(current)
current = [s]
else:
current.append(s)
if len(current) > 0:
parts.append(current)
# Parse native part
base_part = parts.pop(0)
const = False
if 'const' in base_part:
const = True
base_part.remove('const')
assert len(base_part) == 1
if base_part[0][-1] == "*":
base_part[0] = base_part[0][:-1]
parts.append('*')
current_type = BasicType(np.dtype(base_part[0]), const)
# Parse pointer parts
for part in parts:
restrict = False
const = False
if 'restrict' in part:
restrict = True
part.remove('restrict')
if 'const' in part:
const = True
part.remove("const")
assert len(part) == 1 and part[0] == '*'
current_type = PointerType(current_type, const, restrict)
return current_type
def get_base_type(data_type):
while data_type.base_type is not None:
data_type = data_type.base_type
return data_type
def to_ctypes(data_type):
"""
Transforms a given Type into ctypes
:param data_type: Subclass of Type
:return: ctypes type object
"""
if isinstance(data_type, PointerType):
return ctypes.POINTER(to_ctypes(data_type.base_type))
elif isinstance(data_type, StructType):
return ctypes.POINTER(ctypes.c_uint8)
else:
return to_ctypes.map[data_type.numpy_dtype]
to_ctypes.map = {
np.dtype(np.int8): ctypes.c_int8,
np.dtype(np.int16): ctypes.c_int16,
np.dtype(np.int32): ctypes.c_int32,
np.dtype(np.int64): ctypes.c_int64,
np.dtype(np.uint8): ctypes.c_uint8,
np.dtype(np.uint16): ctypes.c_uint16,
np.dtype(np.uint32): ctypes.c_uint32,
np.dtype(np.uint64): ctypes.c_uint64,
np.dtype(np.float32): ctypes.c_float,
np.dtype(np.float64): ctypes.c_double,
}
def ctypes_from_llvm(data_type):
if not ir:
raise _ir_importerror
if isinstance(data_type, ir.PointerType):
ctype = ctypes_from_llvm(data_type.pointee)
if ctype is None:
return ctypes.c_void_p
else:
return ctypes.POINTER(ctype)
elif isinstance(data_type, ir.IntType):
if data_type.width == 8:
return ctypes.c_int8
elif data_type.width == 16:
return ctypes.c_int16
elif data_type.width == 32:
return ctypes.c_int32
elif data_type.width == 64:
return ctypes.c_int64
else:
raise ValueError("Int width %d is not supported" % data_type.width)
elif isinstance(data_type, ir.FloatType):
return ctypes.c_float
elif isinstance(data_type, ir.DoubleType):
return ctypes.c_double
elif isinstance(data_type, ir.VoidType):
return None # Void type is not supported by ctypes
else:
raise NotImplementedError('Data type %s of %s is not supported yet' % (type(data_type), data_type))
def to_llvm_type(data_type):
"""
Transforms a given type into ctypes
:param data_type: Subclass of Type
:return: llvmlite type object
"""
if not ir:
raise _ir_importerror
if isinstance(data_type, PointerType):
return to_llvm_type(data_type.base_type).as_pointer()
else:
return to_llvm_type.map[data_type.numpy_dtype]
if ir:
to_llvm_type.map = {
np.dtype(np.int8): ir.IntType(8),
np.dtype(np.int16): ir.IntType(16),
np.dtype(np.int32): ir.IntType(32),
np.dtype(np.int64): ir.IntType(64),
np.dtype(np.uint8): ir.IntType(8),
np.dtype(np.uint16): ir.IntType(16),
np.dtype(np.uint32): ir.IntType(32),
np.dtype(np.uint64): ir.IntType(64),
np.dtype(np.float32): ir.FloatType(),
np.dtype(np.float64): ir.DoubleType(),
}
def peel_off_type(dtype, type_to_peel_off):
while type(dtype) is type_to_peel_off:
dtype = dtype.base_type
return dtype
def collate_types(types):
"""
Takes a sequence of types and returns their "common type" e.g. (float, double, float) -> double
Uses the collation rules from numpy.
"""
# Pointer arithmetic case i.e. pointer + integer is allowed
if any(type(t) is PointerType for t in types):
pointer_type = None
for t in types:
if type(t) is PointerType:
if pointer_type is not None:
raise ValueError("Cannot collate the combination of two pointer types")
pointer_type = t
elif type(t) is BasicType:
if not (t.is_int() or t.is_uint()):
raise ValueError("Invalid pointer arithmetic")
else:
raise ValueError("Invalid pointer arithmetic")
return pointer_type
# peel of vector types, if at least one vector type occurred the result will also be the vector type
vector_type = [t for t in types if type(t) is VectorType]
if not all_equal(t.width for t in vector_type):
raise ValueError("Collation failed because of vector types with different width")
types = [peel_off_type(t, VectorType) for t in types]
# now we should have a list of basic types - struct types are not yet supported
assert all(type(t) is BasicType for t in types)
if any(t.is_float() for t in types):
types = tuple(t for t in types if t.is_float())
# use numpy collation -> create type from numpy type -> and, put vector type around if necessary
result_numpy_type = np.result_type(*(t.numpy_dtype for t in types))
result = BasicType(result_numpy_type)
if vector_type:
result = VectorType(result, vector_type[0].width)
return result
@memorycache(maxsize=2048)
def get_type_of_expression(expr, default_float_type='double', default_int_type='int'):
from pystencils.astnodes import ResolvedFieldAccess
from pystencils.cpu.vectorization import vec_all, vec_any
expr = sp.sympify(expr)
if isinstance(expr, sp.Integer):
return create_type(default_int_type)
elif isinstance(expr, sp.Rational) or isinstance(expr, sp.Float):
return create_type(default_float_type)
elif isinstance(expr, ResolvedFieldAccess):
return expr.field.dtype
elif isinstance(expr, TypedSymbol):
return expr.dtype
elif isinstance(expr, sp.Symbol):
raise ValueError("All symbols inside this expression have to be typed! ", str(expr))
elif isinstance(expr, cast_func):
return expr.args[1]
elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
return create_type("bool")
elif hasattr(expr, 'func') and expr.func == sp.Piecewise:
collated_result_type = collate_types(tuple(get_type_of_expression(a[0]) for a in expr.args))
collated_condition_type = collate_types(tuple(get_type_of_expression(a[1]) for a in expr.args))
if type(collated_condition_type) is VectorType and type(collated_result_type) is not VectorType:
collated_result_type = VectorType(collated_result_type, width=collated_condition_type.width)
return collated_result_type
elif isinstance(expr, sp.Indexed):
typed_symbol = expr.base.label
return typed_symbol.dtype.base_type
elif isinstance(expr, sp.boolalg.Boolean) or isinstance(expr, sp.boolalg.BooleanFunction):
# if any arg is of vector type return a vector boolean, else return a normal scalar boolean
result = create_type("bool")
vec_args = [get_type_of_expression(a) for a in expr.args if isinstance(get_type_of_expression(a), VectorType)]
if vec_args:
result = VectorType(result, width=vec_args[0].width)
return result
elif isinstance(expr, sp.Pow):
return get_type_of_expression(expr.args[0])
elif isinstance(expr, sp.Expr):
expr: sp.Expr
if expr.args:
types = tuple(get_type_of_expression(a) for a in expr.args)
return collate_types(types)
else:
if expr.is_integer:
return create_type(default_int_type)
else:
return create_type(default_float_type)
raise NotImplementedError("Could not determine type for", expr, type(expr))
class Type(sp.Basic):
is_Atom = True
def __new__(cls, *args, **kwargs):
return sp.Basic.__new__(cls)
def _sympystr(self, *args, **kwargs):
return str(self)
class BasicType(Type):
@staticmethod
def numpy_name_to_c(name):
if name == 'float64':
return 'double'
elif name == 'float32':
return 'float'
elif name.startswith('int'):
width = int(name[len("int"):])
return "int%d_t" % (width,)
elif name.startswith('uint'):
width = int(name[len("uint"):])
return "uint%d_t" % (width,)
elif name == 'bool':
return 'bool'
else:
raise NotImplementedError("Can map numpy to C name for %s" % (name,))
def __init__(self, dtype, const=False):
self.const = const
if isinstance(dtype, Type):
self._dtype = dtype.numpy_dtype
else:
self._dtype = np.dtype(dtype)
assert self._dtype.fields is None, "Tried to initialize NativeType with a structured type"
assert self._dtype.hasobject is False
assert self._dtype.subdtype is None
def __getnewargs__(self):
return self.numpy_dtype, self.const
@property
def base_type(self):
return None
@property
def numpy_dtype(self):
return self._dtype
@property
def item_size(self):
return 1
def is_int(self):
return self.numpy_dtype in np.sctypes['int'] or self.numpy_dtype in np.sctypes['uint']
def is_float(self):
return self.numpy_dtype in np.sctypes['float']
def is_uint(self):
return self.numpy_dtype in np.sctypes['uint']
def is_complex(self):
return self.numpy_dtype in np.sctypes['complex']
def is_other(self):
return self.numpy_dtype in np.sctypes['others']
@property
def base_name(self):
return BasicType.numpy_name_to_c(str(self._dtype))
def __str__(self):
result = BasicType.numpy_name_to_c(str(self._dtype))
if self.const:
result += " const"
return result
def __repr__(self):
return str(self)
def __eq__(self, other):
if not isinstance(other, BasicType):
return False
else:
return (self.numpy_dtype, self.const) == (other.numpy_dtype, other.const)
def __hash__(self):
return hash(str(self))
class VectorType(Type):
instruction_set = None
def __init__(self, base_type, width=4):
self._base_type = base_type
self.width = width
@property
def base_type(self):
return self._base_type
@property
def item_size(self):
return self.width * self.base_type.item_size
def __eq__(self, other):
if not isinstance(other, VectorType):
return False
else:
return (self.base_type, self.width) == (other.base_type, other.width)
def __str__(self):
if self.instruction_set is None:
return "%s[%d]" % (self.base_type, self.width)
else:
if self.base_type == create_type("int64"):
return self.instruction_set['int']
elif self.base_type == create_type("float64"):
return self.instruction_set['double']
elif self.base_type == create_type("float32"):
return self.instruction_set['float']
elif self.base_type == create_type("bool"):
return self.instruction_set['bool']
else:
raise NotImplementedError()
def __hash__(self):
return hash((self.base_type, self.width))
def __getnewargs__(self):
return self._base_type, self.width
class PointerType(Type):
def __init__(self, base_type, const=False, restrict=True):
self._base_type = base_type
self.const = const
self.restrict = restrict
def __getnewargs__(self):
return self.base_type, self.const, self.restrict
@property
def alias(self):
return not self.restrict
@property
def base_type(self):
return self._base_type
@property
def item_size(self):
return self.base_type.item_size
def __eq__(self, other):
if not isinstance(other, PointerType):
return False
else:
return (self.base_type, self.const, self.restrict) == (other.base_type, other.const, other.restrict)
def __str__(self):
components = [str(self.base_type), '*']
if self.restrict:
components.append('RESTRICT')
if self.const:
components.append("const")
return " ".join(components)
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self._base_type, self.const, self.restrict))
class StructType:
def __init__(self, numpy_type, const=False):
self.const = const
self._dtype = np.dtype(numpy_type)
def __getnewargs__(self):
return self.numpy_dtype, self.const
@property
def base_type(self):
return None
@property
def numpy_dtype(self):
return self._dtype
@property
def item_size(self):
return self.numpy_dtype.itemsize
def get_element_offset(self, element_name):
return self.numpy_dtype.fields[element_name][1]
def get_element_type(self, element_name):
np_element_type = self.numpy_dtype.fields[element_name][0]
return BasicType(np_element_type, self.const)
def has_element(self, element_name):
return element_name in self.numpy_dtype.fields
def __eq__(self, other):
if not isinstance(other, StructType):
return False
else:
return (self.numpy_dtype, self.const) == (other.numpy_dtype, other.const)
def __str__(self):
# structs are handled byte-wise
result = "uint8_t"
if self.const:
result += " const"
return result
def __repr__(self):
return str(self)
def __hash__(self):
return hash((self.numpy_dtype, self.const))
#pragma once
extern "C++" {
#ifdef __CUDA_ARCH__
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
DTYPE_T shape[DIMENSION];
DTYPE_T stride[DIMENSION];
};
#else
#include <array>
template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
DTYPE_T *data;
std::array<DTYPE_T, DIMENSION> shape;
std::array<DTYPE_T, DIMENSION> stride;
};
#endif
}
#if !defined(__AES__) || !defined(__SSE2__)
#error AES-NI and SSE2 need to be enabled
#endif
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX512VL__
#include <immintrin.h> // AVX*
#endif
#include <cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
__m128i x = _mm_xor_si128(k, in);
x = _mm_aesenc_si128(x, k); // 1
x = _mm_aesenc_si128(x, k); // 2
x = _mm_aesenc_si128(x, k); // 3
x = _mm_aesenc_si128(x, k); // 4
x = _mm_aesenc_si128(x, k); // 5
x = _mm_aesenc_si128(x, k); // 6
x = _mm_aesenc_si128(x, k); // 7
x = _mm_aesenc_si128(x, k); // 8
x = _mm_aesenc_si128(x, k); // 9
x = _mm_aesenclast_si128(x, k); // 10
return x;
}
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#ifdef __AVX512VL__
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#ifdef __AVX512VL__
return _mm_cvtepu64_pd(x);
#else
uint64 r[2];
_mm_storeu_si128((__m128i*)r, x);
return _mm_set_pd((double)r[1], (double)r[0]);
#endif
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set_epi64x(53 - 32, 53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm_mul_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set_pd1(TWOPOW53_INV_DOUBLE/2.0));
// store result
double rr[2];
_mm_storeu_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rs = _mm_mul_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set_ps1(TWOPOW32_INV_FLOAT/2.0f));
// store result
float r[4];
_mm_storeu_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
\ No newline at end of file
#ifndef OPENCL_STDINT
#define OPENCL_STDINT
typedef unsigned int uint;
typedef unsigned int uint_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long int int64_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long int uint64_t;
#endif
#include <cstdint>
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
\ No newline at end of file
from .generate_benchmark import generate_benchmark, run_c_benchmark
from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
import os
import subprocess
from jinja2 import Template
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
Args:
ast: the pystencils AST object as returned by create_kernel
likwid: if True likwid markers are added to the code
openmp: relevant only if likwid=True, to generated correct likwid initialization code
timing: add timing output to the code, prints time per iteration to stdout
Returns:
C code as string
"""
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Strip "#pragma omp parallel" from within kernel, because main function takes care of that
# when likwid and openmp are enabled
if likwid and openmp:
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
args = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
'includes': includes,
'timing': timing,
}
return benchmark_template.render(**args)
def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
"""Runs the given kernel with outer loop in C
Args:
ast:
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
Returns:
list of times per iterations for each outer iteration
"""
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
with open('bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = os.path.dirname(kerncraft.__file__)
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + os.path.join(kerncraft_path, 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c',
'-o', 'bench',
]
run_compile_step(compile_cmd)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
import warnings
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
import kerncraft
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.field import get_layout_from_strides
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structures of array (SoA) or
array of structures (AoS) layout is assumed
"""
kerncraft.kernel.Kernel.__init__(self, machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
# If the correct step were to be provided, all access within that step length will
# also need to be passed to kerncraft: cur_node.step)
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def as_code(self, type_='iaca', openmp=False, as_filename=False):
"""
Generate and return compilable source code.
Args:
type_: can be iaca or likwid.
openmp: if true, openmp code will be generated
as_filename:
"""
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename:
fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
machine_and_compiler_dependent=False)
if not already_available:
fp.write(code)
return fp.name
else:
return code
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
import itertools
from types import MappingProxyType
import sympy as sp
from pystencils.assignment import Assignment
from pystencils.astnodes import Block, Conditional, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.vectorization import vectorize
from pystencils.gpucuda.indexing import indexing_creator_from_params
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.transformations import (
loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)
def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
skip_independence_check=False,
cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
target: 'cpu', 'llvm' or 'gpu'
data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
to type
iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
part of the field is iterated over
ghost_layers: if left to default, the number of necessary ghost layers is determined automatically
a single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``
skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
periodicity kernel, that access the field outside the iteration bounds. Use with care!
cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
cpu_blocking: a tuple of block sizes or None if no blocking should be applied
gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
Returns:
abstract syntax tree (AST) object, that can either be printed as source code with `show_code` or
can be compiled with through its 'compile()' member
Example:
>>> import pystencils as ps
>>> import numpy as np
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> ast = ps.create_kernel(assignment, target='cpu', cpu_openmp=True)
>>> kernel = ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5]))
>>> d_arr
array([[0., 0., 0., 0., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]])
"""
# ---- Normalizing parameters
split_groups = ()
if isinstance(assignments, AssignmentCollection):
if 'split_groups' in assignments.simplification_hints:
split_groups = assignments.simplification_hints['split_groups']
assignments = assignments.all_assignments
if isinstance(assignments, Assignment):
assignments = [assignments]
# ---- Creating ast
if target == 'cpu':
from pystencils.cpu import create_kernel
from pystencils.cpu import add_openmp
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check)
omp_collapse = None
if cpu_blocking:
omp_collapse = loop_blocking(ast, cpu_blocking)
if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse)
if cpu_vectorize_info:
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
else:
raise ValueError("Invalid value for cpu_vectorize_info")
return ast
elif target == 'llvm':
from pystencils.llvm import create_kernel
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers)
return ast
elif target == 'gpu':
from pystencils.gpucuda import create_cuda_kernel
ast = create_cuda_kernel(assignments, type_info=data_type,
indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check)
return ast
else:
raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="double", coordinate_names=('x', 'y', 'z'),
cpu_openmp=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
"""
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
The coordinates are stored in a separated index_field, which is a one dimensional array with struct data type.
This struct has to contain fields named 'x', 'y' and for 3D fields ('z'). These names are configurable with the
'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for
example boundary parameters.
index_fields: list of index fields, i.e. 1D fields with struct data type
coordinate_names: name of the coordinate fields in the struct data type
Example:
>>> import pystencils as ps
>>> import numpy as np
>>>
>>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
>>> index_arr = np.array([(1, 1, 0.1), (2, 2, 0.2), (3, 3, 0.3)], dtype=index_arr_dtype)
>>> idx_field = ps.fields(idx=index_arr)
>>>
>>> # Additional values stored in index field can be accessed in the kernel as well
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> ast = create_indexed_kernel(assignment, [idx_field], coordinate_names=('x', 'y'))
>>> kernel = ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
>>> d_arr
array([[0. , 0. , 0. , 0. , 0. ],
[0. , 4.1, 0. , 0. , 0. ],
[0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]])
"""
if isinstance(assignments, Assignment):
assignments = [assignments]
elif isinstance(assignments, AssignmentCollection):
assignments = assignments.all_assignments
if target == 'cpu':
from pystencils.cpu import create_indexed_kernel
from pystencils.cpu import add_openmp
ast = create_indexed_kernel(assignments, index_fields=index_fields, type_info=data_type,
coordinate_names=coordinate_names)
if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp)
return ast
elif target == 'llvm':
raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
elif target == 'gpu':
from pystencils.gpucuda import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
ast = created_indexed_cuda_kernel(assignments, index_fields, type_info=data_type,
coordinate_names=coordinate_names, indexing_creator=idx_creator)
return ast
else:
raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu',
gpu_exclusive_conditions=False, **kwargs):
"""Kernel that updates a staggered field.
.. image:: /img/staggered_grid.svg
Args:
staggered_field: field where the first index coordinate defines the location of the staggered value
can have 1 or 2 index coordinates, in case of two index coordinates at every staggered location
a vector is stored, expressions parameter has to be a sequence of sequences then
where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right cell
boundary and ``f[0,0](1)`` the southern cell boundary etc.
expressions: sequence of expressions of length dim, defining how the west, southern, (bottom) cell boundary
should be updated.
subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
target: 'cpu' or 'gpu'
gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths
kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed
Returns:
AST, see `create_kernel`
"""
assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
dim = staggered_field.spatial_dimensions
counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
assert len(expressions) == dim
if staggered_field.index_dimensions == 2:
assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
"If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
"same length."
final_assignments = []
last_conditional = None
def add(condition, dimensions, as_else_block=False):
nonlocal last_conditional
if staggered_field.index_dimensions == 1:
assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions]
a_coll = AssignmentCollection(assignments, list(subexpressions))
a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions])
elif staggered_field.index_dimensions == 2:
assert staggered_field.has_fixed_index_shape
assignments = [Assignment(staggered_field(d, i), expr)
for d in dimensions
for i, expr in enumerate(expressions[d])]
a_coll = AssignmentCollection(assignments, list(subexpressions))
a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])
for d in dimensions])
sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
if as_else_block and last_conditional:
new_cond = Conditional(condition, Block(sp_assignments))
last_conditional.false_block = Block([new_cond])
last_conditional = new_cond
else:
last_conditional = Conditional(condition, Block(sp_assignments))
final_assignments.append(last_conditional)
if target == 'cpu' or not gpu_exclusive_conditions:
for d in range(dim):
cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
add(cond, [d])
elif target == 'gpu':
full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)]
for include in itertools.product(*[[1, 0]] * dim):
case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)])
dimensions_to_include = [i for i in range(dim) if include[i]]
if dimensions_to_include:
add(case_conditions, dimensions_to_include, True)
ghost_layers = [(1, 0)] * dim
blocking = kwargs.get('cpu_blocking', None)
if blocking:
del kwargs['cpu_blocking']
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info:
del kwargs['cpu_vectorize_info']
openmp = kwargs.get('cpu_openmp', None)
if openmp:
del kwargs['cpu_openmp']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast)
move_constants_before_loop(ast)
omp_collapse = None
if blocking:
omp_collapse = loop_blocking(ast, blocking)
if openmp:
from pystencils.cpu import add_openmp
add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
return ast
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
# TODO size of int??? unisgned???
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
from pystencils.llvm.llvmjit import make_python_function
from pystencils.transformations import insert_casts
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
Args:
assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
function_name: name of the generated function - only important if generated code is written out
type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean which are assumed to be 'bool' .
split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
iteration_slice: if not None, iteration is done only over this slice of the field
ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
if None, the number of ghost layers is determined automatically and assumed to be equal for a
all dimensions
:return: :class:`pystencils.ast.KernelFunction` node
"""
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code.body = insert_casts(code.body)
code._compile_function = make_python_function
code._backend = 'llvm'
return code
import functools
import llvmlite.ir as ir
import sympy as sp
from sympy import Indexed, S
from sympy.printing.printer import Printer
from pystencils.assignment import Assignment
from pystencils.data_types import (
collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
to_llvm_type)
from pystencils.llvm.control_flow import Loop
def generate_llvm(ast_node, module=None, builder=None):
"""Prints the ast as llvm code."""
if module is None:
module = ir.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError("Symbol not found: %s" % s)
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
# KernelFunction does not posses a return type
return_type = self.void
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = parameters[i].symbol.name
self.func_arg_map[parameters[i].symbol.name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_boolean_cast_func(self, conversion):
return self._print_cast_func(conversion)
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
if piece.has(Assignment):
raise NotImplementedError('The llvm-backend does not support assignments'
'in the Piecewise function. It is questionable'
'whether to implement it. So far there is no'
'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
# Should have a list of math library functions to validate this.
# TODO function calls to libs
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
mro = inspect.getmro(expr)
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
import ctypes as ct
import llvmlite.binding as llvm
import llvmlite.ir as ir
import numpy as np
from pystencils.data_types import create_composite_type_from_string
from pystencils.field import FieldType
from ..data_types import StructType, ctypes_from_llvm, to_ctypes
from .llvm import generate_llvm
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if param.is_field_parameter:
try:
field_arr = argument_dict[param.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + param.field_name)
symbolic_field = param.fields[0]
if param.is_field_pointer:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(param.field_name, str(field_arr.shape), str(symbolic_field.shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif param.is_field_shape:
data_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
elif param.is_field_stride:
data_type = to_ctypes(param.symbol.dtype)
assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
ct_arguments.append(data_type(item_stride))
else:
assert False
else:
try:
value = argument_dict[param.symbol.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + param.symbol.name)
expected_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(expected_type(value))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
def generate_and_jit(ast):
gen = generate_llvm(ast)
if isinstance(gen, ir.Module):
return compile_llvm(gen)
else:
return compile_llvm(gen.module)
def make_python_function(ast, argument_dict={}, func=None):
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module):
jit = Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
import numpy as np
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
USE_FAST_MATH = True
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
"""
Creates a **OpenCL** kernel function from an abstract syntax tree which
was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
Args:
opencl_queue: a valid :class:`pyopencl.CommandQueue`
opencl_ctx: a valid :class:`pyopencl.Context`
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
Returns:
compiled kernel as Python function
"""
import pyopencl as cl
assert opencl_ctx, "No valid OpenCL context"
assert opencl_queue, "No valid OpenCL queue"
if argument_dict is None:
argument_dict = {}
# Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __kernel\n"
code += "#define RESTRICT restrict\n\n"
code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))
options = []
if USE_FAST_MATH:
options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only")
options.append("-I \"" + get_pystencils_include_path() + "\"")
mod = cl.Program(opencl_ctx, code).build(options=options)
func = getattr(mod, kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
block_and_thread_numbers['grid']))
args = _build_numpy_argument_list(parameters, full_arguments)
args = [a.data if hasattr(a, 'data') else a for a in args]
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
import numpy as np
import sympy as sp
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
def _get_rng_template(name, data_type, num_vars):
if data_type is np.float32:
c_type = "float"
elif data_type is np.float64:
c_type = "double"
template = "\n"
for i in range(num_vars):
template += "{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};\n".format(i, i)
template += ("{}_{}{}({{parameters}}, " + ", ".join(["{{result_symbols[{}].name}}"] * num_vars) + ");\n") \
.format(name, c_type, num_vars, *tuple(range(num_vars)))
return template
def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
parameters = [time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i]
for i in range(dim)] + [0] * (3 - dim) + list(keys)
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return template.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
class RNGBase(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=(0, 0, 0), keys=None):
if keys is None:
keys = (0,) * self._num_keys
if len(keys) != self._num_keys:
raise ValueError("Provided {} keys but need {}".format(len(keys), self._num_keys))
if len(offsets) != 3:
raise ValueError("Provided {} offsets but need {}".format(len(offsets), 3))
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, self._data_type) for _ in range(self._num_vars))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self._offsets = offsets
self.headers = ['"{}_rand.h"'.format(self._name)]
self.keys = tuple(keys)
self._args = sp.sympify((dim, time_step, keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in (self._time_step, *self._offsets, *self.keys) if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def fast_subs(self, _):
return self # nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def get_code(self, dialect, vector_instruction_set):
template = _get_rng_template(self._name, self._data_type, self._num_vars)
return _get_rng_code(template, dialect, vector_instruction_set,
self._time_step, self._offsets, self.keys, self._dim, self.result_symbols)
def __repr__(self):
return (", ".join(['{}'] * self._num_vars) + " <- {}RNG").format(*self.result_symbols, self._name.capitalize())
class PhiloxTwoDoubles(RNGBase):
_name = "philox"
_data_type = np.float64
_num_vars = 2
_num_keys = 2
class PhiloxFourFloats(RNGBase):
_name = "philox"
_data_type = np.float32
_num_vars = 4
_num_keys = 2
class AESNITwoDoubles(RNGBase):
_name = "aesni"
_data_type = np.float64
_num_vars = 2
_num_keys = 4
class AESNIFourFloats(RNGBase):
_name = "aesni"
_data_type = np.float32
_num_vars = 4
_num_keys = 4
def random_symbol(assignment_list, seed=TypedSymbol("seed", np.uint32), rng_node=PhiloxTwoDoubles, *args, **kwargs):
counter = 0
while True:
node = rng_node(*args, keys=(counter, seed), **kwargs)
inserted = False
for symbol in node.result_symbols:
if not inserted:
assignment_list.insert(0, node)
inserted = True
yield symbol
# Disable gmpy backend until this bug is resolved if joblib serialize
# See https://github.com/sympy/sympy/pull/13530
import os
import warnings
os.environ['MPMATH_NOGMPY'] = '1'
try:
import mpmath.libmp
# In case the user has imported sympy first, then pystencils
if mpmath.libmp.BACKEND == 'gmpy':
warnings.warn("You are using the gmpy backend. You might encounter an error 'argument is not an mpz sympy'. "
"This is due to a known bug in sympy/gmpy library. "
"To prevent this, import pystencils first then sympy or set the environment variable "
"MPMATH_NOGMPY=1")
except ImportError:
pass
__all__ = []
# FIXME
# FIXME performance counters might be wrong. This will only affect the Benchmark model
# FIXME bandwidth measurements need validation
# FIXME
kerncraft version: 0.7.2
model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
model type: Intel Core Skylake SP
sockets: 2
cores per socket: 4
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 4
clock: 3.6 GHz
FLOPs per cycle:
SP:
total: 64
FMA: 64
ADD: 32
MUL: 32
DP:
total: 32
FMA: 32
ADD: 16
MUL: 16
micro-architecture: SKX
compiler:
!!omap
- icc: -O3 -fno-alias -xCORE-AVX512
- clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
- gcc: -O3 -march=skylake-avx512
cacheline size: 64 B
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
memory hierarchy:
- level: L1
performance counter metrics:
accesses: MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L2_TRANS_L1D_WB:PMC[0-3]
cache per group:
sets: 64
ways: 8
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: L2
store_to: L2
size per group: 32.00 kB
groups: 8
cores per group: 1
threads per group: 2
- level: L2
non-overlap upstream throughput: [64 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 1024
ways: 16
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: null # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
victims_to: L3 # all victims, modified or not are passed onto L3
store_to: L3
size per group: 1.00 MB
groups: 8
cores per group: 1
threads per group: 2
- level: L3
non-overlap upstream throughput: [16 B/cy, 'full-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
# FIXME not all misses in L2 lead to loads from L3, only the hits do
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 16896
# TODO is actuall something else, but necessary to get to 16.5 MB
ways: 16
# TODO is actually 11, but pycachesim only supports powers of two
cl_size: 64
replacement_policy: 'LRU'
write_allocate: False
write_back: True
size per group: 16.50 MB
groups: 2
cores per group: 4
threads per group: 8
- level: MEM
cores per group: 4
threads per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
penalty cycles per read stream: 0
size per group:
benchmarks:
kernels:
load:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 0
bytes: 0.00 B
FLOPs per iteration: 0
copy:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
update:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
triad:
read streams:
streams: 3
bytes: 24.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
daxpy:
read streams:
streams: 2
bytes: 16.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
measurements:
L1:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 42.98 GB/s
- 85.08 GB/s
- 127.45 GB/s
- 169.92 GB/s
copy:
- 56.07 GB/s
- 111.50 GB/s
- 164.90 GB/s
- 221.50 GB/s
update:
- 56.54 GB/s
- 112.25 GB/s
- 168.50 GB/s
- 224.75 GB/s
triad:
- 45.90 GB/s
- 89.81 GB/s
- 127.29 GB/s
- 169.57 GB/s
daxpy:
- 36.62 GB/s
- 71.30 GB/s
- 103.52 GB/s
- 135.26 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 10.56 kB
- 10.56 kB
- 10.56 kB
- 10.56 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 49.61 GB/s
- 98.80 GB/s
- 147.98 GB/s
- 198.22 GB/s
copy:
- 55.98 GB/s
- 111.56 GB/s
- 167.08 GB/s
- 220.42 GB/s
update:
- 56.53 GB/s
- 112.72 GB/s
- 168.95 GB/s
- 225.31 GB/s
triad:
- 54.01 GB/s
- 104.58 GB/s
- 153.02 GB/s
- 200.93 GB/s
daxpy:
- 41.11 GB/s
- 80.28 GB/s
- 115.71 GB/s
- 152.81 GB/s
L2:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 27.15 GB/s
- 54.09 GB/s
- 80.61 GB/s
- 106.41 GB/s
copy:
- 43.53 GB/s
- 90.07 GB/s
- 127.73 GB/s
- 171.81 GB/s
update:
- 50.38 GB/s
- 98.47 GB/s
- 147.91 GB/s
- 197.20 GB/s
triad:
- 43.38 GB/s
- 83.72 GB/s
- 124.83 GB/s
- 166.04 GB/s
daxpy:
- 36.29 GB/s
- 71.29 GB/s
- 103.33 GB/s
- 136.48 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 330.00 kB
- 330.00 kB
- 330.00 kB
- 330.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 35.29 GB/s
- 70.28 GB/s
- 104.67 GB/s
- 139.63 GB/s
copy:
- 42.23 GB/s
- 83.70 GB/s
- 124.33 GB/s
- 167.50 GB/s
update:
- 50.09 GB/s
- 99.77 GB/s
- 149.87 GB/s
- 198.82 GB/s
triad:
- 52.38 GB/s
- 100.00 GB/s
- 147.40 GB/s
- 193.31 GB/s
daxpy:
- 41.14 GB/s
- 80.22 GB/s
- 116.23 GB/s
- 155.08 GB/s
L3:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 22.40 GB/s
- 44.77 GB/s
- 65.71 GB/s
- 89.26 GB/s
copy:
- 25.32 GB/s
- 49.70 GB/s
- 72.89 GB/s
- 98.62 GB/s
update:
- 41.24 GB/s
- 81.14 GB/s
- 122.22 GB/s
- 166.44 GB/s
triad:
- 25.61 GB/s
- 50.02 GB/s
- 73.23 GB/s
- 98.95 GB/s
daxpy:
- 32.07 GB/s
- 62.65 GB/s
- 89.91 GB/s
- 120.65 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 5.28 MB
- 2.64 MB
- 1.76 MB
- 1.32 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 26.18 GB/s
- 51.85 GB/s
- 75.82 GB/s
- 101.39 GB/s
copy:
- 26.22 GB/s
- 51.83 GB/s
- 76.40 GB/s
- 102.84 GB/s
update:
- 43.51 GB/s
- 86.75 GB/s
- 129.86 GB/s
- 174.54 GB/s
triad:
- 26.39 GB/s
- 51.80 GB/s
- 76.27 GB/s
- 102.66 GB/s
daxpy:
- 37.43 GB/s
- 73.16 GB/s
- 106.53 GB/s
- 142.76 GB/s
MEM:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 12.03 GB/s
- 24.38 GB/s
- 34.83 GB/s
- 45.05 GB/s
copy:
- 12.32 GB/s
- 24.40 GB/s
- 32.82 GB/s
- 37.00 GB/s
update:
- 20.83 GB/s
- 40.25 GB/s
- 48.81 GB/s
- 54.84 GB/s
triad:
- 11.64 GB/s
- 23.17 GB/s
- 34.78 GB/s
- 42.97 GB/s
daxpy:
- 17.69 GB/s
- 34.02 GB/s
- 48.12 GB/s
- 55.73 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 120.00 MB
- 60.00 MB
- 40.00 MB
- 30.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 15.33 GB/s
- 28.32 GB/s
- 41.34 GB/s
- 53.02 GB/s
copy:
- 13.96 GB/s
- 26.61 GB/s
- 34.39 GB/s
- 38.96 GB/s
update:
- 26.47 GB/s
- 47.82 GB/s
- 56.70 GB/s
- 62.78 GB/s
triad:
- 14.42 GB/s
- 26.66 GB/s
- 36.94 GB/s
- 44.01 GB/s
daxpy:
- 20.96 GB/s
- 39.12 GB/s
- 51.55 GB/s
- 58.37 GB/s
import math
import os
import time
import numpy as np
import sympy as sp
from git import Repo
from influxdb import InfluxDBClient
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
from kerncraft.prefixedunit import PrefixedUnit
from pystencils import Assignment, Field, create_kernel
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
def output_benchmark(analysis):
output = {}
keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
copies = {key: analysis[key] for key in keys}
output.update(copies)
for cache, metrics in analysis['data transfers'].items():
for metric_name, metric_value in metrics.items():
fixed = metric_value.with_prefix('')
output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
for level, value in analysis['ECM'].items():
output['Phenomenological ECM ' + level + ' cy/CL'] = value
return output
def output_ecm(analysis):
output = {}
keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
copies = {key: analysis[key] for key in keys}
output.update(copies)
if 'memory bandwidth kernel' in analysis:
output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
for key, value in analysis['cycles']:
output[key] = value
return output
def output_roofline(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def output_roofline_iaca(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
# output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def report_analysis(ast, models, machine, tags, fields=None):
kernel = PyStencilsKerncraftKernel(ast, machine)
client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
'roggan', 'pystencils')
repo = Repo(search_parent_directories=True)
commit = repo.head.commit
point_time = int(time.time())
for model in models:
benchmark = model(kernel, machine, KerncraftParameters())
benchmark.analyze()
analysis = benchmark.results
if model is Benchmark:
output = output_benchmark(analysis)
elif model is ECM:
output = output_ecm(analysis)
elif model is Roofline:
output = output_roofline(analysis)
elif model is RooflineIACA:
output = output_roofline_iaca(analysis)
else:
raise ValueError('No valid model for analysis given!')
if fields is not None:
output.update(fields)
output['commit'] = commit.hexsha
json_body = [
{
'measurement': model.__name__,
'tags': tags,
'time': point_time,
'fields': output
}
]
client.write_points(json_body, time_precision='s')
def main():
size = [20, 200, 200]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + \
a[-1, 0, 0] + a[1, 0, 0] + \
a[0, 0, -1] + a[0, 0, 1]
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
input_folder = "./"
machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
machine = MachineModel(path_to_yaml=machine_file_path)
tags = {
'host': os.uname()[1],
'project': 'pystencils',
'kernel': 'jacobi_3D ' + str(size)
}
report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
if __name__ == '__main__':
main()
import numpy as np
import sympy as sp
from pystencils import Assignment, Field, create_kernel
def meassure():
size = [30, 50, 3]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
updateRule = Assignment(b[0, 0], s * rhs)
print(updateRule)
ast = create_kernel([updateRule])
# benchmark = generate_benchmark(ast)
# main = benchmark[0]
# kernel = benchmark[1]
# with open('src/main.cpp', 'w') as file:
# file.write(main)
# with open('src/kernel.cpp', 'w') as file:
# file.write(kernel)
func = ast.compile({'omega': 2/3})
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.kerncraft_coupling import BenchmarkAnalysis
from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECMData
machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
machine = MachineModel(path_to_yaml=machineFilePath)
benchmark = BenchmarkAnalysis(ast, machine)
#TODO what do i want to do with benchmark?
kernel = PyStencilsKerncraftKernel(ast)
model = ECMData(kernel, machine, KerncraftParameters())
model.analyze()
model.report()
if __name__ == "__main__":
meassure()