Showing with 4992 additions and 0 deletions
from .derivative import (
Diff, DiffOperator, collect_diffs, combine_diff_products, diff, diff_terms, evaluate_diffs,
expand_diff_full, expand_diff_linear, expand_diff_products, functional_derivative,
normalize_diff_order, zero_diffs)
from .finitedifferences import Discretization2ndOrder, advection, diffusion, transient
from .finitevolumes import FVM1stOrder, VOF
from .spatial import discretize_spatial, discretize_spatial_staggered
__all__ = ['Diff', 'diff', 'DiffOperator', 'diff_terms', 'collect_diffs',
'zero_diffs', 'evaluate_diffs', 'normalize_diff_order', 'expand_diff_full', 'expand_diff_linear',
'expand_diff_products', 'combine_diff_products', 'functional_derivative',
'advection', 'diffusion', 'transient', 'Discretization2ndOrder', 'discretize_spatial',
'discretize_spatial_staggered', 'FVM1stOrder', 'VOF']
import itertools
from collections import defaultdict
import numpy as np
import sympy as sp
from pystencils.field import Field
from pystencils.stencil import direction_string_to_offset
from pystencils.sympyextensions import multidimensional_sum, prod
from pystencils.utils import LinearEquationSystem, fully_contains
class FiniteDifferenceStencilDerivation:
"""Derives finite difference stencils.
Can derive standard finite difference stencils, as well as isotropic versions
(see Isotropic Finite Differences by A. Kumar)
Args:
derivative_coordinates: tuple indicating which derivative should be approximated:
(1, ) stands for the first derivative in the second direction (y),
(0, 1) is a mixed second derivative in x and y,
(0, 0, 0) is a third derivative in the x direction
stencil: list of offset tuples, defining the stencil
dx: spacing between grid points, one for all directions, i.e. dx=dy=dz
Examples:
Central differences
>>> fd_1d = FiniteDifferenceStencilDerivation((0,), stencil=[(-1,), (0,), (1,)])
>>> result = fd_1d.get_stencil()
>>> result
Finite difference stencil of accuracy 2, isotropic error: False
>>> result.weights
[-1/2, 0, 1/2]
Forward differences
>>> fd_1d = FiniteDifferenceStencilDerivation((0,), stencil=[(0,), (1,)])
>>> result = fd_1d.get_stencil()
>>> result
Finite difference stencil of accuracy 1, isotropic error: False
>>> result.weights
[-1, 1]
"""
def __init__(self, derivative_coordinates, stencil, dx=1):
self.dim = len(stencil[0])
self.field = Field.create_generic('f', spatial_dimensions=self.dim)
self._derivative = tuple(sorted(derivative_coordinates))
self._stencil = stencil
self._dx = dx
self.weights = {tuple(d): self.symbolic_weight(*d) for d in self._stencil}
def assume_symmetric(self, dim, anti_symmetric=False):
"""Adds restriction that weight in opposite directions of a dimension are equal (symmetric) or
the negative of each other (anti symmetric)
For example: dim=1, assumes that w(1, 1) == w(1, -1), if anti_symmetric=False or
w(1, 1) == -w(1, -1) if anti_symmetric=True
"""
update = {}
for direction, value in self.weights.items():
inv_direction = tuple(-offset if i == dim else offset for i, offset in enumerate(direction))
if direction[dim] < 0:
inv_weight = self.weights[inv_direction]
update[direction] = -inv_weight if anti_symmetric else inv_weight
self.weights.update(update)
def set_weight(self, offset, value):
assert offset in self.weights
self.weights[offset] = value
def get_stencil(self, isotropic=False) -> 'FiniteDifferenceStencilDerivation.Result':
weights = [self.weights[d] for d in self._stencil]
system = LinearEquationSystem(sp.Matrix(weights).atoms(sp.Symbol))
order = 0
while True:
new_system = system.copy()
eq = self.error_term_equations(order)
new_system.add_equations(eq)
sol_structure = new_system.solution_structure()
if sol_structure == 'single':
system = new_system
elif sol_structure == 'multiple':
system = new_system
elif sol_structure == 'none':
break
else:
assert False
order += 1
accuracy = order - len(self._derivative)
error_is_isotropic = False
if isotropic:
new_system = system.copy()
new_system.add_equations(self.isotropy_equations(order))
sol_structure = new_system.solution_structure()
error_is_isotropic = sol_structure != 'none'
if error_is_isotropic:
system = new_system
solve_res = system.solution()
weight_list = [self.weights[d].subs(solve_res) for d in self._stencil]
return self.Result(self._stencil, weight_list, accuracy, error_is_isotropic)
@staticmethod
def symbolic_weight(*args):
str_args = [str(e) for e in args]
return sp.Symbol(f"w_({','.join(str_args)})")
def error_term_dict(self, order):
error_terms = defaultdict(lambda: 0)
for direction in self._stencil:
weight = self.weights[tuple(direction)]
x = tuple(self._dx * d_i for d_i in direction)
for offset in multidimensional_sum(order, dim=self.field.spatial_dimensions):
fac = sp.factorial(order)
error_terms[tuple(sorted(offset))] += weight / fac * prod(x[off] for off in offset)
if self._derivative in error_terms:
error_terms[self._derivative] -= 1
return error_terms
def error_term_equations(self, order):
return list(self.error_term_dict(order).values())
def isotropy_equations(self, order):
def cycle_int_sequence(sequence, modulus):
result = []
arr = np.array(sequence, dtype=int)
while True:
if tuple(arr) in result:
break
result.append(tuple(arr))
arr = (arr + 1) % modulus
return tuple(set(tuple(sorted(t)) for t in result))
error_dict = self.error_term_dict(order)
eqs = []
for derivative_tuple in list(error_dict.keys()):
if fully_contains(self._derivative, derivative_tuple):
remaining = list(derivative_tuple)
for e in self._derivative:
del remaining[remaining.index(e)]
permutations = cycle_int_sequence(remaining, self.dim)
if len(permutations) == 1:
eqs.append(error_dict[derivative_tuple])
else:
for i in range(1, len(permutations)):
new_eq = (error_dict[tuple(sorted(permutations[i] + self._derivative))]
- error_dict[tuple(sorted(permutations[i - 1] + self._derivative))])
if new_eq:
eqs.append(new_eq)
else:
eqs.append(error_dict[derivative_tuple])
return eqs
class Result:
def __init__(self, stencil, weights, accuracy, is_isotropic):
self.stencil = stencil
self.weights = weights
self.accuracy = accuracy
self.is_isotropic = is_isotropic
def visualize(self):
from pystencils.stencil import plot
plot(self.stencil, data=self.weights)
def apply(self, field_access: Field.Access):
f = field_access
return sum(f.get_shifted(*offset) * weight for offset, weight in zip(self.stencil, self.weights))
def __array__(self):
return np.array(self.as_array().tolist())
def as_array(self):
dim = len(self.stencil[0])
assert (dim == 2 or dim == 3), "Only 2D or 3D matrix representations are available"
max_offset = max(max(abs(e) for e in direction) for direction in self.stencil)
shape_list = []
for i in range(dim):
shape_list.append(2 * max_offset + 1)
number_of_elements = np.prod(shape_list)
shape = tuple(shape_list)
result = sp.MutableDenseNDimArray([0] * number_of_elements, shape)
if dim == 2:
for direction, weight in zip(self.stencil, self.weights):
result[max_offset - direction[1], max_offset + direction[0]] = weight
if dim == 3:
for direction, weight in zip(self.stencil, self.weights):
result[max_offset - direction[1], max_offset + direction[0], max_offset + direction[2]] = weight
return result
def rotate_weights_and_apply(self, field_access: Field.Access, axes):
"""derive gradient weights of other direction with already calculated weights of one direction
via rotation and apply them to a field."""
dim = len(self.stencil[0])
assert (dim == 2 or dim == 3), "This function is only for 2D or 3D stencils available"
rotated_weights = np.rot90(np.array(self.__array__()), 1, axes)
result = []
max_offset = max(max(abs(e) for e in direction) for direction in self.stencil)
if dim == 2:
for direction in self.stencil:
result.append(rotated_weights[max_offset - direction[1],
max_offset + direction[0]])
if dim == 3:
for direction in self.stencil:
result.append(rotated_weights[max_offset - direction[1],
max_offset + direction[0],
max_offset + direction[2]])
f = field_access
return sum(f.get_shifted(*offset) * weight for offset, weight in zip(self.stencil, result))
def __repr__(self):
return "Finite difference stencil of accuracy {}, isotropic error: {}".format(self.accuracy,
self.is_isotropic)
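# Illustrative usage sketch (an addition to this diff, not part of the original module):
# derive an isotropic second-derivative stencil on the full 3x3 neighborhood, using the
# same symmetry assumptions as the 4th-order derivation further below; the field name
# 'f' is an arbitrary choice.
def _demo_isotropic_second_derivative():
    stencil = [(i, j) for i in (-1, 0, 1) for j in (-1, 0, 1)]
    derivation = FiniteDifferenceStencilDerivation((0, 0), stencil)
    derivation.assume_symmetric(0)
    derivation.assume_symmetric(1)
    result = derivation.get_stencil(isotropic=True)
    f = Field.create_generic('f', spatial_dimensions=2)
    return result.apply(f.center)  # weighted sum of the nine neighbor accesses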
class FiniteDifferenceStaggeredStencilDerivation:
"""Derives a finite difference stencil for application at a staggered position
Args:
neighbor: the neighbor direction string or vector at whose staggered position to calculate the derivative
dim: how many dimensions (2 or 3)
derivative: a tuple of directions over which to perform derivatives
free_weights_prefix: a string to prefix to free weight symbols. If None, do not return free weights
"""
def __init__(self, neighbor, dim, derivative=tuple(), free_weights_prefix=None):
if type(neighbor) is str:
neighbor = direction_string_to_offset(neighbor)
if dim == 2:
assert neighbor[dim:] == 0
assert derivative == tuple() or max(derivative) < dim
neighbor = sp.Matrix(neighbor[:dim])
pos = neighbor / 2
def unitvec(i):
"""return the `i`-th unit vector in three dimensions"""
a = np.zeros(dim, dtype=int)
a[i] = 1
return a
def flipped(a, i):
"""return `a` with its `i`-th element's sign flipped"""
a = a.copy()
a[i] *= -1
return a
# determine the points to use, coordinates are relative to position
points = []
if np.linalg.norm(neighbor, 1) == 1:
main_points = [neighbor / 2, neighbor / -2]
elif np.linalg.norm(neighbor, 1) == 2:
nonzero_indices = [i for i, v in enumerate(neighbor) if v != 0 and i < dim]
main_points = [neighbor / 2, neighbor / -2, flipped(neighbor / 2, nonzero_indices[0]),
flipped(neighbor / -2, nonzero_indices[0])]
else:
main_points = [sp.Matrix(np.multiply(neighbor, sp.Matrix(c) / 2))
for c in itertools.product([-1, 1], repeat=3)]
points += main_points
zero_indices = [i for i, v in enumerate(neighbor) if v == 0 and i < dim]
for i in zero_indices:
points += [point + sp.Matrix(unitvec(i)) for point in main_points]
points += [point - sp.Matrix(unitvec(i)) for point in main_points]
points_tuple = tuple([tuple(p) for p in points])
self._stencil = points_tuple
# determine the stencil weights
if len(derivative) == 0:
weights = None
else:
derivation = FiniteDifferenceStencilDerivation(derivative, points_tuple).get_stencil()
if not derivation.accuracy:
raise Exception('the requested derivative cannot be performed with the available neighbors')
weights = derivation.weights
# if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
if free_weights_prefix is not None:
weights = [w.subs({fw: sp.Symbol(f"{free_weights_prefix}_{i}") for i, fw in enumerate(free_weights)})
for w in weights]
elif len(free_weights) > 0:
zero_counts = defaultdict(list)
for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
weights = [w.subs(subs) for w in derivation.weights]
if not all(a == 0 for a in weights):
zero_count = sum([1 for w in weights if w == 0])
zero_counts[zero_count].append(weights)
best = zero_counts[max(zero_counts.keys())]
if len(best) > 1: # if there are multiple, pick the one that contains a nonzero center weight
center = [tuple(p + pos) for p in points].index((0, 0, 0)[:dim])
best = [b for b in best if b[center] != 0]
if len(best) > 1: # if there are still multiple, they are equivalent, so we average
weights = [sum([b[i] for b in best]) / len(best) for i in range(len(weights))]
else:
weights = best[0]
assert weights
points_tuple = tuple([tuple(p + pos) for p in points])
self._points = points_tuple
self._weights = weights
@property
def points(self):
"""return the points of the stencil"""
return self._points
@property
def stencil(self):
"""return the points of the stencil relative to the staggered position specified by neighbor"""
return self._stencil
@property
def weights(self):
"""return the weights of the stencil"""
assert self._weights is not None
return self._weights
def visualize(self):
if self._weights is None:
ws = None
else:
ws = np.array([w for w in self.weights if w != 0], dtype=float)
pts = np.array([p for i, p in enumerate(self.points) if self.weights[i] != 0], dtype=int)
from pystencils.stencil import plot
plot(pts, data=ws)
def apply(self, access: Field.Access):
return sum([access.get_shifted(*point) * weight for point, weight in zip(self.points, self.weights)])
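# Illustrative usage sketch (not part of the original module): first derivative in x,
# evaluated at the staggered position between a cell and its eastern neighbor;
# the field name 'c' is an arbitrary choice.
def _demo_staggered_first_derivative():
    c = Field.create_generic('c', spatial_dimensions=2)
    fds = FiniteDifferenceStaggeredStencilDerivation("E", dim=2, derivative=(0,))
    return fds.apply(c.center)  # central difference across the cell face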
from collections import defaultdict, namedtuple
import sympy as sp
from pystencils.field import Field
from pystencils.sympyextensions import normalize_product, prod
def _default_diff_sort_key(d):
return str(d.superscript), str(d.target)
class Diff(sp.Expr):
"""Sympy Node representing a derivative.
The differences to sympy's built-in derivative are:
- shortened latex representation
- all simplifications have to be done manually
- optional marker displayed as superscript
"""
is_number = False
is_Rational = False
_diff_wrt = True
def __new__(cls, argument, target=-1, superscript=-1):
if argument == 0:
return sp.Rational(0, 1)
if isinstance(argument, Field):
argument = argument.center
return sp.Expr.__new__(cls, argument.expand(), sp.sympify(target), sp.sympify(superscript))
@property
def is_commutative(self):
any_non_commutative = any(not s.is_commutative for s in self.atoms(sp.Symbol))
if any_non_commutative:
return False
else:
return True
def get_arg_recursive(self):
"""Returns the argument the derivative acts on, for nested derivatives the inner argument is returned"""
if not isinstance(self.arg, Diff):
return self.arg
else:
return self.arg.get_arg_recursive()
def change_arg_recursive(self, new_arg):
"""Returns a Diff node with the given 'new_arg' instead of the current argument. For nested derivatives
a new nested derivative is returned where the inner Diff has the 'new_arg'"""
if not isinstance(self.arg, Diff):
return Diff(new_arg, self.target, self.superscript)
else:
return Diff(self.arg.change_arg_recursive(new_arg), self.target, self.superscript)
def split_linear(self, functions):
"""
Applies the linearity property of Diff: e.g. 'Diff(c*a)' is transformed to 'c * Diff(a)' when c is constant.
The parameter functions is a list of all symbols that are considered functions, not constants.
For the example above: functions=[a]
"""
constant, variable = 1, 1
if self.arg.func != sp.Mul:
constant, variable = 1, self.arg
else:
for factor in normalize_product(self.arg):
if factor in functions or isinstance(factor, Diff):
variable *= factor
else:
constant *= factor
if isinstance(variable, sp.Symbol) and variable not in functions:
return 0
if isinstance(variable, int) or variable.is_number:
return 0
else:
return constant * Diff(variable, target=self.target, superscript=self.superscript)
@property
def arg(self):
"""Expression the derivative acts on"""
return self.args[0]
@property
def target(self):
"""Subscript, usually the variable the Diff is w.r.t. """
return self.args[1]
@property
def superscript(self):
"""Superscript, for example used as the Chapman-Enskog order index"""
return self.args[2]
def _latex(self, printer, *_):
result = r"{\partial"
if self.superscript >= 0:
result += "^{(%s)}" % (self.superscript,)
if self.target != -1:
result += "_{%s}" % (printer.doprint(self.target),)
contents = printer.doprint(self.arg)
if isinstance(self.arg, int) or isinstance(self.arg, sp.Symbol) or self.arg.is_number or self.arg.func == Diff:
result += " " + contents
else:
result += " (" + contents + ") "
result += "}"
return result
def __str__(self):
return f"D({self.arg})"
def interpolated_access(self, offset, **kwargs):
"""Represents an interpolated access on a spatially differentiated field
Args:
offset (Tuple[sympy.Expr]): Absolute position to determine the value of the spatial derivative
"""
from pystencils.interpolation_astnodes import DiffInterpolatorAccess
assert isinstance(self.arg.field, Field), "Must be field to enable interpolated accesses"
return DiffInterpolatorAccess(self.arg.field.interpolated_access(offset, **kwargs).symbol, self.target, *offset)
class DiffOperator(sp.Expr):
"""Un-applied differential, i.e. differential operator
Args:
target: the differential is w.r.t. this variable.
This target is mainly for display purposes (it is the subscript) and to distinguish DiffOperators.
If the target is '-1' no subscript is displayed.
superscript: optional marker displayed as superscript;
not displayed if set to '-1'
The DiffOperator behaves much like a variable with a special name. Its main use is to be applied later, using
DiffOperator.apply(expr, arg), which transforms 'DiffOperator's into applied 'Diff's
"""
is_commutative = True
is_number = False
is_Rational = False
def __new__(cls, target=-1, superscript=-1):
return sp.Expr.__new__(cls, sp.sympify(target), sp.sympify(superscript))
@property
def target(self):
return self.args[0]
@property
def superscript(self):
return self.args[1]
def _latex(self, *_):
result = r"{\partial"
if self.superscript >= 0:
result += "^{(%s)}" % (self.superscript,)
if self.target != -1:
result += "_{%s}" % (self.target,)
result += "}"
return result
@staticmethod
def apply(expr, argument, apply_to_constants=True):
"""
Returns a new expression where each 'DiffOperator' is replaced by a 'Diff' node.
Multiplications of 'DiffOperator's are interpreted as nested application of differentiation:
i.e. DiffOperator('x')*DiffOperator('x') is a second derivative, replaced by Diff(Diff(arg, x), x)
"""
def handle_mul(mul):
args = normalize_product(mul)
diffs = [a for a in args if isinstance(a, DiffOperator)]
if len(diffs) == 0:
return mul * argument if apply_to_constants else mul
rest = [a for a in args if not isinstance(a, DiffOperator)]
diffs.sort(key=_default_diff_sort_key)
result = argument
for d in reversed(diffs):
result = Diff(result, target=d.target, superscript=d.superscript)
return prod(rest) * result
expr = expr.expand()
if expr.func == sp.Mul or expr.func == sp.Pow:
return handle_mul(expr)
elif expr.func == sp.Add:
return expr.func(*[handle_mul(a) for a in expr.args])
else:
return expr * argument if apply_to_constants else expr
# ----------------------------------------------------------------------------------------------------------------------
def diff(expr, *args):
"""Shortcut function to create nested derivatives
>>> f = sp.Symbol("f")
>>> diff(f, 0, 0, 1) == Diff(Diff( Diff(f, 1), 0), 0)
True
"""
if len(args) == 0:
return expr
result = expr
for index in reversed(args):
result = Diff(result, index)
return result
def diff_args(expr):
"""Extracts the indices and argument of possibly nested derivative - inverse of diff function
>>> args = (sp.Symbol("x"), 0, 1, 2, 5, 1)
>>> e = diff(*args)
>>> assert diff_args(e) == args
"""
if not isinstance(expr, Diff):
return expr,
else:
inner_res = diff_args(expr.args[0])
return (inner_res[0], expr.args[1], *inner_res[1:])
def diff_terms(expr):
"""Returns set of all derivatives in an expression.
This function yields different results than 'expr.atoms(Diff)' when nested derivatives are in the expression,
since this function only returns the outer derivatives
Example:
>>> x, y = sp.symbols("x, y")
>>> diff_terms( diff(x, 0, 0) )
{Diff(Diff(x, 0, -1), 0, -1)}
>>> diff_terms( diff(x, 0, 0) + y )
{Diff(Diff(x, 0, -1), 0, -1)}
"""
result = set()
def visit(e):
if isinstance(e, Diff):
result.add(e)
else:
for a in e.args:
visit(a)
visit(expr)
return result
def collect_diffs(expr):
"""Rewrites expression into a sum of distinct derivatives with pre-factors"""
return expr.collect(diff_terms(expr))
def zero_diffs(expr, label):
"""Replaces all differentials with the given target by 0
Example:
>>> x, y, f = sp.symbols("x y f")
>>> expression = Diff(f, x) + Diff(f, y) + Diff(Diff(f, y), x) + 7
>>> zero_diffs(expression, x)
Diff(f, y, -1) + 7
"""
def visit(e):
if isinstance(e, Diff):
if e.target == label:
return 0
new_args = [visit(arg) for arg in e.args]
return e.func(*new_args) if new_args else e
return visit(expr)
def evaluate_diffs(expr, var=None):
"""Replaces pystencils diff objects by sympy diff objects and evaluates them.
Replaces Diff nodes by sp.diff , the free variable is either the target (if var=None) otherwise
the specified var
"""
if isinstance(expr, Diff):
if var is None:
var = expr.target
return sp.diff(evaluate_diffs(expr.arg, var), var)
else:
new_args = [evaluate_diffs(arg, var) for arg in expr.args]
return expr.func(*new_args) if new_args else expr
def normalize_diff_order(expression, functions=None, constants=None, sort_key=_default_diff_sort_key):
"""Assumes order of differentiation can be exchanged. Changes the order of nested Diffs to a standard order defined
by the sorting key 'sort_key' such that the derivative terms can be further simplified """
def visit(expr):
if isinstance(expr, Diff):
nodes = [expr]
while isinstance(nodes[-1].arg, Diff):
nodes.append(nodes[-1].arg)
processed_arg = visit(nodes[-1].arg)
nodes.sort(key=sort_key)
result = processed_arg
for d in reversed(nodes):
result = Diff(result, target=d.target, superscript=d.superscript)
return result
else:
new_args = [visit(e) for e in expr.args]
return expr.func(*new_args) if new_args else expr
expression = expand_diff_linear(expression.expand(), functions, constants).expand()
return visit(expression)
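# Illustrative usage sketch (not part of the original module): mixed derivatives taken
# in different orders normalize to the same canonical nesting and then compare equal.
def _demo_normalize_diff_order():
    f = sp.Symbol("f")
    return normalize_diff_order(diff(f, 1, 0)) == normalize_diff_order(diff(f, 0, 1))  # True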
def expand_diff_full(expr, functions=None, constants=None):
if functions is None:
functions = expr.atoms(sp.Symbol)
if constants is not None:
functions.difference_update(constants)
def visit(e):
if not isinstance(e, sp.Tuple):
e = e.expand()
if e.func == Diff:
result = 0
diff_args = {'target': e.target, 'superscript': e.superscript}
diff_inner = e.args[0]
diff_inner = visit(diff_inner)
if diff_inner.func not in (sp.Add, sp.Mul):
return e
for term in diff_inner.args if diff_inner.func == sp.Add else [diff_inner]:
independent_terms = 1
dependent_terms = []
for factor in normalize_product(term):
if factor in functions or isinstance(factor, Diff):
dependent_terms.append(factor)
else:
independent_terms *= factor
for i in range(len(dependent_terms)):
dependent_term = dependent_terms[i]
other_dependent_terms = dependent_terms[:i] + dependent_terms[i + 1:]
processed_diff = normalize_diff_order(Diff(dependent_term, **diff_args))
result += independent_terms * prod(other_dependent_terms) * processed_diff
return result
elif isinstance(e, sp.Piecewise):
return sp.Piecewise(*((expand_diff_full(a, functions, constants), b) for a, b in e.args))
elif isinstance(e, sp.Tuple):
new_args = [visit(arg) for arg in e.args]
return sp.Tuple(*new_args)
else:
new_args = [visit(arg) for arg in e.args]
return e.func(*new_args) if new_args else e
if isinstance(expr, sp.Matrix):
return expr.applyfunc(visit)
else:
return visit(expr)
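# Illustrative usage sketch (not part of the original module): full expansion applies the
# product rule and pulls declared constants out of the derivative.
def _demo_expand_diff_full():
    c, f, g = sp.symbols("c f g")
    return expand_diff_full(Diff(c * f * g), constants=[c])  # c*g*D(f) + c*f*D(g)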
def expand_diff_linear(expr, functions=None, constants=None):
"""Expands all derivative nodes by applying Diff.split_linear
Args:
expr: expression containing derivatives
functions: sequence of symbols that are considered functions and cannot be pulled before the derivative.
If None, all symbols are viewed as functions
constants: sequence of symbols which are considered constants and can be pulled before the derivative
"""
if functions is None:
functions = expr.atoms(sp.Symbol)
if constants is not None:
functions.difference_update(constants)
if isinstance(expr, Diff):
arg = expand_diff_linear(expr.arg, functions)
if hasattr(arg, 'func') and arg.func == sp.Add:
result = 0
for a in arg.args:
result += Diff(a, target=expr.target, superscript=expr.superscript).split_linear(functions)
return result
else:
diff = Diff(arg, target=expr.target, superscript=expr.superscript)
if diff == 0:
return 0
else:
return diff.split_linear(functions)
elif isinstance(expr, sp.Piecewise):
return sp.Piecewise(*((expand_diff_linear(a, functions, constants), b) for a, b in expr.args))
elif isinstance(expr, sp.Tuple):
new_args = [expand_diff_linear(e, functions) for e in expr.args]
return sp.Tuple(*new_args)
else:
new_args = [expand_diff_linear(e, functions) for e in expr.args]
result = sp.expand(expr.func(*new_args) if new_args else expr)
return result
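# Illustrative usage sketch (not part of the original module): with c marked as a
# constant, linearity pulls it in front of the derivative.
def _demo_expand_diff_linear():
    c, f, g = sp.symbols("c f g")
    return expand_diff_linear(Diff(c * f + g), constants=[c])  # c*D(f) + D(g)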
def expand_diff_products(expr):
"""Fully expands all derivatives by applying product rule"""
if isinstance(expr, Diff):
arg = expand_diff_products(expr.args[0])
if arg.func == sp.Add:
new_args = [Diff(e, target=expr.target, superscript=expr.superscript)
for e in arg.args]
return sp.Add(*new_args)
if arg.func not in (sp.Mul, sp.Pow):
return Diff(arg, target=expr.target, superscript=expr.superscript)
else:
prod_list = normalize_product(arg)
result = 0
for i in range(len(prod_list)):
pre_factor = prod(prod_list[j] for j in range(len(prod_list)) if i != j)
result += pre_factor * Diff(prod_list[i], target=expr.target, superscript=expr.superscript)
return result
else:
new_args = [expand_diff_products(e) for e in expr.args]
return expr.func(*new_args) if new_args else expr
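# Illustrative usage sketch (not part of the original module): the product rule turns a
# derivative of a product into a sum of single-factor derivatives.
def _demo_expand_diff_products():
    x, y = sp.symbols("x y")
    return expand_diff_products(Diff(x * y))  # x*D(y) + y*D(x)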
def combine_diff_products(expr):
"""Inverse product rule"""
def expr_to_diff_decomposition(expression):
"""Decomposes a sp.Add node containing CeDiffs into:
diff_dict: maps (target, superscript) -> [ (pre_factor, argument), ... ]
i.e. a partial(b) ( a is pre-factor, b is argument)
in case of partial(a) partial(b) two entries are created (0.5 partial(a), b), (0.5 partial(b), a)
"""
DiffInfo = namedtuple("DiffInfo", ["target", "superscript"])
class DiffSplit:
def __init__(self, fac, argument):
self.pre_factor = fac
self.argument = argument
def __repr__(self):
return str((self.pre_factor, self.argument))
assert isinstance(expression, sp.Add)
diff_dict = defaultdict(list)
rest = 0
for term in expression.args:
if isinstance(term, Diff):
diff_dict[DiffInfo(term.target, term.superscript)].append(DiffSplit(1, term.arg))
else:
mul_args = normalize_product(term)
diffs = [d for d in mul_args if isinstance(d, Diff)]
factor = prod(d for d in mul_args if not isinstance(d, Diff))
if len(diffs) == 0:
rest += factor
else:
for i, diff in enumerate(diffs):
all_but_current = [d for j, d in enumerate(diffs) if i != j]
pre_factor = factor * prod(all_but_current) * sp.Rational(1, len(diffs))
diff_dict[DiffInfo(diff.target, diff.superscript)].append(DiffSplit(pre_factor, diff.arg))
return diff_dict, rest
def match_diff_splits(own, other):
own_fac = own.pre_factor / other.argument
other_fac = other.pre_factor / own.argument
count = sp.count_ops
if count(own_fac) > count(own.pre_factor) or count(other_fac) > count(other.pre_factor):
return None
new_other_factor = own_fac - other_fac
return new_other_factor
def process_diff_list(diff_list, label, superscript):
if len(diff_list) == 0:
return 0
elif len(diff_list) == 1:
return diff_list[0].pre_factor * Diff(diff_list[0].argument, label, superscript)
result = 0
matches = []
for i in range(1, len(diff_list)):
match_result = match_diff_splits(diff_list[i], diff_list[0])
if match_result is not None:
matches.append((i, match_result))
if len(matches) == 0:
result += diff_list[0].pre_factor * Diff(diff_list[0].argument, label, superscript)
else:
other_idx, match_result = sorted(matches, key=lambda e: sp.count_ops(e[1]))[0]
new_argument = diff_list[0].argument * diff_list[other_idx].argument
result += (diff_list[0].pre_factor / diff_list[other_idx].argument) * Diff(new_argument, label, superscript)
if match_result == 0:
del diff_list[other_idx]
else:
diff_list[other_idx].pre_factor = match_result * diff_list[0].argument
result += process_diff_list(diff_list[1:], label, superscript)
return result
def combine(expression):
expression = expression.expand()
if isinstance(expression, sp.Add):
diff_dict, rest = expr_to_diff_decomposition(expression)
for (label, superscript), diff_list in diff_dict.items():
rest += process_diff_list(diff_list, label, superscript)
return rest
else:
new_args = [combine_diff_products(e) for e in expression.args]
return expression.func(*new_args) if new_args else expression
return combine(expr)
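# Illustrative usage sketch (not part of the original module): the inverse product rule
# recombines an expanded expression back into a single derivative.
def _demo_combine_diff_products():
    x, y = sp.symbols("x y")
    return combine_diff_products(x * Diff(y) + y * Diff(x))  # D(x*y)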
def replace_generic_laplacian(expr, dim=None):
"""Laplacian can be written as Diff(Diff(term)) without explicitly giving the dimensions.
This function replaces these constructs by diff(term, 0, 0) + diff(term, 1, 1) + ...
For this to work, the arguments of the derivative have to be fields or field accesses such that the spatial
dimension can be determined.
>>> l = Diff(Diff(sp.symbols('x')))
>>> replace_generic_laplacian(l, 3)
Diff(Diff(x, 0, -1), 0, -1) + Diff(Diff(x, 1, -1), 1, -1) + Diff(Diff(x, 2, -1), 2, -1)
"""
if isinstance(expr, Diff):
arg, *indices = diff_args(expr)
if isinstance(arg, Field.Access):
dim = arg.field.spatial_dimensions
assert dim is not None
if len(indices) == 2 and all(i == -1 for i in indices):
return sum(diff(arg, i, i) for i in range(dim))
else:
return expr
else:
new_args = [replace_generic_laplacian(a, dim) for a in expr.args]
return expr.func(*new_args) if new_args else expr
def functional_derivative(functional, v):
r"""Computes functional derivative of functional with respect to v using Euler-Lagrange equation
.. math ::
\frac{\delta F}{\delta v} =
\frac{\partial F}{\partial v} - \nabla \cdot \frac{\partial F}{\partial \nabla v}
- assumes that gradients are represented by Diff() nodes
- Diff(Diff(r)) represents the divergence of r
"""
diffs = functional.atoms(Diff)
bulk_substitutions = {d: sp.Dummy() for d in diffs}
bulk_substitutions_inverse = {v: k for k, v in bulk_substitutions.items()}
non_diff_part = functional.subs(bulk_substitutions)
partial_f_partial_v = sp.diff(non_diff_part, v).subs(bulk_substitutions_inverse)
gradient_part = 0
for diff_obj in diffs:
if diff_obj.args[0] != v:
continue
dummy = sp.Dummy()
partial_f_partial_grad_v = functional.subs(diff_obj, dummy).diff(dummy).subs(dummy, diff_obj)
gradient_part += Diff(partial_f_partial_grad_v, target=diff_obj.target, superscript=diff_obj.superscript)
result = partial_f_partial_v - gradient_part
return result
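# Illustrative usage sketch (not part of the original module): functional derivative of a
# double-well potential plus a square-gradient term; c and kappa are arbitrary symbols.
def _demo_functional_derivative():
    c, kappa = sp.symbols("c kappa")
    functional = c**2 * (1 - c)**2 + kappa / 2 * Diff(c)**2
    return functional_derivative(functional, c)  # bulk derivative minus Diff(kappa*Diff(c))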
from typing import Optional, Union
import numpy as np
import sympy as sp
from pystencils.fd import Diff
from pystencils.fd.derivative import diff_args
from pystencils.fd.spatial import fd_stencils_standard
from pystencils.field import Field
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.sympyextensions import fast_subs
FieldOrFieldAccess = Union[Field, Field.Access]
# --------------------------------------- Advection Diffusion ----------------------------------------------------------
def diffusion(scalar, diffusion_coeff, idx=None):
"""Diffusion term ∇·( diffusion_coeff · ∇(scalar))
Examples:
>>> f = Field.create_generic('f', spatial_dimensions=2)
>>> d = sp.Symbol("d")
>>> dx = sp.Symbol("dx")
>>> diffusion_term = diffusion(scalar=f, diffusion_coeff=d)
>>> discretization = Discretization2ndOrder()
>>> expected_output = ((f[-1, 0] + f[0, -1] - 4 * f[0, 0] + f[0, 1] + f[1, 0]) * d) / dx**2
>>> sp.simplify(discretization(diffusion_term) - expected_output)
0
"""
if isinstance(scalar, Field):
first_arg = scalar.center
elif isinstance(scalar, Field.Access):
first_arg = scalar
else:
raise ValueError("Diffused scalar has to be a pystencils Field or Field.Access")
args = [first_arg, diffusion_coeff if not isinstance(diffusion_coeff, Field) else diffusion_coeff.center]
if idx is not None:
args.append(idx)
return Diffusion(*args)
def advection(advected_scalar: FieldOrFieldAccess, velocity_field: FieldOrFieldAccess, idx: Optional[int] = None):
"""Advection term ∇·(velocity_field · advected_scalar)
Term that describes the advection of a scalar quantity in a velocity field.
"""
if isinstance(advected_scalar, Field):
first_arg = advected_scalar.center
elif isinstance(advected_scalar, Field.Access):
first_arg = advected_scalar
else:
raise ValueError("Advected scalar has to be a pystencils Field or Field.Access")
args = [first_arg, velocity_field if not isinstance(velocity_field, Field) else velocity_field.center]
if idx is not None:
args.append(idx)
return Advection(*args)
def transient(scalar, idx=None):
"""Transient term ∂_t(scalar)"""
if isinstance(scalar, Field):
args = [scalar.center]
elif isinstance(scalar, Field.Access):
args = [scalar]
else:
raise ValueError("Scalar has to be a pystencils Field or Field.Access")
if idx is not None:
args.append(idx)
return Transient(*args)
class Discretization2ndOrder:
def __init__(self, dx=sp.Symbol("dx"), dt=sp.Symbol("dt"), discretization_stencil_func=fd_stencils_standard):
self.dx = dx
self.dt = dt
self.spatial_stencil = discretization_stencil_func
def _discretize_diffusion(self, e):
result = 0
for c in range(e.dim):
first_diffs = [offset
* (e.diffusion_scalar_at_offset(c, offset) * e.diffusion_coefficient_at_offset(c, offset)
- e.diffusion_scalar_at_offset(0, 0) * e.diffusion_coefficient_at_offset(0, 0))
for offset in [-1, 1]]
result += first_diffs[1] - first_diffs[0]
return result / (self.dx ** 2)
def _discretize_advection(self, expr):
result = 0
for c in range(expr.dim):
interpolated = [(expr.advected_scalar_at_offset(c, offset) * expr.velocity_field_at_offset(c, offset, c)
+ expr.advected_scalar_at_offset(c, 0) * expr.velocity_field_at_offset(c, 0, c)) / 2
for offset in [-1, 1]]
result += interpolated[1] - interpolated[0]
return result / self.dx
def _discretize_spatial(self, e):
if isinstance(e, Diffusion):
return self._discretize_diffusion(e)
elif isinstance(e, Advection):
return self._discretize_advection(e)
elif isinstance(e, Diff):
arg, *indices = diff_args(e)
if not isinstance(arg, Field.Access):
raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
return self.spatial_stencil(indices, self.dx, arg)
else:
new_args = [self._discretize_spatial(a) for a in e.args]
return e.func(*new_args) if new_args else e
def __call__(self, expr):
if isinstance(expr, list):
return [self(e) for e in expr]
elif isinstance(expr, sp.Matrix) or isinstance(expr, sp.ImmutableDenseMatrix):
return expr.applyfunc(self.__call__)
elif isinstance(expr, AssignmentCollection):
# discretize each right-hand side; copying the assignments unchanged would make this branch a no-op
return expr.copy(main_assignments=[e.func(e.lhs, self(e.rhs)) for e in expr.main_assignments],
subexpressions=[e.func(e.lhs, self(e.rhs)) for e in expr.subexpressions])
transient_terms = expr.atoms(Transient)
if len(transient_terms) == 0:
return self._discretize_spatial(expr)
elif len(transient_terms) == 1:
transient_term = transient_terms.pop()
solve_result = sp.solve(expr, transient_term)
if len(solve_result) != 1:
raise ValueError("Could not solve for transient term" + str(solve_result))
rhs = solve_result.pop()
# explicit euler
return transient_term.scalar + self.dt * self._discretize_spatial(rhs)
else:
raise NotImplementedError("Cannot discretize expression with more than one transient term: "
+ str(transient_terms))
# -------------------------------------- Helper Classes ----------------------------------------------------------------
class Advection(sp.Function):
@property
def scalar(self):
return self.args[0].field
@property
def vector(self):
if isinstance(self.args[1], Field.Access):
return self.args[1].field
else:
return self.args[1]
@property
def scalar_index(self):
return None if len(self.args) <= 2 else int(self.args[2])
@property
def dim(self):
return self.scalar.spatial_dimensions
def _latex(self, printer):
name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
if isinstance(self.vector, Field):
return r"\nabla \cdot(%s %s)" % (printer.doprint(sp.Symbol(self.vector.name)),
printer.doprint(sp.Symbol(self.scalar.name + name_suffix)))
else:
args = [r"\partial_%d(%s %s)" % (i, printer.doprint(sp.Symbol(self.scalar.name + name_suffix)),
printer.doprint(self.vector[i]))
for i in range(self.dim)]
return " + ".join(args)
# --- Interface for discretization strategy
def velocity_field_at_offset(self, offset_dim, offset_value, index):
v = self.vector
if isinstance(v, Field):
assert v.index_dimensions == 1
return v.neighbor(offset_dim, offset_value)(index)
else:
return v[index]
def advected_scalar_at_offset(self, offset_dim, offset_value):
idx = 0 if self.scalar_index is None else int(self.scalar_index)
return self.scalar.neighbor(offset_dim, offset_value)(idx)
class Diffusion(sp.Function):
@property
def scalar(self):
return self.args[0].field
@property
def diffusion_coeff(self):
if isinstance(self.args[1], Field.Access):
return self.args[1].field
else:
return self.args[1]
@property
def scalar_index(self):
return None if len(self.args) <= 2 else int(self.args[2])
@property
def dim(self):
return self.scalar.spatial_dimensions
def _latex(self, printer):
name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
coeff = self.diffusion_coeff
diff_coeff = sp.Symbol(coeff.name) if isinstance(coeff, Field) else coeff
return r"div(%s \nabla %s)" % (printer.doprint(diff_coeff),
printer.doprint(sp.Symbol(self.scalar.name + name_suffix)))
# --- Interface for discretization strategy
def diffusion_scalar_at_offset(self, offset_dim, offset_value):
idx = 0 if self.scalar_index is None else self.scalar_index
return self.scalar.neighbor(offset_dim, offset_value)(idx)
def diffusion_coefficient_at_offset(self, offset_dim, offset_value):
d = self.diffusion_coeff
if isinstance(d, Field):
return d.neighbor(offset_dim, offset_value)
else:
return d
class Transient(sp.Function):
@property
def scalar(self):
if self.scalar_index is None:
return self.args[0].field.center
else:
return self.args[0].field(self.scalar_index)
@property
def scalar_index(self):
return None if len(self.args) <= 1 else int(self.args[1])
def _latex(self, printer):
name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
return r"\partial_t %s" % (printer.doprint(sp.Symbol(self.scalar.name + name_suffix)),)
# -------------------------------------------- Deprecated Functions ----------------------------------------------------
def grad(var, dim=3):
r"""
Gradients are represented as a special symbol:
e.g. :math:`\nabla x = (x^{\Delta 0}, x^{\Delta 1}, x^{\Delta 2})`
This function takes a symbol and creates the gradient symbols according to convention above
Args:
var: symbol to take the gradient of
dim: dimension (length) of the gradient vector
"""
if hasattr(var, "__getitem__"):
return [[sp.Symbol("%s^Delta^%d" % (v.name, i)) for v in var] for i in range(dim)]
else:
return [sp.Symbol("%s^Delta^%d" % (var.name, i)) for i in range(dim)]
def discretize_center(term, symbols_to_field_dict, dx, dim=3):
"""
Expects a term that contains the given symbols and gradient components of these symbols, and replaces them
by field accesses. Gradients are replaced by central approximations:
``(upper neighbor - lower neighbor) / (2*dx)``
Args:
term: term where symbols and gradient(symbol) should be replaced
symbols_to_field_dict: mapping of symbols to Field
dx: width and height of one cell
dim: dimension
Example:
>>> x = sp.Symbol("x")
>>> grad_x = grad(x, dim=3)
>>> term = x * grad_x[0]
>>> term
x*x^Delta^0
>>> f = Field.create_generic('f', spatial_dimensions=3)
>>> expected_output = f[0, 0, 0] * (-f[-1, 0, 0]/2 + f[1, 0, 0]/2)
>>> sp.simplify(discretize_center(term, { x: f }, dx=1, dim=3) - expected_output)
0
"""
substitutions = {}
for symbols, field in symbols_to_field_dict.items():
if not hasattr(symbols, "__getitem__"):
symbols = [symbols]
g = grad(symbols, dim)
substitutions.update({symbol: field(i) for i, symbol in enumerate(symbols)})
for d in range(dim):
up, down = __up_down_offsets(d, dim)
substitutions.update({g[d][i]: (field[up](i) - field[down](i)) / dx / 2 for i in range(len(symbols))})
return fast_subs(term, substitutions)
def discretize_staggered(term, symbols_to_field_dict, coordinate, coordinate_offset, dx, dim=3):
"""
Expects a term that contains the given symbols and gradient components of these symbols, and replaces them
by field accesses. Gradients in the coordinate direction are replaced by the staggered version at the cell boundary;
symbols themselves and gradients in other directions are replaced by an interpolated version at the cell face.
Args:
term: input term where symbols and gradients are replaced
symbols_to_field_dict: mapping of symbols to Field
coordinate: coordinate id (0 for x, 1 for y, ...) defining the cell boundary.
Only gradients in this direction, i.e. symbol^Delta^coordinate, are replaced
coordinate_offset: either +1 or -1 for upper or lower face in coordinate direction
dx: width and height of one cell
dim: dimension
Examples:
Discretizing at the right/east face of a cell, i.e. coordinate=0, coordinate_offset=1
>>> x, dx = sp.symbols("x dx")
>>> grad_x = grad(x, dim=3)
>>> term = x * grad_x[0]
>>> term
x*x^Delta^0
>>> f = Field.create_generic('f', spatial_dimensions=3)
>>> discretize_staggered(term, symbols_to_field_dict={ x: f}, dx=dx, coordinate=0, coordinate_offset=1, dim=3)
(-f_C + f_E)*(f_C/2 + f_E/2)/dx
"""
assert coordinate_offset == 1 or coordinate_offset == -1
assert 0 <= coordinate < dim
substitutions = {}
for symbols, field in symbols_to_field_dict.items():
if not hasattr(symbols, "__getitem__"):
symbols = [symbols]
offset = [0] * dim
offset[coordinate] = coordinate_offset
offset = np.array(offset, dtype=int)
gradient = grad(symbols)[coordinate]
substitutions.update({s: (field[offset](i) + field(i)) / 2 for i, s in enumerate(symbols)})
substitutions.update({g: (field[offset](i) - field(i)) / dx * coordinate_offset
for i, g in enumerate(gradient)})
for d in range(dim):
if d == coordinate:
continue
up, down = __up_down_offsets(d, dim)
for i, s in enumerate(symbols):
center_grad = (field[up](i) - field[down](i)) / (2 * dx)
neighbor_grad = (field[up + offset](i) - field[down + offset](i)) / (2 * dx)
substitutions[grad(s)[d]] = (center_grad + neighbor_grad) / 2
return fast_subs(term, substitutions)
def discretize_divergence(vector_term, symbols_to_field_dict, dx):
"""
Computes discrete divergence of symbolic vector
Args:
vector_term: sequence of terms, interpreted as vector
symbols_to_field_dict: mapping of symbols to Field
dx: length of a cell
Examples:
Laplace stencil
>>> x, dx = sp.symbols("x dx")
>>> grad_x = grad(x, dim=3)
>>> f = Field.create_generic('f', spatial_dimensions=3)
>>> expected_output = (f[-1, 0, 0] + f[0, -1, 0] + f[0, 0, -1] -
... 6*f[0, 0, 0] + f[0, 0, 1] + f[0, 1, 0] + f[1, 0, 0])/dx**2
>>> sp.simplify(discretize_divergence(grad_x, {x : f}, dx) - expected_output)
0
"""
dim = len(vector_term)
result = 0
for d in range(dim):
for offset in [-1, 1]:
result += offset * discretize_staggered(vector_term[d], symbols_to_field_dict, d, offset, dx, dim)
return result / dx
def __up_down_offsets(d, dim):
coord = [0] * dim
coord[d] = 1
up = np.array(coord, dtype=int)
coord[d] = -1
down = np.array(coord, dtype=int)
return up, down
import pystencils as ps
import sympy as sp
from pystencils.fd.derivation import FiniteDifferenceStaggeredStencilDerivation as FDS, \
FiniteDifferenceStencilDerivation as FD
import itertools
from collections import defaultdict
from collections.abc import Iterable
def get_access_and_direction(term):
direction1 = term.args[1]
if isinstance(term.args[0], ps.Field.Access): # first derivative
access = term.args[0]
direction = (direction1,)
elif isinstance(term.args[0], ps.fd.Diff): # nested derivative
if isinstance(term.args[0].args[0], ps.fd.Diff): # third or higher derivative
raise ValueError("can only handle first and second derivatives")
elif not isinstance(term.args[0].args[0], ps.Field.Access):
raise ValueError("can only handle derivatives of field accesses")
access, direction2 = term.args[0].args[:2]
direction = (direction1, direction2)
else:
raise NotImplementedError(f"can only deal with derivatives of field accesses, "
f"but not {type(term.args[0])}; expansion of derivatives probably failed")
return access, direction
class FVM1stOrder:
"""Finite-volume discretization
Args:
field: the field with the quantity to calculate, e.g. a concentration
flux: a list of sympy expressions that specify the flux, one for each Cartesian direction
source: a list of sympy expressions that specify the source
"""
def __init__(self, field: ps.field.Field, flux=0, source=0):
def normalize(f, shape):
shape = tuple(s for s in shape if s != 1)
if not shape:
shape = None
if isinstance(f, sp.Array) or isinstance(f, Iterable) or isinstance(f, sp.Matrix):
return sp.Array(f, shape)
else:
return sp.Array([f] * (sp.Mul(*shape) if shape else 1))
self.c = field
self.dim = self.c.spatial_dimensions
self.j = normalize(flux, (self.dim, ) + self.c.index_shape)
self.q = normalize(source, self.c.index_shape)
def discrete_flux(self, flux_field: ps.field.Field):
"""Return a list of assignments for the discrete fluxes
Args:
flux_field: a staggered field to which the fluxes should be assigned
"""
assert ps.FieldType.is_staggered(flux_field)
num = 0
def discretize(term, neighbor):
nonlocal num
if isinstance(term, sp.Matrix):
nw = term.applyfunc(lambda t: discretize(t, neighbor))
return nw
elif isinstance(term, ps.field.Field.Access):
avg = (term.get_shifted(*neighbor) + term) * sp.Rational(1, 2)
return avg
elif isinstance(term, ps.fd.Diff):
access, direction = get_access_and_direction(term)
fds = FDS(neighbor, access.field.spatial_dimensions, direction,
free_weights_prefix=f'fvm_free_{num}' if sp.Matrix(neighbor).dot(neighbor) > 2 else None)
num += 1
return fds.apply(access)
if term.args:
new_args = [discretize(a, neighbor) for a in term.args]
return term.func(*new_args)
else:
return term
fluxes = self.j.applyfunc(ps.fd.derivative.expand_diff_full)
fluxes = [sp.Matrix(fluxes.tolist()[i]) if flux_field.index_dimensions > 1 else fluxes.tolist()[i]
for i in range(self.dim)]
A0 = sum([sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm()
for d in flux_field.staggered_stencil]) / self.dim
discrete_fluxes = []
for neighbor in flux_field.staggered_stencil:
neighbor = ps.stencil.direction_string_to_offset(neighbor)
directional_flux = fluxes[0] * int(neighbor[0])
for i in range(1, self.dim):
directional_flux += fluxes[i] * int(neighbor[i])
discrete_flux = sp.simplify(discretize(directional_flux, neighbor))
free_weights = [s for s in discrete_flux.atoms(sp.Symbol) if s.name.startswith('fvm_free_')]
if len(free_weights) > 0:
discrete_flux = discrete_flux.collect(discrete_flux.atoms(ps.field.Field.Access))
access_counts = defaultdict(list)
for values in itertools.product([-1, 0, 1],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
simp = discrete_flux.subs(subs)
access_count = len(simp.atoms(ps.field.Field.Access))
access_counts[access_count].append(simp)
best_count = min(access_counts.keys())
discrete_flux = sum(access_counts[best_count]) / len(access_counts[best_count])
discrete_fluxes.append(discrete_flux / sp.Matrix(neighbor).norm())
if flux_field.index_dimensions > 1:
return [ps.Assignment(lhs, rhs / A0)
for i, d in enumerate(flux_field.staggered_stencil) if discrete_fluxes[i]
for lhs, rhs in zip(flux_field.staggered_vector_access(d), sp.simplify(discrete_fluxes[i]))]
else:
return [ps.Assignment(flux_field.staggered_access(d), sp.simplify(discrete_fluxes[i]) / A0)
for i, d in enumerate(flux_field.staggered_stencil)]
def discrete_source(self):
"""Return a list of assignments for the discrete source term"""
def discretize(term):
if isinstance(term, ps.fd.Diff):
access, direction = get_access_and_direction(term)
if self.dim == 2:
stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ")
if "".join(a).strip()]
else:
stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ", "TB ")
if "".join(a).strip()]
weights = None
for stencil in [["N", "S", "E", "W", "T", "B"][:2 * self.dim], stencil]:
stencil = [tuple(ps.stencil.direction_string_to_offset(d, self.dim)) for d in stencil]
derivation = FD(direction, stencil).get_stencil()
if not derivation.accuracy:
continue
weights = derivation.weights
# if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
if len(free_weights) > 0:
zero_counts = defaultdict(list)
for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
weights = [w.subs(subs) for w in derivation.weights]
if not all(a == 0 for a in weights):
zero_count = sum([1 for w in weights if w == 0])
zero_counts[zero_count].append(weights)
best = zero_counts[max(zero_counts.keys())]
if len(best) > 1:
raise NotImplementedError("more than one suitable set of weights found, "
"don't know how to proceed")
weights = best[0]
break
if not weights:
raise Exception('the requested derivative cannot be performed with the available neighbors')
assert weights
if access._field.index_dimensions == 0:
return sum([access._field.__getitem__(point) * weight for point, weight in zip(stencil, weights)])
else:
total = access.get_shifted(*stencil[0]).at_index(*access.index) * weights[0]
for point, weight in zip(stencil[1:], weights[1:]):
addl = access.get_shifted(*point).at_index(*access.index) * weight
total += addl
return total
if term.args:
new_args = [discretize(a) for a in term.args]
return term.func(*new_args)
else:
return term
source = self.q.applyfunc(ps.fd.derivative.expand_diff_full)
source = source.applyfunc(discretize)
return [ps.Assignment(lhs, rhs) for lhs, rhs in zip(self.c.center_vector, sp.flatten(source)) if rhs]
def discrete_continuity(self, flux_field: ps.field.Field):
"""Return a list of assignments for the continuity equation, which includes the source term
Args:
flux_field: a staggered field from which the fluxes are taken
"""
assert ps.FieldType.is_staggered(flux_field)
neighbors = flux_field.staggered_stencil + [ps.stencil.inverse_direction_string(d)
for d in flux_field.staggered_stencil]
divergence = flux_field.staggered_vector_access(neighbors[0])
for d in neighbors[1:]:
divergence += flux_field.staggered_vector_access(d)
source = self.discrete_source()
source = {s.lhs: s.rhs for s in source}
return [ps.Assignment(lhs, (lhs - rhs + source[lhs]) if lhs in source else (lhs - rhs))
for lhs, rhs in zip(self.c.center_vector, divergence)]
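# Illustrative usage sketch (not part of the original module): finite-volume advection
# with flux j = v*c on a D2Q5 staggered flux field; the field names are arbitrary, and
# the field_type keyword of ps.fields is assumed to be available as in current pystencils.
def _demo_fvm_advection():
    c = ps.fields("c: [2D]")
    v = ps.fields("v(2): [2D]")
    j = ps.fields("j(2): [2D]", field_type=ps.FieldType.STAGGERED_FLUX)
    fvm = FVM1stOrder(c, flux=[v(i) * c.center for i in range(2)])
    return fvm.discrete_flux(j) + fvm.discrete_continuity(j)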
def VOF(j: ps.field.Field, v: ps.field.Field, ρ: ps.field.Field):
"""Volume-of-fluid discretization of advection
Args:
j: the staggered field to write the fluxes to. Should have a D2Q9/D3Q27 stencil. Other stencils work too, but
incur a small error (D2Q5/D3Q7: v^2, D3Q19: v^3).
v: the flow velocity field
ρ: the quantity to advect
"""
assert ps.FieldType.is_staggered(j)
fluxes = [[] for i in range(j.index_shape[0])]
v0 = v.center_vector
for d, neighbor in enumerate(j.staggered_stencil):
c = ps.stencil.direction_string_to_offset(neighbor)
v1 = v.neighbor_vector(c)
# going out
cond = sp.And(*[sp.Or(c[i] * v0[i] > 0, c[i] == 0) for i in range(len(v0))])
overlap1 = [1 - sp.Abs(v0[i]) for i in range(len(v0))]
overlap2 = [c[i] * v0[i] for i in range(len(v0))]
overlap = sp.Mul(*[(overlap1[i] if c[i] == 0 else overlap2[i]) for i in range(len(v0))])
fluxes[d].append(ρ.center_vector * overlap * sp.Piecewise((1, cond), (0, True)))
# coming in
cond = sp.And(*[sp.Or(c[i] * v1[i] < 0, c[i] == 0) for i in range(len(v1))])
overlap1 = [1 - sp.Abs(v1[i]) for i in range(len(v1))]
overlap2 = [v1[i] for i in range(len(v1))]
overlap = sp.Mul(*[(overlap1[i] if c[i] == 0 else overlap2[i]) for i in range(len(v1))])
sign = (c == 1).sum() % 2 * 2 - 1
fluxes[d].append(sign * ρ.neighbor_vector(c) * overlap * sp.Piecewise((1, cond), (0, True)))
for i, ff in enumerate(fluxes):
fluxes[i] = ff[0]
for f in ff[1:]:
fluxes[i] += f
assignments = []
for i, d in enumerate(j.staggered_stencil):
for lhs, rhs in zip(j.staggered_vector_access(d).values(), fluxes[i].values()):
assignments.append(ps.Assignment(lhs, rhs))
return assignments
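# Illustrative usage sketch (not part of the original module): VOF flux assignments for
# a scalar rho advected by velocity v, using the recommended D2Q9 staggered flux field
# (4 staggered directions in 2D); the field names are arbitrary choices.
def _demo_vof():
    j = ps.fields("j(4): [2D]", field_type=ps.FieldType.STAGGERED_FLUX)
    v = ps.fields("v(2): [2D]")
    rho = ps.fields("rho: [2D]")
    return VOF(j, v, rho)  # list of assignments writing the fluxes to j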
from functools import lru_cache
from typing import Tuple
import sympy as sp
from pystencils.astnodes import LoopOverCoordinate
from pystencils.fd import Diff
from pystencils.field import Field
from pystencils.transformations import generic_visit
from .derivation import FiniteDifferenceStencilDerivation
from .derivative import diff_args
def fd_stencils_standard(indices, dx, fa):
order = len(indices)
assert all(i >= 0 for i in indices), "Can only discretize objects with (integer) subscripts"
if order == 1:
idx = indices[0]
return (fa.neighbor(idx, 1) - fa.neighbor(idx, -1)) / (2 * dx)
elif order == 2:
if indices[0] == indices[1]:
return (-2 * fa + fa.neighbor(indices[0], -1) + fa.neighbor(indices[0], +1)) / (dx ** 2)
else:
offsets = [(1, 1), (-1, 1), (1, -1), (-1, -1)]
return sum(o1 * o2 * fa.neighbor(indices[0], o1).neighbor(indices[1], o2)
for o1, o2 in offsets) / (4 * dx ** 2)
raise NotImplementedError("Supports only derivatives up to order 2")
def fd_stencils_isotropic(indices, dx, fa):
dim = fa.field.spatial_dimensions
if dim == 1:
return fd_stencils_standard(indices, dx, fa)
elif dim == 2:
order = len(indices)
if order == 1:
idx = indices[0]
assert 0 <= idx < 2
other_idx = 1 if indices[0] == 0 else 0
weights = {-1: sp.Rational(1, 12) / dx,
0: sp.Rational(1, 3) / dx,
1: sp.Rational(1, 12) / dx}
upper_terms = sum(fa.neighbor(idx, +1).neighbor(other_idx, off) * w for off, w in weights.items())
lower_terms = sum(fa.neighbor(idx, -1).neighbor(other_idx, off) * w for off, w in weights.items())
return upper_terms - lower_terms
elif order == 2:
if indices[0] == indices[1]:
idx = indices[0]
other_idx = 1 if idx == 0 else 0
diagonals = sp.Rational(1, 12) * sum(fa.neighbor(0, i).neighbor(1, j) for i in (-1, 1) for j in (-1, 1))
div_direction = sp.Rational(5, 6) * sum(fa.neighbor(idx, i) for i in (-1, 1))
other_direction = - sp.Rational(1, 6) * sum(fa.neighbor(other_idx, i) for i in (-1, 1))
center = - sp.Rational(5, 3) * fa
return (diagonals + div_direction + other_direction + center) / (dx ** 2)
else:
return fd_stencils_standard(indices, dx, fa)
raise NotImplementedError("Supports only derivatives up to order 2 for 1D and 2D setups")
def fd_stencils_forth_order_isotropic(indices, dx, fa):
order = len(indices)
if order != 1:
raise NotImplementedError("Forth order finite difference discretization is "
"currently only supported for first derivatives")
dim = indices[0]
if dim not in (0, 1):
raise NotImplementedError("Forth order finite difference discretization is only implemented for 2D")
stencils = forth_order_2d_derivation()
return stencils[dim].apply(fa) / dx
def discretize_spatial(expr, dx, stencil=fd_stencils_standard):
if isinstance(stencil, str):
if stencil == 'standard':
stencil = fd_stencils_standard
elif stencil == 'isotropic':
stencil = fd_stencils_isotropic
else:
raise ValueError("Unknown stencil. Supported 'standard' and 'isotropic'")
def visitor(e):
if isinstance(e, Diff):
arg, *indices = diff_args(e)
if not isinstance(arg, Field.Access):
raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
return stencil(indices, dx, arg)
else:
new_args = [discretize_spatial(a, dx, stencil) for a in e.args]
return e.func(*new_args) if new_args else e
return generic_visit(expr, visitor)
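# Illustrative usage sketch (not part of the original module): discretize a 2D Laplacian
# with the isotropic stencil; the field name 'f' and the spacing symbol dx are arbitrary.
def _demo_discretize_laplacian():
    f = Field.create_generic('f', spatial_dimensions=2)
    dx = sp.Symbol("dx")
    laplacian = Diff(Diff(f.center, 0), 0) + Diff(Diff(f.center, 1), 1)
    return discretize_spatial(laplacian, dx, stencil='isotropic')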
def discretize_spatial_staggered(expr, dx, stencil=fd_stencils_standard):
def staggered_visitor(e, coordinate, sign):
if isinstance(e, Diff):
arg, *indices = diff_args(e)
if len(indices) != 1:
raise ValueError("Function supports only up to second derivatives")
if not isinstance(arg, Field.Access):
raise ValueError("Argument of inner derivative has to be field access")
target = indices[0]
if target == coordinate:
assert sign in (-1, 1)
return (arg.neighbor(coordinate, sign) - arg) / dx * sign
else:
return (stencil(indices, dx, arg.neighbor(coordinate, sign))
+ stencil(indices, dx, arg)) / 2
elif isinstance(e, Field.Access):
return (e.neighbor(coordinate, sign) + e) / 2
elif isinstance(e, sp.Symbol):
loop_idx = LoopOverCoordinate.is_loop_counter_symbol(e)
return e + sign / 2 if loop_idx == coordinate else e
else:
new_args = [staggered_visitor(a, coordinate, sign) for a in e.args]
return e.func(*new_args) if new_args else e
def visitor(e):
if isinstance(e, Diff):
arg, *indices = diff_args(e)
if isinstance(arg, Field.Access):
return stencil(indices, dx, arg)
else:
if not len(indices) == 1:
raise ValueError("This term is not support by the staggered discretization strategy")
target = indices[0]
return (staggered_visitor(arg, target, 1) - staggered_visitor(arg, target, -1)) / dx
else:
new_args = [visitor(a) for a in e.args]
return e.func(*new_args) if new_args else e
return generic_visit(expr, visitor)
# -------------------------------------- special stencils --------------------------------------------------------------
@lru_cache(maxsize=1)
def forth_order_2d_derivation() -> Tuple[FiniteDifferenceStencilDerivation.Result, ...]:
# Symmetry, isotropy and 4th order conditions are not enough to fully specify the stencil
# one weight has to be specifically set to a somewhat arbitrary value
second_neighbor_weight = sp.Rational(1, 10)
second_neighbor_stencil = [(i, j)
for i in (-2, -1, 0, 1, 2)
for j in (-2, -1, 0, 1, 2)
]
x_diff = FiniteDifferenceStencilDerivation((0,), second_neighbor_stencil)
x_diff.set_weight((2, 0), second_neighbor_weight)
x_diff.assume_symmetric(0, anti_symmetric=True)
x_diff.assume_symmetric(1)
x_diff_stencil = x_diff.get_stencil(isotropic=True)
y_diff = FiniteDifferenceStencilDerivation((1,), second_neighbor_stencil)
y_diff.set_weight((0, 2), second_neighbor_weight)
y_diff.assume_symmetric(1, anti_symmetric=True)
y_diff.assume_symmetric(0)
y_diff_stencil = y_diff.get_stencil(isotropic=True)
return x_diff_stencil, y_diff_stencil
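# Example (sketch): applying the derived fourth-order isotropic stencils to a
# field access. The field name 'f' is illustrative; divide by dx to obtain the
# derivative, cf. fd_stencils_forth_order_isotropic above.
#
#   >>> from pystencils import fields
#   >>> f = fields("f: double[2D]")
#   >>> x_stencil, y_stencil = forth_order_2d_derivation()
#   >>> df_dx = x_stencil.apply(f.center)  # approximates dx * (first x-derivative)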
import functools
import hashlib
import operator
import pickle
import re
from enum import Enum
from itertools import chain
from typing import List, Optional, Sequence, Set, Tuple, Union
import numpy as np
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.alignedarray import aligned_empty
from pystencils.typing import StructType, TypedSymbol, BasicType, create_type
from pystencils.typing.typed_sympy import FieldShapeSymbol, FieldStrideSymbol
from pystencils.stencil import (
direction_string_to_offset, inverse_direction, offset_to_direction_string)
from pystencils.sympyextensions import is_integer_sequence
__all__ = ['Field', 'fields', 'FieldType']
class FieldType(Enum):
# generic fields
GENERIC = 0
# index fields are currently only used for boundary handling
# the coordinates are not the loop counters in that case, but are read from this index field
INDEXED = 1
# communication buffer, used for (un)packing data in communication.
BUFFER = 2
# unsafe fields may be accessed in an absolute fashion - the index depends on the data
# and thus may lead to out-of-bounds accesses
CUSTOM = 3
# staggered field
STAGGERED = 4
# staggered field that reverses sign when accessed via opposite direction
STAGGERED_FLUX = 5
@staticmethod
def is_generic(field):
assert isinstance(field, Field)
return field.field_type == FieldType.GENERIC
@staticmethod
def is_indexed(field):
assert isinstance(field, Field)
return field.field_type == FieldType.INDEXED
@staticmethod
def is_buffer(field):
assert isinstance(field, Field)
return field.field_type == FieldType.BUFFER
@staticmethod
def is_custom(field):
assert isinstance(field, Field)
return field.field_type == FieldType.CUSTOM
@staticmethod
def is_staggered(field):
assert isinstance(field, Field)
return field.field_type == FieldType.STAGGERED or field.field_type == FieldType.STAGGERED_FLUX
@staticmethod
def is_staggered_flux(field):
assert isinstance(field, Field)
return field.field_type == FieldType.STAGGERED_FLUX
class Field:
"""
With fields one can formulate stencil-like update rules on structured grids.
This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.
Creating Fields:
The preferred method to create fields is the `fields` function.
Alternatively one can use one of the static functions `Field.create_generic`, `Field.create_from_numpy_array`
and `Field.create_fixed_size`. Don't instantiate the Field directly!
Fields can be created with known or unknown shapes:
1. If you want to create a kernel with fixed loop sizes i.e. the shape of the array is already known.
This is usually the case if just-in-time compilation directly from Python is done.
(see `Field.create_from_numpy_array`)
2. create a more general kernel that works for variable array sizes. This can be used to create kernels
beforehand for a library. (see `Field.create_generic`)
Dimensions and Indexing:
A field has spatial and index dimensions, where the spatial dimensions come first.
The interpretation is that the field has multiple cells in (usually) two or three dimensional space which are
looped over. Additionally N values are stored per cell. In this case spatial_dimensions is two or three,
and index_dimensions equals N. If you want to store a matrix on each point in a two dimensional grid, there
are four dimensions, two spatial and two index dimensions: ``len(arr.shape) == spatial_dims + index_dims``
The shape of the index dimension does not have to be specified. Just use the 'index_dimensions' parameter.
However, it is good practice to define the shape, since out of bounds accesses can be directly detected in this
case. The shape can be passed with the 'index_shape' parameter of the field creation functions.
When accessing (indexing) a field the result is a `Field.Access` which is derived from sympy Symbol.
First specify the spatial offsets in [], then, in case index_dimensions > 0, the indices in ()
e.g. ``f[-1,0,0](7)``
Staggered Fields:
Staggered fields are used to store a value on a second grid shifted by half a cell with respect to the usual
grid.
The first index dimension is used to specify the position on the staggered grid (e.g. 0 means half-way to the
eastern neighbor, 1 is half-way to the northern neighbor, etc.), while additional indices can be used to store
multiple values at each position.
Example using no index dimensions:
>>> a = np.zeros([10, 10])
>>> f = Field.create_from_numpy_array("f", a, index_dimensions=0)
>>> jacobi = (f[-1,0] + f[1,0] + f[0,-1] + f[0,1]) / 4
Examples for index dimensions to create LB field and implement stream pull:
>>> from pystencils import Assignment
>>> stencil = np.array([[0,0], [0,1], [0,-1]])
>>> src, dst = fields("src(3), dst(3) : double[2D]")
>>> assignments = [Assignment(dst[0,0](i), src[-offset](i)) for i, offset in enumerate(stencil)];
"""
@staticmethod
def create_generic(field_name, spatial_dimensions, dtype=np.float64, index_dimensions=0, layout='numpy',
index_shape=None, field_type=FieldType.GENERIC) -> 'Field':
"""
Creates a generic field where the field size is not fixed, i.e. the kernel can be called with arrays of different sizes
Args:
field_name: symbolic name for the field
dtype: numpy data type of the array the kernel is called with later
spatial_dimensions: see documentation of Field
index_dimensions: see documentation of Field
layout: tuple specifying the loop ordering of the spatial dimensions e.g. (2, 1, 0) means that
the outer loop loops over dimension 2, the second outer over dimension 1, and the inner loop
over dimension 0. Also allowed: the strings 'numpy' (0,1,..d) or 'reverse_numpy' (d, ..., 1, 0)
index_shape: optional shape of the index dimensions i.e. maximum values allowed for each index dimension,
has to be a list or tuple
field_type: besides the normal GENERIC fields, there are INDEXED fields that store indices of the domain
that should be iterated over, BUFFER fields that are used to generate communication
packing/unpacking kernels, and STAGGERED fields, which store values half-way to the next
cell
"""
if index_shape is not None:
assert index_dimensions == 0 or index_dimensions == len(index_shape)
index_dimensions = len(index_shape)
if isinstance(layout, str):
layout = spatial_layout_string_to_tuple(layout, dim=spatial_dimensions)
total_dimensions = spatial_dimensions + index_dimensions
if index_shape is None or len(index_shape) == 0:
shape = tuple([FieldShapeSymbol([field_name], i) for i in range(total_dimensions)])
else:
shape = tuple([FieldShapeSymbol([field_name], i) for i in range(spatial_dimensions)] + list(index_shape))
strides = tuple([FieldStrideSymbol(field_name, i) for i in range(total_dimensions)])
np_data_type = np.dtype(dtype)
if np_data_type.fields is not None:
if index_dimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
if field_type == FieldType.STAGGERED and index_dimensions == 0:
raise ValueError("A staggered field needs at least one index dimension")
return Field(field_name, field_type, dtype, layout, shape, strides)
@staticmethod
def create_from_numpy_array(field_name: str, array: np.ndarray, index_dimensions: int = 0,
field_type=FieldType.GENERIC) -> 'Field':
"""Creates a field based on the layout, data type, and shape of a given numpy array.
Kernels created for these kinds of fields can only be called with arrays of the same layout, shape and type.
Args:
field_name: symbolic name for the field
array: numpy array
index_dimensions: see documentation of Field
field_type: kind of field
"""
spatial_dimensions = len(array.shape) - index_dimensions
if spatial_dimensions < 1:
raise ValueError("Too many index dimensions. At least one spatial dimension required")
full_layout = get_layout_of_array(array)
spatial_layout = tuple([i for i in full_layout if i < spatial_dimensions])
assert len(spatial_layout) == spatial_dimensions
strides = tuple([s // np.dtype(array.dtype).itemsize for s in array.strides])
shape = tuple(int(s) for s in array.shape)
numpy_dtype = np.dtype(array.dtype)
if numpy_dtype.fields is not None:
if index_dimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
if field_type == FieldType.STAGGERED and index_dimensions == 0:
raise ValueError("A staggered field needs at least one index dimension")
return Field(field_name, field_type, array.dtype, spatial_layout, shape, strides)
@staticmethod
def create_fixed_size(field_name: str, shape: Tuple[int, ...], index_dimensions: int = 0,
dtype=np.float64, layout: str = 'numpy', strides: Optional[Sequence[int]] = None,
field_type=FieldType.GENERIC) -> 'Field':
"""
Creates a field with fixed sizes, i.e. the kernel can be called only with arrays of the same size and layout
Args:
field_name: symbolic name for the field
shape: overall shape of the array
index_dimensions: how many of the trailing dimensions are interpreted as index (as opposed to spatial)
dtype: numpy data type of the array the kernel is called with later
layout: full layout of array, not only spatial dimensions
strides: strides in bytes or None to automatically compute them from shape (assuming no padding)
field_type: kind of field
"""
spatial_dimensions = len(shape) - index_dimensions
assert spatial_dimensions >= 1
if isinstance(layout, str):
layout = layout_string_to_tuple(layout, spatial_dimensions + index_dimensions)
shape = tuple(int(s) for s in shape)
if strides is None:
strides = compute_strides(shape, layout)
else:
assert len(strides) == len(shape)
strides = tuple([s // np.dtype(dtype).itemsize for s in strides])
numpy_dtype = np.dtype(dtype)
if numpy_dtype.fields is not None:
if index_dimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
if field_type == FieldType.STAGGERED and index_dimensions == 0:
raise ValueError("A staggered field needs at least one index dimension")
spatial_layout = list(layout)
for i in range(spatial_dimensions, len(layout)):
spatial_layout.remove(i)
return Field(field_name, field_type, dtype, tuple(spatial_layout), shape, strides)
def __init__(self, field_name, field_type, dtype, layout, shape, strides):
"""Do not use directly. Use static create* methods"""
self._field_name = field_name
assert isinstance(field_type, FieldType)
assert len(shape) == len(strides)
self.field_type = field_type
self._dtype = create_type(dtype)
self._layout = normalize_layout(layout)
self.shape = shape
self.strides = strides
self.latex_name: Optional[str] = None
self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
self.coordinate_transform = sp.eye(self.spatial_dimensions)
if field_type == FieldType.STAGGERED:
assert self.staggered_stencil
def new_field_with_different_name(self, new_name):
if self.has_fixed_shape:
return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
else:
return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)
@property
def spatial_dimensions(self) -> int:
return len(self._layout)
@property
def index_dimensions(self) -> int:
return len(self.shape) - len(self._layout)
@property
def ndim(self) -> int:
return len(self.shape)
def values_per_cell(self) -> int:
return functools.reduce(operator.mul, self.index_shape, 1)
@property
def layout(self):
return self._layout
@property
def name(self) -> str:
return self._field_name
@property
def spatial_shape(self) -> Tuple[int, ...]:
return self.shape[:self.spatial_dimensions]
@property
def has_fixed_shape(self):
return is_integer_sequence(self.shape)
@property
def index_shape(self):
return self.shape[self.spatial_dimensions:]
@property
def has_fixed_index_shape(self):
return is_integer_sequence(self.index_shape)
@property
def spatial_strides(self):
return self.strides[:self.spatial_dimensions]
@property
def index_strides(self):
return self.strides[self.spatial_dimensions:]
@property
def dtype(self):
return self._dtype
@property
def itemsize(self):
return self.dtype.numpy_dtype.itemsize
def __repr__(self):
if any(isinstance(s, sp.Symbol) for s in self.spatial_shape):
spatial_shape_str = f'{self.spatial_dimensions}d'
else:
spatial_shape_str = ','.join(str(i) for i in self.spatial_shape)
index_shape_str = ','.join(str(i) for i in self.index_shape)
if self.index_shape:
return f'{self._field_name}({index_shape_str}): {self.dtype}[{spatial_shape_str}]'
else:
return f'{self._field_name}: {self.dtype}[{spatial_shape_str}]'
def __str__(self):
return self.name
def neighbor(self, coord_id, offset):
offset_list = [0] * self.spatial_dimensions
offset_list[coord_id] = offset
return Field.Access(self, tuple(offset_list))
def neighbors(self, stencil):
return [self.__getitem__(s) for s in stencil]
@property
def center_vector(self):
index_shape = self.index_shape
if len(index_shape) == 0:
return sp.Matrix([self.center])
elif len(index_shape) == 1:
return sp.Matrix([self(i) for i in range(index_shape[0])])
elif len(index_shape) == 2:
return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
elif len(index_shape) == 3:
return sp.Array([[[self(i, j, k) for k in range(index_shape[2])]
for j in range(index_shape[1])] for i in range(index_shape[0])])
else:
raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")
@property
def center(self):
center = tuple([0] * self.spatial_dimensions)
return Field.Access(self, center)
def neighbor_vector(self, offset):
"""Like neighbor, but returns the entire vector/tensor stored at offset."""
if self.spatial_dimensions == 2 and len(offset) == 3:
assert offset[2] == 0
offset = offset[:2]
if self.index_dimensions == 0:
return sp.Matrix([self.__getitem__(offset)])
elif self.index_dimensions == 1:
return sp.Matrix([self.__getitem__(offset)(i) for i in range(self.index_shape[0])])
elif self.index_dimensions == 2:
return sp.Matrix([[self.__getitem__(offset)(i, k) for k in range(self.index_shape[1])]
for i in range(self.index_shape[0])])
else:
raise NotImplementedError("neighbor_vector is not implemented for more than 2 index dimensions")
def __getitem__(self, offset):
if type(offset) is np.ndarray:
offset = tuple(offset)
if type(offset) is str:
offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
if type(offset) is not tuple:
offset = (offset,)
if len(offset) != self.spatial_dimensions:
raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
return Field.Access(self, offset)
def absolute_access(self, offset, index):
assert FieldType.is_custom(self)
return Field.Access(self, offset, index, is_absolute_access=True)
def staggered_access(self, offset, index=None):
"""If this field is a staggered field, it can be accessed using half-integer offsets.
For example, an offset of ``(0, sp.Rational(1,2))`` or ``"E"`` corresponds to the staggered point to the east
of the cell center, i.e. half-way to the eastern-next cell.
If the field stores more than one value per staggered point (e.g. a vector or a tensor), the index (integer or
tuple of integers) refers to which of these values to access.
"""
assert FieldType.is_staggered(self)
offset_orig = offset
if type(offset) is np.ndarray:
offset = tuple(offset)
if type(offset) is str:
offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
offset = tuple([o * sp.Rational(1, 2) for o in offset])
if len(offset) != self.spatial_dimensions:
raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
prefactor = 1
neighbor_vec = [0] * len(offset)
for i in range(self.spatial_dimensions):
if (offset[i] + sp.Rational(1, 2)).is_Integer:
neighbor_vec[i] = sp.sign(offset[i])
neighbor = offset_to_direction_string(neighbor_vec)
if neighbor not in self.staggered_stencil:
neighbor_vec = inverse_direction(neighbor_vec)
neighbor = offset_to_direction_string(neighbor_vec)
if FieldType.is_staggered_flux(self):
prefactor = -1
if neighbor not in self.staggered_stencil:
raise ValueError(f"{offset_orig} is not a valid neighbor for the {self.staggered_stencil_name} stencil")
offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))
idx = self.staggered_stencil.index(neighbor)
if self.index_dimensions == 1: # this field stores a scalar value at each staggered position
if index is not None:
raise ValueError("Cannot specify an index for a scalar staggered field")
return prefactor * Field.Access(self, offset, (idx,))
else: # this field stores a vector or tensor at each staggered position
if index is None:
raise ValueError(f"Wrong number of indices: Got 0, expected {self.index_dimensions - 1}")
if type(index) is np.ndarray:
index = tuple(index)
if type(index) is not tuple:
index = (index,)
if self.index_dimensions != len(index) + 1:
raise ValueError(f"Wrong number of indices: Got {len(index)}, expected {self.index_dimensions - 1}")
return prefactor * Field.Access(self, offset, (idx, *index))
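# Example (sketch): for a 2D staggered field storing one value per staggered
# position (D2Q5 stencil with storage directions "W" and "S"), an access via
# the opposite direction resolves to the stored direction of the neighbor cell.
#
#   >>> s = fields("s(2): double[2D]", field_type=FieldType.STAGGERED)
#   >>> s.staggered_access("W")  # stored in this cell
#   s_C^0
#   >>> s.staggered_access("E")  # resolved to the "W" value of the eastern neighbor
#   s_E^0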
def staggered_vector_access(self, offset):
"""Like staggered_access, but returns the entire vector/tensor stored at offset."""
assert FieldType.is_staggered(self)
if self.index_dimensions == 1:
return sp.Matrix([self.staggered_access(offset)])
elif self.index_dimensions == 2:
return sp.Matrix([self.staggered_access(offset, i) for i in range(self.index_shape[1])])
elif self.index_dimensions == 3:
return sp.Matrix([[self.staggered_access(offset, (i, k)) for k in range(self.index_shape[2])]
for i in range(self.index_shape[1])])
else:
raise NotImplementedError("staggered_vector_access is not implemented for more than 3 index dimensions")
@property
def staggered_stencil(self):
assert FieldType.is_staggered(self)
stencils = {
2: {
2: ["W", "S"], # D2Q5
4: ["W", "S", "SW", "NW"] # D2Q9
},
3: {
3: ["W", "S", "B"], # D3Q7
7: ["W", "S", "B", "BSW", "TSW", "BNW", "TNW"], # D3Q15
9: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS"], # D3Q19
13: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS", "BSW", "TSW", "BNW", "TNW"] # D3Q27
}
}
if self.index_shape[0] not in stencils[self.spatial_dimensions]:
raise ValueError(f"No known stencil has {self.index_shape[0]} staggered points")
return stencils[self.spatial_dimensions][self.index_shape[0]]
@property
def staggered_stencil_name(self):
assert FieldType.is_staggered(self)
return f"D{self.spatial_dimensions}Q{self.index_shape[0] * 2 + 1}"
def __call__(self, *args, **kwargs):
center = tuple([0] * self.spatial_dimensions)
return Field.Access(self, center)(*args, **kwargs)
def hashable_contents(self):
return (self._layout,
self.shape,
self.strides,
self.field_type,
self._field_name,
self.latex_name,
self._dtype)
def __hash__(self):
return hash(self.hashable_contents())
def __eq__(self, other):
if not isinstance(other, Field):
return False
return self.hashable_contents() == other.hashable_contents()
@property
def physical_coordinates(self):
if hasattr(self.coordinate_transform, '__call__'):
return self.coordinate_transform(self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
else:
return self.coordinate_transform @ (self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
@property
def physical_coordinates_staggered(self):
return self.coordinate_transform @ \
(self.coordinate_origin + pystencils.x_staggered_vector(self.spatial_dimensions))
def index_to_physical(self, index_coordinates: sp.Matrix, staggered=False):
if staggered:
index_coordinates = sp.Matrix([0.5] * len(self.coordinate_origin)) + index_coordinates
if hasattr(self.coordinate_transform, '__call__'):
return self.coordinate_transform(self.coordinate_origin + index_coordinates)
else:
return self.coordinate_transform @ (self.coordinate_origin + index_coordinates)
def physical_to_index(self, physical_coordinates: sp.Matrix, staggered=False):
if hasattr(self.coordinate_transform, '__call__'):
if hasattr(self.coordinate_transform, 'inv'):
return self.coordinate_transform.inv()(physical_coordinates) - self.coordinate_origin
else:
idx = sp.Matrix(sp.symbols(f'index_coordinates:{self.ndim}', real=True))
rtn = sp.solve(self.index_to_physical(idx) - physical_coordinates, idx)
assert rtn, f'Could not find inverse of coordinate_transform: {self.index_to_physical(idx)}'
return rtn
else:
rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin
if staggered:
rtn = sp.Matrix([i - 0.5 for i in rtn])
return rtn
def set_coordinate_origin_to_field_center(self):
self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape])
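# Example (sketch): with the default identity transform and an illustrative
# origin, cell indices map to physical coordinates as origin + index.
#
#   >>> f = fields("f: double[10, 10]")
#   >>> f.coordinate_origin = sp.Matrix([5, 5])
#   >>> f.index_to_physical(sp.Matrix([1, 2]))
#   Matrix([[6], [7]])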
# noinspection PyAttributeOutsideInit,PyUnresolvedReferences
class Access(TypedSymbol):
"""Class representing a relative access into a `Field`.
This class behaves like a normal sympy Symbol; it is actually derived from it. One can build up
sympy expressions using field accesses, solve for them, etc.
Examples:
>>> vector_field_2d = fields("v(2): double[2D]") # create a 2D vector field
>>> northern_neighbor_y_component = vector_field_2d[0, 1](1)
>>> northern_neighbor_y_component
v_N^1
>>> central_y_component = vector_field_2d(1)
>>> central_y_component
v_C^1
>>> central_y_component.get_shifted(1, 0) # move the existing access
v_E^1
>>> central_y_component.at_index(0) # change component
v_C^0
"""
_iterable = False # see https://i10git.cs.fau.de/pycodegen/pystencils/-/merge_requests/166#note_10680
def __new__(cls, name, *args, **kwargs):
obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs)
return obj
def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False, dtype=None):
field_name = field.name
offsets_and_index = (*offsets, *idx) if idx is not None else offsets
constant_offsets = not any([isinstance(o, sp.Basic) and not o.is_Integer for o in offsets_and_index])
if not idx:
idx = tuple([0] * field.index_dimensions)
if constant_offsets:
offset_name = offset_to_direction_string(offsets)
if field.index_dimensions == 0:
superscript = None
elif field.index_dimensions == 1:
superscript = str(idx[0])
else:
idx_str = ",".join([str(e) for e in idx])
superscript = idx_str
if field.has_fixed_index_shape and not isinstance(field.dtype, StructType):
for i, bound in zip(idx, field.index_shape):
if i >= bound:
raise ValueError("Field index out of bounds")
else:
offset_name = hashlib.md5(pickle.dumps(offsets_and_index)).hexdigest()[:12]
superscript = None
symbol_name = f"{field_name}_{offset_name}"
if superscript is not None:
symbol_name += "^" + superscript
if dtype:
obj = super(Field.Access, self).__xnew__(self, symbol_name, dtype)
else:
obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
obj._field = field
obj._offsets = []
for o in offsets:
if isinstance(o, sp.Basic):
obj._offsets.append(o)
else:
obj._offsets.append(int(o))
obj._offsets = tuple(sp.sympify(obj._offsets))
obj._offsetName = offset_name
obj._superscript = superscript
obj._index = idx
obj._indirect_addressing_fields = set()
for e in chain(obj._offsets, obj._index):
if isinstance(e, sp.Basic):
obj._indirect_addressing_fields.update(a.field for a in e.atoms(Field.Access))
obj._is_absolute_access = is_absolute_access
return obj
def __getnewargs__(self):
return self.field, self.offsets, self.index, self.is_absolute_access, self.dtype
def __getnewargs_ex__(self):
return (self.field, self.offsets, self.index, self.is_absolute_access, self.dtype), {}
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __call__(self, *idx):
if self._index != tuple([0] * self.field.index_dimensions):
raise ValueError("Indexing an already indexed Field.Access")
idx = tuple(idx)
if self.field.index_dimensions == 0 and idx == (0,):
idx = ()
if len(idx) != self.field.index_dimensions:
raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
if len(idx) == 1 and isinstance(idx[0], str):
dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
return Field.Access(self.field, self._offsets, idx,
is_absolute_access=self.is_absolute_access, dtype=dtype)
else:
return Field.Access(self.field, self._offsets, idx,
is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def __getitem__(self, *idx):
return self.__call__(*idx)
@property
def field(self) -> 'Field':
"""Field that the Access points to"""
return self._field
@property
def offsets(self) -> Tuple:
"""Spatial offset as tuple"""
return self._offsets
@property
def required_ghost_layers(self) -> int:
"""Largest spatial distance that is accessed."""
return int(np.max(np.abs(self._offsets)))
@property
def nr_of_coordinates(self):
return len(self._offsets)
@property
def offset_name(self) -> str:
"""Spatial offset as string, East-West for x, North-South for y and Top-Bottom for z coordinate.
Example:
>>> f = fields("f: double[2D]")
>>> f[1, 1].offset_name # north-east
'NE'
"""
return self._offsetName
@property
def index(self):
"""Value of index coordinates as tuple."""
return self._index
def neighbor(self, coord_id: int, offset: int) -> 'Field.Access':
"""Returns a new Access with changed spatial coordinates.
Args:
coord_id: index of the coordinate to change (0 for x, 1 for y,...)
offset: incremental change of this coordinate
Example:
>>> f = fields('f: [2D]')
>>> f[0,0].neighbor(coord_id=1, offset=-1)
f_S
"""
offset_list = list(self.offsets)
offset_list[coord_id] += offset
return Field.Access(self.field, tuple(offset_list), self.index,
is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def get_shifted(self, *shift) -> 'Field.Access':
"""Returns a new Access with changed spatial coordinates
Example:
>>> f = fields("f: [2D]")
>>> f[0,0].get_shifted(1, 1)
f_NE
"""
return Field.Access(self.field,
tuple(a + b for a, b in zip(shift, self.offsets)),
self.index,
is_absolute_access=self.is_absolute_access,
dtype=self.dtype)
def at_index(self, *idx_tuple) -> 'Field.Access':
"""Returns new Access with changed index.
Example:
>>> f = fields("f(9): [2D]")
>>> f(0).at_index(8)
f_C^8
"""
return Field.Access(self.field, self.offsets, idx_tuple,
is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def _eval_subs(self, old, new):
return Field.Access(self.field,
tuple(sp.sympify(a).subs(old, new) for a in self.offsets),
tuple(sp.sympify(a).subs(old, new) for a in self.index),
is_absolute_access=self.is_absolute_access,
dtype=self.dtype)
@property
def is_absolute_access(self) -> bool:
"""Indicates if a field access is relative to the loop counters (this is the default) or absolute"""
return self._is_absolute_access
@property
def indirect_addressing_fields(self) -> Set['Field']:
"""Returns a set of fields that the access depends on.
e.g. f[index_field[1, 0]], the outer access to f depends on index_field
"""
return self._indirect_addressing_fields
def _hashable_content(self):
super_class_contents = super(Field.Access, self)._hashable_content()
return (super_class_contents, self._field.hashable_contents(), *self._index,
*self._offsets, self._is_absolute_access)
def _staggered_offset(self, offsets, index):
assert FieldType.is_staggered(self._field)
neighbor = self._field.staggered_stencil[index]
neighbor = direction_string_to_offset(neighbor, self._field.spatial_dimensions)
return [(o + sp.Rational(int(neighbor[i]), 2)) for i, o in enumerate(offsets)]
def _latex(self, _):
n = self._field.latex_name if self._field.latex_name else self._field.name
offset_str = ",".join([sp.latex(o) for o in self.offsets])
if FieldType.is_staggered(self._field):
offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
for i in range(len(self.offsets))])
if self.is_absolute_access:
offset_str = f"\\mathbf{offset_str}"
elif self.field.spatial_dimensions > 1:
offset_str = f"({offset_str})"
if FieldType.is_staggered(self._field):
if self.index and self.field.index_dimensions > 1:
return f"{{{n}}}_{{{offset_str}}}^{{{self.index[1:] if len(self.index) > 2 else self.index[1]}}}"
else:
return f"{{{n}}}_{{{offset_str}}}"
else:
if self.index and self.field.index_dimensions > 0:
return f"{{{n}}}_{{{offset_str}}}^{{{self.index if len(self.index) > 1 else self.index[0]}}}"
else:
return f"{{{n}}}_{{{offset_str}}}"
def __str__(self):
n = self._field.latex_name if self._field.latex_name else self._field.name
offset_str = ",".join([sp.latex(o) for o in self.offsets])
if FieldType.is_staggered(self._field):
offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
for i in range(len(self.offsets))])
if self.is_absolute_access:
offset_str = f"[abs]{offset_str}"
if FieldType.is_staggered(self._field):
if self.index and self.field.index_dimensions > 1:
return f"{n}[{offset_str}]({self.index[1:] if len(self.index) > 2 else self.index[1]})"
else:
return f"{n}[{offset_str}]"
else:
if self.index and self.field.index_dimensions > 0:
return f"{n}[{offset_str}]({self.index if len(self.index) > 1 else self.index[0]})"
else:
return f"{n}[{offset_str}]"
def fields(description=None, index_dimensions=0, layout=None,
field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
"""Creates pystencils fields from a string description.
Examples:
Create a 2D scalar and vector field:
>>> s, v = fields("s, v(2): double[2D]")
>>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
>>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
Create an integer field of shape (10, 20):
>>> f = fields("f : int32[10, 20]")
>>> f.has_fixed_shape, f.shape
(True, (10, 20))
Numpy arrays can be used as template for shape and data type of field:
>>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
>>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
>>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
>>> assert v.index_shape == (2,)
Format string can be left out, field names are taken from keyword arguments.
>>> fields(f1=arr_s, f2=arr_s)
[f1: double[20,20], f2: double[20,20]]
The keyword names ``index_dimensions`` and ``layout`` have special meaning, don't use them for field names
>>> f = fields(f=arr_v, index_dimensions=1)
>>> assert f.index_dimensions == 1
>>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
>>> f.layout
(2, 1, 0)
"""
result = []
if description:
field_descriptions, dtype, shape = _parse_description(description)
layout = 'numpy' if layout is None else layout
for field_name, idx_shape in field_descriptions:
if field_name in kwargs:
arr = kwargs[field_name]
idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
assert idx_shape_of_arr == idx_shape
f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
field_type=field_type)
elif isinstance(shape, tuple):
f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
elif isinstance(shape, int):
f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
index_shape=idx_shape, layout=layout, field_type=field_type)
elif shape is None:
f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
index_shape=idx_shape, layout=layout, field_type=field_type)
else:
assert False
result.append(f)
else:
assert layout is None, "Layout can not be specified when creating Field from numpy array"
for field_name, arr in kwargs.items():
result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
field_type=field_type))
if len(result) == 0:
raise ValueError("Could not parse field description")
elif len(result) == 1:
return result[0]
else:
return result
def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
coordinates = list(range(len(strides)))
relevant_strides = [stride for i, stride in enumerate(strides) if i not in index_dimension_ids]
result = [x for (y, x) in sorted(zip(relevant_strides, coordinates), key=lambda pair: pair[0], reverse=True)]
return normalize_layout(result)
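# Example (sketch): coordinate 2 has the largest stride, so it varies slowest
# and comes first in the layout tuple.
#
#   >>> get_layout_from_strides((20, 1, 400))
#   (2, 0, 1)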
def get_layout_of_array(arr: np.ndarray, index_dimension_ids: Optional[List[int]] = None):
""" Returns a list indicating the memory layout (linearization order) of the numpy array.
Examples:
>>> get_layout_of_array(np.zeros([3,3,3]))
(0, 1, 2)
In this example the loop over the zeroth coordinate should be the outermost loop,
followed by the first and second. Elements arr[x,y,0] and arr[x,y,1] are adjacent in memory.
Normally constructed numpy arrays have this order; however, arrays with a different memory layout can be
created via stride tricks or by other frameworks.
The index_dimension_ids parameter specifies which coordinates are index (not spatial) dimensions and
should therefore be ignored when determining the layout.
"""
index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
return get_layout_from_strides(arr.strides, index_dimension_ids)
def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0, **kwargs):
"""Creates numpy array with given memory layout.
Args:
shape: shape of the resulting array
layout: layout as tuple, where the coordinates are ordered from slow to fast
alignment: number of bytes to align the beginning and the innermost coordinate to, or False for no alignment
byte_offset: only used when alignment is specified; instead of the array beginning, the address at this
offset is aligned. Mostly used to align the first inner cell rather than the ghost layers
Example:
>>> res = create_numpy_array_with_layout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
>>> res.shape
(2, 3, 4, 5)
>>> get_layout_of_array(res)
(3, 2, 0, 1)
"""
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
cur_layout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if cur_layout[i] != layout[i]:
index_to_swap_with = cur_layout.index(layout[i])
swaps.append((i, index_to_swap_with))
cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
assert tuple(cur_layout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
if not alignment:
res = np.empty(shape, order='c', **kwargs)
else:
res = aligned_empty(shape, alignment, byte_offset=byte_offset, **kwargs)
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
if dim <= 0:
raise ValueError("Dimensionality must be positive")
layout_str = layout_str.lower()
if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
if dim > 3:
raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
return tuple(reversed(range(dim)))
if layout_str in ('f', 'reverse_numpy'):
return tuple(reversed(range(dim)))
elif layout_str in ('c', 'numpy'):
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layout_str)
def layout_string_to_tuple(layout_str, dim):
if dim <= 0:
raise ValueError("Dimensionality must be positive")
layout_str = layout_str.lower()
if layout_str == 'fzyx' or layout_str == 'soa':
if dim > 4:
raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
return tuple(reversed(range(dim)))
elif layout_str == 'zyxf' or layout_str == 'aos':
if dim > 4:
raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
return tuple(reversed(range(dim - 1))) + (dim - 1,)
elif layout_str == 'f' or layout_str == 'reverse_numpy':
return tuple(reversed(range(dim)))
elif layout_str == 'c' or layout_str == 'numpy':
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layout_str)
def normalize_layout(layout):
"""Takes a layout tuple and subtracts the minimum from all entries"""
min_entry = min(layout)
return tuple(i - min_entry for i in layout)
def compute_strides(shape, layout):
"""
Computes strides assuming no padding exists
Args:
shape: shape (size) of array
layout: layout specification as tuple
Returns:
strides in elements, not in bytes
"""
dim = len(shape)
assert len(layout) == dim
assert len(set(layout)) == dim
strides = [0] * dim
product = 1
for j in reversed(layout):
strides[j] = product
product *= shape[j]
return tuple(strides)
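# Example (sketch): for shape (2, 3, 4) in C order, i.e. layout (0, 1, 2),
# the element strides (not byte strides) are:
#
#   >>> compute_strides((2, 3, 4), (0, 1, 2))
#   (12, 4, 1)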
# ---------------------------------------- Parsing of string in fields() function --------------------------------------
field_description_regex = re.compile(r"""
\s* # ignore leading white spaces
(\w+) # identifier is a sequence of alphanumeric characters, is stored in first group
(?: # optional index specification e.g. (1, 4, 2)
\s*
\(
([^\)]+) # read everything up to closing bracket
\)
\s*
)?
\s*,?\s* # ignore trailing white spaces and comma
""", re.VERBOSE)
type_description_regex = re.compile(r"""
\s*
(\w+)? # optional dtype
\s*
\[
([^\]]+)
\]
\s*
""", re.VERBOSE | re.IGNORECASE)
def _parse_part1(d):
result = field_description_regex.match(d)
while result:
name, index_str = result.group(1), result.group(2)
index = tuple(int(e) for e in index_str.split(",")) if index_str else ()
yield name, index
d = d[result.end():]
result = field_description_regex.match(d)
def _parse_description(description):
def parse_part2(d):
result = type_description_regex.match(d)
if result:
data_type_str, size_info = result.group(1), result.group(2).strip().lower()
if data_type_str is None:
data_type_str = 'float64'
data_type_str = data_type_str.lower().strip()
if not data_type_str:
data_type_str = 'float64'
if size_info.endswith('d'):
size_info = int(size_info[:-1])
else:
size_info = tuple(int(e) for e in size_info.split(","))
return data_type_str, size_info
else:
raise ValueError("Could not parse field description")
if ':' in description:
field_description, field_info = description.split(':')
else:
field_description, field_info = description, 'float64[2D]'
fields_info = [e for e in _parse_part1(field_description)]
if not field_info:
raise ValueError("Could not parse field description")
data_type, size = parse_part2(field_info)
return fields_info, data_type, size
import sympy as sp
from pystencils.typing import PointerType
class DivFunc(sp.Function):
"""
DivFunc represents a division operation, since sympy represents divisions with ^-1
"""
is_Atom = True
is_real = True
def __new__(cls, *args, **kwargs):
if len(args) != 2:
raise ValueError(f'{cls} takes exactly 2 arguments, but {len(args)} were given!')
divisor, dividend, *other_args = args
return sp.Function.__new__(cls, divisor, dividend, *other_args, **kwargs)
def _eval_evalf(self, *args, **kwargs):
return self.divisor.evalf() / self.dividend.evalf()
@property
def divisor(self):
return self.args[0]
@property
def dividend(self):
return self.args[1]
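# Example (sketch): DivFunc keeps the division explicit instead of letting
# sympy rewrite it as a multiplication by a negative power. Note that
# `divisor` names args[0] (the numerator) and `dividend` names args[1] here.
#
#   >>> x, y = sp.symbols("x y")
#   >>> d = DivFunc(x, y)
#   >>> d.divisor, d.dividend
#   (x, y)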
class AddressOf(sp.Function):
"""
AddressOf is the '&' operation in C. It gets the address of a lvalue.
"""
is_Atom = True
def __new__(cls, arg):
obj = sp.Function.__new__(cls, arg)
return obj
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
@property
def dtype(self):
if hasattr(self.args[0], 'dtype'):
return PointerType(self.args[0].dtype, restrict=True)
else:
raise ValueError(f'pystencils supports only non void pointers. Current address_of type: {self.args[0]}')
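# Example (sketch): taking the address of a typed symbol yields a
# restrict-qualified pointer to that symbol's type.
#
#   >>> from pystencils.typing import TypedSymbol, create_type
#   >>> ptr = AddressOf(TypedSymbol("x", create_type("float64")))
#   >>> isinstance(ptr.dtype, PointerType)
#   True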
from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
from pystencils.gpu.gpujit import make_python_function
from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
try:
import cupy as cp
import cupyx as cpx
except ImportError:
cp = None
cpx = None
import numpy as np
class GPUArrayHandler:
def __init__(self, device_number):
self._device_number = device_number
def zeros(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.zeros(shape=shape, dtype=dtype, order=order)
def ones(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.ones(shape=shape, dtype=dtype, order=order)
def empty(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.empty(shape=shape, dtype=dtype, order=order)
def to_gpu(self, numpy_array):
swaps = _get_index_swaps(numpy_array)
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
gpu_array = cp.asarray(numpy_array.base)
for a, b in reversed(swaps):
gpu_array = gpu_array.swapaxes(a, b)
return gpu_array
else:
return cp.asarray(numpy_array)
def upload(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
array.base.set(numpy_array.base)
else:
with cp.cuda.Device(self._device_number):
array.set(numpy_array)
def download(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
numpy_array.base[:] = array.base.get()
else:
with cp.cuda.Device(self._device_number):
numpy_array[:] = array.get()
def randn(self, shape, dtype=np.float64):
with cp.cuda.Device(self._device_number):
return cp.random.randn(*shape, dtype=dtype)
@staticmethod
def pinned_numpy_array(layout, shape, dtype):
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
cur_layout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if cur_layout[i] != layout[i]:
index_to_swap_with = cur_layout.index(layout[i])
swaps.append((i, index_to_swap_with))
cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
assert tuple(cur_layout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
from_numpy = to_gpu
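# Example (a sketch; requires a working cupy installation with at least one
# visible GPU device):
#
#   >>> handler = GPUArrayHandler(device_number=0)
#   >>> gpu_arr = handler.zeros((16, 16))
#   >>> host = np.ones((16, 16))
#   >>> handler.upload(gpu_arr, host)    # host -> device
#   >>> handler.download(gpu_arr, host)  # device -> host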
class GPUNotAvailableHandler:
def __getattribute__(self, name):
raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")
def _get_index_swaps(array):
swaps = []
if array.base is not None and isinstance(array.base, np.ndarray):
for stride in array.base.strides:
index_base = array.base.strides.index(stride)
index_view = array.strides.index(stride)
if index_base != index_view and (index_view, index_base) not in swaps:
swaps.append((index_base, index_view))
return swaps
import numpy as np
from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda
from pystencils.typing import StructType
from pystencils.field import FieldType
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing import BasicType, FieldPointerSymbol
USE_FAST_MATH = True
def get_cubic_interpolation_include_paths():
from os.path import join, dirname
return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
"""
Creates a kernel function from an abstract syntax tree which
was created e.g. by :func:`pystencils.gpu.create_cuda_kernel`
or :func:`pystencils.gpu.created_indexed_cuda_kernel`
Args:
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
custom_backend: use own custom printer for code generation
Returns:
compiled kernel as Python function
"""
import cupy as cp
if argument_dict is None:
argument_dict = {}
headers = get_headers(kernel_function_node)
if cp.cuda.runtime.is_hip:
headers.add('"gpu_defines.h"')
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<hip/hip_fp16.h>')
else:
headers.update({'"gpu_defines.h"', '<cstdint>'})
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<cuda_fp16.h>')
header_list = sorted(headers)
includes = "\n".join([f"#include {include_file}" for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n"
code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
options = ["-w", "-std=c++11"]
if USE_FAST_MATH:
options.append("-use_fast_math")
options.append("-I" + get_pystencils_include_path())
func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
args = tuple(_build_numpy_argument_list(parameters, full_arguments))
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node
parameters = kernel_function_node.get_parameters()
wrapper = KernelWrapper(wrapper, parameters, ast)
wrapper.num_regs = func.num_regs
return wrapper
def _build_numpy_argument_list(parameters, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
result = []
for param in parameters:
if param.is_field_pointer:
array = argument_dict[param.field_name]
actual_type = array.dtype
expected_type = param.fields[0].dtype.numpy_dtype
if expected_type != actual_type:
raise ValueError(f"Data type mismatch for field {param.field_name}. "
f"Expected {expected_type} got {actual_type}.")
result.append(array)
elif param.is_field_stride:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type
array = argument_dict[param.field_name]
stride = cast_to_dtype(array.strides[param.symbol.coordinate] // array.dtype.itemsize)
result.append(stride)
elif param.is_field_shape:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type
array = argument_dict[param.field_name]
result.append(cast_to_dtype(array.shape[param.symbol.coordinate]))
else:
expected_type = param.symbol.dtype.numpy_dtype
result.append(expected_type.type(argument_dict[param.symbol.name]))
assert len(result) == len(parameters)
return result
def _check_arguments(parameter_specification, argument_dict):
"""
Checks if parameters passed to kernel match the description in the AST function node.
If they do not match, a ValueError is raised; on success, the array shape that determines the CUDA blocks and threads is returned
"""
argument_dict = {k: v for k, v in argument_dict.items()}
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if isinstance(param.symbol, FieldPointerSymbol):
symbolic_field = param.fields[0]
try:
field_arr = argument_dict[symbolic_field.name]
except KeyError:
raise KeyError(f"Missing field parameter for kernel call {str(symbolic_field)}")
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError(f"Passed array {symbolic_field.name} has shape {str(field_arr.shape)} "
f"which does not match expected shape {str(symbolic_field.shape)}")
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError(f"Passed array {symbolic_field.name} has strides {str(field_arr.strides)} "
f"which does not match expected strides {str(symbolic_field_strides)}")
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
if len(array_shapes) > 1:
raise ValueError(f"All passed arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 1:
raise ValueError(f"All passed index arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 0:
return list(index_arr_shapes)[0]
else:
return list(array_shapes)[0]
import abc
from functools import partial
import math
from typing import List, Tuple
import sympy as sp
from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
from pystencils.sympyextensions import is_integer_sequence, prod
class ThreadIndexingSymbol(TypedSymbol):
def __new__(cls, *args, **kwds):
obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, name, dtype, *args, **kwargs):
obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs)
return obj
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABC):
"""
Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
to the GPU's block and grid system. It calculates indices based on the GPU's thread and block indices
and computes the number of blocks and threads a kernel is started with.
The Indexing class is created with an iteration space that is given as a list of slices determining start, stop
and step size for each coordinate. Furthermore, the data_layout is given as a tuple to determine the fast and slow
coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
for iter_space in iteration_space:
assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
f"not tuple of type {type(iter_space)}"
self._iteration_space = iteration_space
self._data_layout = data_layout
self._dim = len(iteration_space)
@property
def iteration_space(self):
"""Iteration space to loop over"""
return self._iteration_space
@property
def data_layout(self):
"""Data layout of the kernels arrays"""
return self._data_layout
@property
def dim(self):
"""Number of spatial dimensions"""
return self._dim
@property
@abc.abstractmethod
def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """
@property
def index_variables(self):
"""Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
"""Adds assignments for the loop counter symbols depending on the gpu threads.
Args:
loop_counter_symbols: typed symbols representing the loop counters
Returns:
assignments for the loop counters
"""
@abc.abstractmethod
def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call.
Args:
arr_shape: the numeric (not symbolic) shape of the array
Returns:
dict with keys 'block' and 'grid', holding tuples with the number of (x,y,z) threads per block and
blocks on the grid that the kernel should be started with
"""
@abc.abstractmethod
def guard(self, kernel_content, arr_shape):
"""In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
Args:
kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
arr_shape: the numeric or symbolic shape of the field
Returns:
ast node, which is put inside the kernel function
"""
@abc.abstractmethod
def max_threads_per_block(self):
"""Return maximal number of threads per block for launch bounds. If this cannot be determined without
knowing the array shape return None for unknown """
@abc.abstractmethod
def symbolic_parameters(self):
"""Set of symbols required in call_parameters code"""
# -------------------------------------------- Implementations ---------------------------------------------------------
class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple specifying loop order with innermost loop last.
This is the same format as returned by `Field.layout`.
permute_block_size_dependent_on_layout: if True, the block_size is permuted such that the fastest coordinate
gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
maximum block size from the device properties
device_number: device number of the used GPU. By default, the zeroth device is used.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64), device_number=None):
super(BlockIndexing, self).__init__(iteration_space, data_layout)
if self._dim > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
if permute_block_size_dependent_on_layout and self._dim < 4:
block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
self._block_size = block_size
if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
# Get device limits
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
else:
da = cp.cuda.Device(device_number).attributes
maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size
self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property
def cuda_indices(self):
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
indices = [block_index * bs + thread_idx
for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
return indices[:self._dim]
@property
def coordinates(self):
if self._dim < 4:
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
return coordinates[:self._dim]
else:
coordinates = list()
width = self._iteration_space[1].stop - self.iteration_space[1].start
coordinates.append(div_floor(self.cuda_indices[0], width))
coordinates.append(sp.Mod(self.cuda_indices[0], width))
coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
return coordinates
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
if len(widths) > 3:
widths = [widths[0] * widths[1], widths[2], widths[3]]
extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs
if not self._compile_time_block_size:
assert len(block_size) == 3
adapted_block_size = []
for i in range(len(widths)):
factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
extend_adapted_bs = (1,) * (3 - len(adapted_block_size))
block_size = tuple(adapted_block_size) + extend_adapted_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
extend_gr = (1,) * (3 - len(grid))
return {'block': block_size,
'grid': grid + extend_gr}
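# Worked example (sketch): for a 2D space with widths (64, 64) and permuted block size
# (2, 128, 1), the adaptation loop shrinks the block to (2, 64, 1) -- 128 threads would
# exceed the width of 64 -- and the grid becomes
# (div_ceil(64, 2), div_ceil(64, 64), 1) == (32, 1, 1).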
def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim]
if len(self._iteration_space) - 1 == len(arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
else:
assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
for i, s in enumerate(numeric_iteration_slice):
if s.step and s.step != 1:
end[i] = div_ceil(s.stop - s.start, s.step) + s.start
if self._dim < 4:
conditions = [c < e for c, e in zip(self.coordinates, end)]
else:
end = [end[0] * end[1], end[2], end[3]]
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
conditions = [c < e for c, e in zip(coordinates, end)]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)])
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a cupy function.
Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
returns: smaller block_size if too many registers are used.
"""
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True:
required_registers = math.prod(result) * required_registers_per_thread
if required_registers <= max_registers_per_block:
return result
else:
largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert result[largest_list_entry_idx] >= 2
result[largest_list_entry_idx] //= 2
@staticmethod
def permute_block_size_according_to_layout(block_size, layout):
"""Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
if not is_integer_sequence(block_size):
return block_size
sorted_block_size = list(sorted(block_size, reverse=True))
while len(sorted_block_size) > len(layout):
sorted_block_size[0] *= sorted_block_size[-1]
sorted_block_size = sorted_block_size[:-1]
result = list(block_size)
for l, bs in zip(reversed(layout), sorted_block_size):
result[l] = bs
return tuple(result[:len(layout)])
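# Worked example (sketch): block_size=(128, 2, 1) with the 2D layout (0, 1).
# The three entries are first folded down to two ([128, 2]); the fastest coordinate
# (layout[-1] == 1) then gets the largest dimension, yielding (2, 128).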
def max_threads_per_block(self):
if is_integer_sequence(self._block_size):
return prod(self._block_size)
else:
return None
def symbolic_parameters(self):
return set(b for b in self._block_size if isinstance(b, sp.Symbol))
class LineIndexing(AbstractIndexing):
"""
Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a 1D GPU block.
The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}.
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger
than the maximum number of threads allowed in a GPU block (which depends on the device).
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple to determine the fast and slow coordinates.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
super(LineIndexing, self).__init__(iteration_space, data_layout)
if len(iteration_space) > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@property
def cuda_indices(self):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
coordinates = available_indices[:self.dim]
fastest_coordinate = self.data_layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
return coordinates
@property
def coordinates(self):
return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self.cuda_indices:
return 1
else:
idx = self.cuda_indices.index(cuda_idx)
return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
def guard(self, kernel_content, arr_shape):
return kernel_content
def max_threads_per_block(self):
return None
def symbolic_parameters(self):
return set()
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
# -------------------------------------- Helper functions --------------------------------------------------------------
def _get_numeric_iteration_slice(iteration_slice, arr_shape):
res = []
for slice_component, shape in zip(iteration_slice, arr_shape):
result_slice = slice_component
if not isinstance(result_slice.start, int):
start = result_slice.start
assert len(start.free_symbols) == 1
start = start.subs({symbol: shape for symbol in start.free_symbols})
result_slice = slice(start, result_slice.stop, result_slice.step)
if not isinstance(result_slice.stop, int):
stop = result_slice.stop
assert len(stop.free_symbols) == 1
stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
result_slice = slice(result_slice.start, stop, result_slice.step)
assert isinstance(result_slice.step, int)
res.append(result_slice)
return res
def _get_widths(iteration_slice):
widths = []
for iter_slice in iteration_slice:
step = iter_slice.step
assert isinstance(step, int), f"Step can only be of type int, not of type {type(step)}"
start = iter_slice.start
stop = iter_slice.stop
if step == 1:
if stop - start == 0:
widths.append(1)
else:
widths.append(stop - start)
else:
width = (stop - start) / step
if isinstance(width, int):
widths.append(width)
elif isinstance(width, float):
widths.append(math.ceil(width))
else:
widths.append(div_ceil(stop - start, step))
return widths
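# Example (sketch): for slice(0, 10, 3) all bounds are plain ints, so the width is
# math.ceil((10 - 0) / 3) == 4 -- the iteration visits 0, 3, 6 and 9.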
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
loop_ctr_assignments = []
for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
if isinstance(iter_slice, slice) and iter_slice.step > 1:
offset = (iter_slice.step * iter_slice.start) - iter_slice.start
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
elif iter_slice.start == iter_slice.stop:
loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
else:
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
return loop_ctr_assignments
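# Example (sketch): for iter_slice == slice(2, 10, 2) the raw coordinate of thread t is
# t + 2 (see the `coordinates` properties), and offset == step * start - start == 2, so the
# loop counter becomes (t + 2) * 2 - 2, i.e. 2, 4, 6, ... for t = 0, 1, 2, ...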
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
if isinstance(gpu_indexing, str):
if gpu_indexing == 'block':
indexing_creator = BlockIndexing
elif gpu_indexing == 'line':
indexing_creator = LineIndexing
else:
raise ValueError(f"Unknown GPU indexing {gpu_indexing}. Valid values are 'block' and 'line'")
if gpu_indexing_params:
indexing_creator = partial(indexing_creator, **gpu_indexing_params)
return indexing_creator
else:
return gpu_indexing
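# Usage sketch (hypothetical parameters):
#
#     creator = indexing_creator_from_params('block', {'block_size': (64, 4, 1)})
#     indexing = creator(iteration_space=space, data_layout=layout)
#
# Anything that is not a string (e.g. an indexing class or a partial) is returned unchanged.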
import sympy as sp
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.slicing import normalize_slice
from pystencils.transformations import (
get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
function_name = config.function_name
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice = config.iteration_slice
ghost_layers = config.ghost_layers
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written])
buffers = set([f for f in all_fields if FieldType.is_buffer(f)])
fields_without_buffers = all_fields - buffers
field_accesses = set()
num_buffer_accesses = 0
indexed_elements = set()
for eq in assignments:
indexed_elements.update(eq.atoms(sp.Indexed))
field_accesses.update(eq.atoms(Field.Access))
field_accesses = {e for e in field_accesses if not e.is_absolute_access}
num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))
# common shape and field from which the iteration space is formed
common_field = get_common_field(fields_without_buffers)
common_shape = common_field.spatial_shape
if iteration_slice is None:
# determine iteration slice from ghost layers
if ghost_layers is None:
# determine required number of ghost layers from field access
required_ghost_layers = max([fa.required_ghost_layers for fa in field_accesses])
ghost_layers = [(required_ghost_layers, required_ghost_layers)] * len(common_shape)
iteration_slice = []
if isinstance(ghost_layers, int):
for i in range(len(common_shape)):
iteration_slice.append(slice(ghost_layers, -ghost_layers if ghost_layers > 0 else None))
ghost_layers = [(ghost_layers, ghost_layers)] * len(common_shape)
else:
for i in range(len(common_shape)):
iteration_slice.append(slice(ghost_layers[i][0],
-ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))
iteration_space = normalize_slice(iteration_slice, common_shape)
else:
iteration_space = normalize_slice(iteration_slice, common_shape)
iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
if len(indexed_elements) > 0:
common_indexed_element = get_common_indexed_element(indexed_elements)
index = common_indexed_element.indices[0].atoms(TypedSymbol)
assert len(index) == 1, "index expressions must only contain one symbol representing the index"
indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
data_layout=common_field.layout)
extended_ctrs = [index.pop(), *loop_counter_symbols]
loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
else:
indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
assignments = loop_counter_assignments + assignments
block = indexing.guard(Block(assignments), common_shape)
unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
ast = KernelFunction(block,
Target.GPU,
Backend.CUDA,
make_python_function,
ghost_layers,
function_name,
assignments=assignments)
ast.global_variables.update(indexing.index_variables)
base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None:
base_pointer_spec = []
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions)
for f in all_fields}
coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
if any(FieldType.is_buffer(f) for f in all_fields):
base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
field_to_fixed_coordinates=coord_mapping)
# If loop counter symbols have been explicitly used in the update equations (e.g. for built-in periodicity),
# they are defined here
undefined_loop_counters = {LoopOverCoordinate.is_loop_counter_symbol(s): s for s in ast.body.undefined_symbols
if LoopOverCoordinate.is_loop_counter_symbol(s) is not None}
for i, loop_counter in undefined_loop_counters.items():
ast.body.insert_front(SympyAssignment(loop_counter, indexing.coordinates[i]))
# attach the indexing object, which determines #blocks and #threads, as an additional member
# of the KernelFunction node; this is used by the jit
ast.indexing = indexing
return ast
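# Usage sketch: this function is normally reached through pystencils.create_kernel with a
# GPU target rather than called directly; a minimal route (hypothetical fields src, dst):
#
#     config = CreateKernelConfig(target=Target.GPU, gpu_indexing='block')
#     ast = create_kernel([Assignment(dst.center, 2 * src.center)], config=config)
#     kernel = ast.compile()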
def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
index_fields = config.index_fields
function_name = config.function_name
coordinate_names = config.coordinate_names
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written])
# extract the index fields based on the name. The original index field might have been modified
index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates. " \
f"Non-index fields are {non_index_fields}, spatial coordinates are " \
f"{spatial_coordinates}"
spatial_coordinates = list(spatial_coordinates)[0]
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
for index_field in index_fields:
index_field.field_type = FieldType.INDEXED
assert FieldType.is_indexed(index_field)
assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
def get_coordinate_symbol_assignment(name):
for ind_f in index_fields:
assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
data_type = ind_f.dtype
if data_type.has_element(name):
rhs = ind_f[0](name)
lhs = TypedSymbol(name, data_type.get_element_type(name))
return SympyAssignment(lhs, rhs)
raise ValueError(f"Index {name} not found in any of the passed index fields")
coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n)
for n in coordinate_names[:spatial_coordinates]]
coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]
idx_field = list(index_fields)[0]
iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
idx_field.spatial_shape)
indexing = indexing_creator(iteration_space=iteration_space,
data_layout=idx_field.layout)
function_body = Block(coordinate_symbol_assignments + assignments)
function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
None, function_name, assignments=assignments)
ast.global_variables.update(indexing.index_variables)
coord_mapping = indexing.coordinates
base_pointer_spec = [['spatialInner0']]
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions)
for f in all_fields}
coord_mapping = {f.name: coord_mapping for f in index_fields}
coord_mapping.update({f.name: coordinate_typed_symbols for f in non_index_fields})
resolve_field_accesses(ast, read_only_fields, field_to_fixed_coordinates=coord_mapping,
field_to_base_pointer_info=base_pointer_info)
# attach the indexing object, which determines #blocks and #threads, as an additional member
# of the KernelFunction node; this is used by the jit
ast.indexing = indexing
return ast
import numpy as np
from itertools import product
from pystencils import CreateKernelConfig, create_kernel
from pystencils.gpu import make_python_function
from pystencils import Assignment, Field
from pystencils.enums import Target
from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64):
"""Copies a rectangular part of an array to another non-overlapping part"""
f = Field.create_generic("pdfs", len(domain_size), index_dimensions=index_dimensions, dtype=dtype)
normalized_from_slice = normalize_slice(from_slice, f.spatial_shape)
normalized_to_slice = normalize_slice(to_slice, f.spatial_shape)
offset = [s1.start - s2.start for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
assert offset == [s1.stop - s2.stop for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
update_eqs = []
if index_dimensions < 2:
index_dim_shape = [index_dim_shape]
for i in product(*[range(d) for d in index_dim_shape]):
eq = Assignment(f(*i), f[tuple(offset)](*i))
update_eqs.append(eq)
config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
ast = create_kernel(update_eqs, config=config)
return ast
def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
thickness=None, dtype=np.float64, target=Target.GPU):
assert target in {Target.GPU}
src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
kernels = []
for src_slice, dst_slice in src_dst_slice_tuples:
ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
kernels.append(make_python_function(ast))
def functor(pdfs, **_):
for kernel in kernels:
kernel(pdfs=pdfs)
return functor
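# Usage sketch (hypothetical names): given a stencil and a GPU array `gpu_pdfs` whose
# shape matches domain_size plus ghost layers, one periodicity application would be
#
#     periodicity = get_periodic_boundary_functor(stencil, domain_size=(64, 64))
#     periodicity(pdfs=gpu_pdfs)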
from os.path import dirname, realpath
def get_pystencils_include_path():
return dirname(realpath(__file__))
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#include <array>
#include <map>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#include "myintrin.h"
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
template <typename T, std::size_t Alignment>
class AlignedAllocator
{
public:
typedef T value_type;
template <typename U>
struct rebind {
typedef AlignedAllocator<U, Alignment> other;
};
T * allocate(const std::size_t n) const {
if (n == 0) {
return nullptr;
}
void * const p = _mm_malloc(n*sizeof(T), Alignment);
if (p == nullptr) {
throw std::bad_alloc();
}
return static_cast<T *>(p);
}
void deallocate(T * const p, const std::size_t n) const {
_mm_free(p);
}
};
template <typename Key, typename T>
using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>;
#if defined(__AES__) || defined(_MSC_VER)
QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
__m128i temp3;
temp2 = _mm_shuffle_epi32(temp2 ,0xff);
temp3 = _mm_slli_si128(temp1, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp1 = _mm_xor_si128(temp1, temp2);
return temp1;
}
QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
std::array<__m128i,11> rk;
__m128i tmp;
rk[0] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1);
k = aesni_keygen_assist(k, tmp);
rk[1] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x2);
k = aesni_keygen_assist(k, tmp);
rk[2] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x4);
k = aesni_keygen_assist(k, tmp);
rk[3] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x8);
k = aesni_keygen_assist(k, tmp);
rk[4] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x10);
k = aesni_keygen_assist(k, tmp);
rk[5] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x20);
k = aesni_keygen_assist(k, tmp);
rk[6] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x40);
k = aesni_keygen_assist(k, tmp);
rk[7] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x80);
k = aesni_keygen_assist(k, tmp);
rk[8] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1b);
k = aesni_keygen_assist(k, tmp);
rk[9] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x36);
k = aesni_keygen_assist(k, tmp);
rk[10] = k;
return rk;
}
QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) {
alignas(16) std::array<uint32,4> a;
_mm_store_si128((__m128i*) a.data(), k128);
static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk = aesni_keygen(k128);
roundkeys[a] = rk;
}
return roundkeys[a];
}
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k0) {
auto k = aesni_roundkeys(k0);
__m128i x = _mm_xor_si128(k[0], in);
x = _mm_aesenc_si128(x, k[1]);
x = _mm_aesenc_si128(x, k[2]);
x = _mm_aesenc_si128(x, k[3]);
x = _mm_aesenc_si128(x, k[4]);
x = _mm_aesenc_si128(x, k[5]);
x = _mm_aesenc_si128(x, k[6]);
x = _mm_aesenc_si128(x, k[7]);
x = _mm_aesenc_si128(x, k[8]);
x = _mm_aesenc_si128(x, k[9]);
x = _mm_aesenclast_si128(x, k[10]);
return x;
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
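// Usage sketch (illustrative values, added comment): the functions above realize a
// counter-based RNG -- the same (counter, key) tuple always produces the same output,
// so no generator state has to be carried between calls:
//
//     double r1, r2;
//     aesni_double2(x, y, z, time_step, 0u, 0u, 0u, 42u, r1, r2);
//     // r1 and r2 are uniformly distributed in (0, 1)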
template<bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
}
else
{
x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
}
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
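// Explanatory note (added): the helper above assembles one 53-bit uniform from two
// 32-bit words: z = x ^ (y << 21) fills bits 0..52, and the multiply-add maps the
// integer z to the open interval (0, 1) via z * 2^-53 + 2^-54.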
QUALIFIERS void aesni_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
// convert uint32 to float
rnd1 = _my_cvtepu32_ps(ctr[0]);
rnd2 = _my_cvtepu32_ps(ctr[1]);
rnd3 = _my_cvtepu32_ps(ctr[2]);
rnd4 = _my_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1, __m128d & rnd2)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
__m128d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
}
#endif
#ifdef __AVX2__
QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) {
alignas(32) std::array<uint32,8> a;
_mm256_store_si256((__m256i*) a.data(), k256);
static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm256_extractf128_si256(k256, 0));
auto rk2 = aesni_keygen(_mm256_extractf128_si256(k256, 1));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my256_set_m128i(rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m256i aesni1xm128i(const __m256i & in, const __m256i & k0) {
#if defined(__VAES__)
auto k = aesni_roundkeys(k0);
__m256i x = _mm256_xor_si256(k[0], in);
x = _mm256_aesenc_epi128(x, k[1]);
x = _mm256_aesenc_epi128(x, k[2]);
x = _mm256_aesenc_epi128(x, k[3]);
x = _mm256_aesenc_epi128(x, k[4]);
x = _mm256_aesenc_epi128(x, k[5]);
x = _mm256_aesenc_epi128(x, k[6]);
x = _mm256_aesenc_epi128(x, k[7]);
x = _mm256_aesenc_epi128(x, k[8]);
x = _mm256_aesenc_epi128(x, k[9]);
x = _mm256_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm256_extractf128_si256(in, 0), _mm256_extractf128_si256(k0, 0));
__m128i b = aesni1xm128i(_mm256_extractf128_si256(in, 1), _mm256_extractf128_si256(k0, 1));
__m256i x = _my256_set_m128i(b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
}
else
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm256_xor_si256(x, z);
// convert uint64 to double
__m256d rs = _my256_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void aesni_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
// convert uint32 to float
rnd1 = _my256_cvtepu32_ps(ctr[0]);
rnd2 = _my256_cvtepu32_ps(ctr[1]);
rnd3 = _my256_cvtepu32_ps(ctr[2]);
rnd4 = _my256_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1, __m256d & rnd2)
{
#if 0
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
__m256d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
alignas(64) std::array<uint32,16> a;
_mm512_store_si512((__m512i*) a.data(), k512);
static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0));
auto rk2 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 1));
auto rk3 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 2));
auto rk4 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 3));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my512_set_m128i(rk4[i], rk3[i], rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
#ifdef __VAES__
auto k = aesni_roundkeys(k0);
__m512i x = _mm512_xor_si512(k[0], in);
x = _mm512_aesenc_epi128(x, k[1]);
x = _mm512_aesenc_epi128(x, k[2]);
x = _mm512_aesenc_epi128(x, k[3]);
x = _mm512_aesenc_epi128(x, k[4]);
x = _mm512_aesenc_epi128(x, k[5]);
x = _mm512_aesenc_epi128(x, k[6]);
x = _mm512_aesenc_epi128(x, k[7]);
x = _mm512_aesenc_epi128(x, k[8]);
x = _mm512_aesenc_epi128(x, k[9]);
x = _mm512_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0));
__m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1));
__m128i c = aesni1xm128i(_mm512_extracti32x4_epi32(in, 2), _mm512_extracti32x4_epi32(k0, 2));
__m128i d = aesni1xm128i(_mm512_extracti32x4_epi32(in, 3), _mm512_extracti32x4_epi32(k0, 3));
__m512i x = _my512_set_m128i(d, c, b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
}
else
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm512_xor_si512(x, z);
// convert uint64 to double
__m512d rs = _mm512_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
QUALIFIERS void aesni_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
// convert uint32 to float
rnd1 = _mm512_cvtepu32_ps(ctr[0]);
rnd2 = _mm512_cvtepu32_ps(ctr[1]);
rnd3 = _mm512_cvtepu32_ps(ctr[2]);
rnd4 = _mm512_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
}
QUALIFIERS void aesni_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1, __m512d & rnd2)
{
#if 0
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
__m512d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
/*
Copyright 2021-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_MSC_VER)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#include <arm_sve.h>
typedef svbool_t svbool_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint32_t svint32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#endif
#ifdef __ARM_NEON
inline float32x4_t makeVec_f32(float a, float b, float c, float d)
{
alignas(16) float data[4] = {a, b, c, d};
return vld1q_f32(data);
}
inline float64x2_t makeVec_f64(double a, double b)
{
alignas(16) double data[2] = {a, b};
return vld1q_f64(data);
}
inline int32x4_t makeVec_s32(int a, int b, int c, int d)
{
alignas(16) int data[4] = {a, b, c, d};
return vld1q_s32(data);
}
#endif
inline void cachelineZero(void * p) {
#if !defined(_MSC_VER) || defined(__clang__)
__asm__ volatile("dc zva, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
#if !defined(_MSC_VER) || defined(__clang__)
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & (1 << 4)) != 0) {
return SIZE_MAX;
}
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
#endif
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
/*
Copyright 2023, Markus Holzer.
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#ifdef __HIPCC_RTC__
typedef __hip_uint8_t uint8_t;
typedef __hip_int8_t int8_t;
typedef __hip_uint16_t uint16_t;
typedef __hip_int16_t int16_t;
#endif
/*
Copyright 2023, Markus Holzer.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
/// Only bandwidth-bound code may therefore benefit. None of this is guaranteed, and it may change in the future.
/// Clang version must be 15 or higher for x86 half precision support.
/// GCC version must be 12 or higher for x86 half precision support.
/// Support also seems to require SSE, so ensure that the respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
/*
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
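// Explanatory note (added): without AVX-512 there is no unsigned 32-bit to float
// conversion, so the value is split as v == 2 * (v >> 1) + (v & 1); both parts fit
// into a signed conversion and are recombined as (v2f + v2f) + v1f.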
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
{
__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi32(R0, R1);
T1 = _mm_unpacklo_epi32(R2, R3);
T2 = _mm_unpackhi_epi32(R0, R1);
T3 = _mm_unpackhi_epi32(R2, R3);
R0 = _mm_unpacklo_epi64(T0, T1);
R1 = _mm_unpackhi_epi64(T0, T1);
R2 = _mm_unpacklo_epi64(T2, T3);
R3 = _mm_unpackhi_epi64(T2, T3);
}
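// Explanatory note (added): this is the classic 4x4 transpose of 32-bit lanes. For rows
// R0 = (a0 a1 a2 a3) ... R3 = (d0 d1 d2 d3), the unpacklo/unpackhi pairs build the
// columns R0 = (a0 b0 c0 d0), R1 = (a1 b1 c1 d1), R2 = (a2 b2 c2 d2), R3 = (a3 b3 c3 d3).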
#endif
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
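// Explanatory note (added): the generic fallback is the well-known magic-number
// uint64 -> double conversion: the high 32 bits are embedded into a double with
// exponent offset 2^84, the low 32 bits into one with offset 2^52; subtracting
// (2^84 + 2^52) removes both offsets and the final add reassembles hi * 2^32 + lo.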
#endif
#ifdef __AVX2__
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128i(hi, lo);
#else
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128d(hi, lo);
#else
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu32_ps(v);
#else
__m256i v2 = _mm256_srli_epi32(v, 1);
__m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
__m256 v2f = _mm256_cvtepi32_ps(v2);
__m256 v1f = _mm256_cvtepi32_ps(v1);
return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
#else
__m256i xH = _mm256_srli_epi64(x, 32);
xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
__m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
}
QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
{
return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif