Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing with 4117 additions and 364 deletions
@@ -21,10 +21,13 @@ def diffusion(scalar, diffusion_coeff, idx=None):
Examples:
>>> f = Field.create_generic('f', spatial_dimensions=2)
->>> diffusion_term = diffusion(scalar=f, diffusion_coeff=sp.Symbol("d"))
+>>> d = sp.Symbol("d")
+>>> dx = sp.Symbol("dx")
+>>> diffusion_term = diffusion(scalar=f, diffusion_coeff=d)
>>> discretization = Discretization2ndOrder()
->>> discretization(diffusion_term)
-(f_W*d + f_S*d - 4*f_C*d + f_N*d + f_E*d)/dx**2
+>>> expected_output = ((f[-1, 0] + f[0, -1] - 4 * f[0, 0] + f[0, 1] + f[1, 0]) * d) / dx**2
+>>> sp.simplify(discretization(diffusion_term) - expected_output)
+0
"""
if isinstance(scalar, Field):
first_arg = scalar.center
@@ -76,13 +79,6 @@ class Discretization2ndOrder:
self.dt = dt
self.spatial_stencil = discretization_stencil_func
-@staticmethod
-def _diff_order(e):
-if not isinstance(e, Diff):
-return 0
-else:
-return 1 + Discretization2ndOrder._diff_order(e.args[0])
def _discretize_diffusion(self, e):
result = 0
for c in range(e.dim):
@@ -109,6 +105,7 @@ class Discretization2ndOrder:
return self._discretize_advection(e)
elif isinstance(e, Diff):
arg, *indices = diff_args(e)
if not isinstance(arg, Field.Access):
raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
return self.spatial_stencil(indices, self.dx, arg)
@@ -116,29 +113,6 @@ class Discretization2ndOrder:
new_args = [self._discretize_spatial(a) for a in e.args]
return e.func(*new_args) if new_args else e
-def _discretize_diff(self, e):
-order = self._diff_order(e)
-if order == 1:
-fa = e.args[0]
-index = e.target
-return (fa.neighbor(index, 1) - fa.neighbor(index, -1)) / (2 * self.dx)
-elif order == 2:
-indices = sorted([e.target, e.args[0].target])
-fa = e.args[0].args[0]
-if indices[0] == indices[1] and all(i >= 0 for i in indices):
-result = (-2 * fa + fa.neighbor(indices[0], -1) + fa.neighbor(indices[0], +1))
-elif indices[0] == indices[1]:
-result = 0
-for d in range(fa.field.spatial_dimensions):
-result += (-2 * fa + fa.neighbor(d, -1) + fa.neighbor(d, +1))
-else:
-assert all(i >= 0 for i in indices)
-offsets = [(1, 1), [-1, 1], [1, -1], [-1, -1]]
-result = sum(o1 * o2 * fa.neighbor(indices[0], o1).neighbor(indices[1], o2) for o1, o2 in offsets) / 4
-return result / (self.dx ** 2)
-else:
-raise NotImplementedError("Term contains derivatives of order > 2")
def __call__(self, expr):
if isinstance(expr, list):
return [self(e) for e in expr]
@@ -188,7 +162,7 @@ class Advection(sp.Function):
return self.scalar.spatial_dimensions
def _latex(self, printer):
-name_suffix = "_%s" % self.scalar_index if self.scalar_index is not None else ""
+name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
if isinstance(self.vector, Field):
return r"\nabla \cdot(%s %s)" % (printer.doprint(sp.Symbol(self.vector.name)),
printer.doprint(sp.Symbol(self.scalar.name + name_suffix)))
@@ -235,7 +209,7 @@ class Diffusion(sp.Function):
return self.scalar.spatial_dimensions
def _latex(self, printer):
-name_suffix = "_%s" % self.scalar_index if self.scalar_index is not None else ""
+name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
coeff = self.diffusion_coeff
diff_coeff = sp.Symbol(coeff.name) if isinstance(coeff, Field) else coeff
return r"div(%s \nabla %s)" % (printer.doprint(diff_coeff),
@@ -268,7 +242,7 @@ class Transient(sp.Function):
return None if len(self.args) <= 1 else int(self.args[1])
def _latex(self, printer):
-name_suffix = "_%s" % self.scalar_index if self.scalar_index is not None else ""
+name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
return r"\partial_t %s" % (printer.doprint(sp.Symbol(self.scalar.name + name_suffix)),)
@@ -311,8 +285,9 @@ def discretize_center(term, symbols_to_field_dict, dx, dim=3):
>>> term
x*x^Delta^0
>>> f = Field.create_generic('f', spatial_dimensions=3)
->>> discretize_center(term, { x: f }, dx=1, dim=3)
-f_C*(-f_W/2 + f_E/2)
+>>> expected_output = f[0, 0, 0] * (-f[-1, 0, 0]/2 + f[1, 0, 0]/2)
+>>> sp.simplify(discretize_center(term, { x: f }, dx=1, dim=3) - expected_output)
+0
"""
substitutions = {}
for symbols, field in symbols_to_field_dict.items():
@@ -362,7 +337,7 @@ def discretize_staggered(term, symbols_to_field_dict, coordinate, coordinate_off
offset = [0] * dim
offset[coordinate] = coordinate_offset
-offset = np.array(offset, dtype=np.int)
+offset = np.array(offset, dtype=int)
gradient = grad(symbols)[coordinate]
substitutions.update({s: (field[offset](i) + field(i)) / 2 for i, s in enumerate(symbols)})
@@ -394,8 +369,10 @@ def discretize_divergence(vector_term, symbols_to_field_dict, dx):
>>> x, dx = sp.symbols("x dx")
>>> grad_x = grad(x, dim=3)
>>> f = Field.create_generic('f', spatial_dimensions=3)
->>> sp.simplify(discretize_divergence(grad_x, {x : f}, dx))
-(f_W + f_S + f_B - 6*f_C + f_T + f_N + f_E)/dx**2
+>>> expected_output = (f[-1, 0, 0] + f[0, -1, 0] + f[0, 0, -1] -
+... 6*f[0, 0, 0] + f[0, 0, 1] + f[0, 1, 0] + f[1, 0, 0])/dx**2
+>>> sp.simplify(discretize_divergence(grad_x, {x : f}, dx) - expected_output)
+0
"""
dim = len(vector_term)
result = 0
@@ -408,7 +385,7 @@ def discretize_divergence(vector_term, symbols_to_field_dict, dx):
def __up_down_offsets(d, dim):
coord = [0] * dim
coord[d] = 1
-up = np.array(coord, dtype=np.int)
+up = np.array(coord, dtype=int)
coord[d] = -1
-down = np.array(coord, dtype=np.int)
+down = np.array(coord, dtype=int)
return up, down
import pystencils as ps
import sympy as sp
from pystencils.fd.derivation import FiniteDifferenceStaggeredStencilDerivation as FDS, \
FiniteDifferenceStencilDerivation as FD
import itertools
from collections import defaultdict
from collections.abc import Iterable
def get_access_and_direction(term):
direction1 = term.args[1]
if isinstance(term.args[0], ps.Field.Access): # first derivative
access = term.args[0]
direction = (direction1,)
elif isinstance(term.args[0], ps.fd.Diff): # nested derivative
if isinstance(term.args[0].args[0], ps.fd.Diff): # third or higher derivative
raise ValueError("can only handle first and second derivatives")
elif not isinstance(term.args[0].args[0], ps.Field.Access):
raise ValueError("can only handle derivatives of field accesses")
access, direction2 = term.args[0].args[:2]
direction = (direction1, direction2)
else:
raise NotImplementedError(f"can only deal with derivatives of field accesses, "
f"but not {type(term.args[0])}; expansion of derivatives probably failed")
return access, direction
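As a quick illustration of what get_access_and_direction is expected to return, here is a minimal sketch (not part of this merge request); the field name f and the use of ps.fd.diff to build the Diff terms are assumptions:

# Illustrative sketch only (assumed names); not part of the changes above.
import pystencils as ps

f = ps.fields("f: double[2D]")

# First derivative of a field access: a single Diff node.
access, direction = get_access_and_direction(ps.fd.diff(f.center, 0))
# access is f.center, direction is a 1-tuple with the differentiation index.

# Second (possibly mixed) derivative: one nested Diff node.
access, direction = get_access_and_direction(ps.fd.diff(f.center, 0, 1))
# access is f.center, direction now contains both differentiation indices.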
class FVM1stOrder:
"""Finite-volume discretization
Args:
field: the field with the quantity to calculate, e.g. a concentration
flux: a list of sympy expressions that specify the flux, one for each cartesian direction
source: a list of sympy expressions that specify the source
"""
def __init__(self, field: ps.field.Field, flux=0, source=0):
def normalize(f, shape):
shape = tuple(s for s in shape if s != 1)
if not shape:
shape = None
if isinstance(f, sp.Array) or isinstance(f, Iterable) or isinstance(f, sp.Matrix):
return sp.Array(f, shape)
else:
return sp.Array([f] * (sp.Mul(*shape) if shape else 1))
self.c = field
self.dim = self.c.spatial_dimensions
self.j = normalize(flux, (self.dim, ) + self.c.index_shape)
self.q = normalize(source, self.c.index_shape)
def discrete_flux(self, flux_field: ps.field.Field):
"""Return a list of assignments for the discrete fluxes
Args:
flux_field: a staggered field to which the fluxes should be assigned
"""
assert ps.FieldType.is_staggered(flux_field)
num = 0
def discretize(term, neighbor):
nonlocal num
if isinstance(term, sp.Matrix):
nw = term.applyfunc(lambda t: discretize(t, neighbor))
return nw
elif isinstance(term, ps.field.Field.Access):
avg = (term.get_shifted(*neighbor) + term) * sp.Rational(1, 2)
return avg
elif isinstance(term, ps.fd.Diff):
access, direction = get_access_and_direction(term)
fds = FDS(neighbor, access.field.spatial_dimensions, direction,
free_weights_prefix=f'fvm_free_{num}' if sp.Matrix(neighbor).dot(neighbor) > 2 else None)
num += 1
return fds.apply(access)
if term.args:
new_args = [discretize(a, neighbor) for a in term.args]
return term.func(*new_args)
else:
return term
fluxes = self.j.applyfunc(ps.fd.derivative.expand_diff_full)
fluxes = [sp.Matrix(fluxes.tolist()[i]) if flux_field.index_dimensions > 1 else fluxes.tolist()[i]
for i in range(self.dim)]
A0 = sum([sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm()
for d in flux_field.staggered_stencil]) / self.dim
discrete_fluxes = []
for neighbor in flux_field.staggered_stencil:
neighbor = ps.stencil.direction_string_to_offset(neighbor)
directional_flux = fluxes[0] * int(neighbor[0])
for i in range(1, self.dim):
directional_flux += fluxes[i] * int(neighbor[i])
discrete_flux = sp.simplify(discretize(directional_flux, neighbor))
free_weights = [s for s in discrete_flux.atoms(sp.Symbol) if s.name.startswith('fvm_free_')]
if len(free_weights) > 0:
discrete_flux = discrete_flux.collect(discrete_flux.atoms(ps.field.Field.Access))
access_counts = defaultdict(list)
for values in itertools.product([-1, 0, 1],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
simp = discrete_flux.subs(subs)
access_count = len(simp.atoms(ps.field.Field.Access))
access_counts[access_count].append(simp)
best_count = min(access_counts.keys())
discrete_flux = sum(access_counts[best_count]) / len(access_counts[best_count])
discrete_fluxes.append(discrete_flux / sp.Matrix(neighbor).norm())
if flux_field.index_dimensions > 1:
return [ps.Assignment(lhs, rhs / A0)
for i, d in enumerate(flux_field.staggered_stencil) if discrete_fluxes[i]
for lhs, rhs in zip(flux_field.staggered_vector_access(d), sp.simplify(discrete_fluxes[i]))]
else:
return [ps.Assignment(flux_field.staggered_access(d), sp.simplify(discrete_fluxes[i]) / A0)
for i, d in enumerate(flux_field.staggered_stencil)]
def discrete_source(self):
"""Return a list of assignments for the discrete source term"""
def discretize(term):
if isinstance(term, ps.fd.Diff):
access, direction = get_access_and_direction(term)
if self.dim == 2:
stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ")
if "".join(a).strip()]
else:
stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ", "TB ")
if "".join(a).strip()]
weights = None
for stencil in [["N", "S", "E", "W", "T", "B"][:2 * self.dim], stencil]:
stencil = [tuple(ps.stencil.direction_string_to_offset(d, self.dim)) for d in stencil]
derivation = FD(direction, stencil).get_stencil()
if not derivation.accuracy:
continue
weights = derivation.weights
# if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
if len(free_weights) > 0:
zero_counts = defaultdict(list)
for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
weights = [w.subs(subs) for w in derivation.weights]
if not all(a == 0 for a in weights):
zero_count = sum([1 for w in weights if w == 0])
zero_counts[zero_count].append(weights)
best = zero_counts[max(zero_counts.keys())]
if len(best) > 1:
raise NotImplementedError("more than one suitable set of weights found, "
"don't know how to proceed")
weights = best[0]
break
if not weights:
raise Exception('the requested derivative cannot be performed with the available neighbors')
assert weights
if access._field.index_dimensions == 0:
return sum([access._field.__getitem__(point) * weight for point, weight in zip(stencil, weights)])
else:
total = access.get_shifted(*stencil[0]).at_index(*access.index) * weights[0]
for point, weight in zip(stencil[1:], weights[1:]):
addl = access.get_shifted(*point).at_index(*access.index) * weight
total += addl
return total
if term.args:
new_args = [discretize(a) for a in term.args]
return term.func(*new_args)
else:
return term
source = self.q.applyfunc(ps.fd.derivative.expand_diff_full)
source = source.applyfunc(discretize)
return [ps.Assignment(lhs, rhs) for lhs, rhs in zip(self.c.center_vector, sp.flatten(source)) if rhs]
def discrete_continuity(self, flux_field: ps.field.Field):
"""Return a list of assignments for the continuity equation, which includes the source term
Args:
flux_field: a staggered field from which the fluxes are taken
"""
assert ps.FieldType.is_staggered(flux_field)
neighbors = flux_field.staggered_stencil + [ps.stencil.inverse_direction_string(d)
for d in flux_field.staggered_stencil]
divergence = flux_field.staggered_vector_access(neighbors[0])
for d in neighbors[1:]:
divergence += flux_field.staggered_vector_access(d)
source = self.discrete_source()
source = {s.lhs: s.rhs for s in source}
return [ps.Assignment(lhs, (lhs - rhs + source[lhs]) if lhs in source else (lhs - rhs))
for lhs, rhs in zip(self.c.center_vector, divergence)]
def VOF(j: ps.field.Field, v: ps.field.Field, ρ: ps.field.Field):
"""Volume-of-fluid discretization of advection
Args:
j: the staggered field to write the fluxes to. Should have a D2Q9/D3Q27 stencil. Other stencils work too, but
incur a small error (D2Q5/D3Q7: v^2, D3Q19: v^3).
v: the flow velocity field
ρ: the quantity to advect
"""
assert ps.FieldType.is_staggered(j)
fluxes = [[] for i in range(j.index_shape[0])]
v0 = v.center_vector
for d, neighbor in enumerate(j.staggered_stencil):
c = ps.stencil.direction_string_to_offset(neighbor)
v1 = v.neighbor_vector(c)
# going out
cond = sp.And(*[sp.Or(c[i] * v0[i] > 0, c[i] == 0) for i in range(len(v0))])
overlap1 = [1 - sp.Abs(v0[i]) for i in range(len(v0))]
overlap2 = [c[i] * v0[i] for i in range(len(v0))]
overlap = sp.Mul(*[(overlap1[i] if c[i] == 0 else overlap2[i]) for i in range(len(v0))])
fluxes[d].append(ρ.center_vector * overlap * sp.Piecewise((1, cond), (0, True)))
# coming in
cond = sp.And(*[sp.Or(c[i] * v1[i] < 0, c[i] == 0) for i in range(len(v1))])
overlap1 = [1 - sp.Abs(v1[i]) for i in range(len(v1))]
overlap2 = [v1[i] for i in range(len(v1))]
overlap = sp.Mul(*[(overlap1[i] if c[i] == 0 else overlap2[i]) for i in range(len(v1))])
sign = (c == 1).sum() % 2 * 2 - 1
fluxes[d].append(sign * ρ.neighbor_vector(c) * overlap * sp.Piecewise((1, cond), (0, True)))
for i, ff in enumerate(fluxes):
fluxes[i] = ff[0]
for f in ff[1:]:
fluxes[i] += f
assignments = []
for i, d in enumerate(j.staggered_stencil):
for lhs, rhs in zip(j.staggered_vector_access(d).values(), fluxes[i].values()):
assignments.append(ps.Assignment(lhs, rhs))
return assignments
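For orientation, a hedged usage sketch of the new module (not part of the diff): the field names c, v, j, the symbol D and the concrete flux expression are assumptions, and the calls are only meant to show how discrete_flux, discrete_continuity and VOF, as defined above, fit together.

# Illustrative sketch only (assumed field names and flux); not part of the changes above.
import sympy as sp
import pystencils as ps

c = ps.fields("c: double[2D]")     # quantity to transport
v = ps.fields("v(2): double[2D]")  # velocity
# D2Q9 staggered flux field (4 staggered points per cell in 2D)
j = ps.fields("j(4): double[2D]", field_type=ps.FieldType.STAGGERED_FLUX)

D = sp.Symbol("D")
# diffusive flux -D * grad(c), written as symbolic derivatives of the field access
flux = sp.Matrix([-D * ps.fd.diff(c.center, i) for i in range(c.spatial_dimensions)])

fvm = FVM1stOrder(c, flux=flux)
flux_assignments = fvm.discrete_flux(j)          # fluxes written to the staggered field
update_assignments = fvm.discrete_continuity(j)  # divergence of the fluxes (+ source, if any)
advection_assignments = VOF(j, v, c)             # volume-of-fluid advective fluxes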
+from functools import lru_cache
from typing import Tuple
import sympy as sp
from pystencils.astnodes import LoopOverCoordinate
-from pystencils.cache import memorycache
from pystencils.fd import Diff
from pystencils.field import Field
from pystencils.transformations import generic_visit
@@ -72,43 +72,12 @@ def fd_stencils_forth_order_isotropic(indices, dx, fa):
return stencils[dim].apply(fa) / dx
-def fd_stencils_isotropic_high_density_code(indices, dx, fa):
-dim = fa.field.spatial_dimensions
-if dim == 1:
-return fd_stencils_standard(indices, dx, fa)
-elif dim == 2:
-order = len(indices)
-if order == 1:
-idx = indices[0]
-assert 0 <= idx < 2
-other_idx = 1 if indices[0] == 0 else 0
-weights = {-1: sp.Rational(1, 12) / dx,
-0: sp.Rational(1, 3) / dx,
-1: sp.Rational(1, 12) / dx}
-upper_terms = sum(fa.neighbor(idx, +1).neighbor(other_idx, off) * w for off, w in weights.items())
-lower_terms = sum(fa.neighbor(idx, -1).neighbor(other_idx, off) * w for off, w in weights.items())
-return upper_terms - lower_terms
-elif order == 2:
-if indices[0] == indices[1]:
-idx = indices[0]
-diagonals = sp.Rational(1, 8) * sum(fa.neighbor(0, i).neighbor(1, j) for i in (-1, 1) for j in (-1, 1))
-div_direction = sp.Rational(1, 2) * sum(fa.neighbor(idx, i) for i in (-1, 1))
-center = - sp.Rational(3, 2) * fa
-return (diagonals + div_direction + center) / (dx ** 2)
-else:
-return fd_stencils_standard(indices, dx, fa)
-raise NotImplementedError("Supports only derivatives up to order 2 for 1D and 2D setups")
def discretize_spatial(expr, dx, stencil=fd_stencils_standard):
if isinstance(stencil, str):
if stencil == 'standard':
stencil = fd_stencils_standard
elif stencil == 'isotropic':
stencil = fd_stencils_isotropic
-elif stencil == 'isotropic_hd':
-stencil = fd_stencils_isotropic_high_density_code
else:
raise ValueError("Unknown stencil. Supported 'standard' and 'isotropic'")
@@ -167,9 +136,7 @@ def discretize_spatial_staggered(expr, dx, stencil=fd_stencils_standard):
# -------------------------------------- special stencils --------------------------------------------------------------
-@memorycache(maxsize=1)
+@lru_cache(maxsize=1)
def forth_order_2d_derivation() -> Tuple[FiniteDifferenceStencilDerivation.Result, ...]:
# Symmetry, isotropy and 4th order conditions are not enough to fully specify the stencil
# one weight has to be specifically set to a somewhat arbitrary value
...
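Since the 'isotropic_hd' option is dropped here, the remaining stencil selectors are 'standard' and 'isotropic'. A small hedged sketch of how the selector might be used (the field name f and the Laplacian term are assumptions):

# Illustrative sketch only (assumed names); not part of the changes above.
import sympy as sp
import pystencils as ps

f = ps.fields("f: double[2D]")
dx = sp.Symbol("dx")
laplacian = ps.fd.diff(f.center, 0, 0) + ps.fd.diff(f.center, 1, 1)

standard_form = ps.fd.discretize_spatial(laplacian, dx, stencil='standard')
isotropic_form = ps.fd.discretize_spatial(laplacian, dx, stencil='isotropic')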
@@ -5,7 +5,7 @@ import pickle
import re
from enum import Enum
from itertools import chain
-from typing import List, Optional, Sequence, Set, Tuple
+from typing import List, Optional, Sequence, Set, Tuple, Union
import numpy as np
import sympy as sp
@@ -13,78 +13,13 @@ from sympy.core.cache import cacheit
import pystencils
from pystencils.alignedarray import aligned_empty
-from pystencils.data_types import StructType, TypedSymbol, create_type
+from pystencils.typing import StructType, TypedSymbol, BasicType, create_type
-from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol
+from pystencils.typing.typed_sympy import FieldShapeSymbol, FieldStrideSymbol
-from pystencils.stencil import direction_string_to_offset, offset_to_direction_string
+from pystencils.stencil import (
+direction_string_to_offset, inverse_direction, offset_to_direction_string)
from pystencils.sympyextensions import is_integer_sequence
-__all__ = ['Field', 'fields', 'FieldType', 'AbstractField']
+__all__ = ['Field', 'fields', 'FieldType', 'Field']
def fields(description=None, index_dimensions=0, layout=None, **kwargs):
"""Creates pystencils fields from a string description.
Examples:
Create a 2D scalar and vector field:
>>> s, v = fields("s, v(2): double[2D]")
>>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
>>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
Create an integer field of shape (10, 20):
>>> f = fields("f : int32[10, 20]")
>>> f.has_fixed_shape, f.shape
(True, (10, 20))
Numpy arrays can be used as template for shape and data type of field:
>>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
>>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
>>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
>>> assert v.index_shape == (2,)
Format string can be left out, field names are taken from keyword arguments.
>>> fields(f1=arr_s, f2=arr_s)
[f1, f2]
The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
>>> f = fields(f=arr_v, index_dimensions=1)
>>> assert f.index_dimensions == 1
>>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
>>> f.layout
(2, 1, 0)
"""
result = []
if description:
field_descriptions, dtype, shape = _parse_description(description)
layout = 'numpy' if layout is None else layout
for field_name, idx_shape in field_descriptions:
if field_name in kwargs:
arr = kwargs[field_name]
idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
assert idx_shape_of_arr == idx_shape
f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape))
elif isinstance(shape, tuple):
f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
index_dimensions=len(idx_shape), layout=layout)
elif isinstance(shape, int):
f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
index_shape=idx_shape, layout=layout)
elif shape is None:
f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
index_shape=idx_shape, layout=layout)
else:
assert False
result.append(f)
else:
assert layout is None, "Layout can not be specified when creating Field from numpy array"
for field_name, arr in kwargs.items():
result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions))
if len(result) == 0:
return None
elif len(result) == 1:
return result[0]
else:
return result
class FieldType(Enum):
@@ -98,6 +33,10 @@ class FieldType(Enum):
# unsafe fields may be accessed in an absolute fashion - the index depends on the data
# and thus may lead to out-of-bounds accesses
CUSTOM = 3
+# staggered field
+STAGGERED = 4
+# staggered field that reverses sign when accessed via opposite direction
+STAGGERED_FLUX = 5
@staticmethod
def is_generic(field):
@@ -119,14 +58,18 @@ class FieldType(Enum):
assert isinstance(field, Field)
return field.field_type == FieldType.CUSTOM
+@staticmethod
+def is_staggered(field):
+assert isinstance(field, Field)
+return field.field_type == FieldType.STAGGERED or field.field_type == FieldType.STAGGERED_FLUX
+@staticmethod
+def is_staggered_flux(field):
+assert isinstance(field, Field)
+return field.field_type == FieldType.STAGGERED_FLUX
-class AbstractField:
-class AbstractAccess:
-pass
-class Field(AbstractField):
+class Field:
"""
With fields one can formulate stencil-like update rules on structured grids.
This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.
@@ -158,6 +101,14 @@ class Field(AbstractField):
First specify the spatial offsets in [], then in case index_dimension>0 the indices in ()
e.g. ``f[-1,0,0](7)``
+Staggered Fields:
+Staggered fields are used to store a value on a second grid shifted by half a cell with respect to the usual
+grid.
+The first index dimension is used to specify the position on the staggered grid (e.g. 0 means half-way to the
+eastern neighbor, 1 is half-way to the northern neighbor, etc.), while additional indices can be used to store
+multiple values at each position.
Example using no index dimensions:
>>> a = np.zeros([10, 10])
>>> f = Field.create_from_numpy_array("f", a, index_dimensions=0)
@@ -187,8 +138,9 @@ class Field(AbstractField):
index_shape: optional shape of the index dimensions i.e. maximum values allowed for each index dimension,
has to be a list or tuple
field_type: besides the normal GENERIC fields, there are INDEXED fields that store indices of the domain
-that should be iterated over, and BUFFER fields that are used to generate
-communication packing/unpacking kernels
+that should be iterated over, BUFFER fields that are used to generate communication
+packing/unpacking kernels, and STAGGERED fields, which store values half-way to the next
+cell
"""
if index_shape is not None:
assert index_dimensions == 0 or index_dimensions == len(index_shape)
@@ -210,11 +162,14 @@ class Field(AbstractField):
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
+if field_type == FieldType.STAGGERED and index_dimensions == 0:
+raise ValueError("A staggered field needs at least one index dimension")
return Field(field_name, field_type, dtype, layout, shape, strides)
@staticmethod
-def create_from_numpy_array(field_name: str, array: np.ndarray, index_dimensions: int = 0) -> 'Field':
+def create_from_numpy_array(field_name: str, array: np.ndarray, index_dimensions: int = 0,
+field_type=FieldType.GENERIC) -> 'Field':
"""Creates a field based on the layout, data type, and shape of a given numpy array.
Kernels created for these kind of fields can only be called with arrays of the same layout, shape and type.
@@ -223,6 +178,7 @@ class Field(AbstractField):
field_name: symbolic name for the field
array: numpy array
index_dimensions: see documentation of Field
+field_type: kind of field
"""
spatial_dimensions = len(array.shape) - index_dimensions
if spatial_dimensions < 1:
@@ -241,12 +197,15 @@ class Field(AbstractField):
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
+if field_type == FieldType.STAGGERED and index_dimensions == 0:
+raise ValueError("A staggered field needs at least one index dimension")
-return Field(field_name, FieldType.GENERIC, array.dtype, spatial_layout, shape, strides)
+return Field(field_name, field_type, array.dtype, spatial_layout, shape, strides)
@staticmethod
def create_fixed_size(field_name: str, shape: Tuple[int, ...], index_dimensions: int = 0,
-dtype=np.float64, layout: str = 'numpy', strides: Optional[Sequence[int]] = None) -> 'Field':
+dtype=np.float64, layout: str = 'numpy', strides: Optional[Sequence[int]] = None,
+field_type=FieldType.GENERIC) -> 'Field':
"""
Creates a field with fixed sizes i.e. can be called only with arrays of the same size and layout
@@ -257,6 +216,7 @@ class Field(AbstractField):
dtype: numpy data type of the array the kernel is called with later
layout: full layout of array, not only spatial dimensions
strides: strides in bytes or None to automatically compute them from shape (assuming no padding)
+field_type: kind of field
"""
spatial_dimensions = len(shape) - index_dimensions
assert spatial_dimensions >= 1
@@ -277,11 +237,13 @@ class Field(AbstractField):
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
+if field_type == FieldType.STAGGERED and index_dimensions == 0:
+raise ValueError("A staggered field needs at least one index dimension")
spatial_layout = list(layout)
for i in range(spatial_dimensions, len(layout)):
spatial_layout.remove(i)
-return Field(field_name, FieldType.GENERIC, dtype, tuple(spatial_layout), shape, strides)
+return Field(field_name, field_type, dtype, tuple(spatial_layout), shape, strides)
def __init__(self, field_name, field_type, dtype, layout, shape, strides):
"""Do not use directly. Use static create* methods"""
@@ -293,18 +255,17 @@ class Field(AbstractField):
self._layout = normalize_layout(layout)
self.shape = shape
self.strides = strides
-self.latex_name = None  # type: Optional[str]
+self.latex_name: Optional[str] = None
-self.coordinate_origin = sp.Matrix(tuple(
-0 for _ in range(self.spatial_dimensions)
-))  # type: tuple[float,sp.Symbol]
+self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
self.coordinate_transform = sp.eye(self.spatial_dimensions)
+if field_type == FieldType.STAGGERED:
+assert self.staggered_stencil
def new_field_with_different_name(self, new_name):
if self.has_fixed_shape:
return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
else:
-return Field.create_generic(new_name, self.spatial_dimensions, self.dtype.numpy_dtype,
-self.index_dimensions, self._layout, self.index_shape, self.field_type)
+return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)
@property
def spatial_dimensions(self) -> int:
@@ -357,8 +318,24 @@ class Field(AbstractField):
def dtype(self):
return self._dtype
+@property
+def itemsize(self):
+return self.dtype.numpy_dtype.itemsize
def __repr__(self):
-return self._field_name
+if any(isinstance(s, sp.Symbol) for s in self.spatial_shape):
+spatial_shape_str = f'{self.spatial_dimensions}d'
+else:
+spatial_shape_str = ','.join(str(i) for i in self.spatial_shape)
+index_shape_str = ','.join(str(i) for i in self.index_shape)
+if self.index_shape:
+return f'{self._field_name}({index_shape_str}): {self.dtype}[{spatial_shape_str}]'
+else:
+return f'{self._field_name}: {self.dtype}[{spatial_shape_str}]'
+def __str__(self):
+return self.name
def neighbor(self, coord_id, offset):
offset_list = [0] * self.spatial_dimensions
@@ -373,19 +350,37 @@ class Field(AbstractField):
index_shape = self.index_shape
if len(index_shape) == 0:
return sp.Matrix([self.center])
-if len(index_shape) == 1:
+elif len(index_shape) == 1:
return sp.Matrix([self(i) for i in range(index_shape[0])])
elif len(index_shape) == 2:
-def cb(*args):
-r = self.__call__(*args)
-return r
-return sp.Matrix(*index_shape, cb)
+return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
+elif len(index_shape) == 3:
+return sp.Array([[[self(i, j, k) for k in range(index_shape[2])]
+for j in range(index_shape[1])] for i in range(index_shape[0])])
+else:
+raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")
@property
def center(self):
center = tuple([0] * self.spatial_dimensions)
return Field.Access(self, center)
def neighbor_vector(self, offset):
"""Like neighbor, but returns the entire vector/tensor stored at offset."""
if self.spatial_dimensions == 2 and len(offset) == 3:
assert offset[2] == 0
offset = offset[:2]
if self.index_dimensions == 0:
return sp.Matrix([self.__getitem__(offset)])
elif self.index_dimensions == 1:
return sp.Matrix([self.__getitem__(offset)(i) for i in range(self.index_shape[0])])
elif self.index_dimensions == 2:
return sp.Matrix([[self.__getitem__(offset)(i, k) for k in range(self.index_shape[1])]
for i in range(self.index_shape[0])])
else:
raise NotImplementedError("neighbor_vector is not implemented for more than 2 index dimensions")
def __getitem__(self, offset):
if type(offset) is np.ndarray:
offset = tuple(offset)
@@ -394,42 +389,115 @@ class Field(AbstractField):
if type(offset) is not tuple:
offset = (offset,)
if len(offset) != self.spatial_dimensions:
-raise ValueError("Wrong number of spatial indices: "
-"Got %d, expected %d" % (len(offset), self.spatial_dimensions))
+raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
return Field.Access(self, offset)
def absolute_access(self, offset, index):
assert FieldType.is_custom(self)
return Field.Access(self, offset, index, is_absolute_access=True)
-def interpolated_access(self,
-offset: Tuple,
-interpolation_mode='linear',
-address_mode='BORDER',
-allow_textures=True):
-"""Provides access to field values at non-integer positions
-``interpolated_access`` is similar to :func:`Field.absolute_access` except that
-it allows non-integer offsets and automatic handling of out-of-bound accesses.
def staggered_access(self, offset, index=None):
"""If this field is a staggered field, it can be accessed using half-integer offsets.
For example, an offset of ``(0, sp.Rational(1,2))`` or ``"E"`` corresponds to the staggered point to the east
of the cell center, i.e. half-way to the eastern-next cell.
If the field stores more than one value per staggered point (e.g. a vector or a tensor), the index (integer or
tuple of integers) refers to which of these values to access.
"""
assert FieldType.is_staggered(self)
offset_orig = offset
if type(offset) is np.ndarray:
offset = tuple(offset)
if type(offset) is str:
offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
offset = tuple([o * sp.Rational(1, 2) for o in offset])
if len(offset) != self.spatial_dimensions:
raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
prefactor = 1
neighbor_vec = [0] * len(offset)
for i in range(self.spatial_dimensions):
if (offset[i] + sp.Rational(1, 2)).is_Integer:
neighbor_vec[i] = sp.sign(offset[i])
neighbor = offset_to_direction_string(neighbor_vec)
if neighbor not in self.staggered_stencil:
neighbor_vec = inverse_direction(neighbor_vec)
neighbor = offset_to_direction_string(neighbor_vec)
if FieldType.is_staggered_flux(self):
prefactor = -1
if neighbor not in self.staggered_stencil:
raise ValueError(f"{offset_orig} is not a valid neighbor for the {self.staggered_stencil_name} stencil")
offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))
idx = self.staggered_stencil.index(neighbor)
if self.index_dimensions == 1: # this field stores a scalar value at each staggered position
if index is not None:
raise ValueError("Cannot specify an index for a scalar staggered field")
return prefactor * Field.Access(self, offset, (idx,))
else: # this field stores a vector or tensor at each staggered position
if index is None:
raise ValueError(f"Wrong number of indices: Got 0, expected {self.index_dimensions - 1}")
if type(index) is np.ndarray:
index = tuple(index)
if type(index) is not tuple:
index = (index,)
if self.index_dimensions != len(index) + 1:
raise ValueError(f"Wrong number of indices: Got {len(index)}, expected {self.index_dimensions - 1}")
return prefactor * Field.Access(self, offset, (idx, *index))
def staggered_vector_access(self, offset):
"""Like staggered_access, but returns the entire vector/tensor stored at offset."""
assert FieldType.is_staggered(self)
if self.index_dimensions == 1:
return sp.Matrix([self.staggered_access(offset)])
elif self.index_dimensions == 2:
return sp.Matrix([self.staggered_access(offset, i) for i in range(self.index_shape[1])])
elif self.index_dimensions == 3:
return sp.Matrix([[self.staggered_access(offset, (i, k)) for k in range(self.index_shape[2])]
for i in range(self.index_shape[1])])
else:
raise NotImplementedError("staggered_vector_access is not implemented for more than 3 index dimensions")
-:param offset: Tuple of spatial coordinates (can be floats)
-:param interpolation_mode: One of :class:`pystencils.interpolation_astnodes.InterpolationMode`
-:param address_mode: How boundaries are handled can be 'border', 'wrap', 'mirror', 'clamp'
-:param allow_textures: Allow implementation by texture accesses on GPUs
-"""
-from pystencils.interpolation_astnodes import Interpolator
-return Interpolator(self,
-interpolation_mode,
-address_mode,
-allow_textures=allow_textures).at(offset)
@property
def staggered_stencil(self):
assert FieldType.is_staggered(self)
stencils = {
2: {
2: ["W", "S"],  # D2Q5
4: ["W", "S", "SW", "NW"]  # D2Q9
},
3: {
3: ["W", "S", "B"],  # D3Q7
7: ["W", "S", "B", "BSW", "TSW", "BNW", "TNW"], # D3Q15
9: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS"], # D3Q19
13: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS", "BSW", "TSW", "BNW", "TNW"] # D3Q27
}
}
if not self.index_shape[0] in stencils[self.spatial_dimensions]:
raise ValueError(f"No known stencil has {self.index_shape[0]} staggered points")
return stencils[self.spatial_dimensions][self.index_shape[0]]
@property
def staggered_stencil_name(self):
assert FieldType.is_staggered(self)
return f"D{self.spatial_dimensions}Q{self.index_shape[0] * 2 + 1}"
def __call__(self, *args, **kwargs):
center = tuple([0] * self.spatial_dimensions)
return Field.Access(self, center)(*args, **kwargs)
def hashable_contents(self):
-dth = hash(self._dtype)
-return self._layout, self.shape, self.strides, dth, self.field_type, self._field_name, self.latex_name
+return (self._layout,
+self.shape,
+self.strides,
+self.field_type,
+self._field_name,
+self.latex_name,
+self._dtype)
def __hash__(self):
return hash(self.hashable_contents())
@@ -441,34 +509,46 @@ class Field(AbstractField):
@property
def physical_coordinates(self):
-return self.coordinate_transform @ (self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
+if hasattr(self.coordinate_transform, '__call__'):
+return self.coordinate_transform(self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
+else:
+return self.coordinate_transform @ (self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
@property
def physical_coordinates_staggered(self):
return self.coordinate_transform @ \
(self.coordinate_origin + pystencils.x_staggered_vector(self.spatial_dimensions))
-def index_to_physical(self, index_coordinates, staggered=False):
+def index_to_physical(self, index_coordinates: sp.Matrix, staggered=False):
if staggered:
-index_coordinates = sp.Matrix([i + 0.5 for i in index_coordinates])
+index_coordinates = sp.Matrix([0.5] * len(self.coordinate_origin)) + index_coordinates
-return self.coordinate_transform @ (self.coordinate_origin + index_coordinates)
+if hasattr(self.coordinate_transform, '__call__'):
+return self.coordinate_transform(self.coordinate_origin + index_coordinates)
+else:
+return self.coordinate_transform @ (self.coordinate_origin + index_coordinates)
+def physical_to_index(self, physical_coordinates: sp.Matrix, staggered=False):
+if hasattr(self.coordinate_transform, '__call__'):
+if hasattr(self.coordinate_transform, 'inv'):
+return self.coordinate_transform.inv()(physical_coordinates) - self.coordinate_origin
+else:
+idx = sp.Matrix(sp.symbols(f'index_coordinates:{self.ndim}', real=True))
+rtn = sp.solve(self.index_to_physical(idx) - physical_coordinates, idx)
+assert rtn, f'Could not find inverese of coordinate_transform: {self.index_to_physical(idx)}'
+return rtn
-def physical_to_index(self, physical_coordinates, staggered=False):
-rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin
-if staggered:
-rtn = sp.Matrix([i - 0.5 for i in rtn])
-return rtn
+else:
+rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin
+if staggered:
+rtn = sp.Matrix([i - 0.5 for i in rtn])
+return rtn
-def index_to_staggered_physical_coordinates(self, symbol_vector):
-symbol_vector += sp.Matrix([0.5] * self.spatial_dimensions)
-return self.create_physical_coordinates(symbol_vector)
def set_coordinate_origin_to_field_center(self):
self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape])
# noinspection PyAttributeOutsideInit,PyUnresolvedReferences
-class Access(TypedSymbol, AbstractField.AbstractAccess):
+class Access(TypedSymbol):
"""Class representing a relative access into a `Field`.
This class behaves like a normal sympy Symbol, it is actually derived from it. One can built up
@@ -487,6 +567,7 @@ class Field(AbstractField):
>>> central_y_component.at_index(0)  # change component
v_C^0
"""
+_iterable = False  # see https://i10git.cs.fau.de/pycodegen/pystencils/-/merge_requests/166#note_10680
def __new__(cls, name, *args, **kwargs):
obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs)
@@ -517,11 +598,15 @@ class Field(AbstractField):
offset_name = hashlib.md5(pickle.dumps(offsets_and_index)).hexdigest()[:12]
superscript = None
-symbol_name = "%s_%s" % (field_name, offset_name)
+symbol_name = f"{field_name}_{offset_name}"
if superscript is not None:
symbol_name += "^" + superscript
-obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
+if dtype:
+obj = super(Field.Access, self).__xnew__(self, symbol_name, dtype)
+else:
+obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
obj._field = field
obj._offsets = []
for o in offsets:
@@ -529,7 +614,7 @@ class Field(AbstractField):
obj._offsets.append(o)
else:
obj._offsets.append(int(o))
-obj._offsets = tuple(obj._offsets)
+obj._offsets = tuple(sp.sympify(obj._offsets))
obj._offsetName = offset_name
obj._superscript = superscript
obj._index = idx
@@ -545,6 +630,9 @@ class Field(AbstractField):
def __getnewargs__(self):
return self.field, self.offsets, self.index, self.is_absolute_access, self.dtype
+def __getnewargs_ex__(self):
+return (self.field, self.offsets, self.index, self.is_absolute_access, self.dtype), {}
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
@@ -560,18 +648,18 @@ class Field(AbstractField):
idx = ()
if len(idx) != self.field.index_dimensions:
-raise ValueError("Wrong number of indices: "
-"Got %d, expected %d" % (len(idx), self.field.index_dimensions))
-return Field.Access(self.field, self._offsets, idx, dtype=self.dtype)
+raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
+if len(idx) == 1 and isinstance(idx[0], str):
+dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
+return Field.Access(self.field, self._offsets, idx,
+is_absolute_access=self.is_absolute_access, dtype=dtype)
+else:
+return Field.Access(self.field, self._offsets, idx,
+is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def __getitem__(self, *idx):
return self.__call__(*idx)
-def __iter__(self):
-"""This is necessary to work with parts of sympy that test if an object is iterable (e.g. simplify).
-The __getitem__ would make it iterable"""
-raise TypeError("Field access is not iterable")
@property
def field(self) -> 'Field':
"""Field that the Access points to"""
@@ -621,7 +709,8 @@ class Field(AbstractField):
"""
offset_list = list(self.offsets)
offset_list[coord_id] += offset
-return Field.Access(self.field, tuple(offset_list), self.index, dtype=self.dtype)
+return Field.Access(self.field, tuple(offset_list), self.index,
+is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def get_shifted(self, *shift) -> 'Field.Access':
"""Returns a new Access with changed spatial coordinates
@@ -634,6 +723,7 @@ class Field(AbstractField):
return Field.Access(self.field,
tuple(a + b for a, b in zip(shift, self.offsets)),
self.index,
+is_absolute_access=self.is_absolute_access,
dtype=self.dtype)
def at_index(self, *idx_tuple) -> 'Field.Access':
@@ -644,7 +734,15 @@ class Field(AbstractField):
>>> f(0).at_index(8)
f_C^8
"""
-return Field.Access(self.field, self.offsets, idx_tuple, dtype=self.dtype)
+return Field.Access(self.field, self.offsets, idx_tuple,
+is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def _eval_subs(self, old, new):
return Field.Access(self.field,
tuple(sp.sympify(a).subs(old, new) for a in self.offsets),
tuple(sp.sympify(a).subs(old, new) for a in self.index),
is_absolute_access=self.is_absolute_access,
dtype=self.dtype)
@property
def is_absolute_access(self) -> bool:
@@ -661,30 +759,125 @@ class Field(AbstractField):
def _hashable_content(self):
super_class_contents = super(Field.Access, self)._hashable_content()
-return (super_class_contents, self._field.hashable_contents(), *self._index, *self._offsets)
+return (super_class_contents, self._field.hashable_contents(), *self._index,
+*self._offsets, self._is_absolute_access)
def _staggered_offset(self, offsets, index):
assert FieldType.is_staggered(self._field)
neighbor = self._field.staggered_stencil[index]
neighbor = direction_string_to_offset(neighbor, self._field.spatial_dimensions)
return [(o + sp.Rational(int(neighbor[i]), 2)) for i, o in enumerate(offsets)]
def _latex(self, _):
n = self._field.latex_name if self._field.latex_name else self._field.name
offset_str = ",".join([sp.latex(o) for o in self.offsets])
+if FieldType.is_staggered(self._field):
+offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
+for i in range(len(self.offsets))])
if self.is_absolute_access:
-offset_str = "\\mathbf{}".format(offset_str)
+offset_str = f"\\mathbf{offset_str}"
elif self.field.spatial_dimensions > 1:
-offset_str = "({})".format(offset_str)
+offset_str = f"({offset_str})"
-if self.index and self.index != (0,):
-return "{{%s}_{%s}^{%s}}" % (n, offset_str, self.index if len(self.index) > 1 else self.index[0])
-else:
-return "{{%s}_{%s}}" % (n, offset_str)
+if FieldType.is_staggered(self._field):
+if self.index and self.field.index_dimensions > 1:
+return f"{{{n}}}_{{{offset_str}}}^{{{self.index[1:] if len(self.index) > 2 else self.index[1]}}}"
+else:
+return f"{{{n}}}_{{{offset_str}}}"
+else:
+if self.index and self.field.index_dimensions > 0:
+return f"{{{n}}}_{{{offset_str}}}^{{{self.index if len(self.index) > 1 else self.index[0]}}}"
+else:
+return f"{{{n}}}_{{{offset_str}}}"
def __str__(self):
n = self._field.latex_name if self._field.latex_name else self._field.name
offset_str = ",".join([sp.latex(o) for o in self.offsets])
if FieldType.is_staggered(self._field):
offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
for i in range(len(self.offsets))])
if self.is_absolute_access:
-offset_str = "[abs]{}".format(offset_str)
+offset_str = f"[abs]{offset_str}"
-if self.index and self.index != (0,):
-return "%s[%s](%s)" % (n, offset_str, self.index if len(self.index) > 1 else self.index[0])
-else:
-return "%s[%s]" % (n, offset_str)
+if FieldType.is_staggered(self._field):
+if self.index and self.field.index_dimensions > 1:
+return f"{n}[{offset_str}]({self.index[1:] if len(self.index) > 2 else self.index[1]})"
+else:
+return f"{n}[{offset_str}]"
+else:
+if self.index and self.field.index_dimensions > 0:
+return f"{n}[{offset_str}]({self.index if len(self.index) > 1 else self.index[0]})"
+else:
+return f"{n}[{offset_str}]"
def fields(description=None, index_dimensions=0, layout=None,
field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
"""Creates pystencils fields from a string description.
Examples:
Create a 2D scalar and vector field:
>>> s, v = fields("s, v(2): double[2D]")
>>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
>>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
Create an integer field of shape (10, 20):
>>> f = fields("f : int32[10, 20]")
>>> f.has_fixed_shape, f.shape
(True, (10, 20))
Numpy arrays can be used as template for shape and data type of field:
>>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
>>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
>>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
>>> assert v.index_shape == (2,)
        The format string can be left out; the field names are then taken from the keyword arguments.
>>> fields(f1=arr_s, f2=arr_s)
[f1: double[20,20], f2: double[20,20]]
        The keyword names ``index_dimensions`` and ``layout`` have a special meaning, so don't use them as field names.
>>> f = fields(f=arr_v, index_dimensions=1)
>>> assert f.index_dimensions == 1
>>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
>>> f.layout
(2, 1, 0)
"""
result = []
if description:
field_descriptions, dtype, shape = _parse_description(description)
layout = 'numpy' if layout is None else layout
for field_name, idx_shape in field_descriptions:
if field_name in kwargs:
arr = kwargs[field_name]
idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
assert idx_shape_of_arr == idx_shape
f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
field_type=field_type)
elif isinstance(shape, tuple):
f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
elif isinstance(shape, int):
f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
index_shape=idx_shape, layout=layout, field_type=field_type)
elif shape is None:
f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
index_shape=idx_shape, layout=layout, field_type=field_type)
else:
assert False
result.append(f)
else:
assert layout is None, "Layout can not be specified when creating Field from numpy array"
for field_name, arr in kwargs.items():
result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
field_type=field_type))
if len(result) == 0:
raise ValueError("Could not parse field description")
elif len(result) == 1:
return result[0]
else:
return result
def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
@@ -747,8 +940,6 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
    if not alignment:
        res = np.empty(shape, order='c', **kwargs)
    else:
        if alignment is True:
            alignment = 8 * 4
        res = aligned_empty(shape, alignment, byte_offset=byte_offset, **kwargs)

    for a, b in reversed(swaps):
@@ -757,24 +948,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
    if dim <= 0:
        raise ValueError("Dimensionality must be positive")

    layout_str = layout_str.lower()

    if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
        if dim > 3:
            raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
        return tuple(reversed(range(dim)))

    if layout_str in ('f', 'reverse_numpy'):
        return tuple(reversed(range(dim)))
    elif layout_str in ('c', 'numpy'):
        return tuple(range(dim))

    raise ValueError("Unknown layout descriptor " + layout_str)
def layout_string_to_tuple(layout_str, dim):
    if dim <= 0:
        raise ValueError("Dimensionality must be positive")

    layout_str = layout_str.lower()
    if layout_str == 'fzyx' or layout_str == 'soa':
        if dim > 4:
            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim)))
    elif layout_str == 'zyxf' or layout_str == 'aos':
        if dim > 4:
            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim - 1))) + (dim - 1,)
    elif layout_str == 'f' or layout_str == 'reverse_numpy':
        return tuple(reversed(range(dim)))
@@ -837,16 +1039,17 @@ type_description_regex = re.compile(r"""
""", re.VERBOSE | re.IGNORECASE)
def _parse_part1(d):
    result = field_description_regex.match(d)
    while result:
        name, index_str = result.group(1), result.group(2)
        index = tuple(int(e) for e in index_str.split(",")) if index_str else ()
        yield name, index
        d = d[result.end():]
        result = field_description_regex.match(d)
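# Hedged sketch of the first parsing stage: it yields (name, index_shape) pairs from
# the comma-separated field list (exact behaviour assumed from the
# field_description_regex defined above):
#
#     >>> list(_parse_part1("s, v(2), T(3,3)"))
#     [('s', ()), ('v', (2,)), ('T', (3, 3))]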
def _parse_description(description):
    def parse_part2(d):
        result = type_description_regex.match(d)
        if result:
@@ -870,7 +1073,7 @@ def _parse_description(description):
    else:
        field_description, field_info = description, 'float64[2D]'

    fields_info = [e for e in _parse_part1(field_description)]
    if not field_info:
        raise ValueError("Could not parse field description")
......
import sympy as sp
from pystencils.typing import PointerType
class DivFunc(sp.Function):
"""
    DivFunc represents an explicit division operation, which is needed because sympy normally
    rewrites a division as a multiplication with a power of -1.
"""
is_Atom = True
is_real = True
def __new__(cls, *args, **kwargs):
if len(args) != 2:
raise ValueError(f'{cls} takes only 2 arguments, instead {len(args)} received!')
divisor, dividend, *other_args = args
return sp.Function.__new__(cls, divisor, dividend, *other_args, **kwargs)
def _eval_evalf(self, *args, **kwargs):
return self.divisor.evalf() / self.dividend.evalf()
@property
def divisor(self):
return self.args[0]
@property
def dividend(self):
return self.args[1]
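# Hedged usage sketch: DivFunc keeps the quotient explicit instead of letting sympy
# rewrite it as a product with a negative power. Note that args[0] is named
# ``divisor`` but acts as the numerator in ``_eval_evalf``:
#
#     >>> x, y = sp.symbols("x y")
#     >>> DivFunc(x, y).divisor, DivFunc(x, y).dividend
#     (x, y)
#     >>> DivFunc(sp.Float(1.0), sp.Float(4.0)).evalf()
#     0.250000000000000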
class AddressOf(sp.Function):
"""
AddressOf is the '&' operation in C. It gets the address of a lvalue.
"""
is_Atom = True
def __new__(cls, arg):
obj = sp.Function.__new__(cls, arg)
return obj
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
@property
def dtype(self):
if hasattr(self.args[0], 'dtype'):
return PointerType(self.args[0].dtype, restrict=True)
else:
raise ValueError(f'pystencils supports only non void pointers. Current address_of type: {self.args[0]}')
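# Hedged usage sketch: taking the address of a typed symbol yields a restrict pointer
# type, while untyped expressions raise because no dtype is known:
#
#     >>> from pystencils.typing import TypedSymbol, create_type
#     >>> ptr = AddressOf(TypedSymbol("x", create_type("float64")))
#     >>> isinstance(ptr.dtype, PointerType)
#     True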
from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
from pystencils.gpu.gpujit import make_python_function
from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel

from .indexing import AbstractIndexing, BlockIndexing, LineIndexing

__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
           'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
           'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
try:
import cupy as cp
import cupyx as cpx
except ImportError:
cp = None
cpx = None
import numpy as np
class GPUArrayHandler:
def __init__(self, device_number):
self._device_number = device_number
def zeros(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.zeros(shape=shape, dtype=dtype, order=order)
def ones(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.ones(shape=shape, dtype=dtype, order=order)
def empty(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.empty(shape=shape, dtype=dtype, order=order)
def to_gpu(self, numpy_array):
swaps = _get_index_swaps(numpy_array)
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
gpu_array = cp.asarray(numpy_array.base)
for a, b in reversed(swaps):
gpu_array = gpu_array.swapaxes(a, b)
return gpu_array
else:
return cp.asarray(numpy_array)
def upload(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
array.base.set(numpy_array.base)
else:
with cp.cuda.Device(self._device_number):
array.set(numpy_array)
def download(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
numpy_array.base[:] = array.base.get()
else:
with cp.cuda.Device(self._device_number):
numpy_array[:] = array.get()
def randn(self, shape, dtype=np.float64):
with cp.cuda.Device(self._device_number):
return cp.random.randn(*shape, dtype=dtype)
@staticmethod
def pinned_numpy_array(layout, shape, dtype):
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
cur_layout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if cur_layout[i] != layout[i]:
index_to_swap_with = cur_layout.index(layout[i])
swaps.append((i, index_to_swap_with))
cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
assert tuple(cur_layout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
from_numpy = to_gpu
class GPUNotAvailableHandler:
def __getattribute__(self, name):
raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")
def _get_index_swaps(array):
swaps = []
if array.base is not None and isinstance(array.base, np.ndarray):
for stride in array.base.strides:
index_base = array.base.strides.index(stride)
index_view = array.strides.index(stride)
if index_base != index_view and (index_view, index_base) not in swaps:
swaps.append((index_base, index_view))
return swaps
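# Hedged usage sketch (requires a working cupy installation): the handler mirrors a
# small part of numpy's allocation API on a chosen device and copies data between
# host and device while preserving the array layout:
#
#     >>> handler = GPUArrayHandler(device_number=0)
#     >>> host = np.ones((8, 8))
#     >>> device_arr = handler.to_gpu(host)
#     >>> device_arr *= 2.0
#     >>> handler.download(device_arr, host)
#     >>> bool(np.all(host == 2.0))
#     True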
import numpy as np

from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda
from pystencils.typing import StructType
from pystencils.field import FieldType
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing import BasicType, FieldPointerSymbol

USE_FAST_MATH = True
def get_cubic_interpolation_include_paths():
from os.path import join, dirname
return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
    """
    Creates a kernel function from an abstract syntax tree which
    was created e.g. by :func:`pystencils.gpu.create_cuda_kernel`
    or :func:`pystencils.gpu.created_indexed_cuda_kernel`

    Args:
        kernel_function_node: the abstract syntax tree
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.
        custom_backend: use own custom printer for code generation

    Returns:
        compiled kernel as Python function
    """
    import cupy as cp

    if argument_dict is None:
        argument_dict = {}

    headers = get_headers(kernel_function_node)
    if cp.cuda.runtime.is_hip:
        headers.add('"gpu_defines.h"')
        for field in kernel_function_node.fields_accessed:
            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
                headers.add('<hip/hip_fp16.h>')
    else:
        headers.update({'"gpu_defines.h"', '<cstdint>'})
        for field in kernel_function_node.fields_accessed:
            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
                headers.add('<cuda_fp16.h>')
    header_list = sorted(headers)
    includes = "\n".join([f"#include {include_file}" for include_file in header_list])

    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
    code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))

    options = ["-w", "-std=c++11"]
    if USE_FAST_MATH:
        options.append("-use_fast_math")
    options.append("-I" + get_pystencils_include_path())

    func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)

    parameters = kernel_function_node.get_parameters()

    cache = {}
...@@ -71,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen ...@@ -71,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
for k, v in kwargs.items())) for k, v in kwargs.items()))
try: try:
args, block_and_thread_numbers = cache[key] args, block_and_thread_numbers = cache[key]
func(*args, **block_and_thread_numbers) device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError: except KeyError:
full_arguments = argument_dict.copy() full_arguments = argument_dict.copy()
full_arguments.update(kwargs) full_arguments.update(kwargs)
...@@ -82,17 +88,16 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen ...@@ -82,17 +88,16 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid']) block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO: use texture objects: args = tuple(_build_numpy_argument_list(parameters, full_arguments))
# https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
for tex in textures:
tex_ref = mod.get_texref(str(tex))
ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
args = _build_numpy_argument_list(parameters, full_arguments)
cache[key] = (args, block_and_thread_numbers) cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args, **block_and_thread_numbers) device = set(a.device.id for a in args if type(a) is cp.ndarray)
# import pycuda.driver as cuda assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node ast = kernel_function_node
parameters = kernel_function_node.get_parameters() parameters = kernel_function_node.get_parameters()
...@@ -111,8 +116,8 @@ def _build_numpy_argument_list(parameters, argument_dict): ...@@ -111,8 +116,8 @@ def _build_numpy_argument_list(parameters, argument_dict):
actual_type = array.dtype actual_type = array.dtype
expected_type = param.fields[0].dtype.numpy_dtype expected_type = param.fields[0].dtype.numpy_dtype
if expected_type != actual_type: if expected_type != actual_type:
raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." % raise ValueError(f"Data type mismatch for field {param.field_name}. "
(param.field_name, expected_type, actual_type)) f"Expected {expected_type} got {actual_type}.")
result.append(array) result.append(array)
elif param.is_field_stride: elif param.is_field_stride:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type cast_to_dtype = param.symbol.dtype.numpy_dtype.type
...@@ -147,22 +152,22 @@ def _check_arguments(parameter_specification, argument_dict): ...@@ -147,22 +152,22 @@ def _check_arguments(parameter_specification, argument_dict):
try: try:
field_arr = argument_dict[symbolic_field.name] field_arr = argument_dict[symbolic_field.name]
except KeyError: except KeyError:
raise KeyError("Missing field parameter for kernel call " + str(symbolic_field)) raise KeyError(f"Missing field parameter for kernel call {str(symbolic_field)}")
if symbolic_field.has_fixed_shape: if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape) symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType): if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1] symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape: if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" % raise ValueError(f"Passed array {symbolic_field.name} has shape {str(field_arr.shape)} "
(symbolic_field.name, str(field_arr.shape), str(symbolic_field.shape))) f"which does not match expected shape {str(symbolic_field.shape)}")
if symbolic_field.has_fixed_shape: if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides) symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType): if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1] symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides: if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" % raise ValueError(f"Passed array {symbolic_field.name} has strides {str(field_arr.strides)} "
(symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides))) f"which does not match expected strides {str(symbolic_field_strides)}")
if FieldType.is_indexed(symbolic_field): if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions]) index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
...@@ -170,9 +175,9 @@ def _check_arguments(parameter_specification, argument_dict): ...@@ -170,9 +175,9 @@ def _check_arguments(parameter_specification, argument_dict):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions]) array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
if len(array_shapes) > 1: if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes)) raise ValueError(f"All passed arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 1: if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes)) raise ValueError(f"All passed index arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 0: if len(index_arr_shapes) > 0:
return list(index_arr_shapes)[0] return list(index_arr_shapes)[0]
......
import abc import abc
from functools import partial from functools import partial
import math
from typing import List, Tuple
import sympy as sp import sympy as sp
from sympy.core.cache import cacheit from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.data_types import TypedSymbol, create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice
from pystencils.sympyextensions import is_integer_sequence, prod from pystencils.sympyextensions import is_integer_sequence, prod
...@@ -32,23 +33,58 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c ...@@ -32,23 +33,58 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c
class AbstractIndexing(abc.ABC):
    """
    Abstract base class for all indexing classes. An indexing class defines how an iteration space is mapped
    to the GPU's block and grid system. It calculates indices based on the GPU's thread and block indices
    and computes the number of blocks and threads a kernel is started with.
    The indexing class is created with an iteration space that is given as a list of slices to determine start,
    stop and the step size for each coordinate. Further, the data_layout is given as a tuple to determine the
    fast and slow coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
    """
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
for iter_space in iteration_space:
assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
f"not tuple of type {type(iter_space)}"
self._iteration_space = iteration_space
self._data_layout = data_layout
self._dim = len(iteration_space)
@property
def iteration_space(self):
"""Iteration space to loop over"""
return self._iteration_space
@property
def data_layout(self):
"""Data layout of the kernels arrays"""
return self._data_layout
@property
def dim(self):
"""Number of spatial dimensions"""
return self._dim
@property @property
@abc.abstractmethod @abc.abstractmethod
def coordinates(self): def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices. """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """ These symbolic indices can be obtained with the method `index_variables` """
@property @property
def index_variables(self): def index_variables(self):
"""Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """ """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
"""Adds assignments for the loop counter symbols depending on the gpu threads.
Args:
loop_counter_symbols: typed symbols representing the loop counters
Returns:
assignments for the loop counters
"""
@abc.abstractmethod @abc.abstractmethod
def call_parameters(self, arr_shape): def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call. """Determine grid and block size for kernel call.
...@@ -88,57 +124,79 @@ class AbstractIndexing(abc.ABC): ...@@ -88,57 +124,79 @@ class AbstractIndexing(abc.ABC):
class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.

    Args:
        iteration_space: list of slices to determine start, stop and the step size for each coordinate
        data_layout: tuple specifying loop order with innermost loop last.
                     This is the same format as returned by `Field.layout`.
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
        compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
        maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
                            maximum block size from the device properties
        device_number: device number of the used GPU. By default, the zeroth device is used.
    """
def __init__(self, field, iteration_slice, def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)): maximum_block_size=(1024, 1024, 64), device_number=None):
if field.spatial_dimensions > 3: super(BlockIndexing, self).__init__(iteration_space, data_layout)
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if self._dim > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
if permute_block_size_dependent_on_layout: if permute_block_size_dependent_on_layout and self._dim < 4:
block_size = self.permute_block_size_according_to_layout(block_size, field.layout) block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
self._block_size = block_size self._block_size = block_size
if maximum_block_size == 'auto': if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
# Get device limits # Get device limits
import pycuda.driver as cuda import cupy as cp
# noinspection PyUnresolvedReferences # See https://github.com/cupy/cupy/issues/7676
import pycuda.autoinit # NOQA if cp.cuda.runtime.is_hip:
da = cuda.device_attribute maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
device = cuda.Context.get_device() else:
maximum_block_size = tuple(device.get_attribute(a) da = cp.cuda.Device(device_number).attributes
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)) maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size self._maximum_block_size = maximum_block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
self._compile_time_block_size = compile_time_block_size self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property @property
def coordinates(self): def cuda_indices(self):
offsets = _get_start_from_slice(self._iterationSlice)
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
coordinates = [block_index * bs + thread_idx + off indices = [block_index * bs + thread_idx
for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)] for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
return indices[:self._dim]
return coordinates[:self._dim] @property
def coordinates(self):
if self._dim < 4:
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
return coordinates[:self._dim]
else:
coordinates = list()
width = self._iteration_space[1].stop - self.iteration_space[1].start
coordinates.append(div_floor(self.cuda_indices[0], width))
coordinates.append(sp.Mod(self.cuda_indices[0], width))
coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
return coordinates
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape): def call_parameters(self, arr_shape):
substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None} numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
if len(widths) > 3:
widths = [widths[0] * widths[1], widths[2], widths[3]]
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
_get_end_from_slice(self._iterationSlice, arr_shape))]
widths = sp.Matrix(widths).subs(substitution_dict)
extend_bs = (1,) * (3 - len(self._block_size)) extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs block_size = self._block_size + extend_bs
if not self._compile_time_block_size: if not self._compile_time_block_size:
...@@ -147,7 +205,8 @@ class BlockIndexing(AbstractIndexing): ...@@ -147,7 +205,8 @@ class BlockIndexing(AbstractIndexing):
for i in range(len(widths)): for i in range(len(widths)):
factor = div_floor(prod(block_size[:i]), prod(adapted_block_size)) factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i])) adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
block_size = tuple(adapted_block_size) + extend_bs extend_adapted_bs = (1,) * (3 - len(adapted_block_size))
block_size = tuple(adapted_block_size) + extend_adapted_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size)) block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size)) grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
...@@ -158,42 +217,59 @@ class BlockIndexing(AbstractIndexing): ...@@ -158,42 +217,59 @@ class BlockIndexing(AbstractIndexing):
def guard(self, kernel_content, arr_shape): def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim] arr_shape = arr_shape[:self._dim]
conditions = [c < end if len(self._iteration_space) - 1 == len(arr_shape):
for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))] numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
else:
assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
for i, s in enumerate(numeric_iteration_slice):
if s.step and s.step != 1:
end[i] = div_ceil(s.stop - s.start, s.step) + s.start
if self._dim < 4:
conditions = [c < e for c, e in zip(self.coordinates, end)]
else:
end = [end[0] * end[1], end[2], end[3]]
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
conditions = [c < e for c, e in zip(coordinates, end)]
condition = conditions[0] condition = conditions[0]
for c in conditions[1:]: for c in conditions[1:]:
condition = sp.And(condition, c) condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)]) return Block([Conditional(condition, kernel_content)])
@staticmethod def numeric_iteration_space(self, arr_shape):
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None): return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
"""Shrinks the block_size if there are too many registers used per multiprocessor.
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation. This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a pycuda function. They can be obtained by ``func.num_regs`` from a cupy function.
:returns smaller block_size if too many registers are used. Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
returns: smaller block_size if too many registers are used.
""" """
import pycuda.driver as cuda import cupy as cp
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
if device is None:
device = cuda.Context.get_device()
available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
block = block_size # See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True: while True:
num_threads = 1 required_registers = math.prod(result) * required_registers_per_thread
for t in block: if required_registers <= max_registers_per_block:
num_threads *= t return result
required_registers_per_mt = num_threads * required_registers_per_thread
if required_registers_per_mt <= available_registers_per_mp:
return block
else: else:
largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e]) largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert block[largest_grid_entry_idx] >= 2 assert result[largest_list_entry_idx] >= 2
block[largest_grid_entry_idx] //= 2 result[largest_list_entry_idx] //= 2
@staticmethod @staticmethod
def permute_block_size_according_to_layout(block_size, layout): def permute_block_size_according_to_layout(block_size, layout):
...@@ -222,42 +298,48 @@ class BlockIndexing(AbstractIndexing): ...@@ -222,42 +298,48 @@ class BlockIndexing(AbstractIndexing):
class LineIndexing(AbstractIndexing):
    """
    Indexing scheme that assigns the innermost 'line', i.e. the elements which are adjacent in memory, to a 1D
    GPU block. The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to
    block_idx.{x,y,z}. This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension
    must not be larger than the maximum number of threads allowed in a GPU block (which depends on the device).

    Args:
        iteration_space: list of slices to determine start, stop and the step size for each coordinate
        data_layout: tuple to determine the fast and slow coordinates.
    """
def __init__(self, field, iteration_slice): def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX super(LineIndexing, self).__init__(iteration_space, data_layout)
if field.spatial_dimensions > 4:
if len(iteration_space) > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = available_indices[:field.spatial_dimensions] @property
def cuda_indices(self):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
coordinates = available_indices[:self.dim]
fastest_coordinate = field.layout[-1] fastest_coordinate = self.data_layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0] coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
self._coordinates = coordinates return coordinates
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
@property @property
def coordinates(self): def coordinates(self):
return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))] return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
def call_parameters(self, arr_shape): def get_loop_ctr_assignments(self, loop_counter_symbols):
substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None} return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice), def call_parameters(self, arr_shape):
_get_end_from_slice(self._iterationSlice, arr_shape))] numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = sp.Matrix(widths).subs(substitution_dict) widths = _get_widths(numeric_iteration_slice)
def get_shape_of_cuda_idx(cuda_idx): def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self._coordinates: if cuda_idx not in self.cuda_indices:
return 1 return 1
else: else:
idx = self._coordinates.index(cuda_idx) idx = self.cuda_indices.index(cuda_idx)
return widths[idx] return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]), return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
...@@ -272,30 +354,66 @@ class LineIndexing(AbstractIndexing): ...@@ -272,30 +354,66 @@ class LineIndexing(AbstractIndexing):
def symbolic_parameters(self): def symbolic_parameters(self):
return set() return set()
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
# -------------------------------------- Helper functions --------------------------------------------------------------

def _get_numeric_iteration_slice(iteration_slice, arr_shape):
    res = []
    for slice_component, shape in zip(iteration_slice, arr_shape):
        result_slice = slice_component
        if not isinstance(result_slice.start, int):
            start = result_slice.start
            assert len(start.free_symbols) == 1
            start = start.subs({symbol: shape for symbol in start.free_symbols})
            result_slice = slice(start, result_slice.stop, result_slice.step)
        if not isinstance(result_slice.stop, int):
            stop = result_slice.stop
            assert len(stop.free_symbols) == 1
            stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
            result_slice = slice(result_slice.start, stop, result_slice.step)
        assert isinstance(result_slice.step, int)
        res.append(result_slice)
    return res
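# Hedged sketch: symbolic slice bounds (typically field-shape symbols) are replaced by
# the concrete array shape, e.g. a ghost-layer slice on a length-20 axis; the symbol
# name below is only illustrative:
#
#     >>> _size = sp.Symbol("_size_f_0")
#     >>> _get_numeric_iteration_slice([slice(1, _size - 1, 1)], arr_shape=(20,))
#     [slice(1, 19, 1)]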
def _get_widths(iteration_slice):
    widths = []
    for iter_slice in iteration_slice:
        step = iter_slice.step
        assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}"
        start = iter_slice.start
        stop = iter_slice.stop
        if step == 1:
            if stop - start == 0:
                widths.append(1)
            else:
                widths.append(stop - start)
        else:
            width = (stop - start) / step
            if isinstance(width, int):
                widths.append(width)
            elif isinstance(width, float):
                widths.append(math.ceil(width))
            else:
                widths.append(div_ceil(stop - start, step))
    return widths
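# Hedged sketch of the width computation: a unit-stride slice contributes its plain
# extent, while a strided slice is rounded up to the number of iterates:
#
#     >>> _get_widths([slice(0, 10, 1), slice(0, 10, 3)])
#     [10, 4]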
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
loop_ctr_assignments = []
for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
if isinstance(iter_slice, slice) and iter_slice.step > 1:
offset = (iter_slice.step * iter_slice.start) - iter_slice.start
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
elif iter_slice.start == iter_slice.stop:
loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
else:
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
return loop_ctr_assignments
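# Hedged sketch: for a strided iteration slice the GPU coordinate (which already
# includes the slice start) is rescaled back to the original loop counter, so with
# slice(1, 10, 2) a symbolic coordinate c yields ctr_0 = 2*c - 1, i.e. the threads
# write to cells 1, 3, 5, ... The symbol c below is only illustrative:
#
#     >>> from pystencils.astnodes import LoopOverCoordinate
#     >>> c = sp.Symbol("c")
#     >>> ctr = LoopOverCoordinate.get_loop_counter_symbol(0)
#     >>> _loop_ctr_assignments([ctr], [c], [slice(1, 10, 2)])[0].rhs
#     2*c - 1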
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
@@ -305,7 +423,7 @@ def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    elif gpu_indexing == 'line':
        indexing_creator = LineIndexing
    else:
        raise ValueError(f"Unknown GPU indexing {gpu_indexing}. Valid values are 'block' and 'line'")
    if gpu_indexing_params:
        indexing_creator = partial(indexing_creator, **gpu_indexing_params)
    return indexing_creator
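# Hedged usage sketch: the factory returns the indexing class, optionally
# pre-configured with the given parameters via functools.partial:
#
#     >>> creator = indexing_creator_from_params('block', {'block_size': (32, 8, 1)})
#     >>> indexing = creator(iteration_space=(slice(0, 16, 1), slice(0, 16, 1)),
#     ...                    data_layout=(0, 1))
#     >>> type(indexing).__name__
#     'BlockIndexing'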
......
import sympy as sp

from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.slicing import normalize_slice
from pystencils.transformations import (
    get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
    resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
def create_cuda_kernel(assignments, def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
function_name="kernel",
type_info=None, function_name = config.function_name
indexing_creator=BlockIndexing, indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice=None, iteration_slice = config.iteration_slice
ghost_layers=None, ghost_layers = config.ghost_layers
skip_independence_check=False,
use_textures_for_interpolation=True): fields_written = assignments.bound_fields
fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check) fields_read = assignments.rhs_fields
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
...@@ -25,12 +36,16 @@ def create_cuda_kernel(assignments, ...@@ -25,12 +36,16 @@ def create_cuda_kernel(assignments,
field_accesses = set() field_accesses = set()
num_buffer_accesses = 0 num_buffer_accesses = 0
indexed_elements = set()
for eq in assignments: for eq in assignments:
indexed_elements.update(eq.atoms(sp.Indexed))
field_accesses.update(eq.atoms(Field.Access)) field_accesses.update(eq.atoms(Field.Access))
field_accesses = {e for e in field_accesses if not e.is_absolute_access} field_accesses = {e for e in field_accesses if not e.is_absolute_access}
num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field)) num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))
common_shape = get_common_shape(fields_without_buffers) # common shape and field to from the iteration space
common_field = get_common_field(fields_without_buffers)
common_shape = common_field.spatial_shape
if iteration_slice is None: if iteration_slice is None:
# determine iteration slice from ghost layers # determine iteration slice from ghost layers
...@@ -48,35 +63,51 @@ def create_cuda_kernel(assignments, ...@@ -48,35 +63,51 @@ def create_cuda_kernel(assignments,
iteration_slice.append(slice(ghost_layers[i][0], iteration_slice.append(slice(ghost_layers[i][0],
-ghost_layers[i][1] if ghost_layers[i][1] > 0 else None)) -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))
indexing = indexing_creator(field=list(fields_without_buffers)[0], iteration_slice=iteration_slice) iteration_space = normalize_slice(iteration_slice, common_shape)
coord_mapping = indexing.coordinates else:
iteration_space = normalize_slice(iteration_slice, common_shape)
cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
for i, value in enumerate(coord_mapping)] iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
assignments = cell_idx_assignments + assignments loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
block = Block(assignments) if len(indexed_elements) > 0:
common_indexed_element = get_common_indexed_element(indexed_elements)
index = common_indexed_element.indices[0].atoms(TypedSymbol)
assert len(index) == 1, "index expressions must only contain one symbol representing the index"
indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
data_layout=common_field.layout)
extended_ctrs = [index.pop(), *loop_counter_symbols]
loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
else:
indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
assignments = loop_counter_assignments + assignments
block = indexing.guard(Block(assignments), common_shape)
block = indexing.guard(block, common_shape)
unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers) unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
ast = KernelFunction(block, 'gpu', 'gpucuda', make_python_function, ghost_layers, function_name) ast = KernelFunction(block,
Target.GPU,
Backend.CUDA,
make_python_function,
ghost_layers,
function_name,
assignments=assignments)
ast.global_variables.update(indexing.index_variables) ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None:
base_pointer_spec = [['spatialInner0']] base_pointer_spec = []
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions) f.spatial_dimensions, f.index_dimensions)
for f in all_fields} for f in all_fields}
coord_mapping = {f.name: cell_idx_symbols for f in all_fields} coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
loop_strides = list(fields_without_buffers)[0].shape
if any(FieldType.is_buffer(f) for f in all_fields): if any(FieldType.is_buffer(f) for f in all_fields):
resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, loop_strides), read_only_fields) base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info, resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
     field_to_fixed_coordinates=coord_mapping)
...@@ -95,52 +126,62 @@ def create_cuda_kernel(assignments,
     return ast
-def created_indexed_cuda_kernel(assignments,
-                                index_fields,
-                                function_name="kernel",
-                                type_info=None,
-                                coordinate_names=('x', 'y', 'z'),
-                                indexing_creator=BlockIndexing,
-                                use_textures_for_interpolation=True):
-    fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False)
+def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
+    index_fields = config.index_fields
+    function_name = config.function_name
+    coordinate_names = config.coordinate_names
+    indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields
     all_fields = fields_read.union(fields_written)
     read_only_fields = set([f.name for f in fields_read - fields_written])
+    # extract the index fields based on the name. The original index field might have been modified
+    index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
+    non_index_fields = [f for f in all_fields if f not in index_fields]
+    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
+    assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates " \
+                                          f"Non index fields are {non_index_fields}, spatial coordinates are " \
+                                          f"{spatial_coordinates}"
+    spatial_coordinates = list(spatial_coordinates)[0]
+    assignments = assignments.all_assignments
+    assignments = add_types(assignments, config)
     for index_field in index_fields:
         index_field.field_type = FieldType.INDEXED
         assert FieldType.is_indexed(index_field)
         assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
-    non_index_fields = [f for f in all_fields if f not in index_fields]
-    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
-    assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
-    spatial_coordinates = list(spatial_coordinates)[0]
     def get_coordinate_symbol_assignment(name):
         for ind_f in index_fields:
             assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
             data_type = ind_f.dtype
             if data_type.has_element(name):
                 rhs = ind_f[0](name)
-                lhs = TypedSymbol(name, BasicType(data_type.get_element_type(name)))
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                 return SympyAssignment(lhs, rhs)
-        raise ValueError("Index %s not found in any of the passed index fields" % (name,))
+        raise ValueError(f"Index {name} not found in any of the passed index fields")
     coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n)
                                      for n in coordinate_names[:spatial_coordinates]]
     coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]
     idx_field = list(index_fields)[0]
-    indexing = indexing_creator(field=idx_field,
-                                iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape))
+    iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
+                                      idx_field.spatial_shape)
+    indexing = indexing_creator(iteration_space=iteration_space,
+                                data_layout=idx_field.layout)
     function_body = Block(coordinate_symbol_assignments + assignments)
-    function_body = indexing.guard(function_body, get_common_shape(index_fields))
-    ast = KernelFunction(function_body, 'gpu', 'gpucuda', make_python_function, None, function_name)
+    function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
+    ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
+                         None, function_name, assignments=assignments)
    ast.global_variables.update(indexing.index_variables)
-    implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
     coord_mapping = indexing.coordinates
     base_pointer_spec = [['spatialInner0']]
     base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
...
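To make the indexed-kernel contract above concrete, here is a minimal sketch (illustrative only, not part of the diff) of the kind of index data created_indexed_cuda_kernel expects: a 1D array with a struct dtype whose member names match the requested coordinate_names.

import numpy as np
# Hypothetical index data: one struct entry per cell the kernel should visit.
index_dtype = np.dtype([('x', np.int32), ('y', np.int32)])
index_data = np.array([(1, 1), (1, 2), (4, 7)], dtype=index_dtype)
# For entry i, get_coordinate_symbol_assignment makes the generated kernel read
# x = index_data[i]['x'] and y = index_data[i]['y'] before evaluating the assignments at that cell.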
 import numpy as np
+from itertools import product
+from pystencils import CreateKernelConfig, create_kernel
+from pystencils.gpu import make_python_function
 from pystencils import Assignment, Field
-from pystencils.gpucuda import make_python_function
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+from pystencils.enums import Target
 from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
 def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64):
     """Copies a rectangular part of an array to another non-overlapping part"""
-    if index_dimensions not in (0, 1):
-        raise NotImplementedError("Works only for one or zero index coordinates")
     f = Field.create_generic("pdfs", len(domain_size), index_dimensions=index_dimensions, dtype=dtype)
     normalized_from_slice = normalize_slice(from_slice, f.spatial_shape)
...@@ -20,21 +20,27 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
         "Slices have to have same size"
     update_eqs = []
-    for i in range(index_dim_shape):
-        eq = Assignment(f(i), f[tuple(offset)](i))
+    if index_dimensions < 2:
+        index_dim_shape = [index_dim_shape]
+    for i in product(*[range(d) for d in index_dim_shape]):
+        eq = Assignment(f(*i), f[tuple(offset)](*i))
         update_eqs.append(eq)
-    ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True)
-    return make_python_function(ast)
+    config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
+    ast = create_kernel(update_eqs, config=config)
+    return ast
 def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=float):
+                                  thickness=None, dtype=np.float64, target=Target.GPU):
+    assert target in {Target.GPU}
     src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
     kernels = []
-    index_dimensions = index_dimensions
     for src_slice, dst_slice in src_dst_slice_tuples:
-        kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype))
+        ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
+        kernels.append(make_python_function(ast))
     def functor(pdfs, **_):
         for kernel in kernels:
...
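For orientation, a minimal usage sketch of the updated periodicity functor; the GPU array pdfs_gpu and the four-direction stencil are assumptions for illustration, and the exact array type depends on the GPU backend in use.

stencil = ((1, 0), (-1, 0), (0, 1), (0, -1))
periodic = get_periodic_boundary_functor(stencil, domain_size=(64, 64), ghost_layers=1)
periodic(pdfs=pdfs_gpu)  # copies the wrap-around slices so the ghost layers become periodic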
-from os.path import dirname, join, realpath
+from os.path import dirname, realpath
 def get_pystencils_include_path():
     return dirname(realpath(__file__))
-def get_pycuda_include_path():
-    import pycuda
-    return join(dirname(realpath(pycuda.__file__)), 'cuda')
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#include <array>
#include <map>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#include "myintrin.h"
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
template <typename T, std::size_t Alignment>
class AlignedAllocator
{
public:
typedef T value_type;
template <typename U>
struct rebind {
typedef AlignedAllocator<U, Alignment> other;
};
T * allocate(const std::size_t n) const {
if (n == 0) {
return nullptr;
}
void * const p = _mm_malloc(n*sizeof(T), Alignment);
if (p == nullptr) {
throw std::bad_alloc();
}
return static_cast<T *>(p);
}
void deallocate(T * const p, const std::size_t n) const {
_mm_free(p);
}
};
template <typename Key, typename T>
using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>;
#if defined(__AES__) || defined(_MSC_VER)
QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
__m128i temp3;
temp2 = _mm_shuffle_epi32(temp2 ,0xff);
temp3 = _mm_slli_si128(temp1, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp1 = _mm_xor_si128(temp1, temp2);
return temp1;
}
QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
std::array<__m128i,11> rk;
__m128i tmp;
rk[0] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1);
k = aesni_keygen_assist(k, tmp);
rk[1] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x2);
k = aesni_keygen_assist(k, tmp);
rk[2] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x4);
k = aesni_keygen_assist(k, tmp);
rk[3] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x8);
k = aesni_keygen_assist(k, tmp);
rk[4] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x10);
k = aesni_keygen_assist(k, tmp);
rk[5] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x20);
k = aesni_keygen_assist(k, tmp);
rk[6] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x40);
k = aesni_keygen_assist(k, tmp);
rk[7] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x80);
k = aesni_keygen_assist(k, tmp);
rk[8] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1b);
k = aesni_keygen_assist(k, tmp);
rk[9] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x36);
k = aesni_keygen_assist(k, tmp);
rk[10] = k;
return rk;
}
QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) {
alignas(16) std::array<uint32,4> a;
_mm_store_si128((__m128i*) a.data(), k128);
static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk = aesni_keygen(k128);
roundkeys[a] = rk;
}
return roundkeys[a];
}
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k0) {
auto k = aesni_roundkeys(k0);
__m128i x = _mm_xor_si128(k[0], in);
x = _mm_aesenc_si128(x, k[1]);
x = _mm_aesenc_si128(x, k[2]);
x = _mm_aesenc_si128(x, k[3]);
x = _mm_aesenc_si128(x, k[4]);
x = _mm_aesenc_si128(x, k[5]);
x = _mm_aesenc_si128(x, k[6]);
x = _mm_aesenc_si128(x, k[7]);
x = _mm_aesenc_si128(x, k[8]);
x = _mm_aesenc_si128(x, k[9]);
x = _mm_aesenclast_si128(x, k[10]);
return x;
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
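// Usage sketch (hypothetical helper, not part of the original header; host code with AES-NI enabled):
// the counter words act as a position in the random stream (e.g. cell index and time step) and the
// key words as the seed, so the same (counter, key) combination always reproduces the same numbers.
QUALIFIERS void aesni_example(double & d1, double & d2, float & f1, float & f2, float & f3, float & f4)
{
   uint32 ctr0 = 0, ctr1 = 42, ctr2 = 7, ctr3 = 0;           // stream position
   uint32 key0 = 0xC0FFEE, key1 = 1, key2 = 2, key3 = 3;     // seed
   aesni_double2(ctr0, ctr1, ctr2, ctr3, key0, key1, key2, key3, d1, d2);          // two doubles in (0,1)
   aesni_float4(ctr0, ctr1, ctr2, ctr3, key0, key1, key2, key3, f1, f2, f3, f4);   // four floats in (0,1)
}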
template<bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
}
else
{
x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
}
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void aesni_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
// convert uint32 to float
rnd1 = _my_cvtepu32_ps(ctr[0]);
rnd2 = _my_cvtepu32_ps(ctr[1]);
rnd3 = _my_cvtepu32_ps(ctr[2]);
rnd4 = _my_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1, __m128d & rnd2)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
__m128d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
}
#endif
#ifdef __AVX2__
QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) {
alignas(32) std::array<uint32,8> a;
_mm256_store_si256((__m256i*) a.data(), k256);
static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm256_extractf128_si256(k256, 0));
auto rk2 = aesni_keygen(_mm256_extractf128_si256(k256, 1));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my256_set_m128i(rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m256i aesni1xm128i(const __m256i & in, const __m256i & k0) {
#if defined(__VAES__)
auto k = aesni_roundkeys(k0);
__m256i x = _mm256_xor_si256(k[0], in);
x = _mm256_aesenc_epi128(x, k[1]);
x = _mm256_aesenc_epi128(x, k[2]);
x = _mm256_aesenc_epi128(x, k[3]);
x = _mm256_aesenc_epi128(x, k[4]);
x = _mm256_aesenc_epi128(x, k[5]);
x = _mm256_aesenc_epi128(x, k[6]);
x = _mm256_aesenc_epi128(x, k[7]);
x = _mm256_aesenc_epi128(x, k[8]);
x = _mm256_aesenc_epi128(x, k[9]);
x = _mm256_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm256_extractf128_si256(in, 0), _mm256_extractf128_si256(k0, 0));
__m128i b = aesni1xm128i(_mm256_extractf128_si256(in, 1), _mm256_extractf128_si256(k0, 1));
__m256i x = _my256_set_m128i(b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
}
else
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm256_xor_si256(x, z);
// convert uint64 to double
__m256d rs = _my256_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void aesni_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
// convert uint32 to float
rnd1 = _my256_cvtepu32_ps(ctr[0]);
rnd2 = _my256_cvtepu32_ps(ctr[1]);
rnd3 = _my256_cvtepu32_ps(ctr[2]);
rnd4 = _my256_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1, __m256d & rnd2)
{
#if 0
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
__m256d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
alignas(64) std::array<uint32,16> a;
_mm512_store_si512((__m512i*) a.data(), k512);
static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0));
auto rk2 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 1));
auto rk3 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 2));
auto rk4 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 3));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my512_set_m128i(rk4[i], rk3[i], rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
#ifdef __VAES__
auto k = aesni_roundkeys(k0);
__m512i x = _mm512_xor_si512(k[0], in);
x = _mm512_aesenc_epi128(x, k[1]);
x = _mm512_aesenc_epi128(x, k[2]);
x = _mm512_aesenc_epi128(x, k[3]);
x = _mm512_aesenc_epi128(x, k[4]);
x = _mm512_aesenc_epi128(x, k[5]);
x = _mm512_aesenc_epi128(x, k[6]);
x = _mm512_aesenc_epi128(x, k[7]);
x = _mm512_aesenc_epi128(x, k[8]);
x = _mm512_aesenc_epi128(x, k[9]);
x = _mm512_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0));
__m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1));
__m128i c = aesni1xm128i(_mm512_extracti32x4_epi32(in, 2), _mm512_extracti32x4_epi32(k0, 2));
__m128i d = aesni1xm128i(_mm512_extracti32x4_epi32(in, 3), _mm512_extracti32x4_epi32(k0, 3));
__m512i x = _my512_set_m128i(d, c, b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
}
else
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm512_xor_si512(x, z);
// convert uint64 to double
__m512d rs = _mm512_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
QUALIFIERS void aesni_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
// convert uint32 to float
rnd1 = _mm512_cvtepu32_ps(ctr[0]);
rnd2 = _mm512_cvtepu32_ps(ctr[1]);
rnd3 = _mm512_cvtepu32_ps(ctr[2]);
rnd4 = _mm512_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
}
QUALIFIERS void aesni_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1, __m512d & rnd2)
{
#if 0
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
__m512d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
/*
Copyright 2021-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_MSC_VER)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#include <arm_sve.h>
typedef svbool_t svbool_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint32_t svint32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#endif
#ifdef __ARM_NEON
inline float32x4_t makeVec_f32(float a, float b, float c, float d)
{
alignas(16) float data[4] = {a, b, c, d};
return vld1q_f32(data);
}
inline float64x2_t makeVec_f64(double a, double b)
{
alignas(16) double data[2] = {a, b};
return vld1q_f64(data);
}
inline int32x4_t makeVec_s32(int a, int b, int c, int d)
{
alignas(16) int data[4] = {a, b, c, d};
return vld1q_s32(data);
}
#endif
inline void cachelineZero(void * p) {
#if !defined(_MSC_VER) || defined(__clang__)
__asm__ volatile("dc zva, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
#if !defined(_MSC_VER) || defined(__clang__)
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & (1 << 4)) != 0) {
return SIZE_MAX;
}
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
#endif
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
/*
Copyright 2023, Markus Holzer.
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#ifdef __HIPCC_RTC__
typedef __hip_uint8_t uint8_t;
typedef __hip_int8_t int8_t;
typedef __hip_uint16_t uint16_t;
typedef __hip_int16_t int16_t;
#endif
/*
Copyright 2023, Markus Holzer.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
/// Clang version must be 15 or higher for x86 half precision support.
/// GCC version must be 12 or higher for x86 half precision support.
/// Also, support seems to require SSE, so ensure that the respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
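// Minimal usage sketch (hypothetical, not part of the header): arithmetic on half is typically
// performed by promotion to float and truncation back, so the type mainly pays off for
// storage- and bandwidth-bound data.
inline float half_example()
{
   half a = static_cast<half>(1.5f);
   half b = static_cast<half>(0.25f);
   half c = a + b;                  // usually evaluated in float, stored back as half
   return static_cast<float>(c);    // 1.75f
}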
/*
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
{
__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi32(R0, R1);
T1 = _mm_unpacklo_epi32(R2, R3);
T2 = _mm_unpackhi_epi32(R0, R1);
T3 = _mm_unpackhi_epi32(R2, R3);
R0 = _mm_unpacklo_epi64(T0, T1);
R1 = _mm_unpackhi_epi64(T0, T1);
R2 = _mm_unpacklo_epi64(T2, T3);
R3 = _mm_unpackhi_epi64(T2, T3);
}
#endif
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif
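// Note on the fallback branch of _my_cvtepu64_pd above: ORing the high 32 bits of x into the bit
// pattern of the double constant 2^84 yields the value 2^84 + hi * 2^32, and blending the low 32 bits
// into the pattern of 2^52 yields 2^52 + lo. Subtracting the combined constant 2^84 + 2^52 from the
// first and adding the second reconstructs hi * 2^32 + lo exactly, i.e. the unsigned 64-bit integer
// as a double, without needing the AVX-512 conversion instruction.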
#ifdef __AVX2__
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128i(hi, lo);
#else
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128d(hi, lo);
#else
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu32_ps(v);
#else
__m256i v2 = _mm256_srli_epi32(v, 1);
__m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
__m256 v2f = _mm256_cvtepi32_ps(v2);
__m256 v1f = _mm256_cvtepi32_ps(v1);
return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
#else
__m256i xH = _mm256_srli_epi64(x, 32);
xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
__m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
}
QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
{
return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2024, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
#include <arm_sve.h>
#endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
#include <ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__
#include <altivec.h>
#undef bool
#ifndef _ARCH_PWR8
#include <pveclib/vec_int64_ppc.h>
#endif
#endif
#ifdef __riscv_v
#include <riscv_vector.h>
#endif
#endif
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
#define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline
#else
#define QUALIFIERS inline
#include "myintrin.h"
#endif
#if defined(__ARM_FEATURE_SME)
#define SVE_QUALIFIERS __attribute__((arm_streaming_compatible)) QUALIFIERS
#else
#define SVE_QUALIFIERS QUALIFIERS
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#ifdef __OPENCL_VERSION__
#include "opencl_stdint.h"
typedef uint32_t uint32;
typedef uint64_t uint64;
#else
#ifndef __HIPCC_RTC__
#include <cstdint>
#endif
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st;
#endif
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
// host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b);
return a*b;
#elif defined(__OPENCL_VERSION__)
*hip = mul_hi(a,b);
return a*b;
#else
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#endif
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
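// The mapping above in numbers: z = x ^ (y << 21) is an integer in [0, 2^53), TWOPOW53_INV_DOUBLE
// equals 2^-53, so z * 2^-53 lies in [0, 1), and the added TWOPOW53_INV_DOUBLE/2 = 2^-54 centres each
// value in its bin. The result is therefore strictly inside (0, 1); e.g. x = y = 0 gives 2^-54
// instead of 0.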
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
double * rnd1, double * rnd2)
#else
double & rnd1, double & rnd2)
#endif
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
#ifdef __OPENCL_VERSION__
*rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
*rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
#else
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
#endif
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
float * rnd1, float * rnd2, float * rnd3, float * rnd4)
#else
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
#endif
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
#ifdef __OPENCL_VERSION__
*rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
#else
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
#endif
}
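// Usage sketch (hypothetical helper, not part of the original header; non-OpenCL code): Philox is
// counter-based, so feeding e.g. the cell coordinates and the time step as counters gives
// reproducible per-cell random numbers without carrying any generator state around.
#ifndef __OPENCL_VERSION__
QUALIFIERS void philox_example(uint32 x, uint32 y, uint32 time_step, uint32 seed,
                               float & r1, float & r2, float & r3, float & r4)
{
   philox_float4(time_step, x, y, 0, seed, 0, r1, r2, r3, r4);   // four floats in (0,1)
}
#endif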
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{
__m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
__m128i lohi0b = _mm_mul_epu32(_mm_srli_epi64(ctr[0], 32), _mm_set1_epi32(PHILOX_M4x32_0));
__m128i lohi1a = _mm_mul_epu32(ctr[2], _mm_set1_epi32(PHILOX_M4x32_1));
__m128i lohi1b = _mm_mul_epu32(_mm_srli_epi64(ctr[2], 32), _mm_set1_epi32(PHILOX_M4x32_1));
lohi0a = _mm_shuffle_epi32(lohi0a, 0xD8);
lohi0b = _mm_shuffle_epi32(lohi0b, 0xD8);
lohi1a = _mm_shuffle_epi32(lohi1a, 0xD8);
lohi1b = _mm_shuffle_epi32(lohi1b, 0xD8);
__m128i lo0 = _mm_unpacklo_epi32(lohi0a, lohi0b);
__m128i hi0 = _mm_unpackhi_epi32(lohi0a, lohi0b);
__m128i lo1 = _mm_unpacklo_epi32(lohi1a, lohi1b);
__m128i hi1 = _mm_unpackhi_epi32(lohi1a, lohi1b);
ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__m128i* key)
{
key[0] = _mm_add_epi32(key[0], _mm_set1_epi32(PHILOX_W32_0));
key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
}
else
{
x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
}
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void philox_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = _my_cvtepu32_ps(ctr[0]);
rnd2 = _my_cvtepu32_ps(ctr[1]);
rnd3 = _my_cvtepu32_ps(ctr[2]);
rnd4 = _my_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void philox_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m128d & rnd1, __m128d & rnd2)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
__m128d ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
#endif
#ifdef __ALTIVEC__
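// AltiVec/VSX implementation: the 32x32->64 bit multiplies by the Philox constants are
// split into low/high halves using the instructions available at the given POWER ISA level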
QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
{
#ifndef _ARCH_PWR8
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
__vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
__vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
__vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif
ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key)
{
key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
}
#ifdef __VSX__
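// _uniform_double_hq: widen the selected (low or high) 32-bit lanes of x and y to 64 bit,
// combine them into a value below 2^53 and scale it into a uniform double in (0,1)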
template<bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y)
{
// convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
if (high)
{
x = vec_mergel(x, vec_splats(0U));
y = vec_mergel(y, vec_splats(0U));
}
else
{
x = vec_mergeh(x, vec_splats(0U));
y = vec_mergeh(y, vec_splats(0U));
}
#else
if (high)
{
x = vec_mergel(vec_splats(0U), x);
y = vec_mergel(vec_splats(0U), y);
}
else
{
x = vec_mergeh(vec_splats(0U), x);
y = vec_mergeh(vec_splats(0U), y);
}
#endif
// calculate z = x ^ (y << (53 - 32))
#ifdef _ARCH_PWR8
__vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#else
__vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#endif
z = vec_xor((__vector unsigned long long) x, z);
// convert uint64 to double
#ifdef __xlC__
__vector double rs = vec_ctd(z, 0);
#else
__vector double rs = vec_ctf(z, 0);
#endif
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
#endif
QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = vec_ctf(ctr[0], 0);
rnd2 = vec_ctf(ctr[1], 0);
rnd3 = vec_ctf(ctr[2], 0);
rnd4 = vec_ctf(ctr[3], 0);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
}
#ifdef __VSX__
QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
#endif
QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
__vector double ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
philox_double2(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
#if defined(__ARM_NEON)
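// NEON implementation: vmull_u32/vmull_high_u32 produce the 64-bit products, and
// vuzp1q/vuzp2q separate their low and high 32-bit halves for the Philox round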
QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key)
{
uint32x4_t lohi0a = vreinterpretq_u32_u64(vmull_u32(vget_low_u32(ctr[0]), vdup_n_u32(PHILOX_M4x32_0)));
uint32x4_t lohi0b = vreinterpretq_u32_u64(vmull_high_u32(ctr[0], vdupq_n_u32(PHILOX_M4x32_0)));
uint32x4_t lohi1a = vreinterpretq_u32_u64(vmull_u32(vget_low_u32(ctr[2]), vdup_n_u32(PHILOX_M4x32_1)));
uint32x4_t lohi1b = vreinterpretq_u32_u64(vmull_high_u32(ctr[2], vdupq_n_u32(PHILOX_M4x32_1)));
uint32x4_t lo0 = vuzp1q_u32(lohi0a, lohi0b);
uint32x4_t lo1 = vuzp1q_u32(lohi1a, lohi1b);
uint32x4_t hi0 = vuzp2q_u32(lohi0a, lohi0b);
uint32x4_t hi1 = vuzp2q_u32(lohi1a, lohi1b);
ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32x4_t* key)
{
key[0] = vaddq_u32(key[0], vdupq_n_u32(PHILOX_W32_0));
key[1] = vaddq_u32(key[1], vdupq_n_u32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS float64x2_t _uniform_double_hq(uint32x4_t x, uint32x4_t y)
{
// convert 32 to 64 bit
if (high)
{
x = vzip2q_u32(x, vdupq_n_u32(0));
y = vzip2q_u32(y, vdupq_n_u32(0));
}
else
{
x = vzip1q_u32(x, vdupq_n_u32(0));
y = vzip1q_u32(y, vdupq_n_u32(0));
}
// calculate z = x ^ (y << (53 - 32))
uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
z = veorq_u64(vreinterpretq_u64_u32(x), z);
// convert uint64 to double
float64x2_t rs = vcvtq_f64_u64(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vfmaq_f64(vdupq_n_f64(TWOPOW53_INV_DOUBLE/2.0), vdupq_n_f64(TWOPOW53_INV_DOUBLE), rs);
return rs;
}
QUALIFIERS void philox_float4(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2, uint32x4_t ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = vcvtq_f32_u32(ctr[0]);
rnd2 = vcvtq_f32_u32(ctr[1]);
rnd3 = vcvtq_f32_u32(ctr[2]);
rnd4 = vcvtq_f32_u32(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd1);
rnd2 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd2);
rnd3 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd3);
rnd4 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd4);
}
QUALIFIERS void philox_double2(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2, uint32x4_t ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1lo, float64x2_t & rnd1hi, float64x2_t & rnd2lo, float64x2_t & rnd2hi)
{
uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
uint32x4_t ctr0v = vdupq_n_u32(ctr0);
uint32x4_t ctr2v = vdupq_n_u32(ctr2);
uint32x4_t ctr3v = vdupq_n_u32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifndef _MSC_VER
QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#endif
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1lo, float64x2_t & rnd1hi, float64x2_t & rnd2lo, float64x2_t & rnd2hi)
{
uint32x4_t ctr0v = vdupq_n_u32(ctr0);
uint32x4_t ctr2v = vdupq_n_u32(ctr2);
uint32x4_t ctr3v = vdupq_n_u32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2)
{
uint32x4_t ctr0v = vdupq_n_u32(ctr0);
uint32x4_t ctr2v = vdupq_n_u32(ctr2);
uint32x4_t ctr3v = vdupq_n_u32(ctr3);
float64x2_t ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
#ifndef _MSC_VER
QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2)
{
philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
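// SVE/SME implementation: counters and keys travel in svuint32x4_t/svuint32x2_t tuples,
// with svmulh_u32_x supplying the high halves of the Philox multiplies directly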
SVE_QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
{
svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
ctr = svset4_u32(ctr, 0, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)), svget2_u32(key, 0)));
ctr = svset4_u32(ctr, 1, lo1);
ctr = svset4_u32(ctr, 2, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)), svget2_u32(key, 1)));
ctr = svset4_u32(ctr, 3, lo0);
}
SVE_QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
{
key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
}
template<bool high>
SVE_QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
{
// convert 32 to 64 bit
if (high)
{
x = svzip2_u32(x, svdup_u32(0));
y = svzip2_u32(y, svdup_u32(0));
}
else
{
x = svzip1_u32(x, svdup_u32(0));
y = svzip1_u32(y, svdup_u32(0));
}
// calculate z = x ^ (y << (53 - 32))
svuint64_t z = svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);
// convert uint64 to double
svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = svmad_f64_x(svptrue_b64(), rs, svdup_f64(TWOPOW53_INV_DOUBLE), svdup_f64(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
SVE_QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = svmad_f32_x(svptrue_b32(), rnd1, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
rnd2 = svmad_f32_x(svptrue_b32(), rnd2, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
rnd3 = svmad_f32_x(svptrue_b32(), rnd3, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
rnd4 = svmad_f32_x(svptrue_b32(), rnd4, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
}
SVE_QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
}
SVE_QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{
svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2);
svuint32_t ctr3v = svdup_u32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
SVE_QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{
philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{
svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2);
svuint32_t ctr3v = svdup_u32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2)
{
svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2);
svuint32_t ctr3v = svdup_u32(ctr3);
svfloat64_st ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
SVE_QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2)
{
philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#if defined(__riscv_v)
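// RISC-V Vector implementation: vmulhu supplies the high product halves; every intrinsic
// runs at the maximum vector length for its element width (vsetvlmax_e32m1/_e64m1)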
QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32m1_t & ctr2, vuint32m1_t & ctr3,
vuint32m1_t key0, vuint32m1_t key1)
{
vuint32m1_t lo0 = vmul_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t hi0 = vmulhu_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t hi1 = vmulhu_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
ctr0 = vxor_vv_u32m1(vxor_vv_u32m1(hi1, ctr1, vsetvlmax_e32m1()), key0, vsetvlmax_e32m1());
ctr1 = lo1;
ctr2 = vxor_vv_u32m1(vxor_vv_u32m1(hi0, ctr3, vsetvlmax_e32m1()), key1, vsetvlmax_e32m1());
ctr3 = lo0;
}
QUALIFIERS void _philox4x32bumpkey(vuint32m1_t & key0, vuint32m1_t & key1)
{
key0 = vadd_vv_u32m1(key0, vmv_v_x_u32m1(PHILOX_W32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
key1 = vadd_vv_u32m1(key1, vmv_v_x_u32m1(PHILOX_W32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
}
template<bool high>
QUALIFIERS vfloat64m1_t _uniform_double_hq(vuint32m1_t x, vuint32m1_t y)
{
// convert 32 to 64 bit
if (high)
{
size_t s = vsetvlmax_e32m1();
x = vslidedown_vx_u32m1(vundefined_u32m1(), x, s/2, s);
y = vslidedown_vx_u32m1(vundefined_u32m1(), y, s/2, s);
}
vuint64m1_t x64 = vwcvtu_x_x_v_u64m1(vlmul_trunc_v_u32m1_u32mf2(x), vsetvlmax_e64m1());
vuint64m1_t y64 = vwcvtu_x_x_v_u64m1(vlmul_trunc_v_u32m1_u32mf2(y), vsetvlmax_e64m1());
// calculate z = x ^ (y << (53 - 32))
vuint64m1_t z = vsll_vx_u64m1(y64, 53 - 32, vsetvlmax_e64m1());
z = vxor_vv_u64m1(x64, z, vsetvlmax_e64m1());
// convert uint64 to double
vfloat64m1_t rs = vfcvt_f_xu_v_f64m1(z, vsetvlmax_e64m1());
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vfmadd_vv_f64m1(rs, vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE, vsetvlmax_e64m1()), vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE/2.0, vsetvlmax_e64m1()), vsetvlmax_e64m1());
return rs;
}
QUALIFIERS void philox_float4(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t ctr2, vuint32m1_t ctr3,
uint32 key0, uint32 key1,
vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
{
vuint32m1_t key0v = vmv_v_x_u32m1(key0, vsetvlmax_e32m1());
vuint32m1_t key1v = vmv_v_x_u32m1(key1, vsetvlmax_e32m1());
_philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 1
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 2
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 3
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 4
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 5
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 6
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 7
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 8
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 9
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 10
// convert uint32 to float
rnd1 = vfcvt_f_xu_v_f32m1(ctr0, vsetvlmax_e32m1());
rnd2 = vfcvt_f_xu_v_f32m1(ctr1, vsetvlmax_e32m1());
rnd3 = vfcvt_f_xu_v_f32m1(ctr2, vsetvlmax_e32m1());
rnd4 = vfcvt_f_xu_v_f32m1(ctr3, vsetvlmax_e32m1());
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vfmadd_vv_f32m1(rnd1, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
rnd2 = vfmadd_vv_f32m1(rnd2, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
rnd3 = vfmadd_vv_f32m1(rnd3, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
rnd4 = vfmadd_vv_f32m1(rnd4, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
}
QUALIFIERS void philox_double2(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t ctr2, vuint32m1_t ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1lo, vfloat64m1_t & rnd1hi, vfloat64m1_t & rnd2lo, vfloat64m1_t & rnd2hi)
{
vuint32m1_t key0v = vmv_v_x_u32m1(key0, vsetvlmax_e32m1());
vuint32m1_t key1v = vmv_v_x_u32m1(key1, vsetvlmax_e32m1());
_philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 1
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 2
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 3
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 4
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 5
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 6
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 7
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 8
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 9
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 10
rnd1lo = _uniform_double_hq<false>(ctr0, ctr1);
rnd1hi = _uniform_double_hq<true>(ctr0, ctr1);
rnd2lo = _uniform_double_hq<false>(ctr2, ctr3);
rnd2hi = _uniform_double_hq<true>(ctr2, ctr3);
}
QUALIFIERS void philox_float4(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
{
vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32 ctr0, vint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
{
philox_float4(ctr0, vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1lo, vfloat64m1_t & rnd1hi, vfloat64m1_t & rnd2lo, vfloat64m1_t & rnd2hi)
{
vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1, vfloat64m1_t & rnd2)
{
vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
vfloat64m1_t ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
QUALIFIERS void philox_double2(uint32 ctr0, vint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1, vfloat64m1_t & rnd2)
{
philox_double2(ctr0, vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#ifdef __AVX2__
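// AVX2 implementation: _mm256_mul_epu32 only multiplies the even 32-bit lanes, so the odd
// lanes are shifted down first and the results re-interleaved with shuffle/unpack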
QUALIFIERS void _philox4x32round(__m256i* ctr, __m256i* key)
{
__m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
__m256i lohi0b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[0], 32), _mm256_set1_epi32(PHILOX_M4x32_0));
__m256i lohi1a = _mm256_mul_epu32(ctr[2], _mm256_set1_epi32(PHILOX_M4x32_1));
__m256i lohi1b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[2], 32), _mm256_set1_epi32(PHILOX_M4x32_1));
lohi0a = _mm256_shuffle_epi32(lohi0a, 0xD8);
lohi0b = _mm256_shuffle_epi32(lohi0b, 0xD8);
lohi1a = _mm256_shuffle_epi32(lohi1a, 0xD8);
lohi1b = _mm256_shuffle_epi32(lohi1b, 0xD8);
__m256i lo0 = _mm256_unpacklo_epi32(lohi0a, lohi0b);
__m256i hi0 = _mm256_unpackhi_epi32(lohi0a, lohi0b);
__m256i lo1 = _mm256_unpacklo_epi32(lohi1a, lohi1b);
__m256i hi1 = _mm256_unpackhi_epi32(lohi1a, lohi1b);
ctr[0] = _mm256_xor_si256(_mm256_xor_si256(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = _mm256_xor_si256(_mm256_xor_si256(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__m256i* key)
{
key[0] = _mm256_add_epi32(key[0], _mm256_set1_epi32(PHILOX_W32_0));
key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
}
else
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm256_xor_si256(x, z);
// convert uint64 to double
__m256d rs = _my256_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void philox_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = _my256_cvtepu32_ps(ctr[0]);
rnd2 = _my256_cvtepu32_ps(ctr[1]);
rnd3 = _my256_cvtepu32_ps(ctr[2]);
rnd4 = _my256_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void philox_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m256d & rnd1, __m256d & rnd2)
{
#if 0
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
__m256d ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
__m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
philox_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
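// AVX-512 implementation: same even/odd lane multiply trick as AVX2 on 512-bit registers;
// unsigned 32->float and 64->double conversions are available as native instructions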
QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key)
{
__m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
__m512i lohi0b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[0], 32), _mm512_set1_epi32(PHILOX_M4x32_0));
__m512i lohi1a = _mm512_mul_epu32(ctr[2], _mm512_set1_epi32(PHILOX_M4x32_1));
__m512i lohi1b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[2], 32), _mm512_set1_epi32(PHILOX_M4x32_1));
lohi0a = _mm512_shuffle_epi32(lohi0a, _MM_PERM_DBCA);
lohi0b = _mm512_shuffle_epi32(lohi0b, _MM_PERM_DBCA);
lohi1a = _mm512_shuffle_epi32(lohi1a, _MM_PERM_DBCA);
lohi1b = _mm512_shuffle_epi32(lohi1b, _MM_PERM_DBCA);
__m512i lo0 = _mm512_unpacklo_epi32(lohi0a, lohi0b);
__m512i hi0 = _mm512_unpackhi_epi32(lohi0a, lohi0b);
__m512i lo1 = _mm512_unpacklo_epi32(lohi1a, lohi1b);
__m512i hi1 = _mm512_unpackhi_epi32(lohi1a, lohi1b);
ctr[0] = _mm512_xor_si512(_mm512_xor_si512(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = _mm512_xor_si512(_mm512_xor_si512(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__m512i* key)
{
key[0] = _mm512_add_epi32(key[0], _mm512_set1_epi32(PHILOX_W32_0));
key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
}
else
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm512_xor_si512(x, z);
// convert uint64 to double
__m512d rs = _mm512_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
QUALIFIERS void philox_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = _mm512_cvtepu32_ps(ctr[0]);
rnd2 = _mm512_cvtepu32_ps(ctr[1]);
rnd3 = _mm512_cvtepu32_ps(ctr[2]);
rnd4 = _mm512_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
}
QUALIFIERS void philox_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m512d & rnd1, __m512d & rnd2)
{
#if 0
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
__m512d ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
__m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
philox_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
#endif
/*
Copyright 2021, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <altivec.h>
#undef vector
#undef bool
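// cachelineZero: zero the cache line containing p using the PowerPC dcbz instruction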
inline void cachelineZero(void * p) {
#ifdef __xlC__
__dcbz(p);
#else
__asm__ volatile("dcbz 0, %0"::"r"(p):"memory");
#endif
}
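// _cachelineSize: probe the cache-line size by filling a buffer with 0xff, zeroing one
// aligned line via cachelineZero and counting how many bytes were cleared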
inline size_t _cachelineSize() {
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
/*
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
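// RISC-V version: cbo.zero from the Zicboz extension zeroes one cache block; without
// Zicboz, cachelineZero is a no-op and cachelineSize() reports SIZE_MAX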
inline void cachelineZero(void * p) {
#ifdef __riscv_zicboz
__asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
#ifdef __riscv_zicboz
static size_t size = _cachelineSize();
return size;
#else
return SIZE_MAX;
#endif
}