Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 2952 additions and 221 deletions
import numpy as np
import sympy as sp
from sympy.codegen.ast import Assignment, AugmentedAssignment, AddAugmentedAssignment
from sympy.printing.latex import LatexPrinter
__all__ = ['Assignment', 'AugmentedAssignment', 'AddAugmentedAssignment', 'assignment_from_stencil']
def print_assignment_latex(printer, expr):
binop = f"{expr.binop}=" if isinstance(expr, AugmentedAssignment) else ''
"""sympy cannot print Assignments as Latex. Thus, this function is added to the sympy Latex printer"""
printed_lhs = printer.doprint(expr.lhs)
printed_rhs = printer.doprint(expr.rhs)
return fr"{printed_lhs} \leftarrow_{{{binop}}} {printed_rhs}"
def assignment_str(assignment):
op = f"{assignment.binop}=" if isinstance(assignment, AugmentedAssignment) else ''
return fr"{assignment.lhs} {op} {assignment.rhs}"
_old_new = sp.codegen.ast.Assignment.__new__
# TODO Typing Part2 add default type, defult_float_type, default_int_type and use sane defaults
def _Assignment__new__(cls, lhs, rhs, *args, **kwargs):
if isinstance(lhs, (list, tuple, sp.Matrix)) and isinstance(rhs, (list, tuple, sp.Matrix)):
assert len(lhs) == len(rhs), f'{lhs} and {rhs} must have same length when performing vector assignment!'
return tuple(_old_new(cls, a, b, *args, **kwargs) for a, b in zip(lhs, rhs))
return _old_new(cls, lhs, rhs, *args, **kwargs)
Assignment.__str__ = assignment_str
Assignment.__new__ = _Assignment__new__
LatexPrinter._print_Assignment = print_assignment_latex
AugmentedAssignment.__str__ = assignment_str
LatexPrinter._print_AugmentedAssignment = print_assignment_latex
sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self))
def assignment_from_stencil(stencil_array, input_field, output_field,
normalization_factor=None, order='visual') -> Assignment:
"""Creates an assignment
Args:
stencil_array: nested list of numpy array defining the stencil weights
input_field: field or field access, defining where the stencil should be applied to
output_field: field or field access where the result is written to
normalization_factor: optional normalization factor for the stencil
order: defines how the stencil_array is interpreted. Possible values are 'visual' and 'numpy'.
For details see examples
Returns:
Assignment that can be used to create a kernel
Examples:
>>> import pystencils as ps
>>> f, g = ps.fields("f, g: [2D]")
>>> stencil = [[0, 2, 0],
... [3, 4, 5],
... [0, 6, 0]]
By default 'visual ordering is used - i.e. the stencil is applied as the nested lists are written down
>>> expected_output = Assignment(g[0, 0], 3*f[-1, 0] + 6*f[0, -1] + 4*f[0, 0] + 2*f[0, 1] + 5*f[1, 0])
>>> assignment_from_stencil(stencil, f, g, order='visual') == expected_output
True
'numpy' ordering uses the first coordinate of the stencil array for x offset, second for y offset etc.
>>> expected_output = Assignment(g[0, 0], 2*f[-1, 0] + 3*f[0, -1] + 4*f[0, 0] + 5*f[0, 1] + 6*f[1, 0])
>>> assignment_from_stencil(stencil, f, g, order='numpy') == expected_output
True
You can also pass field accesses to apply the stencil at an already shifted position:
>>> expected_output = Assignment(g[2, 0], 3*f[0, 0] + 6*f[1, -1] + 4*f[1, 0] + 2*f[1, 1] + 5*f[2, 0])
>>> assignment_from_stencil(stencil, f[1, 0], g[2, 0]) == expected_output
True
"""
from pystencils.field import Field
stencil_array = np.array(stencil_array)
if order == 'visual':
stencil_array = np.swapaxes(stencil_array, 0, 1)
stencil_array = np.flip(stencil_array, axis=1)
elif order == 'numpy':
pass
else:
raise ValueError("'order' has to be either 'visual' or 'numpy'")
if isinstance(input_field, Field):
input_field = input_field.center
if isinstance(output_field, Field):
output_field = output_field.center
rhs = 0
offset = tuple(s // 2 for s in stencil_array.shape)
for index, factor in np.ndenumerate(stencil_array):
shift = tuple(i - o for i, o in zip(index, offset))
rhs += factor * input_field.get_shifted(*shift)
if normalization_factor:
rhs *= normalization_factor
return Assignment(output_field, rhs)
import collections.abc
import itertools
import uuid
from typing import Any, List, Optional, Sequence, Set, Union
import sympy as sp import sympy as sp
from sympy.tensor import IndexedBase
from pystencils.assignment import Assignment
from pystencils.enums import Target, Backend
from pystencils.field import Field from pystencils.field import Field
from pystencils.data_types import TypedSymbol, create_type, cast_func
from pystencils.sympyextensions import fast_subs from pystencils.sympyextensions import fast_subs
from typing import List, Set, Optional, Union, Any from pystencils.typing import (create_type, get_next_parent_of_type,
FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol, CFunction)
NodeOrExpr = Union['Node', sp.Expr] NodeOrExpr = Union['Node', sp.Expr]
...@@ -30,9 +37,13 @@ class Node: ...@@ -30,9 +37,13 @@ class Node:
raise NotImplementedError() raise NotImplementedError()
def subs(self, subs_dict) -> None: def subs(self, subs_dict) -> None:
"""Inplace! substitute, similar to sympy's but modifies the AST inplace.""" """Inplace! Substitute, similar to sympy's but modifies the AST inplace."""
for a in self.args: for i, a in enumerate(self.args):
a.subs(subs_dict) result = a.subs(subs_dict)
if isinstance(a, sp.Expr): # sympy expressions' subs is out-of-place
self.args[i] = result
else: # all other should be in-place
assert result is None
@property @property
def func(self): def func(self):
...@@ -64,7 +75,6 @@ class Conditional(Node): ...@@ -64,7 +75,6 @@ class Conditional(Node):
false_block: Optional['Block'] = None) -> None: false_block: Optional['Block'] = None) -> None:
super(Conditional, self).__init__(parent=None) super(Conditional, self).__init__(parent=None)
assert condition_expr.is_Boolean or condition_expr.is_Relational
self.condition_expr = condition_expr self.condition_expr = condition_expr
def handle_child(c): def handle_child(c):
...@@ -100,14 +110,22 @@ class Conditional(Node): ...@@ -100,14 +110,22 @@ class Conditional(Node):
result = self.true_block.undefined_symbols result = self.true_block.undefined_symbols
if self.false_block: if self.false_block:
result.update(self.false_block.undefined_symbols) result.update(self.false_block.undefined_symbols)
result.update(self.condition_expr.atoms(sp.Symbol)) if hasattr(self.condition_expr, 'atoms'):
result.update(self.condition_expr.atoms(sp.Symbol))
return result return result
def __str__(self): def __str__(self):
return 'if:({!s}) '.format(self.condition_expr) return self.__repr__()
def __repr__(self): def __repr__(self):
return 'if:({!r}) '.format(self.condition_expr) result = f'if:({self.condition_expr!r}) '
if self.true_block:
result += f'\n\t{self.true_block}) '
if self.false_block:
result = 'else: '
result += f'\n\t{self.false_block} '
return result
def replace_by_true_block(self): def replace_by_true_block(self):
"""Replaces the conditional by its True block""" """Replaces the conditional by its True block"""
...@@ -119,71 +137,76 @@ class Conditional(Node): ...@@ -119,71 +137,76 @@ class Conditional(Node):
class KernelFunction(Node): class KernelFunction(Node):
class Parameter:
"""Function parameter.
class Argument: Each undefined symbol in a `KernelFunction` node becomes a parameter to the function.
def __init__(self, name, dtype, symbol, kernel_function_node): Parameters are either symbols introduced by the user that never occur on the left hand side of an
from pystencils.transformations import symbol_name_to_variable_name Assignment, or are related to fields/arrays passed to the function.
self.name = name
self.dtype = dtype A parameter consists of the typed symbol (symbol property). For field related parameters this is a symbol
self.is_field_ptr_argument = False defined in pystencils.kernelparameters.
self.is_field_shape_argument = False If the parameter is related to one or multiple fields, these fields are referenced in the fields property.
self.is_field_stride_argument = False """
self.is_field_argument = False
self.field_name = "" def __init__(self, symbol, fields):
self.coordinate = None self.symbol = symbol # type: TypedSymbol
self.symbol = symbol self.fields = fields # type: Sequence[Field]
if name.startswith(Field.DATA_PREFIX):
self.is_field_ptr_argument = True
self.is_field_argument = True
self.field_name = name[len(Field.DATA_PREFIX):]
elif name.startswith(Field.SHAPE_PREFIX):
self.is_field_shape_argument = True
self.is_field_argument = True
self.field_name = name[len(Field.SHAPE_PREFIX):]
elif name.startswith(Field.STRIDE_PREFIX):
self.is_field_stride_argument = True
self.is_field_argument = True
self.field_name = name[len(Field.STRIDE_PREFIX):]
self.field = None
if self.is_field_argument:
field_map = {symbol_name_to_variable_name(f.name): f for f in kernel_function_node.fields_accessed}
self.field = field_map[self.field_name]
def __lt__(self, other):
def score(l):
if l.is_field_ptr_argument:
return -4
elif l.is_field_shape_argument:
return -3
elif l.is_field_stride_argument:
return -2
return 0
if score(self) < score(other):
return True
elif score(self) == score(other):
return self.name < other.name
else:
return False
def __repr__(self): def __repr__(self):
return '<{0} {1}>'.format(self.dtype, self.name) return repr(self.symbol)
@property
def is_field_stride(self):
return isinstance(self.symbol, FieldStrideSymbol)
@property
def is_field_shape(self):
return isinstance(self.symbol, FieldShapeSymbol)
@property
def is_field_pointer(self):
return isinstance(self.symbol, FieldPointerSymbol)
def __init__(self, body, ghost_layers=None, function_name="kernel", backend=""): @property
def is_field_parameter(self):
return self.is_field_pointer or self.is_field_shape or self.is_field_stride
@property
def field_name(self):
return self.fields[0].name
def __init__(self, body, target: Target, backend: Backend, compile_function, ghost_layers,
function_name: str = "kernel",
assignments=None):
super(KernelFunction, self).__init__() super(KernelFunction, self).__init__()
self._body = body self._body = body
body.parent = self body.parent = self
self._parameters = None
self.function_name = function_name self.function_name = function_name
self._body.parent = self self._body.parent = self
self.compile = None
self.ghost_layers = ghost_layers self.ghost_layers = ghost_layers
self._target = target
self._backend = backend
# these variables are assumed to be global, so no automatic parameter is generated for them # these variables are assumed to be global, so no automatic parameter is generated for them
self.global_variables = set() self.global_variables = set()
self.backend = backend
self.instruction_set = None # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use self.instruction_set = None # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use
# function that compiles the node to a Python callable, is set by the backends
self._compile_function = compile_function
self.assignments = assignments
# If nontemporal stores are activated together with the Neon instruction set it results in cacheline zeroing
# For cacheline zeroing the information of the field size for each field is needed. Thus, in this case
# all field sizes are kernel parameters and not just the common field size used for the loops
self.use_all_written_field_sizes = False
@property
def target(self):
"""See pystencils.Target"""
return self._target
@property
def backend(self):
"""Backend for generating the code: `Backend`"""
return self._backend
@property @property
def symbols_defined(self): def symbols_defined(self):
...@@ -193,81 +216,149 @@ class KernelFunction(Node): ...@@ -193,81 +216,149 @@ class KernelFunction(Node):
def undefined_symbols(self): def undefined_symbols(self):
return set() return set()
@property
def parameters(self):
self._update_parameters()
return self._parameters
@property @property
def body(self): def body(self):
return self._body return self._body
@body.setter
def body(self, value):
self._body = value
self._body.parent = self
@property @property
def args(self): def args(self):
return [self._body] return self._body,
@property @property
def fields_accessed(self): def fields_accessed(self) -> Set[Field]:
"""Set of Field instances: fields which are accessed inside this kernel function""" """Set of Field instances: fields which are accessed inside this kernel function"""
return set(o.field for o in self.atoms(ResolvedFieldAccess)) return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess)))
def _update_parameters(self): @property
undefined_symbols = self._body.undefined_symbols - self.global_variables def fields_written(self) -> Set[Field]:
self._parameters = [KernelFunction.Argument(s.name, s.dtype, s, self) for s in undefined_symbols] assignments = self.atoms(SympyAssignment)
return set().union(itertools.chain.from_iterable([f.field for f in a.lhs.free_symbols if hasattr(f, 'field')]
self._parameters.sort() for a in assignments))
@property
def fields_read(self) -> Set[Field]:
assignments = self.atoms(SympyAssignment)
return set().union(itertools.chain.from_iterable([f.field for f in a.rhs.free_symbols if hasattr(f, 'field')]
for a in assignments))
def get_parameters(self) -> Sequence['KernelFunction.Parameter']:
"""Returns list of parameters for this function.
This function is expensive, cache the result where possible!
"""
field_map = {f.name: f for f in self.fields_accessed}
sizes = set()
if self.use_all_written_field_sizes:
sizes = set().union(*(a.shape[:a.spatial_dimensions] for a in self.fields_written))
sizes = filter(lambda s: isinstance(s, FieldShapeSymbol), sizes)
def get_fields(symbol):
if hasattr(symbol, 'field_name'):
return field_map[symbol.field_name],
elif hasattr(symbol, 'field_names'):
return tuple(field_map[fn] for fn in symbol.field_names)
return ()
argument_symbols = self._body.undefined_symbols - self.global_variables
argument_symbols.update(sizes)
parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
if hasattr(self, 'indexing'):
parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
# Exclude paramters of type CFunction. These parameters will result in a C function call that will be handled
# by including a respective header file in the compute kernel. Hence, it is not a free parameter.
parameters = [p for p in parameters if not isinstance(p.symbol, CFunction)]
parameters.sort(key=lambda p: p.symbol.name)
return parameters
def __str__(self): def __str__(self):
self._update_parameters() params = [p.symbol for p in self.get_parameters()]
return '{0} {1}({2})\n{3}'.format(type(self).__name__, self.function_name, self.parameters, return '{0} {1}({2})\n{3}'.format(type(self).__name__, self.function_name, params,
("\t" + "\t".join(str(self.body).splitlines(True)))) ("\t" + "\t".join(str(self.body).splitlines(True))))
def __repr__(self): def __repr__(self):
self._update_parameters() params = [p.symbol for p in self.get_parameters()]
return '{0} {1}({2})'.format(type(self).__name__, self.function_name, self.parameters) return f'{type(self).__name__} {self.function_name}({params})'
def compile(self, *args, **kwargs):
if self._compile_function is None:
raise ValueError("No compile-function provided for this KernelFunction node")
return self._compile_function(self, *args, **kwargs)
class SkipIteration(Node):
@property
def args(self):
return []
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
class Block(Node): class Block(Node):
def __init__(self, nodes: List[Node]): def __init__(self, nodes: Union[Node, List[Node]]):
super(Block, self).__init__() super(Block, self).__init__()
if not isinstance(nodes, list):
nodes = [nodes]
self._nodes = nodes self._nodes = nodes
self.parent = None self.parent = None
for n in self._nodes: for n in self._nodes:
n.parent = self try:
n.parent = self
except AttributeError:
pass
@property @property
def args(self): def args(self):
return self._nodes return self._nodes
def subs(self, subs_dict) -> None: def subs(self, subs_dict) -> None:
new_args = []
for a in self.args:
if isinstance(a, SympyAssignment) and a.is_declaration and a.rhs in subs_dict.keys():
subs_dict[a.lhs] = subs_dict[a.rhs]
else:
new_args.append(a)
self._nodes = new_args
for a in self.args: for a in self.args:
a.subs(subs_dict) a.subs(subs_dict)
def insert_front(self, node): def fast_subs(self, subs_dict, skip=None):
node.parent = self self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes]
self._nodes.insert(0, node) return self
def insert_front(self, node, if_not_exists=False):
if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node:
return
if isinstance(node, collections.abc.Iterable):
node = list(node)
for n in node:
n.parent = self
self._nodes = node + self._nodes
else:
node.parent = self
self._nodes.insert(0, node)
def insert_before(self, new_node, insert_before): def insert_before(self, new_node, insert_before, if_not_exists=False):
new_node.parent = self new_node.parent = self
assert self._nodes.count(insert_before) == 1
idx = self._nodes.index(insert_before) idx = self._nodes.index(insert_before)
# move all assignment (definitions to the top) if not if_not_exists or self._nodes[idx] != new_node:
if isinstance(new_node, SympyAssignment) and new_node.is_declaration: self._nodes.insert(idx, new_node)
while idx > 0:
pn = self._nodes[idx - 1] def insert_after(self, new_node, insert_after, if_not_exists=False):
if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional): new_node.parent = self
idx -= 1 assert self._nodes.count(insert_after) == 1
else: idx = self._nodes.index(insert_after) + 1
break
self._nodes.insert(idx, new_node) if not if_not_exists or not (self._nodes[idx - 1] == new_node
or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
self._nodes.insert(idx, new_node)
def append(self, node): def append(self, node):
if isinstance(node, list) or isinstance(node, tuple): if isinstance(node, list) or isinstance(node, tuple):
...@@ -284,6 +375,7 @@ class Block(Node): ...@@ -284,6 +375,7 @@ class Block(Node):
return tmp return tmp
def replace(self, child, replacements): def replace(self, child, replacements):
assert self._nodes.count(child) == 1
idx = self._nodes.index(child) idx = self._nodes.index(child)
del self._nodes[idx] del self._nodes[idx]
if type(replacements) is list: if type(replacements) is list:
...@@ -298,7 +390,10 @@ class Block(Node): ...@@ -298,7 +390,10 @@ class Block(Node):
def symbols_defined(self): def symbols_defined(self):
result = set() result = set()
for a in self.args: for a in self.args:
result.update(a.symbols_defined) if isinstance(a, Assignment):
result.update(a.free_symbols)
else:
result.update(a.symbols_defined)
return result return result
@property @property
...@@ -306,8 +401,12 @@ class Block(Node): ...@@ -306,8 +401,12 @@ class Block(Node):
result = set() result = set()
defined_symbols = set() defined_symbols = set()
for a in self.args: for a in self.args:
result.update(a.undefined_symbols) if isinstance(a, Assignment):
defined_symbols.update(a.symbols_defined) result.update(a.free_symbols)
defined_symbols.update({a.lhs})
else:
result.update(a.undefined_symbols)
defined_symbols.update(a.symbols_defined)
return result - defined_symbols return result - defined_symbols
def __str__(self): def __str__(self):
...@@ -330,8 +429,9 @@ class PragmaBlock(Block): ...@@ -330,8 +429,9 @@ class PragmaBlock(Block):
class LoopOverCoordinate(Node): class LoopOverCoordinate(Node):
LOOP_COUNTER_NAME_PREFIX = "ctr" LOOP_COUNTER_NAME_PREFIX = "ctr"
BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
def __init__(self, body, coordinate_to_loop_over, start, stop, step=1): def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False, custom_loop_ctr=None):
super(LoopOverCoordinate, self).__init__(parent=None) super(LoopOverCoordinate, self).__init__(parent=None)
self.body = body self.body = body
body.parent = self body.parent = self
...@@ -341,10 +441,13 @@ class LoopOverCoordinate(Node): ...@@ -341,10 +441,13 @@ class LoopOverCoordinate(Node):
self.step = step self.step = step
self.body.parent = self self.body.parent = self
self.prefix_lines = [] self.prefix_lines = []
self.is_block_loop = is_block_loop
self.custom_loop_ctr = custom_loop_ctr
def new_loop_with_different_body(self, new_body): def new_loop_with_different_body(self, new_body):
result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, self.step) result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
result.prefix_lines = [l for l in self.prefix_lines] self.step, self.is_block_loop, self.custom_loop_ctr)
result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines]
return result return result
def subs(self, subs_dict): def subs(self, subs_dict):
...@@ -356,6 +459,16 @@ class LoopOverCoordinate(Node): ...@@ -356,6 +459,16 @@ class LoopOverCoordinate(Node):
if hasattr(self.step, "subs"): if hasattr(self.step, "subs"):
self.step = self.step.subs(subs_dict) self.step = self.step.subs(subs_dict)
def fast_subs(self, subs_dict, skip=None):
self.body = fast_subs(self.body, subs_dict, skip)
if isinstance(self.start, sp.Basic):
self.start = fast_subs(self.start, subs_dict, skip)
if isinstance(self.stop, sp.Basic):
self.stop = fast_subs(self.stop, subs_dict, skip)
if isinstance(self.step, sp.Basic):
self.step = fast_subs(self.step, subs_dict, skip)
return self
@property @property
def args(self): def args(self):
result = [self.body] result = [self.body]
...@@ -388,11 +501,21 @@ class LoopOverCoordinate(Node): ...@@ -388,11 +501,21 @@ class LoopOverCoordinate(Node):
@staticmethod @staticmethod
def get_loop_counter_name(coordinate_to_loop_over): def get_loop_counter_name(coordinate_to_loop_over):
return "%s_%s" % (LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX, coordinate_to_loop_over) return f"{LoopOverCoordinate.LOOP_COUNTER_NAME_PREFIX}_{coordinate_to_loop_over}"
@staticmethod
def get_block_loop_counter_name(coordinate_to_loop_over):
return f"{LoopOverCoordinate.BLOCK_LOOP_COUNTER_NAME_PREFIX}_{coordinate_to_loop_over}"
@property @property
def loop_counter_name(self): def loop_counter_name(self):
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over) if self.custom_loop_ctr:
return self.custom_loop_ctr.name
else:
if self.is_block_loop:
return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
else:
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
@staticmethod @staticmethod
def is_loop_counter_symbol(symbol): def is_loop_counter_symbol(symbol):
...@@ -406,15 +529,26 @@ class LoopOverCoordinate(Node): ...@@ -406,15 +529,26 @@ class LoopOverCoordinate(Node):
@staticmethod @staticmethod
def get_loop_counter_symbol(coordinate_to_loop_over): def get_loop_counter_symbol(coordinate_to_loop_over):
return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int') return TypedSymbol(LoopOverCoordinate.get_loop_counter_name(coordinate_to_loop_over), 'int', nonnegative=True)
@staticmethod
def get_block_loop_counter_symbol(coordinate_to_loop_over):
return TypedSymbol(LoopOverCoordinate.get_block_loop_counter_name(coordinate_to_loop_over),
'int',
nonnegative=True)
@property @property
def loop_counter_symbol(self): def loop_counter_symbol(self):
return LoopOverCoordinate.get_loop_counter_symbol(self.coordinate_to_loop_over) if self.custom_loop_ctr:
return self.custom_loop_ctr
else:
if self.is_block_loop:
return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
else:
return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
@property @property
def is_outermost_loop(self): def is_outermost_loop(self):
from pystencils.transformations import get_next_parent_of_type
return get_next_parent_of_type(self, LoopOverCoordinate) is None return get_next_parent_of_type(self, LoopOverCoordinate) is None
@property @property
...@@ -434,15 +568,17 @@ class LoopOverCoordinate(Node): ...@@ -434,15 +568,17 @@ class LoopOverCoordinate(Node):
class SympyAssignment(Node): class SympyAssignment(Node):
def __init__(self, lhs_symbol, rhs_expr, is_const=True): def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False):
super(SympyAssignment, self).__init__(parent=None) super(SympyAssignment, self).__init__(parent=None)
self._lhs_symbol = lhs_symbol self._lhs_symbol = sp.sympify(lhs_symbol)
self.rhs = rhs_expr self._rhs = sp.sympify(rhs_expr)
self._is_const = is_const self._is_const = is_const
self._is_declaration = self.__is_declaration() self._is_declaration = self.__is_declaration()
self._use_auto = use_auto
def __is_declaration(self): def __is_declaration(self):
if isinstance(self._lhs_symbol, cast_func): from pystencils.typing import CastFunc
if isinstance(self._lhs_symbol, CastFunc):
return False return False
if any(isinstance(self._lhs_symbol, c) for c in (Field.Access, sp.Indexed, TemporaryMemoryAllocation)): if any(isinstance(self._lhs_symbol, c) for c in (Field.Access, sp.Indexed, TemporaryMemoryAllocation)):
return False return False
...@@ -452,15 +588,35 @@ class SympyAssignment(Node): ...@@ -452,15 +588,35 @@ class SympyAssignment(Node):
def lhs(self): def lhs(self):
return self._lhs_symbol return self._lhs_symbol
@property
def rhs(self):
return self._rhs
@lhs.setter @lhs.setter
def lhs(self, new_value): def lhs(self, new_value):
self._lhs_symbol = new_value self._lhs_symbol = new_value
self._is_declaration = self.__is_declaration() self._is_declaration = self.__is_declaration()
@rhs.setter
def rhs(self, new_rhs_expr):
self._rhs = new_rhs_expr
def subs(self, subs_dict): def subs(self, subs_dict):
self.lhs = fast_subs(self.lhs, subs_dict) self.lhs = fast_subs(self.lhs, subs_dict)
self.rhs = fast_subs(self.rhs, subs_dict) self.rhs = fast_subs(self.rhs, subs_dict)
def fast_subs(self, subs_dict, skip=None):
self.lhs = fast_subs(self.lhs, subs_dict, skip)
self.rhs = fast_subs(self.rhs, subs_dict, skip)
return self
def optimize(self, optimizations):
try:
from sympy.codegen.rewriting import optimize
self.rhs = optimize(self.rhs, optimizations)
except Exception:
pass
@property @property
def args(self): def args(self):
return [self._lhs_symbol, self.rhs] return [self._lhs_symbol, self.rhs]
...@@ -473,7 +629,7 @@ class SympyAssignment(Node): ...@@ -473,7 +629,7 @@ class SympyAssignment(Node):
@property @property
def undefined_symbols(self): def undefined_symbols(self):
result = self.rhs.atoms(sp.Symbol) result = {s for s in self.rhs.free_symbols if not isinstance(s, sp.Indexed)}
# Add loop counters if there a field accesses # Add loop counters if there a field accesses
loop_counters = set() loop_counters = set()
for symbol in result: for symbol in result:
...@@ -481,7 +637,9 @@ class SympyAssignment(Node): ...@@ -481,7 +637,9 @@ class SympyAssignment(Node):
for i in range(len(symbol.offsets)): for i in range(len(symbol.offsets)):
loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i)) loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i))
result.update(loop_counters) result.update(loop_counters)
result.update(self._lhs_symbol.atoms(sp.Symbol)) result.update(self._lhs_symbol.atoms(sp.Symbol))
return result return result
@property @property
...@@ -492,6 +650,10 @@ class SympyAssignment(Node): ...@@ -492,6 +650,10 @@ class SympyAssignment(Node):
def is_const(self): def is_const(self):
return self._is_const return self._is_const
@property
def use_auto(self):
return self._use_auto
def replace(self, child, replacement): def replace(self, child, replacement):
if child == self.lhs: if child == self.lhs:
replacement.parent = self replacement.parent = self
...@@ -500,7 +662,7 @@ class SympyAssignment(Node): ...@@ -500,7 +662,7 @@ class SympyAssignment(Node):
replacement.parent = self replacement.parent = self
self.rhs = replacement self.rhs = replacement
else: else:
raise ValueError('%s is not in args of %s' % (replacement, self.__class__)) raise ValueError(f'{replacement} is not in args of {self.__class__}')
def __repr__(self): def __repr__(self):
return repr(self.lhs) + "" + repr(self.rhs) return repr(self.lhs) + "" + repr(self.rhs)
...@@ -508,13 +670,21 @@ class SympyAssignment(Node): ...@@ -508,13 +670,21 @@ class SympyAssignment(Node):
def _repr_html_(self): def _repr_html_(self):
printed_lhs = sp.latex(self.lhs) printed_lhs = sp.latex(self.lhs)
printed_rhs = sp.latex(self.rhs) printed_rhs = sp.latex(self.rhs)
return "${printed_lhs} \leftarrow {printed_rhs}$".format(printed_lhs=printed_lhs, printed_rhs=printed_rhs) return f"${printed_lhs} \\leftarrow {printed_rhs}$"
def __hash__(self):
return hash((self.lhs, self.rhs))
def __eq__(self, other):
return type(self) is type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
class ResolvedFieldAccess(sp.Indexed): class ResolvedFieldAccess(sp.Indexed):
def __new__(cls, base, linearized_index, field, offsets, idx_coordinate_values): def __new__(cls, base, linearized_index, field, offsets, idx_coordinate_values):
if not isinstance(base, IndexedBase): if not isinstance(base, sp.IndexedBase):
base = IndexedBase(base, shape=(1,)) assert isinstance(base, TypedSymbol)
base = sp.IndexedBase(base, shape=(1,))
assert isinstance(base.label, TypedSymbol)
obj = super(ResolvedFieldAccess, cls).__new__(cls, base, linearized_index) obj = super(ResolvedFieldAccess, cls).__new__(cls, base, linearized_index)
obj.field = field obj.field = field
obj.offsets = offsets obj.offsets = offsets
...@@ -526,7 +696,7 @@ class ResolvedFieldAccess(sp.Indexed): ...@@ -526,7 +696,7 @@ class ResolvedFieldAccess(sp.Indexed):
self.args[1].subs(old, new), self.args[1].subs(old, new),
self.field, self.offsets, self.idx_coordinate_values) self.field, self.offsets, self.idx_coordinate_values)
def fast_subs(self, substitutions): def fast_subs(self, substitutions, skip=None):
if self in substitutions: if self in substitutions:
return substitutions[self] return substitutions[self]
return ResolvedFieldAccess(self.args[0].subs(substitutions), return ResolvedFieldAccess(self.args[0].subs(substitutions),
...@@ -543,11 +713,14 @@ class ResolvedFieldAccess(sp.Indexed): ...@@ -543,11 +713,14 @@ class ResolvedFieldAccess(sp.Indexed):
def __str__(self): def __str__(self):
top = super(ResolvedFieldAccess, self).__str__() top = super(ResolvedFieldAccess, self).__str__()
return "%s (%s)" % (top, self.typed_symbol.dtype) return f"{top} ({self.typed_symbol.dtype})"
def __getnewargs__(self): def __getnewargs__(self):
return self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values return self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values
def __getnewargs_ex__(self):
return (self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values), {}
class TemporaryMemoryAllocation(Node): class TemporaryMemoryAllocation(Node):
"""Node for temporary memory buffer allocation. """Node for temporary memory buffer allocation.
...@@ -559,6 +732,7 @@ class TemporaryMemoryAllocation(Node): ...@@ -559,6 +732,7 @@ class TemporaryMemoryAllocation(Node):
size: number of elements to allocate size: number of elements to allocate
align_offset: the align_offset's element is aligned align_offset: the align_offset's element is aligned
""" """
def __init__(self, typed_symbol: TypedSymbol, size, align_offset): def __init__(self, typed_symbol: TypedSymbol, size, align_offset):
super(TemporaryMemoryAllocation, self).__init__(parent=None) super(TemporaryMemoryAllocation, self).__init__(parent=None)
self.symbol = typed_symbol self.symbol = typed_symbol
...@@ -611,3 +785,86 @@ class TemporaryMemoryFree(Node): ...@@ -611,3 +785,86 @@ class TemporaryMemoryFree(Node):
@property @property
def args(self): def args(self):
return [] return []
def early_out(condition):
from pystencils.cpu.vectorization import vec_all
return Conditional(vec_all(condition), Block([SkipIteration()]))
def get_dummy_symbol(dtype='bool'):
return TypedSymbol(f'dummy{uuid.uuid4().hex}', create_type(dtype))
class SourceCodeComment(Node):
def __init__(self, text):
self.text = text
@property
def args(self):
return []
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
def __str__(self):
return "/* " + self.text + " */"
def __repr__(self):
return self.__str__()
class EmptyLine(Node):
def __init__(self):
pass
@property
def args(self):
return []
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
def __str__(self):
return ""
def __repr__(self):
return self.__str__()
class ConditionalFieldAccess(sp.Function):
"""
:class:`pystencils.Field.Access` that is only executed if a certain condition is met.
Can be used, for instance, for out-of-bound checks.
"""
def __new__(cls, field_access, outofbounds_condition, outofbounds_value=0):
return sp.Function.__new__(cls, field_access, outofbounds_condition, sp.S(outofbounds_value))
@property
def access(self):
return self.args[0]
@property
def outofbounds_condition(self):
return self.args[1]
@property
def outofbounds_value(self):
return self.args[2]
def __getnewargs__(self):
return self.access, self.outofbounds_condition, self.outofbounds_value
def __getnewargs_ex__(self):
return (self.access, self.outofbounds_condition, self.outofbounds_value), {}
...@@ -6,9 +6,3 @@ try: ...@@ -6,9 +6,3 @@ try:
__all__.append('print_dot') __all__.append('print_dot')
except ImportError: except ImportError:
pass pass
try:
from .llvm import generate_llvm # NOQA
__all__.append('generate_llvm')
except ImportError:
pass
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, first=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
if first:
arg_string += first + ', '
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
raise NotImplementedError(instruction_set)
if instruction_set in ['sve', 'sve2', 'sme']:
cmp = 'cmp'
elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
cmp = 'cmp'
bitwidth = int(instruction_set[4:])
elif instruction_set.startswith('sve'):
cmp = 'cmp'
bitwidth = int(instruction_set[3:])
elif instruction_set == 'neon':
cmp = 'c'
bitwidth = 128
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'sqrt': 'sqrt[0]',
'loadU': 'ld1[0]',
'storeU': 'st1[0, 1]',
'abs': 'abs[0]',
'==': f'{cmp}eq[0, 1]',
'!=': f'{cmp}eq[0, 1]',
'<=': f'{cmp}le[0, 1]',
'<': f'{cmp}lt[0, 1]',
'>=': f'{cmp}ge[0, 1]',
'>': f'{cmp}gt[0, 1]',
}
bits = {'double': 64,
'float': 32,
'int': 32}
result = dict()
if instruction_set in ['sve', 'sve2', 'sme']:
width = 'svcntd()' if data_type == 'double' else 'svcntw()'
intwidth = 'svcntw()'
result['bytes'] = 'svcntb()'
else:
width = bitwidth // bits[data_type]
intwidth = bitwidth // bits['int']
result['bytes'] = bitwidth // 8
if instruction_set.startswith('sve') or instruction_set == 'sme':
base_names['stream'] = 'stnt1[0, 1]'
prefix = 'sv'
suffix = f'_f{bits[data_type]}'
elif instruction_set == 'neon':
prefix = 'v'
suffix = f'q_f{bits[data_type]}'
if instruction_set in ['sve', 'sve2', 'sme']:
predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
else:
predicate = f'{prefix}whilelt_b{bits[data_type]}(0, {width})'
int_predicate = f'{prefix}whilelt_b{bits["int"]}(0, {intwidth})'
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
arg_string = get_argument_string(function_shortcut, first=predicate if prefix == 'sv' else '')
if prefix == 'sv' and not name.startswith('ld') and not name.startswith('st') and not name.startswith(cmp):
undef = '_x'
else:
undef = ''
result[intrinsic_id] = prefix + name + suffix + undef + arg_string
if instruction_set in ['sve', 'sve2', 'sme']:
result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int")
else:
result['width'] = width
result['intwidth'] = intwidth
if instruction_set.startswith('sve') or instruction_set == 'sme':
result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
if instruction_set != 'sme':
vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format("{2}") + ', {1})'
result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format("{1}") + ')'
if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']
result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
result['blendv'] = f'svsel_f{bits[data_type]}' + '({2}, {1}, {0})'
result['any'] = f'svptest_any({predicate}, {{0}})'
result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
result['maskStream'] = result['stream'].replace(predicate, '{2}')
if instruction_set != 'sme':
result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')
result['streamFence'] = '__dmb(15)'
if instruction_set == 'sme':
result['function_prefix'] = '__attribute__((arm_locally_streaming))'
elif instruction_set not in ['sve', 'sve2', 'sme']:
result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
else:
result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in
range(width)]) + ')'
result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
result[data_type] = f'float{bits[data_type]}x{width}_t'
result['int'] = f'int{bits["int"]}x{intwidth}_t'
result['bool'] = f'uint{bits[data_type]}x{width}_t'
result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
# SVE has real nontemporal stores, so we only need to zero cachlines on Neon
result['cachelineZero'] = 'cachelineZero((void*) {0})'
result['cachelineSize'] = 'cachelineSize()'
return result
import re
from collections import namedtuple
import hashlib
from typing import Set
import numpy as np
import sympy as sp
from sympy.core import S
from sympy.logic.boolalg import BooleanFalse, BooleanTrue
from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
from sympy.functions.elementary.hyperbolic import HyperbolicFunction
from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
from pystencils.typing import (
PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol, CFunction)
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.functions import DivFunc, AddressOf
from pystencils.integer_functions import (
bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
int_div, int_power_of_2, modulo_ceil)
try:
from sympy.printing.c import C99CodePrinter as CCodePrinter # for sympy versions > 1.6
except ImportError:
from sympy.printing.ccode import C99CodePrinter as CCodePrinter
__all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSympyPrinter']
HEADER_REGEX = re.compile(r'^[<"].*[">]$')
def generate_c(ast_node: Node,
signature_only: bool = False,
dialect: Backend = Backend.C,
custom_backend=None,
with_globals=True) -> str:
"""Prints an abstract syntax tree node as C or CUDA code.
This function does not need to distinguish for most AST nodes between C, C++ or CUDA code, it just prints 'C-like'
code as encoded in the abstract syntax tree (AST). The AST is built differently for C or CUDA by calling different
create_kernel functions.
Args:
ast_node: ast representation of kernel
signature_only: generate signature without function body
dialect: `Backend`: 'C' or 'CUDA'
custom_backend: use own custom printer for code generation
with_globals: enable usage of global variables
Returns:
C-like code for the ast node and its descendants
"""
global_declarations = get_global_declarations(ast_node)
for d in global_declarations:
if hasattr(ast_node, "global_variables"):
ast_node.global_variables.update(d.symbols_defined)
else:
ast_node.global_variables = d.symbols_defined
if custom_backend:
printer = custom_backend
elif dialect == Backend.C:
try:
# TODO Vectorization Revamp: instruction_set should not be just slapped on ast
instruction_set = ast_node.instruction_set
except Exception:
instruction_set = None
printer = CBackend(signature_only=signature_only,
vector_instruction_set=instruction_set)
elif dialect == Backend.CUDA:
from pystencils.backends.cuda_backend import CudaBackend
printer = CudaBackend(signature_only=signature_only)
else:
raise ValueError(f'Unknown {dialect=}')
code = printer(ast_node)
if not signature_only and isinstance(ast_node, KernelFunction):
if with_globals and global_declarations:
code = "\n" + code
for declaration in global_declarations:
code = printer(declaration) + "\n" + code
return code
def get_global_declarations(ast):
global_declarations = []
def visit_node(sub_ast):
nonlocal global_declarations
if hasattr(sub_ast, "required_global_declarations"):
global_declarations += sub_ast.required_global_declarations
if hasattr(sub_ast, "args"):
for node in sub_ast.args:
visit_node(node)
visit_node(ast)
return sorted(set(global_declarations), key=str)
def get_headers(ast_node: Node) -> Set[str]:
"""Return a set of header files, necessary to compile the printed C-like code."""
headers = set()
if isinstance(ast_node, KernelFunction) and ast_node.instruction_set:
headers.update(ast_node.instruction_set['headers'])
if hasattr(ast_node, 'headers'):
headers.update(ast_node.headers)
for a in ast_node.args:
if isinstance(a, (sp.Expr, Node)):
headers.update(get_headers(a))
for g in get_global_declarations(ast_node):
if isinstance(g, Node):
headers.update(get_headers(g))
for h in headers:
assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/'
return headers
# --------------------------------------- Backend Specific Nodes -------------------------------------------------------
# TODO future CustomCodeNode should not be backend specific move it elsewhere
class CustomCodeNode(Node):
def __init__(self, code, symbols_read, symbols_defined, parent=None):
super(CustomCodeNode, self).__init__(parent=parent)
self._code = "\n" + code
self._symbols_read = set(symbols_read)
self._symbols_defined = set(symbols_defined)
self.headers = []
def get_code(self, dialect, vector_instruction_set, print_arg):
return self._code
@property
def args(self):
return []
@property
def symbols_defined(self):
return self._symbols_defined
@property
def undefined_symbols(self):
return self._symbols_read - self._symbols_defined
def __eq__(self, other):
return type(self) is type(other) and self._code == other._code
def __hash__(self):
return hash(self._code)
class PrintNode(CustomCodeNode):
# noinspection SpellCheckingInspection
def __init__(self, symbol_to_print):
code = f'\nstd::cout << "{symbol_to_print.name} = " << {symbol_to_print.name} << std::endl; \n'
super(PrintNode, self).__init__(code, symbols_read=[symbol_to_print], symbols_defined=set())
self.headers.append("<iostream>")
# ------------------------------------------- Printer ------------------------------------------------------------------
# noinspection PyPep8Naming
class CBackend:
def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect=Backend.C):
if sympy_printer is None:
if vector_instruction_set is not None:
self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
else:
self.sympy_printer = CustomSympyPrinter()
else:
self.sympy_printer = sympy_printer
self._vector_instruction_set = vector_instruction_set
self._indent = " "
self._dialect = dialect
self._signatureOnly = signature_only
self._kwargs = {}
self.sympy_printer._kwargs = self._kwargs
def __call__(self, node):
prev_is = VectorType.instruction_set
VectorType.instruction_set = self._vector_instruction_set
result = str(self._print(node))
VectorType.instruction_set = prev_is
return result
def _print(self, node):
if isinstance(node, str):
return node
for cls in type(node).__mro__:
method_name = f"_print_{cls.__name__}"
if hasattr(self, method_name):
return getattr(self, method_name)(node)
raise NotImplementedError(f"{self.__class__.__name__} does not support node of type {node.__class__.__name__}")
def _print_AbstractType(self, node):
return str(node)
def _print_KernelFunction(self, node):
function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()
if not type(s.symbol) is CFunction]
launch_bounds = ""
if self._dialect == Backend.CUDA:
max_threads = node.indexing.max_threads_per_block()
if max_threads:
launch_bounds = f"__launch_bounds__({max_threads}) "
func_declaration = "FUNC_PREFIX %svoid %s(%s)" % (launch_bounds, node.function_name,
", ".join(function_arguments))
if self._signatureOnly:
return func_declaration
body = self._print(node.body)
return func_declaration + "\n" + body
def _print_Block(self, node):
block_contents = "\n".join([self._print(child) for child in node.args])
return "{\n%s\n}" % (self._indent + self._indent.join(block_contents.splitlines(True)))
def _print_PragmaBlock(self, node):
return f"{node.pragma_line}\n{self._print_Block(node)}"
def _print_LoopOverCoordinate(self, node):
counter_name = node.loop_counter_name
counter_dtype = node.loop_counter_symbol.dtype.c_name
start = f"{counter_dtype} {counter_name} = {self.sympy_printer.doprint(node.start)}"
condition = f"{counter_name} < {self.sympy_printer.doprint(node.stop)}"
update = f"{counter_name} += {self.sympy_printer.doprint(node.step)}"
loop_str = f"for ({start}; {condition}; {update})"
self._kwargs['loop_counter'] = counter_name
self._kwargs['loop_stop'] = node.stop
prefix = "\n".join(node.prefix_lines)
if prefix:
prefix += "\n"
return f"{prefix}{loop_str}\n{self._print(node.body)}"
def _print_SympyAssignment(self, node):
printed_lhs = self.sympy_printer.doprint(node.lhs)
printed_rhs = self.sympy_printer.doprint(node.rhs)
if node.is_declaration:
if node.use_auto:
data_type = 'auto'
else:
data_type = self._print(node.lhs.dtype).replace(' const', '')
if node.is_const:
data_type = f'const {data_type}'
return f"{data_type} {printed_lhs} = {printed_rhs};"
else:
lhs_type = get_type_of_expression(node.lhs) # TOOD: this should have been typed
printed_mask = ""
if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
instr = 'storeU'
if nontemporal and 'storeA' not in self._vector_instruction_set and \
'stream' in self._vector_instruction_set:
instr = 'stream'
elif aligned:
instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
if mask != True: # NOQA
instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
'maskStoreA' if aligned else 'maskStoreU'
if instr not in self._vector_instruction_set:
if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
store, load = 'stream', 'loadA'
elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
store, load = 'storeA', 'loadA'
else:
store, load = 'storeU', 'loadU'
load = load if load in self._vector_instruction_set else 'loadU'
self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
'{0}', self._vector_instruction_set['blendv'].format(
self._vector_instruction_set[load].format('{0}', **self._kwargs),
'{1}', '{2}', **self._kwargs), **self._kwargs)
printed_mask = self.sympy_printer.doprint(mask)
if data_type.base_type.c_name == 'double':
if self._vector_instruction_set['double'] == '__m256d':
printed_mask = f"_mm256_castpd_si256({printed_mask})"
elif self._vector_instruction_set['double'] == '__m128d':
printed_mask = f"_mm_castpd_si128({printed_mask})"
elif data_type.base_type.c_name == 'float':
if self._vector_instruction_set['float'] == '__m256':
printed_mask = f"_mm256_castps_si256({printed_mask})"
elif self._vector_instruction_set['float'] == '__m128':
printed_mask = f"_mm_castps_si128({printed_mask})"
rhs_type = get_type_of_expression(node.rhs)
if type(rhs_type) is not VectorType:
raise ValueError(f'Cannot vectorize {node.rhs} of type {rhs_type} inside of the pretty printer! '
f'This should have happen earlier!')
# rhs = CastFunc(node.rhs, VectorType(rhs_type)) # Unknown width
else:
rhs = node.rhs
ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
if stride != 1:
instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
'maskStoreS') if mask != True else \
('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS') # NOQA
return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
stride, printed_mask, **self._kwargs) + ';'
pre_code = ''
if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True: # NOQA
first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
* node.lhs.args[0].field.spatial_strides[i] for i in
range(len(node.lhs.args[0].field.spatial_strides))])
if stride == 1:
offset = offset.subs({node.lhs.args[0].field.spatial_strides[0]: 1})
size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
element_size = 8 if data_type.base_type.c_name == 'double' else 4
size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n'
code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
printed_mask, **self._kwargs) + ';'
flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}"
if nontemporal and 'flushCacheline' in self._vector_instruction_set:
code2 = self._vector_instruction_set['flushCacheline'].format(
ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
+ self.sympy_printer.doprint(rhs) + ';'
code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
instr2 = maskStore if mask != True else store # NOQA
if instr2 not in self._vector_instruction_set:
self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
'{0}', self._vector_instruction_set['blendv'].format(
self._vector_instruction_set[load].format('{0}', **self._kwargs),
'{1}', '{2}', **self._kwargs),
**self._kwargs)
code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
return pre_code + code
else:
return f"{printed_lhs} = {printed_rhs};"
def _print_NontemporalFence(self, _):
if 'streamFence' in self._vector_instruction_set:
return self._vector_instruction_set['streamFence'] + ';'
else:
return ''
def _print_CachelineSize(self, node):
if 'cachelineSize' in self._vector_instruction_set:
code = f'const size_t {node.symbol} = {self._vector_instruction_set["cachelineSize"]};\n'
code += f'const size_t {node.mask_symbol} = {node.symbol} - 1;\n'
vectorsize = self._vector_instruction_set['bytes']
code += f'const size_t {node.last_symbol} = {node.symbol} - {vectorsize};\n'
return code
else:
return ''
def _print_TemporaryMemoryAllocation(self, node):
if self._vector_instruction_set:
align = self._vector_instruction_set['bytes']
else:
align = node.symbol.dtype.base_type.numpy_dtype.itemsize
np_dtype = node.symbol.dtype.base_type.numpy_dtype
required_size = np_dtype.itemsize * node.size + align
size = modulo_ceil(required_size, align)
code = "#if defined(_MSC_VER)\n"
code += "{dtype} {name}=({dtype})_aligned_malloc({size}, {align}) + {offset};\n"
code += "#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n"
code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n"
code += "#else\n"
code += "{dtype} {name};\n"
code += "posix_memalign((void**) &{name}, {align}, {size});\n"
code += "{name} += {offset};\n"
code += "#endif"
return code.format(dtype=node.symbol.dtype,
name=self.sympy_printer.doprint(node.symbol.name),
size=self.sympy_printer.doprint(size),
offset=int(node.offset(align)),
align=align)
def _print_TemporaryMemoryFree(self, node):
if self._vector_instruction_set:
align = self._vector_instruction_set['bytes']
else:
align = node.symbol.dtype.base_type.numpy_dtype.itemsize
code = "#if defined(_MSC_VER)\n"
code += "_aligned_free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
code += "#else\n"
code += "free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
code += "#endif"
return code
def _print_SkipIteration(self, _):
return "continue;"
def _print_CustomCodeNode(self, node):
return node.get_code(self._dialect, self._vector_instruction_set, print_arg=self.sympy_printer._print)
def _print_SourceCodeComment(self, node):
return f"/* {node.text } */"
def _print_EmptyLine(self, node):
return ""
def _print_Conditional(self, node):
if type(node.condition_expr) is BooleanTrue:
return self._print_Block(node.true_block)
elif type(node.condition_expr) is BooleanFalse:
return self._print_Block(node.false_block)
cond_type = get_type_of_expression(node.condition_expr)
if isinstance(cond_type, VectorType):
raise ValueError("Problem with Conditional inside vectorized loop - use vec_any or vec_all")
condition_expr = self.sympy_printer.doprint(node.condition_expr)
true_block = self._print_Block(node.true_block)
result = f"if ({condition_expr})\n{true_block} "
if node.false_block:
false_block = self._print_Block(node.false_block)
result += f"else {false_block}"
return result
# ------------------------------------------ Helper function & classes -------------------------------------------------
# noinspection PyPep8Naming
class CustomSympyPrinter(CCodePrinter):
def __init__(self):
super(CustomSympyPrinter, self).__init__()
def _print_Pow(self, expr):
"""Don't use std::pow function, for small integer exponents, write as multiplication"""
# Ideally the printer has as little logic as possible. Therefore,
# powers should be rewritten as `DivFunc`s / unevaluated `Mul`s before
# printing. `NodeCollection` offers a convenience function to do just
# that. However, `cut_loops` rewrites unevaluated multiplications as
# `Pow`s again. Neither `deepcopy` nor `func(*args)` are suited to
# rebuild unevaluated expressions. Therefore, as long as we stick with
# SymPy, this is the only way to avoid printing `pow`s.
exp = expr.exp.expr if isinstance(expr.exp, CastFunc) else expr.exp
one_type = expr.base.dtype if hasattr(expr.base, "dtype") else get_type_of_expression(expr.base)
if exp.is_integer and exp.is_number and (0 < exp <= 8):
return f"({self._print(sp.Mul(*[expr.base] * exp, evaluate=False))})"
elif exp.is_integer and exp.is_number and (-8 <= exp < 0):
return f"{self._typed_number(1, one_type)} / ({self._print(sp.Mul(*([expr.base] * -exp), evaluate=False))})"
else:
return super(CustomSympyPrinter, self)._print_Pow(expr)
# TODO don't print ones in sp.Mul
def _print_Rational(self, expr):
"""Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0"""
res = str(expr.evalf(17))
return res
def _print_Equality(self, expr):
"""Equality operator is not printable in default printer"""
return '((' + self._print(expr.lhs) + ") == (" + self._print(expr.rhs) + '))'
def _print_Piecewise(self, expr):
"""Print piecewise in one line (remove newlines)"""
result = super(CustomSympyPrinter, self)._print_Piecewise(expr)
return result.replace("\n", "")
def _print_Abs(self, expr):
if expr.args[0].is_integer:
return f'abs({self._print(expr.args[0])})'
else:
return f'fabs({self._print(expr.args[0])})'
def _print_AbstractType(self, node):
return str(node)
def _print_Function(self, expr):
infix_functions = {
bitwise_xor: '^',
bit_shift_right: '>>',
bit_shift_left: '<<',
bitwise_or: '|',
bitwise_and: '&',
}
if hasattr(expr, 'to_c'):
return expr.to_c(self._print)
if isinstance(expr, ReinterpretCastFunc):
arg, data_type = expr.args
if isinstance(data_type, PointerType):
const_str = "const" if data_type.const else ""
return f"(({const_str} {self._print(data_type.base_type)} *)(& {self._print(arg)}))"
else:
return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
elif isinstance(expr, AddressOf):
assert len(expr.args) == 1, "address_of must only have one argument"
return f"&({self._print(expr.args[0])})"
elif isinstance(expr, CastFunc):
cast = "(({data_type})({code}))"
arg, data_type = expr.args
if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
return self._typed_number(arg, data_type)
elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
and data_type == BasicType('float32'):
known = self.known_functions[arg.__class__.__name__.lower()]
code = self._print(arg)
return code.replace(known, f"{known}f")
elif isinstance(arg, (sp.Pow, sp.exp)) and data_type == BasicType('float32'):
known = ['sqrt', 'cbrt', 'pow', 'exp']
code = self._print(arg)
for k in known:
if k in code:
return code.replace(k, f'{k}f')
# Powers of small integers are printed as divisions/multiplications.
if '/' in code or '*' in code:
return cast.format(data_type=data_type, code=code)
raise ValueError(f"{code} doesn't give {known=} function back.")
else:
return cast.format(data_type=data_type, code=self._print(arg))
elif isinstance(expr, fast_division):
raise ValueError("fast_division is only supported for Taget.GPU")
elif isinstance(expr, fast_sqrt):
raise ValueError("fast_sqrt is only supported for Taget.GPU")
elif isinstance(expr, fast_inv_sqrt):
raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
return self._print(expr.args[0])
elif isinstance(expr, sp.Abs):
return f"abs({self._print(expr.args[0])})"
elif isinstance(expr, sp.Mod):
if expr.args[0].is_integer and expr.args[1].is_integer:
return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})"
else:
return f"fmod({self._print(expr.args[0])}, {self._print(expr.args[1])})"
elif expr.func in infix_functions:
return f"({self._print(expr.args[0])} {infix_functions[expr.func]} {self._print(expr.args[1])})"
elif expr.func == int_power_of_2:
return f"(1 << ({self._print(expr.args[0])}))"
elif expr.func == int_div:
return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))"
elif expr.func == DivFunc:
return f'(({self._print(expr.divisor)}) / ({self._print(expr.dividend)}))'
else:
name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__
arg_str = ', '.join(self._print(a) for a in expr.args)
return f'{name}({arg_str})'
def _typed_number(self, number, dtype):
res = self._print(number)
if dtype.numpy_dtype == np.float32:
return res + '.0f' if '.' not in res else res + 'f'
elif dtype.numpy_dtype == np.float64:
return res + '.0' if '.' not in res else res
elif dtype.is_int():
tokens = res.split('.')
if len(tokens) == 1:
return res
elif int(tokens[1]) != 0:
raise ValueError(f"Cannot print non-integer number {res} as an integer.")
else:
return tokens[0]
else:
return res
def _print_ConditionalFieldAccess(self, node):
return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True)))
def _print_Max(self, expr):
def inner_print_max(args):
if len(args) == 1:
return self._print(args[0])
half = len(args) // 2
a = inner_print_max(args[:half])
b = inner_print_max(args[half:])
return f"(({a} > {b}) ? {a} : {b})"
return inner_print_max(expr.args)
def _print_Min(self, expr):
def inner_print_min(args):
if len(args) == 1:
return self._print(args[0])
half = len(args) // 2
a = inner_print_min(args[:half])
b = inner_print_min(args[half:])
return f"(({a} < {b}) ? {a} : {b})"
return inner_print_min(expr.args)
# noinspection PyPep8Naming
class VectorizedCustomSympyPrinter(CustomSympyPrinter):
SummandInfo = namedtuple("SummandInfo", ['sign', 'term'])
def __init__(self, instruction_set):
super(VectorizedCustomSympyPrinter, self).__init__()
self.instruction_set = instruction_set
def _scalarFallback(self, func_name, expr, *args, **kwargs):
expr_type = get_type_of_expression(expr)
if type(expr_type) is not VectorType:
return getattr(super(VectorizedCustomSympyPrinter, self), func_name)(expr, *args, **kwargs)
else:
assert self.instruction_set['width'] == expr_type.width
return None
def _print_Abs(self, expr):
if isinstance(get_type_of_expression(expr), (VectorType, VectorMemoryAccess)):
return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs)
return super()._print_Abs(expr)
def _typed_vectorized_number(self, expr, data_type):
basic_data_type = data_type.base_type
number = self._typed_number(expr, basic_data_type)
instruction = 'makeVecConst'
if basic_data_type.is_bool():
instruction = 'makeVecConstBool'
# TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
elif basic_data_type.is_int():
instruction = 'makeVecConstInt'
return self.instruction_set[instruction].format(number, **self._kwargs)
def _typed_vectorized_symbol(self, expr, data_type):
if not isinstance(expr, TypedSymbol):
raise ValueError(f'{expr} is not a TypeSymbol. It is {expr.type=}')
basic_data_type = data_type.base_type
symbol = self._print(expr)
if basic_data_type != expr.dtype:
symbol = f'(({basic_data_type})({symbol}))'
instruction = 'makeVecConst'
if basic_data_type.is_bool():
instruction = 'makeVecConstBool'
# TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
elif basic_data_type.is_int():
instruction = 'makeVecConstInt'
return self.instruction_set[instruction].format(symbol, **self._kwargs)
def _print_CastFunc(self, expr):
arg, data_type = expr.args
if type(data_type) is VectorType:
base_type = data_type.base_type
# vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
assert not isinstance(arg, VectorMemoryAccess)
if isinstance(arg, sp.Tuple):
is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
is_integer = get_type_of_expression(arg[0]) == create_type("int")
printed_args = [self._print(a) for a in arg]
instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
increments = np.array(arg)[1:] - np.array(arg)[:-1]
if len(set(increments)) == 1:
return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
**self._kwargs)
return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
else:
if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
return self._typed_vectorized_number(arg, data_type)
elif isinstance(arg, TypedSymbol):
return self._typed_vectorized_symbol(arg, data_type)
elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
and base_type == BasicType('float32'):
raise NotImplementedError('Vectorizer is not tested for trigonometric functions yet')
# known = self.known_functions[arg.__class__.__name__.lower()]
# code = self._print(arg)
# return code.replace(known, f"{known}f")
elif isinstance(arg, sp.Pow):
if base_type == BasicType('float32') or base_type == BasicType('float64'):
return self._print_Pow(arg)
else:
raise NotImplementedError('Integer Pow is not implemented')
elif isinstance(arg, sp.UnevaluatedExpr):
return self._print(arg.args[0])
else:
raise NotImplementedError('Vectorizer cannot cast between different datatypes')
# to_type = self.instruction_set['suffix'][data_type.base_type.c_name]
# from_type = self.instruction_set['suffix'][get_type_of_expression(arg).base_type.c_name]
# return self.instruction_set['cast'].format(from_type, to_type, self._print(arg))
else:
return self._scalarFallback('_print_Function', expr)
# raise ValueError(f'Non VectorType cast "{data_type}" in vectorized code.')
def _print_Function(self, expr):
if isinstance(expr, VectorMemoryAccess):
arg, data_type, aligned, _, mask, stride = expr.args
if stride != 1:
return self.instruction_set['loadS'].format(f"& {self._print(arg)}", stride, **self._kwargs)
instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU']
return instruction.format(f"& {self._print(arg)}", **self._kwargs)
elif expr.func == DivFunc:
result = self._scalarFallback('_print_Function', expr)
if not result:
result = self.instruction_set['/'].format(self._print(expr.divisor), self._print(expr.dividend),
**self._kwargs)
return result
elif isinstance(expr, fast_division):
raise ValueError("fast_division is only supported for Taget.GPU")
elif isinstance(expr, fast_sqrt):
raise ValueError("fast_sqrt is only supported for Taget.GPU")
elif isinstance(expr, fast_inv_sqrt):
raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
instr = 'any' if isinstance(expr, vec_any) else 'all'
expr_type = get_type_of_expression(expr.args[0])
if type(expr_type) is not VectorType:
return self._print(expr.args[0])
else:
if isinstance(expr.args[0], sp.Rel):
op = expr.args[0].rel_op
if (instr, op) in self.instruction_set:
return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args],
**self._kwargs)
return self.instruction_set[instr].format(self._print(expr.args[0]), **self._kwargs)
return super(VectorizedCustomSympyPrinter, self)._print_Function(expr)
def _print_And(self, expr):
result = self._scalarFallback('_print_And', expr)
if result:
return result
arg_strings = [self._print(a) for a in expr.args]
assert len(arg_strings) > 0
result = arg_strings[0]
for item in arg_strings[1:]:
result = self.instruction_set['&'].format(result, item, **self._kwargs)
return result
def _print_Or(self, expr):
result = self._scalarFallback('_print_Or', expr)
if result:
return result
arg_strings = [self._print(a) for a in expr.args]
assert len(arg_strings) > 0
result = arg_strings[0]
for item in arg_strings[1:]:
result = self.instruction_set['|'].format(result, item, **self._kwargs)
return result
def _print_Add(self, expr, order=None):
try:
result = self._scalarFallback('_print_Add', expr)
except Exception:
result = None
if result:
return result
args = expr.args
# special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
suffix = ""
if all([(type(e) is CastFunc and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
dtype = set([e.dtype for e in args if type(e) is CastFunc])
assert len(dtype) == 1
dtype = dtype.pop()
args = [CastFunc(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
for e in args]
suffix = "int"
summands = []
for term in args:
if term.func == sp.Mul:
sign, t = self._print_Mul(term, inside_add=True)
else:
t = self._print(term)
sign = 1
summands.append(self.SummandInfo(sign, t))
# Use positive terms first
summands.sort(key=lambda e: e.sign, reverse=True)
# if no positive term exists, prepend a zero
if summands[0].sign == -1:
summands.insert(0, self.SummandInfo(1, "0"))
assert len(summands) >= 2
processed = summands[0].term
for summand in summands[1:]:
func = self.instruction_set['-' + suffix] if summand.sign == -1 else self.instruction_set['+' + suffix]
processed = func.format(processed, summand.term, **self._kwargs)
return processed
def _print_Pow(self, expr):
# Due to loop cutting sp.Mul is evaluated again.
try:
result = self._scalarFallback('_print_Pow', expr)
except ValueError:
result = None
if result:
return result
one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs)
root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
if isinstance(expr.exp, CastFunc) and expr.exp.args[0].is_number:
exp = expr.exp.args[0]
else:
exp = expr.exp
# TODO the printer should not have any intelligence like this.
# TODO To remove all of these cases the vectoriser needs to be reworked. See loop cutting
if exp.is_integer and exp.is_number and 0 < exp < 8:
return self._print(sp.Mul(*[expr.base] * exp, evaluate=False))
elif exp == 0.5:
return root
elif exp == -0.5:
return self.instruction_set['/'].format(one, root, **self._kwargs)
else:
raise ValueError("Generic exponential not supported: " + str(expr))
def _print_Mul(self, expr, inside_add=False):
# noinspection PyProtectedMember
from sympy.core.mul import _keep_coeff
if not inside_add:
result = self._scalarFallback('_print_Mul', expr)
else:
result = None
if result:
return result
c, e = expr.as_coeff_Mul()
if c < 0:
expr = _keep_coeff(-c, e)
sign = -1
else:
sign = 1
a = [] # items in the numerator
b = [] # items that are in the denominator (if any)
# Gather args for numerator/denominator
for item in expr.as_ordered_factors():
if item.is_commutative and item.is_Pow and item.exp.is_Rational and item.exp.is_negative:
if item.exp != -1:
b.append(sp.Pow(item.base, -item.exp, evaluate=False))
else:
b.append(sp.Pow(item.base, -item.exp))
else:
a.append(item)
a = a or [S.One]
a_str = [self._print(x) for x in a]
b_str = [self._print(x) for x in b]
result = a_str[0]
for item in a_str[1:]:
result = self.instruction_set['*'].format(result, item, **self._kwargs)
if len(b) > 0:
denominator_str = b_str[0]
for item in b_str[1:]:
denominator_str = self.instruction_set['*'].format(denominator_str, item, **self._kwargs)
result = self.instruction_set['/'].format(result, denominator_str, **self._kwargs)
if inside_add:
return sign, result
else:
if sign < 0:
return self.instruction_set['*'].format(self._print(S.NegativeOne), result, **self._kwargs)
else:
return result
def _print_Relational(self, expr):
result = self._scalarFallback('_print_Relational', expr)
if result:
return result
return self.instruction_set[expr.rel_op].format(self._print(expr.lhs), self._print(expr.rhs), **self._kwargs)
def _print_Equality(self, expr):
result = self._scalarFallback('_print_Equality', expr)
if result:
return result
return self.instruction_set['=='].format(self._print(expr.lhs), self._print(expr.rhs), **self._kwargs)
def _print_Piecewise(self, expr):
result = self._scalarFallback('_print_Piecewise', expr)
if result:
return result
if expr.args[-1].cond.args[0] is not sp.sympify(True):
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
result = self._print(expr.args[-1][0])
for true_expr, condition in reversed(expr.args[:-1]):
if isinstance(condition, CastFunc) and get_type_of_expression(condition.args[0]) == create_type("bool"):
result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
result, **self._kwargs)
else:
# noinspection SpellCheckingInspection
result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition),
**self._kwargs)
return result
from pystencils.astnodes import Node
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
"""Prints an abstract syntax tree node as CUDA code.
Args:
ast_node: ast representation of kernel
signature_only: generate signature without function body
custom_backend: use own custom printer for code generation
with_globals: enable usage of global variables
Returns:
CUDA code for the ast node and its descendants
"""
return generate_c(ast_node, signature_only, dialect=Backend.CUDA,
custom_backend=custom_backend, with_globals=with_globals)
class CudaBackend(CBackend):
def __init__(self, sympy_printer=None,
signature_only=False):
if not sympy_printer:
sympy_printer = CudaSympyPrinter()
super().__init__(sympy_printer, signature_only, dialect=Backend.CUDA)
def _print_SharedMemoryAllocation(self, node):
dtype = node.symbol.dtype
name = self.sympy_printer.doprint(node.symbol.name)
num_elements = '*'.join([str(s) for s in node.shared_mem.shape])
code = f"__shared__ {dtype} {name}[{num_elements}];"
return code
@staticmethod
def _print_ThreadBlockSynchronization(_):
return "__synchtreads();"
def _print_TextureDeclaration(self, node):
cond = node.texture.field.dtype.numpy_dtype.itemsize > 4
return f'texture<{"fp_tex_" if cond else ""}{str(node.texture.field.dtype)}, ' \
f'cudaTextureType{node.texture.field.spacial_dimensions}D, cudaReadModeElementType> {node.texture};'
def _print_SkipIteration(self, _):
return "return;"
class CudaSympyPrinter(CustomSympyPrinter):
language = "CUDA"
def __init__(self):
super(CudaSympyPrinter, self).__init__()
def _print_Function(self, expr):
if isinstance(expr, fast_division):
assert len(expr.args) == 2, f"__fdividef has two arguments, but {len(expr.args)} where given"
return f"__fdividef({self._print(expr.args[0])}, {self._print(expr.args[1])})"
elif isinstance(expr, fast_sqrt):
assert len(expr.args) == 1, f"__fsqrt_rn has one argument, but {len(expr.args)} where given"
return f"__fsqrt_rn({self._print(expr.args[0])})"
elif isinstance(expr, fast_inv_sqrt):
assert len(expr.args) == 1, f"__frsqrt_rn has one argument, but {len(expr.args)} where given"
return f"__frsqrt_rn({self._print(expr.args[0])})"
return super()._print_Function(expr)
from sympy.printing.printer import Printer
from graphviz import Digraph, lang
import graphviz import graphviz
try:
from graphviz import Digraph
import graphviz.quoting as quote
except ImportError:
from graphviz import Digraph
import graphviz.lang as quote
from sympy.printing.printer import Printer
# noinspection PyPep8Naming # noinspection PyPep8Naming
...@@ -12,7 +17,7 @@ class DotPrinter(Printer): ...@@ -12,7 +17,7 @@ class DotPrinter(Printer):
super(DotPrinter, self).__init__() super(DotPrinter, self).__init__()
self._node_to_str_function = node_to_str_function self._node_to_str_function = node_to_str_function
self.dot = Digraph(**kwargs) self.dot = Digraph(**kwargs)
self.dot.quote_edge = lang.quote self.dot.quote_edge = quote.quote
def _print_KernelFunction(self, func): def _print_KernelFunction(self, func):
self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func)) self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))
...@@ -50,21 +55,20 @@ class DotPrinter(Printer): ...@@ -50,21 +55,20 @@ class DotPrinter(Printer):
def __shortened(node): def __shortened(node):
from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Block, Conditional from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Conditional
if isinstance(node, LoopOverCoordinate): if isinstance(node, LoopOverCoordinate):
return "Loop over dim %d" % (node.coordinate_to_loop_over,) return "Loop over dim %d" % (node.coordinate_to_loop_over,)
elif isinstance(node, KernelFunction): elif isinstance(node, KernelFunction):
params = [f.name for f in node.fields_accessed] params = node.get_parameters()
params += [p.name for p in node.parameters if not p.is_field_argument] param_names = [p.field_name for p in params if p.is_field_pointer]
return "Func: %s (%s)" % (node.function_name, ",".join(params)) param_names += [p.symbol.name for p in params if not p.is_field_parameter]
return f"Func: {node.function_name} ({','.join(param_names)})"
elif isinstance(node, SympyAssignment): elif isinstance(node, SympyAssignment):
return repr(node.lhs) return repr(node.lhs)
elif isinstance(node, Block):
return "Block" + str(id(node))
elif isinstance(node, Conditional): elif isinstance(node, Conditional):
return repr(node) return repr(node)
else: else:
raise NotImplementedError("Cannot handle node type %s" % (type(node),)) raise NotImplementedError(f"Cannot handle node type {type(node)}")
def print_dot(node, view=False, short=False, **kwargs): def print_dot(node, view=False, short=False, **kwargs):
......
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import json
from pystencils.astnodes import NodeOrExpr
from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
try:
import yaml
except ImportError:
raise ImportError('yaml not installed')
def expr_to_dict(expr_or_node: NodeOrExpr, with_c_code=True, full_class_names=False):
"""Converts a SymPy expression to a serializable dict (mainly for debugging purposes)
The dict recursively contains all args of the expression as ``dict``s
See :func:`.write_json`
Args:
expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
with_c_code (bool, optional): include C representation of the nodes
full_class_names (bool, optional): use full class names (type(object) instead of ``type(object).__name__``
"""
self = {'str': str(expr_or_node)}
if with_c_code:
try:
self.update({'c': generate_c(expr_or_node)})
except Exception:
try:
self.update({'c': CustomSympyPrinter().doprint(expr_or_node)})
except Exception:
pass
for a in expr_or_node.args:
self.update({str(a.__class__ if full_class_names else a.__class__.__name__): expr_to_dict(a)})
return self
def print_json(expr_or_node: NodeOrExpr):
"""Print debug JSON of an AST to string
Args:
expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
Returns:
str: JSON representation of AST
"""
expr_or_node_dict = expr_to_dict(expr_or_node)
return json.dumps(expr_or_node_dict, indent=4)
def write_json(filename: str, expr_or_node: NodeOrExpr):
"""Writes debug JSON represenation of AST to file
Args:
filename (str): Path for the file to write
expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
"""
expr_or_node_dict = expr_to_dict(expr_or_node)
with open(filename, 'w') as f:
json.dump(expr_or_node_dict, f, indent=4)
def print_yaml(expr_or_node):
expr_or_node_dict = expr_to_dict(expr_or_node, full_class_names=False)
return yaml.dump(expr_or_node_dict)
def write_yaml(filename, expr_or_node):
expr_or_node_dict = expr_to_dict(expr_or_node)
with open(filename, 'w') as f:
yaml.dump(expr_or_node_dict, f)
def get_argument_string(function_shortcut):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
if instruction_set != 'vsx':
raise NotImplementedError(instruction_set)
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'sqrt': 'sqrt[0]',
'rsqrt': 'rsqrte[0]', # rsqrt is available too, but not on Clang
'loadU': 'xl[0x0, 0]',
'loadA': 'ld[0x0, 0]',
'storeU': 'xst[1, 0x0, 0]',
'storeA': 'st[1, 0x0, 0]',
'storeAAndFlushCacheline': 'stl[1, 0x0, 0]',
'abs': 'abs[0]',
'==': 'cmpeq[0, 1]',
'!=': 'cmpne[0, 1]',
'<=': 'cmple[0, 1]',
'<': 'cmplt[0, 1]',
'>=': 'cmpge[0, 1]',
'>': 'cmpgt[0, 1]',
'&': 'and[0, 1]',
'|': 'or[0, 1]',
'blendv': 'sel[0, 1, 2]',
('any', '=='): 'any_eq[0, 1]',
('any', '!='): 'any_ne[0, 1]',
('any', '<='): 'any_le[0, 1]',
('any', '<'): 'any_lt[0, 1]',
('any', '>='): 'any_ge[0, 1]',
('any', '>'): 'any_gt[0, 1]',
('all', '=='): 'all_eq[0, 1]',
('all', '!='): 'all_ne[0, 1]',
('all', '<='): 'all_le[0, 1]',
('all', '<'): 'all_lt[0, 1]',
('all', '>='): 'all_ge[0, 1]',
('all', '>'): 'all_gt[0, 1]',
}
bits = {'double': 64,
'float': 32,
'int': 32}
width = 128 // bits[data_type]
intwidth = 128 // bits['int']
result = dict()
result['bytes'] = 16
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
arg_string = get_argument_string(function_shortcut)
result[intrinsic_id] = 'vec_' + name + arg_string
if data_type == 'double':
# Clang and XL C++ are missing these for doubles
result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
'(__vector float) {1}')
result['+int'] = "vec_add({0}, {1})"
result['width'] = width
result['intwidth'] = intwidth
result[data_type] = f'__vector {data_type}'
result['int'] = '__vector int'
result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
result['makeVecConst'] = '((' + result[data_type] + '){{' + \
", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
result['makeVec'] = '((' + result[data_type] + '){{' + \
", ".join(['{' + data_type + '} {' + str(i) + '}' for i in range(width)]) + '}})'
result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, last=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
if last:
arg_string += last + ','
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
assert instruction_set == 'rvv'
bits = {'double': 64,
'float': 32,
'int': 32}
base_names = {
'+': 'fadd_vv[0, 1]',
'-': 'fsub_vv[0, 1]',
'*': 'fmul_vv[0, 1]',
'/': 'fdiv_vv[0, 1]',
'sqrt': 'fsqrt_v[0]',
'loadU': f'le{bits[data_type]}_v[0]',
'storeU': f'se{bits[data_type]}_v[0, 1]',
'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
'loadS': f'lse{bits[data_type]}_v[0, 1]',
'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
'abs': 'fabs_v[0]',
'==': 'mfeq_vv[0, 1]',
'!=': 'mfne_vv[0, 1]',
'<=': 'mfle_vv[0, 1]',
'<': 'mflt_vv[0, 1]',
'>=': 'mfge_vv[0, 1]',
'>': 'mfgt_vv[0, 1]',
'&': 'mand_mm[0, 1]',
'|': 'mor_mm[0, 1]',
'blendv': 'merge_vvm[2, 0, 1]',
'any': 'cpop_m[0]',
'all': 'cpop_m[0]',
}
result = dict()
width = f'vsetvlmax_e{bits[data_type]}m1()'
intwidth = 'vsetvlmax_e{bits["int"]}m1()'
result['bytes'] = 'vsetvlmax_e8m1()'
prefix = 'v'
suffix = f'_f{bits[data_type]}m1'
vl = '{loop_stop} - {loop_counter}'
int_vl = f'({vl})*{bits[data_type]//bits["int"]}'
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
if name.startswith('mf'):
suffix2 = suffix + f'_b{bits[data_type]}'
elif name.endswith('_mm') or name.endswith('_m'):
suffix2 = f'_b{bits[data_type]}'
elif intrinsic_id.startswith('mask'):
suffix2 = suffix + '_m'
else:
suffix2 = suffix
arg_string = get_argument_string(function_shortcut, last=vl)
result[intrinsic_id] = prefix + name + suffix2 + arg_string
result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int")
result['makeVecConst'] = f'vfmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
result['makeVecConstInt'] = f'vmv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
result['makeVecIndex'] = f'vmacc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
f'vid_v_i{bits["int"]}m1({int_vl}), {int_vl})'
result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
result['float'] = f'vfloat{bits["float"]}m1_t'
result['double'] = f'vfloat{bits["double"]}m1_t'
result['int'] = f'vint{bits["int"]}m1_t'
result['bool'] = f'vbool{bits[data_type]}_t'
result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
result['any'] += ' > 0x0'
result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
import os
import platform
from ctypes import CDLL, c_int, c_size_t, sizeof, byref
from warnings import warn
import numpy as np
from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
from pystencils.cache import memorycache
from pystencils.typing import numpy_name_to_c
def get_vector_instruction_set(data_type='double', instruction_set='avx'):
if data_type == 'float':
warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
f"releases")
data_type = 'float64'
type_name = numpy_name_to_c(np.dtype(data_type).name)
if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
return get_vector_instruction_set_arm(type_name, instruction_set)
elif instruction_set in ['vsx']:
return get_vector_instruction_set_ppc(type_name, instruction_set)
elif instruction_set in ['rvv']:
return get_vector_instruction_set_riscv(type_name, instruction_set)
else:
return get_vector_instruction_set_x86(type_name, instruction_set)
@memorycache
def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed."""
if 'PYSTENCILS_SIMD' in os.environ:
return os.environ['PYSTENCILS_SIMD'].split(',')
if platform.system() == 'Darwin' and platform.machine() == 'arm64':
result = ['neon']
libc = CDLL('/usr/lib/libc.dylib')
value = c_int(0)
size = c_size_t(sizeof(value))
status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
if status == 0 and value.value == 1:
result.insert(0, "sme")
return result
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
return ['neon']
elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap2 = libc.getauxval(26) # AT_HWCAP2
if hwcap & (1 << 22): # HWCAP_SVE
if hwcap2 & (1 << 1): # HWCAP2_SVE2
name = 'sve2'
else:
name = 'sve'
length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if length < 0:
raise OSError("SVE length query failed")
while length >= 128:
result.append(f"{name}{length}")
length //= 2
result.append(name)
if hwcap2 & (1 << 23): # HWCAP2_SME
result.insert(0, "sme") # prepend to list so it is not automatically chosen as best instruction set
return result
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX
elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
if not flags.isdisjoint(possible_avx512vl_flags):
result.append("avx512vl")
return result
else:
raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
(platform.system(), platform.machine()))
@memorycache
def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size."""
instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets:
return None
import pystencils as ps
from pystencils.astnodes import SympyAssignment
import numpy as np
from pystencils.cpu.vectorization import CachelineSize
arr = np.zeros((1, 1), dtype=np.float32)
f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
ass = [CachelineSize(), SympyAssignment(f.center, CachelineSize.symbol)]
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile()
kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
return int(arr[0, 0])
def get_argument_string(intrinsic_id, width, function_shortcut):
if intrinsic_id == 'makeVecConst' or intrinsic_id == 'makeVecConstInt':
arg_string = f"({','.join(['{0}'] * width)})"
elif intrinsic_id == 'makeVec' or intrinsic_id == 'makeVecInt':
params = ["{" + str(i) + "}" for i in reversed(range(width))]
arg_string = f"({','.join(params)})"
elif intrinsic_id == 'makeVecBool':
params = [f"(({{{i}}} ? -1.0 : 0.0)" for i in reversed(range(width))]
arg_string = f"({','.join(params)})"
elif intrinsic_id == 'makeVecConstBool':
params = ["(({0}) ? -1.0 : 0.0)" for _ in range(width)]
arg_string = f"({','.join(params)})"
else:
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
comparisons = {
'==': '_CMP_EQ_UQ',
'!=': '_CMP_NEQ_UQ',
'>=': '_CMP_GE_OQ',
'<=': '_CMP_LE_OQ',
'<': '_CMP_NGE_UQ',
'>': '_CMP_NLE_UQ',
}
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'&': 'and[0, 1]',
'|': 'or[0, 1]',
'blendv': 'blendv[0, 1, 2]',
'sqrt': 'sqrt[0]',
'makeVecConst': 'set[]',
'makeVec': 'set[]',
'makeVecBool': 'set[]',
'makeVecConstBool': 'set[]',
'makeVecInt': 'set[]',
'makeVecConstInt': 'set[]',
'loadU': 'loadu[0]',
'loadA': 'load[0]',
'storeU': 'storeu[0,1]',
'storeA': 'store[0,1]',
'stream': 'stream[0,1]',
'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
}
for comparison_op, constant in comparisons.items():
base_names[comparison_op] = f'cmp[0, 1, {constant}]'
headers = {
'avx512': ['<immintrin.h>'],
'avx512vl': ['<immintrin.h>'],
'avx': ['<immintrin.h>'],
'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
'<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
}
suffix = {
'double': 'pd',
'float': 'ps',
'int': 'epi32'
}
prefix = {
'sse': '_mm',
'avx': '_mm256',
'avx512vl': '_mm256',
'avx512': '_mm512',
}
width = {
("double", "sse"): 2,
("float", "sse"): 4,
("int", "sse"): 4,
("double", "avx"): 4,
("float", "avx"): 8,
("int", "avx"): 8,
("double", "avx512vl"): 4,
("float", "avx512vl"): 8,
("int", "avx512vl"): 8,
("double", "avx512"): 8,
("float", "avx512"): 16,
("int", "avx512"): 16,
}
result = {
'width': width[(data_type, instruction_set)],
'intwidth': width[('int', instruction_set)],
'bytes': 4 * width[("float", instruction_set)]
}
pre = prefix[instruction_set]
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
if 'Int' in intrinsic_id:
suf = suffix['int']
arg_string = get_argument_string(intrinsic_id, result['intwidth'], function_shortcut)
else:
suf = suffix[data_type]
arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
bit_width = result['width'] * (64 if data_type == 'double' else 32)
result['double'] = f"__m{bit_width}d"
result['float'] = f"__m{bit_width}"
result['int'] = f"__m{bit_width}i"
result['bool'] = result[data_type]
result['headers'] = headers[instruction_set]
result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
if instruction_set.startswith('avx512'):
size = result['width']
masksize = max(size, 8)
result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
result['bool'] = f"__mmask{masksize}"
params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
result['makeVecBool'] = f"__mmask8(({params}) )"
params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
result['makeVecConstBool'] = f"__mmask8(({params}) )"
vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
scale = bit_width // size // 8
result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
f', {{1}}, {scale})'
result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
f', {{1}}, {scale})'
if bit_width == 512:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
else:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
# abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
if instruction_set == 'avx512':
result['abs'] = f"{pre}_abs_{suf}({{0}})"
else:
result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
f"{pre}_cast{suf}_si{bit_width}({{0}})))"
if instruction_set == 'avx' and data_type == 'float':
result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"
result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
result['streamFence'] = '_mm_mfence()'
return result
import sympy as sp
# from pystencils.typing import get_type_of_expression
# noinspection PyPep8Naming
class flag_cond(sp.Function):
"""Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
depending on whether the flag is set.
Three argument version:
```
flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
```
Four argument version:
```
flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
```
"""
nargs = (3, 4)
def __new__(cls, flag_bit, mask_expression, *expressions):
# TODO Jan reintroduce checking
# flag_dtype = get_type_of_expression(flag_bit)
# if not flag_dtype.is_int():
# raise ValueError('Argument flag_bit must be of integer type.')
#
# mask_dtype = get_type_of_expression(mask_expression)
# if not mask_dtype.is_int():
# raise ValueError('Argument mask_expression must be of integer type.')
return super().__new__(cls, flag_bit, mask_expression, *expressions)
def to_c(self, print_func):
flag_bit = self.args[0]
mask = self.args[1]
then_expression = self.args[2]
flag_bit_code = print_func(flag_bit)
mask_code = print_func(mask)
then_code = print_func(then_expression)
code = f"(({mask_code}) >> ({flag_bit_code}) & 1) * ({then_code})"
if len(self.args) > 3:
else_expression = self.args[3]
else_code = print_func(else_expression)
code += f" + (({mask_code}) >> ({flag_bit_code}) ^ 1) * ({else_code})"
return code
from pystencils.boundaries.boundaryconditions import Dirichlet, Neumann
from pystencils.boundaries.boundaryhandling import BoundaryHandling from pystencils.boundaries.boundaryhandling import BoundaryHandling
from pystencils.boundaries.boundaryconditions import Neumann
from pystencils.boundaries.inkernel import add_neumann_boundary from pystencils.boundaries.inkernel import add_neumann_boundary
__all__ = ['BoundaryHandling', 'Neumann', 'add_neumann_boundary'] __all__ = ['BoundaryHandling', 'Neumann', 'Dirichlet', 'add_neumann_boundary']
from pystencils import Assignment from typing import Any, List, Tuple
from pystencils.astnodes import SympyAssignment
from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
from typing import List, Tuple, Any from pystencils.typing import create_type
class Boundary: class Boundary:
"""Base class all boundaries should derive from""" """Base class all boundaries should derive from"""
inner_or_boundary = True
single_link = False
def __init__(self, name=None): def __init__(self, name=None):
self._name = name self._name = name
def __call__(self, field, direction_symbol, index_field) -> List[Assignment]: def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
"""Defines the boundary behavior and must therefore be implemented by all boundaries. """Defines the boundary behavior and must therefore be implemented by all boundaries.
Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated. Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
...@@ -50,21 +55,59 @@ class Boundary: ...@@ -50,21 +55,59 @@ class Boundary:
class Neumann(Boundary): class Neumann(Boundary):
inner_or_boundary = False
single_link = True
def __call__(self, field, direction_symbol, **kwargs): def __call__(self, field, direction_symbol, **kwargs):
neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions) neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field[neighbor], field.center)] return [SympyAssignment(field.center, field[neighbor])]
else: else:
from itertools import product from itertools import product
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Neumann boundary works only for fields with fixed index shape") raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
index_iter = product(*(range(i) for i in field.index_shape)) index_iter = product(*(range(i) for i in field.index_shape))
return [Assignment(field[neighbor](*idx), field(*idx)) for idx in index_iter] return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
def __hash__(self): def __hash__(self):
# All boundaries of these class behave equal -> should also be equal # All boundaries of these class behave equal -> should also be equal
return hash("Neumann") return hash("Neumann")
def __eq__(self, other): def __eq__(self, other):
return type(other) == Neumann return type(other) is Neumann
class Dirichlet(Boundary):
inner_or_boundary = False
single_link = True
def __init__(self, value, name=None):
super().__init__(name)
self._value = value
@property
def additional_data(self):
if callable(self._value):
return [('value', create_type("double"))]
else:
return []
@property
def additional_data_init_callback(self):
if callable(self._value):
return self._value
def __call__(self, field, direction_symbol, index_field, **kwargs):
if field.index_dimensions == 0:
return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
elif field.index_dimensions == 1:
assert not self.additional_data
if not field.has_fixed_index_shape:
raise NotImplementedError("Field needs fixed index shape")
assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
from functools import lru_cache
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.assignment import Assignment
from pystencils import Field, TypedSymbol, create_indexed_kernel from pystencils import create_kernel, CreateKernelConfig, Target
from pystencils.backends.cbackend import CustomCppCode from pystencils.astnodes import SympyAssignment
from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object, create_boundary_index_array from pystencils.backends.cbackend import CustomCodeNode
from pystencils.cache import memorycache from pystencils.boundaries.createindexlist import (
from pystencils.data_types import create_type create_boundary_index_array, numpy_data_type_for_boundary_object)
from pystencils.typing import TypedSymbol, create_type
from pystencils.gpu.gpu_array_handler import GPUArrayHandler
from pystencils.field import Field
from pystencils.typing.typed_sympy import FieldPointerSymbol
try:
# noinspection PyPep8Naming
import waLBerla as wlb
if wlb.cpp_available:
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
import cupy.cuda.runtime
else:
ParallelDataHandling = None
except ImportError:
ParallelDataHandling = None
DEFAULT_FLAG_TYPE = np.uint32 DEFAULT_FLAG_TYPE = np.uint32
...@@ -18,11 +35,11 @@ class FlagInterface: ...@@ -18,11 +35,11 @@ class FlagInterface:
>>> dh = create_data_handling((4, 5)) >>> dh = create_data_handling((4, 5))
>>> fi = FlagInterface(dh, 'flag_field', np.uint8) >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
>>> assert dh.has_data('flag_field') >>> assert dh.has_data('flag_field')
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
2 2
>>> fi.reserve_flag(4) >>> int(fi.reserve_flag(4))
4 4
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
8 8
""" """
...@@ -51,13 +68,13 @@ class FlagInterface: ...@@ -51,13 +68,13 @@ class FlagInterface:
self._used_flags.add(flag) self._used_flags.add(flag)
assert self._is_power_of_2(flag) assert self._is_power_of_2(flag)
return flag return flag
raise ValueError("All available {} flags are reserved".format(self.max_bits)) raise ValueError(f"All available {self.max_bits} flags are reserved")
def reserve_flag(self, flag): def reserve_flag(self, flag):
assert self._is_power_of_2(flag) assert self._is_power_of_2(flag)
flag = self.dtype(flag) flag = self.dtype(flag)
if flag in self._used_flags: if flag in self._used_flags:
raise ValueError("The flag {flag} is already reserved".format(flag=flag)) raise ValueError(f"The flag {flag} is already reserved")
self._used_flags.add(flag) self._used_flags.add(flag)
return flag return flag
...@@ -69,9 +86,9 @@ class FlagInterface: ...@@ -69,9 +86,9 @@ class FlagInterface:
class BoundaryHandling: class BoundaryHandling:
def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None, def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
target='cpu', openmp=True): target: Target = Target.CPU, openmp=True):
assert data_handling.has_data(field_name) assert data_handling.has_data(field_name)
assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
self._data_handling = data_handling self._data_handling = data_handling
self._field_name = field_name self._field_name = field_name
self._index_array_name = name + "IndexArrays" self._index_array_name = name + "IndexArrays"
...@@ -83,8 +100,33 @@ class BoundaryHandling: ...@@ -83,8 +100,33 @@ class BoundaryHandling:
fi = flag_interface fi = flag_interface
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags") self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
gpu = self._target == 'gpu' if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
data_handling.add_custom_class(self._index_array_name, self.IndexFieldBlockData, cpu=True, gpu=gpu) array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
else:
array_handler = self.data_handling.array_handler
def to_cpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
array_handler.download(gpu_version[obj], cpu_arr)
def to_gpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
array_handler.upload(gpu_version[obj], cpu_arr)
else:
array_handler.upload(gpu_version[obj], cpu_arr)
class_ = self.IndexFieldBlockData
class_.to_cpu = to_cpu
class_.to_gpu = to_gpu
gpu = self._target in data_handling._GPU_LIKE_TARGETS
data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)
@property @property
def data_handling(self): def data_handling(self):
...@@ -200,13 +242,13 @@ class BoundaryHandling: ...@@ -200,13 +242,13 @@ class BoundaryHandling:
if self._dirty: if self._dirty:
self.prepare() self.prepare()
for b in self._data_handling.iterate(gpu=self._target == 'gpu'): for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
kwargs[self._field_name] = b[self._field_name] kwargs[self._field_name] = b[self._field_name]
kwargs['indexField'] = idx_arr kwargs['indexField'] = idx_arr
data_used_in_kernel = (p.field_name data_used_in_kernel = (p.fields[0].name
for p in self._boundary_object_to_boundary_info[b_obj].kernel.parameters for p in self._boundary_object_to_boundary_info[b_obj].kernel.parameters
if p.is_field_ptr_argument and p.field_name not in kwargs) if isinstance(p.symbol, FieldPointerSymbol) and p.fields[0].name not in kwargs)
kwargs.update({name: b[name] for name in data_used_in_kernel}) kwargs.update({name: b[name] for name in data_used_in_kernel})
self._boundary_object_to_boundary_info[b_obj].kernel(**kwargs) self._boundary_object_to_boundary_info[b_obj].kernel(**kwargs)
...@@ -215,14 +257,14 @@ class BoundaryHandling: ...@@ -215,14 +257,14 @@ class BoundaryHandling:
if self._dirty: if self._dirty:
self.prepare() self.prepare()
for b in self._data_handling.iterate(gpu=self._target == 'gpu'): for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
arguments = kwargs.copy() arguments = kwargs.copy()
arguments[self._field_name] = b[self._field_name] arguments[self._field_name] = b[self._field_name]
arguments['indexField'] = idx_arr arguments['indexField'] = idx_arr
data_used_in_kernel = (p.field_name data_used_in_kernel = (p.fields[0].name
for p in self._boundary_object_to_boundary_info[b_obj].kernel.parameters for p in self._boundary_object_to_boundary_info[b_obj].kernel.parameters
if p.is_field_ptr_argument and p.field_name not in arguments) if isinstance(p.symbol, FieldPointerSymbol) and p.field_name not in arguments)
arguments.update({name: b[name] for name in data_used_in_kernel if name not in arguments}) arguments.update({name: b[name] for name in data_used_in_kernel if name not in arguments})
kernel = self._boundary_object_to_boundary_info[b_obj].kernel kernel = self._boundary_object_to_boundary_info[b_obj].kernel
...@@ -232,11 +274,13 @@ class BoundaryHandling: ...@@ -232,11 +274,13 @@ class BoundaryHandling:
""" """
Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0 Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0
This can be used to display the simulation geometry in Paraview This can be used to display the simulation geometry in Paraview
:param file_name: vtk filename
:param boundaries: boundary object, or special string 'domain' for domain cells or special string 'all' for all Params:
boundary conditions. file_name: vtk filename
can also be a sequence, to write multiple boundaries to VTK file boundaries: boundary object, or special string 'domain' for domain cells or special string 'all' for all
:param ghost_layers: number of ghost layers to write, or True for all, False for none boundary conditions.
can also be a sequence, to write multiple boundaries to VTK file
ghost_layers: number of ghost layers to write, or True for all, False for none
""" """
if boundaries == 'all': if boundaries == 'all':
boundaries = list(self._boundary_object_to_boundary_info.keys()) + ['domain'] boundaries = list(self._boundary_object_to_boundary_info.keys()) + ['domain']
...@@ -271,7 +315,7 @@ class BoundaryHandling: ...@@ -271,7 +315,7 @@ class BoundaryHandling:
def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj): def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj):
return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj, return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj,
target=self._target, openmp=self._openmp) target=self._target, cpu_openmp=self._openmp)
def _create_index_fields(self): def _create_index_fields(self):
dh = self._data_handling dh = self._data_handling
...@@ -282,9 +326,11 @@ class BoundaryHandling: ...@@ -282,9 +326,11 @@ class BoundaryHandling:
index_array_bd = b[self._index_array_name] index_array_bd = b[self._index_array_name]
index_array_bd.clear() index_array_bd.clear()
for b_info in self._boundary_object_to_boundary_info.values(): for b_info in self._boundary_object_to_boundary_info.values():
boundary_obj = b_info.boundary_object
idx_arr = create_boundary_index_array(flag_arr, self.stencil, b_info.flag, idx_arr = create_boundary_index_array(flag_arr, self.stencil, b_info.flag,
self.flag_interface.domain_flag, b_info.boundary_object, self.flag_interface.domain_flag, boundary_obj,
ff_ghost_layers) ff_ghost_layers, boundary_obj.inner_or_boundary,
boundary_obj.single_link)
if idx_arr.size == 0: if idx_arr.size == 0:
continue continue
...@@ -296,7 +342,7 @@ class BoundaryHandling: ...@@ -296,7 +342,7 @@ class BoundaryHandling:
def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs): def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
if boundary_obj.additional_data_init_callback: if boundary_obj.additional_data_init_callback:
boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs) boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
if self._target == 'gpu': if self._target in self._data_handling._GPU_LIKE_TARGETS:
self._data_handling.to_gpu(self._index_array_name) self._data_handling.to_gpu(self._index_array_name)
class BoundaryInfo(object): class BoundaryInfo(object):
...@@ -306,7 +352,7 @@ class BoundaryHandling: ...@@ -306,7 +352,7 @@ class BoundaryHandling:
self.kernel = kernel self.kernel = kernel
class IndexFieldBlockData: class IndexFieldBlockData:
def __init__(self, *_1, **_2): def __init__(self, *args, **kwargs):
self.boundary_object_to_index_list = {} self.boundary_object_to_index_list = {}
self.boundary_object_to_data_setter = {} self.boundary_object_to_data_setter = {}
...@@ -314,24 +360,6 @@ class BoundaryHandling: ...@@ -314,24 +360,6 @@ class BoundaryHandling:
self.boundary_object_to_index_list.clear() self.boundary_object_to_index_list.clear()
self.boundary_object_to_data_setter.clear() self.boundary_object_to_data_setter.clear()
@staticmethod
def to_cpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.values():
gpu_version[obj].get(cpu_arr)
@staticmethod
def to_gpu(gpu_version, cpu_version):
from pycuda import gpuarray
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = gpuarray.to_gpu(cpu_arr)
else:
gpu_version[obj].set(cpu_arr)
class BoundaryDataSetter: class BoundaryDataSetter:
...@@ -353,30 +381,30 @@ class BoundaryDataSetter: ...@@ -353,30 +381,30 @@ class BoundaryDataSetter:
assert coord < self.dim assert coord < self.dim
return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5 return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
@memorycache() @lru_cache()
def link_offsets(self): def link_offsets(self):
return self.stencil[self.index_array['dir']] return self.stencil[self.index_array['dir']]
@memorycache() @lru_cache()
def link_positions(self, coord): def link_positions(self, coord):
return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
@memorycache() @lru_cache()
def boundary_cell_positions(self, coord): def boundary_cell_positions(self, coord):
return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
def __setitem__(self, key, value): def __setitem__(self, key, value):
if key not in self.boundary_data_names: if key not in self.boundary_data_names:
raise KeyError("Invalid boundary data name %s. Allowed are %s" % (key, self.boundary_data_names)) raise KeyError(f"Invalid boundary data name {key}. Allowed are {self.boundary_data_names}")
self.index_array[key] = value self.index_array[key] = value
def __getitem__(self, item): def __getitem__(self, item):
if item not in self.boundary_data_names: if item not in self.boundary_data_names:
raise KeyError("Invalid boundary data name %s. Allowed are %s" % (item, self.boundary_data_names)) raise KeyError(f"Invalid boundary data name {item}. Allowed are {self.boundary_data_names}")
return self.index_array[item] return self.index_array[item]
class BoundaryOffsetInfo(CustomCppCode): class BoundaryOffsetInfo(CustomCodeNode):
# --------------------------- Functions to be used by boundaries -------------------------- # --------------------------- Functions to be used by boundaries --------------------------
...@@ -398,29 +426,30 @@ class BoundaryOffsetInfo(CustomCppCode): ...@@ -398,29 +426,30 @@ class BoundaryOffsetInfo(CustomCppCode):
code = "\n" code = "\n"
for i in range(dim): for i in range(dim):
offset_str = ", ".join([str(d[i]) for d in stencil]) offset_str = ", ".join([str(d[i]) for d in stencil])
code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str) code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
inv_dirs = [] inv_dirs = []
for direction in stencil: for direction in stencil:
inverse_dir = tuple([-i for i in direction]) inverse_dir = tuple([-i for i in direction])
inv_dirs.append(str(stencil.index(inverse_dir))) inv_dirs.append(str(stencil.index(inverse_dir)))
code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs)) code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
offset_symbols = BoundaryOffsetInfo._offset_symbols(dim) offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(), super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL])) symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
@staticmethod @staticmethod
def _offset_symbols(dim): def _offset_symbols(dim):
return [TypedSymbol("c%s" % (d,), create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]] return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
INV_DIR_SYMBOL = TypedSymbol("invdir", "int") INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', openmp=True): def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
elements = [BoundaryOffsetInfo(stencil)] elements = [BoundaryOffsetInfo(stencil)]
index_arr_dtype = index_field.dtype.numpy_dtype dir_symbol = TypedSymbol("dir", np.int32)
dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0]) elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
elements += [Assignment(dir_symbol, index_field[0]('dir'))]
elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field) elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
return create_indexed_kernel(elements, [index_field], target=target, cpu_openmp=openmp) config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
**kernel_creation_args)
return create_kernel(elements, config=config)
import warnings
import numpy as np
try:
import pyximport
pyximport.install(language_level=3)
cython_funcs_available = True
except ImportError:
cython_funcs_available = False
if cython_funcs_available:
from pystencils.boundaries.createindexlistcython import (
create_boundary_neighbor_index_list_2d,
create_boundary_neighbor_index_list_3d,
create_boundary_cell_index_list_2d,
create_boundary_cell_index_list_3d,
)
boundary_index_array_coordinate_names = ["x", "y", "z"]
direction_member_name = "dir"
default_index_array_dtype = np.int32
def numpy_data_type_for_boundary_object(boundary_object, dim):
coordinate_names = boundary_index_array_coordinate_names[:dim]
return np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
+ [(direction_member_name, default_index_array_dtype)]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
align=True,
)
def _create_index_list_python(
flag_field_arr,
boundary_mask,
fluid_mask,
stencil,
single_link,
inner_or_boundary=False,
nr_of_ghost_layers=None,
):
if inner_or_boundary and nr_of_ghost_layers is None:
raise ValueError(
"If inner_or_boundary is set True the number of ghost layers "
"around the inner domain has to be specified"
)
if nr_of_ghost_layers is None:
nr_of_ghost_layers = 0
coordinate_names = boundary_index_array_coordinate_names[
: len(flag_field_arr.shape)
]
index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
+ [(direction_member_name, default_index_array_dtype)]
)
# boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
# have to be sorted.
boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
for i in range(len(flag_field_arr.shape)):
boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
# First a set is created to save all fluid cells which are near boundary
fluid_cells = set()
for cell in boundary_cells:
cell = tuple(cell)
for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple(
[cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
)
# prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
if any(
not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue
if flag_field_arr[neighbor_cell] & fluid_mask:
fluid_cells.add(neighbor_cell)
# then this is set is transformed to a list to make it sortable. This ensures continoous memory access later.
fluid_cells = list(fluid_cells)
if len(flag_field_arr.shape) == 3:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
else:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
checkmask = boundary_mask if inner_or_boundary else fluid_mask
result = []
for cell in cells_to_iterate:
cell = tuple(cell)
sum_cells = np.zeros(len(cell))
for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple(
[cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
)
# prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
if any(
not 0 <= e < upper
for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue
if flag_field_arr[neighbor_cell] & checkmask:
if single_link:
sum_cells += np.array(direction)
else:
result.append(tuple(cell) + (dir_idx,))
# the discrete normal direction is the one which gives the maximum inner product to the stencil direction
if single_link and any(sum_cells != 0):
idx = np.argmax(np.inner(sum_cells, stencil))
result.append(tuple(cell) + (idx,))
return np.array(result, dtype=index_arr_dtype)
def create_boundary_index_list(
flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
"""Creates a numpy array storing links (connections) between domain cells and boundary cells.
Args:
flag_field: flag integer array where boundary and domain cells are marked (interpreted as bit vector)
stencil: list of directions, for possible links. When single_link is set to true the order matters, because
then only the first link is added to the list
boundary_mask: cells where (cell & mask) is true are considered boundary cells
fluid_mask: cells where (cell & mask) is true are considered fluid/inner cells cells
nr_of_ghost_layers: only relevant if neighbors is True
inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
if false the boundary cells are listed
single_link: if true only the link in normal direction to this cell is reported
"""
dim = len(flag_field.shape)
coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
+ [(direction_member_name, default_index_array_dtype)]
)
stencil = np.array(stencil, dtype=default_index_array_dtype)
args = (
flag_field,
nr_of_ghost_layers,
boundary_mask,
fluid_mask,
stencil,
single_link,
)
args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
if cython_funcs_available:
if dim == 2:
if inner_or_boundary:
idx_list = create_boundary_neighbor_index_list_2d(*args)
else:
idx_list = create_boundary_cell_index_list_2d(*args_no_gl)
elif dim == 3:
if inner_or_boundary:
idx_list = create_boundary_neighbor_index_list_3d(*args)
else:
idx_list = create_boundary_cell_index_list_3d(*args_no_gl)
else:
raise ValueError("Flag field has to be a 2 or 3 dimensional numpy array")
return np.array(idx_list, dtype=index_arr_dtype)
else:
if flag_field.size > 1e6:
warnings.warn(
"Boundary setup may take very long! Consider installing cython to speed it up"
)
return _create_index_list_python(
*args_no_gl,
inner_or_boundary=inner_or_boundary,
nr_of_ghost_layers=nr_of_ghost_layers,
)
def create_boundary_index_array(
flag_field,
stencil,
boundary_mask,
fluid_mask,
boundary_object,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
idx_array = create_boundary_index_list(
flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers,
inner_or_boundary,
single_link,
)
dim = len(flag_field.shape)
if boundary_object.additional_data:
coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
for prop in coordinate_names + ["dir"]:
extended_idx_field[prop] = idx_array[prop]
idx_array = extended_idx_field
return idx_array
# cython: language_level=3str
import cython
ctypedef fused IntegerType:
short
int
long
long long
unsigned short
unsigned int
unsigned long
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_field,
int nr_of_ghost_layers, IntegerType boundary_mask, IntegerType fluid_mask,
object[int, ndim=2] stencil, int single_link):
cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape
boundary_index_list = []
num_directions = stencil.shape[0]
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & fluid_mask:
for dirIdx in range(num_directions):
dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
if flag_field[x + dx, y + dy] & boundary_mask:
if single_link:
sum_x += dx; sum_y += dy;
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_field,
int nr_of_ghost_layers, IntegerType boundary_mask, IntegerType fluid_mask,
object[int, ndim=2] stencil, int single_link):
cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape
boundary_index_list = []
num_directions = stencil.shape[0]
for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & fluid_mask:
for dirIdx in range(num_directions):
dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
if single_link:
sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx * sum_x + dy * sum_y + dz * sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
IntegerType boundary_mask, IntegerType fluid_mask,
object[int, ndim=2] stencil, int single_link):
cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape
boundary_index_list = []
num_directions = stencil.shape[0]
for y in range(0, ys):
for x in range(0, xs):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & boundary_mask:
for dirIdx in range(num_directions):
dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
if 0 <= x + dx < xs and 0 <= y + dy < ys:
if flag_field[x + dx, y + dy] & fluid_mask:
if single_link:
sum_x += dx; sum_y += dy
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
IntegerType boundary_mask, IntegerType fluid_mask,
object[int, ndim=2] stencil, int single_link):
cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape
boundary_index_list = []
num_directions = stencil.shape[0]
for z in range(0, zs):
for y in range(0, ys):
for x in range(0, xs):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & boundary_mask:
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
if single_link:
sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx=0
if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx*sum_x + dy*sum_y + dz*sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list
\ No newline at end of file
import sympy as sp import sympy as sp
from pystencils import Field, TypedSymbol
from pystencils.integer_functions import bitwise_and
from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
from pystencils.data_types import create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.field import Field
from pystencils.integer_functions import bitwise_and
def add_neumann_boundary(eqs, fields, flag_field, boundary_flag="neumann_flag", inverse_flag=False): def add_neumann_boundary(eqs, fields, flag_field, boundary_flag="neumann_flag", inverse_flag=False):
""" """
Replaces all neighbor accesses by flag field guarded accesses. Replaces all neighbor accesses by flag field guarded accesses.
If flag in neighboring cell is set, the center value is used instead If flag in neighboring cell is set, the center value is used instead
:param eqs: list of equations containing field accesses to direct neighbors
:param fields: fields for which the Neumann boundary should be applied Args:
:param flag_field: integer field marking boundary cells eqs: list of equations containing field accesses to direct neighbors
:param boundary_flag: if flag field has value 'boundary_flag' (no bit operations yet) fields: fields for which the Neumann boundary should be applied
the cell is assumed to be boundary flag_field: integer field marking boundary cells
:param inverse_flag: if true, boundary cells are where flag field has not the value of boundary_flag boundary_flag: if flag field has value 'boundary_flag' (no bit operations yet)
:return: list of equations with guarded field accesses the cell is assumed to be boundary
inverse_flag: if true, boundary cells are where flag field has not the value of boundary_flag
Returns:
list of equations with guarded field accesses
""" """
if not hasattr(fields, "__len__"): if not hasattr(fields, "__len__"):
fields = [fields] fields = [fields]
......
import os
from collections.abc import Hashable
from functools import partial, wraps
from itertools import chain
from functools import lru_cache as memorycache
from joblib import Memory
from appdirs import user_cache_dir
if 'PYSTENCILS_CACHE_DIR' in os.environ:
cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
else:
cache_dir = user_cache_dir('pystencils')
disk_cache = Memory(cache_dir, verbose=False).cache
disk_cache_no_fallback = disk_cache
def _wrapper(wrapped_func, cached_func, *args, **kwargs):
if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
return cached_func(*args, **kwargs)
else:
return wrapped_func(*args, **kwargs)
def memorycache_if_hashable(maxsize=128, typed=False):
def wrapper(func):
return partial(_wrapper, func, memorycache(maxsize, typed)(func))
return wrapper
def sharedmethodcache(cache_id: str):
"""Decorator for memoization of instance methods, allowing multiple methods to use the same cache.
This decorator caches results of instance methods per instantiated object of the surrounding class.
It allows multiple methods to use the same cache, by passing them the same `cache_id` string.
Cached values are stored in a dictionary, which is added as a member `self.<cache_id>` to the
`self` object instance. Make sure that this doesn't cause any naming conflicts with other members!
Of course, for this to be useful, said methods must have the same signature (up to additional kwargs)
and must return the same result when called with the same arguments."""
def _decorator(user_method):
@wraps(user_method)
def _decorated_func(self, *args, **kwargs):
objdict = self.__dict__
cache = objdict.setdefault(cache_id, dict())
key = args
for item in kwargs.items():
key += item
if key not in cache:
result = user_method(self, *args, **kwargs)
cache[key] = result
return result
else:
return cache[key]
return _decorated_func
return _decorator
def clear_cache():
"""
Clears the pystencils cache created by joblib.
"""
memory = Memory(cache_dir, verbose=0)
memory.clear(warn=False)
# Disable memory cache:
# disk_cache = lambda o: o
# disk_cache_no_fallback = lambda o: o