Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 1270 additions and 482 deletions
import numpy as np import numpy as np
from pystencils.data_types import BasicType
def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True): def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
...@@ -21,20 +20,26 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o ...@@ -21,20 +20,26 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size, from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
get_vector_instruction_set) get_vector_instruction_set)
type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
instruction_sets = get_supported_instruction_sets() instruction_sets = get_supported_instruction_sets()
if instruction_sets is None: if instruction_sets is None:
byte_alignment = 64 byte_alignment = 64
elif byte_alignment == 'cacheline': elif byte_alignment == 'cacheline':
cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets] cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
if all([s is None for s in cacheline_sizes]): if all([s is None for s in cacheline_sizes]) or \
byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize max([s for s in cacheline_sizes if s is not None]) > 0x100000:
for is_name in instruction_sets]) widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
for is_name in instruction_sets
if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
byte_alignment = 64 if all([s is None for s in widths]) else max(widths)
else: else:
byte_alignment = max([s for s in cacheline_sizes if s is not None]) byte_alignment = max([s for s in cacheline_sizes if s is not None])
elif not any([type(get_vector_instruction_set(dtype, is_name)['width']) is int
for is_name in instruction_sets]):
byte_alignment = 64
else: else:
byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize byte_alignment = max([get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
for is_name in instruction_sets]) for is_name in instruction_sets
if type(get_vector_instruction_set(dtype, is_name)['width']) is int])
if (not align_inner_coordinate) or (not hasattr(shape, '__len__')): if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
size = np.prod(shape) size = np.prod(shape)
d = np.dtype(dtype) d = np.dtype(dtype)
...@@ -72,7 +77,7 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o ...@@ -72,7 +77,7 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
return tmp return tmp
def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True): def aligned_zeros(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset, arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate) order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
x = np.zeros((), arr.dtype) x = np.zeros((), arr.dtype)
...@@ -80,7 +85,7 @@ def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order= ...@@ -80,7 +85,7 @@ def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order=
return arr return arr
def aligned_ones(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True): def aligned_ones(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset, arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate) order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
x = np.ones((), arr.dtype) x = np.ones((), arr.dtype)
......
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from sympy.codegen.ast import Assignment from sympy.codegen.ast import Assignment, AugmentedAssignment, AddAugmentedAssignment
from sympy.printing.latex import LatexPrinter from sympy.printing.latex import LatexPrinter
__all__ = ['Assignment', 'assignment_from_stencil'] __all__ = ['Assignment', 'AugmentedAssignment', 'AddAugmentedAssignment', 'assignment_from_stencil']
def print_assignment_latex(printer, expr): def print_assignment_latex(printer, expr):
binop = f"{expr.binop}=" if isinstance(expr, AugmentedAssignment) else ''
"""sympy cannot print Assignments as Latex. Thus, this function is added to the sympy Latex printer""" """sympy cannot print Assignments as Latex. Thus, this function is added to the sympy Latex printer"""
printed_lhs = printer.doprint(expr.lhs) printed_lhs = printer.doprint(expr.lhs)
printed_rhs = printer.doprint(expr.rhs) printed_rhs = printer.doprint(expr.rhs)
return r"{printed_lhs} \leftarrow {printed_rhs}".format(printed_lhs=printed_lhs, printed_rhs=printed_rhs) return fr"{printed_lhs} \leftarrow_{{{binop}}} {printed_rhs}"
def assignment_str(assignment): def assignment_str(assignment):
return r"{lhs} ← {rhs}".format(lhs=assignment.lhs, rhs=assignment.rhs) op = f"{assignment.binop}=" if isinstance(assignment, AugmentedAssignment) else ''
return fr"{assignment.lhs} {op} {assignment.rhs}"
_old_new = sp.codegen.ast.Assignment.__new__ _old_new = sp.codegen.ast.Assignment.__new__
# TODO Typing Part2 add default type, defult_float_type, default_int_type and use sane defaults
def _Assignment__new__(cls, lhs, rhs, *args, **kwargs): def _Assignment__new__(cls, lhs, rhs, *args, **kwargs):
if isinstance(lhs, (list, tuple, sp.Matrix)) and isinstance(rhs, (list, tuple, sp.Matrix)): if isinstance(lhs, (list, tuple, sp.Matrix)) and isinstance(rhs, (list, tuple, sp.Matrix)):
assert len(lhs) == len(rhs), f'{lhs} and {rhs} must have same length when performing vector assignment!' assert len(lhs) == len(rhs), f'{lhs} and {rhs} must have same length when performing vector assignment!'
...@@ -31,20 +34,10 @@ Assignment.__str__ = assignment_str ...@@ -31,20 +34,10 @@ Assignment.__str__ = assignment_str
Assignment.__new__ = _Assignment__new__ Assignment.__new__ = _Assignment__new__
LatexPrinter._print_Assignment = print_assignment_latex LatexPrinter._print_Assignment = print_assignment_latex
sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self)) AugmentedAssignment.__str__ = assignment_str
LatexPrinter._print_AugmentedAssignment = print_assignment_latex
# Apparently, in SymPy 1.4 Assignment.__hash__ is not implemented. This has been fixed in current master
try:
sympy_version = sp.__version__.split('.')
if int(sympy_version[0]) <= 1 and int(sympy_version[1]) <= 4: sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self))
def hash_fun(self):
return hash((self.lhs, self.rhs))
Assignment.__hash__ = hash_fun
except Exception:
pass
def assignment_from_stencil(stencil_array, input_field, output_field, def assignment_from_stencil(stencil_array, input_field, output_field,
......
...@@ -5,11 +5,12 @@ from typing import Any, List, Optional, Sequence, Set, Union ...@@ -5,11 +5,12 @@ from typing import Any, List, Optional, Sequence, Set, Union
import sympy as sp import sympy as sp
import pystencils from pystencils.assignment import Assignment
from pystencils.data_types import TypedImaginaryUnit, TypedSymbol, cast_func, create_type from pystencils.enums import Target, Backend
from pystencils.field import Field from pystencils.field import Field
from pystencils.kernelparameters import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol
from pystencils.sympyextensions import fast_subs from pystencils.sympyextensions import fast_subs
from pystencils.typing import (create_type, get_next_parent_of_type,
FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol, CFunction)
NodeOrExpr = Union['Node', sp.Expr] NodeOrExpr = Union['Node', sp.Expr]
...@@ -136,7 +137,6 @@ class Conditional(Node): ...@@ -136,7 +137,6 @@ class Conditional(Node):
class KernelFunction(Node): class KernelFunction(Node):
class Parameter: class Parameter:
"""Function parameter. """Function parameter.
...@@ -176,7 +176,9 @@ class KernelFunction(Node): ...@@ -176,7 +176,9 @@ class KernelFunction(Node):
def field_name(self): def field_name(self):
return self.fields[0].name return self.fields[0].name
def __init__(self, body, target, backend, compile_function, ghost_layers, function_name="kernel", assignments=None): def __init__(self, body, target: Target, backend: Backend, compile_function, ghost_layers,
function_name: str = "kernel",
assignments=None):
super(KernelFunction, self).__init__() super(KernelFunction, self).__init__()
self._body = body self._body = body
body.parent = self body.parent = self
...@@ -191,15 +193,19 @@ class KernelFunction(Node): ...@@ -191,15 +193,19 @@ class KernelFunction(Node):
# function that compiles the node to a Python callable, is set by the backends # function that compiles the node to a Python callable, is set by the backends
self._compile_function = compile_function self._compile_function = compile_function
self.assignments = assignments self.assignments = assignments
# If nontemporal stores are activated together with the Neon instruction set it results in cacheline zeroing
# For cacheline zeroing the information of the field size for each field is needed. Thus, in this case
# all field sizes are kernel parameters and not just the common field size used for the loops
self.use_all_written_field_sizes = False
@property @property
def target(self): def target(self):
"""Currently either 'cpu' or 'gpu' """ """See pystencils.Target"""
return self._target return self._target
@property @property
def backend(self): def backend(self):
"""Backend for generating the code e.g. 'llvm', 'c', 'cuda' """ """Backend for generating the code: `Backend`"""
return self._backend return self._backend
@property @property
...@@ -226,13 +232,13 @@ class KernelFunction(Node): ...@@ -226,13 +232,13 @@ class KernelFunction(Node):
@property @property
def fields_accessed(self) -> Set[Field]: def fields_accessed(self) -> Set[Field]:
"""Set of Field instances: fields which are accessed inside this kernel function""" """Set of Field instances: fields which are accessed inside this kernel function"""
from pystencils.interpolation_astnodes import InterpolatorAccess return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess)))
return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess), self.atoms(InterpolatorAccess)))
@property @property
def fields_written(self) -> Set[Field]: def fields_written(self) -> Set[Field]:
assignments = self.atoms(SympyAssignment) assignments = self.atoms(SympyAssignment)
return {a.lhs.field for a in assignments if isinstance(a.lhs, ResolvedFieldAccess)} return set().union(itertools.chain.from_iterable([f.field for f in a.lhs.free_symbols if hasattr(f, 'field')]
for a in assignments))
@property @property
def fields_read(self) -> Set[Field]: def fields_read(self) -> Set[Field]:
...@@ -246,6 +252,11 @@ class KernelFunction(Node): ...@@ -246,6 +252,11 @@ class KernelFunction(Node):
This function is expensive, cache the result where possible! This function is expensive, cache the result where possible!
""" """
field_map = {f.name: f for f in self.fields_accessed} field_map = {f.name: f for f in self.fields_accessed}
sizes = set()
if self.use_all_written_field_sizes:
sizes = set().union(*(a.shape[:a.spatial_dimensions] for a in self.fields_written))
sizes = filter(lambda s: isinstance(s, FieldShapeSymbol), sizes)
def get_fields(symbol): def get_fields(symbol):
if hasattr(symbol, 'field_name'): if hasattr(symbol, 'field_name'):
...@@ -255,9 +266,13 @@ class KernelFunction(Node): ...@@ -255,9 +266,13 @@ class KernelFunction(Node):
return () return ()
argument_symbols = self._body.undefined_symbols - self.global_variables argument_symbols = self._body.undefined_symbols - self.global_variables
argument_symbols.update(sizes)
parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols] parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
if hasattr(self, 'indexing'): if hasattr(self, 'indexing'):
parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()] parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
# Exclude paramters of type CFunction. These parameters will result in a C function call that will be handled
# by including a respective header file in the compute kernel. Hence, it is not a free parameter.
parameters = [p for p in parameters if not isinstance(p.symbol, CFunction)]
parameters.sort(key=lambda p: p.symbol.name) parameters.sort(key=lambda p: p.symbol.name)
return parameters return parameters
...@@ -291,8 +306,10 @@ class SkipIteration(Node): ...@@ -291,8 +306,10 @@ class SkipIteration(Node):
class Block(Node): class Block(Node):
def __init__(self, nodes: List[Node]): def __init__(self, nodes: Union[Node, List[Node]]):
super(Block, self).__init__() super(Block, self).__init__()
if not isinstance(nodes, list):
nodes = [nodes]
self._nodes = nodes self._nodes = nodes
self.parent = None self.parent = None
for n in self._nodes: for n in self._nodes:
...@@ -331,14 +348,6 @@ class Block(Node): ...@@ -331,14 +348,6 @@ class Block(Node):
assert self._nodes.count(insert_before) == 1 assert self._nodes.count(insert_before) == 1
idx = self._nodes.index(insert_before) idx = self._nodes.index(insert_before)
# move all assignment (definitions to the top)
if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
while idx > 0:
pn = self._nodes[idx - 1]
if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
idx -= 1
else:
break
if not if_not_exists or self._nodes[idx] != new_node: if not if_not_exists or self._nodes[idx] != new_node:
self._nodes.insert(idx, new_node) self._nodes.insert(idx, new_node)
...@@ -347,14 +356,6 @@ class Block(Node): ...@@ -347,14 +356,6 @@ class Block(Node):
assert self._nodes.count(insert_after) == 1 assert self._nodes.count(insert_after) == 1
idx = self._nodes.index(insert_after) + 1 idx = self._nodes.index(insert_after) + 1
# move all assignment (definitions to the top)
if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
while idx > 0:
pn = self._nodes[idx - 1]
if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
idx -= 1
else:
break
if not if_not_exists or not (self._nodes[idx - 1] == new_node if not if_not_exists or not (self._nodes[idx - 1] == new_node
or (idx < len(self._nodes) and self._nodes[idx] == new_node)): or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
self._nodes.insert(idx, new_node) self._nodes.insert(idx, new_node)
...@@ -389,7 +390,7 @@ class Block(Node): ...@@ -389,7 +390,7 @@ class Block(Node):
def symbols_defined(self): def symbols_defined(self):
result = set() result = set()
for a in self.args: for a in self.args:
if isinstance(a, pystencils.Assignment): if isinstance(a, Assignment):
result.update(a.free_symbols) result.update(a.free_symbols)
else: else:
result.update(a.symbols_defined) result.update(a.symbols_defined)
...@@ -400,7 +401,7 @@ class Block(Node): ...@@ -400,7 +401,7 @@ class Block(Node):
result = set() result = set()
defined_symbols = set() defined_symbols = set()
for a in self.args: for a in self.args:
if isinstance(a, pystencils.Assignment): if isinstance(a, Assignment):
result.update(a.free_symbols) result.update(a.free_symbols)
defined_symbols.update({a.lhs}) defined_symbols.update({a.lhs})
else: else:
...@@ -430,7 +431,7 @@ class LoopOverCoordinate(Node): ...@@ -430,7 +431,7 @@ class LoopOverCoordinate(Node):
LOOP_COUNTER_NAME_PREFIX = "ctr" LOOP_COUNTER_NAME_PREFIX = "ctr"
BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr" BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False): def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False, custom_loop_ctr=None):
super(LoopOverCoordinate, self).__init__(parent=None) super(LoopOverCoordinate, self).__init__(parent=None)
self.body = body self.body = body
body.parent = self body.parent = self
...@@ -441,11 +442,12 @@ class LoopOverCoordinate(Node): ...@@ -441,11 +442,12 @@ class LoopOverCoordinate(Node):
self.body.parent = self self.body.parent = self
self.prefix_lines = [] self.prefix_lines = []
self.is_block_loop = is_block_loop self.is_block_loop = is_block_loop
self.custom_loop_ctr = custom_loop_ctr
def new_loop_with_different_body(self, new_body): def new_loop_with_different_body(self, new_body):
result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop, result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
self.step, self.is_block_loop) self.step, self.is_block_loop, self.custom_loop_ctr)
result.prefix_lines = [l for l in self.prefix_lines] result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines]
return result return result
def subs(self, subs_dict): def subs(self, subs_dict):
...@@ -507,10 +509,13 @@ class LoopOverCoordinate(Node): ...@@ -507,10 +509,13 @@ class LoopOverCoordinate(Node):
@property @property
def loop_counter_name(self): def loop_counter_name(self):
if self.is_block_loop: if self.custom_loop_ctr:
return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over) return self.custom_loop_ctr.name
else: else:
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over) if self.is_block_loop:
return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
else:
return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
@staticmethod @staticmethod
def is_loop_counter_symbol(symbol): def is_loop_counter_symbol(symbol):
...@@ -534,14 +539,16 @@ class LoopOverCoordinate(Node): ...@@ -534,14 +539,16 @@ class LoopOverCoordinate(Node):
@property @property
def loop_counter_symbol(self): def loop_counter_symbol(self):
if self.is_block_loop: if self.custom_loop_ctr:
return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over) return self.custom_loop_ctr
else: else:
return self.get_loop_counter_symbol(self.coordinate_to_loop_over) if self.is_block_loop:
return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
else:
return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
@property @property
def is_outermost_loop(self): def is_outermost_loop(self):
from pystencils.transformations import get_next_parent_of_type
return get_next_parent_of_type(self, LoopOverCoordinate) is None return get_next_parent_of_type(self, LoopOverCoordinate) is None
@property @property
...@@ -564,13 +571,14 @@ class SympyAssignment(Node): ...@@ -564,13 +571,14 @@ class SympyAssignment(Node):
def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False): def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False):
super(SympyAssignment, self).__init__(parent=None) super(SympyAssignment, self).__init__(parent=None)
self._lhs_symbol = sp.sympify(lhs_symbol) self._lhs_symbol = sp.sympify(lhs_symbol)
self.rhs = sp.sympify(rhs_expr) self._rhs = sp.sympify(rhs_expr)
self._is_const = is_const self._is_const = is_const
self._is_declaration = self.__is_declaration() self._is_declaration = self.__is_declaration()
self.use_auto = use_auto self._use_auto = use_auto
def __is_declaration(self): def __is_declaration(self):
if isinstance(self._lhs_symbol, cast_func): from pystencils.typing import CastFunc
if isinstance(self._lhs_symbol, CastFunc):
return False return False
if any(isinstance(self._lhs_symbol, c) for c in (Field.Access, sp.Indexed, TemporaryMemoryAllocation)): if any(isinstance(self._lhs_symbol, c) for c in (Field.Access, sp.Indexed, TemporaryMemoryAllocation)):
return False return False
...@@ -580,15 +588,28 @@ class SympyAssignment(Node): ...@@ -580,15 +588,28 @@ class SympyAssignment(Node):
def lhs(self): def lhs(self):
return self._lhs_symbol return self._lhs_symbol
@property
def rhs(self):
return self._rhs
@lhs.setter @lhs.setter
def lhs(self, new_value): def lhs(self, new_value):
self._lhs_symbol = new_value self._lhs_symbol = new_value
self._is_declaration = self.__is_declaration() self._is_declaration = self.__is_declaration()
@rhs.setter
def rhs(self, new_rhs_expr):
self._rhs = new_rhs_expr
def subs(self, subs_dict): def subs(self, subs_dict):
self.lhs = fast_subs(self.lhs, subs_dict) self.lhs = fast_subs(self.lhs, subs_dict)
self.rhs = fast_subs(self.rhs, subs_dict) self.rhs = fast_subs(self.rhs, subs_dict)
def fast_subs(self, subs_dict, skip=None):
self.lhs = fast_subs(self.lhs, subs_dict, skip)
self.rhs = fast_subs(self.rhs, subs_dict, skip)
return self
def optimize(self, optimizations): def optimize(self, optimizations):
try: try:
from sympy.codegen.rewriting import optimize from sympy.codegen.rewriting import optimize
...@@ -598,7 +619,7 @@ class SympyAssignment(Node): ...@@ -598,7 +619,7 @@ class SympyAssignment(Node):
@property @property
def args(self): def args(self):
return [self._lhs_symbol, self.rhs, sp.sympify(self._is_const)] return [self._lhs_symbol, self.rhs]
@property @property
def symbols_defined(self): def symbols_defined(self):
...@@ -615,9 +636,10 @@ class SympyAssignment(Node): ...@@ -615,9 +636,10 @@ class SympyAssignment(Node):
if isinstance(symbol, Field.Access): if isinstance(symbol, Field.Access):
for i in range(len(symbol.offsets)): for i in range(len(symbol.offsets)):
loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i)) loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i))
result = {r for r in result if not isinstance(r, TypedImaginaryUnit)}
result.update(loop_counters) result.update(loop_counters)
result.update(self._lhs_symbol.atoms(sp.Symbol)) result.update(self._lhs_symbol.atoms(sp.Symbol))
return result return result
@property @property
...@@ -628,6 +650,10 @@ class SympyAssignment(Node): ...@@ -628,6 +650,10 @@ class SympyAssignment(Node):
def is_const(self): def is_const(self):
return self._is_const return self._is_const
@property
def use_auto(self):
return self._use_auto
def replace(self, child, replacement): def replace(self, child, replacement):
if child == self.lhs: if child == self.lhs:
replacement.parent = self replacement.parent = self
...@@ -650,7 +676,7 @@ class SympyAssignment(Node): ...@@ -650,7 +676,7 @@ class SympyAssignment(Node):
return hash((self.lhs, self.rhs)) return hash((self.lhs, self.rhs))
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs) return type(self) is type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
class ResolvedFieldAccess(sp.Indexed): class ResolvedFieldAccess(sp.Indexed):
...@@ -692,6 +718,9 @@ class ResolvedFieldAccess(sp.Indexed): ...@@ -692,6 +718,9 @@ class ResolvedFieldAccess(sp.Indexed):
def __getnewargs__(self): def __getnewargs__(self):
return self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values return self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values
def __getnewargs_ex__(self):
return (self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values), {}
class TemporaryMemoryAllocation(Node): class TemporaryMemoryAllocation(Node):
"""Node for temporary memory buffer allocation. """Node for temporary memory buffer allocation.
...@@ -837,46 +866,5 @@ class ConditionalFieldAccess(sp.Function): ...@@ -837,46 +866,5 @@ class ConditionalFieldAccess(sp.Function):
def __getnewargs__(self): def __getnewargs__(self):
return self.access, self.outofbounds_condition, self.outofbounds_value return self.access, self.outofbounds_condition, self.outofbounds_value
def __getnewargs_ex__(self):
class NontemporalFence(Node): return (self.access, self.outofbounds_condition, self.outofbounds_value), {}
def __init__(self):
super(NontemporalFence, self).__init__(parent=None)
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, NontemporalFence)
class CachelineSize(Node):
symbol = sp.Symbol("_clsize")
mask_symbol = sp.Symbol("_clsize_mask")
last_symbol = sp.Symbol("_cl_lastvec")
def __init__(self):
super(CachelineSize, self).__init__(parent=None)
@property
def symbols_defined(self):
return set([self.symbol, self.mask_symbol, self.last_symbol])
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, CachelineSize)
...@@ -6,9 +6,3 @@ try: ...@@ -6,9 +6,3 @@ try:
__all__.append('print_dot') __all__.append('print_dot')
except ImportError: except ImportError:
pass pass
try:
from .llvm import generate_llvm # NOQA
__all__.append('generate_llvm')
except ImportError:
pass
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, first=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
if first:
arg_string += first + ', '
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
raise NotImplementedError(instruction_set)
if instruction_set in ['sve', 'sve2', 'sme']:
cmp = 'cmp'
elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
cmp = 'cmp'
bitwidth = int(instruction_set[4:])
elif instruction_set.startswith('sve'):
cmp = 'cmp'
bitwidth = int(instruction_set[3:])
elif instruction_set == 'neon':
cmp = 'c'
bitwidth = 128
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'sqrt': 'sqrt[0]',
'loadU': 'ld1[0]',
'storeU': 'st1[0, 1]',
'abs': 'abs[0]',
'==': f'{cmp}eq[0, 1]',
'!=': f'{cmp}eq[0, 1]',
'<=': f'{cmp}le[0, 1]',
'<': f'{cmp}lt[0, 1]',
'>=': f'{cmp}ge[0, 1]',
'>': f'{cmp}gt[0, 1]',
}
bits = {'double': 64,
'float': 32,
'int': 32}
result = dict()
if instruction_set in ['sve', 'sve2', 'sme']:
width = 'svcntd()' if data_type == 'double' else 'svcntw()'
intwidth = 'svcntw()'
result['bytes'] = 'svcntb()'
else:
width = bitwidth // bits[data_type]
intwidth = bitwidth // bits['int']
result['bytes'] = bitwidth // 8
if instruction_set.startswith('sve') or instruction_set == 'sme':
base_names['stream'] = 'stnt1[0, 1]'
prefix = 'sv'
suffix = f'_f{bits[data_type]}'
elif instruction_set == 'neon':
prefix = 'v'
suffix = f'q_f{bits[data_type]}'
if instruction_set in ['sve', 'sve2', 'sme']:
predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
else:
predicate = f'{prefix}whilelt_b{bits[data_type]}(0, {width})'
int_predicate = f'{prefix}whilelt_b{bits["int"]}(0, {intwidth})'
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
arg_string = get_argument_string(function_shortcut, first=predicate if prefix == 'sv' else '')
if prefix == 'sv' and not name.startswith('ld') and not name.startswith('st') and not name.startswith(cmp):
undef = '_x'
else:
undef = ''
result[intrinsic_id] = prefix + name + suffix + undef + arg_string
if instruction_set in ['sve', 'sve2', 'sme']:
result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int")
else:
result['width'] = width
result['intwidth'] = intwidth
if instruction_set.startswith('sve') or instruction_set == 'sme':
result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
if instruction_set != 'sme':
vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format("{2}") + ', {1})'
result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format("{1}") + ')'
if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']
result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
result['blendv'] = f'svsel_f{bits[data_type]}' + '({2}, {1}, {0})'
result['any'] = f'svptest_any({predicate}, {{0}})'
result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
result['maskStream'] = result['stream'].replace(predicate, '{2}')
if instruction_set != 'sme':
result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')
result['streamFence'] = '__dmb(15)'
if instruction_set == 'sme':
result['function_prefix'] = '__attribute__((arm_locally_streaming))'
elif instruction_set not in ['sve', 'sve2', 'sme']:
result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
else:
result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in
range(width)]) + ')'
result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
result[data_type] = f'float{bits[data_type]}x{width}_t'
result['int'] = f'int{bits["int"]}x{intwidth}_t'
result['bool'] = f'uint{bits[data_type]}x{width}_t'
result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
# SVE has real nontemporal stores, so we only need to zero cachlines on Neon
result['cachelineZero'] = 'cachelineZero((void*) {0})'
result['cachelineSize'] = 'cachelineSize()'
return result
...@@ -7,13 +7,17 @@ import numpy as np ...@@ -7,13 +7,17 @@ import numpy as np
import sympy as sp import sympy as sp
from sympy.core import S from sympy.core import S
from sympy.logic.boolalg import BooleanFalse, BooleanTrue from sympy.logic.boolalg import BooleanFalse, BooleanTrue
from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
from pystencils.astnodes import KernelFunction, Node, CachelineSize from sympy.functions.elementary.hyperbolic import HyperbolicFunction
from pystencils.cpu.vectorization import vec_all, vec_any
from pystencils.data_types import ( from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression, from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol) from pystencils.typing import (
PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol, CFunction)
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.functions import DivFunc, AddressOf
from pystencils.integer_functions import ( from pystencils.integer_functions import (
bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor, bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
int_div, int_power_of_2, modulo_ceil) int_div, int_power_of_2, modulo_ceil)
...@@ -28,12 +32,10 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy ...@@ -28,12 +32,10 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy
HEADER_REGEX = re.compile(r'^[<"].*[">]$') HEADER_REGEX = re.compile(r'^[<"].*[">]$')
KERNCRAFT_NO_TERNARY_MODE = False
def generate_c(ast_node: Node, def generate_c(ast_node: Node,
signature_only: bool = False, signature_only: bool = False,
dialect='c', dialect: Backend = Backend.C,
custom_backend=None, custom_backend=None,
with_globals=True) -> str: with_globals=True) -> str:
"""Prints an abstract syntax tree node as C or CUDA code. """Prints an abstract syntax tree node as C or CUDA code.
...@@ -45,7 +47,7 @@ def generate_c(ast_node: Node, ...@@ -45,7 +47,7 @@ def generate_c(ast_node: Node,
Args: Args:
ast_node: ast representation of kernel ast_node: ast representation of kernel
signature_only: generate signature without function body signature_only: generate signature without function body
dialect: 'c', 'cuda' or opencl dialect: `Backend`: 'C' or 'CUDA'
custom_backend: use own custom printer for code generation custom_backend: use own custom printer for code generation
with_globals: enable usage of global variables with_globals: enable usage of global variables
Returns: Returns:
...@@ -59,21 +61,19 @@ def generate_c(ast_node: Node, ...@@ -59,21 +61,19 @@ def generate_c(ast_node: Node,
ast_node.global_variables = d.symbols_defined ast_node.global_variables = d.symbols_defined
if custom_backend: if custom_backend:
printer = custom_backend printer = custom_backend
elif dialect == 'c': elif dialect == Backend.C:
try: try:
# TODO Vectorization Revamp: instruction_set should not be just slapped on ast
instruction_set = ast_node.instruction_set instruction_set = ast_node.instruction_set
except Exception: except Exception:
instruction_set = None instruction_set = None
printer = CBackend(signature_only=signature_only, printer = CBackend(signature_only=signature_only,
vector_instruction_set=instruction_set) vector_instruction_set=instruction_set)
elif dialect == 'cuda': elif dialect == Backend.CUDA:
from pystencils.backends.cuda_backend import CudaBackend from pystencils.backends.cuda_backend import CudaBackend
printer = CudaBackend(signature_only=signature_only) printer = CudaBackend(signature_only=signature_only)
elif dialect == 'opencl':
from pystencils.backends.opencl_backend import OpenClBackend
printer = OpenClBackend(signature_only=signature_only)
else: else:
raise ValueError("Unknown dialect: " + str(dialect)) raise ValueError(f'Unknown {dialect=}')
code = printer(ast_node) code = printer(ast_node)
if not signature_only and isinstance(ast_node, KernelFunction): if not signature_only and isinstance(ast_node, KernelFunction):
if with_globals and global_declarations: if with_globals and global_declarations:
...@@ -121,12 +121,12 @@ def get_headers(ast_node: Node) -> Set[str]: ...@@ -121,12 +121,12 @@ def get_headers(ast_node: Node) -> Set[str]:
for h in headers: for h in headers:
assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/' assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/'
return sorted(headers) return headers
# --------------------------------------- Backend Specific Nodes ------------------------------------------------------- # --------------------------------------- Backend Specific Nodes -------------------------------------------------------
# TODO future CustomCodeNode should not be backend specific move it elsewhere
class CustomCodeNode(Node): class CustomCodeNode(Node):
def __init__(self, code, symbols_read, symbols_defined, parent=None): def __init__(self, code, symbols_read, symbols_defined, parent=None):
super(CustomCodeNode, self).__init__(parent=parent) super(CustomCodeNode, self).__init__(parent=parent)
...@@ -150,8 +150,8 @@ class CustomCodeNode(Node): ...@@ -150,8 +150,8 @@ class CustomCodeNode(Node):
def undefined_symbols(self): def undefined_symbols(self):
return self._symbols_read - self._symbols_defined return self._symbols_read - self._symbols_defined
def __eq___(self, other): def __eq__(self, other):
return self._code == other._code return type(self) is type(other) and self._code == other._code
def __hash__(self): def __hash__(self):
return hash(self._code) return hash(self._code)
...@@ -171,7 +171,7 @@ class PrintNode(CustomCodeNode): ...@@ -171,7 +171,7 @@ class PrintNode(CustomCodeNode):
# noinspection PyPep8Naming # noinspection PyPep8Naming
class CBackend: class CBackend:
def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect='c'): def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect=Backend.C):
if sympy_printer is None: if sympy_printer is None:
if vector_instruction_set is not None: if vector_instruction_set is not None:
self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set) self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
...@@ -184,6 +184,8 @@ class CBackend: ...@@ -184,6 +184,8 @@ class CBackend:
self._indent = " " self._indent = " "
self._dialect = dialect self._dialect = dialect
self._signatureOnly = signature_only self._signatureOnly = signature_only
self._kwargs = {}
self.sympy_printer._kwargs = self._kwargs
def __call__(self, node): def __call__(self, node):
prev_is = VectorType.instruction_set prev_is = VectorType.instruction_set
...@@ -196,18 +198,19 @@ class CBackend: ...@@ -196,18 +198,19 @@ class CBackend:
if isinstance(node, str): if isinstance(node, str):
return node return node
for cls in type(node).__mro__: for cls in type(node).__mro__:
method_name = "_print_" + cls.__name__ method_name = f"_print_{cls.__name__}"
if hasattr(self, method_name): if hasattr(self, method_name):
return getattr(self, method_name)(node) return getattr(self, method_name)(node)
raise NotImplementedError(self.__class__.__name__ + " does not support node of type " + node.__class__.__name__) raise NotImplementedError(f"{self.__class__.__name__} does not support node of type {node.__class__.__name__}")
def _print_Type(self, node): def _print_AbstractType(self, node):
return str(node) return str(node)
def _print_KernelFunction(self, node): def _print_KernelFunction(self, node):
function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()] function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()
if not type(s.symbol) is CFunction]
launch_bounds = "" launch_bounds = ""
if self._dialect == 'cuda': if self._dialect == Backend.CUDA:
max_threads = node.indexing.max_threads_per_block() max_threads = node.indexing.max_threads_per_block()
if max_threads: if max_threads:
launch_bounds = f"__launch_bounds__({max_threads}) " launch_bounds = f"__launch_bounds__({max_threads}) "
...@@ -227,11 +230,14 @@ class CBackend: ...@@ -227,11 +230,14 @@ class CBackend:
return f"{node.pragma_line}\n{self._print_Block(node)}" return f"{node.pragma_line}\n{self._print_Block(node)}"
def _print_LoopOverCoordinate(self, node): def _print_LoopOverCoordinate(self, node):
counter_symbol = node.loop_counter_name counter_name = node.loop_counter_name
start = f"int64_t {counter_symbol} = {self.sympy_printer.doprint(node.start)}" counter_dtype = node.loop_counter_symbol.dtype.c_name
condition = f"{counter_symbol} < {self.sympy_printer.doprint(node.stop)}" start = f"{counter_dtype} {counter_name} = {self.sympy_printer.doprint(node.start)}"
update = f"{counter_symbol} += {self.sympy_printer.doprint(node.step)}" condition = f"{counter_name} < {self.sympy_printer.doprint(node.stop)}"
update = f"{counter_name} += {self.sympy_printer.doprint(node.step)}"
loop_str = f"for ({start}; {condition}; {update})" loop_str = f"for ({start}; {condition}; {update})"
self._kwargs['loop_counter'] = counter_name
self._kwargs['loop_stop'] = node.stop
prefix = "\n".join(node.prefix_lines) prefix = "\n".join(node.prefix_lines)
if prefix: if prefix:
...@@ -239,63 +245,113 @@ class CBackend: ...@@ -239,63 +245,113 @@ class CBackend:
return f"{prefix}{loop_str}\n{self._print(node.body)}" return f"{prefix}{loop_str}\n{self._print(node.body)}"
def _print_SympyAssignment(self, node): def _print_SympyAssignment(self, node):
printed_lhs = self.sympy_printer.doprint(node.lhs)
printed_rhs = self.sympy_printer.doprint(node.rhs)
if node.is_declaration: if node.is_declaration:
if node.use_auto: if node.use_auto:
data_type = 'auto ' data_type = 'auto'
else: else:
data_type = self._print(node.lhs.dtype).replace(' const', '')
if node.is_const: if node.is_const:
prefix = 'const ' data_type = f'const {data_type}'
else: return f"{data_type} {printed_lhs} = {printed_rhs};"
prefix = ''
data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "
return "%s%s = %s;" % (data_type,
self.sympy_printer.doprint(node.lhs),
self.sympy_printer.doprint(node.rhs))
else: else:
lhs_type = get_type_of_expression(node.lhs) lhs_type = get_type_of_expression(node.lhs) # TOOD: this should have been typed
printed_mask = "" printed_mask = ""
if type(lhs_type) is VectorType and isinstance(node.lhs, cast_func): if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
arg, data_type, aligned, nontemporal, mask = node.lhs.args arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
instr = 'storeU' instr = 'storeU'
if aligned: if nontemporal and 'storeA' not in self._vector_instruction_set and \
'stream' in self._vector_instruction_set:
instr = 'stream'
elif aligned:
instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA' instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
if mask != True: # NOQA if mask != True: # NOQA
instr = 'maskStore' if aligned else 'maskStoreU' instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
'maskStoreA' if aligned else 'maskStoreU'
if instr not in self._vector_instruction_set:
if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
store, load = 'stream', 'loadA'
elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
store, load = 'storeA', 'loadA'
else:
store, load = 'storeU', 'loadU'
load = load if load in self._vector_instruction_set else 'loadU'
self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
'{0}', self._vector_instruction_set['blendv'].format(
self._vector_instruction_set[load].format('{0}', **self._kwargs),
'{1}', '{2}', **self._kwargs), **self._kwargs)
printed_mask = self.sympy_printer.doprint(mask) printed_mask = self.sympy_printer.doprint(mask)
if self._vector_instruction_set['dataTypePrefix']['double'] == '__mm256d': if data_type.base_type.c_name == 'double':
printed_mask = f"_mm256_castpd_si256({printed_mask})" if self._vector_instruction_set['double'] == '__m256d':
printed_mask = f"_mm256_castpd_si256({printed_mask})"
elif self._vector_instruction_set['double'] == '__m128d':
printed_mask = f"_mm_castpd_si128({printed_mask})"
elif data_type.base_type.c_name == 'float':
if self._vector_instruction_set['float'] == '__m256':
printed_mask = f"_mm256_castps_si256({printed_mask})"
elif self._vector_instruction_set['float'] == '__m128':
printed_mask = f"_mm_castps_si128({printed_mask})"
rhs_type = get_type_of_expression(node.rhs) rhs_type = get_type_of_expression(node.rhs)
if type(rhs_type) is not VectorType: if type(rhs_type) is not VectorType:
rhs = cast_func(node.rhs, VectorType(rhs_type)) raise ValueError(f'Cannot vectorize {node.rhs} of type {rhs_type} inside of the pretty printer! '
f'This should have happen earlier!')
# rhs = CastFunc(node.rhs, VectorType(rhs_type)) # Unknown width
else: else:
rhs = node.rhs rhs = node.rhs
ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0]) ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
if stride != 1:
instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
'maskStoreS') if mask != True else \
('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS') # NOQA
return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
stride, printed_mask, **self._kwargs) + ';'
pre_code = '' pre_code = ''
if nontemporal and 'cachelineZero' in self._vector_instruction_set: if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True: # NOQA
pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "{\n\t" + \ first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n}\n' offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
* node.lhs.args[0].field.spatial_strides[i] for i in
range(len(node.lhs.args[0].field.spatial_strides))])
if stride == 1:
offset = offset.subs({node.lhs.args[0].field.spatial_strides[0]: 1})
size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
element_size = 8 if data_type.base_type.c_name == 'double' else 4
size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n'
code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
printed_mask) + ';' printed_mask, **self._kwargs) + ';'
flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}" flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}"
if nontemporal and 'flushCacheline' in self._vector_instruction_set: if nontemporal and 'flushCacheline' in self._vector_instruction_set:
code2 = self._vector_instruction_set['flushCacheline'].format( code2 = self._vector_instruction_set['flushCacheline'].format(
ptr, self.sympy_printer.doprint(rhs)) + ';' ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}" code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8] lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \ code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
+ self.sympy_printer.doprint(rhs) + ';' + self.sympy_printer.doprint(rhs) + ';'
code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask) + ';' code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask) \ maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
+ ';' instr2 = maskStore if mask != True else store # NOQA
if instr2 not in self._vector_instruction_set:
self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
'{0}', self._vector_instruction_set['blendv'].format(
self._vector_instruction_set[load].format('{0}', **self._kwargs),
'{1}', '{2}', **self._kwargs),
**self._kwargs)
code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}" code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
return pre_code + code return pre_code + code
else: else:
return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};" return f"{printed_lhs} = {printed_rhs};"
def _print_NontemporalFence(self, _): def _print_NontemporalFence(self, _):
if 'streamFence' in self._vector_instruction_set: if 'streamFence' in self._vector_instruction_set:
...@@ -387,23 +443,31 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -387,23 +443,31 @@ class CustomSympyPrinter(CCodePrinter):
def __init__(self): def __init__(self):
super(CustomSympyPrinter, self).__init__() super(CustomSympyPrinter, self).__init__()
self._float_type = create_type("float32")
def _print_Pow(self, expr): def _print_Pow(self, expr):
"""Don't use std::pow function, for small integer exponents, write as multiplication""" """Don't use std::pow function, for small integer exponents, write as multiplication"""
if not expr.free_symbols: # Ideally the printer has as little logic as possible. Therefore,
return self._typed_number(expr.evalf(), get_type_of_expression(expr)) # powers should be rewritten as `DivFunc`s / unevaluated `Mul`s before
# printing. `NodeCollection` offers a convenience function to do just
if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8: # that. However, `cut_loops` rewrites unevaluated multiplications as
return f"({self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False))})" # `Pow`s again. Neither `deepcopy` nor `func(*args)` are suited to
elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0: # rebuild unevaluated expressions. Therefore, as long as we stick with
return f"1 / ({self._print(sp.Mul(*([expr.base] * -expr.exp), evaluate=False))})" # SymPy, this is the only way to avoid printing `pow`s.
exp = expr.exp.expr if isinstance(expr.exp, CastFunc) else expr.exp
one_type = expr.base.dtype if hasattr(expr.base, "dtype") else get_type_of_expression(expr.base)
if exp.is_integer and exp.is_number and (0 < exp <= 8):
return f"({self._print(sp.Mul(*[expr.base] * exp, evaluate=False))})"
elif exp.is_integer and exp.is_number and (-8 <= exp < 0):
return f"{self._typed_number(1, one_type)} / ({self._print(sp.Mul(*([expr.base] * -exp), evaluate=False))})"
else: else:
return super(CustomSympyPrinter, self)._print_Pow(expr) return super(CustomSympyPrinter, self)._print_Pow(expr)
# TODO don't print ones in sp.Mul
def _print_Rational(self, expr): def _print_Rational(self, expr):
"""Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0""" """Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0"""
res = str(expr.evalf().num) res = str(expr.evalf(17))
return res return res
def _print_Equality(self, expr): def _print_Equality(self, expr):
...@@ -421,7 +485,7 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -421,7 +485,7 @@ class CustomSympyPrinter(CCodePrinter):
else: else:
return f'fabs({self._print(expr.args[0])})' return f'fabs({self._print(expr.args[0])})'
def _print_Type(self, node): def _print_AbstractType(self, node):
return str(node) return str(node)
def _print_Function(self, expr): def _print_Function(self, expr):
...@@ -434,30 +498,48 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -434,30 +498,48 @@ class CustomSympyPrinter(CCodePrinter):
} }
if hasattr(expr, 'to_c'): if hasattr(expr, 'to_c'):
return expr.to_c(self._print) return expr.to_c(self._print)
if isinstance(expr, reinterpret_cast_func): if isinstance(expr, ReinterpretCastFunc):
arg, data_type = expr.args arg, data_type = expr.args
return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))" if isinstance(data_type, PointerType):
elif isinstance(expr, address_of): const_str = "const" if data_type.const else ""
return f"(({const_str} {self._print(data_type.base_type)} *)(& {self._print(arg)}))"
else:
return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
elif isinstance(expr, AddressOf):
assert len(expr.args) == 1, "address_of must only have one argument" assert len(expr.args) == 1, "address_of must only have one argument"
return f"&({self._print(expr.args[0])})" return f"&({self._print(expr.args[0])})"
elif isinstance(expr, cast_func): elif isinstance(expr, CastFunc):
cast = "(({data_type})({code}))"
arg, data_type = expr.args arg, data_type = expr.args
if isinstance(arg, sp.Number) and arg.is_finite: if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
return self._typed_number(arg, data_type) return self._typed_number(arg, data_type)
elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
and data_type == BasicType('float32'):
known = self.known_functions[arg.__class__.__name__.lower()]
code = self._print(arg)
return code.replace(known, f"{known}f")
elif isinstance(arg, (sp.Pow, sp.exp)) and data_type == BasicType('float32'):
known = ['sqrt', 'cbrt', 'pow', 'exp']
code = self._print(arg)
for k in known:
if k in code:
return code.replace(k, f'{k}f')
# Powers of small integers are printed as divisions/multiplications.
if '/' in code or '*' in code:
return cast.format(data_type=data_type, code=code)
raise ValueError(f"{code} doesn't give {known=} function back.")
else: else:
return f"(({data_type})({self._print(arg)}))" return cast.format(data_type=data_type, code=self._print(arg))
elif isinstance(expr, fast_division): elif isinstance(expr, fast_division):
return f"({self._print(expr.args[0] / expr.args[1])})" raise ValueError("fast_division is only supported for Taget.GPU")
elif isinstance(expr, fast_sqrt): elif isinstance(expr, fast_sqrt):
return f"({self._print(sp.sqrt(expr.args[0]))})" raise ValueError("fast_sqrt is only supported for Taget.GPU")
elif isinstance(expr, fast_inv_sqrt):
raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
elif isinstance(expr, vec_any) or isinstance(expr, vec_all): elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
return self._print(expr.args[0]) return self._print(expr.args[0])
elif isinstance(expr, fast_inv_sqrt):
return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
elif isinstance(expr, sp.Abs): elif isinstance(expr, sp.Abs):
return f"abs({self._print(expr.args[0])})" return f"abs({self._print(expr.args[0])})"
elif isinstance(expr, sp.Max):
return self._print(expr)
elif isinstance(expr, sp.Mod): elif isinstance(expr, sp.Mod):
if expr.args[0].is_integer and expr.args[1].is_integer: if expr.args[0].is_integer and expr.args[1].is_integer:
return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})" return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})"
...@@ -469,6 +551,8 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -469,6 +551,8 @@ class CustomSympyPrinter(CCodePrinter):
return f"(1 << ({self._print(expr.args[0])}))" return f"(1 << ({self._print(expr.args[0])}))"
elif expr.func == int_div: elif expr.func == int_div:
return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))" return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))"
elif expr.func == DivFunc:
return f'(({self._print(expr.divisor)}) / ({self._print(expr.dividend)}))'
else: else:
name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__ name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__
arg_str = ', '.join(self._print(a) for a in expr.args) arg_str = ', '.join(self._print(a) for a in expr.args)
...@@ -480,55 +564,17 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -480,55 +564,17 @@ class CustomSympyPrinter(CCodePrinter):
return res + '.0f' if '.' not in res else res + 'f' return res + '.0f' if '.' not in res else res + 'f'
elif dtype.numpy_dtype == np.float64: elif dtype.numpy_dtype == np.float64:
return res + '.0' if '.' not in res else res return res + '.0' if '.' not in res else res
elif dtype.is_int():
tokens = res.split('.')
if len(tokens) == 1:
return res
elif int(tokens[1]) != 0:
raise ValueError(f"Cannot print non-integer number {res} as an integer.")
else:
return tokens[0]
else: else:
return res return res
def _print_Sum(self, expr):
template = """[&]() {{
{dtype} sum = ({dtype}) 0;
for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
sum += {expr};
}}
return sum;
}}()"""
var = expr.limits[0][0]
start = expr.limits[0][1]
end = expr.limits[0][2]
code = template.format(
dtype=get_type_of_expression(expr.args[0]),
iterator_dtype='int',
var=self._print(var),
start=self._print(start),
end=self._print(end),
expr=self._print(expr.function),
increment=str(1),
condition=self._print(var) + ' <= ' + self._print(end) # if start < end else '>='
)
return code
def _print_Product(self, expr):
template = """[&]() {{
{dtype} product = ({dtype}) 1;
for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
product *= {expr};
}}
return product;
}}()"""
var = expr.limits[0][0]
start = expr.limits[0][1]
end = expr.limits[0][2]
code = template.format(
dtype=get_type_of_expression(expr.args[0]),
iterator_dtype='int',
var=self._print(var),
start=self._print(start),
end=self._print(end),
expr=self._print(expr.function),
increment=str(1),
condition=self._print(var) + ' <= ' + self._print(end) # if start < end else '>='
)
return code
def _print_ConditionalFieldAccess(self, node): def _print_ConditionalFieldAccess(self, node):
return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True))) return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True)))
...@@ -552,27 +598,6 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -552,27 +598,6 @@ class CustomSympyPrinter(CCodePrinter):
return f"(({a} < {b}) ? {a} : {b})" return f"(({a} < {b}) ? {a} : {b})"
return inner_print_min(expr.args) return inner_print_min(expr.args)
def _print_re(self, expr):
return f"real({self._print(expr.args[0])})"
def _print_im(self, expr):
return f"imag({self._print(expr.args[0])})"
def _print_ImaginaryUnit(self, expr):
return "complex<double>{0,1}"
def _print_TypedImaginaryUnit(self, expr):
if expr.dtype.numpy_dtype == np.complex64:
return "complex<float>{0,1}"
elif expr.dtype.numpy_dtype == np.complex128:
return "complex<double>{0,1}"
else:
raise NotImplementedError(
"only complex64 and complex128 supported")
def _print_Complex(self, expr):
return self._typed_number(expr, np.complex64)
# noinspection PyPep8Naming # noinspection PyPep8Naming
class VectorizedCustomSympyPrinter(CustomSympyPrinter): class VectorizedCustomSympyPrinter(CustomSympyPrinter):
...@@ -591,47 +616,100 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -591,47 +616,100 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
return None return None
def _print_Abs(self, expr): def _print_Abs(self, expr):
if 'abs' in self.instruction_set and isinstance(expr.args[0], vector_memory_access): if isinstance(get_type_of_expression(expr), (VectorType, VectorMemoryAccess)):
return self.instruction_set['abs'].format(self._print(expr.args[0])) return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs)
return super()._print_Abs(expr) return super()._print_Abs(expr)
def _typed_vectorized_number(self, expr, data_type):
basic_data_type = data_type.base_type
number = self._typed_number(expr, basic_data_type)
instruction = 'makeVecConst'
if basic_data_type.is_bool():
instruction = 'makeVecConstBool'
# TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
elif basic_data_type.is_int():
instruction = 'makeVecConstInt'
return self.instruction_set[instruction].format(number, **self._kwargs)
def _typed_vectorized_symbol(self, expr, data_type):
if not isinstance(expr, TypedSymbol):
raise ValueError(f'{expr} is not a TypeSymbol. It is {expr.type=}')
basic_data_type = data_type.base_type
symbol = self._print(expr)
if basic_data_type != expr.dtype:
symbol = f'(({basic_data_type})({symbol}))'
instruction = 'makeVecConst'
if basic_data_type.is_bool():
instruction = 'makeVecConstBool'
# TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
elif basic_data_type.is_int():
instruction = 'makeVecConstInt'
return self.instruction_set[instruction].format(symbol, **self._kwargs)
def _print_CastFunc(self, expr):
arg, data_type = expr.args
if type(data_type) is VectorType:
base_type = data_type.base_type
# vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
assert not isinstance(arg, VectorMemoryAccess)
if isinstance(arg, sp.Tuple):
is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
is_integer = get_type_of_expression(arg[0]) == create_type("int")
printed_args = [self._print(a) for a in arg]
instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
increments = np.array(arg)[1:] - np.array(arg)[:-1]
if len(set(increments)) == 1:
return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
**self._kwargs)
return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
else:
if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
return self._typed_vectorized_number(arg, data_type)
elif isinstance(arg, TypedSymbol):
return self._typed_vectorized_symbol(arg, data_type)
elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
and base_type == BasicType('float32'):
raise NotImplementedError('Vectorizer is not tested for trigonometric functions yet')
# known = self.known_functions[arg.__class__.__name__.lower()]
# code = self._print(arg)
# return code.replace(known, f"{known}f")
elif isinstance(arg, sp.Pow):
if base_type == BasicType('float32') or base_type == BasicType('float64'):
return self._print_Pow(arg)
else:
raise NotImplementedError('Integer Pow is not implemented')
elif isinstance(arg, sp.UnevaluatedExpr):
return self._print(arg.args[0])
else:
raise NotImplementedError('Vectorizer cannot cast between different datatypes')
# to_type = self.instruction_set['suffix'][data_type.base_type.c_name]
# from_type = self.instruction_set['suffix'][get_type_of_expression(arg).base_type.c_name]
# return self.instruction_set['cast'].format(from_type, to_type, self._print(arg))
else:
return self._scalarFallback('_print_Function', expr)
# raise ValueError(f'Non VectorType cast "{data_type}" in vectorized code.')
def _print_Function(self, expr): def _print_Function(self, expr):
if isinstance(expr, vector_memory_access): if isinstance(expr, VectorMemoryAccess):
arg, data_type, aligned, _, mask = expr.args arg, data_type, aligned, _, mask, stride = expr.args
if stride != 1:
return self.instruction_set['loadS'].format(f"& {self._print(arg)}", stride, **self._kwargs)
instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU'] instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU']
return instruction.format("& " + self._print(arg)) return instruction.format(f"& {self._print(arg)}", **self._kwargs)
elif isinstance(expr, cast_func): elif expr.func == DivFunc:
arg, data_type = expr.args
if type(data_type) is VectorType:
# vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
assert not isinstance(arg, vector_memory_access)
if isinstance(arg, sp.Tuple):
is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
is_integer = get_type_of_expression(arg[0]) == create_type("int")
printed_args = [self._print(a) for a in arg]
instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
return self.instruction_set[instruction].format(*printed_args)
else:
is_boolean = get_type_of_expression(arg) == create_type("bool")
is_integer = get_type_of_expression(arg) == create_type("int") or \
(isinstance(arg, TypedSymbol) and arg.dtype.is_int())
instruction = 'makeVecConstBool' if is_boolean else \
'makeVecConstInt' if is_integer else 'makeVecConst'
return self.instruction_set[instruction].format(self._print(arg))
elif expr.func == fast_division:
result = self._scalarFallback('_print_Function', expr) result = self._scalarFallback('_print_Function', expr)
if not result: if not result:
result = self.instruction_set['/'].format(self._print(expr.args[0]), self._print(expr.args[1])) result = self.instruction_set['/'].format(self._print(expr.divisor), self._print(expr.dividend),
**self._kwargs)
return result return result
elif expr.func == fast_sqrt: elif isinstance(expr, fast_division):
return f"({self._print(sp.sqrt(expr.args[0]))})" raise ValueError("fast_division is only supported for Taget.GPU")
elif expr.func == fast_inv_sqrt: elif isinstance(expr, fast_sqrt):
result = self._scalarFallback('_print_Function', expr) raise ValueError("fast_sqrt is only supported for Taget.GPU")
if not result: elif isinstance(expr, fast_inv_sqrt):
if self.instruction_set['rsqrt']: raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
return self.instruction_set['rsqrt'].format(self._print(expr.args[0]))
else:
return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
elif isinstance(expr, vec_any) or isinstance(expr, vec_all): elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
instr = 'any' if isinstance(expr, vec_any) else 'all' instr = 'any' if isinstance(expr, vec_any) else 'all'
expr_type = get_type_of_expression(expr.args[0]) expr_type = get_type_of_expression(expr.args[0])
...@@ -641,8 +719,9 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -641,8 +719,9 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
if isinstance(expr.args[0], sp.Rel): if isinstance(expr.args[0], sp.Rel):
op = expr.args[0].rel_op op = expr.args[0].rel_op
if (instr, op) in self.instruction_set: if (instr, op) in self.instruction_set:
return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args]) return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args],
return self.instruction_set[instr].format(self._print(expr.args[0])) **self._kwargs)
return self.instruction_set[instr].format(self._print(expr.args[0]), **self._kwargs)
return super(VectorizedCustomSympyPrinter, self)._print_Function(expr) return super(VectorizedCustomSympyPrinter, self)._print_Function(expr)
...@@ -655,7 +734,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -655,7 +734,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
assert len(arg_strings) > 0 assert len(arg_strings) > 0
result = arg_strings[0] result = arg_strings[0]
for item in arg_strings[1:]: for item in arg_strings[1:]:
result = self.instruction_set['&'].format(result, item) result = self.instruction_set['&'].format(result, item, **self._kwargs)
return result return result
def _print_Or(self, expr): def _print_Or(self, expr):
...@@ -667,7 +746,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -667,7 +746,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
assert len(arg_strings) > 0 assert len(arg_strings) > 0
result = arg_strings[0] result = arg_strings[0]
for item in arg_strings[1:]: for item in arg_strings[1:]:
result = self.instruction_set['|'].format(result, item) result = self.instruction_set['|'].format(result, item, **self._kwargs)
return result return result
def _print_Add(self, expr, order=None): def _print_Add(self, expr, order=None):
...@@ -681,12 +760,12 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -681,12 +760,12 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
# special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization # special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
suffix = "" suffix = ""
if all([(type(e) is cast_func and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer) if all([(type(e) is CastFunc and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]): or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
dtype = set([e.dtype for e in args if type(e) is cast_func]) dtype = set([e.dtype for e in args if type(e) is CastFunc])
assert len(dtype) == 1 assert len(dtype) == 1
dtype = dtype.pop() dtype = dtype.pop()
args = [cast_func(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e args = [CastFunc(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
for e in args] for e in args]
suffix = "int" suffix = "int"
...@@ -708,29 +787,35 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -708,29 +787,35 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
processed = summands[0].term processed = summands[0].term
for summand in summands[1:]: for summand in summands[1:]:
func = self.instruction_set['-' + suffix] if summand.sign == -1 else self.instruction_set['+' + suffix] func = self.instruction_set['-' + suffix] if summand.sign == -1 else self.instruction_set['+' + suffix]
processed = func.format(processed, summand.term) processed = func.format(processed, summand.term, **self._kwargs)
return processed return processed
def _print_Pow(self, expr): def _print_Pow(self, expr):
result = self._scalarFallback('_print_Pow', expr) # Due to loop cutting sp.Mul is evaluated again.
try:
result = self._scalarFallback('_print_Pow', expr)
except ValueError:
result = None
if result: if result:
return result return result
one = self.instruction_set['makeVecConst'].format(1.0) one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs)
root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
return "(" + self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")" if isinstance(expr.exp, CastFunc) and expr.exp.args[0].is_number:
elif expr.exp == -1: exp = expr.exp.args[0]
one = self.instruction_set['makeVecConst'].format(1.0) else:
return self.instruction_set['/'].format(one, self._print(expr.base)) exp = expr.exp
elif expr.exp == 0.5:
return self.instruction_set['sqrt'].format(self._print(expr.base)) # TODO the printer should not have any intelligence like this.
elif expr.exp == -0.5: # TODO To remove all of these cases the vectoriser needs to be reworked. See loop cutting
root = self.instruction_set['sqrt'].format(self._print(expr.base)) if exp.is_integer and exp.is_number and 0 < exp < 8:
return self.instruction_set['/'].format(one, root) return self._print(sp.Mul(*[expr.base] * exp, evaluate=False))
elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0: elif exp == 0.5:
return self.instruction_set['/'].format(one, return root
self._print(sp.Mul(*[expr.base] * (-expr.exp), evaluate=False))) elif exp == -0.5:
return self.instruction_set['/'].format(one, root, **self._kwargs)
else: else:
raise ValueError("Generic exponential not supported: " + str(expr)) raise ValueError("Generic exponential not supported: " + str(expr))
...@@ -738,7 +823,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -738,7 +823,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
# noinspection PyProtectedMember # noinspection PyProtectedMember
from sympy.core.mul import _keep_coeff from sympy.core.mul import _keep_coeff
result = self._scalarFallback('_print_Mul', expr) if not inside_add:
result = self._scalarFallback('_print_Mul', expr)
else:
result = None
if result: if result:
return result return result
...@@ -769,19 +857,19 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -769,19 +857,19 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
result = a_str[0] result = a_str[0]
for item in a_str[1:]: for item in a_str[1:]:
result = self.instruction_set['*'].format(result, item) result = self.instruction_set['*'].format(result, item, **self._kwargs)
if len(b) > 0: if len(b) > 0:
denominator_str = b_str[0] denominator_str = b_str[0]
for item in b_str[1:]: for item in b_str[1:]:
denominator_str = self.instruction_set['*'].format(denominator_str, item) denominator_str = self.instruction_set['*'].format(denominator_str, item, **self._kwargs)
result = self.instruction_set['/'].format(result, denominator_str) result = self.instruction_set['/'].format(result, denominator_str, **self._kwargs)
if inside_add: if inside_add:
return sign, result return sign, result
else: else:
if sign < 0: if sign < 0:
return self.instruction_set['*'].format(self._print(S.NegativeOne), result) return self.instruction_set['*'].format(self._print(S.NegativeOne), result, **self._kwargs)
else: else:
return result return result
...@@ -789,13 +877,13 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -789,13 +877,13 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
result = self._scalarFallback('_print_Relational', expr) result = self._scalarFallback('_print_Relational', expr)
if result: if result:
return result return result
return self.instruction_set[expr.rel_op].format(self._print(expr.lhs), self._print(expr.rhs)) return self.instruction_set[expr.rel_op].format(self._print(expr.lhs), self._print(expr.rhs), **self._kwargs)
def _print_Equality(self, expr): def _print_Equality(self, expr):
result = self._scalarFallback('_print_Equality', expr) result = self._scalarFallback('_print_Equality', expr)
if result: if result:
return result return result
return self.instruction_set['=='].format(self._print(expr.lhs), self._print(expr.rhs)) return self.instruction_set['=='].format(self._print(expr.lhs), self._print(expr.rhs), **self._kwargs)
def _print_Piecewise(self, expr): def _print_Piecewise(self, expr):
result = self._scalarFallback('_print_Piecewise', expr) result = self._scalarFallback('_print_Piecewise', expr)
...@@ -813,13 +901,11 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -813,13 +901,11 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
result = self._print(expr.args[-1][0]) result = self._print(expr.args[-1][0])
for true_expr, condition in reversed(expr.args[:-1]): for true_expr, condition in reversed(expr.args[:-1]):
if isinstance(condition, cast_func) and get_type_of_expression(condition.args[0]) == create_type("bool"): if isinstance(condition, CastFunc) and get_type_of_expression(condition.args[0]) == create_type("bool"):
if not KERNCRAFT_NO_TERNARY_MODE: result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr), result, **self._kwargs)
result)
else:
print("Warning - skipping ternary op")
else: else:
# noinspection SpellCheckingInspection # noinspection SpellCheckingInspection
result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition)) result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition),
**self._kwargs)
return result return result
from os.path import dirname, join
from pystencils.astnodes import Node from pystencils.astnodes import Node
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.interpolation_astnodes import DiffInterpolatorAccess, InterpolationMode
with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
lines = f.readlines()
CUDA_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l}
def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str: def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
...@@ -22,7 +16,7 @@ def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=N ...@@ -22,7 +16,7 @@ def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=N
Returns: Returns:
CUDA code for the ast node and its descendants CUDA code for the ast node and its descendants
""" """
return generate_c(ast_node, signature_only, dialect='cuda', return generate_c(ast_node, signature_only, dialect=Backend.CUDA,
custom_backend=custom_backend, with_globals=with_globals) custom_backend=custom_backend, with_globals=with_globals)
...@@ -33,7 +27,7 @@ class CudaBackend(CBackend): ...@@ -33,7 +27,7 @@ class CudaBackend(CBackend):
if not sympy_printer: if not sympy_printer:
sympy_printer = CudaSympyPrinter() sympy_printer = CudaSympyPrinter()
super().__init__(sympy_printer, signature_only, dialect='cuda') super().__init__(sympy_printer, signature_only, dialect=Backend.CUDA)
def _print_SharedMemoryAllocation(self, node): def _print_SharedMemoryAllocation(self, node):
dtype = node.symbol.dtype dtype = node.symbol.dtype
...@@ -43,26 +37,13 @@ class CudaBackend(CBackend): ...@@ -43,26 +37,13 @@ class CudaBackend(CBackend):
return code return code
@staticmethod @staticmethod
def _print_ThreadBlockSynchronization(node): def _print_ThreadBlockSynchronization(_):
code = "__synchtreads();" return "__synchtreads();"
return code
def _print_TextureDeclaration(self, node): def _print_TextureDeclaration(self, node):
cond = node.texture.field.dtype.numpy_dtype.itemsize > 4
# TODO: use fStrings here return f'texture<{"fp_tex_" if cond else ""}{str(node.texture.field.dtype)}, ' \
if node.texture.field.dtype.numpy_dtype.itemsize > 4: f'cudaTextureType{node.texture.field.spacial_dimensions}D, cudaReadModeElementType> {node.texture};'
code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
str(node.texture.field.dtype),
node.texture.field.spatial_dimensions,
node.texture
)
else:
code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
str(node.texture.field.dtype),
node.texture.field.spatial_dimensions,
node.texture
)
return code
def _print_SkipIteration(self, _): def _print_SkipIteration(self, _):
return "return;" return "return;"
...@@ -73,31 +54,6 @@ class CudaSympyPrinter(CustomSympyPrinter): ...@@ -73,31 +54,6 @@ class CudaSympyPrinter(CustomSympyPrinter):
def __init__(self): def __init__(self):
super(CudaSympyPrinter, self).__init__() super(CudaSympyPrinter, self).__init__()
self.known_functions.update(CUDA_KNOWN_FUNCTIONS)
def _print_InterpolatorAccess(self, node):
dtype = node.interpolator.field.dtype.numpy_dtype
if type(node) == DiffInterpolatorAccess:
# cubicTex3D_1st_derivative_x(texture tex, float3 coord)
template = f"cubicTex%iD_1st_derivative_{list(reversed('xyz'[:node.ndim]))[node.diff_coordinate_idx]}(%s, %s)" # noqa
elif node.interpolator.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
template = "cubicTex%iDSimple(%s, %s)"
else:
if dtype.itemsize > 4:
# Use PyCuda hack!
# https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
template = "fp_tex%iD(%s, %s)"
else:
template = "tex%iD(%s, %s)"
code = template % (
node.interpolator.field.spatial_dimensions,
str(node.interpolator),
# + 0.5 comes from Nvidia's staggered indexing
', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
)
return code
def _print_Function(self, expr): def _print_Function(self, expr):
if isinstance(expr, fast_division): if isinstance(expr, fast_division):
......
import graphviz import graphviz
from graphviz import Digraph, lang try:
from graphviz import Digraph
import graphviz.quoting as quote
except ImportError:
from graphviz import Digraph
import graphviz.lang as quote
from sympy.printing.printer import Printer from sympy.printing.printer import Printer
...@@ -12,7 +17,7 @@ class DotPrinter(Printer): ...@@ -12,7 +17,7 @@ class DotPrinter(Printer):
super(DotPrinter, self).__init__() super(DotPrinter, self).__init__()
self._node_to_str_function = node_to_str_function self._node_to_str_function = node_to_str_function
self.dot = Digraph(**kwargs) self.dot = Digraph(**kwargs)
self.dot.quote_edge = lang.quote self.dot.quote_edge = quote.quote
def _print_KernelFunction(self, func): def _print_KernelFunction(self, func):
self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func)) self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))
......
...@@ -78,7 +78,6 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'): ...@@ -78,7 +78,6 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
# Clang and XL C++ are missing these for doubles # Clang and XL C++ are missing these for doubles
result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}') result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}') result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}', result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
'(__vector float) {1}') '(__vector float) {1}')
......
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, last=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
if last:
arg_string += last + ','
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
assert instruction_set == 'rvv'
bits = {'double': 64,
'float': 32,
'int': 32}
base_names = {
'+': 'fadd_vv[0, 1]',
'-': 'fsub_vv[0, 1]',
'*': 'fmul_vv[0, 1]',
'/': 'fdiv_vv[0, 1]',
'sqrt': 'fsqrt_v[0]',
'loadU': f'le{bits[data_type]}_v[0]',
'storeU': f'se{bits[data_type]}_v[0, 1]',
'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
'loadS': f'lse{bits[data_type]}_v[0, 1]',
'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
'abs': 'fabs_v[0]',
'==': 'mfeq_vv[0, 1]',
'!=': 'mfne_vv[0, 1]',
'<=': 'mfle_vv[0, 1]',
'<': 'mflt_vv[0, 1]',
'>=': 'mfge_vv[0, 1]',
'>': 'mfgt_vv[0, 1]',
'&': 'mand_mm[0, 1]',
'|': 'mor_mm[0, 1]',
'blendv': 'merge_vvm[2, 0, 1]',
'any': 'cpop_m[0]',
'all': 'cpop_m[0]',
}
result = dict()
width = f'vsetvlmax_e{bits[data_type]}m1()'
intwidth = 'vsetvlmax_e{bits["int"]}m1()'
result['bytes'] = 'vsetvlmax_e8m1()'
prefix = 'v'
suffix = f'_f{bits[data_type]}m1'
vl = '{loop_stop} - {loop_counter}'
int_vl = f'({vl})*{bits[data_type]//bits["int"]}'
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
if name.startswith('mf'):
suffix2 = suffix + f'_b{bits[data_type]}'
elif name.endswith('_mm') or name.endswith('_m'):
suffix2 = f'_b{bits[data_type]}'
elif intrinsic_id.startswith('mask'):
suffix2 = suffix + '_m'
else:
suffix2 = suffix
arg_string = get_argument_string(function_shortcut, last=vl)
result[intrinsic_id] = prefix + name + suffix2 + arg_string
result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int")
result['makeVecConst'] = f'vfmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
result['makeVecConstInt'] = f'vmv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
result['makeVecIndex'] = f'vmacc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
f'vid_v_i{bits["int"]}m1({int_vl}), {int_vl})'
result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
result['float'] = f'vfloat{bits["float"]}m1_t'
result['double'] = f'vfloat{bits["double"]}m1_t'
result['int'] = f'vint{bits["int"]}m1_t'
result['bool'] = f'vbool{bits[data_type]}_t'
result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
result['any'] += ' > 0x0'
result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
import os
import platform
from ctypes import CDLL, c_int, c_size_t, sizeof, byref
from warnings import warn
import numpy as np
from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
from pystencils.cache import memorycache
from pystencils.typing import numpy_name_to_c
def get_vector_instruction_set(data_type='double', instruction_set='avx'):
if data_type == 'float':
warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
f"releases")
data_type = 'float64'
type_name = numpy_name_to_c(np.dtype(data_type).name)
if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
return get_vector_instruction_set_arm(type_name, instruction_set)
elif instruction_set in ['vsx']:
return get_vector_instruction_set_ppc(type_name, instruction_set)
elif instruction_set in ['rvv']:
return get_vector_instruction_set_riscv(type_name, instruction_set)
else:
return get_vector_instruction_set_x86(type_name, instruction_set)
@memorycache
def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed."""
if 'PYSTENCILS_SIMD' in os.environ:
return os.environ['PYSTENCILS_SIMD'].split(',')
if platform.system() == 'Darwin' and platform.machine() == 'arm64':
result = ['neon']
libc = CDLL('/usr/lib/libc.dylib')
value = c_int(0)
size = c_size_t(sizeof(value))
status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
if status == 0 and value.value == 1:
result.insert(0, "sme")
return result
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
return ['neon']
elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap2 = libc.getauxval(26) # AT_HWCAP2
if hwcap & (1 << 22): # HWCAP_SVE
if hwcap2 & (1 << 1): # HWCAP2_SVE2
name = 'sve2'
else:
name = 'sve'
length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if length < 0:
raise OSError("SVE length query failed")
while length >= 128:
result.append(f"{name}{length}")
length //= 2
result.append(name)
if hwcap2 & (1 << 23): # HWCAP2_SME
result.insert(0, "sme") # prepend to list so it is not automatically chosen as best instruction set
return result
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX
elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
if not flags.isdisjoint(possible_avx512vl_flags):
result.append("avx512vl")
return result
else:
raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
(platform.system(), platform.machine()))
@memorycache
def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size."""
instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets:
return None
import pystencils as ps
from pystencils.astnodes import SympyAssignment
import numpy as np
from pystencils.cpu.vectorization import CachelineSize
arr = np.zeros((1, 1), dtype=np.float32)
f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
ass = [CachelineSize(), SympyAssignment(f.center, CachelineSize.symbol)]
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile()
kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
return int(arr[0, 0])
...@@ -51,35 +51,22 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -51,35 +51,22 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
'makeVecConstBool': 'set[]', 'makeVecConstBool': 'set[]',
'makeVecInt': 'set[]', 'makeVecInt': 'set[]',
'makeVecConstInt': 'set[]', 'makeVecConstInt': 'set[]',
'loadU': 'loadu[0]', 'loadU': 'loadu[0]',
'loadA': 'load[0]', 'loadA': 'load[0]',
'storeU': 'storeu[0,1]', 'storeU': 'storeu[0,1]',
'storeA': 'store[0,1]', 'storeA': 'store[0,1]',
'stream': 'stream[0,1]', 'stream': 'stream[0,1]',
'maskstore': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]', 'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
'maskload': 'mask_load[0, 2, 1]' if instruction_set == 'avx512' else 'maskload[0, 2, 1]' 'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
} }
if instruction_set == 'avx512':
base_names.update({
'maskStore': 'mask_store[0, 2, 1]',
'maskStoreU': 'mask_storeu[0, 2, 1]',
'maskLoad': 'mask_load[2, 1, 0]',
'maskLoadU': 'mask_loadu[2, 1, 0]'
})
if instruction_set == 'avx':
base_names.update({
'maskStore': 'maskstore[0, 2, 1]',
'maskStoreU': 'maskstore[0, 2, 1]',
'maskLoad': 'maskload[0, 1]',
'maskLoadU': 'maskloadu[0, 1]'
})
for comparison_op, constant in comparisons.items(): for comparison_op, constant in comparisons.items():
base_names[comparison_op] = f'cmp[0, 1, {constant}]' base_names[comparison_op] = f'cmp[0, 1, {constant}]'
headers = { headers = {
'avx512': ['<immintrin.h>'], 'avx512': ['<immintrin.h>'],
'avx512vl': ['<immintrin.h>'],
'avx': ['<immintrin.h>'], 'avx': ['<immintrin.h>'],
'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>', 'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
'<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>'] '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
...@@ -93,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -93,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
prefix = { prefix = {
'sse': '_mm', 'sse': '_mm',
'avx': '_mm256', 'avx': '_mm256',
'avx512vl': '_mm256',
'avx512': '_mm512', 'avx512': '_mm512',
} }
...@@ -103,11 +91,13 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -103,11 +91,13 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
("double", "avx"): 4, ("double", "avx"): 4,
("float", "avx"): 8, ("float", "avx"): 8,
("int", "avx"): 8, ("int", "avx"): 8,
("double", "avx512vl"): 4,
("float", "avx512vl"): 8,
("int", "avx512vl"): 8,
("double", "avx512"): 8, ("double", "avx512"): 8,
("float", "avx512"): 16, ("float", "avx512"): 16,
("int", "avx512"): 16, ("int", "avx512"): 16,
} }
result = { result = {
'width': width[(data_type, instruction_set)], 'width': width[(data_type, instruction_set)],
'intwidth': width[('int', instruction_set)], 'intwidth': width[('int', instruction_set)],
...@@ -125,15 +115,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -125,15 +115,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
suf = suffix[data_type] suf = suffix[data_type]
arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut) arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else '' mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
result['dataTypePrefix'] = {
'double': "_" + pre + 'd',
'float': "_" + pre,
}
result['rsqrt'] = None
bit_width = result['width'] * (64 if data_type == 'double' else 32) bit_width = result['width'] * (64 if data_type == 'double' else 32)
result['double'] = f"__m{bit_width}d" result['double'] = f"__m{bit_width}d"
result['float'] = f"__m{bit_width}" result['float'] = f"__m{bit_width}"
...@@ -144,22 +128,46 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -144,22 +128,46 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0" result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}" result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
if instruction_set == 'avx512': setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
size = 8 if data_type == 'double' else 16
result['&'] = f'_kand_mask{size}({{0}}, {{1}})' if instruction_set.startswith('avx512'):
result['|'] = f'_kor_mask{size}({{0}}, {{1}})' size = result['width']
result['any'] = f'!_ktestz_mask{size}_u8({{0}}, {{0}})' masksize = max(size, 8)
result['all'] = f'_kortestc_mask{size}_u8({{0}}, {{0}})' result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})' result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})" result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
result['abs'] = f"{pre}_abs_{suf}({{0}})" result['bool'] = f"__mmask{masksize}"
result['bool'] = f"__mmask{size}"
params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)]) params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
result['makeVecBool'] = f"__mmask8(({params}) )" result['makeVecBool'] = f"__mmask8(({params}) )"
params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)]) params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
result['makeVecConstBool'] = f"__mmask8(({params}) )" result['makeVecConstBool'] = f"__mmask8(({params}) )"
vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
scale = bit_width // size // 8
result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
f', {{1}}, {scale})'
result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
f', {{1}}, {scale})'
if bit_width == 512:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
else:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
# abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
if instruction_set == 'avx512':
result['abs'] = f"{pre}_abs_{suf}({{0}})"
else:
result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
f"{pre}_cast{suf}_si{bit_width}({{0}})))"
if instruction_set == 'avx' and data_type == 'float': if instruction_set == 'avx' and data_type == 'float':
result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})" result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"
......
import sympy as sp
# from pystencils.typing import get_type_of_expression
# noinspection PyPep8Naming
class flag_cond(sp.Function):
"""Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
depending on whether the flag is set.
Three argument version:
```
flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
```
Four argument version:
```
flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
```
"""
nargs = (3, 4)
def __new__(cls, flag_bit, mask_expression, *expressions):
# TODO Jan reintroduce checking
# flag_dtype = get_type_of_expression(flag_bit)
# if not flag_dtype.is_int():
# raise ValueError('Argument flag_bit must be of integer type.')
#
# mask_dtype = get_type_of_expression(mask_expression)
# if not mask_dtype.is_int():
# raise ValueError('Argument mask_expression must be of integer type.')
return super().__new__(cls, flag_bit, mask_expression, *expressions)
def to_c(self, print_func):
flag_bit = self.args[0]
mask = self.args[1]
then_expression = self.args[2]
flag_bit_code = print_func(flag_bit)
mask_code = print_func(mask)
then_code = print_func(then_expression)
code = f"(({mask_code}) >> ({flag_bit_code}) & 1) * ({then_code})"
if len(self.args) > 3:
else_expression = self.args[3]
else_code = print_func(else_expression)
code += f" + (({mask_code}) >> ({flag_bit_code}) ^ 1) * ({else_code})"
return code
from typing import Any, List, Tuple from typing import Any, List, Tuple
from pystencils import Assignment from pystencils.astnodes import SympyAssignment
from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
from pystencils.data_types import create_type from pystencils.typing import create_type
class Boundary: class Boundary:
...@@ -14,7 +14,7 @@ class Boundary: ...@@ -14,7 +14,7 @@ class Boundary:
def __init__(self, name=None): def __init__(self, name=None):
self._name = name self._name = name
def __call__(self, field, direction_symbol, index_field) -> List[Assignment]: def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
"""Defines the boundary behavior and must therefore be implemented by all boundaries. """Defines the boundary behavior and must therefore be implemented by all boundaries.
Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated. Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
...@@ -63,20 +63,20 @@ class Neumann(Boundary): ...@@ -63,20 +63,20 @@ class Neumann(Boundary):
neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions) neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field.center, field[neighbor])] return [SympyAssignment(field.center, field[neighbor])]
else: else:
from itertools import product from itertools import product
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Neumann boundary works only for fields with fixed index shape") raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
index_iter = product(*(range(i) for i in field.index_shape)) index_iter = product(*(range(i) for i in field.index_shape))
return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter] return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
def __hash__(self): def __hash__(self):
# All boundaries of these class behave equal -> should also be equal # All boundaries of these class behave equal -> should also be equal
return hash("Neumann") return hash("Neumann")
def __eq__(self, other): def __eq__(self, other):
return type(other) == Neumann return type(other) is Neumann
class Dirichlet(Boundary): class Dirichlet(Boundary):
...@@ -103,11 +103,11 @@ class Dirichlet(Boundary): ...@@ -103,11 +103,11 @@ class Dirichlet(Boundary):
def __call__(self, field, direction_symbol, index_field, **kwargs): def __call__(self, field, direction_symbol, index_field, **kwargs):
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field.center, index_field("value") if self.additional_data else self._value)] return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
elif field.index_dimensions == 1: elif field.index_dimensions == 1:
assert not self.additional_data assert not self.additional_data
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Field needs fixed index shape") raise NotImplementedError("Field needs fixed index shape")
assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field" assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])] return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension") raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
from functools import lru_cache
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils import create_indexed_kernel from pystencils import create_kernel, CreateKernelConfig, Target
from pystencils.assignment import Assignment from pystencils.astnodes import SympyAssignment
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
from pystencils.boundaries.createindexlist import ( from pystencils.boundaries.createindexlist import (
create_boundary_index_array, numpy_data_type_for_boundary_object) create_boundary_index_array, numpy_data_type_for_boundary_object)
from pystencils.cache import memorycache from pystencils.typing import TypedSymbol, create_type
from pystencils.data_types import TypedSymbol, create_type from pystencils.gpu.gpu_array_handler import GPUArrayHandler
from pystencils.datahandling.pycuda import PyCudaArrayHandler
from pystencils.field import Field from pystencils.field import Field
from pystencils.kernelparameters import FieldPointerSymbol from pystencils.typing.typed_sympy import FieldPointerSymbol
try: try:
# noinspection PyPep8Naming # noinspection PyPep8Naming
import waLBerla as wlb import waLBerla as wlb
if wlb.cpp_available: if wlb.cpp_available:
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
import cupy.cuda.runtime
else: else:
ParallelDataHandling = None ParallelDataHandling = None
except ImportError: except ImportError:
...@@ -33,11 +35,11 @@ class FlagInterface: ...@@ -33,11 +35,11 @@ class FlagInterface:
>>> dh = create_data_handling((4, 5)) >>> dh = create_data_handling((4, 5))
>>> fi = FlagInterface(dh, 'flag_field', np.uint8) >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
>>> assert dh.has_data('flag_field') >>> assert dh.has_data('flag_field')
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
2 2
>>> fi.reserve_flag(4) >>> int(fi.reserve_flag(4))
4 4
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
8 8
""" """
...@@ -84,7 +86,7 @@ class FlagInterface: ...@@ -84,7 +86,7 @@ class FlagInterface:
class BoundaryHandling: class BoundaryHandling:
def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None, def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
target='cpu', openmp=True): target: Target = Target.CPU, openmp=True):
assert data_handling.has_data(field_name) assert data_handling.has_data(field_name)
assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match" assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
self._data_handling = data_handling self._data_handling = data_handling
...@@ -99,7 +101,7 @@ class BoundaryHandling: ...@@ -99,7 +101,7 @@ class BoundaryHandling:
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags") self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling): if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
array_handler = PyCudaArrayHandler() array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
else: else:
array_handler = self.data_handling.array_handler array_handler = self.data_handling.array_handler
...@@ -115,7 +117,8 @@ class BoundaryHandling: ...@@ -115,7 +117,8 @@ class BoundaryHandling:
for obj, cpu_arr in cpu_version.items(): for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape: if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = array_handler.to_gpu(cpu_arr) gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
array_handler.upload(gpu_version[obj], cpu_arr)
else: else:
array_handler.upload(gpu_version[obj], cpu_arr) array_handler.upload(gpu_version[obj], cpu_arr)
...@@ -378,15 +381,15 @@ class BoundaryDataSetter: ...@@ -378,15 +381,15 @@ class BoundaryDataSetter:
assert coord < self.dim assert coord < self.dim
return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5 return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
@memorycache() @lru_cache()
def link_offsets(self): def link_offsets(self):
return self.stencil[self.index_array['dir']] return self.stencil[self.index_array['dir']]
@memorycache() @lru_cache()
def link_positions(self, coord): def link_positions(self, coord):
return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
@memorycache() @lru_cache()
def boundary_cell_positions(self, coord): def boundary_cell_positions(self, coord):
return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
...@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode): ...@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
code = "\n" code = "\n"
for i in range(dim): for i in range(dim):
offset_str = ", ".join([str(d[i]) for d in stencil]) offset_str = ", ".join([str(d[i]) for d in stencil])
code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str) code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
inv_dirs = [] inv_dirs = []
for direction in stencil: for direction in stencil:
inverse_dir = tuple([-i for i in direction]) inverse_dir = tuple([-i for i in direction])
inv_dirs.append(str(stencil.index(inverse_dir))) inv_dirs.append(str(stencil.index(inverse_dir)))
code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs)) code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
offset_symbols = BoundaryOffsetInfo._offset_symbols(dim) offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(), super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL])) symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
@staticmethod @staticmethod
def _offset_symbols(dim): def _offset_symbols(dim):
return [TypedSymbol(f"c{d}", create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]] return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
INV_DIR_SYMBOL = TypedSymbol("invdir", "int") INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', **kernel_creation_args): def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
elements = [BoundaryOffsetInfo(stencil)] elements = [BoundaryOffsetInfo(stencil)]
index_arr_dtype = index_field.dtype.numpy_dtype dir_symbol = TypedSymbol("dir", np.int32)
dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0]) elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
elements += [Assignment(dir_symbol, index_field[0]('dir'))]
elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field) elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
return create_indexed_kernel(elements, [index_field], target=target, **kernel_creation_args) config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
**kernel_creation_args)
return create_kernel(elements, config=config)
import itertools
import warnings import warnings
import numpy as np import numpy as np
try: try:
# Try to import right away - assume compiled code is available import pyximport
# compile with: python setup.py build_ext --inplace --use-cython
from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
pyximport.install(language_level=3)
cython_funcs_available = True cython_funcs_available = True
except ImportError: except ImportError:
try: cython_funcs_available = False
# If not, try development mode and import via pyximport
import pyximport if cython_funcs_available:
from pystencils.boundaries.createindexlistcython import (
pyximport.install(language_level=3) create_boundary_neighbor_index_list_2d,
cython_funcs_available = True create_boundary_neighbor_index_list_3d,
except ImportError: create_boundary_cell_index_list_2d,
cython_funcs_available = False create_boundary_cell_index_list_3d,
if cython_funcs_available: )
from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, \
create_boundary_cell_index_list_3d
boundary_index_array_coordinate_names = ["x", "y", "z"] boundary_index_array_coordinate_names = ["x", "y", "z"]
direction_member_name = "dir" direction_member_name = "dir"
default_index_array_dtype = np.int32
def numpy_data_type_for_boundary_object(boundary_object, dim): def numpy_data_type_for_boundary_object(boundary_object, dim):
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
return np.dtype([(name, np.int32) for name in coordinate_names] return np.dtype(
+ [(direction_member_name, np.int32)] [(name, default_index_array_dtype) for name in coordinate_names]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True) + [(direction_member_name, default_index_array_dtype)]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
align=True,
def _create_boundary_neighbor_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask, )
fluid_mask, stencil, single_link):
coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)]) def _create_index_list_python(
flag_field_arr,
result = [] boundary_mask,
gl = nr_of_ghost_layers fluid_mask,
for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])): stencil,
cell = cell[::-1] single_link,
if not flag_field_arr[cell] & fluid_mask: inner_or_boundary=False,
continue nr_of_ghost_layers=None,
):
if inner_or_boundary and nr_of_ghost_layers is None:
raise ValueError(
"If inner_or_boundary is set True the number of ghost layers "
"around the inner domain has to be specified"
)
if nr_of_ghost_layers is None:
nr_of_ghost_layers = 0
coordinate_names = boundary_index_array_coordinate_names[
: len(flag_field_arr.shape)
]
index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
+ [(direction_member_name, default_index_array_dtype)]
)
# boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
# have to be sorted.
boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
for i in range(len(flag_field_arr.shape)):
boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
# First a set is created to save all fluid cells which are near boundary
fluid_cells = set()
for cell in boundary_cells:
cell = tuple(cell)
for dir_idx, direction in enumerate(stencil): for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]) neighbor_cell = tuple(
if flag_field_arr[neighbor_cell] & boundary_mask: [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
result.append(cell + (dir_idx,)) )
if single_link: # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
break if any(
not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
return np.array(result, dtype=index_arr_dtype) for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue
if flag_field_arr[neighbor_cell] & fluid_mask:
fluid_cells.add(neighbor_cell)
# then this is set is transformed to a list to make it sortable. This ensures continoous memory access later.
fluid_cells = list(fluid_cells)
if len(flag_field_arr.shape) == 3:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
else:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
def _create_boundary_cell_index_list_python(flag_field_arr, boundary_mask, cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
fluid_mask, stencil, single_link): checkmask = boundary_mask if inner_or_boundary else fluid_mask
coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
result = [] result = []
for cell in itertools.product(*reversed([range(0, i) for i in flag_field_arr.shape])): for cell in cells_to_iterate:
cell = cell[::-1] cell = tuple(cell)
if not flag_field_arr[cell] & boundary_mask: sum_cells = np.zeros(len(cell))
continue
for dir_idx, direction in enumerate(stencil): for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]) neighbor_cell = tuple(
if any(not 0 <= e < upper for e, upper in zip(neighbor_cell, flag_field_arr.shape)): [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
)
# prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
if any(
not 0 <= e < upper
for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue continue
if flag_field_arr[neighbor_cell] & fluid_mask: if flag_field_arr[neighbor_cell] & checkmask:
result.append(cell + (dir_idx,))
if single_link: if single_link:
break sum_cells += np.array(direction)
else:
result.append(tuple(cell) + (dir_idx,))
# the discrete normal direction is the one which gives the maximum inner product to the stencil direction
if single_link and any(sum_cells != 0):
idx = np.argmax(np.inner(sum_cells, stencil))
result.append(tuple(cell) + (idx,))
return np.array(result, dtype=index_arr_dtype) return np.array(result, dtype=index_arr_dtype)
def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, def create_boundary_index_list(
nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False): flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
"""Creates a numpy array storing links (connections) between domain cells and boundary cells. """Creates a numpy array storing links (connections) between domain cells and boundary cells.
Args: Args:
...@@ -91,15 +141,25 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, ...@@ -91,15 +141,25 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
nr_of_ghost_layers: only relevant if neighbors is True nr_of_ghost_layers: only relevant if neighbors is True
inner_or_boundary: if true, the result contains the cell coordinates of the domain cells - inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
if false the boundary cells are listed if false the boundary cells are listed
single_link: if true only the first link is reported from this cell single_link: if true only the link in normal direction to this cell is reported
""" """
dim = len(flag_field.shape) dim = len(flag_field.shape)
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)]) index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
stencil = np.array(stencil, dtype=np.int32) + [(direction_member_name, default_index_array_dtype)]
args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link) )
stencil = np.array(stencil, dtype=default_index_array_dtype)
args = (
flag_field,
nr_of_ghost_layers,
boundary_mask,
fluid_mask,
stencil,
single_link,
)
args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link) args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
if cython_funcs_available: if cython_funcs_available:
...@@ -118,24 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, ...@@ -118,24 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
return np.array(idx_list, dtype=index_arr_dtype) return np.array(idx_list, dtype=index_arr_dtype)
else: else:
if flag_field.size > 1e6: if flag_field.size > 1e6:
warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up") warnings.warn(
if inner_or_boundary: "Boundary setup may take very long! Consider installing cython to speed it up"
return _create_boundary_neighbor_index_list_python(*args) )
else: return _create_index_list_python(
return _create_boundary_cell_index_list_python(*args_no_gl) *args_no_gl,
inner_or_boundary=inner_or_boundary,
nr_of_ghost_layers=nr_of_ghost_layers,
def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object, )
nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
nr_of_ghost_layers, inner_or_boundary, single_link) def create_boundary_index_array(
flag_field,
stencil,
boundary_mask,
fluid_mask,
boundary_object,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
idx_array = create_boundary_index_list(
flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers,
inner_or_boundary,
single_link,
)
dim = len(flag_field.shape) dim = len(flag_field.shape)
if boundary_object.additional_data: if boundary_object.additional_data:
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim) index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype) extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
for prop in coordinate_names + ['dir']: for prop in coordinate_names + ["dir"]:
extended_idx_field[prop] = idx_array[prop] extended_idx_field[prop] = idx_array[prop]
idx_array = extended_idx_field idx_array = extended_idx_field
......
# distutils: language=c # cython: language_level=3str
# Workaround for cython bug
# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
WORKAROUND = "Something"
import cython import cython
...@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel ...@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel
cdef int xs, ys, x, y cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape xs, ys = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers): for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers): for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & fluid_mask: if flag_field[x, y] & fluid_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
dy = stencil[dirIdx,1]
if flag_field[x + dx, y + dy] & boundary_mask: if flag_field[x + dx, y + dy] & boundary_mask:
boundary_index_list.append((x,y, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy;
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
cdef int xs, ys, zs, x, y, z cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape xs, ys, zs = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
...@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers): for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers): for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers): for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & fluid_mask: if flag_field[x, y, z] & fluid_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
dy = stencil[dirIdx,1]
dz = stencil[dirIdx,2]
if flag_field[x + dx, y + dy, z + dz] & boundary_mask: if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
boundary_index_list.append((x,y,z, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx * sum_x + dy * sum_y + dz * sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field, ...@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
cdef int xs, ys, x, y cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape xs, ys = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
for y in range(0, ys): for y in range(0, ys):
for x in range(0, xs): for x in range(0, xs):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & boundary_mask: if flag_field[x, y] & boundary_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
dy = stencil[dirIdx,1]
if 0 <= x + dx < xs and 0 <= y + dy < ys: if 0 <= x + dx < xs and 0 <= y + dy < ys:
if flag_field[x + dx, y + dy] & fluid_mask: if flag_field[x + dx, y + dy] & fluid_mask:
boundary_index_list.append((x,y, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field, ...@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
cdef int xs, ys, zs, x, y, z cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape xs, ys, zs = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
...@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field, ...@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
for z in range(0, zs): for z in range(0, zs):
for y in range(0, ys): for y in range(0, ys):
for x in range(0, xs): for x in range(0, xs):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & boundary_mask: if flag_field[x, y, z] & boundary_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dy = stencil[dirIdx,1]
dz = stencil[dirIdx,2]
if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs: if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
if flag_field[x + dx, y + dy, z + dz] & fluid_mask: if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
boundary_index_list.append((x,y,z, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx=0
if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx*sum_x + dy*sum_y + dz*sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list return boundary_index_list
\ No newline at end of file
import sympy as sp import sympy as sp
from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
from pystencils.data_types import TypedSymbol, create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.field import Field from pystencils.field import Field
from pystencils.integer_functions import bitwise_and from pystencils.integer_functions import bitwise_and
......