Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 1249 additions and 488 deletions
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, first=''): def get_argument_string(function_shortcut, first=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1] args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "(" arg_string = "("
...@@ -16,10 +19,13 @@ def get_argument_string(function_shortcut, first=''): ...@@ -16,10 +19,13 @@ def get_argument_string(function_shortcut, first=''):
def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
if instruction_set != 'neon' and not instruction_set.startswith('sve'): if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
raise NotImplementedError(instruction_set) raise NotImplementedError(instruction_set)
if instruction_set == 'sve': if instruction_set in ['sve', 'sve2', 'sme']:
cmp = 'cmp'
elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
cmp = 'cmp' cmp = 'cmp'
bitwidth = int(instruction_set[4:])
elif instruction_set.startswith('sve'): elif instruction_set.startswith('sve'):
cmp = 'cmp' cmp = 'cmp'
bitwidth = int(instruction_set[3:]) bitwidth = int(instruction_set[3:])
...@@ -35,9 +41,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): ...@@ -35,9 +41,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
'sqrt': 'sqrt[0]', 'sqrt': 'sqrt[0]',
'loadU': 'ld1[0]', 'loadU': 'ld1[0]',
'loadA': 'ld1[0]',
'storeU': 'st1[0, 1]', 'storeU': 'st1[0, 1]',
'storeA': 'st1[0, 1]',
'abs': 'abs[0]', 'abs': 'abs[0]',
'==': f'{cmp}eq[0, 1]', '==': f'{cmp}eq[0, 1]',
...@@ -54,7 +58,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): ...@@ -54,7 +58,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result = dict() result = dict()
if instruction_set == 'sve': if instruction_set in ['sve', 'sve2', 'sme']:
width = 'svcntd()' if data_type == 'double' else 'svcntw()' width = 'svcntd()' if data_type == 'double' else 'svcntw()'
intwidth = 'svcntw()' intwidth = 'svcntw()'
result['bytes'] = 'svcntb()' result['bytes'] = 'svcntb()'
...@@ -62,14 +66,15 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): ...@@ -62,14 +66,15 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
width = bitwidth // bits[data_type] width = bitwidth // bits[data_type]
intwidth = bitwidth // bits['int'] intwidth = bitwidth // bits['int']
result['bytes'] = bitwidth // 8 result['bytes'] = bitwidth // 8
if instruction_set.startswith('sve'): if instruction_set.startswith('sve') or instruction_set == 'sme':
base_names['stream'] = 'stnt1[0, 1]'
prefix = 'sv' prefix = 'sv'
suffix = f'_f{bits[data_type]}' suffix = f'_f{bits[data_type]}'
elif instruction_set == 'neon': elif instruction_set == 'neon':
prefix = 'v' prefix = 'v'
suffix = f'q_f{bits[data_type]}' suffix = f'q_f{bits[data_type]}'
if instruction_set == 'sve': if instruction_set in ['sve', 'sve2', 'sme']:
predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})' predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})' int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
else: else:
...@@ -88,33 +93,36 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): ...@@ -88,33 +93,36 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result[intrinsic_id] = prefix + name + suffix + undef + arg_string result[intrinsic_id] = prefix + name + suffix + undef + arg_string
if instruction_set == 'sve': if instruction_set in ['sve', 'sve2', 'sme']:
from pystencils.backends.cbackend import CFunction
result['width'] = CFunction(width, "int") result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int") result['intwidth'] = CFunction(intwidth, "int")
else: else:
result['width'] = width result['width'] = width
result['intwidth'] = intwidth result['intwidth'] = intwidth
if instruction_set.startswith('sve'): if instruction_set.startswith('sve') or instruction_set == 'sme':
result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})' result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})' result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})' result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
vindex = f'svindex_u{bits[data_type]}(0, {{0}})' if instruction_set != 'sme':
result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
vindex.format("{2}") + ', {1})' result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \ vindex.format("{2}") + ', {1})'
vindex.format("{1}") + ')' result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format("{1}") + ')'
if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})" result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set != "sve" else ""}t' result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set != "sve" else ""}t' result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['int'] = f'svint{bits["int"]}_{"s" if instruction_set != "sve" else ""}t' result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['bool'] = f'svbool_{"s" if instruction_set != "sve" else ""}t' result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"'] result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']
result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})' result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})' result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
...@@ -123,10 +131,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): ...@@ -123,10 +131,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}' result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
result['maskStoreU'] = result['storeU'].replace(predicate, '{2}') result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
result['maskStoreA'] = result['storeA'].replace(predicate, '{2}') result['maskStream'] = result['stream'].replace(predicate, '{2}')
result['maskStoreS'] = result['storeS'].replace(predicate, '{3}') if instruction_set != 'sme':
result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')
if instruction_set != 'sve': result['streamFence'] = '__dmb(15)'
if instruction_set == 'sme':
result['function_prefix'] = '__attribute__((arm_locally_streaming))'
elif instruction_set not in ['sve', 'sve2', 'sme']:
result['compile_flags'] = [f'-msve-vector-bits={bitwidth}'] result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
else: else:
result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})' result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
...@@ -151,9 +166,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'): ...@@ -151,9 +166,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0' result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff' result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
if instruction_set == 'sve' or bitwidth & (bitwidth - 1) == 0: # SVE has real nontemporal stores, so we only need to zero cachlines on Neon
# only power-of-2 vector sizes will evenly divide a cacheline
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})' result['cachelineZero'] = 'cachelineZero((void*) {0})'
result['cachelineSize'] = 'cachelineSize()'
return result return result
...@@ -6,15 +6,18 @@ from typing import Set ...@@ -6,15 +6,18 @@ from typing import Set
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from sympy.core import S from sympy.core import S
from sympy.core.cache import cacheit
from sympy.logic.boolalg import BooleanFalse, BooleanTrue from sympy.logic.boolalg import BooleanFalse, BooleanTrue
from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
from sympy.functions.elementary.hyperbolic import HyperbolicFunction
from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
from pystencils.data_types import ( from pystencils.typing import (
PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression, PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol) ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol, CFunction)
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.functions import DivFunc, AddressOf
from pystencils.integer_functions import ( from pystencils.integer_functions import (
bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor, bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
int_div, int_power_of_2, modulo_ceil) int_div, int_power_of_2, modulo_ceil)
...@@ -29,12 +32,10 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy ...@@ -29,12 +32,10 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy
HEADER_REGEX = re.compile(r'^[<"].*[">]$') HEADER_REGEX = re.compile(r'^[<"].*[">]$')
KERNCRAFT_NO_TERNARY_MODE = False
def generate_c(ast_node: Node, def generate_c(ast_node: Node,
signature_only: bool = False, signature_only: bool = False,
dialect='c', dialect: Backend = Backend.C,
custom_backend=None, custom_backend=None,
with_globals=True) -> str: with_globals=True) -> str:
"""Prints an abstract syntax tree node as C or CUDA code. """Prints an abstract syntax tree node as C or CUDA code.
...@@ -46,7 +47,7 @@ def generate_c(ast_node: Node, ...@@ -46,7 +47,7 @@ def generate_c(ast_node: Node,
Args: Args:
ast_node: ast representation of kernel ast_node: ast representation of kernel
signature_only: generate signature without function body signature_only: generate signature without function body
dialect: 'c', 'cuda' or opencl dialect: `Backend`: 'C' or 'CUDA'
custom_backend: use own custom printer for code generation custom_backend: use own custom printer for code generation
with_globals: enable usage of global variables with_globals: enable usage of global variables
Returns: Returns:
...@@ -60,21 +61,19 @@ def generate_c(ast_node: Node, ...@@ -60,21 +61,19 @@ def generate_c(ast_node: Node,
ast_node.global_variables = d.symbols_defined ast_node.global_variables = d.symbols_defined
if custom_backend: if custom_backend:
printer = custom_backend printer = custom_backend
elif dialect == 'c': elif dialect == Backend.C:
try: try:
# TODO Vectorization Revamp: instruction_set should not be just slapped on ast
instruction_set = ast_node.instruction_set instruction_set = ast_node.instruction_set
except Exception: except Exception:
instruction_set = None instruction_set = None
printer = CBackend(signature_only=signature_only, printer = CBackend(signature_only=signature_only,
vector_instruction_set=instruction_set) vector_instruction_set=instruction_set)
elif dialect == 'cuda': elif dialect == Backend.CUDA:
from pystencils.backends.cuda_backend import CudaBackend from pystencils.backends.cuda_backend import CudaBackend
printer = CudaBackend(signature_only=signature_only) printer = CudaBackend(signature_only=signature_only)
elif dialect == 'opencl':
from pystencils.backends.opencl_backend import OpenClBackend
printer = OpenClBackend(signature_only=signature_only)
else: else:
raise ValueError("Unknown dialect: " + str(dialect)) raise ValueError(f'Unknown {dialect=}')
code = printer(ast_node) code = printer(ast_node)
if not signature_only and isinstance(ast_node, KernelFunction): if not signature_only and isinstance(ast_node, KernelFunction):
if with_globals and global_declarations: if with_globals and global_declarations:
...@@ -122,12 +121,12 @@ def get_headers(ast_node: Node) -> Set[str]: ...@@ -122,12 +121,12 @@ def get_headers(ast_node: Node) -> Set[str]:
for h in headers: for h in headers:
assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/' assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/'
return sorted(headers) return headers
# --------------------------------------- Backend Specific Nodes ------------------------------------------------------- # --------------------------------------- Backend Specific Nodes -------------------------------------------------------
# TODO future CustomCodeNode should not be backend specific move it elsewhere
class CustomCodeNode(Node): class CustomCodeNode(Node):
def __init__(self, code, symbols_read, symbols_defined, parent=None): def __init__(self, code, symbols_read, symbols_defined, parent=None):
super(CustomCodeNode, self).__init__(parent=parent) super(CustomCodeNode, self).__init__(parent=parent)
...@@ -151,8 +150,8 @@ class CustomCodeNode(Node): ...@@ -151,8 +150,8 @@ class CustomCodeNode(Node):
def undefined_symbols(self): def undefined_symbols(self):
return self._symbols_read - self._symbols_defined return self._symbols_read - self._symbols_defined
def __eq___(self, other): def __eq__(self, other):
return self._code == other._code return type(self) is type(other) and self._code == other._code
def __hash__(self): def __hash__(self):
return hash(self._code) return hash(self._code)
...@@ -166,30 +165,13 @@ class PrintNode(CustomCodeNode): ...@@ -166,30 +165,13 @@ class PrintNode(CustomCodeNode):
self.headers.append("<iostream>") self.headers.append("<iostream>")
class CFunction(TypedSymbol):
def __new__(cls, function, dtype):
return CFunction.__xnew_cached_(cls, function, dtype)
def __new_stage2__(cls, function, dtype):
return super(CFunction, cls).__xnew__(cls, function, dtype)
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __getnewargs__(self):
return self.name, self.dtype
def __getnewargs_ex__(self):
return (self.name, self.dtype), {}
# ------------------------------------------- Printer ------------------------------------------------------------------ # ------------------------------------------- Printer ------------------------------------------------------------------
# noinspection PyPep8Naming # noinspection PyPep8Naming
class CBackend: class CBackend:
def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect='c'): def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect=Backend.C):
if sympy_printer is None: if sympy_printer is None:
if vector_instruction_set is not None: if vector_instruction_set is not None:
self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set) self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
...@@ -216,19 +198,19 @@ class CBackend: ...@@ -216,19 +198,19 @@ class CBackend:
if isinstance(node, str): if isinstance(node, str):
return node return node
for cls in type(node).__mro__: for cls in type(node).__mro__:
method_name = "_print_" + cls.__name__ method_name = f"_print_{cls.__name__}"
if hasattr(self, method_name): if hasattr(self, method_name):
return getattr(self, method_name)(node) return getattr(self, method_name)(node)
raise NotImplementedError(self.__class__.__name__ + " does not support node of type " + node.__class__.__name__) raise NotImplementedError(f"{self.__class__.__name__} does not support node of type {node.__class__.__name__}")
def _print_Type(self, node): def _print_AbstractType(self, node):
return str(node) return str(node)
def _print_KernelFunction(self, node): def _print_KernelFunction(self, node):
function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters() function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()
if not type(s.symbol) is CFunction] if not type(s.symbol) is CFunction]
launch_bounds = "" launch_bounds = ""
if self._dialect == 'cuda': if self._dialect == Backend.CUDA:
max_threads = node.indexing.max_threads_per_block() max_threads = node.indexing.max_threads_per_block()
if max_threads: if max_threads:
launch_bounds = f"__launch_bounds__({max_threads}) " launch_bounds = f"__launch_bounds__({max_threads}) "
...@@ -248,12 +230,13 @@ class CBackend: ...@@ -248,12 +230,13 @@ class CBackend:
return f"{node.pragma_line}\n{self._print_Block(node)}" return f"{node.pragma_line}\n{self._print_Block(node)}"
def _print_LoopOverCoordinate(self, node): def _print_LoopOverCoordinate(self, node):
counter_symbol = node.loop_counter_name counter_name = node.loop_counter_name
start = f"int64_t {counter_symbol} = {self.sympy_printer.doprint(node.start)}" counter_dtype = node.loop_counter_symbol.dtype.c_name
condition = f"{counter_symbol} < {self.sympy_printer.doprint(node.stop)}" start = f"{counter_dtype} {counter_name} = {self.sympy_printer.doprint(node.start)}"
update = f"{counter_symbol} += {self.sympy_printer.doprint(node.step)}" condition = f"{counter_name} < {self.sympy_printer.doprint(node.stop)}"
update = f"{counter_name} += {self.sympy_printer.doprint(node.step)}"
loop_str = f"for ({start}; {condition}; {update})" loop_str = f"for ({start}; {condition}; {update})"
self._kwargs['loop_counter'] = counter_symbol self._kwargs['loop_counter'] = counter_name
self._kwargs['loop_stop'] = node.stop self._kwargs['loop_stop'] = node.stop
prefix = "\n".join(node.prefix_lines) prefix = "\n".join(node.prefix_lines)
...@@ -262,41 +245,50 @@ class CBackend: ...@@ -262,41 +245,50 @@ class CBackend:
return f"{prefix}{loop_str}\n{self._print(node.body)}" return f"{prefix}{loop_str}\n{self._print(node.body)}"
def _print_SympyAssignment(self, node): def _print_SympyAssignment(self, node):
printed_lhs = self.sympy_printer.doprint(node.lhs)
printed_rhs = self.sympy_printer.doprint(node.rhs)
if node.is_declaration: if node.is_declaration:
if node.use_auto: if node.use_auto:
data_type = 'auto ' data_type = 'auto'
else: else:
data_type = self._print(node.lhs.dtype).replace(' const', '')
if node.is_const: if node.is_const:
prefix = 'const ' data_type = f'const {data_type}'
else: return f"{data_type} {printed_lhs} = {printed_rhs};"
prefix = ''
data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "
return "%s%s = %s;" % (data_type,
self.sympy_printer.doprint(node.lhs),
self.sympy_printer.doprint(node.rhs))
else: else:
lhs_type = get_type_of_expression(node.lhs) lhs_type = get_type_of_expression(node.lhs) # TOOD: this should have been typed
printed_mask = "" printed_mask = ""
if type(lhs_type) is VectorType and isinstance(node.lhs, cast_func): if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
instr = 'storeU' instr = 'storeU'
if aligned: if nontemporal and 'storeA' not in self._vector_instruction_set and \
'stream' in self._vector_instruction_set:
instr = 'stream'
elif aligned:
instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA' instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
if mask != True: # NOQA if mask != True: # NOQA
instr = 'maskStoreA' if aligned else 'maskStoreU' instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
'maskStoreA' if aligned else 'maskStoreU'
if instr not in self._vector_instruction_set: if instr not in self._vector_instruction_set:
self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format( if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
store, load = 'stream', 'loadA'
elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
store, load = 'storeA', 'loadA'
else:
store, load = 'storeU', 'loadU'
load = load if load in self._vector_instruction_set else 'loadU'
self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
'{0}', self._vector_instruction_set['blendv'].format( '{0}', self._vector_instruction_set['blendv'].format(
self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs), self._vector_instruction_set[load].format('{0}', **self._kwargs),
'{1}', '{2}', **self._kwargs), **self._kwargs) '{1}', '{2}', **self._kwargs), **self._kwargs)
printed_mask = self.sympy_printer.doprint(mask) printed_mask = self.sympy_printer.doprint(mask)
if data_type.base_type.base_name == 'double': if data_type.base_type.c_name == 'double':
if self._vector_instruction_set['double'] == '__m256d': if self._vector_instruction_set['double'] == '__m256d':
printed_mask = f"_mm256_castpd_si256({printed_mask})" printed_mask = f"_mm256_castpd_si256({printed_mask})"
elif self._vector_instruction_set['double'] == '__m128d': elif self._vector_instruction_set['double'] == '__m128d':
printed_mask = f"_mm_castpd_si128({printed_mask})" printed_mask = f"_mm_castpd_si128({printed_mask})"
elif data_type.base_type.base_name == 'float': elif data_type.base_type.c_name == 'float':
if self._vector_instruction_set['float'] == '__m256': if self._vector_instruction_set['float'] == '__m256':
printed_mask = f"_mm256_castps_si256({printed_mask})" printed_mask = f"_mm256_castps_si256({printed_mask})"
elif self._vector_instruction_set['float'] == '__m128': elif self._vector_instruction_set['float'] == '__m128':
...@@ -304,25 +296,31 @@ class CBackend: ...@@ -304,25 +296,31 @@ class CBackend:
rhs_type = get_type_of_expression(node.rhs) rhs_type = get_type_of_expression(node.rhs)
if type(rhs_type) is not VectorType: if type(rhs_type) is not VectorType:
rhs = cast_func(node.rhs, VectorType(rhs_type)) raise ValueError(f'Cannot vectorize {node.rhs} of type {rhs_type} inside of the pretty printer! '
f'This should have happen earlier!')
# rhs = CastFunc(node.rhs, VectorType(rhs_type)) # Unknown width
else: else:
rhs = node.rhs rhs = node.rhs
ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0]) ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
if stride != 1: if stride != 1:
instr = 'maskStoreS' if mask != True else 'storeS' # NOQA instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
'maskStoreS') if mask != True else \
('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS') # NOQA
return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs), return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
stride, printed_mask, **self._kwargs) + ';' stride, printed_mask, **self._kwargs) + ';'
pre_code = '' pre_code = ''
if nontemporal and 'cachelineZero' in self._vector_instruction_set: if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True: # NOQA
first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0" first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i)) offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
* node.lhs.args[0].field.spatial_strides[i] for i in * node.lhs.args[0].field.spatial_strides[i] for i in
range(len(node.lhs.args[0].field.spatial_strides))]) range(len(node.lhs.args[0].field.spatial_strides))])
if stride == 1:
offset = offset.subs({node.lhs.args[0].field.spatial_strides[0]: 1})
size = sp.Mul(*node.lhs.args[0].field.spatial_shape) size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
element_size = 8 if data_type.base_type.base_name == 'double' else 4 element_size = 8 if data_type.base_type.c_name == 'double' else 4
size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}" size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \ pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n' self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n'
...@@ -334,17 +332,26 @@ class CBackend: ...@@ -334,17 +332,26 @@ class CBackend:
code2 = self._vector_instruction_set['flushCacheline'].format( code2 = self._vector_instruction_set['flushCacheline'].format(
ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';' ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}" code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set: elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8] lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \ code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
+ self.sympy_printer.doprint(rhs) + ';' + self.sympy_printer.doprint(rhs) + ';'
code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';' code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask, maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
**self._kwargs) + ';' instr2 = maskStore if mask != True else store # NOQA
if instr2 not in self._vector_instruction_set:
self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
'{0}', self._vector_instruction_set['blendv'].format(
self._vector_instruction_set[load].format('{0}', **self._kwargs),
'{1}', '{2}', **self._kwargs),
**self._kwargs)
code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}" code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
return pre_code + code return pre_code + code
else: else:
return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};" return f"{printed_lhs} = {printed_rhs};"
def _print_NontemporalFence(self, _): def _print_NontemporalFence(self, _):
if 'streamFence' in self._vector_instruction_set: if 'streamFence' in self._vector_instruction_set:
...@@ -436,23 +443,31 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -436,23 +443,31 @@ class CustomSympyPrinter(CCodePrinter):
def __init__(self): def __init__(self):
super(CustomSympyPrinter, self).__init__() super(CustomSympyPrinter, self).__init__()
self._float_type = create_type("float32")
def _print_Pow(self, expr): def _print_Pow(self, expr):
"""Don't use std::pow function, for small integer exponents, write as multiplication""" """Don't use std::pow function, for small integer exponents, write as multiplication"""
if not expr.free_symbols: # Ideally the printer has as little logic as possible. Therefore,
return self._typed_number(expr.evalf(), get_type_of_expression(expr)) # powers should be rewritten as `DivFunc`s / unevaluated `Mul`s before
# printing. `NodeCollection` offers a convenience function to do just
if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8: # that. However, `cut_loops` rewrites unevaluated multiplications as
return f"({self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False))})" # `Pow`s again. Neither `deepcopy` nor `func(*args)` are suited to
elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0: # rebuild unevaluated expressions. Therefore, as long as we stick with
return f"1 / ({self._print(sp.Mul(*([expr.base] * -expr.exp), evaluate=False))})" # SymPy, this is the only way to avoid printing `pow`s.
exp = expr.exp.expr if isinstance(expr.exp, CastFunc) else expr.exp
one_type = expr.base.dtype if hasattr(expr.base, "dtype") else get_type_of_expression(expr.base)
if exp.is_integer and exp.is_number and (0 < exp <= 8):
return f"({self._print(sp.Mul(*[expr.base] * exp, evaluate=False))})"
elif exp.is_integer and exp.is_number and (-8 <= exp < 0):
return f"{self._typed_number(1, one_type)} / ({self._print(sp.Mul(*([expr.base] * -exp), evaluate=False))})"
else: else:
return super(CustomSympyPrinter, self)._print_Pow(expr) return super(CustomSympyPrinter, self)._print_Pow(expr)
# TODO don't print ones in sp.Mul
def _print_Rational(self, expr): def _print_Rational(self, expr):
"""Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0""" """Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0"""
res = str(expr.evalf().num) res = str(expr.evalf(17))
return res return res
def _print_Equality(self, expr): def _print_Equality(self, expr):
...@@ -470,7 +485,7 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -470,7 +485,7 @@ class CustomSympyPrinter(CCodePrinter):
else: else:
return f'fabs({self._print(expr.args[0])})' return f'fabs({self._print(expr.args[0])})'
def _print_Type(self, node): def _print_AbstractType(self, node):
return str(node) return str(node)
def _print_Function(self, expr): def _print_Function(self, expr):
...@@ -483,30 +498,48 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -483,30 +498,48 @@ class CustomSympyPrinter(CCodePrinter):
} }
if hasattr(expr, 'to_c'): if hasattr(expr, 'to_c'):
return expr.to_c(self._print) return expr.to_c(self._print)
if isinstance(expr, reinterpret_cast_func): if isinstance(expr, ReinterpretCastFunc):
arg, data_type = expr.args arg, data_type = expr.args
return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))" if isinstance(data_type, PointerType):
elif isinstance(expr, address_of): const_str = "const" if data_type.const else ""
return f"(({const_str} {self._print(data_type.base_type)} *)(& {self._print(arg)}))"
else:
return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
elif isinstance(expr, AddressOf):
assert len(expr.args) == 1, "address_of must only have one argument" assert len(expr.args) == 1, "address_of must only have one argument"
return f"&({self._print(expr.args[0])})" return f"&({self._print(expr.args[0])})"
elif isinstance(expr, cast_func): elif isinstance(expr, CastFunc):
cast = "(({data_type})({code}))"
arg, data_type = expr.args arg, data_type = expr.args
if isinstance(arg, sp.Number) and arg.is_finite: if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
return self._typed_number(arg, data_type) return self._typed_number(arg, data_type)
elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
and data_type == BasicType('float32'):
known = self.known_functions[arg.__class__.__name__.lower()]
code = self._print(arg)
return code.replace(known, f"{known}f")
elif isinstance(arg, (sp.Pow, sp.exp)) and data_type == BasicType('float32'):
known = ['sqrt', 'cbrt', 'pow', 'exp']
code = self._print(arg)
for k in known:
if k in code:
return code.replace(k, f'{k}f')
# Powers of small integers are printed as divisions/multiplications.
if '/' in code or '*' in code:
return cast.format(data_type=data_type, code=code)
raise ValueError(f"{code} doesn't give {known=} function back.")
else: else:
return f"(({data_type})({self._print(arg)}))" return cast.format(data_type=data_type, code=self._print(arg))
elif isinstance(expr, fast_division): elif isinstance(expr, fast_division):
return f"({self._print(expr.args[0] / expr.args[1])})" raise ValueError("fast_division is only supported for Taget.GPU")
elif isinstance(expr, fast_sqrt): elif isinstance(expr, fast_sqrt):
return f"({self._print(sp.sqrt(expr.args[0]))})" raise ValueError("fast_sqrt is only supported for Taget.GPU")
elif isinstance(expr, fast_inv_sqrt):
raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
elif isinstance(expr, vec_any) or isinstance(expr, vec_all): elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
return self._print(expr.args[0]) return self._print(expr.args[0])
elif isinstance(expr, fast_inv_sqrt):
return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
elif isinstance(expr, sp.Abs): elif isinstance(expr, sp.Abs):
return f"abs({self._print(expr.args[0])})" return f"abs({self._print(expr.args[0])})"
elif isinstance(expr, sp.Max):
return self._print(expr)
elif isinstance(expr, sp.Mod): elif isinstance(expr, sp.Mod):
if expr.args[0].is_integer and expr.args[1].is_integer: if expr.args[0].is_integer and expr.args[1].is_integer:
return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})" return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})"
...@@ -518,6 +551,8 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -518,6 +551,8 @@ class CustomSympyPrinter(CCodePrinter):
return f"(1 << ({self._print(expr.args[0])}))" return f"(1 << ({self._print(expr.args[0])}))"
elif expr.func == int_div: elif expr.func == int_div:
return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))" return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))"
elif expr.func == DivFunc:
return f'(({self._print(expr.divisor)}) / ({self._print(expr.dividend)}))'
else: else:
name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__ name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__
arg_str = ', '.join(self._print(a) for a in expr.args) arg_str = ', '.join(self._print(a) for a in expr.args)
...@@ -540,52 +575,6 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -540,52 +575,6 @@ class CustomSympyPrinter(CCodePrinter):
else: else:
return res return res
def _print_Sum(self, expr):
template = """[&]() {{
{dtype} sum = ({dtype}) 0;
for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
sum += {expr};
}}
return sum;
}}()"""
var = expr.limits[0][0]
start = expr.limits[0][1]
end = expr.limits[0][2]
code = template.format(
dtype=get_type_of_expression(expr.args[0]),
iterator_dtype='int',
var=self._print(var),
start=self._print(start),
end=self._print(end),
expr=self._print(expr.function),
increment=str(1),
condition=self._print(var) + ' <= ' + self._print(end) # if start < end else '>='
)
return code
def _print_Product(self, expr):
template = """[&]() {{
{dtype} product = ({dtype}) 1;
for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
product *= {expr};
}}
return product;
}}()"""
var = expr.limits[0][0]
start = expr.limits[0][1]
end = expr.limits[0][2]
code = template.format(
dtype=get_type_of_expression(expr.args[0]),
iterator_dtype='int',
var=self._print(var),
start=self._print(start),
end=self._print(end),
expr=self._print(expr.function),
increment=str(1),
condition=self._print(var) + ' <= ' + self._print(end) # if start < end else '>='
)
return code
def _print_ConditionalFieldAccess(self, node): def _print_ConditionalFieldAccess(self, node):
return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True))) return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True)))
...@@ -609,27 +598,6 @@ class CustomSympyPrinter(CCodePrinter): ...@@ -609,27 +598,6 @@ class CustomSympyPrinter(CCodePrinter):
return f"(({a} < {b}) ? {a} : {b})" return f"(({a} < {b}) ? {a} : {b})"
return inner_print_min(expr.args) return inner_print_min(expr.args)
def _print_re(self, expr):
return f"real({self._print(expr.args[0])})"
def _print_im(self, expr):
return f"imag({self._print(expr.args[0])})"
def _print_ImaginaryUnit(self, expr):
return "complex<double>{0,1}"
def _print_TypedImaginaryUnit(self, expr):
if expr.dtype.numpy_dtype == np.complex64:
return "complex<float>{0,1}"
elif expr.dtype.numpy_dtype == np.complex128:
return "complex<double>{0,1}"
else:
raise NotImplementedError(
"only complex64 and complex128 supported")
def _print_Complex(self, expr):
return self._typed_number(expr, np.complex64)
# noinspection PyPep8Naming # noinspection PyPep8Naming
class VectorizedCustomSympyPrinter(CustomSympyPrinter): class VectorizedCustomSympyPrinter(CustomSympyPrinter):
...@@ -648,55 +616,100 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -648,55 +616,100 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
return None return None
def _print_Abs(self, expr): def _print_Abs(self, expr):
if 'abs' in self.instruction_set and isinstance(expr.args[0], vector_memory_access): if isinstance(get_type_of_expression(expr), (VectorType, VectorMemoryAccess)):
return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs) return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs)
return super()._print_Abs(expr) return super()._print_Abs(expr)
def _typed_vectorized_number(self, expr, data_type):
basic_data_type = data_type.base_type
number = self._typed_number(expr, basic_data_type)
instruction = 'makeVecConst'
if basic_data_type.is_bool():
instruction = 'makeVecConstBool'
# TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
elif basic_data_type.is_int():
instruction = 'makeVecConstInt'
return self.instruction_set[instruction].format(number, **self._kwargs)
def _typed_vectorized_symbol(self, expr, data_type):
if not isinstance(expr, TypedSymbol):
raise ValueError(f'{expr} is not a TypeSymbol. It is {expr.type=}')
basic_data_type = data_type.base_type
symbol = self._print(expr)
if basic_data_type != expr.dtype:
symbol = f'(({basic_data_type})({symbol}))'
instruction = 'makeVecConst'
if basic_data_type.is_bool():
instruction = 'makeVecConstBool'
# TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
elif basic_data_type.is_int():
instruction = 'makeVecConstInt'
return self.instruction_set[instruction].format(symbol, **self._kwargs)
def _print_CastFunc(self, expr):
arg, data_type = expr.args
if type(data_type) is VectorType:
base_type = data_type.base_type
# vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
assert not isinstance(arg, VectorMemoryAccess)
if isinstance(arg, sp.Tuple):
is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
is_integer = get_type_of_expression(arg[0]) == create_type("int")
printed_args = [self._print(a) for a in arg]
instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
increments = np.array(arg)[1:] - np.array(arg)[:-1]
if len(set(increments)) == 1:
return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
**self._kwargs)
return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
else:
if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
return self._typed_vectorized_number(arg, data_type)
elif isinstance(arg, TypedSymbol):
return self._typed_vectorized_symbol(arg, data_type)
elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
and base_type == BasicType('float32'):
raise NotImplementedError('Vectorizer is not tested for trigonometric functions yet')
# known = self.known_functions[arg.__class__.__name__.lower()]
# code = self._print(arg)
# return code.replace(known, f"{known}f")
elif isinstance(arg, sp.Pow):
if base_type == BasicType('float32') or base_type == BasicType('float64'):
return self._print_Pow(arg)
else:
raise NotImplementedError('Integer Pow is not implemented')
elif isinstance(arg, sp.UnevaluatedExpr):
return self._print(arg.args[0])
else:
raise NotImplementedError('Vectorizer cannot cast between different datatypes')
# to_type = self.instruction_set['suffix'][data_type.base_type.c_name]
# from_type = self.instruction_set['suffix'][get_type_of_expression(arg).base_type.c_name]
# return self.instruction_set['cast'].format(from_type, to_type, self._print(arg))
else:
return self._scalarFallback('_print_Function', expr)
# raise ValueError(f'Non VectorType cast "{data_type}" in vectorized code.')
def _print_Function(self, expr): def _print_Function(self, expr):
if isinstance(expr, vector_memory_access): if isinstance(expr, VectorMemoryAccess):
arg, data_type, aligned, _, mask, stride = expr.args arg, data_type, aligned, _, mask, stride = expr.args
if stride != 1: if stride != 1:
return self.instruction_set['loadS'].format("& " + self._print(arg), stride, **self._kwargs) return self.instruction_set['loadS'].format(f"& {self._print(arg)}", stride, **self._kwargs)
instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU'] instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU']
return instruction.format("& " + self._print(arg), **self._kwargs) return instruction.format(f"& {self._print(arg)}", **self._kwargs)
elif isinstance(expr, cast_func): elif expr.func == DivFunc:
arg, data_type = expr.args
if type(data_type) is VectorType:
# vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
assert not isinstance(arg, vector_memory_access)
if isinstance(arg, sp.Tuple):
is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
is_integer = get_type_of_expression(arg[0]) == create_type("int")
printed_args = [self._print(a) for a in arg]
instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
increments = np.array(arg)[1:] - np.array(arg)[:-1]
if len(set(increments)) == 1:
return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
**self._kwargs)
return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
else:
is_boolean = get_type_of_expression(arg) == create_type("bool")
is_integer = get_type_of_expression(arg) == create_type("int") or \
(isinstance(arg, TypedSymbol) and not isinstance(arg.dtype, VectorType) and arg.dtype.is_int())
instruction = 'makeVecConstBool' if is_boolean else \
'makeVecConstInt' if is_integer else 'makeVecConst'
return self.instruction_set[instruction].format(self._print(arg), **self._kwargs)
elif expr.func == fast_division:
result = self._scalarFallback('_print_Function', expr) result = self._scalarFallback('_print_Function', expr)
if not result: if not result:
result = self.instruction_set['/'].format(self._print(expr.args[0]), self._print(expr.args[1]), result = self.instruction_set['/'].format(self._print(expr.divisor), self._print(expr.dividend),
**self._kwargs) **self._kwargs)
return result return result
elif expr.func == fast_sqrt: elif isinstance(expr, fast_division):
return f"({self._print(sp.sqrt(expr.args[0]))})" raise ValueError("fast_division is only supported for Taget.GPU")
elif expr.func == fast_inv_sqrt: elif isinstance(expr, fast_sqrt):
result = self._scalarFallback('_print_Function', expr) raise ValueError("fast_sqrt is only supported for Taget.GPU")
if not result: elif isinstance(expr, fast_inv_sqrt):
if 'rsqrt' in self.instruction_set: raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
return self.instruction_set['rsqrt'].format(self._print(expr.args[0]), **self._kwargs)
else:
return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
elif isinstance(expr, vec_any) or isinstance(expr, vec_all): elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
instr = 'any' if isinstance(expr, vec_any) else 'all' instr = 'any' if isinstance(expr, vec_any) else 'all'
expr_type = get_type_of_expression(expr.args[0]) expr_type = get_type_of_expression(expr.args[0])
...@@ -747,12 +760,12 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -747,12 +760,12 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
# special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization # special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
suffix = "" suffix = ""
if all([(type(e) is cast_func and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer) if all([(type(e) is CastFunc and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]): or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
dtype = set([e.dtype for e in args if type(e) is cast_func]) dtype = set([e.dtype for e in args if type(e) is CastFunc])
assert len(dtype) == 1 assert len(dtype) == 1
dtype = dtype.pop() dtype = dtype.pop()
args = [cast_func(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e args = [CastFunc(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
for e in args] for e in args]
suffix = "int" suffix = "int"
...@@ -778,26 +791,31 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -778,26 +791,31 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
return processed return processed
def _print_Pow(self, expr): def _print_Pow(self, expr):
result = self._scalarFallback('_print_Pow', expr) # Due to loop cutting sp.Mul is evaluated again.
try:
result = self._scalarFallback('_print_Pow', expr)
except ValueError:
result = None
if result: if result:
return result return result
one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs) one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs)
root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8: if isinstance(expr.exp, CastFunc) and expr.exp.args[0].is_number:
return "(" + self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")" exp = expr.exp.args[0]
elif expr.exp == -1: else:
one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs) exp = expr.exp
return self.instruction_set['/'].format(one, self._print(expr.base), **self._kwargs)
elif expr.exp == 0.5: # TODO the printer should not have any intelligence like this.
return self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs) # TODO To remove all of these cases the vectoriser needs to be reworked. See loop cutting
elif expr.exp == -0.5: if exp.is_integer and exp.is_number and 0 < exp < 8:
root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs) return self._print(sp.Mul(*[expr.base] * exp, evaluate=False))
elif exp == 0.5:
return root
elif exp == -0.5:
return self.instruction_set['/'].format(one, root, **self._kwargs) return self.instruction_set['/'].format(one, root, **self._kwargs)
elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
return self.instruction_set['/'].format(one,
self._print(sp.Mul(*[expr.base] * (-expr.exp), evaluate=False)),
**self._kwargs)
else: else:
raise ValueError("Generic exponential not supported: " + str(expr)) raise ValueError("Generic exponential not supported: " + str(expr))
...@@ -805,7 +823,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -805,7 +823,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
# noinspection PyProtectedMember # noinspection PyProtectedMember
from sympy.core.mul import _keep_coeff from sympy.core.mul import _keep_coeff
result = self._scalarFallback('_print_Mul', expr) if not inside_add:
result = self._scalarFallback('_print_Mul', expr)
else:
result = None
if result: if result:
return result return result
...@@ -880,12 +901,9 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter): ...@@ -880,12 +901,9 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
result = self._print(expr.args[-1][0]) result = self._print(expr.args[-1][0])
for true_expr, condition in reversed(expr.args[:-1]): for true_expr, condition in reversed(expr.args[:-1]):
if isinstance(condition, cast_func) and get_type_of_expression(condition.args[0]) == create_type("bool"): if isinstance(condition, CastFunc) and get_type_of_expression(condition.args[0]) == create_type("bool"):
if not KERNCRAFT_NO_TERNARY_MODE: result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr), result, **self._kwargs)
result, **self._kwargs)
else:
print("Warning - skipping ternary op")
else: else:
# noinspection SpellCheckingInspection # noinspection SpellCheckingInspection
result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition), result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition),
......
from os.path import dirname, join
from pystencils.astnodes import Node from pystencils.astnodes import Node
from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
from pystencils.enums import Backend
from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
from pystencils.interpolation_astnodes import DiffInterpolatorAccess, InterpolationMode
with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
lines = f.readlines()
CUDA_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l}
def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str: def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
...@@ -22,7 +16,7 @@ def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=N ...@@ -22,7 +16,7 @@ def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=N
Returns: Returns:
CUDA code for the ast node and its descendants CUDA code for the ast node and its descendants
""" """
return generate_c(ast_node, signature_only, dialect='cuda', return generate_c(ast_node, signature_only, dialect=Backend.CUDA,
custom_backend=custom_backend, with_globals=with_globals) custom_backend=custom_backend, with_globals=with_globals)
...@@ -33,7 +27,7 @@ class CudaBackend(CBackend): ...@@ -33,7 +27,7 @@ class CudaBackend(CBackend):
if not sympy_printer: if not sympy_printer:
sympy_printer = CudaSympyPrinter() sympy_printer = CudaSympyPrinter()
super().__init__(sympy_printer, signature_only, dialect='cuda') super().__init__(sympy_printer, signature_only, dialect=Backend.CUDA)
def _print_SharedMemoryAllocation(self, node): def _print_SharedMemoryAllocation(self, node):
dtype = node.symbol.dtype dtype = node.symbol.dtype
...@@ -43,26 +37,13 @@ class CudaBackend(CBackend): ...@@ -43,26 +37,13 @@ class CudaBackend(CBackend):
return code return code
@staticmethod @staticmethod
def _print_ThreadBlockSynchronization(node): def _print_ThreadBlockSynchronization(_):
code = "__synchtreads();" return "__synchtreads();"
return code
def _print_TextureDeclaration(self, node): def _print_TextureDeclaration(self, node):
cond = node.texture.field.dtype.numpy_dtype.itemsize > 4
# TODO: use fStrings here return f'texture<{"fp_tex_" if cond else ""}{str(node.texture.field.dtype)}, ' \
if node.texture.field.dtype.numpy_dtype.itemsize > 4: f'cudaTextureType{node.texture.field.spacial_dimensions}D, cudaReadModeElementType> {node.texture};'
code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
str(node.texture.field.dtype),
node.texture.field.spatial_dimensions,
node.texture
)
else:
code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
str(node.texture.field.dtype),
node.texture.field.spatial_dimensions,
node.texture
)
return code
def _print_SkipIteration(self, _): def _print_SkipIteration(self, _):
return "return;" return "return;"
...@@ -73,31 +54,6 @@ class CudaSympyPrinter(CustomSympyPrinter): ...@@ -73,31 +54,6 @@ class CudaSympyPrinter(CustomSympyPrinter):
def __init__(self): def __init__(self):
super(CudaSympyPrinter, self).__init__() super(CudaSympyPrinter, self).__init__()
self.known_functions.update(CUDA_KNOWN_FUNCTIONS)
def _print_InterpolatorAccess(self, node):
dtype = node.interpolator.field.dtype.numpy_dtype
if type(node) == DiffInterpolatorAccess:
# cubicTex3D_1st_derivative_x(texture tex, float3 coord)
template = f"cubicTex%iD_1st_derivative_{list(reversed('xyz'[:node.ndim]))[node.diff_coordinate_idx]}(%s, %s)" # noqa
elif node.interpolator.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
template = "cubicTex%iDSimple(%s, %s)"
else:
if dtype.itemsize > 4:
# Use PyCuda hack!
# https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
template = "fp_tex%iD(%s, %s)"
else:
template = "tex%iD(%s, %s)"
code = template % (
node.interpolator.field.spatial_dimensions,
str(node.interpolator),
# + 0.5 comes from Nvidia's staggered indexing
', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
)
return code
def _print_Function(self, expr): def _print_Function(self, expr):
if isinstance(expr, fast_division): if isinstance(expr, fast_division):
......
import graphviz import graphviz
from graphviz import Digraph, lang try:
from graphviz import Digraph
import graphviz.quoting as quote
except ImportError:
from graphviz import Digraph
import graphviz.lang as quote
from sympy.printing.printer import Printer from sympy.printing.printer import Printer
...@@ -12,7 +17,7 @@ class DotPrinter(Printer): ...@@ -12,7 +17,7 @@ class DotPrinter(Printer):
super(DotPrinter, self).__init__() super(DotPrinter, self).__init__()
self._node_to_str_function = node_to_str_function self._node_to_str_function = node_to_str_function
self.dot = Digraph(**kwargs) self.dot = Digraph(**kwargs)
self.dot.quote_edge = lang.quote self.dot.quote_edge = quote.quote
def _print_KernelFunction(self, func): def _print_KernelFunction(self, func):
self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func)) self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))
......
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, last=''): def get_argument_string(function_shortcut, last=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1] args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "(" arg_string = "("
...@@ -30,14 +33,11 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): ...@@ -30,14 +33,11 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
'sqrt': 'fsqrt_v[0]', 'sqrt': 'fsqrt_v[0]',
'loadU': f'le{bits[data_type]}_v[0]', 'loadU': f'le{bits[data_type]}_v[0]',
'loadA': f'le{bits[data_type]}_v[0]',
'storeU': f'se{bits[data_type]}_v[0, 1]', 'storeU': f'se{bits[data_type]}_v[0, 1]',
'storeA': f'se{bits[data_type]}_v[0, 1]',
'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]', 'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
'maskStoreA': f'se{bits[data_type]}_v[2, 0, 1]',
'loadS': f'lse{bits[data_type]}_v[0, 1]', 'loadS': f'lse{bits[data_type]}_v[0, 1]',
'storeS': f'sse{bits[data_type]}_v[0, 2, 1]', 'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]', 'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
'abs': 'fabs_v[0]', 'abs': 'fabs_v[0]',
'==': 'mfeq_vv[0, 1]', '==': 'mfeq_vv[0, 1]',
...@@ -50,8 +50,8 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): ...@@ -50,8 +50,8 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
'|': 'mor_mm[0, 1]', '|': 'mor_mm[0, 1]',
'blendv': 'merge_vvm[2, 0, 1]', 'blendv': 'merge_vvm[2, 0, 1]',
'any': 'popc_m[0]', 'any': 'cpop_m[0]',
'all': 'popc_m[0]', 'all': 'cpop_m[0]',
} }
result = dict() result = dict()
...@@ -81,7 +81,6 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): ...@@ -81,7 +81,6 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
result[intrinsic_id] = prefix + name + suffix2 + arg_string result[intrinsic_id] = prefix + name + suffix2 + arg_string
from pystencils.backends.cbackend import CFunction
result['width'] = CFunction(width, "int") result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int") result['intwidth'] = CFunction(intwidth, "int")
...@@ -92,7 +91,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): ...@@ -92,7 +91,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}') result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}') result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}') result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})" result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
...@@ -101,9 +100,12 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'): ...@@ -101,9 +100,12 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
result['int'] = f'vint{bits["int"]}m1_t' result['int'] = f'vint{bits["int"]}m1_t'
result['bool'] = f'vbool{bits[data_type]}_t' result['bool'] = f'vbool{bits[data_type]}_t'
result['headers'] = ['<riscv_vector.h>'] result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
result['any'] += ' > 0x0' result['any'] += ' > 0x0'
result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})' result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result return result
import math
import os import os
import platform import platform
from ctypes import CDLL from ctypes import CDLL, c_int, c_size_t, sizeof, byref
from warnings import warn
import numpy as np
from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86 from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
from pystencils.cache import memorycache
from pystencils.typing import numpy_name_to_c
def get_vector_instruction_set(data_type='double', instruction_set='avx'): def get_vector_instruction_set(data_type='double', instruction_set='avx'):
if instruction_set in ['neon'] or instruction_set.startswith('sve'): if data_type == 'float':
return get_vector_instruction_set_arm(data_type, instruction_set) warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
f"releases")
data_type = 'float64'
type_name = numpy_name_to_c(np.dtype(data_type).name)
if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
return get_vector_instruction_set_arm(type_name, instruction_set)
elif instruction_set in ['vsx']: elif instruction_set in ['vsx']:
return get_vector_instruction_set_ppc(data_type, instruction_set) return get_vector_instruction_set_ppc(type_name, instruction_set)
elif instruction_set in ['rvv']: elif instruction_set in ['rvv']:
return get_vector_instruction_set_riscv(data_type, instruction_set) return get_vector_instruction_set_riscv(type_name, instruction_set)
else: else:
return get_vector_instruction_set_x86(data_type, instruction_set) return get_vector_instruction_set_x86(type_name, instruction_set)
_cache = None
_cachelinesize = None
@memorycache
def get_supported_instruction_sets(): def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed.""" """List of supported instruction sets on current hardware, or None if query failed."""
global _cache
if _cache is not None:
return _cache.copy()
if 'PYSTENCILS_SIMD' in os.environ: if 'PYSTENCILS_SIMD' in os.environ:
return os.environ['PYSTENCILS_SIMD'].split(',') return os.environ['PYSTENCILS_SIMD'].split(',')
if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo if platform.system() == 'Darwin' and platform.machine() == 'arm64':
result = ['neon']
libc = CDLL('/usr/lib/libc.dylib')
value = c_int(0)
size = c_size_t(sizeof(value))
status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
if status == 0 and value.value == 1:
result.insert(0, "sme")
return result
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
return ['neon'] return ['neon']
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'): # not supported by cpuinfo elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap2 = libc.getauxval(26) # AT_HWCAP2
if hwcap & (1 << 22): # HWCAP_SVE
if hwcap2 & (1 << 1): # HWCAP2_SVE2
name = 'sve2'
else:
name = 'sve'
length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if length < 0:
raise OSError("SVE length query failed")
while length >= 128:
result.append(f"{name}{length}")
length //= 2
result.append(name)
if hwcap2 & (1 << 23): # HWCAP2_SME
result.insert(0, "sme") # prepend to list so it is not automatically chosen as best instruction set
return result
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
libc = CDLL('libc.so.6') libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else [] return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
import subprocess libc = CDLL('libc.so.6')
import tempfile hwcap = libc.getauxval(16) # AT_HWCAP
from pystencils.cpu.cpujit import get_compiler_config return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX
f = tempfile.NamedTemporaryFile(suffix='.cpp') elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name] try:
macros = subprocess.check_output(command, input='', text=True) from cpuinfo import get_cpu_info
if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros: except ImportError:
_cache = ['vsx'] return None
else:
_cache = []
return _cache.copy()
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = [] result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'} required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'} required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'} required_avx512_flags = {'avx512f'}
required_neon_flags = {'neon'} possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
required_sve_flags = {'sve'} flags = set(get_cpu_info()['flags'])
flags = set(get_cpu_info()['flags']) if flags.issuperset(required_sse_flags):
if flags.issuperset(required_sse_flags): result.append("sse")
result.append("sse") if flags.issuperset(required_avx_flags):
if flags.issuperset(required_avx_flags): result.append("avx")
result.append("avx") if flags.issuperset(required_avx512_flags):
if flags.issuperset(required_avx512_flags): result.append("avx512")
result.append("avx512") if not flags.isdisjoint(possible_avx512vl_flags):
if flags.issuperset(required_neon_flags): result.append("avx512vl")
result.append("neon") return result
if flags.issuperset(required_sve_flags): else:
if platform.system() == 'Linux': raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
libc = CDLL('libc.so.6') (platform.system(), platform.machine()))
native_length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if native_length < 0:
raise OSError("SVE length query failed")
pwr2_length = int(2**math.floor(math.log2(native_length)))
if pwr2_length % 256 == 0:
result.append(f"sve{pwr2_length//2}")
if native_length != pwr2_length:
result.append(f"sve{pwr2_length}")
result.append(f"sve{native_length}")
result.append("sve")
return result
@memorycache
def get_cacheline_size(instruction_set): def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access. """Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size.""" Usually, this is identical to the cache line size."""
global _cachelinesize
instruction_sets = get_vector_instruction_set('double', instruction_set) instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets: if 'cachelineSize' not in instruction_sets:
return None return None
if _cachelinesize is not None:
return _cachelinesize
import pystencils as ps import pystencils as ps
from pystencils.astnodes import SympyAssignment
import numpy as np import numpy as np
from pystencils.cpu.vectorization import CachelineSize from pystencils.cpu.vectorization import CachelineSize
arr = np.zeros((1, 1), dtype=np.float32) arr = np.zeros((1, 1), dtype=np.float32)
f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0) f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
ass = [CachelineSize(), ps.Assignment(f.center, CachelineSize.symbol)] ass = [CachelineSize(), SympyAssignment(f.center, CachelineSize.symbol)]
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set}) ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile() kernel = ast.compile()
kernel(**{f.name: arr, CachelineSize.symbol.name: 0}) kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
_cachelinesize = int(arr[0, 0]) return int(arr[0, 0])
return _cachelinesize
...@@ -51,14 +51,14 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -51,14 +51,14 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
'makeVecConstBool': 'set[]', 'makeVecConstBool': 'set[]',
'makeVecInt': 'set[]', 'makeVecInt': 'set[]',
'makeVecConstInt': 'set[]', 'makeVecConstInt': 'set[]',
'loadU': 'loadu[0]', 'loadU': 'loadu[0]',
'loadA': 'load[0]', 'loadA': 'load[0]',
'storeU': 'storeu[0,1]', 'storeU': 'storeu[0,1]',
'storeA': 'store[0,1]', 'storeA': 'store[0,1]',
'stream': 'stream[0,1]', 'stream': 'stream[0,1]',
'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]', 'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]', 'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
} }
for comparison_op, constant in comparisons.items(): for comparison_op, constant in comparisons.items():
...@@ -66,6 +66,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -66,6 +66,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
headers = { headers = {
'avx512': ['<immintrin.h>'], 'avx512': ['<immintrin.h>'],
'avx512vl': ['<immintrin.h>'],
'avx': ['<immintrin.h>'], 'avx': ['<immintrin.h>'],
'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>', 'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
'<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>'] '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
...@@ -79,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -79,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
prefix = { prefix = {
'sse': '_mm', 'sse': '_mm',
'avx': '_mm256', 'avx': '_mm256',
'avx512vl': '_mm256',
'avx512': '_mm512', 'avx512': '_mm512',
} }
...@@ -89,11 +91,13 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -89,11 +91,13 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
("double", "avx"): 4, ("double", "avx"): 4,
("float", "avx"): 8, ("float", "avx"): 8,
("int", "avx"): 8, ("int", "avx"): 8,
("double", "avx512vl"): 4,
("float", "avx512vl"): 8,
("int", "avx512vl"): 8,
("double", "avx512"): 8, ("double", "avx512"): 8,
("float", "avx512"): 16, ("float", "avx512"): 16,
("int", "avx512"): 16, ("int", "avx512"): 16,
} }
result = { result = {
'width': width[(data_type, instruction_set)], 'width': width[(data_type, instruction_set)],
'intwidth': width[('int', instruction_set)], 'intwidth': width[('int', instruction_set)],
...@@ -111,14 +115,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -111,14 +115,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
suf = suffix[data_type] suf = suffix[data_type]
arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut) arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else '' mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
result['dataTypePrefix'] = {
'double': "_" + pre + 'd',
'float': "_" + pre,
}
bit_width = result['width'] * (64 if data_type == 'double' else 32) bit_width = result['width'] * (64 if data_type == 'double' else 32)
result['double'] = f"__m{bit_width}d" result['double'] = f"__m{bit_width}d"
result['float'] = f"__m{bit_width}" result['float'] = f"__m{bit_width}"
...@@ -129,29 +128,45 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'): ...@@ -129,29 +128,45 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0" result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}" result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
if instruction_set == 'avx512': setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
if instruction_set.startswith('avx512'):
size = result['width'] size = result['width']
result['&'] = f'_kand_mask{size}({{0}}, {{1}})' masksize = max(size, 8)
result['|'] = f'_kor_mask{size}({{0}}, {{1}})' result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
result['any'] = f'!_ktestz_mask{size}_u8({{0}}, {{0}})' result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
result['all'] = f'_kortestc_mask{size}_u8({{0}}, {{0}})' result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})' result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})" result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
result['abs'] = f"{pre}_abs_{suf}({{0}})" result['bool'] = f"__mmask{masksize}"
result['bool'] = f"__mmask{size}"
params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)]) params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
result['makeVecBool'] = f"__mmask8(({params}) )" result['makeVecBool'] = f"__mmask8(({params}) )"
params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)]) params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
result['makeVecConstBool'] = f"__mmask8(({params}) )" result['makeVecConstBool'] = f"__mmask8(({params}) )"
vindex = f'{pre}_set_epi{bit_width//size}(' + ', '.join([str(i) for i in range(result['width'])][::-1]) + ')' vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}({{0}}))' ', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
scale = bit_width // size // 8
result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \ result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
f', {{1}}, {64//size})' f', {{1}}, {scale})'
result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \ result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
f', {{1}}, {64//size})' f', {{1}}, {scale})'
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {64//size})' if bit_width == 512:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
else:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
# abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
if instruction_set == 'avx512':
result['abs'] = f"{pre}_abs_{suf}({{0}})"
else:
result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
f"{pre}_cast{suf}_si{bit_width}({{0}})))"
if instruction_set == 'avx' and data_type == 'float': if instruction_set == 'avx' and data_type == 'float':
result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})" result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"
......
import sympy as sp
# from pystencils.typing import get_type_of_expression
# noinspection PyPep8Naming
class flag_cond(sp.Function):
"""Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
depending on whether the flag is set.
Three argument version:
```
flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
```
Four argument version:
```
flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
```
"""
nargs = (3, 4)
def __new__(cls, flag_bit, mask_expression, *expressions):
# TODO Jan reintroduce checking
# flag_dtype = get_type_of_expression(flag_bit)
# if not flag_dtype.is_int():
# raise ValueError('Argument flag_bit must be of integer type.')
#
# mask_dtype = get_type_of_expression(mask_expression)
# if not mask_dtype.is_int():
# raise ValueError('Argument mask_expression must be of integer type.')
return super().__new__(cls, flag_bit, mask_expression, *expressions)
def to_c(self, print_func):
flag_bit = self.args[0]
mask = self.args[1]
then_expression = self.args[2]
flag_bit_code = print_func(flag_bit)
mask_code = print_func(mask)
then_code = print_func(then_expression)
code = f"(({mask_code}) >> ({flag_bit_code}) & 1) * ({then_code})"
if len(self.args) > 3:
else_expression = self.args[3]
else_code = print_func(else_expression)
code += f" + (({mask_code}) >> ({flag_bit_code}) ^ 1) * ({else_code})"
return code
from typing import Any, List, Tuple from typing import Any, List, Tuple
from pystencils import Assignment from pystencils.astnodes import SympyAssignment
from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
from pystencils.data_types import create_type from pystencils.typing import create_type
class Boundary: class Boundary:
...@@ -14,7 +14,7 @@ class Boundary: ...@@ -14,7 +14,7 @@ class Boundary:
def __init__(self, name=None): def __init__(self, name=None):
self._name = name self._name = name
def __call__(self, field, direction_symbol, index_field) -> List[Assignment]: def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
"""Defines the boundary behavior and must therefore be implemented by all boundaries. """Defines the boundary behavior and must therefore be implemented by all boundaries.
Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated. Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
...@@ -63,20 +63,20 @@ class Neumann(Boundary): ...@@ -63,20 +63,20 @@ class Neumann(Boundary):
neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions) neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field.center, field[neighbor])] return [SympyAssignment(field.center, field[neighbor])]
else: else:
from itertools import product from itertools import product
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Neumann boundary works only for fields with fixed index shape") raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
index_iter = product(*(range(i) for i in field.index_shape)) index_iter = product(*(range(i) for i in field.index_shape))
return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter] return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
def __hash__(self): def __hash__(self):
# All boundaries of these class behave equal -> should also be equal # All boundaries of these class behave equal -> should also be equal
return hash("Neumann") return hash("Neumann")
def __eq__(self, other): def __eq__(self, other):
return type(other) == Neumann return type(other) is Neumann
class Dirichlet(Boundary): class Dirichlet(Boundary):
...@@ -103,11 +103,11 @@ class Dirichlet(Boundary): ...@@ -103,11 +103,11 @@ class Dirichlet(Boundary):
def __call__(self, field, direction_symbol, index_field, **kwargs): def __call__(self, field, direction_symbol, index_field, **kwargs):
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field.center, index_field("value") if self.additional_data else self._value)] return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
elif field.index_dimensions == 1: elif field.index_dimensions == 1:
assert not self.additional_data assert not self.additional_data
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Field needs fixed index shape") raise NotImplementedError("Field needs fixed index shape")
assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field" assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])] return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension") raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
from functools import lru_cache
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils import create_indexed_kernel from pystencils import create_kernel, CreateKernelConfig, Target
from pystencils.assignment import Assignment from pystencils.astnodes import SympyAssignment
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
from pystencils.boundaries.createindexlist import ( from pystencils.boundaries.createindexlist import (
create_boundary_index_array, numpy_data_type_for_boundary_object) create_boundary_index_array, numpy_data_type_for_boundary_object)
from pystencils.cache import memorycache from pystencils.typing import TypedSymbol, create_type
from pystencils.data_types import TypedSymbol, create_type from pystencils.gpu.gpu_array_handler import GPUArrayHandler
from pystencils.datahandling.pycuda import PyCudaArrayHandler
from pystencils.field import Field from pystencils.field import Field
from pystencils.kernelparameters import FieldPointerSymbol from pystencils.typing.typed_sympy import FieldPointerSymbol
try: try:
# noinspection PyPep8Naming # noinspection PyPep8Naming
import waLBerla as wlb import waLBerla as wlb
if wlb.cpp_available: if wlb.cpp_available:
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
import cupy.cuda.runtime
else: else:
ParallelDataHandling = None ParallelDataHandling = None
except ImportError: except ImportError:
...@@ -33,11 +35,11 @@ class FlagInterface: ...@@ -33,11 +35,11 @@ class FlagInterface:
>>> dh = create_data_handling((4, 5)) >>> dh = create_data_handling((4, 5))
>>> fi = FlagInterface(dh, 'flag_field', np.uint8) >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
>>> assert dh.has_data('flag_field') >>> assert dh.has_data('flag_field')
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
2 2
>>> fi.reserve_flag(4) >>> int(fi.reserve_flag(4))
4 4
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
8 8
""" """
...@@ -84,7 +86,7 @@ class FlagInterface: ...@@ -84,7 +86,7 @@ class FlagInterface:
class BoundaryHandling: class BoundaryHandling:
def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None, def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
target='cpu', openmp=True): target: Target = Target.CPU, openmp=True):
assert data_handling.has_data(field_name) assert data_handling.has_data(field_name)
assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match" assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
self._data_handling = data_handling self._data_handling = data_handling
...@@ -99,7 +101,7 @@ class BoundaryHandling: ...@@ -99,7 +101,7 @@ class BoundaryHandling:
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags") self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling): if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
array_handler = PyCudaArrayHandler() array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
else: else:
array_handler = self.data_handling.array_handler array_handler = self.data_handling.array_handler
...@@ -115,7 +117,8 @@ class BoundaryHandling: ...@@ -115,7 +117,8 @@ class BoundaryHandling:
for obj, cpu_arr in cpu_version.items(): for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape: if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = array_handler.to_gpu(cpu_arr) gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
array_handler.upload(gpu_version[obj], cpu_arr)
else: else:
array_handler.upload(gpu_version[obj], cpu_arr) array_handler.upload(gpu_version[obj], cpu_arr)
...@@ -378,15 +381,15 @@ class BoundaryDataSetter: ...@@ -378,15 +381,15 @@ class BoundaryDataSetter:
assert coord < self.dim assert coord < self.dim
return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5 return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
@memorycache() @lru_cache()
def link_offsets(self): def link_offsets(self):
return self.stencil[self.index_array['dir']] return self.stencil[self.index_array['dir']]
@memorycache() @lru_cache()
def link_positions(self, coord): def link_positions(self, coord):
return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
@memorycache() @lru_cache()
def boundary_cell_positions(self, coord): def boundary_cell_positions(self, coord):
return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
...@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode): ...@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
code = "\n" code = "\n"
for i in range(dim): for i in range(dim):
offset_str = ", ".join([str(d[i]) for d in stencil]) offset_str = ", ".join([str(d[i]) for d in stencil])
code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str) code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
inv_dirs = [] inv_dirs = []
for direction in stencil: for direction in stencil:
inverse_dir = tuple([-i for i in direction]) inverse_dir = tuple([-i for i in direction])
inv_dirs.append(str(stencil.index(inverse_dir))) inv_dirs.append(str(stencil.index(inverse_dir)))
code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs)) code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
offset_symbols = BoundaryOffsetInfo._offset_symbols(dim) offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(), super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL])) symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
@staticmethod @staticmethod
def _offset_symbols(dim): def _offset_symbols(dim):
return [TypedSymbol(f"c{d}", create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]] return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
INV_DIR_SYMBOL = TypedSymbol("invdir", "int") INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', **kernel_creation_args): def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
elements = [BoundaryOffsetInfo(stencil)] elements = [BoundaryOffsetInfo(stencil)]
index_arr_dtype = index_field.dtype.numpy_dtype dir_symbol = TypedSymbol("dir", np.int32)
dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0]) elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
elements += [Assignment(dir_symbol, index_field[0]('dir'))]
elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field) elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
return create_indexed_kernel(elements, [index_field], target=target, **kernel_creation_args) config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
**kernel_creation_args)
return create_kernel(elements, config=config)
import itertools
import warnings import warnings
import numpy as np import numpy as np
try: try:
# Try to import right away - assume compiled code is available import pyximport
# compile with: python setup.py build_ext --inplace --use-cython
from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
pyximport.install(language_level=3)
cython_funcs_available = True cython_funcs_available = True
except ImportError: except ImportError:
try: cython_funcs_available = False
# If not, try development mode and import via pyximport
import pyximport if cython_funcs_available:
from pystencils.boundaries.createindexlistcython import (
pyximport.install(language_level=3) create_boundary_neighbor_index_list_2d,
cython_funcs_available = True create_boundary_neighbor_index_list_3d,
except ImportError: create_boundary_cell_index_list_2d,
cython_funcs_available = False create_boundary_cell_index_list_3d,
if cython_funcs_available: )
from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, \
create_boundary_cell_index_list_3d
boundary_index_array_coordinate_names = ["x", "y", "z"] boundary_index_array_coordinate_names = ["x", "y", "z"]
direction_member_name = "dir" direction_member_name = "dir"
default_index_array_dtype = np.int32
def numpy_data_type_for_boundary_object(boundary_object, dim): def numpy_data_type_for_boundary_object(boundary_object, dim):
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
return np.dtype([(name, np.int32) for name in coordinate_names] return np.dtype(
+ [(direction_member_name, np.int32)] [(name, default_index_array_dtype) for name in coordinate_names]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True) + [(direction_member_name, default_index_array_dtype)]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
align=True,
def _create_boundary_neighbor_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask, )
fluid_mask, stencil, single_link):
coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)]) def _create_index_list_python(
flag_field_arr,
result = [] boundary_mask,
gl = nr_of_ghost_layers fluid_mask,
for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])): stencil,
cell = cell[::-1] single_link,
if not flag_field_arr[cell] & fluid_mask: inner_or_boundary=False,
continue nr_of_ghost_layers=None,
):
if inner_or_boundary and nr_of_ghost_layers is None:
raise ValueError(
"If inner_or_boundary is set True the number of ghost layers "
"around the inner domain has to be specified"
)
if nr_of_ghost_layers is None:
nr_of_ghost_layers = 0
coordinate_names = boundary_index_array_coordinate_names[
: len(flag_field_arr.shape)
]
index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
+ [(direction_member_name, default_index_array_dtype)]
)
# boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
# have to be sorted.
boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
for i in range(len(flag_field_arr.shape)):
boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
# First a set is created to save all fluid cells which are near boundary
fluid_cells = set()
for cell in boundary_cells:
cell = tuple(cell)
for dir_idx, direction in enumerate(stencil): for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]) neighbor_cell = tuple(
if flag_field_arr[neighbor_cell] & boundary_mask: [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
result.append(cell + (dir_idx,)) )
if single_link: # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
break if any(
not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
return np.array(result, dtype=index_arr_dtype) for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue
if flag_field_arr[neighbor_cell] & fluid_mask:
fluid_cells.add(neighbor_cell)
# then this is set is transformed to a list to make it sortable. This ensures continoous memory access later.
fluid_cells = list(fluid_cells)
if len(flag_field_arr.shape) == 3:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
else:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
def _create_boundary_cell_index_list_python(flag_field_arr, boundary_mask, cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
fluid_mask, stencil, single_link): checkmask = boundary_mask if inner_or_boundary else fluid_mask
coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
result = [] result = []
for cell in itertools.product(*reversed([range(0, i) for i in flag_field_arr.shape])): for cell in cells_to_iterate:
cell = cell[::-1] cell = tuple(cell)
if not flag_field_arr[cell] & boundary_mask: sum_cells = np.zeros(len(cell))
continue
for dir_idx, direction in enumerate(stencil): for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]) neighbor_cell = tuple(
if any(not 0 <= e < upper for e, upper in zip(neighbor_cell, flag_field_arr.shape)): [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
)
# prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
if any(
not 0 <= e < upper
for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue continue
if flag_field_arr[neighbor_cell] & fluid_mask: if flag_field_arr[neighbor_cell] & checkmask:
result.append(cell + (dir_idx,))
if single_link: if single_link:
break sum_cells += np.array(direction)
else:
result.append(tuple(cell) + (dir_idx,))
# the discrete normal direction is the one which gives the maximum inner product to the stencil direction
if single_link and any(sum_cells != 0):
idx = np.argmax(np.inner(sum_cells, stencil))
result.append(tuple(cell) + (idx,))
return np.array(result, dtype=index_arr_dtype) return np.array(result, dtype=index_arr_dtype)
def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, def create_boundary_index_list(
nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False): flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
"""Creates a numpy array storing links (connections) between domain cells and boundary cells. """Creates a numpy array storing links (connections) between domain cells and boundary cells.
Args: Args:
...@@ -91,15 +141,25 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, ...@@ -91,15 +141,25 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
nr_of_ghost_layers: only relevant if neighbors is True nr_of_ghost_layers: only relevant if neighbors is True
inner_or_boundary: if true, the result contains the cell coordinates of the domain cells - inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
if false the boundary cells are listed if false the boundary cells are listed
single_link: if true only the first link is reported from this cell single_link: if true only the link in normal direction to this cell is reported
""" """
dim = len(flag_field.shape) dim = len(flag_field.shape)
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)]) index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
stencil = np.array(stencil, dtype=np.int32) + [(direction_member_name, default_index_array_dtype)]
args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link) )
stencil = np.array(stencil, dtype=default_index_array_dtype)
args = (
flag_field,
nr_of_ghost_layers,
boundary_mask,
fluid_mask,
stencil,
single_link,
)
args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link) args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
if cython_funcs_available: if cython_funcs_available:
...@@ -118,24 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, ...@@ -118,24 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
return np.array(idx_list, dtype=index_arr_dtype) return np.array(idx_list, dtype=index_arr_dtype)
else: else:
if flag_field.size > 1e6: if flag_field.size > 1e6:
warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up") warnings.warn(
if inner_or_boundary: "Boundary setup may take very long! Consider installing cython to speed it up"
return _create_boundary_neighbor_index_list_python(*args) )
else: return _create_index_list_python(
return _create_boundary_cell_index_list_python(*args_no_gl) *args_no_gl,
inner_or_boundary=inner_or_boundary,
nr_of_ghost_layers=nr_of_ghost_layers,
def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object, )
nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
nr_of_ghost_layers, inner_or_boundary, single_link) def create_boundary_index_array(
flag_field,
stencil,
boundary_mask,
fluid_mask,
boundary_object,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
idx_array = create_boundary_index_list(
flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers,
inner_or_boundary,
single_link,
)
dim = len(flag_field.shape) dim = len(flag_field.shape)
if boundary_object.additional_data: if boundary_object.additional_data:
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim) index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype) extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
for prop in coordinate_names + ['dir']: for prop in coordinate_names + ["dir"]:
extended_idx_field[prop] = idx_array[prop] extended_idx_field[prop] = idx_array[prop]
idx_array = extended_idx_field idx_array = extended_idx_field
......
# distutils: language=c # cython: language_level=3str
# Workaround for cython bug
# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
WORKAROUND = "Something"
import cython import cython
...@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel ...@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel
cdef int xs, ys, x, y cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape xs, ys = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers): for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers): for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & fluid_mask: if flag_field[x, y] & fluid_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
dy = stencil[dirIdx,1]
if flag_field[x + dx, y + dy] & boundary_mask: if flag_field[x + dx, y + dy] & boundary_mask:
boundary_index_list.append((x,y, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy;
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
cdef int xs, ys, zs, x, y, z cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape xs, ys, zs = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
...@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers): for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers): for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers): for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & fluid_mask: if flag_field[x, y, z] & fluid_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
dy = stencil[dirIdx,1]
dz = stencil[dirIdx,2]
if flag_field[x + dx, y + dy, z + dz] & boundary_mask: if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
boundary_index_list.append((x,y,z, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx * sum_x + dy * sum_y + dz * sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field, ...@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
cdef int xs, ys, x, y cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape xs, ys = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
for y in range(0, ys): for y in range(0, ys):
for x in range(0, xs): for x in range(0, xs):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & boundary_mask: if flag_field[x, y] & boundary_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
dy = stencil[dirIdx,1]
if 0 <= x + dx < xs and 0 <= y + dy < ys: if 0 <= x + dx < xs and 0 <= y + dy < ys:
if flag_field[x + dx, y + dy] & fluid_mask: if flag_field[x + dx, y + dy] & fluid_mask:
boundary_index_list.append((x,y, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field, ...@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
cdef int xs, ys, zs, x, y, z cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape xs, ys, zs = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
...@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field, ...@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
for z in range(0, zs): for z in range(0, zs):
for y in range(0, ys): for y in range(0, ys):
for x in range(0, xs): for x in range(0, xs):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & boundary_mask: if flag_field[x, y, z] & boundary_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dy = stencil[dirIdx,1]
dz = stencil[dirIdx,2]
if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs: if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
if flag_field[x + dx, y + dy, z + dz] & fluid_mask: if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
boundary_index_list.append((x,y,z, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx=0
if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx*sum_x + dy*sum_y + dz*sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list return boundary_index_list
\ No newline at end of file
import sympy as sp import sympy as sp
from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
from pystencils.data_types import TypedSymbol, create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.field import Field from pystencils.field import Field
from pystencils.integer_functions import bitwise_and from pystencils.integer_functions import bitwise_and
......
import os
from collections.abc import Hashable
from functools import partial, wraps
from itertools import chain
from functools import lru_cache as memorycache
from joblib import Memory
from appdirs import user_cache_dir
if 'PYSTENCILS_CACHE_DIR' in os.environ:
cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
else:
cache_dir = user_cache_dir('pystencils')
disk_cache = Memory(cache_dir, verbose=False).cache
disk_cache_no_fallback = disk_cache
def _wrapper(wrapped_func, cached_func, *args, **kwargs):
if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
return cached_func(*args, **kwargs)
else:
return wrapped_func(*args, **kwargs)
def memorycache_if_hashable(maxsize=128, typed=False):
def wrapper(func):
return partial(_wrapper, func, memorycache(maxsize, typed)(func))
return wrapper
def sharedmethodcache(cache_id: str):
"""Decorator for memoization of instance methods, allowing multiple methods to use the same cache.
This decorator caches results of instance methods per instantiated object of the surrounding class.
It allows multiple methods to use the same cache, by passing them the same `cache_id` string.
Cached values are stored in a dictionary, which is added as a member `self.<cache_id>` to the
`self` object instance. Make sure that this doesn't cause any naming conflicts with other members!
Of course, for this to be useful, said methods must have the same signature (up to additional kwargs)
and must return the same result when called with the same arguments."""
def _decorator(user_method):
@wraps(user_method)
def _decorated_func(self, *args, **kwargs):
objdict = self.__dict__
cache = objdict.setdefault(cache_id, dict())
key = args
for item in kwargs.items():
key += item
if key not in cache:
result = user_method(self, *args, **kwargs)
cache[key] = result
return result
else:
return cache[key]
return _decorated_func
return _decorator
def clear_cache():
"""
Clears the pystencils cache created by joblib.
"""
memory = Memory(cache_dir, verbose=0)
memory.clear(warn=False)
# Disable memory cache:
# disk_cache = lambda o: o
# disk_cache_no_fallback = lambda o: o
from copy import copy
from collections import defaultdict
from dataclasses import dataclass, field
from types import MappingProxyType
from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable
from pystencils import Target, Backend, Field
from pystencils.typing.typed_sympy import BasicType
from pystencils.typing.utilities import collate_types
import numpy as np
# TODO: There exists DTypeLike in NumPy which would be better than type for type hinting, to new at the moment
# from numpy.typing import DTypeLike
# TODO: CreateKernelConfig is bloated think of more classes better usage, factory whatever ...
# Proposition: CreateKernelConfigs Classes for different targets?
@dataclass
class CreateKernelConfig:
"""
**Below all parameters for the CreateKernelConfig are explained**
"""
target: Target = Target.CPU
"""
All targets are defined in :class:`pystencils.enums.Target`
"""
backend: Backend = None
"""
All backends are defined in :class:`pystencils.enums.Backend`
"""
function_name: str = 'kernel'
"""
Name of the generated function - only important if generated code is written out
"""
data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = np.float64
"""
Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type.
If specified as a dict ideally a defaultdict is used to define a default value for symbols not listed in the
dict. If a plain dict is provided it will be transformed into a defaultdict internally. The default value
will then be specified via type collation then.
"""
default_number_float: Union[type, str, BasicType] = None
"""
Data type used for all untyped floating point numbers (i.e. 0.5). By default the value of data_type is used.
If data_type is given as a defaultdict its default_factory is used.
"""
default_number_int: Union[type, str, BasicType] = np.int64
"""
Data type used for all untyped integer numbers (i.e. 1)
"""
iteration_slice: Tuple = None
"""
Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
"""
ghost_layers: Union[bool, int, List[Tuple[int]]] = None
"""
A single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
If left to default, the number of ghost layers is determined automatically from the assignments.
"""
cpu_openmp: Union[bool, int] = False
"""
`True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
of available threads will be chosen.
"""
cpu_vectorize_info: Dict = None
"""
A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
"""
cpu_blocking: Tuple[int] = None
"""
A tuple of block sizes or `None` if no blocking should be applied
"""
omp_single_loop: bool = True
"""
If OpenMP is active: whether multiple outer loops are permitted
"""
base_pointer_specification: Union[List[Iterable[str]], List[Iterable[int]]] = None
"""
Specification of how many and which intermediate pointers are created for a field access.
For example [ (0), (2,3,)] creates on base pointer for coordinates 2 and 3 and writes the offset for coordinate
zero directly in the field access. These specifications are defined dependent on the loop ordering.
This function translates more readable version into the specification above.
For more information see: `pystencils.transformations.create_intermediate_base_pointer`
"""
gpu_indexing: str = 'block'
"""
Either 'block' or 'line' , or custom indexing class, see `pystencils.gpu.AbstractIndexing`
"""
gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
"""
Dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
"""
# TODO Markus rework this docstring
default_assignment_simplifications: bool = False
"""
If `True` default simplifications are first performed on the Assignments. If problems occur during the
simplification a warning will be thrown.
Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
on the level of the `pystencils.AssignmentCollection`. In this part,
`pystencil.simp.create_simplification_strategy` from pystencils.simplificationfactory will be used to
apply optimisations like insertion of constants to
remove pressure from the registers. Thus the first part of the optimisations can only be executed if
an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
"""
cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
"""
List of extra optimizations to perform first on the AST.
"""
use_auto_for_assignments: bool = False
"""
If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
"""
index_fields: List[Field] = None
"""
List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
instead of `create_domain_kernel` is used.
"""
coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
"""
Name of the coordinate fields in the struct data type.
"""
allow_double_writes: bool = False
"""
If True, don't check if every field is only written at a single location. This is required
for example for kernels that are compiled with loop step sizes > 1, that handle multiple
cells at once. Use with care!
"""
skip_independence_check: bool = False
"""
By default the assignment list is checked for read/write independence. This means fields are only written at
locations where they are read. Doing so guarantees thread safety. In some cases e.g. for
periodicity kernel, this can not be assured and does the check needs to be deactivated. Use with care!
"""
class DataTypeFactory:
"""Because of pickle, we need to have a nested class, instead of a lambda in __post_init__"""
def __init__(self, dt):
self.dt = dt
def __call__(self):
return BasicType(self.dt)
def _check_type(self, dtype_to_check):
if isinstance(dtype_to_check, str) and (dtype_to_check == 'float' or dtype_to_check == 'int'):
self._typing_error()
if isinstance(dtype_to_check, type) and not hasattr(dtype_to_check, "dtype"):
# NumPy-types are also of type 'type'. However, they have more properties
self._typing_error()
@staticmethod
def _typing_error():
raise ValueError("It is not possible to use python types (float, int) for datatypes because these "
"types are ambiguous. For example float will map to double. "
"Also the string version like 'float' is not allowed, e.g. use 'float64' instead")
def __post_init__(self):
# ---- Legacy parameters
if not isinstance(self.target, Target):
raise ValueError("target must be provided by the 'Target' enum")
# ---- Auto Backend
if not self.backend:
if self.target == Target.CPU:
self.backend = Backend.C
elif self.target == Target.GPU:
self.backend = Backend.CUDA
else:
raise NotImplementedError(f'Target {self.target} has no default backend')
if not isinstance(self.backend, Backend):
raise ValueError("backend must be provided by the 'Backend' enum")
# Normalise data types
for dtype in [self.data_type, self.default_number_float, self.default_number_int]:
self._check_type(dtype)
if not isinstance(self.data_type, dict):
dt = copy(self.data_type) # The copy is necessary because BasicType has sympy shinanigans
self.data_type = defaultdict(self.DataTypeFactory(dt))
if isinstance(self.data_type, dict) and not isinstance(self.data_type, defaultdict):
for dtype in self.data_type.values():
self._check_type(dtype)
dt = collate_types([BasicType(dtype) for dtype in self.data_type.values()])
dtype_dict = self.data_type
self.data_type = defaultdict(self.DataTypeFactory(dt), dtype_dict)
assert isinstance(self.data_type, defaultdict), "At this point data_type must be a defaultdict!"
for dtype in self.data_type.values():
self._check_type(dtype)
self._check_type(self.data_type.default_factory())
if self.default_number_float is None:
self.default_number_float = self.data_type.default_factory()
if not isinstance(self.default_number_float, BasicType):
self.default_number_float = BasicType(self.default_number_float)
if not isinstance(self.default_number_int, BasicType):
self.default_number_int = BasicType(self.default_number_int)
from pystencils.cpu.cpujit import make_python_function from pystencils.cpu.cpujit import make_python_function
from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas
__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'make_python_function'] __all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
...@@ -13,7 +13,7 @@ in a configuration file. ...@@ -13,7 +13,7 @@ in a configuration file.
3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or 3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
``%HOMEPATH%\.pystencils\config.json`` (Windows) ``%HOMEPATH%\.pystencils\config.json`` (Windows)
If no configuration file is found, a default configuration is created at the above mentioned location in your home. If no configuration file is found, a default configuration is created at the above-mentioned location in your home.
So run *pystencils* once, then edit the created configuration file. So run *pystencils* once, then edit the created configuration file.
...@@ -23,7 +23,7 @@ Compiler Config (Linux) ...@@ -23,7 +23,7 @@ Compiler Config (Linux)
- **'os'**: should be detected automatically as 'linux' - **'os'**: should be detected automatically as 'linux'
- **'command'**: path to C++ compiler (defaults to 'g++') - **'command'**: path to C++ compiler (defaults to 'g++')
- **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler - **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the restrict qualifier is not standardized accross compilers. - **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
For most Linux compilers the qualifier is ``__restrict__`` For most Linux compilers the qualifier is ``__restrict__``
...@@ -39,31 +39,37 @@ Then 'cl.exe' is used to compile. ...@@ -39,31 +39,37 @@ Then 'cl.exe' is used to compile.
where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat' where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat'
- **'arch'**: 'x86' or 'x64' - **'arch'**: 'x86' or 'x64'
- **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated - **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the restrict qualifier is not standardized across compilers. - **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
For Windows compilers the qualifier should be ``__restrict`` For Windows compilers the qualifier should be ``__restrict``
""" """
from appdirs import user_cache_dir, user_config_dir
from collections import OrderedDict
import hashlib import hashlib
import importlib.util
import json import json
import os import os
import platform import platform
import shutil import shutil
import subprocess import subprocess
import sysconfig
import tempfile
import textwrap import textwrap
from collections import OrderedDict import time
from sysconfig import get_paths import warnings
from tempfile import TemporaryDirectory, NamedTemporaryFile import pathlib
import numpy as np import numpy as np
from appdirs import user_cache_dir, user_config_dir
from pystencils import FieldType from pystencils import FieldType
from pystencils.astnodes import LoopOverCoordinate from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import generate_c, get_headers, CFunction from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.data_types import cast_func, VectorType, vector_memory_access from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
from pystencils.cpu.msvc_detection import get_environment
from pystencils.include import get_pystencils_include_path from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper from pystencils.kernel_wrapper import KernelWrapper
from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
from pystencils.utils import atomic_file_write, recursive_dict_update
def make_python_function(kernel_function_node, custom_backend=None): def make_python_function(kernel_function_node, custom_backend=None):
...@@ -75,6 +81,7 @@ def make_python_function(kernel_function_node, custom_backend=None): ...@@ -75,6 +81,7 @@ def make_python_function(kernel_function_node, custom_backend=None):
- all symbols which are not defined in the kernel itself are expected as parameters - all symbols which are not defined in the kernel itself are expected as parameters
:param kernel_function_node: the abstract syntax tree :param kernel_function_node: the abstract syntax tree
:param custom_backend: use own custom printer for code generation
:return: kernel functor :return: kernel functor
""" """
result = compile_and_load(kernel_function_node, custom_backend) result = compile_and_load(kernel_function_node, custom_backend)
...@@ -117,15 +124,15 @@ def get_configuration_file_path(): ...@@ -117,15 +124,15 @@ def get_configuration_file_path():
# 1) Read path from environment variable if found # 1) Read path from environment variable if found
if 'PYSTENCILS_CONFIG' in os.environ: if 'PYSTENCILS_CONFIG' in os.environ:
return os.environ['PYSTENCILS_CONFIG'], True return os.environ['PYSTENCILS_CONFIG']
# 2) Look in current directory for pystencils.json # 2) Look in current directory for pystencils.json
elif os.path.exists("pystencils.json"): elif os.path.exists("pystencils.json"):
return "pystencils.json", True return "pystencils.json"
# 3) Try ~/.pystencils.json # 3) Try ~/.pystencils.json
elif os.path.exists(config_path_in_home): elif os.path.exists(config_path_in_home):
return config_path_in_home, True return config_path_in_home
else: else:
return config_path_in_home, False return config_path_in_home
def create_folder(path, is_file): def create_folder(path, is_file):
...@@ -137,52 +144,50 @@ def create_folder(path, is_file): ...@@ -137,52 +144,50 @@ def create_folder(path, is_file):
pass pass
def get_llc_command():
"""Try to get executable for llvm's IR compiler llc
We try if one of the following is in PATH: llc, llc-10, llc-9, llc-8, llc-7, llc-6
"""
candidates = ['llc', 'llc-10', 'llc-9', 'llc-8', 'llc-7', 'llc-6']
found_executables = (e for e in candidates if shutil.which(e))
return next(found_executables, None)
def read_config(): def read_config():
if platform.system().lower() == 'linux': if platform.system().lower() == 'linux':
default_compiler_config = OrderedDict([ default_compiler_config = OrderedDict([
('os', 'linux'), ('os', 'linux'),
('command', 'g++'), ('command', 'g++'),
('llc_command', get_llc_command() or 'llc'),
('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'), ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__') ('restrict_qualifier', '__restrict__')
]) ])
if platform.machine().startswith('ppc64'): if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native', default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
'-mcpu=native') '-mcpu=native')
elif platform.system().lower() == 'windows': elif platform.system().lower() == 'windows':
default_compiler_config = OrderedDict([ default_compiler_config = OrderedDict([
('os', 'windows'), ('os', 'windows'),
('msvc_version', 'latest'), ('msvc_version', 'latest'),
('llc_command', get_llc_command() or 'llc'),
('arch', 'x64'), ('arch', 'x64'),
('flags', '/Ox /fp:fast /OpenMP /arch:avx'), ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
('restrict_qualifier', '__restrict') ('restrict_qualifier', '__restrict')
]) ])
if platform.machine() == 'ARM64':
default_compiler_config['arch'] = 'ARM64'
default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
elif platform.system().lower() == 'darwin': elif platform.system().lower() == 'darwin':
default_compiler_config = OrderedDict([ default_compiler_config = OrderedDict([
('os', 'darwin'), ('os', 'darwin'),
('command', 'clang++'), ('command', 'clang++'),
('llc_command', get_llc_command() or 'llc'),
('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'), ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__') ('restrict_qualifier', '__restrict__')
]) ])
if platform.machine() == 'arm64': if platform.machine() == 'arm64':
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', '') if 'sme' in get_supported_instruction_sets():
flag = '-march=armv8.7-a+sme '
else:
flag = ''
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib', for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
'/opt/homebrew/lib/libomp.dylib']: '/opt/homebrew/lib/libomp.dylib']:
if os.path.exists(libomp): if os.path.exists(libomp):
default_compiler_config['flags'] += ' ' + libomp default_compiler_config['flags'] += ' ' + libomp
break break
else:
raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
(platform.system(),))
default_cache_config = OrderedDict([ default_cache_config = OrderedDict([
('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')), ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
('clear_cache_on_start', False), ('clear_cache_on_start', False),
...@@ -191,42 +196,47 @@ def read_config(): ...@@ -191,42 +196,47 @@ def read_config():
default_config = OrderedDict([('compiler', default_compiler_config), default_config = OrderedDict([('compiler', default_compiler_config),
('cache', default_cache_config)]) ('cache', default_cache_config)])
config_path, config_exists = get_configuration_file_path() from fasteners import InterProcessLock
config_path = pathlib.Path(get_configuration_file_path())
config_path.parent.mkdir(parents=True, exist_ok=True)
config = default_config.copy() config = default_config.copy()
if config_exists:
with open(config_path, 'r') as json_config_file: lockfile = config_path.with_suffix(config_path.suffix + ".lock")
loaded_config = json.load(json_config_file) with InterProcessLock(lockfile):
config = recursive_dict_update(config, loaded_config) if config_path.exists():
else: with open(config_path, 'r') as json_config_file:
create_folder(config_path, True) loaded_config = json.load(json_config_file)
with open(config_path, 'w') as f: config = recursive_dict_update(config, loaded_config)
json.dump(config, f, indent=4) else:
with open(config_path, 'w') as f:
json.dump(config, f, indent=4)
if config['cache']['object_cache'] is not False: if config['cache']['object_cache'] is not False:
config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid()) config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())
clear_cache = False clear_cache_on_start = False
cache_status_file = os.path.join(config['cache']['object_cache'], 'last_config.json') cache_status_file = os.path.join(config['cache']['object_cache'], 'last_config.json')
if os.path.exists(cache_status_file): if os.path.exists(cache_status_file):
# check if compiler config has changed # check if compiler config has changed
last_config = json.load(open(cache_status_file, 'r')) last_config = json.load(open(cache_status_file, 'r'))
if set(last_config.items()) != set(config['compiler'].items()): if set(last_config.items()) != set(config['compiler'].items()):
clear_cache = True clear_cache_on_start = True
else: else:
for key in last_config.keys(): for key in last_config.keys():
if last_config[key] != config['compiler'][key]: if last_config[key] != config['compiler'][key]:
clear_cache = True clear_cache_on_start = True
if config['cache']['clear_cache_on_start'] or clear_cache: if config['cache']['clear_cache_on_start'] or clear_cache_on_start:
shutil.rmtree(config['cache']['object_cache'], ignore_errors=True) shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
create_folder(config['cache']['object_cache'], False) create_folder(config['cache']['object_cache'], False)
with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f: with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
json.dump(config['compiler'], f, indent=4) json.dump(config['compiler'], f, indent=4)
os.replace(f.name, cache_status_file) os.replace(f.name, cache_status_file)
if config['compiler']['os'] == 'windows': if config['compiler']['os'] == 'windows':
from pystencils.cpu.msvc_detection import get_environment
msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch']) msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
if 'env' not in config['compiler']: if 'env' not in config['compiler']:
config['compiler']['env'] = {} config['compiler']['env'] = {}
...@@ -273,6 +283,7 @@ def clear_cache(): ...@@ -273,6 +283,7 @@ def clear_cache():
create_folder(cache_config['object_cache'], False) create_folder(cache_config['object_cache'], False)
# TODO don't hardcode C type. [1] of tuple output
type_mapping = { type_mapping = {
np.float32: ('PyFloat_AsDouble', 'float'), np.float32: ('PyFloat_AsDouble', 'float'),
np.float64: ('PyFloat_AsDouble', 'double'), np.float64: ('PyFloat_AsDouble', 'double'),
...@@ -282,8 +293,6 @@ type_mapping = { ...@@ -282,8 +293,6 @@ type_mapping = {
np.uint16: ('PyLong_AsUnsignedLong', 'uint16_t'), np.uint16: ('PyLong_AsUnsignedLong', 'uint16_t'),
np.uint32: ('PyLong_AsUnsignedLong', 'uint32_t'), np.uint32: ('PyLong_AsUnsignedLong', 'uint32_t'),
np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'), np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'),
np.complex64: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexFloat'),
np.complex128: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexDouble'),
} }
template_extract_scalar = """ template_extract_scalar = """
...@@ -293,14 +302,6 @@ if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument ' ...@@ -293,14 +302,6 @@ if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '
if( PyErr_Occurred() ) {{ return NULL; }} if( PyErr_Occurred() ) {{ return NULL; }}
""" """
template_extract_complex = """
PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
{target_type} {name}{{ ({real_type}) {extract_function_real}( obj_{name} ),
({real_type}) {extract_function_imag}( obj_{name} ) }};
if( PyErr_Occurred() ) {{ return NULL; }}
"""
template_extract_array = """ template_extract_array = """
PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}"); PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }}; if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
...@@ -368,8 +369,7 @@ def equal_size_check(fields): ...@@ -368,8 +369,7 @@ def equal_size_check(fields):
return "" return ""
ref_field = fields[0] ref_field = fields[0]
cond = ["(buffer_{field.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])".format(ref_field=ref_field, cond = [f"(buffer_{field_to_test.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])"
field=field_to_test, i=i)
for field_to_test in fields[1:] for field_to_test in fields[1:]
for i in range(fields[0].spatial_dimensions)] for i in range(fields[0].spatial_dimensions)]
cond = " && ".join(cond) cond = " && ".join(cond)
...@@ -397,7 +397,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec ...@@ -397,7 +397,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
aligned = False aligned = False
if ast_node.assignments: if ast_node.assignments:
aligned = any([a.lhs.args[2] for a in ast_node.assignments aligned = any([a.lhs.args[2] for a in ast_node.assignments
if hasattr(a, 'lhs') and isinstance(a.lhs, cast_func) if hasattr(a, 'lhs') and isinstance(a.lhs, CastFunc)
and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)]) and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)])
if ast_node.instruction_set and aligned: if ast_node.instruction_set and aligned:
...@@ -407,9 +407,10 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec ...@@ -407,9 +407,10 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
for loop in ast_node.atoms(LoopOverCoordinate): for loop in ast_node.atoms(LoopOverCoordinate):
has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines]) has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
loop.atoms(vector_memory_access)]) loop.atoms(VectorMemoryAccess)])
if has_openmp and has_nontemporal: if has_openmp and has_nontemporal:
byte_width = ast_node.instruction_set['cachelineSize'] cl_size = ast_node.instruction_set['cachelineSize']
byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
offset = max(max(ast_node.ghost_layers)) * item_size offset = max(max(ast_node.ghost_layers)) * item_size
offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0" offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
...@@ -426,8 +427,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec ...@@ -426,8 +427,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
if (np_dtype.isbuiltin and FieldType.is_generic(field) if (np_dtype.isbuiltin and FieldType.is_generic(field)
and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)): and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)):
dtype_cond = "buffer_{name}.format[0] == '{format}'".format(name=field.name, dtype_cond = f"buffer_{field.name}.format[0] == '{field.dtype.numpy_dtype.char}'"
format=field.dtype.numpy_dtype.char)
pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name, pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name,
expected=str(field.dtype.numpy_dtype)) expected=str(field.dtype.numpy_dtype))
...@@ -456,25 +456,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec ...@@ -456,25 +456,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
elif param.is_field_stride: elif param.is_field_stride:
field = param.fields[0] field = param.fields[0]
item_size = field.dtype.numpy_dtype.itemsize item_size = field.dtype.numpy_dtype.itemsize
parameters.append("buffer_{name}.strides[{i}] / {bytes}".format(bytes=item_size, i=param.symbol.coordinate, parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
name=field.name))
elif param.is_field_shape: elif param.is_field_shape:
parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]") parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
elif type(param.symbol) is CFunction:
continue
else: else:
extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type] extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
if np.issubdtype(param.symbol.dtype.numpy_dtype, np.complexfloating): pre_call_code += template_extract_scalar.format(extract_function=extract_function,
pre_call_code += template_extract_complex.format(extract_function_real=extract_function[0], target_type=target_type,
extract_function_imag=extract_function[1], name=param.symbol.name)
target_type=target_type,
real_type="float" if target_type == "ComplexFloat"
else "double",
name=param.symbol.name)
else:
pre_call_code += template_extract_scalar.format(extract_function=extract_function,
target_type=target_type,
name=param.symbol.name)
parameters.append(param.symbol.name) parameters.append(param.symbol.name)
...@@ -494,18 +483,15 @@ def create_module_boilerplate_code(module_name, names): ...@@ -494,18 +483,15 @@ def create_module_boilerplate_code(module_name, names):
def load_kernel_from_file(module_name, function_name, path): def load_kernel_from_file(module_name, function_name, path):
from importlib.util import spec_from_file_location, module_from_spec
try: try:
spec = spec_from_file_location(name=module_name, location=path) spec = importlib.util.spec_from_file_location(name=module_name, location=path)
mod = module_from_spec(spec) mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod) spec.loader.exec_module(mod)
except ImportError: except ImportError:
import time warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
import warnings time.sleep(5)
warnings.warn("Could not load " + path + ", trying on more time...") spec = importlib.util.spec_from_file_location(name=module_name, location=path)
time.sleep(1) mod = importlib.util.module_from_spec(spec)
spec = spec_from_file_location(name=module_name, location=path)
mod = module_from_spec(spec)
spec.loader.exec_module(mod) spec.loader.exec_module(mod)
return getattr(mod, function_name) return getattr(mod, function_name)
...@@ -544,15 +530,19 @@ class ExtensionModuleCode: ...@@ -544,15 +530,19 @@ class ExtensionModuleCode:
headers = {'<math.h>', '<stdint.h>'} headers = {'<math.h>', '<stdint.h>'}
for ast in self._ast_nodes: for ast in self._ast_nodes:
for field in ast.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
# Add the half precision header only if half precision numbers occur in the AST
headers.add('"half_precision.h"')
headers.update(get_headers(ast)) headers.update(get_headers(ast))
header_list = list(headers)
header_list.sort() header_list = sorted(headers)
header_list.insert(0, '"Python.h"') header_list.insert(0, '"Python.h"')
ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))] if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
header_hash = b''.join([hashlib.sha256(open(h, 'rb').read()).digest() for h in ps_headers]) header_hash = b''.join([hashlib.sha256(open(h, 'rb').read()).digest() for h in ps_headers])
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) includes = "\n".join([f"#include {include_file}" for include_file in header_list])
self._code_string += includes self._code_string += includes
self._code_string += "\n" self._code_string += "\n"
self._code_string += f"#define RESTRICT {restrict_qualifier} \n" self._code_string += f"#define RESTRICT {restrict_qualifier} \n"
...@@ -561,7 +551,7 @@ class ExtensionModuleCode: ...@@ -561,7 +551,7 @@ class ExtensionModuleCode:
for ast, name in zip(self._ast_nodes, self._function_names): for ast, name in zip(self._ast_nodes, self._function_names):
old_name = ast.function_name old_name = ast.function_name
ast.function_name = "kernel_" + name ast.function_name = f"kernel_{name}"
self._code_string += generate_c(ast, custom_backend=self._custom_backend) self._code_string += generate_c(ast, custom_backend=self._custom_backend)
self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast) self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast)
ast.function_name = old_name ast.function_name = old_name
...@@ -578,9 +568,12 @@ class ExtensionModuleCode: ...@@ -578,9 +568,12 @@ class ExtensionModuleCode:
print(self._code_string, file=file) print(self._code_string, file=file)
def compile_module(code, code_hash, base_dir, compile_flags=[]): def compile_module(code, code_hash, base_dir, compile_flags=None):
if compile_flags is None:
compile_flags = []
compiler_config = get_compiler_config() compiler_config = get_compiler_config()
extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
if compiler_config['os'].lower() == 'windows': if compiler_config['os'].lower() == 'windows':
lib_suffix = '.pyd' lib_suffix = '.pyd'
...@@ -596,8 +589,11 @@ def compile_module(code, code_hash, base_dir, compile_flags=[]): ...@@ -596,8 +589,11 @@ def compile_module(code, code_hash, base_dir, compile_flags=[]):
object_file = os.path.join(base_dir, code_hash + object_suffix) object_file = os.path.join(base_dir, code_hash + object_suffix)
if not os.path.exists(object_file): if not os.path.exists(object_file):
with file_handle_for_atomic_write(src_file) as f: try:
code.write_to_file(f) with open(src_file, 'x') as f:
code.write_to_file(f)
except FileExistsError:
pass
if windows: if windows:
compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split() compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split()
...@@ -611,7 +607,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=[]): ...@@ -611,7 +607,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=[]):
# Linking # Linking
if windows: if windows:
import sysconfig
config_vars = sysconfig.get_config_vars() config_vars = sysconfig.get_config_vars()
py_lib = os.path.join(config_vars["installed_base"], "libs", py_lib = os.path.join(config_vars["installed_base"], "libs",
f"python{config_vars['py_version_nodot']}.lib") f"python{config_vars['py_version_nodot']}.lib")
...@@ -632,7 +627,12 @@ def compile_and_load(ast, custom_backend=None): ...@@ -632,7 +627,12 @@ def compile_and_load(ast, custom_backend=None):
cache_config = get_cache_config() cache_config = get_cache_config()
compiler_config = get_compiler_config() compiler_config = get_compiler_config()
function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else '' if compiler_config['os'].lower() == 'windows':
function_prefix = '__declspec(dllexport)'
elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
function_prefix = ast.instruction_set['function_prefix']
else:
function_prefix = ''
code = ExtensionModuleCode(custom_backend=custom_backend) code = ExtensionModuleCode(custom_backend=custom_backend)
code.add_function(ast, ast.function_name) code.add_function(ast, ast.function_name)
...@@ -645,7 +645,7 @@ def compile_and_load(ast, custom_backend=None): ...@@ -645,7 +645,7 @@ def compile_and_load(ast, custom_backend=None):
compile_flags = ast.instruction_set['compile_flags'] compile_flags = ast.instruction_set['compile_flags']
if cache_config['object_cache'] is False: if cache_config['object_cache'] is False:
with TemporaryDirectory() as base_dir: with tempfile.TemporaryDirectory() as base_dir:
lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags) lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file) result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
else: else:
......