Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 2245 additions and 194 deletions
from sympy.printing.printer import Printer
from graphviz import Digraph, lang
import graphviz import graphviz
try:
from graphviz import Digraph
import graphviz.quoting as quote
except ImportError:
from graphviz import Digraph
import graphviz.lang as quote
from sympy.printing.printer import Printer
# noinspection PyPep8Naming # noinspection PyPep8Naming
...@@ -12,7 +17,7 @@ class DotPrinter(Printer): ...@@ -12,7 +17,7 @@ class DotPrinter(Printer):
super(DotPrinter, self).__init__() super(DotPrinter, self).__init__()
self._node_to_str_function = node_to_str_function self._node_to_str_function = node_to_str_function
self.dot = Digraph(**kwargs) self.dot = Digraph(**kwargs)
self.dot.quote_edge = lang.quote self.dot.quote_edge = quote.quote
def _print_KernelFunction(self, func): def _print_KernelFunction(self, func):
self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func)) self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))
...@@ -50,22 +55,20 @@ class DotPrinter(Printer): ...@@ -50,22 +55,20 @@ class DotPrinter(Printer):
def __shortened(node): def __shortened(node):
from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Block, Conditional from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Conditional
if isinstance(node, LoopOverCoordinate): if isinstance(node, LoopOverCoordinate):
return "Loop over dim %d" % (node.coordinate_to_loop_over,) return "Loop over dim %d" % (node.coordinate_to_loop_over,)
elif isinstance(node, KernelFunction): elif isinstance(node, KernelFunction):
params = node.get_parameters() params = node.get_parameters()
param_names = [p.field_name for p in params if p.is_field_pointer] param_names = [p.field_name for p in params if p.is_field_pointer]
param_names += [p.symbol.name for p in params if not p.is_field_parameter] param_names += [p.symbol.name for p in params if not p.is_field_parameter]
return "Func: %s (%s)" % (node.function_name, ",".join(param_names)) return f"Func: {node.function_name} ({','.join(param_names)})"
elif isinstance(node, SympyAssignment): elif isinstance(node, SympyAssignment):
return repr(node.lhs) return repr(node.lhs)
elif isinstance(node, Block):
return "Block" + str(id(node))
elif isinstance(node, Conditional): elif isinstance(node, Conditional):
return repr(node) return repr(node)
else: else:
raise NotImplementedError("Cannot handle node type %s" % (type(node),)) raise NotImplementedError(f"Cannot handle node type {type(node)}")
def print_dot(node, view=False, short=False, **kwargs): def print_dot(node, view=False, short=False, **kwargs):
......
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import json
from pystencils.astnodes import NodeOrExpr
from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
try:
import yaml
except ImportError:
raise ImportError('yaml not installed')
def expr_to_dict(expr_or_node: NodeOrExpr, with_c_code=True, full_class_names=False):
"""Converts a SymPy expression to a serializable dict (mainly for debugging purposes)
The dict recursively contains all args of the expression as ``dict``s
See :func:`.write_json`
Args:
expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
with_c_code (bool, optional): include C representation of the nodes
full_class_names (bool, optional): use full class names (type(object) instead of ``type(object).__name__``
"""
self = {'str': str(expr_or_node)}
if with_c_code:
try:
self.update({'c': generate_c(expr_or_node)})
except Exception:
try:
self.update({'c': CustomSympyPrinter().doprint(expr_or_node)})
except Exception:
pass
for a in expr_or_node.args:
self.update({str(a.__class__ if full_class_names else a.__class__.__name__): expr_to_dict(a)})
return self
def print_json(expr_or_node: NodeOrExpr):
"""Print debug JSON of an AST to string
Args:
expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
Returns:
str: JSON representation of AST
"""
expr_or_node_dict = expr_to_dict(expr_or_node)
return json.dumps(expr_or_node_dict, indent=4)
def write_json(filename: str, expr_or_node: NodeOrExpr):
"""Writes debug JSON represenation of AST to file
Args:
filename (str): Path for the file to write
expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
"""
expr_or_node_dict = expr_to_dict(expr_or_node)
with open(filename, 'w') as f:
json.dump(expr_or_node_dict, f, indent=4)
def print_yaml(expr_or_node):
expr_or_node_dict = expr_to_dict(expr_or_node, full_class_names=False)
return yaml.dump(expr_or_node_dict)
def write_yaml(filename, expr_or_node):
expr_or_node_dict = expr_to_dict(expr_or_node)
with open(filename, 'w') as f:
yaml.dump(expr_or_node_dict, f)
def get_argument_string(function_shortcut):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
if instruction_set != 'vsx':
raise NotImplementedError(instruction_set)
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'sqrt': 'sqrt[0]',
'rsqrt': 'rsqrte[0]', # rsqrt is available too, but not on Clang
'loadU': 'xl[0x0, 0]',
'loadA': 'ld[0x0, 0]',
'storeU': 'xst[1, 0x0, 0]',
'storeA': 'st[1, 0x0, 0]',
'storeAAndFlushCacheline': 'stl[1, 0x0, 0]',
'abs': 'abs[0]',
'==': 'cmpeq[0, 1]',
'!=': 'cmpne[0, 1]',
'<=': 'cmple[0, 1]',
'<': 'cmplt[0, 1]',
'>=': 'cmpge[0, 1]',
'>': 'cmpgt[0, 1]',
'&': 'and[0, 1]',
'|': 'or[0, 1]',
'blendv': 'sel[0, 1, 2]',
('any', '=='): 'any_eq[0, 1]',
('any', '!='): 'any_ne[0, 1]',
('any', '<='): 'any_le[0, 1]',
('any', '<'): 'any_lt[0, 1]',
('any', '>='): 'any_ge[0, 1]',
('any', '>'): 'any_gt[0, 1]',
('all', '=='): 'all_eq[0, 1]',
('all', '!='): 'all_ne[0, 1]',
('all', '<='): 'all_le[0, 1]',
('all', '<'): 'all_lt[0, 1]',
('all', '>='): 'all_ge[0, 1]',
('all', '>'): 'all_gt[0, 1]',
}
bits = {'double': 64,
'float': 32,
'int': 32}
width = 128 // bits[data_type]
intwidth = 128 // bits['int']
result = dict()
result['bytes'] = 16
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
arg_string = get_argument_string(function_shortcut)
result[intrinsic_id] = 'vec_' + name + arg_string
if data_type == 'double':
# Clang and XL C++ are missing these for doubles
result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
'(__vector float) {1}')
result['+int'] = "vec_add({0}, {1})"
result['width'] = width
result['intwidth'] = intwidth
result[data_type] = f'__vector {data_type}'
result['int'] = '__vector int'
result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
result['makeVecConst'] = '((' + result[data_type] + '){{' + \
", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
result['makeVec'] = '((' + result[data_type] + '){{' + \
", ".join(['{' + data_type + '} {' + str(i) + '}' for i in range(width)]) + '}})'
result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
from pystencils.typing import CFunction
def get_argument_string(function_shortcut, last=''):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
if last:
arg_string += last + ','
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
assert instruction_set == 'rvv'
bits = {'double': 64,
'float': 32,
'int': 32}
base_names = {
'+': 'fadd_vv[0, 1]',
'-': 'fsub_vv[0, 1]',
'*': 'fmul_vv[0, 1]',
'/': 'fdiv_vv[0, 1]',
'sqrt': 'fsqrt_v[0]',
'loadU': f'le{bits[data_type]}_v[0]',
'storeU': f'se{bits[data_type]}_v[0, 1]',
'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
'loadS': f'lse{bits[data_type]}_v[0, 1]',
'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
'abs': 'fabs_v[0]',
'==': 'mfeq_vv[0, 1]',
'!=': 'mfne_vv[0, 1]',
'<=': 'mfle_vv[0, 1]',
'<': 'mflt_vv[0, 1]',
'>=': 'mfge_vv[0, 1]',
'>': 'mfgt_vv[0, 1]',
'&': 'mand_mm[0, 1]',
'|': 'mor_mm[0, 1]',
'blendv': 'merge_vvm[2, 0, 1]',
'any': 'cpop_m[0]',
'all': 'cpop_m[0]',
}
result = dict()
width = f'vsetvlmax_e{bits[data_type]}m1()'
intwidth = 'vsetvlmax_e{bits["int"]}m1()'
result['bytes'] = 'vsetvlmax_e8m1()'
prefix = 'v'
suffix = f'_f{bits[data_type]}m1'
vl = '{loop_stop} - {loop_counter}'
int_vl = f'({vl})*{bits[data_type]//bits["int"]}'
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
if name.startswith('mf'):
suffix2 = suffix + f'_b{bits[data_type]}'
elif name.endswith('_mm') or name.endswith('_m'):
suffix2 = f'_b{bits[data_type]}'
elif intrinsic_id.startswith('mask'):
suffix2 = suffix + '_m'
else:
suffix2 = suffix
arg_string = get_argument_string(function_shortcut, last=vl)
result[intrinsic_id] = prefix + name + suffix2 + arg_string
result['width'] = CFunction(width, "int")
result['intwidth'] = CFunction(intwidth, "int")
result['makeVecConst'] = f'vfmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
result['makeVecConstInt'] = f'vmv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
result['makeVecIndex'] = f'vmacc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
f'vid_v_i{bits["int"]}m1({int_vl}), {int_vl})'
result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
result['float'] = f'vfloat{bits["float"]}m1_t'
result['double'] = f'vfloat{bits["double"]}m1_t'
result['int'] = f'vint{bits["int"]}m1_t'
result['bool'] = f'vbool{bits[data_type]}_t'
result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
result['any'] += ' > 0x0'
result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
import os
import platform
from ctypes import CDLL, c_int, c_size_t, sizeof, byref
from warnings import warn
import numpy as np
from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
from pystencils.cache import memorycache
from pystencils.typing import numpy_name_to_c
def get_vector_instruction_set(data_type='double', instruction_set='avx'):
if data_type == 'float':
warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
f"releases")
data_type = 'float64'
type_name = numpy_name_to_c(np.dtype(data_type).name)
if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
return get_vector_instruction_set_arm(type_name, instruction_set)
elif instruction_set in ['vsx']:
return get_vector_instruction_set_ppc(type_name, instruction_set)
elif instruction_set in ['rvv']:
return get_vector_instruction_set_riscv(type_name, instruction_set)
else:
return get_vector_instruction_set_x86(type_name, instruction_set)
@memorycache
def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed."""
if 'PYSTENCILS_SIMD' in os.environ:
return os.environ['PYSTENCILS_SIMD'].split(',')
if platform.system() == 'Darwin' and platform.machine() == 'arm64':
result = ['neon']
libc = CDLL('/usr/lib/libc.dylib')
value = c_int(0)
size = c_size_t(sizeof(value))
status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
if status == 0 and value.value == 1:
result.insert(0, "sme")
return result
elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
return ['neon']
elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
result = ['neon'] # Neon is mandatory on 64-bit ARM
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap2 = libc.getauxval(26) # AT_HWCAP2
if hwcap & (1 << 22): # HWCAP_SVE
if hwcap2 & (1 << 1): # HWCAP2_SVE2
name = 'sve2'
else:
name = 'sve'
length = 8 * libc.prctl(51, 0, 0, 0, 0) # PR_SVE_GET_VL
if length < 0:
raise OSError("SVE length query failed")
while length >= 128:
result.append(f"{name}{length}")
length //= 2
result.append(name)
if hwcap2 & (1 << 23): # HWCAP2_SME
result.insert(0, "sme") # prepend to list so it is not automatically chosen as best instruction set
return result
elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
hwcap_isa_v = 1 << (ord('V') - ord('A')) # COMPAT_HWCAP_ISA_V
return ['rvv'] if hwcap & hwcap_isa_v else []
elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
libc = CDLL('libc.so.6')
hwcap = libc.getauxval(16) # AT_HWCAP
return ['vsx'] if hwcap & 0x00000080 else [] # PPC_FEATURE_HAS_VSX
elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
try:
from cpuinfo import get_cpu_info
except ImportError:
return None
result = []
required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
required_avx_flags = {'avx', 'avx2'}
required_avx512_flags = {'avx512f'}
possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
flags = set(get_cpu_info()['flags'])
if flags.issuperset(required_sse_flags):
result.append("sse")
if flags.issuperset(required_avx_flags):
result.append("avx")
if flags.issuperset(required_avx512_flags):
result.append("avx512")
if not flags.isdisjoint(possible_avx512vl_flags):
result.append("avx512vl")
return result
else:
raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
(platform.system(), platform.machine()))
@memorycache
def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size."""
instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets:
return None
import pystencils as ps
from pystencils.astnodes import SympyAssignment
import numpy as np
from pystencils.cpu.vectorization import CachelineSize
arr = np.zeros((1, 1), dtype=np.float32)
f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
ass = [CachelineSize(), SympyAssignment(f.center, CachelineSize.symbol)]
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile()
kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
return int(arr[0, 0])
def get_argument_string(intrinsic_id, width, function_shortcut):
if intrinsic_id == 'makeVecConst' or intrinsic_id == 'makeVecConstInt':
arg_string = f"({','.join(['{0}'] * width)})"
elif intrinsic_id == 'makeVec' or intrinsic_id == 'makeVecInt':
params = ["{" + str(i) + "}" for i in reversed(range(width))]
arg_string = f"({','.join(params)})"
elif intrinsic_id == 'makeVecBool':
params = [f"(({{{i}}} ? -1.0 : 0.0)" for i in reversed(range(width))]
arg_string = f"({','.join(params)})"
elif intrinsic_id == 'makeVecConstBool':
params = ["(({0}) ? -1.0 : 0.0)" for _ in range(width)]
arg_string = f"({','.join(params)})"
else:
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
comparisons = {
'==': '_CMP_EQ_UQ',
'!=': '_CMP_NEQ_UQ',
'>=': '_CMP_GE_OQ',
'<=': '_CMP_LE_OQ',
'<': '_CMP_NGE_UQ',
'>': '_CMP_NLE_UQ',
}
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'&': 'and[0, 1]',
'|': 'or[0, 1]',
'blendv': 'blendv[0, 1, 2]',
'sqrt': 'sqrt[0]',
'makeVecConst': 'set[]',
'makeVec': 'set[]',
'makeVecBool': 'set[]',
'makeVecConstBool': 'set[]',
'makeVecInt': 'set[]',
'makeVecConstInt': 'set[]',
'loadU': 'loadu[0]',
'loadA': 'load[0]',
'storeU': 'storeu[0,1]',
'storeA': 'store[0,1]',
'stream': 'stream[0,1]',
'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
}
for comparison_op, constant in comparisons.items():
base_names[comparison_op] = f'cmp[0, 1, {constant}]'
headers = {
'avx512': ['<immintrin.h>'],
'avx512vl': ['<immintrin.h>'],
'avx': ['<immintrin.h>'],
'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
'<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
}
suffix = {
'double': 'pd',
'float': 'ps',
'int': 'epi32'
}
prefix = {
'sse': '_mm',
'avx': '_mm256',
'avx512vl': '_mm256',
'avx512': '_mm512',
}
width = {
("double", "sse"): 2,
("float", "sse"): 4,
("int", "sse"): 4,
("double", "avx"): 4,
("float", "avx"): 8,
("int", "avx"): 8,
("double", "avx512vl"): 4,
("float", "avx512vl"): 8,
("int", "avx512vl"): 8,
("double", "avx512"): 8,
("float", "avx512"): 16,
("int", "avx512"): 16,
}
result = {
'width': width[(data_type, instruction_set)],
'intwidth': width[('int', instruction_set)],
'bytes': 4 * width[("float", instruction_set)]
}
pre = prefix[instruction_set]
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
if 'Int' in intrinsic_id:
suf = suffix['int']
arg_string = get_argument_string(intrinsic_id, result['intwidth'], function_shortcut)
else:
suf = suffix[data_type]
arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
bit_width = result['width'] * (64 if data_type == 'double' else 32)
result['double'] = f"__m{bit_width}d"
result['float'] = f"__m{bit_width}"
result['int'] = f"__m{bit_width}i"
result['bool'] = result[data_type]
result['headers'] = headers[instruction_set]
result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
if instruction_set.startswith('avx512'):
size = result['width']
masksize = max(size, 8)
result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
result['bool'] = f"__mmask{masksize}"
params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
result['makeVecBool'] = f"__mmask8(({params}) )"
params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
result['makeVecConstBool'] = f"__mmask8(({params}) )"
vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
scale = bit_width // size // 8
result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
f', {{1}}, {scale})'
result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
f', {{1}}, {scale})'
if bit_width == 512:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
else:
result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
# abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
if instruction_set == 'avx512':
result['abs'] = f"{pre}_abs_{suf}({{0}})"
else:
result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
f"{pre}_cast{suf}_si{bit_width}({{0}})))"
if instruction_set == 'avx' and data_type == 'float':
result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"
result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
result['streamFence'] = '_mm_mfence()'
return result
import sympy as sp
# from pystencils.typing import get_type_of_expression
# noinspection PyPep8Naming
class flag_cond(sp.Function):
"""Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
depending on whether the flag is set.
Three argument version:
```
flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
```
Four argument version:
```
flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
```
"""
nargs = (3, 4)
def __new__(cls, flag_bit, mask_expression, *expressions):
# TODO Jan reintroduce checking
# flag_dtype = get_type_of_expression(flag_bit)
# if not flag_dtype.is_int():
# raise ValueError('Argument flag_bit must be of integer type.')
#
# mask_dtype = get_type_of_expression(mask_expression)
# if not mask_dtype.is_int():
# raise ValueError('Argument mask_expression must be of integer type.')
return super().__new__(cls, flag_bit, mask_expression, *expressions)
def to_c(self, print_func):
flag_bit = self.args[0]
mask = self.args[1]
then_expression = self.args[2]
flag_bit_code = print_func(flag_bit)
mask_code = print_func(mask)
then_code = print_func(then_expression)
code = f"(({mask_code}) >> ({flag_bit_code}) & 1) * ({then_code})"
if len(self.args) > 3:
else_expression = self.args[3]
else_code = print_func(else_expression)
code += f" + (({mask_code}) >> ({flag_bit_code}) ^ 1) * ({else_code})"
return code
from pystencils.boundaries.boundaryconditions import Dirichlet, Neumann
from pystencils.boundaries.boundaryhandling import BoundaryHandling from pystencils.boundaries.boundaryhandling import BoundaryHandling
from pystencils.boundaries.boundaryconditions import Neumann, Dirichlet
from pystencils.boundaries.inkernel import add_neumann_boundary from pystencils.boundaries.inkernel import add_neumann_boundary
__all__ = ['BoundaryHandling', 'Neumann', 'Dirichlet', 'add_neumann_boundary'] __all__ = ['BoundaryHandling', 'Neumann', 'Dirichlet', 'add_neumann_boundary']
from typing import List, Tuple, Any from typing import Any, List, Tuple
from pystencils import Assignment
from pystencils.astnodes import SympyAssignment
from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
from pystencils.data_types import create_type from pystencils.typing import create_type
class Boundary: class Boundary:
...@@ -13,7 +14,7 @@ class Boundary: ...@@ -13,7 +14,7 @@ class Boundary:
def __init__(self, name=None): def __init__(self, name=None):
self._name = name self._name = name
def __call__(self, field, direction_symbol, index_field) -> List[Assignment]: def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
"""Defines the boundary behavior and must therefore be implemented by all boundaries. """Defines the boundary behavior and must therefore be implemented by all boundaries.
Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated. Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
...@@ -62,20 +63,20 @@ class Neumann(Boundary): ...@@ -62,20 +63,20 @@ class Neumann(Boundary):
neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions) neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field.center, field[neighbor])] return [SympyAssignment(field.center, field[neighbor])]
else: else:
from itertools import product from itertools import product
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Neumann boundary works only for fields with fixed index shape") raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
index_iter = product(*(range(i) for i in field.index_shape)) index_iter = product(*(range(i) for i in field.index_shape))
return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter] return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
def __hash__(self): def __hash__(self):
# All boundaries of these class behave equal -> should also be equal # All boundaries of these class behave equal -> should also be equal
return hash("Neumann") return hash("Neumann")
def __eq__(self, other): def __eq__(self, other):
return type(other) == Neumann return type(other) is Neumann
class Dirichlet(Boundary): class Dirichlet(Boundary):
...@@ -83,7 +84,7 @@ class Dirichlet(Boundary): ...@@ -83,7 +84,7 @@ class Dirichlet(Boundary):
inner_or_boundary = False inner_or_boundary = False
single_link = True single_link = True
def __init__(self, value, name="Dirchlet"): def __init__(self, value, name=None):
super().__init__(name) super().__init__(name)
self._value = value self._value = value
...@@ -102,11 +103,11 @@ class Dirichlet(Boundary): ...@@ -102,11 +103,11 @@ class Dirichlet(Boundary):
def __call__(self, field, direction_symbol, index_field, **kwargs): def __call__(self, field, direction_symbol, index_field, **kwargs):
if field.index_dimensions == 0: if field.index_dimensions == 0:
return [Assignment(field, index_field("value") if self.additional_data else self._value)] return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
elif field.index_dimensions == 1: elif field.index_dimensions == 1:
assert not self.additional_data assert not self.additional_data
if not field.has_fixed_index_shape: if not field.has_fixed_index_shape:
raise NotImplementedError("Field needs fixed index shape") raise NotImplementedError("Field needs fixed index shape")
assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field" assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])] return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension") raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
from functools import lru_cache
import numpy as np import numpy as np
import sympy as sp import sympy as sp
from pystencils.assignment import Assignment
from pystencils import Field, TypedSymbol, create_indexed_kernel from pystencils import create_kernel, CreateKernelConfig, Target
from pystencils.astnodes import SympyAssignment
from pystencils.backends.cbackend import CustomCodeNode from pystencils.backends.cbackend import CustomCodeNode
from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object, create_boundary_index_array from pystencils.boundaries.createindexlist import (
from pystencils.cache import memorycache create_boundary_index_array, numpy_data_type_for_boundary_object)
from pystencils.data_types import create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.kernelparameters import FieldPointerSymbol from pystencils.gpu.gpu_array_handler import GPUArrayHandler
from pystencils.field import Field
from pystencils.typing.typed_sympy import FieldPointerSymbol
try:
# noinspection PyPep8Naming
import waLBerla as wlb
if wlb.cpp_available:
from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
import cupy.cuda.runtime
else:
ParallelDataHandling = None
except ImportError:
ParallelDataHandling = None
DEFAULT_FLAG_TYPE = np.uint32 DEFAULT_FLAG_TYPE = np.uint32
...@@ -19,11 +35,11 @@ class FlagInterface: ...@@ -19,11 +35,11 @@ class FlagInterface:
>>> dh = create_data_handling((4, 5)) >>> dh = create_data_handling((4, 5))
>>> fi = FlagInterface(dh, 'flag_field', np.uint8) >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
>>> assert dh.has_data('flag_field') >>> assert dh.has_data('flag_field')
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
2 2
>>> fi.reserve_flag(4) >>> int(fi.reserve_flag(4))
4 4
>>> fi.reserve_next_flag() >>> int(fi.reserve_next_flag())
8 8
""" """
...@@ -52,13 +68,13 @@ class FlagInterface: ...@@ -52,13 +68,13 @@ class FlagInterface:
self._used_flags.add(flag) self._used_flags.add(flag)
assert self._is_power_of_2(flag) assert self._is_power_of_2(flag)
return flag return flag
raise ValueError("All available {} flags are reserved".format(self.max_bits)) raise ValueError(f"All available {self.max_bits} flags are reserved")
def reserve_flag(self, flag): def reserve_flag(self, flag):
assert self._is_power_of_2(flag) assert self._is_power_of_2(flag)
flag = self.dtype(flag) flag = self.dtype(flag)
if flag in self._used_flags: if flag in self._used_flags:
raise ValueError("The flag {flag} is already reserved".format(flag=flag)) raise ValueError(f"The flag {flag} is already reserved")
self._used_flags.add(flag) self._used_flags.add(flag)
return flag return flag
...@@ -70,7 +86,7 @@ class FlagInterface: ...@@ -70,7 +86,7 @@ class FlagInterface:
class BoundaryHandling: class BoundaryHandling:
def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None, def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
target='cpu', openmp=True): target: Target = Target.CPU, openmp=True):
assert data_handling.has_data(field_name) assert data_handling.has_data(field_name)
assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match" assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
self._data_handling = data_handling self._data_handling = data_handling
...@@ -84,8 +100,33 @@ class BoundaryHandling: ...@@ -84,8 +100,33 @@ class BoundaryHandling:
fi = flag_interface fi = flag_interface
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags") self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
gpu = self._target == 'gpu' if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
data_handling.add_custom_class(self._index_array_name, self.IndexFieldBlockData, cpu=True, gpu=gpu) array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
else:
array_handler = self.data_handling.array_handler
def to_cpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
array_handler.download(gpu_version[obj], cpu_arr)
def to_gpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
array_handler.upload(gpu_version[obj], cpu_arr)
else:
array_handler.upload(gpu_version[obj], cpu_arr)
class_ = self.IndexFieldBlockData
class_.to_cpu = to_cpu
class_.to_gpu = to_gpu
gpu = self._target in data_handling._GPU_LIKE_TARGETS
data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)
@property @property
def data_handling(self): def data_handling(self):
...@@ -201,7 +242,7 @@ class BoundaryHandling: ...@@ -201,7 +242,7 @@ class BoundaryHandling:
if self._dirty: if self._dirty:
self.prepare() self.prepare()
for b in self._data_handling.iterate(gpu=self._target == 'gpu'): for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
kwargs[self._field_name] = b[self._field_name] kwargs[self._field_name] = b[self._field_name]
kwargs['indexField'] = idx_arr kwargs['indexField'] = idx_arr
...@@ -216,7 +257,7 @@ class BoundaryHandling: ...@@ -216,7 +257,7 @@ class BoundaryHandling:
if self._dirty: if self._dirty:
self.prepare() self.prepare()
for b in self._data_handling.iterate(gpu=self._target == 'gpu'): for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
arguments = kwargs.copy() arguments = kwargs.copy()
arguments[self._field_name] = b[self._field_name] arguments[self._field_name] = b[self._field_name]
...@@ -233,11 +274,13 @@ class BoundaryHandling: ...@@ -233,11 +274,13 @@ class BoundaryHandling:
""" """
Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0 Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0
This can be used to display the simulation geometry in Paraview This can be used to display the simulation geometry in Paraview
:param file_name: vtk filename
:param boundaries: boundary object, or special string 'domain' for domain cells or special string 'all' for all Params:
boundary conditions. file_name: vtk filename
can also be a sequence, to write multiple boundaries to VTK file boundaries: boundary object, or special string 'domain' for domain cells or special string 'all' for all
:param ghost_layers: number of ghost layers to write, or True for all, False for none boundary conditions.
can also be a sequence, to write multiple boundaries to VTK file
ghost_layers: number of ghost layers to write, or True for all, False for none
""" """
if boundaries == 'all': if boundaries == 'all':
boundaries = list(self._boundary_object_to_boundary_info.keys()) + ['domain'] boundaries = list(self._boundary_object_to_boundary_info.keys()) + ['domain']
...@@ -272,7 +315,7 @@ class BoundaryHandling: ...@@ -272,7 +315,7 @@ class BoundaryHandling:
def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj): def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj):
return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj, return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj,
target=self._target, openmp=self._openmp) target=self._target, cpu_openmp=self._openmp)
def _create_index_fields(self): def _create_index_fields(self):
dh = self._data_handling dh = self._data_handling
...@@ -299,7 +342,7 @@ class BoundaryHandling: ...@@ -299,7 +342,7 @@ class BoundaryHandling:
def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs): def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
if boundary_obj.additional_data_init_callback: if boundary_obj.additional_data_init_callback:
boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs) boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
if self._target == 'gpu': if self._target in self._data_handling._GPU_LIKE_TARGETS:
self._data_handling.to_gpu(self._index_array_name) self._data_handling.to_gpu(self._index_array_name)
class BoundaryInfo(object): class BoundaryInfo(object):
...@@ -309,7 +352,7 @@ class BoundaryHandling: ...@@ -309,7 +352,7 @@ class BoundaryHandling:
self.kernel = kernel self.kernel = kernel
class IndexFieldBlockData: class IndexFieldBlockData:
def __init__(self, *_1, **_2): def __init__(self, *args, **kwargs):
self.boundary_object_to_index_list = {} self.boundary_object_to_index_list = {}
self.boundary_object_to_data_setter = {} self.boundary_object_to_data_setter = {}
...@@ -317,24 +360,6 @@ class BoundaryHandling: ...@@ -317,24 +360,6 @@ class BoundaryHandling:
self.boundary_object_to_index_list.clear() self.boundary_object_to_index_list.clear()
self.boundary_object_to_data_setter.clear() self.boundary_object_to_data_setter.clear()
@staticmethod
def to_cpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
gpu_version[obj].get(cpu_arr)
@staticmethod
def to_gpu(gpu_version, cpu_version):
from pycuda import gpuarray
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = gpuarray.to_gpu(cpu_arr)
else:
gpu_version[obj].set(cpu_arr)
class BoundaryDataSetter: class BoundaryDataSetter:
...@@ -356,26 +381,26 @@ class BoundaryDataSetter: ...@@ -356,26 +381,26 @@ class BoundaryDataSetter:
assert coord < self.dim assert coord < self.dim
return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5 return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
@memorycache() @lru_cache()
def link_offsets(self): def link_offsets(self):
return self.stencil[self.index_array['dir']] return self.stencil[self.index_array['dir']]
@memorycache() @lru_cache()
def link_positions(self, coord): def link_positions(self, coord):
return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
@memorycache() @lru_cache()
def boundary_cell_positions(self, coord): def boundary_cell_positions(self, coord):
return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord] return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
def __setitem__(self, key, value): def __setitem__(self, key, value):
if key not in self.boundary_data_names: if key not in self.boundary_data_names:
raise KeyError("Invalid boundary data name %s. Allowed are %s" % (key, self.boundary_data_names)) raise KeyError(f"Invalid boundary data name {key}. Allowed are {self.boundary_data_names}")
self.index_array[key] = value self.index_array[key] = value
def __getitem__(self, item): def __getitem__(self, item):
if item not in self.boundary_data_names: if item not in self.boundary_data_names:
raise KeyError("Invalid boundary data name %s. Allowed are %s" % (item, self.boundary_data_names)) raise KeyError(f"Invalid boundary data name {item}. Allowed are {self.boundary_data_names}")
return self.index_array[item] return self.index_array[item]
...@@ -401,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode): ...@@ -401,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
code = "\n" code = "\n"
for i in range(dim): for i in range(dim):
offset_str = ", ".join([str(d[i]) for d in stencil]) offset_str = ", ".join([str(d[i]) for d in stencil])
code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str) code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
inv_dirs = [] inv_dirs = []
for direction in stencil: for direction in stencil:
inverse_dir = tuple([-i for i in direction]) inverse_dir = tuple([-i for i in direction])
inv_dirs.append(str(stencil.index(inverse_dir))) inv_dirs.append(str(stencil.index(inverse_dir)))
code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs)) code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
offset_symbols = BoundaryOffsetInfo._offset_symbols(dim) offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(), super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL])) symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
@staticmethod @staticmethod
def _offset_symbols(dim): def _offset_symbols(dim):
return [TypedSymbol("c%s" % (d,), create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]] return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
INV_DIR_SYMBOL = TypedSymbol("invdir", "int") INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', openmp=True): def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
elements = [BoundaryOffsetInfo(stencil)] elements = [BoundaryOffsetInfo(stencil)]
index_arr_dtype = index_field.dtype.numpy_dtype dir_symbol = TypedSymbol("dir", np.int32)
dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0]) elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
elements += [Assignment(dir_symbol, index_field[0]('dir'))]
elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field) elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
return create_indexed_kernel(elements, [index_field], target=target, cpu_openmp=openmp) config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
**kernel_creation_args)
return create_kernel(elements, config=config)
import numpy as np
import itertools
import warnings import warnings
import numpy as np
try: try:
import pyximport import pyximport
pyximport.install(language_level=3) pyximport.install(language_level=3)
from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
cython_funcs_available = True cython_funcs_available = True
except Exception: except ImportError:
cython_funcs_available = False cython_funcs_available = False
create_boundary_index_list_2d = None
create_boundary_index_list_3d = None if cython_funcs_available:
from pystencils.boundaries.createindexlistcython import (
create_boundary_neighbor_index_list_2d,
create_boundary_neighbor_index_list_3d,
create_boundary_cell_index_list_2d,
create_boundary_cell_index_list_3d,
)
boundary_index_array_coordinate_names = ["x", "y", "z"] boundary_index_array_coordinate_names = ["x", "y", "z"]
direction_member_name = "dir" direction_member_name = "dir"
default_index_array_dtype = np.int32
def numpy_data_type_for_boundary_object(boundary_object, dim): def numpy_data_type_for_boundary_object(boundary_object, dim):
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
return np.dtype([(name, np.int32) for name in coordinate_names] return np.dtype(
+ [(direction_member_name, np.int32)] [(name, default_index_array_dtype) for name in coordinate_names]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True) + [(direction_member_name, default_index_array_dtype)]
+ [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
align=True,
def _create_boundary_neighbor_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask, )
fluid_mask, stencil, single_link):
coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)]) def _create_index_list_python(
flag_field_arr,
result = [] boundary_mask,
gl = nr_of_ghost_layers fluid_mask,
for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])): stencil,
cell = cell[::-1] single_link,
if not flag_field_arr[cell] & fluid_mask: inner_or_boundary=False,
continue nr_of_ghost_layers=None,
):
if inner_or_boundary and nr_of_ghost_layers is None:
raise ValueError(
"If inner_or_boundary is set True the number of ghost layers "
"around the inner domain has to be specified"
)
if nr_of_ghost_layers is None:
nr_of_ghost_layers = 0
coordinate_names = boundary_index_array_coordinate_names[
: len(flag_field_arr.shape)
]
index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
+ [(direction_member_name, default_index_array_dtype)]
)
# boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
# have to be sorted.
boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
for i in range(len(flag_field_arr.shape)):
boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
# First a set is created to save all fluid cells which are near boundary
fluid_cells = set()
for cell in boundary_cells:
cell = tuple(cell)
for dir_idx, direction in enumerate(stencil): for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]) neighbor_cell = tuple(
if flag_field_arr[neighbor_cell] & boundary_mask: [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
result.append(cell + (dir_idx,)) )
if single_link: # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
continue if any(
not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
return np.array(result, dtype=index_arr_dtype) for e, upper in zip(neighbor_cell, flag_field_arr.shape)
):
continue
if flag_field_arr[neighbor_cell] & fluid_mask:
fluid_cells.add(neighbor_cell)
# then this is set is transformed to a list to make it sortable. This ensures continoous memory access later.
fluid_cells = list(fluid_cells)
if len(flag_field_arr.shape) == 3:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
else:
fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
def _create_boundary_cell_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask, cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
fluid_mask, stencil, single_link): checkmask = boundary_mask if inner_or_boundary else fluid_mask
coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
result = [] result = []
gl = nr_of_ghost_layers for cell in cells_to_iterate:
for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])): cell = tuple(cell)
cell = cell[::-1] sum_cells = np.zeros(len(cell))
if not flag_field_arr[cell] & boundary_mask:
continue
for dir_idx, direction in enumerate(stencil): for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]) neighbor_cell = tuple(
neighbor_is_fluid = False [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
try: )
neighbor_is_fluid = flag_field_arr[neighbor_cell] & fluid_mask # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
except IndexError: if any(
pass not 0 <= e < upper
if neighbor_is_fluid: for e, upper in zip(neighbor_cell, flag_field_arr.shape)
result.append(cell + (dir_idx,)) ):
continue
if flag_field_arr[neighbor_cell] & checkmask:
if single_link: if single_link:
continue sum_cells += np.array(direction)
else:
result.append(tuple(cell) + (dir_idx,))
# the discrete normal direction is the one which gives the maximum inner product to the stencil direction
if single_link and any(sum_cells != 0):
idx = np.argmax(np.inner(sum_cells, stencil))
result.append(tuple(cell) + (idx,))
return np.array(result, dtype=index_arr_dtype) return np.array(result, dtype=index_arr_dtype)
def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, def create_boundary_index_list(
nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False): flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
"""Creates a numpy array storing links (connections) between domain cells and boundary cells. """Creates a numpy array storing links (connections) between domain cells and boundary cells.
Args: Args:
...@@ -86,50 +141,79 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask, ...@@ -86,50 +141,79 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
nr_of_ghost_layers: only relevant if neighbors is True nr_of_ghost_layers: only relevant if neighbors is True
inner_or_boundary: if true, the result contains the cell coordinates of the domain cells - inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
if false the boundary cells are listed if false the boundary cells are listed
single_link: if true only the first link is reported from this cell single_link: if true only the link in normal direction to this cell is reported
""" """
dim = len(flag_field.shape) dim = len(flag_field.shape)
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)]) index_arr_dtype = np.dtype(
[(name, default_index_array_dtype) for name in coordinate_names]
stencil = np.array(stencil, dtype=np.int32) + [(direction_member_name, default_index_array_dtype)]
args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link) )
stencil = np.array(stencil, dtype=default_index_array_dtype)
args = (
flag_field,
nr_of_ghost_layers,
boundary_mask,
fluid_mask,
stencil,
single_link,
)
args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
if cython_funcs_available: if cython_funcs_available:
if dim == 2: if dim == 2:
if inner_or_boundary: if inner_or_boundary:
idx_list = create_boundary_neighbor_index_list_2d(*args) idx_list = create_boundary_neighbor_index_list_2d(*args)
else: else:
idx_list = create_boundary_cell_index_list_2d(*args) idx_list = create_boundary_cell_index_list_2d(*args_no_gl)
elif dim == 3: elif dim == 3:
if inner_or_boundary: if inner_or_boundary:
idx_list = create_boundary_neighbor_index_list_3d(*args) idx_list = create_boundary_neighbor_index_list_3d(*args)
else: else:
idx_list = create_boundary_cell_index_list_3d(*args) idx_list = create_boundary_cell_index_list_3d(*args_no_gl)
else: else:
raise ValueError("Flag field has to be a 2 or 3 dimensional numpy array") raise ValueError("Flag field has to be a 2 or 3 dimensional numpy array")
return np.array(idx_list, dtype=index_arr_dtype) return np.array(idx_list, dtype=index_arr_dtype)
else: else:
if flag_field.size > 1e6: if flag_field.size > 1e6:
warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up") warnings.warn(
if inner_or_boundary: "Boundary setup may take very long! Consider installing cython to speed it up"
return _create_boundary_neighbor_index_list_python(*args) )
else: return _create_index_list_python(
return _create_boundary_cell_index_list_python(*args) *args_no_gl,
inner_or_boundary=inner_or_boundary,
nr_of_ghost_layers=nr_of_ghost_layers,
def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object, )
nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
nr_of_ghost_layers, inner_or_boundary, single_link) def create_boundary_index_array(
flag_field,
stencil,
boundary_mask,
fluid_mask,
boundary_object,
nr_of_ghost_layers=1,
inner_or_boundary=True,
single_link=False,
):
idx_array = create_boundary_index_list(
flag_field,
stencil,
boundary_mask,
fluid_mask,
nr_of_ghost_layers,
inner_or_boundary,
single_link,
)
dim = len(flag_field.shape) dim = len(flag_field.shape)
if boundary_object.additional_data: if boundary_object.additional_data:
coordinate_names = boundary_index_array_coordinate_names[:dim] coordinate_names = boundary_index_array_coordinate_names[:dim]
index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim) index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype) extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
for prop in coordinate_names + ['dir']: for prop in coordinate_names + ["dir"]:
extended_idx_field[prop] = idx_array[prop] extended_idx_field[prop] = idx_array[prop]
idx_array = extended_idx_field idx_array = extended_idx_field
......
# Workaround for cython bug # cython: language_level=3str
# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
WORKAROUND = "Something"
import cython import cython
...@@ -21,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel ...@@ -21,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel
cdef int xs, ys, x, y cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape xs, ys = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers): for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers): for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & fluid_mask: if flag_field[x, y] & fluid_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
dy = stencil[dirIdx,1]
if flag_field[x + dx, y + dy] & boundary_mask: if flag_field[x + dx, y + dy] & boundary_mask:
boundary_index_list.append((x,y, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy;
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -46,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -46,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
cdef int xs, ys, zs, x, y, z cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape xs, ys, zs = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
...@@ -53,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -53,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers): for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers): for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers): for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & fluid_mask: if flag_field[x, y, z] & fluid_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
dy = stencil[dirIdx,1]
dz = stencil[dirIdx,2]
if flag_field[x + dx, y + dy, z + dz] & boundary_mask: if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
boundary_index_list.append((x,y,z, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx * sum_x + dy * sum_y + dz * sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list return boundary_index_list
...@@ -69,37 +100,59 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel ...@@ -69,37 +100,59 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
@cython.boundscheck(False) # turn off bounds-checking for entire function @cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function @cython.wraparound(False) # turn off negative index wrapping for entire function
def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field, def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
int nr_of_ghost_layers, IntegerType boundary_mask, IntegerType fluid_mask, IntegerType boundary_mask, IntegerType fluid_mask,
object[int, ndim=2] stencil, int single_link): object[int, ndim=2] stencil, int single_link):
cdef int xs, ys, x, y cdef int xs, ys, x, y
cdef int dirIdx, num_directions, dx, dy cdef int dirIdx, num_directions, dx, dy
cdef int sum_x, sum_y
cdef float dot, maxn
cdef int calculated_idx
xs, ys = flag_field.shape xs, ys = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
for y in range(0, ys): for y in range(0, ys):
for x in range(0, xs): for x in range(0, xs):
sum_x = 0; sum_y = 0;
if flag_field[x, y] & boundary_mask: if flag_field[x, y] & boundary_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
dy = stencil[dirIdx,1]
if 0 <= x + dx < xs and 0 <= y + dy < ys: if 0 <= x + dx < xs and 0 <= y + dy < ys:
if flag_field[x + dx, y + dy] & fluid_mask: if flag_field[x + dx, y + dy] & fluid_mask:
boundary_index_list.append((x,y, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy
else:
boundary_index_list.append((x, y, dirIdx))
dot = 0; maxn = 0; calculated_idx = 0
if single_link and (sum_x != 0 or sum_y != 0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
dot = dx * sum_x + dy * sum_y
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, calculated_idx))
return boundary_index_list return boundary_index_list
@cython.boundscheck(False) # turn off bounds-checking for entire function @cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function @cython.wraparound(False) # turn off negative index wrapping for entire function
def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field, def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
int nr_of_ghost_layers, IntegerType boundary_mask, IntegerType fluid_mask, IntegerType boundary_mask, IntegerType fluid_mask,
object[int, ndim=2] stencil, int single_link): object[int, ndim=2] stencil, int single_link):
cdef int xs, ys, zs, x, y, z cdef int xs, ys, zs, x, y, z
cdef int dirIdx, num_directions, dx, dy, dz cdef int dirIdx, num_directions, dx, dy, dz
cdef int sum_x, sum_y, sum_z
cdef float dot, maxn
cdef int calculated_idx
xs, ys, zs = flag_field.shape xs, ys, zs = flag_field.shape
boundary_index_list = [] boundary_index_list = []
num_directions = stencil.shape[0] num_directions = stencil.shape[0]
...@@ -107,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field, ...@@ -107,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
for z in range(0, zs): for z in range(0, zs):
for y in range(0, ys): for y in range(0, ys):
for x in range(0, xs): for x in range(0, xs):
sum_x = 0; sum_y = 0; sum_z = 0
if flag_field[x, y, z] & boundary_mask: if flag_field[x, y, z] & boundary_mask:
for dirIdx in range(num_directions): for dirIdx in range(num_directions):
dx = stencil[dirIdx,0] dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dy = stencil[dirIdx,1]
dz = stencil[dirIdx,2]
if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs: if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
if flag_field[x + dx, y + dy, z + dz] & fluid_mask: if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
boundary_index_list.append((x,y,z, dirIdx))
if single_link: if single_link:
break sum_x += dx; sum_y += dy; sum_z += dz
else:
boundary_index_list.append((x, y, z, dirIdx))
dot = 0; maxn = 0; calculated_idx=0
if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
for dirIdx in range(num_directions):
dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
dot = dx*sum_x + dy*sum_y + dz*sum_z
if dot > maxn:
maxn = dot
calculated_idx = dirIdx
boundary_index_list.append((x, y, z, calculated_idx))
return boundary_index_list return boundary_index_list
\ No newline at end of file
import sympy as sp import sympy as sp
from pystencils import Field, TypedSymbol
from pystencils.integer_functions import bitwise_and
from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
from pystencils.data_types import create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.field import Field
from pystencils.integer_functions import bitwise_and
def add_neumann_boundary(eqs, fields, flag_field, boundary_flag="neumann_flag", inverse_flag=False): def add_neumann_boundary(eqs, fields, flag_field, boundary_flag="neumann_flag", inverse_flag=False):
""" """
Replaces all neighbor accesses by flag field guarded accesses. Replaces all neighbor accesses by flag field guarded accesses.
If flag in neighboring cell is set, the center value is used instead If flag in neighboring cell is set, the center value is used instead
:param eqs: list of equations containing field accesses to direct neighbors
:param fields: fields for which the Neumann boundary should be applied Args:
:param flag_field: integer field marking boundary cells eqs: list of equations containing field accesses to direct neighbors
:param boundary_flag: if flag field has value 'boundary_flag' (no bit operations yet) fields: fields for which the Neumann boundary should be applied
the cell is assumed to be boundary flag_field: integer field marking boundary cells
:param inverse_flag: if true, boundary cells are where flag field has not the value of boundary_flag boundary_flag: if flag field has value 'boundary_flag' (no bit operations yet)
:return: list of equations with guarded field accesses the cell is assumed to be boundary
inverse_flag: if true, boundary cells are where flag field has not the value of boundary_flag
Returns:
list of equations with guarded field accesses
""" """
if not hasattr(fields, "__len__"): if not hasattr(fields, "__len__"):
fields = [fields] fields = [fields]
......
import os
from collections.abc import Hashable
from functools import partial, wraps
from itertools import chain
from functools import lru_cache as memorycache
from joblib import Memory
from appdirs import user_cache_dir
if 'PYSTENCILS_CACHE_DIR' in os.environ:
cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
else:
cache_dir = user_cache_dir('pystencils')
disk_cache = Memory(cache_dir, verbose=False).cache
disk_cache_no_fallback = disk_cache
def _wrapper(wrapped_func, cached_func, *args, **kwargs):
if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
return cached_func(*args, **kwargs)
else:
return wrapped_func(*args, **kwargs)
def memorycache_if_hashable(maxsize=128, typed=False):
def wrapper(func):
return partial(_wrapper, func, memorycache(maxsize, typed)(func))
return wrapper
def sharedmethodcache(cache_id: str):
"""Decorator for memoization of instance methods, allowing multiple methods to use the same cache.
This decorator caches results of instance methods per instantiated object of the surrounding class.
It allows multiple methods to use the same cache, by passing them the same `cache_id` string.
Cached values are stored in a dictionary, which is added as a member `self.<cache_id>` to the
`self` object instance. Make sure that this doesn't cause any naming conflicts with other members!
Of course, for this to be useful, said methods must have the same signature (up to additional kwargs)
and must return the same result when called with the same arguments."""
def _decorator(user_method):
@wraps(user_method)
def _decorated_func(self, *args, **kwargs):
objdict = self.__dict__
cache = objdict.setdefault(cache_id, dict())
key = args
for item in kwargs.items():
key += item
if key not in cache:
result = user_method(self, *args, **kwargs)
cache[key] = result
return result
else:
return cache[key]
return _decorated_func
return _decorator
def clear_cache():
"""
Clears the pystencils cache created by joblib.
"""
memory = Memory(cache_dir, verbose=0)
memory.clear(warn=False)
# Disable memory cache:
# disk_cache = lambda o: o
# disk_cache_no_fallback = lambda o: o
from copy import copy
from collections import defaultdict
from dataclasses import dataclass, field
from types import MappingProxyType
from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable
from pystencils import Target, Backend, Field
from pystencils.typing.typed_sympy import BasicType
from pystencils.typing.utilities import collate_types
import numpy as np
# TODO: There exists DTypeLike in NumPy which would be better than type for type hinting, to new at the moment
# from numpy.typing import DTypeLike
# TODO: CreateKernelConfig is bloated think of more classes better usage, factory whatever ...
# Proposition: CreateKernelConfigs Classes for different targets?
@dataclass
class CreateKernelConfig:
"""
**Below all parameters for the CreateKernelConfig are explained**
"""
target: Target = Target.CPU
"""
All targets are defined in :class:`pystencils.enums.Target`
"""
backend: Backend = None
"""
All backends are defined in :class:`pystencils.enums.Backend`
"""
function_name: str = 'kernel'
"""
Name of the generated function - only important if generated code is written out
"""
data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = np.float64
"""
Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type.
If specified as a dict ideally a defaultdict is used to define a default value for symbols not listed in the
dict. If a plain dict is provided it will be transformed into a defaultdict internally. The default value
will then be specified via type collation then.
"""
default_number_float: Union[type, str, BasicType] = None
"""
Data type used for all untyped floating point numbers (i.e. 0.5). By default the value of data_type is used.
If data_type is given as a defaultdict its default_factory is used.
"""
default_number_int: Union[type, str, BasicType] = np.int64
"""
Data type used for all untyped integer numbers (i.e. 1)
"""
iteration_slice: Tuple = None
"""
Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
"""
ghost_layers: Union[bool, int, List[Tuple[int]]] = None
"""
A single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
If left to default, the number of ghost layers is determined automatically from the assignments.
"""
cpu_openmp: Union[bool, int] = False
"""
`True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
of available threads will be chosen.
"""
cpu_vectorize_info: Dict = None
"""
A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
"""
cpu_blocking: Tuple[int] = None
"""
A tuple of block sizes or `None` if no blocking should be applied
"""
omp_single_loop: bool = True
"""
If OpenMP is active: whether multiple outer loops are permitted
"""
base_pointer_specification: Union[List[Iterable[str]], List[Iterable[int]]] = None
"""
Specification of how many and which intermediate pointers are created for a field access.
For example [ (0), (2,3,)] creates on base pointer for coordinates 2 and 3 and writes the offset for coordinate
zero directly in the field access. These specifications are defined dependent on the loop ordering.
This function translates more readable version into the specification above.
For more information see: `pystencils.transformations.create_intermediate_base_pointer`
"""
gpu_indexing: str = 'block'
"""
Either 'block' or 'line' , or custom indexing class, see `pystencils.gpu.AbstractIndexing`
"""
gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
"""
Dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
"""
# TODO Markus rework this docstring
default_assignment_simplifications: bool = False
"""
If `True` default simplifications are first performed on the Assignments. If problems occur during the
simplification a warning will be thrown.
Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
on the level of the `pystencils.AssignmentCollection`. In this part,
`pystencil.simp.create_simplification_strategy` from pystencils.simplificationfactory will be used to
apply optimisations like insertion of constants to
remove pressure from the registers. Thus the first part of the optimisations can only be executed if
an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
"""
cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
"""
List of extra optimizations to perform first on the AST.
"""
use_auto_for_assignments: bool = False
"""
If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
"""
index_fields: List[Field] = None
"""
List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
instead of `create_domain_kernel` is used.
"""
coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
"""
Name of the coordinate fields in the struct data type.
"""
allow_double_writes: bool = False
"""
If True, don't check if every field is only written at a single location. This is required
for example for kernels that are compiled with loop step sizes > 1, that handle multiple
cells at once. Use with care!
"""
skip_independence_check: bool = False
"""
By default the assignment list is checked for read/write independence. This means fields are only written at
locations where they are read. Doing so guarantees thread safety. In some cases e.g. for
periodicity kernel, this can not be assured and does the check needs to be deactivated. Use with care!
"""
class DataTypeFactory:
"""Because of pickle, we need to have a nested class, instead of a lambda in __post_init__"""
def __init__(self, dt):
self.dt = dt
def __call__(self):
return BasicType(self.dt)
def _check_type(self, dtype_to_check):
if isinstance(dtype_to_check, str) and (dtype_to_check == 'float' or dtype_to_check == 'int'):
self._typing_error()
if isinstance(dtype_to_check, type) and not hasattr(dtype_to_check, "dtype"):
# NumPy-types are also of type 'type'. However, they have more properties
self._typing_error()
@staticmethod
def _typing_error():
raise ValueError("It is not possible to use python types (float, int) for datatypes because these "
"types are ambiguous. For example float will map to double. "
"Also the string version like 'float' is not allowed, e.g. use 'float64' instead")
def __post_init__(self):
# ---- Legacy parameters
if not isinstance(self.target, Target):
raise ValueError("target must be provided by the 'Target' enum")
# ---- Auto Backend
if not self.backend:
if self.target == Target.CPU:
self.backend = Backend.C
elif self.target == Target.GPU:
self.backend = Backend.CUDA
else:
raise NotImplementedError(f'Target {self.target} has no default backend')
if not isinstance(self.backend, Backend):
raise ValueError("backend must be provided by the 'Backend' enum")
# Normalise data types
for dtype in [self.data_type, self.default_number_float, self.default_number_int]:
self._check_type(dtype)
if not isinstance(self.data_type, dict):
dt = copy(self.data_type) # The copy is necessary because BasicType has sympy shinanigans
self.data_type = defaultdict(self.DataTypeFactory(dt))
if isinstance(self.data_type, dict) and not isinstance(self.data_type, defaultdict):
for dtype in self.data_type.values():
self._check_type(dtype)
dt = collate_types([BasicType(dtype) for dtype in self.data_type.values()])
dtype_dict = self.data_type
self.data_type = defaultdict(self.DataTypeFactory(dt), dtype_dict)
assert isinstance(self.data_type, defaultdict), "At this point data_type must be a defaultdict!"
for dtype in self.data_type.values():
self._check_type(dtype)
self._check_type(self.data_type.default_factory())
if self.default_number_float is None:
self.default_number_float = self.data_type.default_factory()
if not isinstance(self.default_number_float, BasicType):
self.default_number_float = BasicType(self.default_number_float)
if not isinstance(self.default_number_int, BasicType):
self.default_number_int = BasicType(self.default_number_int)
from pystencils.cpu.kernelcreation import create_kernel, create_indexed_kernel, add_openmp
from pystencils.cpu.cpujit import make_python_function from pystencils.cpu.cpujit import make_python_function
from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas
__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'make_python_function'] __all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
...@@ -13,7 +13,7 @@ in a configuration file. ...@@ -13,7 +13,7 @@ in a configuration file.
3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or 3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
``%HOMEPATH%\.pystencils\config.json`` (Windows) ``%HOMEPATH%\.pystencils\config.json`` (Windows)
If no configuration file is found, a default configuration is created at the above mentioned location in your home. If no configuration file is found, a default configuration is created at the above-mentioned location in your home.
So run *pystencils* once, then edit the created configuration file. So run *pystencils* once, then edit the created configuration file.
...@@ -23,7 +23,7 @@ Compiler Config (Linux) ...@@ -23,7 +23,7 @@ Compiler Config (Linux)
- **'os'**: should be detected automatically as 'linux' - **'os'**: should be detected automatically as 'linux'
- **'command'**: path to C++ compiler (defaults to 'g++') - **'command'**: path to C++ compiler (defaults to 'g++')
- **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler - **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the restrict qualifier is not standardized accross compilers. - **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
For most Linux compilers the qualifier is ``__restrict__`` For most Linux compilers the qualifier is ``__restrict__``
...@@ -39,32 +39,40 @@ Then 'cl.exe' is used to compile. ...@@ -39,32 +39,40 @@ Then 'cl.exe' is used to compile.
where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat' where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat'
- **'arch'**: 'x86' or 'x64' - **'arch'**: 'x86' or 'x64'
- **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated - **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the restrict qualifier is not standardized across compilers. - **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
For Windows compilers the qualifier should be ``__restrict`` For Windows compilers the qualifier should be ``__restrict``
""" """
import os from appdirs import user_cache_dir, user_config_dir
from collections import OrderedDict
import hashlib import hashlib
import importlib.util
import json import json
import os
import platform import platform
import shutil import shutil
import subprocess
import sysconfig
import tempfile
import textwrap import textwrap
from tempfile import TemporaryDirectory import time
import warnings
import pathlib
import numpy as np import numpy as np
import subprocess
from appdirs import user_config_dir, user_cache_dir
from collections import OrderedDict
from pystencils.utils import recursive_dict_update
from sysconfig import get_paths
from pystencils import FieldType from pystencils import FieldType
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import generate_c, get_headers from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.utils import file_handle_for_atomic_write, atomic_file_write from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
from pystencils.cpu.msvc_detection import get_environment
from pystencils.include import get_pystencils_include_path from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
from pystencils.utils import atomic_file_write, recursive_dict_update
def make_python_function(kernel_function_node): def make_python_function(kernel_function_node, custom_backend=None):
""" """
Creates C code from the abstract syntax tree, compiles it and makes it accessible as Python function Creates C code from the abstract syntax tree, compiles it and makes it accessible as Python function
...@@ -73,9 +81,10 @@ def make_python_function(kernel_function_node): ...@@ -73,9 +81,10 @@ def make_python_function(kernel_function_node):
- all symbols which are not defined in the kernel itself are expected as parameters - all symbols which are not defined in the kernel itself are expected as parameters
:param kernel_function_node: the abstract syntax tree :param kernel_function_node: the abstract syntax tree
:param custom_backend: use own custom printer for code generation
:return: kernel functor :return: kernel functor
""" """
result = compile_and_load(kernel_function_node) result = compile_and_load(kernel_function_node, custom_backend)
return result return result
...@@ -115,15 +124,15 @@ def get_configuration_file_path(): ...@@ -115,15 +124,15 @@ def get_configuration_file_path():
# 1) Read path from environment variable if found # 1) Read path from environment variable if found
if 'PYSTENCILS_CONFIG' in os.environ: if 'PYSTENCILS_CONFIG' in os.environ:
return os.environ['PYSTENCILS_CONFIG'], True return os.environ['PYSTENCILS_CONFIG']
# 2) Look in current directory for pystencils.json # 2) Look in current directory for pystencils.json
elif os.path.exists("pystencils.json"): elif os.path.exists("pystencils.json"):
return "pystencils.json", True return "pystencils.json"
# 3) Try ~/.pystencils.json # 3) Try ~/.pystencils.json
elif os.path.exists(config_path_in_home): elif os.path.exists(config_path_in_home):
return config_path_in_home, True return config_path_in_home
else: else:
return config_path_in_home, False return config_path_in_home
def create_folder(path, is_file): def create_folder(path, is_file):
...@@ -143,15 +152,42 @@ def read_config(): ...@@ -143,15 +152,42 @@ def read_config():
('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'), ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__') ('restrict_qualifier', '__restrict__')
]) ])
if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
'-mcpu=native')
elif platform.system().lower() == 'windows': elif platform.system().lower() == 'windows':
default_compiler_config = OrderedDict([ default_compiler_config = OrderedDict([
('os', 'windows'), ('os', 'windows'),
('msvc_version', 'latest'), ('msvc_version', 'latest'),
('arch', 'x64'), ('arch', 'x64'),
('flags', '/Ox /fp:fast /openmp /arch:avx'), ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
('restrict_qualifier', '__restrict') ('restrict_qualifier', '__restrict')
]) ])
if platform.machine() == 'ARM64':
default_compiler_config['arch'] = 'ARM64'
default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
elif platform.system().lower() == 'darwin':
default_compiler_config = OrderedDict([
('os', 'darwin'),
('command', 'clang++'),
('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__')
])
if platform.machine() == 'arm64':
if 'sme' in get_supported_instruction_sets():
flag = '-march=armv8.7-a+sme '
else:
flag = ''
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
'/opt/homebrew/lib/libomp.dylib']:
if os.path.exists(libomp):
default_compiler_config['flags'] += ' ' + libomp
break
else:
raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
(platform.system(),))
default_cache_config = OrderedDict([ default_cache_config = OrderedDict([
('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')), ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
('clear_cache_on_start', False), ('clear_cache_on_start', False),
...@@ -160,26 +196,47 @@ def read_config(): ...@@ -160,26 +196,47 @@ def read_config():
default_config = OrderedDict([('compiler', default_compiler_config), default_config = OrderedDict([('compiler', default_compiler_config),
('cache', default_cache_config)]) ('cache', default_cache_config)])
config_path, config_exists = get_configuration_file_path() from fasteners import InterProcessLock
config_path = pathlib.Path(get_configuration_file_path())
config_path.parent.mkdir(parents=True, exist_ok=True)
config = default_config.copy() config = default_config.copy()
if config_exists:
with open(config_path, 'r') as json_config_file: lockfile = config_path.with_suffix(config_path.suffix + ".lock")
loaded_config = json.load(json_config_file) with InterProcessLock(lockfile):
config = recursive_dict_update(config, loaded_config) if config_path.exists():
else: with open(config_path, 'r') as json_config_file:
create_folder(config_path, True) loaded_config = json.load(json_config_file)
json.dump(config, open(config_path, 'w'), indent=4) config = recursive_dict_update(config, loaded_config)
else:
with open(config_path, 'w') as f:
json.dump(config, f, indent=4)
if config['cache']['object_cache'] is not False: if config['cache']['object_cache'] is not False:
config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid()) config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())
if config['cache']['clear_cache_on_start']: clear_cache_on_start = False
clear_cache() cache_status_file = os.path.join(config['cache']['object_cache'], 'last_config.json')
if os.path.exists(cache_status_file):
# check if compiler config has changed
last_config = json.load(open(cache_status_file, 'r'))
if set(last_config.items()) != set(config['compiler'].items()):
clear_cache_on_start = True
else:
for key in last_config.keys():
if last_config[key] != config['compiler'][key]:
clear_cache_on_start = True
if config['cache']['clear_cache_on_start'] or clear_cache_on_start:
shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
create_folder(config['cache']['object_cache'], False) create_folder(config['cache']['object_cache'], False)
with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
json.dump(config['compiler'], f, indent=4)
os.replace(f.name, cache_status_file)
if config['compiler']['os'] == 'windows': if config['compiler']['os'] == 'windows':
from pystencils.cpu.msvc_detection import get_environment
msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch']) msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
if 'env' not in config['compiler']: if 'env' not in config['compiler']:
config['compiler']['env'] = {} config['compiler']['env'] = {}
...@@ -226,6 +283,7 @@ def clear_cache(): ...@@ -226,6 +283,7 @@ def clear_cache():
create_folder(cache_config['object_cache'], False) create_folder(cache_config['object_cache'], False)
# TODO don't hardcode C type. [1] of tuple output
type_mapping = { type_mapping = {
np.float32: ('PyFloat_AsDouble', 'float'), np.float32: ('PyFloat_AsDouble', 'float'),
np.float64: ('PyFloat_AsDouble', 'double'), np.float64: ('PyFloat_AsDouble', 'double'),
...@@ -237,7 +295,6 @@ type_mapping = { ...@@ -237,7 +295,6 @@ type_mapping = {
np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'), np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'),
} }
template_extract_scalar = """ template_extract_scalar = """
PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}"); PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }}; if( obj_{name} == NULL) {{ PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
...@@ -312,15 +369,14 @@ def equal_size_check(fields): ...@@ -312,15 +369,14 @@ def equal_size_check(fields):
return "" return ""
ref_field = fields[0] ref_field = fields[0]
cond = ["(buffer_{field.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])".format(ref_field=ref_field, cond = [f"(buffer_{field_to_test.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])"
field=field_to_test, i=i)
for field_to_test in fields[1:] for field_to_test in fields[1:]
for i in range(fields[0].spatial_dimensions)] for i in range(fields[0].spatial_dimensions)]
cond = " && ".join(cond) cond = " && ".join(cond)
return template_size_check.format(cond=cond) return template_size_check.format(cond=cond)
def create_function_boilerplate_code(parameter_info, name, insert_checks=True): def create_function_boilerplate_code(parameter_info, name, ast_node, insert_checks=True):
pre_call_code = "" pre_call_code = ""
parameters = [] parameters = []
post_call_code = "" post_call_code = ""
...@@ -332,24 +388,55 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True): ...@@ -332,24 +388,55 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True):
field = param.fields[0] field = param.fields[0]
pre_call_code += template_extract_array.format(name=field.name) pre_call_code += template_extract_array.format(name=field.name)
post_call_code += template_release_buffer.format(name=field.name) post_call_code += template_release_buffer.format(name=field.name)
parameters.append("({dtype} *)buffer_{name}.buf".format(dtype=str(field.dtype), name=field.name)) parameters.append(f"({str(field.dtype)} *)buffer_{field.name}.buf")
if insert_checks: if insert_checks:
np_dtype = field.dtype.numpy_dtype np_dtype = field.dtype.numpy_dtype
item_size = np_dtype.itemsize item_size = np_dtype.itemsize
if np_dtype.isbuiltin and FieldType.is_generic(field): aligned = False
dtype_cond = "buffer_{name}.format[0] == '{format}'".format(name=field.name, if ast_node.assignments:
format=field.dtype.numpy_dtype.char) aligned = any([a.lhs.args[2] for a in ast_node.assignments
if hasattr(a, 'lhs') and isinstance(a.lhs, CastFunc)
and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)])
if ast_node.instruction_set and aligned:
byte_width = ast_node.instruction_set['width'] * item_size
if 'cachelineZero' in ast_node.instruction_set:
has_openmp, has_nontemporal = False, False
for loop in ast_node.atoms(LoopOverCoordinate):
has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
loop.atoms(VectorMemoryAccess)])
if has_openmp and has_nontemporal:
cl_size = ast_node.instruction_set['cachelineSize']
byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
offset = max(max(ast_node.ghost_layers)) * item_size
offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
message = str(offset) + ". This is probably due to a different number of ghost_layers chosen for " \
"the arrays and the kernel creation. If the number of ghost layers for " \
"the kernel creation is not specified it will choose a suitable value " \
"automatically. This value might not " \
"be compatible with the allocated arrays."
if type(byte_width) is not int:
message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\
"cacheline size is required."
pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
expected=message)
if (np_dtype.isbuiltin and FieldType.is_generic(field)
and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)):
dtype_cond = f"buffer_{field.name}.format[0] == '{field.dtype.numpy_dtype.char}'"
pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name, pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name,
expected=str(field.dtype.numpy_dtype)) expected=str(field.dtype.numpy_dtype))
item_size_cond = "buffer_{name}.itemsize == {size}".format(name=field.name, size=item_size) item_size_cond = f"buffer_{field.name}.itemsize == {item_size}"
pre_call_code += template_check_array.format(cond=item_size_cond, what="itemsize", name=field.name, pre_call_code += template_check_array.format(cond=item_size_cond, what="itemsize", name=field.name,
expected=item_size) expected=item_size)
if field.has_fixed_shape: if field.has_fixed_shape:
shape_cond = ["buffer_{name}.shape[{i}] == {s}".format(s=s, name=field.name, i=i) shape_cond = [f"buffer_{field.name}.shape[{i}] == {s}"
for i, s in enumerate(field.spatial_shape)] for i, s in enumerate(field.spatial_shape)]
shape_cond = " && ".join(shape_cond) shape_cond = " && ".join(shape_cond)
pre_call_code += template_check_array.format(cond=shape_cond, what="shape", name=field.name, pre_call_code += template_check_array.format(cond=shape_cond, what="shape", name=field.name,
...@@ -369,14 +456,15 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True): ...@@ -369,14 +456,15 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True):
elif param.is_field_stride: elif param.is_field_stride:
field = param.fields[0] field = param.fields[0]
item_size = field.dtype.numpy_dtype.itemsize item_size = field.dtype.numpy_dtype.itemsize
parameters.append("buffer_{name}.strides[{i}] / {bytes}".format(bytes=item_size, i=param.symbol.coordinate, parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
name=field.name))
elif param.is_field_shape: elif param.is_field_shape:
parameters.append("buffer_{name}.shape[{i}]".format(i=param.symbol.coordinate, name=param.field_name)) parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
else: else:
extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type] extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
pre_call_code += template_extract_scalar.format(extract_function=extract_function, target_type=target_type, pre_call_code += template_extract_scalar.format(extract_function=extract_function,
target_type=target_type,
name=param.symbol.name) name=param.symbol.name)
parameters.append(param.symbol.name) parameters.append(param.symbol.name)
pre_call_code += equal_size_check(variable_sized_normal_fields) pre_call_code += equal_size_check(variable_sized_normal_fields)
...@@ -395,10 +483,17 @@ def create_module_boilerplate_code(module_name, names): ...@@ -395,10 +483,17 @@ def create_module_boilerplate_code(module_name, names):
def load_kernel_from_file(module_name, function_name, path): def load_kernel_from_file(module_name, function_name, path):
from importlib.util import spec_from_file_location, module_from_spec try:
spec = spec_from_file_location(name=module_name, location=path) spec = importlib.util.spec_from_file_location(name=module_name, location=path)
mod = module_from_spec(spec) mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod) spec.loader.exec_module(mod)
except ImportError:
warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
time.sleep(5)
spec = importlib.util.spec_from_file_location(name=module_name, location=path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
return getattr(mod, function_name) return getattr(mod, function_name)
...@@ -407,7 +502,6 @@ def run_compile_step(command): ...@@ -407,7 +502,6 @@ def run_compile_step(command):
config_env = compiler_config['env'] if 'env' in compiler_config else {} config_env = compiler_config['env'] if 'env' in compiler_config else {}
compile_environment = os.environ.copy() compile_environment = os.environ.copy()
compile_environment.update(config_env) compile_environment.update(config_env)
try: try:
shell = True if compiler_config['os'].lower() == 'windows' else False shell = True if compiler_config['os'].lower() == 'windows' else False
subprocess.check_output(command, env=compile_environment, stderr=subprocess.STDOUT, shell=shell) subprocess.check_output(command, env=compile_environment, stderr=subprocess.STDOUT, shell=shell)
...@@ -418,61 +512,74 @@ def run_compile_step(command): ...@@ -418,61 +512,74 @@ def run_compile_step(command):
class ExtensionModuleCode: class ExtensionModuleCode:
def __init__(self, module_name='generated'): def __init__(self, module_name='generated', custom_backend=None):
self.module_name = module_name self.module_name = module_name
self._ast_nodes = [] self._ast_nodes = []
self._function_names = [] self._function_names = []
self._custom_backend = custom_backend
self._code_string = str()
self._code_hash = None
def add_function(self, ast, name=None): def add_function(self, ast, name=None):
self._ast_nodes.append(ast) self._ast_nodes.append(ast)
self._function_names.append(name if name is not None else ast.function_name) self._function_names.append(name if name is not None else ast.function_name)
def write_to_file(self, restrict_qualifier, function_prefix, file): def create_code_string(self, restrict_qualifier, function_prefix):
self._code_string = str()
headers = {'<math.h>', '<stdint.h>'} headers = {'<math.h>', '<stdint.h>'}
for ast in self._ast_nodes: for ast in self._ast_nodes:
for field in ast.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
# Add the half precision header only if half precision numbers occur in the AST
headers.add('"half_precision.h"')
headers.update(get_headers(ast)) headers.update(get_headers(ast))
header_list = list(headers)
header_list.sort() header_list = sorted(headers)
header_list.insert(0, '"Python.h"') header_list.insert(0, '"Python.h"')
ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
header_hash = b''.join([hashlib.sha256(open(h, 'rb').read()).digest() for h in ps_headers])
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) includes = "\n".join([f"#include {include_file}" for include_file in header_list])
print(includes, file=file) self._code_string += includes
print("\n", file=file) self._code_string += "\n"
print("#define RESTRICT %s" % (restrict_qualifier,), file=file) self._code_string += f"#define RESTRICT {restrict_qualifier} \n"
print("#define FUNC_PREFIX %s" % (function_prefix,), file=file) self._code_string += f"#define FUNC_PREFIX {function_prefix}"
print("\n", file=file) self._code_string += "\n"
for ast, name in zip(self._ast_nodes, self._function_names): for ast, name in zip(self._ast_nodes, self._function_names):
old_name = ast.function_name old_name = ast.function_name
ast.function_name = "kernel_" + name ast.function_name = f"kernel_{name}"
print(generate_c(ast), file=file) self._code_string += generate_c(ast, custom_backend=self._custom_backend)
print(create_function_boilerplate_code(ast.get_parameters(), name), file=file) self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast)
ast.function_name = old_name ast.function_name = old_name
print(create_module_boilerplate_code(self.module_name, self._function_names), file=file)
self._code_hash = "mod_" + hashlib.sha256(self._code_string.encode() + header_hash).hexdigest()
self._code_string += create_module_boilerplate_code(self._code_hash, self._function_names)
def get_hash_of_code(self):
assert self._code_string, "The code must be generated first"
return self._code_hash
class KernelWrapper: def write_to_file(self, file):
def __init__(self, kernel, parameters, ast_node): assert self._code_string, "The code must be generated first"
self.kernel = kernel print(self._code_string, file=file)
self.parameters = parameters
self.ast = ast_node
def __call__(self, **kwargs):
return self.kernel(**kwargs)
def compile_module(code, code_hash, base_dir, compile_flags=None):
if compile_flags is None:
compile_flags = []
def compile_module(code, code_hash, base_dir):
compiler_config = get_compiler_config() compiler_config = get_compiler_config()
extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
if compiler_config['os'].lower() == 'windows': if compiler_config['os'].lower() == 'windows':
function_prefix = '__declspec(dllexport)'
lib_suffix = '.pyd' lib_suffix = '.pyd'
object_suffix = '.obj' object_suffix = '.obj'
windows = True windows = True
else: else:
function_prefix = ''
lib_suffix = '.so' lib_suffix = '.so'
object_suffix = '.o' object_suffix = '.o'
windows = False windows = False
...@@ -482,8 +589,11 @@ def compile_module(code, code_hash, base_dir): ...@@ -482,8 +589,11 @@ def compile_module(code, code_hash, base_dir):
object_file = os.path.join(base_dir, code_hash + object_suffix) object_file = os.path.join(base_dir, code_hash + object_suffix)
if not os.path.exists(object_file): if not os.path.exists(object_file):
with file_handle_for_atomic_write(src_file) as f: try:
code.write_to_file(compiler_config['restrict_qualifier'], function_prefix, f) with open(src_file, 'x') as f:
code.write_to_file(f)
except FileExistsError:
pass
if windows: if windows:
compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split() compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split()
...@@ -497,11 +607,15 @@ def compile_module(code, code_hash, base_dir): ...@@ -497,11 +607,15 @@ def compile_module(code, code_hash, base_dir):
# Linking # Linking
if windows: if windows:
import sysconfig
config_vars = sysconfig.get_config_vars() config_vars = sysconfig.get_config_vars()
py_lib = os.path.join(config_vars["installed_base"], "libs", py_lib = os.path.join(config_vars["installed_base"], "libs",
"python{}.lib".format(config_vars["py_version_nodot"])) f"python{config_vars['py_version_nodot']}.lib")
run_compile_step(['link.exe', py_lib, '/DLL', '/out:' + lib_file, object_file]) run_compile_step(['link.exe', py_lib, '/DLL', '/out:' + lib_file, object_file])
elif platform.system().lower() == 'darwin':
with atomic_file_write(lib_file) as file_name:
run_compile_step([compiler_config['command'], '-shared', object_file, '-o', file_name, '-undefined',
'dynamic_lookup']
+ compiler_config['flags'].split())
else: else:
with atomic_file_write(lib_file) as file_name: with atomic_file_write(lib_file) as file_name:
run_compile_step([compiler_config['command'], '-shared', object_file, '-o', file_name] run_compile_step([compiler_config['command'], '-shared', object_file, '-o', file_name]
...@@ -509,18 +623,34 @@ def compile_module(code, code_hash, base_dir): ...@@ -509,18 +623,34 @@ def compile_module(code, code_hash, base_dir):
return lib_file return lib_file
def compile_and_load(ast): def compile_and_load(ast, custom_backend=None):
cache_config = get_cache_config() cache_config = get_cache_config()
code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c').encode()).hexdigest()
code = ExtensionModuleCode(module_name=code_hash_str) compiler_config = get_compiler_config()
if compiler_config['os'].lower() == 'windows':
function_prefix = '__declspec(dllexport)'
elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
function_prefix = ast.instruction_set['function_prefix']
else:
function_prefix = ''
code = ExtensionModuleCode(custom_backend=custom_backend)
code.add_function(ast, ast.function_name) code.add_function(ast, ast.function_name)
code.create_code_string(compiler_config['restrict_qualifier'], function_prefix)
code_hash_str = code.get_hash_of_code()
compile_flags = []
if ast.instruction_set and 'compile_flags' in ast.instruction_set:
compile_flags = ast.instruction_set['compile_flags']
if cache_config['object_cache'] is False: if cache_config['object_cache'] is False:
with TemporaryDirectory() as base_dir: with tempfile.TemporaryDirectory() as base_dir:
lib_file = compile_module(code, code_hash_str, base_dir) lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file) result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
else: else:
lib_file = compile_module(code, code_hash_str, base_dir=cache_config['object_cache']) lib_file = compile_module(code, code_hash_str, base_dir=cache_config['object_cache'],
compile_flags=compile_flags)
result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file) result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
return KernelWrapper(result, ast.get_parameters(), ast) return KernelWrapper(result, ast.get_parameters(), ast)
import sympy as sp import sympy as sp
from functools import partial
from pystencils.astnodes import SympyAssignment, Block, LoopOverCoordinate, KernelFunction
from pystencils.transformations import resolve_buffer_accesses, resolve_field_accesses, make_loop_over_domain, \
add_types, get_optimal_loop_ordering, parse_base_pointer_info, move_constants_before_loop, \
split_inner_loop, get_base_buffer_index
from pystencils.data_types import TypedSymbol, BasicType, StructType, create_type
from pystencils.field import Field, FieldType
import pystencils.astnodes as ast import pystencils.astnodes as ast
from pystencils.config import CreateKernelConfig
from pystencils.enums import Target, Backend
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.cpujit import make_python_function from pystencils.cpu.cpujit import make_python_function
from pystencils.assignment import Assignment from pystencils.typing import StructType, TypedSymbol, create_type
from typing import List, Union from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType
AssignmentOrAstNodeList = List[Union[Assignment, ast.Node]] from pystencils.node_collection import NodeCollection
from pystencils.transformations import (
filtered_tree_iteration, iterate_loops_by_depth, get_base_buffer_index, get_optimal_loop_ordering,
make_loop_over_domain, add_outer_loop_over_indexed_elements,
move_constants_before_loop, parse_base_pointer_info, resolve_buffer_accesses,
resolve_field_accesses, split_inner_loop)
def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "kernel", type_info='double', def create_kernel(assignments: NodeCollection,
split_groups=(), iteration_slice=None, ghost_layers=None, config: CreateKernelConfig) -> KernelFunction:
skip_independence_check=False) -> KernelFunction:
"""Creates an abstract syntax tree for a kernel function, by taking a list of update rules. """Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations. Loops are created according to the field accesses in the equations.
...@@ -24,35 +25,25 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke ...@@ -24,35 +25,25 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
Args: Args:
assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`. assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel Defining the update rules of the kernel
function_name: name of the generated function - only important if generated code is written out config: create kernel config
type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean which are assumed to be 'bool' .
split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
iteration_slice: if not None, iteration is done only over this slice of the field
ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
if None, the number of ghost layers is determined automatically and assumed to be equal for a
all dimensions
skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
periodicity kernel, that access the field outside the iteration bounds. Use with care!
Returns: Returns:
AST node representing a function, that can be printed as C or CUDA code AST node representing a function, that can be printed as C or CUDA code
""" """
function_name = config.function_name
iteration_slice = config.iteration_slice
ghost_layers = config.ghost_layers
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
def type_symbol(term): split_groups = ()
if isinstance(term, Field.Access) or isinstance(term, TypedSymbol): if 'split_groups' in assignments.simplification_hints:
return term split_groups = assignments.simplification_hints['split_groups']
elif isinstance(term, sp.Symbol): assignments = assignments.all_assignments
if not hasattr(type_info, '__getitem__'):
return TypedSymbol(term.name, create_type(type_info)) # TODO Cleanup: move add_types to create_domain_kernel or create_kernel
else: assignments = add_types(assignments, config)
return TypedSymbol(term.name, type_info[term.name])
else:
raise ValueError("Term has to be field access or symbol")
fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check)
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
...@@ -61,15 +52,33 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke ...@@ -61,15 +52,33 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
body = ast.Block(assignments) body = ast.Block(assignments)
loop_order = get_optimal_loop_ordering(fields_without_buffers) loop_order = get_optimal_loop_ordering(fields_without_buffers)
ast_node = make_loop_over_domain(body, function_name, iteration_slice=iteration_slice, loop_node, ghost_layer_info = make_loop_over_domain(body, iteration_slice=iteration_slice,
ghost_layers=ghost_layers, loop_order=loop_order) ghost_layers=ghost_layers, loop_order=loop_order)
ast_node.target = 'cpu' loop_node = add_outer_loop_over_indexed_elements(loop_node)
ast_node = KernelFunction(loop_node, Target.CPU, Backend.C, compile_function=make_python_function,
ghost_layers=ghost_layer_info, function_name=function_name, assignments=assignments)
if split_groups: if split_groups:
type_info = config.data_type
def type_symbol(term):
if isinstance(term, Field.Access) or isinstance(term, TypedSymbol):
return term
elif isinstance(term, sp.Symbol):
if isinstance(type_info, str) or not hasattr(type_info, '__getitem__'):
return TypedSymbol(term.name, create_type(type_info))
else:
return TypedSymbol(term.name, type_info[term.name])
else:
raise ValueError("Term has to be field access or symbol")
typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups] typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
split_inner_loop(ast_node, typed_split_groups) split_inner_loop(ast_node, typed_split_groups)
base_pointer_spec = [['spatialInner0'], ['spatialInner1']] if len(loop_order) >= 2 else [['spatialInner0']] base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None:
base_pointer_spec = []
base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order, base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order,
field.spatial_dimensions, field.index_dimensions) field.spatial_dimensions, field.index_dimensions)
for field in fields_without_buffers} for field in fields_without_buffers}
...@@ -81,14 +90,14 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke ...@@ -81,14 +90,14 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
if any(FieldType.is_buffer(f) for f in all_fields): if any(FieldType.is_buffer(f) for f in all_fields):
resolve_buffer_accesses(ast_node, get_base_buffer_index(ast_node), read_only_fields) resolve_buffer_accesses(ast_node, get_base_buffer_index(ast_node), read_only_fields)
# TODO think about typing
resolve_field_accesses(ast_node, read_only_fields, field_to_base_pointer_info=base_pointer_info) resolve_field_accesses(ast_node, read_only_fields, field_to_base_pointer_info=base_pointer_info)
move_constants_before_loop(ast_node) move_constants_before_loop(ast_node)
ast_node.compile = partial(make_python_function, ast_node)
return ast_node return ast_node
def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, function_name="kernel", def create_indexed_kernel(assignments: NodeCollection,
type_info=None, coordinate_names=('x', 'y', 'z')) -> KernelFunction: config: CreateKernelConfig) -> KernelFunction:
""" """
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling. coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
...@@ -100,33 +109,41 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu ...@@ -100,33 +109,41 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
Args: Args:
assignments: list of assignments assignments: list of assignments
index_fields: list of index fields, i.e. 1D fields with struct data type config: Kernel configuration
type_info: see documentation of :func:`create_kernel`
function_name: see documentation of :func:`create_kernel`
coordinate_names: name of the coordinate fields in the struct data type
""" """
fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False) function_name = config.function_name
index_fields = config.index_fields
coordinate_names = config.coordinate_names
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
# extract the index fields based on the name. The original index field might have been modified
index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates " \
f"Non index fields are {non_index_fields}, spatial coordinates are " \
f"{spatial_coordinates}"
spatial_coordinates = list(spatial_coordinates)[0]
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
for index_field in index_fields: for index_field in index_fields:
index_field.field_type = FieldType.INDEXED index_field.field_type = FieldType.INDEXED
assert FieldType.is_indexed(index_field) assert FieldType.is_indexed(index_field)
assert index_field.spatial_dimensions == 1, "Index fields have to be 1D" assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
spatial_coordinates = list(spatial_coordinates)[0]
def get_coordinate_symbol_assignment(name): def get_coordinate_symbol_assignment(name):
for idx_field in index_fields: for idx_field in index_fields:
assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type" assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type"
data_type = idx_field.dtype data_type = idx_field.dtype
if data_type.has_element(name): if data_type.has_element(name):
rhs = idx_field[0](name) rhs = idx_field[0](name)
lhs = TypedSymbol(name, BasicType(data_type.get_element_type(name))) lhs = TypedSymbol(name, data_type.get_element_type(name))
return SympyAssignment(lhs, rhs) return SympyAssignment(lhs, rhs)
raise ValueError("Index %s not found in any of the passed index fields" % (name,)) raise ValueError(f"Index {name} not found in any of the passed index fields")
coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n) coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n)
for n in coordinate_names[:spatial_coordinates]] for n in coordinate_names[:spatial_coordinates]]
...@@ -141,18 +158,18 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu ...@@ -141,18 +158,18 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
loop_body.append(assignment) loop_body.append(assignment)
function_body = Block([loop_node]) function_body = Block([loop_node])
ast_node = KernelFunction(function_body, backend="cpu", function_name=function_name) ast_node = KernelFunction(function_body, Target.CPU, Backend.C, make_python_function,
ghost_layers=None, function_name=function_name, assignments=assignments)
fixed_coordinate_mapping = {f.name: coordinate_typed_symbols for f in non_index_fields} fixed_coordinate_mapping = {f.name: coordinate_typed_symbols for f in non_index_fields}
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
resolve_field_accesses(ast_node, read_only_fields, field_to_fixed_coordinates=fixed_coordinate_mapping) resolve_field_accesses(ast_node, read_only_fields, field_to_fixed_coordinates=fixed_coordinate_mapping)
move_constants_before_loop(ast_node) move_constants_before_loop(ast_node)
ast_node.compile = partial(make_python_function, ast_node)
return ast_node return ast_node
def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None): def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, assume_single_outer_loop=True):
"""Parallelize the outer loop with OpenMP. """Parallelize the outer loop with OpenMP.
Args: Args:
...@@ -160,41 +177,57 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None): ...@@ -160,41 +177,57 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic' schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
num_threads: explicitly specify number of threads num_threads: explicitly specify number of threads
collapse: number of nested loops to include in parallel region (see OpenMP collapse) collapse: number of nested loops to include in parallel region (see OpenMP collapse)
assume_single_outer_loop: if True an exception is raised if multiple outer loops are detected for all but
optimized staggered kernels the single-outer-loop assumption should be true
""" """
if not num_threads: if not num_threads:
return return
assert type(ast_node) is ast.KernelFunction assert type(ast_node) is ast.KernelFunction
body = ast_node.body body = ast_node.body
threads_clause = "" if num_threads and isinstance(num_threads, bool) else " num_threads(%s)" % (num_threads,) threads_clause = "" if num_threads and isinstance(num_threads, bool) else f" num_threads({num_threads})"
wrapper_block = ast.PragmaBlock('#pragma omp parallel' + threads_clause, body.take_child_nodes()) wrapper_block = ast.PragmaBlock('#pragma omp parallel' + threads_clause, body.take_child_nodes())
body.append(wrapper_block) body.append(wrapper_block)
outer_loops = [l for l in body.atoms(ast.LoopOverCoordinate) if l.is_outermost_loop] outer_loops = [l for l in filtered_tree_iteration(body, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_outermost_loop]
assert outer_loops, "No outer loop found" assert outer_loops, "No outer loop found"
assert len(outer_loops) <= 1, "More than one outer loop found. Not clear where to put OpenMP pragma." if assume_single_outer_loop and len(outer_loops) > 1:
loop_to_parallelize = outer_loops[0] raise ValueError("More than one outer loop found, only one outer loop expected")
try:
loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start) for loop_to_parallelize in outer_loops:
except TypeError: try:
loop_range = None loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start)
except TypeError:
if num_threads is None: loop_range = None
import multiprocessing
num_threads = multiprocessing.cpu_count() if loop_range is not None and loop_range < num_threads and not collapse:
contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
if loop_range is not None and loop_range < num_threads and not collapse: if len(contained_loops) == 1:
contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)] contained_loop = contained_loops[0]
if len(contained_loops) == 1: try:
contained_loop = contained_loops[0] contained_loop_range = int(contained_loop.stop - contained_loop.start)
try: if contained_loop_range > loop_range:
contained_loop_range = int(contained_loop.stop - contained_loop.start) loop_to_parallelize = contained_loop
if contained_loop_range > loop_range: except TypeError:
loop_to_parallelize = contained_loop pass
except TypeError:
pass prefix = f"#pragma omp for schedule({schedule})"
if collapse:
prefix = "#pragma omp for schedule(%s)" % (schedule,) prefix += f" collapse({collapse})"
if collapse: loop_to_parallelize.prefix_lines.append(prefix)
prefix += " collapse(%d)" % (collapse, )
loop_to_parallelize.prefix_lines.append(prefix)
def add_pragmas(ast_node, pragma_lines, nesting_depth=-1):
"""Prepends given pragma lines to all loops of specified nesting depth.
Args:
ast_node: pystencils abstract syntax tree
pragma_lines: Iterable of strings containing the pragma lines
nesting_depth: Nesting depth of the loops the pragmas should be applied to.
Outermost loop has depth 0.
A depth of -1 indicates the innermost loops.
"""
loop_nodes = iterate_loops_by_depth(ast_node, nesting_depth)
for n in loop_nodes:
n.prefix_lines += list(pragma_lines)
import subprocess
import os import os
import subprocess
def get_environment(version_specifier, arch='x64'): def get_environment(version_specifier, arch='x64'):
...@@ -71,7 +71,7 @@ def normalize_msvc_version(version): ...@@ -71,7 +71,7 @@ def normalize_msvc_version(version):
def get_environment_from_vc_vars_file(vc_vars_file, arch): def get_environment_from_vc_vars_file(vc_vars_file, arch):
out = subprocess.check_output( out = subprocess.check_output(
'cmd /u /c "{}" {} && set'.format(vc_vars_file, arch), f'cmd /u /c "{vc_vars_file}" {arch} && set',
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
).decode('utf-16le', errors='replace') ).decode('utf-16le', errors='replace')
......
import sympy as sp
import warnings import warnings
from typing import Union, Container from typing import Container, Union
from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
from pystencils.fast_approximation import fast_division, fast_sqrt, fast_inv_sqrt import numpy as np
from pystencils.integer_functions import modulo_floor, modulo_ceil import sympy as sp
from pystencils.sympyextensions import fast_subs from sympy.logic.boolalg import BooleanFunction, BooleanAtom
from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
collate_types, PointerType
import pystencils.astnodes as ast import pystencils.astnodes as ast
from pystencils.transformations import cut_loop, filtered_tree_iteration, replace_inner_stride_with_one from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types,
get_type_of_expression, VectorMemoryAccess)
from pystencils.functions import DivFunc
from pystencils.field import Field from pystencils.field import Field
from pystencils.integer_functions import modulo_ceil, modulo_floor
from pystencils.sympyextensions import fast_subs
from pystencils.transformations import cut_loop, filtered_tree_iteration, replace_inner_stride_with_one
# noinspection PyPep8Naming
class vec_any(sp.Function):
nargs = (1,)
# noinspection PyPep8Naming
class vec_all(sp.Function):
nargs = (1,)
class NontemporalFence(ast.Node):
def __init__(self):
super(NontemporalFence, self).__init__(parent=None)
@property
def symbols_defined(self):
return set()
def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx', @property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, NontemporalFence)
class CachelineSize(ast.Node):
symbol = sp.Symbol("_clsize")
mask_symbol = sp.Symbol("_clsize_mask")
last_symbol = sp.Symbol("_cl_lastvec")
def __init__(self):
super(CachelineSize, self).__init__(parent=None)
@property
def symbols_defined(self):
return {self.symbol, self.mask_symbol, self.last_symbol}
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, CachelineSize)
def __hash__(self):
return hash(self.symbol)
def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False, assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True): assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
# TODO Vectorization Revamp we first introduce the remainder loop and then check if we can even vectorise.
# Maybe first copy the ast and return the copied version on failure
"""Explicit vectorization using SIMD vectorization via intrinsics. """Explicit vectorization using SIMD vectorization via intrinsics.
Args: Args:
...@@ -37,9 +100,14 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx', ...@@ -37,9 +100,14 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
depending on the access pattern there might be additional padding depending on the access pattern there might be additional padding
required at the end of the array required at the end of the array
""" """
if instruction_set == 'best':
if get_supported_instruction_sets():
instruction_set = get_supported_instruction_sets()[-1]
else:
instruction_set = 'avx'
if instruction_set is None: if instruction_set is None:
return return
all_fields = kernel_ast.fields_accessed all_fields = kernel_ast.fields_accessed
if nontemporal is None or nontemporal is False: if nontemporal is None or nontemporal is False:
nontemporal = {} nontemporal = {}
...@@ -55,37 +123,53 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx', ...@@ -55,37 +123,53 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
"to differently typed floating point fields") "to differently typed floating point fields")
float_size = field_float_dtypes.pop().numpy_dtype.itemsize float_size = field_float_dtypes.pop().numpy_dtype.itemsize
assert float_size in (8, 4) assert float_size in (8, 4)
vector_is = get_vector_instruction_set('double' if float_size == 8 else 'float', default_float_type = 'float64' if float_size == 8 else 'float32'
instruction_set=instruction_set) vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
vector_width = vector_is['width']
kernel_ast.instruction_set = vector_is kernel_ast.instruction_set = vector_is
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, if nontemporal and 'cachelineZero' in vector_is:
nontemporal, assume_sufficient_line_padding) kernel_ast.use_all_written_field_sizes = True
insert_vector_casts(kernel_ast) strided = 'storeS' in vector_is and 'loadS' in vector_is
keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
strided, keep_loop_stop, assume_sufficient_line_padding,
default_float_type)
def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields, def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields,
assume_sufficient_line_padding): strided, keep_loop_stop, assume_sufficient_line_padding,
default_float_type):
"""Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type.""" """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment) all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
inner_loops = [n for n in all_loops if n.is_innermost_loop] inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops} zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
vector_is = ast_node.instruction_set
assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
vector_width = vector_is['width']
vector_int_width = vector_is['intwidth']
for loop_node in inner_loops: for loop_node in inner_loops:
loop_range = loop_node.stop - loop_node.start loop_range = loop_node.stop - loop_node.start
# cut off loop tail, that is not a multiple of four # cut off loop tail, that is not a multiple of four
if assume_aligned and assume_sufficient_line_padding: if keep_loop_stop:
pass
elif assume_aligned and assume_sufficient_line_padding:
loop_range = loop_node.stop - loop_node.start loop_range = loop_node.stop - loop_node.start
new_stop = loop_node.start + modulo_ceil(loop_range, vector_width) new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
loop_node.stop = new_stop loop_node.stop = new_stop
else: else:
cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
loop_nodes = cut_loop(loop_node, [cutting_point]) # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop
assert len(loop_nodes) in (1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
if isinstance(loop, ast.LoopOverCoordinate)]
assert len(loop_nodes) in (0, 1, 2) # 2 for main and tail loop, 1 if loop range divisible by vector width
if len(loop_nodes) == 0:
continue
loop_node = loop_nodes[0] loop_node = loop_nodes[0]
# loop_node is the vectorized one
# Find all array accesses (indexed) that depend on the loop counter as offset # Find all array accesses (indexed) that depend on the loop counter as offset
loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over) loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
substitutions = {} substitutions = {}
...@@ -93,54 +177,184 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a ...@@ -93,54 +177,184 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
for indexed in loop_node.atoms(sp.Indexed): for indexed in loop_node.atoms(sp.Indexed):
base, index = indexed.args base, index = indexed.args
if loop_counter_symbol in index.atoms(sp.Symbol): if loop_counter_symbol in index.atoms(sp.Symbol):
if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
# don't need to generate the alignment check when there are no aligned load/store instructions
aligned_access = False
else:
if not isinstance(vector_width, int):
raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
'vector ISAs')
aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms() loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0 stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
if not loop_counter_is_offset: if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
successful = False successful = False
break break
typed_symbol = base.label typed_symbol = base.label
assert type(typed_symbol.dtype) is PointerType, \ assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}"
"Type of access is {}, {}".format(typed_symbol.dtype, indexed)
vec_type = VectorType(typed_symbol.dtype.base_type, vector_width) vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
use_aligned_access = aligned_access and assume_aligned use_aligned_access = aligned_access and assume_aligned
nontemporal = False nontemporal = False
if hasattr(indexed, 'field'): if hasattr(indexed, 'field'):
nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields) nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal) substitutions[indexed] = VectorMemoryAccess(indexed, vec_type, use_aligned_access, nontemporal, True,
stride if strided else 1)
if nontemporal:
# insert NontemporalFence after the outermost loop
parent = loop_node.parent
while type(parent.parent.parent) is not ast.KernelFunction:
parent = parent.parent
parent.parent.insert_after(NontemporalFence(), parent, if_not_exists=True)
# insert CachelineSize at the beginning of the kernel
parent.parent.insert_front(CachelineSize(), if_not_exists=True)
if not successful: if not successful:
warnings.warn("Could not vectorize loop because of non-consecutive memory access") warnings.warn("Could not vectorize loop because of non-consecutive memory access")
continue continue
loop_node.step = vector_width loop_node.step = vector_width
loop_node.subs(substitutions) loop_node.subs(substitutions)
arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
VectorType(loop_counter_symbol.dtype, vector_int_width))
vector_loop_counter = arg_1 + arg_2
fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, VectorMemoryAccess))
mask_conditionals(loop_node)
from pystencils.rng import RNGBase
substitutions = {}
for rng in loop_node.atoms(RNGBase):
new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
for s in rng.result_symbols]
substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
rng._symbols_defined = set(new_result_symbols)
fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
insert_vector_casts(loop_node, vector_is, default_float_type)
def mask_conditionals(loop_body):
def visit_node(node, mask):
if isinstance(node, ast.Conditional):
cond = node.condition_expr
skip = (loop_body.loop_counter_symbol not in cond.atoms(sp.Symbol)) or cond.func in (vec_all, vec_any)
cond = True if skip else cond
true_mask = sp.And(cond, mask)
visit_node(node.true_block, true_mask)
if node.false_block:
false_mask = sp.And(sp.Not(node.condition_expr), mask)
visit_node(node, false_mask)
if not skip:
node.condition_expr = vec_any(node.condition_expr)
elif isinstance(node, ast.SympyAssignment):
if mask is not True:
s = {ma: VectorMemoryAccess(*ma.args[0:4], sp.And(mask, ma.args[4]), *ma.args[5:])
for ma in node.atoms(VectorMemoryAccess)}
node.subs(s)
else:
for arg in node.args:
visit_node(arg, mask)
visit_node(loop_body, mask=True)
def insert_vector_casts(ast_node): def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
"""Inserts necessary casts from scalar values to vector values.""" """Inserts necessary casts from scalar values to vector values."""
handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt) handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs)
def visit_expr(expr): def is_scalar(expr) -> bool:
if hasattr(expr, "dtype"):
if type(expr.dtype) is VectorType:
return False
# Else branch: If expr is a CastFunc, then whether the expression
# is scalar is determined by the argument (remember: vector casts
# are not inserted yet). Therefore, we must recurse into the args of
# expr below. Otherwise, this expression is atomic and in that case
# it is assumed to be scalar below.
if isinstance(expr, cast_func) or isinstance(expr, vector_memory_access): if isinstance(expr, ast.ResolvedFieldAccess):
return expr # expr.field is not in expr.args
elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, sp.boolalg.BooleanFunction): return is_scalar(expr.field)
new_args = [visit_expr(a) for a in expr.args] elif isinstance(expr, (vec_any, vec_all)):
arg_types = [get_type_of_expression(a) for a in new_args] return True
if not hasattr(expr, "args"):
return True
return all(is_scalar(arg) for arg in expr.args)
# TODO Vectorization Revamp: get rid of default_type
def visit_expr(expr, default_type='double', force_vectorize=False):
if isinstance(expr, VectorMemoryAccess):
return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type, force_vectorize),
*expr.args[5:])
elif isinstance(expr, CastFunc):
cast_type = expr.args[1]
arg = visit_expr(expr.args[0], default_type, force_vectorize)
assert cast_type in [BasicType('float32'), BasicType('float64')], \
f'Vectorization cannot vectorize type {cast_type}'
return expr.func(arg, VectorType(cast_type, instruction_set['width']))
elif expr.func is sp.Abs and 'abs' not in instruction_set:
new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \
else get_type_of_expression(expr.args[0])
pw = sp.Piecewise((-new_arg, new_arg < CastFunc(0, base_type.numpy_dtype)),
(new_arg, True))
return visit_expr(pw, default_type, force_vectorize)
elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
if expr.func is sp.Mul and expr.args[0] == -1:
# special treatment for the unary minus: make sure that the -1 has the same type as the argument
dtype = int
for arg in expr.atoms(VectorMemoryAccess):
if arg.dtype.base_type.is_float():
dtype = arg.dtype.base_type.numpy_dtype.type
for arg in expr.atoms(TypedSymbol):
if type(arg.dtype) is VectorType and arg.dtype.base_type.is_float():
dtype = arg.dtype.base_type.numpy_dtype.type
if dtype is not int:
if dtype is np.float32:
default_type = 'float'
expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:])
new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args]
arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
if not any(type(t) is VectorType for t in arg_types): if not any(type(t) is VectorType for t in arg_types):
return expr return expr
else: else:
target_type = collate_types(arg_types) target_type = collate_types(arg_types)
casted_args = [cast_func(a, target_type) if t != target_type else a casted_args = [
for a, t in zip(new_args, arg_types)] CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
for a, t in zip(new_args, arg_types)]
return expr.func(*casted_args) return expr.func(*casted_args)
elif expr.func is sp.UnevaluatedExpr:
assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow"
# TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should
# TODO be fixed for real at some point.
if expr.args[0].is_Pow:
base = expr.args[0].base
exp = expr.args[0].exp
expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False))
new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args[0].args]
arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
target_type = collate_types(arg_types)
if not any(type(t) is VectorType for t in arg_types):
target_type = VectorType(target_type, instruction_set['width'])
casted_args = [
CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
for a, t in zip(new_args, arg_types)]
return expr.func(expr.args[0].func(*casted_args, evaluate=False))
elif expr.func is sp.Pow: elif expr.func is sp.Pow:
new_arg = visit_expr(expr.args[0]) new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
return expr.func(new_arg, expr.args[1]) return expr.func(new_arg, expr.args[1])
elif expr.func == sp.Piecewise: elif expr.func == sp.Piecewise:
new_results = [visit_expr(a[0]) for a in expr.args] new_results = [visit_expr(a[0], default_type, force_vectorize) for a in expr.args]
new_conditions = [visit_expr(a[1]) for a in expr.args] new_conditions = [visit_expr(a[1], default_type, force_vectorize) for a in expr.args]
types_of_results = [get_type_of_expression(a) for a in new_results] types_of_results = [get_type_of_expression(a) for a in new_results]
types_of_conditions = [get_type_of_expression(a) for a in new_conditions] types_of_conditions = [get_type_of_expression(a) for a in new_conditions]
...@@ -151,38 +365,61 @@ def insert_vector_casts(ast_node): ...@@ -151,38 +365,61 @@ def insert_vector_casts(ast_node):
if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType: if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
condition_target_type = VectorType(condition_target_type, width=result_target_type.width) condition_target_type = VectorType(condition_target_type, width=result_target_type.width)
casted_results = [cast_func(a, result_target_type) if t != result_target_type else a casted_results = [CastFunc(a, result_target_type) if t != result_target_type else a
for a, t in zip(new_results, types_of_results)] for a, t in zip(new_results, types_of_results)]
casted_conditions = [cast_func(a, condition_target_type) casted_conditions = [CastFunc(a, condition_target_type)
if t != condition_target_type and a is not True else a if t != condition_target_type and a is not True else a
for a, t in zip(new_conditions, types_of_conditions)] for a, t in zip(new_conditions, types_of_conditions)]
return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)]) return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)])
else: elif isinstance(expr, TypedSymbol):
if force_vectorize:
expr_type = get_type_of_expression(expr)
if type(expr_type) is not VectorType:
vector_type = VectorType(expr_type, instruction_set['width'])
return CastFunc(expr, vector_type)
return expr return expr
elif isinstance(expr, (sp.Number, BooleanAtom)):
return expr
else:
raise NotImplementedError(f'Due to defensive programming we handle only specific expressions.\n'
f'The expression {expr} of type {type(expr)} is not known yet.')
def visit_node(node, substitution_dict): def visit_node(node, substitution_dict, default_type='double'):
substitution_dict = substitution_dict.copy() substitution_dict = substitution_dict.copy()
for arg in node.args: for arg in node.args:
if isinstance(arg, ast.SympyAssignment): if isinstance(arg, ast.SympyAssignment):
assignment = arg assignment = arg
# If there is a remainder loop we do not vectorise it, thus lhs will indicate this
# if isinstance(assignment.lhs, ast.ResolvedFieldAccess):
# continue
subs_expr = fast_subs(assignment.rhs, substitution_dict, subs_expr = fast_subs(assignment.rhs, substitution_dict,
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess)) skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
assignment.rhs = visit_expr(subs_expr)
rhs_type = get_type_of_expression(assignment.rhs) # If either side contains a vectorized subexpression, both sides
# must be fully vectorized.
lhs_scalar = is_scalar(assignment.lhs)
rhs_scalar = is_scalar(subs_expr)
assignment.rhs = visit_expr(subs_expr, default_type, force_vectorize=not (lhs_scalar and rhs_scalar))
if isinstance(assignment.lhs, TypedSymbol): if isinstance(assignment.lhs, TypedSymbol):
lhs_type = assignment.lhs.dtype if lhs_scalar and not rhs_scalar:
if type(rhs_type) is VectorType and type(lhs_type) is not VectorType: lhs_type = get_type_of_expression(assignment.lhs)
rhs_type = get_type_of_expression(assignment.rhs)
new_lhs_type = VectorType(lhs_type, rhs_type.width) new_lhs_type = VectorType(lhs_type, rhs_type.width)
new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type) new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type)
substitution_dict[assignment.lhs] = new_lhs substitution_dict[assignment.lhs] = new_lhs
assignment.lhs = new_lhs assignment.lhs = new_lhs
elif isinstance(assignment.lhs.func, cast_func): elif isinstance(assignment.lhs, VectorMemoryAccess):
lhs_type = assignment.lhs.args[1] assignment.lhs = visit_expr(assignment.lhs, default_type)
if type(lhs_type) is VectorType and type(rhs_type) is not VectorType: elif isinstance(arg, ast.Conditional):
assignment.rhs = cast_func(assignment.rhs, lhs_type) arg.condition_expr = fast_subs(arg.condition_expr, substitution_dict,
skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
arg.condition_expr = visit_expr(arg.condition_expr, default_type)
visit_node(arg, substitution_dict, default_type)
else: else:
visit_node(arg, substitution_dict) visit_node(arg, substitution_dict, default_type)
visit_node(ast_node, {}) visit_node(ast_node, {}, default_float_type)