Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 0 additions and 2671 deletions
from tempfile import TemporaryDirectory
import sympy as sp
from collections import defaultdict
import kerncraft
from kerncraft.kerncraft import KernelCode
from typing import Optional
from kerncraft.machinemodel import MachineModel
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.astnodes import LoopOverCoordinate, SympyAssignment, ResolvedFieldAccess, KernelFunction
from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
import warnings
class PyStencilsKerncraftKernel(KernelCode):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
Args:
ast: pystencils ast
machine: kerncraft machine model - specify this if kernel needs to be compiled
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structures of array (SoA) or
array of structures (AoS) layout is assumed
"""
kerncraft.kernel.Kernel.__init__(self, machine)
# Initialize state
self.asm_block = None
self._filename = filename
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
self._keep_intermediates = debug_print
# Loops
inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
else:
if len(inner_loops) > 1:
warnings.warn("pystencils AST contains multiple inner loops. "
"Only one can be analyzed - choosing first one")
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
# If the correct step were to be provided, all access within that step length will
# also need to be passed to kerncraft: cur_node.step)
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
def get_layout_tuple(f):
if f.has_fixed_shape:
return get_layout_from_strides(f.strides)
else:
layout_list = list(f.layout)
for _ in range(f.index_dimensions):
layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
return layout_list
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_tuple(fa.field)
permuted_coord = [sp.sympify(coord[i]) for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
if debug_print:
from pprint import pprint
print("----------------------------- Loop Stack --------------------------")
pprint(self._loop_stack)
print("----------------------------- Sources -----------------------------")
pprint(self.sources)
print("----------------------------- Destinations ------------------------")
pprint(self.destinations)
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def as_code(self, type_='iaca', openmp=False, as_filename=False):
"""
Generate and return compilable source code.
Args:
type_: can be iaca or likwid.
openmp: if true, openmp code will be generated
as_filename:
"""
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename:
fp, already_available = self._get_intermediate_file('kernel_{}.c'.format(type_),
machine_and_compiler_dependent=False)
if not already_available:
fp.write(code)
return fp.name
else:
return code
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
from types import MappingProxyType
import sympy as sp
import itertools
from pystencils.assignment import Assignment
from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAssignment
from pystencils.cpu.vectorization import vectorize
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.gpucuda.indexing import indexing_creator_from_params
from pystencils.transformations import remove_conditionals_in_staggered_kernel, loop_blocking, \
move_constants_before_loop
def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
skip_independence_check=False,
cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
target: 'cpu', 'llvm' or 'gpu'
data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
to type
iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
part of the field is iterated over
ghost_layers: if left to default, the number of necessary ghost layers is determined automatically
a single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``
skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
periodicity kernel, that access the field outside the iteration bounds. Use with care!
cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
cpu_blocking: a tuple of block sizes or None if no blocking should be applied
gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
Returns:
abstract syntax tree (AST) object, that can either be printed as source code with `show_code` or
can be compiled with through its 'compile()' member
Example:
>>> import pystencils as ps
>>> import numpy as np
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> ast = ps.create_kernel(assignment, target='cpu', cpu_openmp=True)
>>> kernel = ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5]))
>>> d_arr
array([[0., 0., 0., 0., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]])
"""
# ---- Normalizing parameters
split_groups = ()
if isinstance(assignments, AssignmentCollection):
if 'split_groups' in assignments.simplification_hints:
split_groups = assignments.simplification_hints['split_groups']
assignments = assignments.all_assignments
if isinstance(assignments, Assignment):
assignments = [assignments]
# ---- Creating ast
if target == 'cpu':
from pystencils.cpu import create_kernel
from pystencils.cpu import add_openmp
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check)
omp_collapse = None
if cpu_blocking:
omp_collapse = loop_blocking(ast, cpu_blocking)
if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse)
if cpu_vectorize_info:
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
else:
raise ValueError("Invalid value for cpu_vectorize_info")
return ast
elif target == 'llvm':
from pystencils.llvm import create_kernel
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers)
return ast
elif target == 'gpu':
from pystencils.gpucuda import create_cuda_kernel
ast = create_cuda_kernel(assignments, type_info=data_type,
indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check)
return ast
else:
raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="double", coordinate_names=('x', 'y', 'z'),
cpu_openmp=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
"""
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
The coordinates are stored in a separated index_field, which is a one dimensional array with struct data type.
This struct has to contain fields named 'x', 'y' and for 3D fields ('z'). These names are configurable with the
'coordinate_names' parameter. The struct can have also other fields that can be read and written in the kernel, for
example boundary parameters.
index_fields: list of index fields, i.e. 1D fields with struct data type
coordinate_names: name of the coordinate fields in the struct data type
Example:
>>> import pystencils as ps
>>> import numpy as np
>>>
>>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
>>> index_arr = np.array([(1, 1, 0.1), (2, 2, 0.2), (3, 3, 0.3)], dtype=index_arr_dtype)
>>> idx_field = ps.fields(idx=index_arr)
>>>
>>> # Additional values stored in index field can be accessed in the kernel as well
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> ast = create_indexed_kernel(assignment, [idx_field], coordinate_names=('x', 'y'))
>>> kernel = ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
>>> d_arr
array([[0. , 0. , 0. , 0. , 0. ],
[0. , 4.1, 0. , 0. , 0. ],
[0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]])
"""
if isinstance(assignments, Assignment):
assignments = [assignments]
elif isinstance(assignments, AssignmentCollection):
assignments = assignments.all_assignments
if target == 'cpu':
from pystencils.cpu import create_indexed_kernel
from pystencils.cpu import add_openmp
ast = create_indexed_kernel(assignments, index_fields=index_fields, type_info=data_type,
coordinate_names=coordinate_names)
if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp)
return ast
elif target == 'llvm':
raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
elif target == 'gpu':
from pystencils.gpucuda import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
ast = created_indexed_cuda_kernel(assignments, index_fields, type_info=data_type,
coordinate_names=coordinate_names, indexing_creator=idx_creator)
return ast
else:
raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu',
gpu_exclusive_conditions=False, **kwargs):
"""Kernel that updates a staggered field.
.. image:: /img/staggered_grid.svg
Args:
staggered_field: field where the first index coordinate defines the location of the staggered value
can have 1 or 2 index coordinates, in case of two index coordinates at every staggered location
a vector is stored, expressions parameter has to be a sequence of sequences then
where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right cell
boundary and ``f[0,0](1)`` the southern cell boundary etc.
expressions: sequence of expressions of length dim, defining how the west, southern, (bottom) cell boundary
should be updated.
subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
target: 'cpu' or 'gpu'
gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths
kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed
Returns:
AST, see `create_kernel`
"""
assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
dim = staggered_field.spatial_dimensions
counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
assert len(expressions) == dim
if staggered_field.index_dimensions == 2:
assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
"If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
"same length."
final_assignments = []
last_conditional = None
def add(condition, dimensions, as_else_block=False):
nonlocal last_conditional
if staggered_field.index_dimensions == 1:
assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions]
a_coll = AssignmentCollection(assignments, list(subexpressions))
a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions])
elif staggered_field.index_dimensions == 2:
assert staggered_field.has_fixed_index_shape
assignments = [Assignment(staggered_field(d, i), expr)
for d in dimensions
for i, expr in enumerate(expressions[d])]
a_coll = AssignmentCollection(assignments, list(subexpressions))
a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])
for d in dimensions])
sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
if as_else_block and last_conditional:
new_cond = Conditional(condition, Block(sp_assignments))
last_conditional.false_block = Block([new_cond])
last_conditional = new_cond
else:
last_conditional = Conditional(condition, Block(sp_assignments))
final_assignments.append(last_conditional)
if target == 'cpu' or not gpu_exclusive_conditions:
for d in range(dim):
cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
add(cond, [d])
elif target == 'gpu':
full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)]
for include in itertools.product(*[[1, 0]] * dim):
case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)])
dimensions_to_include = [i for i in range(dim) if include[i]]
if dimensions_to_include:
add(case_conditions, dimensions_to_include, True)
ghost_layers = [(1, 0)] * dim
blocking = kwargs.get('cpu_blocking', None)
if blocking:
del kwargs['cpu_blocking']
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info:
del kwargs['cpu_vectorize_info']
openmp = kwargs.get('cpu_openmp', None)
if openmp:
del kwargs['cpu_openmp']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast)
move_constants_before_loop(ast)
omp_collapse = None
if blocking:
omp_collapse = loop_blocking(ast, blocking)
if openmp:
from pystencils.cpu import add_openmp
add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
return ast
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
# TODO size of int??? unisgned???
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
from pystencils.transformations import insert_casts
from functools import partial
from pystencils.llvm.llvmjit import make_python_function
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
:param assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
:param function_name: name of the generated function - only important if generated code is written out
:param type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean which are assumed to be 'bool' .
:param split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
:param iteration_slice: if not None, iteration is done only over this slice of the field
:param ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
if None, the number of ghost layers is determined automatically and assumed to be equal for a
all dimensions
:return: :class:`pystencils.ast.KernelFunction` node
"""
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code = insert_casts(code)
code.compile = partial(make_python_function, code)
return code
import sympy as sp
import functools
from sympy import S, Indexed
from sympy.printing.printer import Printer
import llvmlite.ir as ir
from pystencils.assignment import Assignment
from pystencils.llvm.control_flow import Loop
from pystencils.data_types import create_type, to_llvm_type, get_type_of_expression, collate_types, \
create_composite_type_from_string
def generate_llvm(ast_node, module=None, builder=None):
"""Prints the ast as llvm code."""
if module is None:
module = ir.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError("Symbol not found: %s" % s)
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
# KernelFunction does not posses a return type
return_type = self.void
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = parameters[i].symbol.name
self.func_arg_map[parameters[i].symbol.name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_boolean_cast_func(self, conversion):
return self._print_cast_func(conversion)
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
if piece.has(Assignment):
raise NotImplementedError('The llvm-backend does not support assignments'
'in the Piecewise function. It is questionable'
'whether to implement it. So far there is no'
'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
# Should have a list of math library functions to validate this.
# TODO function calls to libs
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
mro = inspect.getmro(expr)
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
import llvmlite.ir as ir
import llvmlite.binding as llvm
import numpy as np
import ctypes as ct
from pystencils.data_types import create_composite_type_from_string
from ..data_types import to_ctypes, ctypes_from_llvm, StructType
from .llvm import generate_llvm
from pystencils.field import FieldType
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if param.is_field_parameter:
try:
field_arr = argument_dict[param.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + param.field_name)
symbolic_field = param.fields[0]
if param.is_field_pointer:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(param.field_name, str(field_arr.shape), str(symbolic_field.shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif param.is_field_shape:
data_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
elif param.is_field_stride:
data_type = to_ctypes(param.symbol.dtype)
assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
ct_arguments.append(data_type(item_stride))
else:
assert False
else:
try:
value = argument_dict[param.symbol.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + param.symbol.name)
expected_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(expected_type(value))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
def generate_and_jit(ast):
gen = generate_llvm(ast)
if isinstance(gen, ir.Module):
return compile_llvm(gen)
else:
return compile_llvm(gen.module)
def make_python_function(ast, argument_dict={}, func=None):
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module):
jit = Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
import sympy as sp
import numpy as np
from pystencils import TypedSymbol
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import CustomCodeNode
philox_two_doubles_call = """
{result_symbols[0].dtype} {result_symbols[0].name};
{result_symbols[1].dtype} {result_symbols[1].name};
philox_double2({parameters}, {result_symbols[0].name}, {result_symbols[1].name});
"""
philox_four_floats_call = """
{result_symbols[0].dtype} {result_symbols[0].name};
{result_symbols[1].dtype} {result_symbols[1].name};
{result_symbols[2].dtype} {result_symbols[2].name};
{result_symbols[3].dtype} {result_symbols[3].name};
philox_float4({parameters},
{result_symbols[0].name}, {result_symbols[1].name}, {result_symbols[2].name}, {result_symbols[3].name});
"""
class PhiloxTwoDoubles(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)):
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float64) for _ in range(2))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self.headers = ['"philox_rand.h"']
self.keys = list(keys)
self._args = (time_step, *sp.sympify(keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in self.args if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def get_code(self, dialect, vector_instruction_set):
parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)] + self.keys
while len(parameters) < 6:
parameters.append(0)
parameters = parameters[:6]
assert len(parameters) == 6
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return philox_two_doubles_call.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=self.result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
def __repr__(self):
return "{}, {} <- PhiloxRNG".format(*self.result_symbols)
class PhiloxFourFloats(CustomCodeNode):
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), keys=(0, 0)):
self.result_symbols = tuple(TypedSymbol(sp.Dummy().name, np.float32) for _ in range(4))
symbols_read = [s for s in keys if isinstance(s, sp.Symbol)]
super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)
self._time_step = time_step
self.headers = ['"philox_rand.h"']
self.keys = list(keys)
self._args = (time_step, *sp.sympify(keys))
self._dim = dim
@property
def args(self):
return self._args
@property
def undefined_symbols(self):
result = {a for a in self.args if isinstance(a, sp.Symbol)}
loop_counters = [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)]
result.update(loop_counters)
return result
def get_code(self, dialect, vector_instruction_set):
parameters = [self._time_step] + [LoopOverCoordinate.get_loop_counter_symbol(i)
for i in range(self._dim)] + self.keys
while len(parameters) < 6:
parameters.append(0)
parameters = parameters[:6]
assert len(parameters) == 6
if dialect == 'cuda' or (dialect == 'c' and vector_instruction_set is None):
return philox_four_floats_call.format(parameters=', '.join(str(p) for p in parameters),
result_symbols=self.result_symbols)
else:
raise NotImplementedError("Not yet implemented for this backend")
def __repr__(self):
return "{}, {}, {}, {} <- PhiloxRNG".format(*self.result_symbols)
def random_symbol(assignment_list, rng_node=PhiloxTwoDoubles, *args, **kwargs):
while True:
node = rng_node(*args, **kwargs)
inserted = False
for symbol in node.result_symbols:
if not inserted:
assignment_list.insert(0, node)
inserted = True
yield symbol
# Disable gmpy backend until this bug is resolved if joblib serialize
# See https://github.com/sympy/sympy/pull/13530
import os
import warnings
os.environ['MPMATH_NOGMPY'] = '1'
try:
import mpmath.libmp
# In case the user has imported sympy first, then pystencils
if mpmath.libmp.BACKEND == 'gmpy':
warnings.warn("You are using the gmpy backend. You might encounter an error 'argument is not an mpz sympy'. "
"This is due to a known bug in sympy/gmpy library. "
"To prevent this, import pystencils first then sympy or set the environment variable "
"MPMATH_NOGMPY=1")
except ImportError:
pass
__all__ = []
# FIXME
# FIXME performance counters might be wrong. This will only affect the Benchmark model
# FIXME bandwidth measurements need validation
# FIXME
kerncraft version: 0.7.2
model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
model type: Intel Core Skylake SP
sockets: 2
cores per socket: 4
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 4
clock: 3.6 GHz
FLOPs per cycle:
SP:
total: 64
FMA: 64
ADD: 32
MUL: 32
DP:
total: 32
FMA: 32
ADD: 16
MUL: 16
micro-architecture: SKX
compiler:
!!omap
- icc: -O3 -fno-alias -xCORE-AVX512
- clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
- gcc: -O3 -march=skylake-avx512
cacheline size: 64 B
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
memory hierarchy:
- level: L1
performance counter metrics:
accesses: MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L2_TRANS_L1D_WB:PMC[0-3]
cache per group:
sets: 64
ways: 8
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: L2
store_to: L2
size per group: 32.00 kB
groups: 8
cores per group: 1
threads per group: 2
- level: L2
non-overlap upstream throughput: [64 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 1024
ways: 16
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: null # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
victims_to: L3 # all victims, modified or not are passed onto L3
store_to: L3
size per group: 1.00 MB
groups: 8
cores per group: 1
threads per group: 2
- level: L3
non-overlap upstream throughput: [16 B/cy, 'full-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
# FIXME not all misses in L2 lead to loads from L3, only the hits do
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 16896
# TODO is actuall something else, but necessary to get to 16.5 MB
ways: 16
# TODO is actually 11, but pycachesim only supports powers of two
cl_size: 64
replacement_policy: 'LRU'
write_allocate: False
write_back: True
size per group: 16.50 MB
groups: 2
cores per group: 4
threads per group: 8
- level: MEM
cores per group: 4
threads per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
penalty cycles per read stream: 0
size per group:
benchmarks:
kernels:
load:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 0
bytes: 0.00 B
FLOPs per iteration: 0
copy:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
update:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
triad:
read streams:
streams: 3
bytes: 24.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
daxpy:
read streams:
streams: 2
bytes: 16.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
measurements:
L1:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 42.98 GB/s
- 85.08 GB/s
- 127.45 GB/s
- 169.92 GB/s
copy:
- 56.07 GB/s
- 111.50 GB/s
- 164.90 GB/s
- 221.50 GB/s
update:
- 56.54 GB/s
- 112.25 GB/s
- 168.50 GB/s
- 224.75 GB/s
triad:
- 45.90 GB/s
- 89.81 GB/s
- 127.29 GB/s
- 169.57 GB/s
daxpy:
- 36.62 GB/s
- 71.30 GB/s
- 103.52 GB/s
- 135.26 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 10.56 kB
- 10.56 kB
- 10.56 kB
- 10.56 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 49.61 GB/s
- 98.80 GB/s
- 147.98 GB/s
- 198.22 GB/s
copy:
- 55.98 GB/s
- 111.56 GB/s
- 167.08 GB/s
- 220.42 GB/s
update:
- 56.53 GB/s
- 112.72 GB/s
- 168.95 GB/s
- 225.31 GB/s
triad:
- 54.01 GB/s
- 104.58 GB/s
- 153.02 GB/s
- 200.93 GB/s
daxpy:
- 41.11 GB/s
- 80.28 GB/s
- 115.71 GB/s
- 152.81 GB/s
L2:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 27.15 GB/s
- 54.09 GB/s
- 80.61 GB/s
- 106.41 GB/s
copy:
- 43.53 GB/s
- 90.07 GB/s
- 127.73 GB/s
- 171.81 GB/s
update:
- 50.38 GB/s
- 98.47 GB/s
- 147.91 GB/s
- 197.20 GB/s
triad:
- 43.38 GB/s
- 83.72 GB/s
- 124.83 GB/s
- 166.04 GB/s
daxpy:
- 36.29 GB/s
- 71.29 GB/s
- 103.33 GB/s
- 136.48 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 330.00 kB
- 330.00 kB
- 330.00 kB
- 330.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 35.29 GB/s
- 70.28 GB/s
- 104.67 GB/s
- 139.63 GB/s
copy:
- 42.23 GB/s
- 83.70 GB/s
- 124.33 GB/s
- 167.50 GB/s
update:
- 50.09 GB/s
- 99.77 GB/s
- 149.87 GB/s
- 198.82 GB/s
triad:
- 52.38 GB/s
- 100.00 GB/s
- 147.40 GB/s
- 193.31 GB/s
daxpy:
- 41.14 GB/s
- 80.22 GB/s
- 116.23 GB/s
- 155.08 GB/s
L3:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 22.40 GB/s
- 44.77 GB/s
- 65.71 GB/s
- 89.26 GB/s
copy:
- 25.32 GB/s
- 49.70 GB/s
- 72.89 GB/s
- 98.62 GB/s
update:
- 41.24 GB/s
- 81.14 GB/s
- 122.22 GB/s
- 166.44 GB/s
triad:
- 25.61 GB/s
- 50.02 GB/s
- 73.23 GB/s
- 98.95 GB/s
daxpy:
- 32.07 GB/s
- 62.65 GB/s
- 89.91 GB/s
- 120.65 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 5.28 MB
- 2.64 MB
- 1.76 MB
- 1.32 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 26.18 GB/s
- 51.85 GB/s
- 75.82 GB/s
- 101.39 GB/s
copy:
- 26.22 GB/s
- 51.83 GB/s
- 76.40 GB/s
- 102.84 GB/s
update:
- 43.51 GB/s
- 86.75 GB/s
- 129.86 GB/s
- 174.54 GB/s
triad:
- 26.39 GB/s
- 51.80 GB/s
- 76.27 GB/s
- 102.66 GB/s
daxpy:
- 37.43 GB/s
- 73.16 GB/s
- 106.53 GB/s
- 142.76 GB/s
MEM:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 12.03 GB/s
- 24.38 GB/s
- 34.83 GB/s
- 45.05 GB/s
copy:
- 12.32 GB/s
- 24.40 GB/s
- 32.82 GB/s
- 37.00 GB/s
update:
- 20.83 GB/s
- 40.25 GB/s
- 48.81 GB/s
- 54.84 GB/s
triad:
- 11.64 GB/s
- 23.17 GB/s
- 34.78 GB/s
- 42.97 GB/s
daxpy:
- 17.69 GB/s
- 34.02 GB/s
- 48.12 GB/s
- 55.73 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 120.00 MB
- 60.00 MB
- 40.00 MB
- 30.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 15.33 GB/s
- 28.32 GB/s
- 41.34 GB/s
- 53.02 GB/s
copy:
- 13.96 GB/s
- 26.61 GB/s
- 34.39 GB/s
- 38.96 GB/s
update:
- 26.47 GB/s
- 47.82 GB/s
- 56.70 GB/s
- 62.78 GB/s
triad:
- 14.42 GB/s
- 26.66 GB/s
- 36.94 GB/s
- 44.01 GB/s
daxpy:
- 20.96 GB/s
- 39.12 GB/s
- 51.55 GB/s
- 58.37 GB/s
import os
import math
import time
import numpy as np
import sympy as sp
from influxdb import InfluxDBClient
from git import Repo
from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
from kerncraft.machinemodel import MachineModel
from kerncraft.prefixedunit import PrefixedUnit
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
from pystencils import Field, Assignment, create_kernel
def output_benchmark(analysis):
output = {}
keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
copies = {key: analysis[key] for key in keys}
output.update(copies)
for cache, metrics in analysis['data transfers'].items():
for metric_name, metric_value in metrics.items():
fixed = metric_value.with_prefix('')
output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
for level, value in analysis['ECM'].items():
output['Phenomenological ECM ' + level + ' cy/CL'] = value
return output
def output_ecm(analysis):
output = {}
keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
copies = {key: analysis[key] for key in keys}
output.update(copies)
if 'memory bandwidth kernel' in analysis:
output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
for key, value in analysis['cycles']:
output[key] = value
return output
def output_roofline(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def output_roofline_iaca(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
# output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def report_analysis(ast, models, machine, tags, fields=None):
kernel = PyStencilsKerncraftKernel(ast, machine)
client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
'roggan', 'pystencils')
repo = Repo(search_parent_directories=True)
commit = repo.head.commit
point_time = int(time.time())
for model in models:
benchmark = model(kernel, machine, KerncraftParameters())
benchmark.analyze()
analysis = benchmark.results
if model is Benchmark:
output = output_benchmark(analysis)
elif model is ECM:
output = output_ecm(analysis)
elif model is Roofline:
output = output_roofline(analysis)
elif model is RooflineIACA:
output = output_roofline_iaca(analysis)
else:
raise ValueError('No valid model for analysis given!')
if fields is not None:
output.update(fields)
output['commit'] = commit.hexsha
json_body = [
{
'measurement': model.__name__,
'tags': tags,
'time': point_time,
'fields': output
}
]
client.write_points(json_body, time_precision='s')
def main():
size = [20, 200, 200]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + \
a[-1, 0, 0] + a[1, 0, 0] + \
a[0, 0, -1] + a[0, 0, 1]
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
input_folder = "./"
machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
machine = MachineModel(path_to_yaml=machine_file_path)
tags = {
'host': os.uname()[1],
'project': 'pystencils',
'kernel': 'jacobi_3D ' + str(size)
}
report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
if __name__ == '__main__':
main()
import sympy as sp
import numpy as np
from pystencils import Field, Assignment, create_kernel
def meassure():
size = [30, 50, 3]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
updateRule = Assignment(b[0, 0], s * rhs)
print(updateRule)
ast = create_kernel([updateRule])
# benchmark = generate_benchmark(ast)
# main = benchmark[0]
# kernel = benchmark[1]
# with open('src/main.cpp', 'w') as file:
# file.write(main)
# with open('src/kernel.cpp', 'w') as file:
# file.write(kernel)
func = ast.compile({'omega': 2/3})
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.kerncraft_coupling import BenchmarkAnalysis
from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECMData
machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
machine = MachineModel(path_to_yaml=machineFilePath)
benchmark = BenchmarkAnalysis(ast, machine)
#TODO what do i want to do with benchmark?
kernel = PyStencilsKerncraftKernel(ast)
model = ECMData(kernel, machine, KerncraftParameters())
model.analyze()
model.report()
if __name__ == "__main__":
meassure()
/*
* Copyright (2008-2009) Intel Corporation All Rights Reserved.
* The source code contained or described herein and all documents
* related to the source code ("Material") are owned by Intel Corporation
* or its suppliers or licensors. Title to the Material remains with
* Intel Corporation or its suppliers and licensors. The Material
* contains trade secrets and proprietary and confidential information
* of Intel or its suppliers and licensors. The Material is protected
* by worldwide copyright and trade secret laws and treaty provisions.
* No part of the Material may be used, copied, reproduced, modified,
* published, uploaded, posted, transmitted, distributed, or disclosed
* in any way without Intel(R)s prior express written permission.
*
* No license under any patent, copyright, trade secret or other
* intellectual property right is granted to or conferred upon you by
* disclosure or delivery of the Materials, either expressly, by implication,
* inducement, estoppel or otherwise. Any license under such intellectual
* property rights must be express and approved by Intel in writing.
*/
#if defined (__GNUC__)
#define IACA_SSC_MARK( MARK_ID ) \
__asm__ __volatile__ ( \
"\n\t movl $"#MARK_ID", %%ebx" \
"\n\t .byte 0x64, 0x67, 0x90" \
: : : "memory" );
#else
#define IACA_SSC_MARK(x) {__asm mov ebx, x\
__asm _emit 0x64 \
__asm _emit 0x67 \
__asm _emit 0x90 }
#endif
#define IACA_START {IACA_SSC_MARK(111)}
#define IACA_END {IACA_SSC_MARK(222)}
#ifdef _WIN64
#include <intrin.h>
#define IACA_VC64_START __writegsbyte(111, 111);
#define IACA_VC64_END __writegsbyte(222, 222);
#endif
/**************** asm *****************
;START_MARKER
mov ebx, 111
db 0x64, 0x67, 0x90
;END_MARKER
mov ebx, 222
db 0x64, 0x67, 0x90
**************************************/
#include "iacaMarks.h"
int main(int argc, char * argv[]){
int a = 0;
for(int i = 0; i < argc+100000; i++){
IACA_START
a += i;
}
IACA_END
return a;
}
double a[30][50][3];
double b[30][50][3];
double s;
for(int j=1; j<30-1; ++j)
for(int i=1; i<50-1; ++i)
b[j][i] = ( a[j][i-1] + a[j][i+1]
+ a[j-1][i] + a[j+1][i]) * s;
double a[M][N][N];
double b[M][N][N];
double s;
for(int k=1; k<M-1; ++k)
for(int j=1; j<N-1; ++j)
for(int i=1; i<N-1; ++i)
b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
+ a[k][j-1][i] + a[k][j+1][i]
+ a[k-1][j][i] + a[k+1][j][i]) * s;
kerncraft version: 0.7.3
clock: 2.7 GHz
cores per socket: 8
cores per NUMA domain: 8
NUMA domains per socket: 1
model type: Intel Core SandyBridge EP processor
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
sockets: 2
threads per core: 2
cacheline size: 64 B
compiler:
!!omap
- icc: -O3 -xAVX -fno-alias -qopenmp
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
micro-architecture: SNB
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
write-allocate: True
memory hierarchy:
- level: L1
cache per group: {
'sets': 64, 'ways': 8, 'cl_size': 64, # 32 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L2', 'store_to': 'L2'}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
- level: L2
cache per group: {
'sets': 512, 'ways': 8, 'cl_size': 64, # 256 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L3', 'store_to': 'L3'}
cores per group: 1
threads per group: 2
groups: 16
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {
'sets': 20480, 'ways': 16, 'cl_size': 64, # 20 MB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True}
cores per group: 8
threads per group: 16
groups: 2
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
size per group: null
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [81.98 GB/s, 163.75 GB/s, 245.62 GB/s, 327.69 GB/s, 409.41 GB/s, 489.83
GB/s, 571.67 GB/s, 653.50 GB/s]
daxpy: [71.55 GB/s, 143.01 GB/s, 214.86 GB/s, 286.26 GB/s, 355.60 GB/s,
426.71 GB/s, 497.45 GB/s, 568.97 GB/s]
load: [61.92 GB/s, 122.79 GB/s, 183.01 GB/s, 244.30 GB/s, 306.76 GB/s, 368.46
GB/s, 427.41 GB/s, 490.88 GB/s]
triad: [81.61 GB/s, 163.25 GB/s, 244.92 GB/s, 326.65 GB/s, 406.69 GB/s,
487.76 GB/s, 569.10 GB/s, 650.39 GB/s]
update: [84.03 GB/s, 168.02 GB/s, 252.10 GB/s, 335.94 GB/s, 419.90 GB/s,
503.88 GB/s, 587.86 GB/s, 671.88 GB/s]
size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
16.00 kB, 16.00 kB]
size per thread: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00
kB, 16.00 kB, 16.00 kB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
kB, 128.00 kB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [79.53 GB/s, 158.70 GB/s, 238.20 GB/s, 317.62 GB/s, 397.09 GB/s, 476.33
GB/s, 555.69 GB/s, 634.96 GB/s]
daxpy: [70.94 GB/s, 141.90 GB/s, 212.97 GB/s, 283.91 GB/s, 354.93 GB/s,
425.85 GB/s, 496.74 GB/s, 567.40 GB/s]
load: [57.01 GB/s, 114.11 GB/s, 171.11 GB/s, 228.13 GB/s, 285.15 GB/s, 342.11
GB/s, 399.11 GB/s, 456.11 GB/s]
triad: [79.48 GB/s, 159.03 GB/s, 238.53 GB/s, 318.04 GB/s, 392.11 GB/s,
477.10 GB/s, 538.36 GB/s, 636.02 GB/s]
update: [82.75 GB/s, 165.55 GB/s, 248.50 GB/s, 331.32 GB/s, 414.06 GB/s,
496.82 GB/s, 579.83 GB/s, 662.36 GB/s]
size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB,
16.00 kB, 16.00 kB]
size per thread: [8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00
kB, 8.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00
kB, 128.00 kB]
L2:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [41.28 GB/s, 81.96 GB/s, 120.28 GB/s, 160.70 GB/s, 203.22 GB/s, 239.97
GB/s, 271.13 GB/s, 307.01 GB/s]
daxpy: [48.85 GB/s, 98.62 GB/s, 143.29 GB/s, 197.76 GB/s, 230.58 GB/s, 284.98
GB/s, 334.22 GB/s, 385.72 GB/s]
load: [38.51 GB/s, 76.67 GB/s, 114.73 GB/s, 152.90 GB/s, 188.69 GB/s, 223.64
GB/s, 265.21 GB/s, 289.41 GB/s]
triad: [40.92 GB/s, 83.49 GB/s, 124.48 GB/s, 165.24 GB/s, 206.74 GB/s, 237.90
GB/s, 274.96 GB/s, 329.09 GB/s]
update: [50.37 GB/s, 100.05 GB/s, 145.43 GB/s, 196.82 GB/s, 244.07 GB/s,
301.62 GB/s, 336.88 GB/s, 403.78 GB/s]
size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
size per thread: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
kB, 0.90 MB, 1.02 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [42.17 GB/s, 83.47 GB/s, 124.57 GB/s, 163.78 GB/s, 202.56 GB/s, 242.80
GB/s, 276.95 GB/s, 311.36 GB/s]
daxpy: [50.87 GB/s, 98.72 GB/s, 152.12 GB/s, 193.48 GB/s, 251.36 GB/s, 301.72
GB/s, 352.55 GB/s, 365.28 GB/s]
load: [39.62 GB/s, 79.03 GB/s, 118.03 GB/s, 157.85 GB/s, 196.48 GB/s, 237.44
GB/s, 276.81 GB/s, 309.71 GB/s]
triad: [44.80 GB/s, 88.35 GB/s, 125.13 GB/s, 169.94 GB/s, 209.60 GB/s, 260.15
GB/s, 300.75 GB/s, 333.08 GB/s]
update: [49.80 GB/s, 100.70 GB/s, 150.56 GB/s, 196.44 GB/s, 251.90 GB/s,
280.93 GB/s, 352.74 GB/s, 399.27 GB/s]
size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00
kB, 128.00 kB, 128.00 kB]
size per thread: [64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00
kB, 64.00 kB, 64.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00
kB, 0.90 MB, 1.02 MB]
L3:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.21 GB/s, 46.01 GB/s, 67.96 GB/s, 90.17 GB/s, 111.47 GB/s, 133.14
GB/s, 153.84 GB/s, 174.92 GB/s]
daxpy: [30.35 GB/s, 60.32 GB/s, 90.00 GB/s, 119.71 GB/s, 148.87 GB/s, 178.39
GB/s, 207.10 GB/s, 236.25 GB/s]
load: [23.35 GB/s, 46.52 GB/s, 69.57 GB/s, 92.60 GB/s, 115.77 GB/s, 138.89
GB/s, 161.82 GB/s, 184.11 GB/s]
triad: [25.18 GB/s, 50.08 GB/s, 74.33 GB/s, 98.78 GB/s, 122.66 GB/s, 146.78
GB/s, 170.52 GB/s, 194.47 GB/s]
update: [32.67 GB/s, 64.65 GB/s, 95.98 GB/s, 127.29 GB/s, 157.67 GB/s, 188.22
GB/s, 217.41 GB/s, 246.99 GB/s]
size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
size per thread: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
10.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.83 GB/s, 47.25 GB/s, 69.84 GB/s, 92.61 GB/s, 114.31 GB/s, 136.48
GB/s, 157.55 GB/s, 178.99 GB/s]
daxpy: [31.52 GB/s, 62.72 GB/s, 93.43 GB/s, 124.29 GB/s, 154.55 GB/s, 185.18
GB/s, 215.10 GB/s, 245.24 GB/s]
load: [27.63 GB/s, 54.93 GB/s, 81.57 GB/s, 108.63 GB/s, 134.91 GB/s, 161.72
GB/s, 188.15 GB/s, 214.94 GB/s]
triad: [25.90 GB/s, 51.76 GB/s, 76.73 GB/s, 102.29 GB/s, 126.17 GB/s, 152.10
GB/s, 176.71 GB/s, 200.64 GB/s]
update: [34.10 GB/s, 67.67 GB/s, 100.62 GB/s, 133.50 GB/s, 165.61 GB/s,
197.74 GB/s, 228.73 GB/s, 259.05 GB/s]
size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25
MB, 1.25 MB]
size per thread: [625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00
kB, 625.00 kB, 625.00 kB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB,
10.00 MB]
MEM:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [11.60 GB/s, 21.29 GB/s, 25.94 GB/s, 27.28 GB/s, 27.47 GB/s, 27.36
GB/s, 27.21 GB/s, 27.12 GB/s]
daxpy: [17.33 GB/s, 31.89 GB/s, 38.65 GB/s, 40.50 GB/s, 40.81 GB/s, 40.62
GB/s, 40.59 GB/s, 40.26 GB/s]
load: [12.01 GB/s, 23.04 GB/s, 32.79 GB/s, 40.21 GB/s, 43.39 GB/s, 44.14
GB/s, 44.42 GB/s, 44.40 GB/s]
triad: [12.73 GB/s, 24.27 GB/s, 30.43 GB/s, 31.46 GB/s, 31.77 GB/s, 31.74
GB/s, 31.65 GB/s, 31.52 GB/s]
update: [18.91 GB/s, 32.43 GB/s, 37.28 GB/s, 39.98 GB/s, 40.99 GB/s, 40.92
GB/s, 40.61 GB/s, 40.34 GB/s]
size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
size per thread: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [10.92 GB/s, 20.62 GB/s, 25.34 GB/s, 26.22 GB/s, 26.32 GB/s, 26.31
GB/s, 26.22 GB/s, 26.16 GB/s]
daxpy: [17.15 GB/s, 31.96 GB/s, 38.12 GB/s, 39.19 GB/s, 39.38 GB/s, 39.16
GB/s, 39.06 GB/s, 38.87 GB/s]
load: [13.49 GB/s, 25.92 GB/s, 36.16 GB/s, 41.56 GB/s, 43.34 GB/s, 43.40
GB/s, 43.01 GB/s, 42.66 GB/s]
triad: [12.38 GB/s, 23.17 GB/s, 28.69 GB/s, 29.98 GB/s, 30.50 GB/s, 30.59
GB/s, 30.75 GB/s, 30.70 GB/s]
update: [19.67 GB/s, 34.93 GB/s, 39.93 GB/s, 40.79 GB/s, 40.43 GB/s, 40.03
GB/s, 39.62 GB/s, 39.33 GB/s]
size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB,
5.71 MB, 5.00 MB]
size per thread: [20.00 MB, 10.00 MB, 6.67 MB, 5.00 MB, 4.00 MB, 3.33 MB,
2.86 MB, 2.50 MB]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB]
import sympy as sp
from pystencils import Assignment, AssignmentCollection
from pystencils.simp.assignment_collection import SymbolGen
def test_assignment_collection():
x, y, z, t = sp.symbols("x y z t")
symbol_gen = SymbolGen("a")
ac = AssignmentCollection([Assignment(z, x + y)],
[], subexpression_symbol_generator=symbol_gen)
lhs = ac.add_subexpression(t)
assert lhs == sp.Symbol("a_0")
ac.subexpressions.append(Assignment(t, 3))
ac.topological_sort(sort_main_assignments=False, sort_subexpressions=True)
assert ac.subexpressions[0].lhs == t
assert ac.new_with_inserted_subexpression(sp.Symbol("not_defined")) == ac
ac_inserted = ac.new_with_inserted_subexpression(t)
ac_inserted2 = ac.new_without_subexpressions({lhs})
assert all(a == b for a, b in zip(ac_inserted.all_assignments, ac_inserted2.all_assignments))
print(ac_inserted)
assert ac_inserted.subexpressions[0] == Assignment(lhs, 3)
assert 'a_0' in str(ac_inserted)
assert '<table' in ac_inserted._repr_html_()
%% Cell type:markdown id: tags:
# pystencils - LLVM generation
The generation of LLVM code is simliar but not identical as seen with the C++ version. For the generation itself a python module ``llvmlite`` is used. This module provides the necessary support and bindings for LLVM. In order to generate from the AST to llvm, the AST needs to be transformed to support type conversions. This is the biggest difference to the C++ version. C++ doesn't need that since the language itself handles the casts.
In this example a simple weighted Jacobi kernel is generated, so the focus remains on the part of LLVM generation.
%% Cell type:code id: tags:
``` python
import sympy as sp
import numpy as np
import ctypes
from pystencils import Field, Assignment
from pystencils import create_kernel
from pystencils.display_utils import to_dot
sp.init_printing()
```
%% Cell type:markdown id: tags:
The numpy arrays (with inital values) create *Field*s for the update Rule. Later those arrays are used for the computation itself.
%% Cell type:code id: tags:
``` python
src_arr = np.zeros((30, 20))
src_arr[0,:] = 1.0
src_arr[:,0] = 1.0
dst_arr = src_arr.copy()
src_field = Field.create_from_numpy_array('src', src_arr)
dst_field = Field.create_from_numpy_array('dst', dst_arr)
```
%% Cell type:markdown id: tags:
Using the *Field* objects and the additional *Symbol* $\omega$ for the weight the update rule is specified as a *sympy* equation.
%% Cell type:code id: tags:
``` python
omega = sp.symbols("omega")
update_rule = Assignment(dst_field[0,0], omega * (src_field[0,1] + src_field[0,-1] + src_field[1,0] + src_field[-1,0]) / 4
+ (1.-omega)*src_field[0,0])
update_rule
```
%% Output
ω⋅(src_E + src_N + src_S + src_W)
dst_C := src_C⋅(-ω + 1.0) + ─────────────────────────────────
4
%% Cell type:markdown id: tags:
With this update rule an abstract syntax tree (AST) can be created. This AST can be used to print the LLVM code. The creation follows the same routines as the C++ version does. However at the end there are two more steps. In order to generate LLVM, type casting and pointer arithmetic had to be introduced (which C++ does for you).
%% Cell type:code id: tags:
``` python
ast = create_kernel([update_rule], target='llvm')
print(str(ast))
```
%% Output
KernelFunction kernel([<double * RESTRICT fd_dst>, <double * RESTRICT const fd_src>, <double omega>])
Block for(ctr_0=1; ctr_0<29; ctr_0+=1)
Block fd_dst_C ← pointer_arithmetic_func(fd_dst, 20*ctr_0)
fd_src_C ← pointer_arithmetic_func(fd_src, 20*ctr_0)
fd_src_E ← pointer_arithmetic_func(fd_src, 20*ctr_0 + 20)
fd_src_W ← pointer_arithmetic_func(fd_src, 20*ctr_0 - 20)
for(ctr_1=1; ctr_1<19; ctr_1+=1)
Block fd_dst_C[ctr_1] ← omega*(fd_src_C[ctr_1 + 1] + fd_src_C[ctr_1 - 1] + fd_src_E[ctr_1] + fd_src_W[ctr_1])/4 + (omega*cast_func(-1, double) + 1.0)*fd_src_C[ctr_1]
%% Cell type:markdown id: tags:
It is possible to examine the AST further.
%% Cell type:code id: tags:
``` python
to_dot(ast)
```
%% Output
<graphviz.files.Source at 0x7f2d14f276a0>
%% Cell type:markdown id: tags:
With transformed AST it is now possible to generate and compile the AST into LLVM. Notice that unlike in C++ version, no files are writen to the hard drive (although it is possible).
There are multiple ways how to generate and compile the AST. The most simple one is simillar to the C++ version. Using the ``compile()`` function of the generated AST
You can also manually create a python function with ``make_python_function``.
Another option is obtaining the jit itself with ``generate_and_jit``.
The function ``generate_and_jit`` first generates and the compiles the AST.
If even more controll is needed, it is possible to use the functions ``generateLLVM`` and ``compileLLVM`` to achieve the same. For further controll, instead of calling ``compileLLVM`` the jit object itself can be created and its necessary functions for the compilation have to be run manually (``parse``, (``optimize``,) ``compile``)
%% Cell type:code id: tags:
``` python
kernel = ast.compile()
#kernel = make_python_function(ast)
# Or alternativally
#jit = generate_and_jit(ast)
# Call: jit('kernel', src_arr, dst_arr)
```
%% Cell type:markdown id: tags:
The compiled function(s) can be used now. Either call the function (with arguments, if not given before) or call the jit object with the function's name and its arguments. Here, numpy arrays are automatically adjusted with ctypes.
The functions and arguments can be read as well.
**All of the information the jit object has comes from the module which was parsed. If you parse a second module and don't run the compilation step, the module and the compiled code are not the same anymore, thus leading to false information**
%% Cell type:code id: tags:
``` python
#jit.print_functions()
```
%% Cell type:code id: tags:
``` python
for i in range(100):
kernel(src=src_arr, dst=dst_arr, omega=2/3)
src_arr, dst_arr = dst_arr, src_arr
```
%% Cell type:markdown id: tags:
The output is drawn with matplotlib.
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
from matplotlib import cm
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(dst_arr, cmap=cm.jet)
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
```
import pystencils as ps
import sympy as sp
import numpy as np
from pystencils.astnodes import Conditional, Block
from pystencils.cpu.vectorization import vec_all, vec_any
def test_vec_any():
data_arr = np.zeros((15, 15))
data_arr[3:9, 2:7] = 1.0
data = ps.fields("data: double[2D]", data=data_arr)
c = [
ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)),
Conditional(vec_any(data.center() > 0.0), Block([
ps.Assignment(data.center(), 2.0)
]))
]
ast = ps.create_kernel(c, target='cpu',
cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
kernel(data=data_arr)
np.testing.assert_equal(data_arr[3:9, 0:8], 2.0)
def test_vec_all():
data_arr = np.zeros((15, 15))
data_arr[3:9, 2:7] = 1.0
data = ps.fields("data: double[2D]", data=data_arr)
c = [
Conditional(vec_all(data.center() > 0.0), Block([
ps.Assignment(data.center(), 2.0)
]))
]
ast = ps.create_kernel(c, target='cpu',
cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
before = data_arr.copy()
kernel(data=data_arr)
np.testing.assert_equal(data_arr, before)
def test_boolean_before_loop():
t1, t2 = sp.symbols('t1, t2')
f_arr = np.ones((10, 10))
g_arr = np.zeros_like(f_arr)
f, g = ps.fields("f, g : double[2D]", f=f_arr, g=g_arr)
a = [
ps.Assignment(t1, t2 > 0),
ps.Assignment(g[0, 0],
sp.Piecewise((f[0, 0], t1), (42, True)))
]
ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': 'avx'})
kernel = ast.compile()
kernel(f=f_arr, g=g_arr, t2=1.0)
print(g)
np.testing.assert_array_equal(g_arr, 1.0)
kernel(f=f_arr, g=g_arr, t2=-1.0)
np.testing.assert_array_equal(g_arr, 42.0)