6f74f2ab · 6f74f2ab · 6f74f2ab · 6f74f2ab · 6f74f2ab · 6f74f2ab
--- a/pystencils/kerncraft_coupling/templates/benchmark.c
+++ b/pystencils/kerncraft_coupling/templates/benchmark.c
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-#include <assert.h>
-{{ includes }}
-{%- if likwid %}
-#include <likwid.h>
-{%- endif %}
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-/* see waLBerla src/field/allocation/AlignedMalloc */
-void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
-{
-    // Allocates `size` usable bytes such that (returned pointer + offset) is a multiple of `alignment`.
-    // Returns nullptr if the underlying malloc fails. Release the result with aligned_free().
-    // Preconditions: alignment is a non-zero power of two and offset < alignment
-    // (with alignment 0 this function makes no sense - use plain malloc instead).
-    assert( alignment > 0 );
-    assert( !(alignment & (alignment - 1)) );
-    assert( offset < alignment );
-    // Over-allocate: payload + alignment slack + one slot that remembers the raw chunk
-    void *raw = std::malloc( (size+2*alignment-1 )+sizeof(void *));
-    if( raw == nullptr )
-        return nullptr;
-    // Round (raw + sizeof(void*)) up to the next multiple of alignment ...
-    void *user = (void*)( ((size_t)raw+sizeof(void *)+alignment-1) & ~(alignment-1));
-    // ... and move forward so that user + offset lands on an aligned address
-    user = (void*) ( (char*)(user) + alignment - offset);
-    // Remember the raw chunk in the slot directly in front of the usable memory
-    void **back_pointer = (void **)user - 1;
-    *back_pointer = raw;
-    assert( ((size_t)user+offset) % alignment == 0 );
-    return user;
-}
-void aligned_free( void *ptr )
-{
-    // Releases memory handed out by aligned_malloc_with_offset(); a null pointer is ignored.
-    if( ptr == nullptr )
-        return;
-    // The slot directly in front of the user chunk is assumed to hold the
-    // pointer to the really allocated chunk - that one is given back to free()
-    void **back_pointer = (void **)ptr - 1;
-    std::free( *back_pointer );
-}
-{{kernel_code}}
-// Benchmark driver rendered from this template: allocates and initialises every field and constant,
-// runs the kernel in a warm-up pass (2 repetitions) followed by a measured pass (argv[1] repetitions),
-// and finally releases the fields again.
-// NOTE(review): argv[1] is read without checking argc - confirm that callers always pass the repetition count.
-int main(int argc, char **argv)
-{
-  {%- if likwid %}
-  likwid_markerInit();
-  {%- endif %}
-  {%- for field_name, dataType, elements, size, offset, alignment in fields %}
-  // Initialization {{field_name}}
-  {%- if alignment > 0 %}
-  {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
-  {%- else %}
-  {{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
-  {%- endif %}
-  for (unsigned long long i = 0; i < {{elements}}; ++i)
-    {{field_name}}[i] = 0.23;
-  // presumably keeps the compiler from optimising the initialisation away - var_false is defined elsewhere
-  if(var_false)
-    dummy({{field_name}});
-  {%- endfor %}
-  {%- for constantName, dataType in constants %}
-  // Constant {{constantName}}
-  {{dataType}} {{constantName}};
-  {{constantName}} = 0.23;
-  if(var_false)
-      dummy(& {{constantName}});
-  {%- endfor %}
-  {%- if likwid and openmp %}
-  #pragma omp parallel
-  {
-  likwid_markerRegisterRegion("loop");
-  #pragma omp barrier
-  {%- elif likwid %}
-  likwid_markerRegisterRegion("loop");
-  {%- endif %}
-  // warmup == 1: warm-up pass with 2 repetitions; warmup == 0: measured pass with argv[1] repetitions
-  for(int warmup = 1; warmup >= 0; --warmup) {
-    int repeat = 2;
-    if(warmup == 0) {
-      repeat = atoi(argv[1]);
-      {%- if likwid %}
-      likwid_markerStartRegion("loop");
-      {%- endif %}
-    }
-    {%- if timing %}
-    double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
-    timing(&wcStartTime, &cpuStartTime);
-    {%- endif %}
-    for (; repeat > 0; --repeat)
-    {
-      {{kernelName}}({{call_argument_list}});
-      // Dummy calls
-      {%- for field_name, dataType, elements, size, offset, alignment in fields %}
-      if(var_false) dummy((void*){{field_name}});
-      {%- endfor %}
-      {%- for constantName, dataType in constants %}
-      if(var_false) dummy((void*)&{{constantName}});
-      {%- endfor %}
-    }
-    {%- if timing %}
-    timing(&wcEndTime, &cpuEndTime);
-    // only the measured pass is reported: wall-clock time per repetition
-    if( warmup == 0)
-        printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
-    {%- endif %}
-  }
-  {%- if likwid %}
-  likwid_markerStopRegion("loop");
-  {%- if openmp %}
-  }
-  {%- endif %}
-  {%- endif %}
-  {%- if likwid %}
-  likwid_markerClose();
-  {%- endif %}
-  {%- for field_name, dataType, elements, size, offset, alignment in fields %}
-  {%- if alignment > 0 %}
-  aligned_free({{field_name}});
-  {%- else %}
-  delete[] {{field_name}};
-  {%- endif %}
-  {%- endfor %}
-}
--- a/pystencils/kerncraft_coupling/templates/kernel.c
+++ b/pystencils/kerncraft_coupling/templates/kernel.c
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-{{ includes }}
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-{{kernel_code}}
\ No newline at end of file
--- a/pystencils/kerncraft_coupling/templates/kernel.h
+++ b/pystencils/kerncraft_coupling/templates/kernel.h
-#define FUNC_PREFIX
-{{function_signature}}
\ No newline at end of file
--- a/pystencils/llvm/__init__.py
+++ b/pystencils/llvm/__init__.py
-from .kernelcreation import create_kernel
-from .llvmjit import make_python_function
-__all__ = ['create_kernel', 'make_python_function']
--- a/pystencils/llvm/control_flow.py
+++ b/pystencils/llvm/control_flow.py
-import llvmlite.ir as ir
-class Loop(object):
-    """Context manager that emits the basic blocks of a counted loop through an llvmlite IR builder.
-    Entering the context creates the loop blocks and yields the phi node holding the loop counter; the
-    instructions emitted inside the ``with`` body become the loop body; leaving the context branches to the
-    increment/compare block and positions the builder behind the loop.
-    NOTE(review): the bound is only tested after the body (``next < stop``, unsigned compare), so the body is
-    emitted to run at least once even if ``start_val >= stop_val`` - confirm callers never rely on empty loops.
-    """
-    def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
-        """Store the builder, the loop bounds/step (IR values) and the names used for blocks and the phi node."""
-        self.builder = builder
-        self.start_val = start_val
-        self.stop_val = stop_val
-        self.step_val = step_val
-        self.loop_name = loop_name
-        self.phi_name = phi_name
-    def __enter__(self):
-        """Emit the loop skeleton and return the phi node that carries the loop counter."""
-        self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
-                                                        self.phi_name)
-        return phi
-    def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
-        """Create the loop, increment/compare and after blocks; returns ``(loop_end_bb, after_bb, phi)``.
-        On return the builder is positioned at the end of the loop block, ready to receive the body."""
-        # TODO size of int??? unsigned???
-        integer = ir.IntType(64)
-        # Loop block
-        pre_loop_bb = self.builder.block
-        loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
-        # Explicit branch from the current block into loop_bb
-        self.builder.branch(loop_bb)
-        self.builder.position_at_start(loop_bb)
-        # Add phi: counter is start_val when coming from the pre-loop block
-        phi = self.builder.phi(integer, name=phi_name)
-        phi.add_incoming(start_val, pre_loop_bb)
-        # Increment/compare block: computes the next counter and decides whether to iterate again
-        loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
-        self.builder.position_at_start(loop_end_bb)
-        next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
-        cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
-        after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
-        self.builder.cbranch(cond, loop_bb, after_bb)
-        # ... and the incremented counter when coming from the increment/compare block
-        phi.add_incoming(next_var, loop_end_bb)
-        self.builder.position_at_end(loop_bb)
-        return loop_end_bb, after_bb, phi
-    def __exit__(self, exc_type, exc, exc_tb):
-        """Terminate the body with a branch to the increment/compare block and continue emitting after the loop."""
-        self.builder.branch(self.loop_end)
-        self.builder.position_at_end(self.after)
--- a/pystencils/llvm/kernelcreation.py
+++ b/pystencils/llvm/kernelcreation.py
-from pystencils.enums import Target, Backend
-from pystencils.llvm.llvmjit import make_python_function
-from pystencils.transformations import insert_casts
-def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
-                  iteration_slice=None, ghost_layers=None, target=Target.CPU):
-    """
-    Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
-    Loops are created according to the field accesses in the equations.
-    Args:
-        assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
-                     Defining the update rules of the kernel
-        function_name: name of the generated function - only important if generated code is written out
-        type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
-                   be of type 'double' except symbols which occur on the left hand side of equations where the
-                   right hand side is a sympy Boolean which are assumed to be 'bool' .
-        split_groups: Specification on how to split up inner loop into multiple loops. For details see
-                      transformation :func:`pystencils.transformation.split_inner_loop`
-                      (only forwarded for the CPU target)
-        iteration_slice: if not None, iteration is done only over this slice of the field
-        ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
-                      if None, the number of ghost layers is determined automatically and assumed to be equal for
-                      all dimensions
-        target: :class:`pystencils.enums.Target` the kernel is created for (CPU or GPU)
-    Returns:
-        :class:`pystencils.ast.KernelFunction` node, tagged with the LLVM backend and LLVM compile function
-    Raises:
-        NotImplementedError: if ``target`` is neither ``Target.CPU`` nor ``Target.GPU``
-    """
-    if target == Target.CPU:
-        from pystencils.cpu import create_kernel
-        code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
-    elif target == Target.GPU:
-        from pystencils.gpucuda.kernelcreation import create_cuda_kernel
-        code = create_cuda_kernel(assignments,
-                                  function_name,
-                                  type_info,
-                                  iteration_slice=iteration_slice,
-                                  ghost_layers=ghost_layers)
-    else:
-        # The exception has to be raised; merely constructing it would fall through to the code below
-        # and fail with an unrelated UnboundLocalError on ``code``.
-        raise NotImplementedError(f'{target} is not implemented for LLVM kernel creation')
-    code._backend = Backend.LLVM
-    code.body = insert_casts(code.body)
-    code._compile_function = make_python_function
-    return code
--- a/pystencils/llvm/llvm.py
+++ b/pystencils/llvm/llvm.py
-import functools
-import llvmlite.ir as ir
-import llvmlite.llvmpy.core as lc
-import sympy as sp
-from sympy import Indexed, S
-from sympy.printing.printer import Printer
-from pystencils import Target
-from pystencils.assignment import Assignment
-from pystencils.data_types import (
-    collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
-    to_llvm_type)
-from pystencils.llvm.control_flow import Loop
-# From Numba
-def set_cuda_kernel(lfunc):
-    """Mark ``lfunc`` as a kernel via the module's 'nvvm.annotations' metadata and set 'nvvmir.version'."""
-    from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
-    module = lfunc.module
-    # annotation entry: (function, "kernel", 1)
-    kernel_tag = MetaDataString.get(module, "kernel")
-    one = Constant.int(Type.int(), 1)
-    annotation = MetaData.get(module, (lfunc, kernel_tag, one))
-    module.get_or_insert_named_metadata('nvvm.annotations').add(annotation)
-    # set nvvm ir version
-    i32 = ir.IntType(32)
-    version = module.add_metadata([i32(1), i32(2), i32(2), i32(0)])
-    module.add_named_metadata('nvvmir.version', version)
-# From Numba
-def _call_sreg(builder, name):
-    """Emit a call to the argument-less, int-returning function ``name``, declaring it in the module if needed."""
-    sreg_type = lc.Type.function(lc.Type.int(), ())
-    sreg_fn = builder.module.get_or_insert_function(sreg_type, name=name)
-    return builder.call(sreg_fn, ())
-def generate_llvm(ast_node, module=None, builder=None, target=Target.CPU):
-    """Print ``ast_node`` as LLVM IR and return whatever the printer produced for it.
-    A fresh module and/or IR builder is created when none is passed in."""
-    module = lc.Module() if module is None else module
-    builder = ir.IRBuilder() if builder is None else builder
-    return LLVMPrinter(module, builder, target=target)._print(ast_node)
-# noinspection PyPep8Naming
-class LLVMPrinter(Printer):
-    """Convert expressions to LLVM IR
-    The class plugs into sympy's ``Printer._print`` dispatch: every ``_print_<ClassName>`` method emits
-    instructions through ``self.builder`` and returns the resulting llvmlite value (block-like nodes return
-    nothing). Function arguments and assigned symbols are tracked by name in ``self.func_arg_map``, loop
-    counters in ``self.tmp_var`` and declared external/kernel functions in ``self.ext_fn``.
-    """
-    def __init__(self, module, builder, fn=None, target=Target.CPU, *args, **kwargs):
-        """
-        Args:
-            module: llvmlite module that receives function declarations and definitions
-            builder: llvmlite IR builder used to emit instructions
-            fn: function currently being generated (set by ``_print_KernelFunction``)
-            target: :class:`Target`; ``Target.GPU`` switches on the NVVM specific handling
-            **kwargs: ``func_arg_map`` (name -> IR value) is consumed here, the rest goes to ``Printer``
-        """
-        self.func_arg_map = kwargs.pop("func_arg_map", {})
-        super(LLVMPrinter, self).__init__(*args, **kwargs)
-        self.fp_type = ir.DoubleType()
-        self.fp_pointer = self.fp_type.as_pointer()
-        self.integer = ir.IntType(64)
-        self.integer_pointer = self.integer.as_pointer()
-        self.void = ir.VoidType()
-        self.module = module
-        self.builder = builder
-        self.fn = fn
-        self.ext_fn = {}  # keep track of wrappers to external functions
-        self.tmp_var = {}
-        self.target = target
-    def _add_tmp_var(self, name, value):
-        self.tmp_var[name] = value
-    def _remove_tmp_var(self, name):
-        del self.tmp_var[name]
-    def _print_Number(self, n):
-        if get_type_of_expression(n) == create_type("int"):
-            return ir.Constant(self.integer, int(n))
-        elif get_type_of_expression(n) == create_type("double"):
-            return ir.Constant(self.fp_type, float(n))
-        else:
-            raise NotImplementedError("Numbers can only have int and double", n)
-    def _print_Float(self, expr):
-        return ir.Constant(self.fp_type, float(expr))
-    def _print_Integer(self, expr):
-        return ir.Constant(self.integer, int(expr))
-    def _print_int(self, i):
-        return ir.Constant(self.integer, i)
-    def _print_Symbol(self, s):
-        """Resolve a symbol to its IR value: loop counters first, then function arguments / assigned symbols."""
-        val = self.tmp_var.get(s)
-        if val is None:
-            # look up parameter with name s
-            val = self.func_arg_map.get(s.name)
-        if val is None:
-            raise LookupError(f"Symbol not found: {s}")
-        return val
-    def _print_Pow(self, expr):
-        """Emit a power: reciprocal, sqrt, square and cube are special-cased, everything else calls ``pow``."""
-        base0 = self._print(expr.base)
-        if expr.exp == S.NegativeOne:
-            return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
-        if expr.exp == S.Half:
-            fn = self.ext_fn.get("sqrt")
-            if not fn:
-                fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
-                fn = ir.Function(self.module, fn_type, "sqrt")
-                self.ext_fn["sqrt"] = fn
-            return self.builder.call(fn, [base0], "sqrt")
-        if expr.exp == 2:
-            return self.builder.fmul(base0, base0)
-        elif expr.exp == 3:
-            return self.builder.fmul(self.builder.fmul(base0, base0), base0)
-        exp0 = self._print(expr.exp)
-        fn = self.ext_fn.get("pow")
-        if not fn:
-            fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
-            fn = ir.Function(self.module, fn_type, "pow")
-            self.ext_fn["pow"] = fn
-        return self.builder.call(fn, [base0, exp0], "pow")
-    def _print_Mul(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        if get_type_of_expression(expr) == create_type('double'):
-            mul = self.builder.fmul
-        else:  # int TODO unsigned/signed
-            mul = self.builder.mul
-        for node in nodes[1:]:
-            e = mul(e, node)
-        return e
-    def _print_Add(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        if get_type_of_expression(expr) == create_type('double'):
-            add = self.builder.fadd
-        else:  # int TODO unsigned/signed
-            add = self.builder.add
-        for node in nodes[1:]:
-            e = add(e, node)
-        return e
-    def _print_Or(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        for node in nodes[1:]:
-            e = self.builder.or_(e, node)
-        return e
-    def _print_And(self, expr):
-        nodes = [self._print(a) for a in expr.args]
-        e = nodes[0]
-        for node in nodes[1:]:
-            e = self.builder.and_(e, node)
-        return e
-    def _print_StrictLessThan(self, expr):
-        return self._comparison('<', expr)
-    def _print_LessThan(self, expr):
-        return self._comparison('<=', expr)
-    def _print_StrictGreaterThan(self, expr):
-        return self._comparison('>', expr)
-    def _print_GreaterThan(self, expr):
-        return self._comparison('>=', expr)
-    def _print_Unequality(self, expr):
-        return self._comparison('!=', expr)
-    def _print_Equality(self, expr):
-        return self._comparison('==', expr)
-    def _comparison(self, cmpop, expr):
-        """Emit ``lhs <cmpop> rhs``: unordered float compare if the collated type is double, else signed int."""
-        if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
-            comparison = self.builder.fcmp_unordered
-        else:
-            comparison = self.builder.icmp_signed
-        return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
-    def _print_KernelFunction(self, func):
-        """Define the kernel as a void LLVM function, emit its body and return the llvmlite function."""
-        # KernelFunction does not possess a return type
-        return_type = self.void
-        parameter_type = []
-        parameters = func.get_parameters()
-        for parameter in parameters:
-            parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == Target.GPU))
-        func_type = ir.FunctionType(return_type, tuple(parameter_type))
-        name = func.function_name
-        fn = ir.Function(self.module, func_type, name)
-        self.ext_fn[name] = fn
-        # set proper names to arguments
-        for i, arg in enumerate(fn.args):
-            arg.name = parameters[i].symbol.name
-            self.func_arg_map[parameters[i].symbol.name] = arg
-        # func.attributes.add("inlinehint")
-        # func.attributes.add("argmemonly")
-        block = fn.append_basic_block(name="entry")
-        self.builder = ir.IRBuilder(block)  # TODO use goto_block instead
-        self._print(func.body)
-        self.builder.ret_void()
-        self.fn = fn
-        if self.target == Target.GPU:
-            set_cuda_kernel(fn)
-        return fn
-    def _print_Block(self, block):
-        for node in block.args:
-            self._print(node)
-    def _print_LoopOverCoordinate(self, loop):
-        """Emit the loop skeleton and its body; the loop counter is visible as a temporary while inside."""
-        with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
-                  loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
-            self._add_tmp_var(loop.loop_counter_symbol, i)
-            self._print(loop.body)
-            self._remove_tmp_var(loop.loop_counter_symbol)
-    def _print_SympyAssignment(self, assignment):
-        """Store into memory for ``Indexed`` left hand sides, otherwise bind the value to the symbol name."""
-        expr = self._print(assignment.rhs)
-        lhs = assignment.lhs
-        if isinstance(lhs, Indexed):
-            ptr = self._print(lhs.base.label)
-            index = self._print(lhs.args[1])
-            gep = self.builder.gep(ptr, [index])
-            return self.builder.store(expr, gep)
-        self.func_arg_map[assignment.lhs.name] = expr
-        return expr
-    def _print_boolean_cast_func(self, conversion):
-        return self._print_cast_func(conversion)
-    def _print_cast_func(self, conversion):
-        """Emit the conversion instruction for a cast node and return the converted value.
-        Raises ``KeyError`` for (from, to) type pairs that are missing from the table below."""
-        node = self._print(conversion.args[0])
-        to_dtype = get_type_of_expression(conversion)
-        from_dtype = get_type_of_expression(conversion.args[0])
-        if from_dtype == to_dtype:
-            # Reuse the value emitted above; printing the argument a second time would duplicate its IR.
-            return node
-        # (From, to)
-        decision = {
-            (create_composite_type_from_string("int32"),
-             create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
-            (create_composite_type_from_string("int16"),
-             create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
-            (create_composite_type_from_string("int16"),
-             create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
-            (create_composite_type_from_string("double"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
-            (create_composite_type_from_string("double *"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
-                                                                               node, self.fp_pointer),
-            (create_composite_type_from_string("double * restrict"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
-                                                                                        self.fp_pointer),
-            (create_composite_type_from_string("double * restrict const"),
-             create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
-                                                                          self.integer),
-            (create_composite_type_from_string("int"),
-             create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
-                                                                                              node, self.fp_pointer),
-        }
-        # TODO float, TEST: const, restrict
-        # TODO bitcast, addrspacecast
-        # TODO unsigned/signed fills
-        # print([x for x in decision.keys()])
-        # print("Types:")
-        # print([(type(x), type(y)) for (x, y) in decision.keys()])
-        # print("Cast:")
-        # print((from_dtype, to_dtype))
-        return decision[(from_dtype, to_dtype)]()
-    def _print_pointer_arithmetic_func(self, pointer):
-        ptr = self._print(pointer.args[0])
-        index = self._print(pointer.args[1])
-        return self.builder.gep(ptr, [index])
-    def _print_Indexed(self, indexed):
-        ptr = self._print(indexed.base.label)
-        index = self._print(indexed.args[1])
-        gep = self.builder.gep(ptr, [index])
-        return self.builder.load(gep, name=indexed.base.label.name)
-    def _print_Piecewise(self, piece):
-        """Emit a chain of conditional branches that merge in a phi node; returns the phi.
-        Raises ``ValueError`` without a default branch and ``NotImplementedError`` for contained assignments."""
-        if not piece.args[-1].cond:
-            # We need the last conditional to be a True, otherwise the resulting
-            # function may not return a result.
-            raise ValueError("All Piecewise expressions must contain an "
-                             "(expr, True) statement to be used as a default "
-                             "condition. Without one, the generated "
-                             "expression may not evaluate to anything under "
-                             "some condition.")
-        if piece.has(Assignment):
-            raise NotImplementedError('The llvm-backend does not support assignments '
-                                      'in the Piecewise function. It is questionable '
-                                      'whether to implement it. So far there is no '
-                                      'use-case to test it.')
-        phi_data = []
-        after_block = self.builder.append_basic_block()
-        for (expr, condition) in piece.args:
-            if condition == sp.sympify(True):  # Don't use 'is' use '=='!
-                phi_data.append((self._print(expr), self.builder.block))
-                self.builder.branch(after_block)
-                self.builder.position_at_end(after_block)
-            else:
-                cond = self._print(condition)
-                true_block = self.builder.append_basic_block()
-                false_block = self.builder.append_basic_block()
-                self.builder.cbranch(cond, true_block, false_block)
-                self.builder.position_at_end(true_block)
-                # The phi predecessor is the block the builder ends up in AFTER printing expr: printing may
-                # itself open new blocks (e.g. a nested Piecewise), so it is not necessarily true_block.
-                phi_data.append((self._print(expr), self.builder.block))
-                self.builder.branch(after_block)
-                self.builder.position_at_end(false_block)
-        phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == Target.GPU))
-        for (val, block) in phi_data:
-            phi.add_incoming(val, block)
-        return phi
-    def _print_Conditional(self, node):
-        cond = self._print(node.condition_expr)
-        with self.builder.if_else(cond) as (then, otherwise):
-            with then:
-                self._print(node.true_block)       # emit instructions for when the predicate is true
-            with otherwise:
-                self._print(node.false_block)       # emit instructions for when the predicate is false
-        # No return!
-    def _print_Function(self, expr):
-        """Emit a call to an external ``double f(double)`` named like the sympy function (first argument only)."""
-        name = expr.func.__name__
-        e0 = self._print(expr.args[0])
-        fn = self.ext_fn.get(name)
-        if not fn:
-            fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
-            fn = ir.Function(self.module, fn_type, name)
-            self.ext_fn[name] = fn
-        return self.builder.call(fn, [e0], name)
-    def empty_printer(self, expr):
-        """Raise ``TypeError`` describing an expression that cannot be converted.
-        NOTE(review): sympy's ``Printer`` falls back to an attribute named ``emptyPrinter``; this method has a
-        different name, so it presumably is not reached through ``_print`` - confirm and alias if intended."""
-        try:
-            import inspect
-            # getmro expects a class, not an instance
-            mro = inspect.getmro(type(expr))
-        except AttributeError:
-            mro = "None"
-        raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
-                        % (expr, type(expr), mro))
-    # from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
-    INDEXING_FUNCTION_MAPPING = {
-        'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
-        'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
-        'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
-        'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
-    }
-    def _print_ThreadIndexingSymbol(self, node):
-        """Map a symbol named ``<blockIdx|threadIdx|blockDim|gridDim>.<dim>`` to the matching NVVM intrinsic
-        call and zero-extend the result to the printer's integer type."""
-        symbol_name: str = node.name
-        function_name, dimension = tuple(symbol_name.split("."))
-        function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
-        name = f"{function_name}.{dimension}"
-        return self.builder.zext(_call_sreg(self.builder, name), self.integer)
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
-import ctypes as ct
-import subprocess
-from functools import partial
-from itertools import chain
-from os.path import exists, join
-import llvmlite.binding as llvm
-import llvmlite.ir as ir
-import numpy as np
-from pystencils.data_types import create_composite_type_from_string
-from pystencils.enums import Target
-from pystencils.field import FieldType
-from ..data_types import StructType, ctypes_from_llvm, to_ctypes
-from .llvm import generate_llvm
-def build_ctypes_argument_list(parameter_specification, argument_dict):
-    """Convert the keyword arguments of a kernel call into the positional ctypes argument list.
-    Args:
-        parameter_specification: kernel parameters, in call order (as returned by ``get_parameters()``)
-        argument_dict: maps field names to arrays and symbol names to scalar values
-    Returns:
-        list of ctypes values, one per parameter
-    Raises:
-        KeyError: a field or scalar parameter is missing from ``argument_dict``
-        ValueError: an array does not match the fixed shape/strides of its field, or the passed
-                    (index) arrays differ in their spatial shape
-    """
-    argument_dict = {k: v for k, v in argument_dict.items()}
-    ct_arguments = []
-    array_shapes = set()
-    index_arr_shapes = set()
-    for param in parameter_specification:
-        if param.is_field_parameter:
-            try:
-                field_arr = argument_dict[param.field_name]
-            except KeyError:
-                raise KeyError("Missing field parameter for kernel call " + param.field_name)
-            symbolic_field = param.fields[0]
-            if param.is_field_pointer:
-                ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
-                if symbolic_field.has_fixed_shape:
-                    symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
-                    if isinstance(symbolic_field.dtype, StructType):
-                        symbolic_field_shape = symbolic_field_shape[:-1]
-                    if symbolic_field_shape != field_arr.shape:
-                        # report the shape that was actually compared (trimmed for struct types)
-                        raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
-                                         (param.field_name, str(field_arr.shape), str(symbolic_field_shape)))
-                    # symbolic strides are counted in elements, numpy strides in bytes
-                    symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
-                    if isinstance(symbolic_field.dtype, StructType):
-                        symbolic_field_strides = symbolic_field_strides[:-1]
-                    if symbolic_field_strides != field_arr.strides:
-                        raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
-                                         (param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
-                if FieldType.is_indexed(symbolic_field):
-                    index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
-                elif FieldType.is_generic(symbolic_field):
-                    array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
-            elif param.is_field_shape:
-                data_type = to_ctypes(param.symbol.dtype)
-                ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
-            elif param.is_field_stride:
-                data_type = to_ctypes(param.symbol.dtype)
-                # strides are passed to the kernel in units of elements, not bytes
-                assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
-                item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
-                ct_arguments.append(data_type(item_stride))
-            else:
-                assert False
-        else:
-            try:
-                value = argument_dict[param.symbol.name]
-            except KeyError:
-                raise KeyError("Missing parameter for kernel call " + param.symbol.name)
-            expected_type = to_ctypes(param.symbol.dtype)
-            ct_arguments.append(expected_type(value))
-    if len(array_shapes) > 1:
-        raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
-    if len(index_arr_shapes) > 1:
-        # report the index array shapes here, not the shapes of the generic arrays
-        raise ValueError("All passed index arrays have to have the same size " + str(index_arr_shapes))
-    return ct_arguments
-def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
-    """Build a keyword-argument wrapper around ``func`` for kernels whose arguments are not all known yet.
-    The wrapper merges its keyword arguments into ``argument_dict``, converts them to ctypes and calls
-    ``func``; converted argument lists are cached per combination of array buffers / object identities."""
-    parameters = kernel_function_node.get_parameters()
-    cache = {}
-    cache_values = []
-    def _fingerprint(name, value):
-        # arrays are identified by buffer address and layout, everything else by object identity
-        if isinstance(value, np.ndarray):
-            return name, value.ctypes.data, value.strides, value.shape
-        return name, id(value)
-    def wrapper(**kwargs):
-        key = hash(tuple(_fingerprint(k, v) for k, v in kwargs.items()))
-        try:
-            args = cache[key]
-            func(*args)
-        except KeyError:
-            merged_arguments = argument_dict.copy()
-            merged_arguments.update(kwargs)
-            args = build_ctypes_argument_list(parameters, merged_arguments)
-            cache[key] = args
-            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(*args)
-    wrapper.ast = kernel_function_node
-    wrapper.parameters = kernel_function_node.get_parameters()
-    return wrapper
-def generate_and_jit(ast):
-    """Generate LLVM IR for ``ast`` and compile it for ``ast.target``; returns the jit object."""
-    target = ast.target
-    generated = generate_llvm(ast, target=target)
-    # the generator hands back either a module or something that lives in one
-    module = generated if isinstance(generated, ir.Module) else generated.module
-    return compile_llvm(module, target, ast)
-def make_python_function(ast, argument_dict=None, func=None):
-    """Return a Python callable for the kernel ``ast``, compiling it first unless ``func`` is given.
-    With a complete ``argument_dict`` the result takes no arguments; otherwise a wrapper expecting the
-    remaining values as keyword arguments is returned."""
-    argument_dict = {} if argument_dict is None else argument_dict
-    if func is None:
-        func = generate_and_jit(ast).get_function_ptr(ast.function_name)
-    try:
-        args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
-    except KeyError:
-        # not all parameters specified yet
-        return make_python_function_incomplete_params(ast, argument_dict, func)
-    def call_kernel():
-        return func(*args)
-    return call_kernel
-def compile_llvm(module, target=Target.CPU, ast=None):
-    """Parse, optimize and compile ``module`` with the jit matching ``target``; returns the jit object."""
-    if target == Target.GPU:
-        jit = CudaJit(ast)
-    else:
-        jit = Jit()
-    jit.parse(module)
-    jit.optimize()
-    jit.compile()
-    return jit
-class Jit(object):
-    def __init__(self):
-        llvm.initialize()
-        llvm.initialize_all_targets()
-        llvm.initialize_native_target()
-        llvm.initialize_native_asmprinter()
-        self.module = None
-        self._llvmmod = llvm.parse_assembly("")
-        self.target = llvm.Target.from_default_triple()
-        self.cpu = llvm.get_host_cpu_name()
-        try:
-            self.cpu_features = llvm.get_host_cpu_features()
-            self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
-                                                                    opt=2)
-        except RuntimeError:
-            self.target_machine = self.target.create_target_machine(cpu=self.cpu, opt=2)
-        llvm.check_jit_execution()
-        self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
-        self.ee.finalize_object()
-        self.fptr = None
-    @property
-    def llvmmod(self):
-        return self._llvmmod
-    @llvmmod.setter
-    def llvmmod(self, mod):
-        self.ee.remove_module(self.llvmmod)
-        self.ee.add_module(mod)
-        self.ee.finalize_object()
-        self.compile()
-        self._llvmmod = mod
-    def parse(self, module):
-        self.module = module
-        llvmmod = llvm.parse_assembly(str(module))
-        llvmmod.verify()
-        llvmmod.triple = self.target.triple
-        llvmmod.name = 'module'
-        self.llvmmod = llvmmod
-    def write_ll(self, file):
-        with open(file, 'w') as f:
-            f.write(str(self.llvmmod))
-    def write_assembly(self, file):
-        with open(file, 'w') as f:
-            f.write(self.target_machine.emit_assembly(self.llvmmod))
-    def write_object_file(self, file):
-        with open(file, 'wb') as f:
-            f.write(self.target_machine.emit_object(self.llvmmod))
-    def optimize(self):
-        pmb = llvm.create_pass_manager_builder()
-        pmb.opt_level = 2
-        pmb.disable_unit_at_a_time = False
-        pmb.loop_vectorize = True
-        pmb.slp_vectorize = True
-        # TODO possible to pass for functions
-        pm = llvm.create_module_pass_manager()
-        pm.add_instruction_combining_pass()
-        pm.add_function_attrs_pass()
-        pm.add_constant_merge_pass()
-        pm.add_licm_pass()
-        pmb.populate(pm)
-        pm.run(self.llvmmod)
-    def compile(self):
-        fptr = {}
-        for func in self.module.functions:
-            if not func.is_declaration:
-                return_type = None
-                if func.ftype.return_type != ir.VoidType():
-                    return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
-                args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
-                function_address = self.ee.get_function_address(func.name)
-                fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
-        self.fptr = fptr
-    def __call__(self, func, *args, **kwargs):
-        target_function = next(f for f in self.module.functions if f.name == func)
-        arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
-        transformed_args = []
-        for i, arg in enumerate(args):
-            if isinstance(arg, np.ndarray):
-                transformed_args.append(arg.ctypes.data_as(arg_types[i]))
-            else:
-                transformed_args.append(arg)
-        self.fptr[func](*transformed_args)
-    def print_functions(self):
-        for f in self.module.functions:
-            print(f.ftype.return_type, f.name, f.args)
-    def get_function_ptr(self, name):
-        fptr = self.fptr[name]
-        fptr.jit = self
-        return fptr
-# Following code more or less from numba
-class CudaJit(Jit):
-    CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
-                   64: 'nvptx64-nvidia-cuda'}
-    MACHINE_BITS = tuple.__itemsize__ * 8
-    data_layout = {
-        32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
-             'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
-        64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
-             'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
-    default_data_layout = data_layout[MACHINE_BITS]
-    def __init__(self, ast):
-        # super().__init__()
-        # self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
-        self._data_layout = self.default_data_layout[self.MACHINE_BITS]
-        # self._target_data = llvm.create_target_data(self._data_layout)
-        self.indexing = ast.indexing
-    def optimize(self):
-        pmb = llvm.create_pass_manager_builder()
-        pmb.opt_level = 2
-        pmb.disable_unit_at_a_time = False
-        pmb.loop_vectorize = False
-        pmb.slp_vectorize = False
-        # TODO possible to pass for functions
-        pm = llvm.create_module_pass_manager()
-        pm.add_instruction_combining_pass()
-        pm.add_function_attrs_pass()
-        pm.add_constant_merge_pass()
-        pm.add_licm_pass()
-        pmb.populate(pm)
-        pm.run(self.llvmmod)
-        pm.run(self.llvmmod)
-    def write_ll(self, file):
-        with open(file, 'w') as f:
-            f.write(str(self.llvmmod))
-    def parse(self, module):
-        llvmmod = module
-        llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
-        llvmmod.data_layout = self.default_data_layout
-        llvmmod.verify()
-        llvmmod.name = 'module'
-        self._llvmmod = llvm.parse_assembly(str(llvmmod))
-    def compile(self):
-        from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
-        import hashlib
-        compiler_cache = get_cache_config()['object_cache']
-        ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
-        ptx_file = ir_file.replace('.ll', '.ptx')
-        try:
-            from pycuda.driver import Context
-            arch = "sm_%d%d" % Context.get_device().compute_capability()
-        except Exception:
-            arch = "sm_35"
-        if not exists(ptx_file):
-            self.write_ll(ir_file)
-            if 'llc' in get_compiler_config():
-                llc_command = get_compiler_config()['llc']
-            else:
-                llc_command = get_llc_command() or 'llc'
-            subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
-        # cubin_file = ir_file.replace('.ll', '.cubin')
-        # if not exists(cubin_file):
-            # subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
-        import pycuda.driver
-        cuda_module = pycuda.driver.module_from_file(ptx_file)  # also works: cubin_file
-        self.cuda_module = cuda_module
-    def __call__(self, func, *args, **kwargs):
-        shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
-        block_and_thread_numbers = self.indexing.call_parameters(shape)
-        block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-        block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
-        self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
-    def get_function_ptr(self, name):
-        return partial(self._call__, name)
--- a/pystencils/opencl/__init__.py
+++ b/pystencils/opencl/__init__.py
-"""
-"""
-from pystencils.opencl.opencljit import (
-    clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
-__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
--- a/pystencils/opencl/autoinit.py
+++ b/pystencils/opencl/autoinit.py
-"""
-Automatically initializes OpenCL context using any device.
-Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
-"""
-from pystencils.opencl.opencljit import (
-    clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
-__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
-try:
-    init_globally()
-except Exception as e:
-    import warnings
-    warnings.warn(str(e))
--- a/pystencils/opencl/opencljit.py
+++ b/pystencils/opencl/opencljit.py
-import numpy as np
-from pystencils.backends.cbackend import get_headers
-from pystencils.backends.opencl_backend import generate_opencl
-from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
-from pystencils.include import get_pystencils_include_path
-from pystencils.kernel_wrapper import KernelWrapper
-USE_FAST_MATH = True
-_global_cl_ctx = None
-_global_cl_queue = None
-def get_global_cl_queue():
-    return _global_cl_queue
-def get_global_cl_ctx():
-    return _global_cl_ctx
-def init_globally(device_index=0):
-    import pyopencl as cl
-    global _global_cl_ctx
-    global _global_cl_queue
-    _global_cl_ctx = cl.create_some_context(device_index)
-    _global_cl_queue = cl.CommandQueue(_global_cl_ctx)
-def init_globally_with_context(opencl_ctx, opencl_queue):
-    global _global_cl_ctx
-    global _global_cl_queue
-    _global_cl_ctx = opencl_ctx
-    _global_cl_queue = opencl_queue
-def clear_global_ctx():
-    global _global_cl_ctx
-    global _global_cl_queue
-    _global_cl_ctx = None
-    _global_cl_queue = None
-def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
-    """
-    Creates a **OpenCL** kernel function from an abstract syntax tree which
-    was created for the ``target='Target.GPU'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
-    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
-    Args:
-        opencl_queue: a valid :class:`pyopencl.CommandQueue`
-        opencl_ctx: a valid :class:`pyopencl.Context`
-        kernel_function_node: the abstract syntax tree
-        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
-                       returned kernel functor.
-    Returns:
-        compiled kernel as Python function
-    """
-    import pyopencl as cl
-    if not opencl_ctx:
-        opencl_ctx = _global_cl_ctx
-    if not opencl_queue:
-        opencl_queue = _global_cl_queue
-    assert opencl_ctx, "No valid OpenCL context!\n" \
-        "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
-    assert opencl_queue, "No valid OpenCL queue!\n" \
-        "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
-    if argument_dict is None:
-        argument_dict = {}
-    # check if double precision is supported and required
-    if any([d.double_fp_config == 0 for d in opencl_ctx.devices]):
-        for param in kernel_function_node.get_parameters():
-            if param.symbol.dtype.base_type:
-                if param.symbol.dtype.base_type.numpy_dtype == np.float64:
-                    raise ValueError('OpenCL device does not support double precision')
-            else:
-                if param.symbol.dtype.numpy_dtype == np.float64:
-                    raise ValueError('OpenCL device does not support double precision')
-    # Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
-    kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
-    header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
-    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-    code = includes + "\n"
-    code += "#define FUNC_PREFIX __kernel\n"
-    code += "#define RESTRICT restrict\n\n"
-    code += str(generate_opencl(kernel_function_node, custom_backend=custom_backend))
-    options = []
-    if USE_FAST_MATH:
-        options.append("-cl-unsafe-math-optimizations")
-        options.append("-cl-mad-enable")
-        options.append("-cl-fast-relaxed-math")
-        options.append("-cl-finite-math-only")
-    options.append("-I")
-    options.append(get_pystencils_include_path())
-    mod = cl.Program(opencl_ctx, code).build(options=options)
-    func = getattr(mod, kernel_function_node.function_name)
-    parameters = kernel_function_node.get_parameters()
-    cache = {}
-    cache_values = []
-    def wrapper(**kwargs):
-        key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
-                         for k, v in kwargs.items()))
-        try:
-            args, block_and_thread_numbers = cache[key]
-            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
-        except KeyError:
-            full_arguments = argument_dict.copy()
-            full_arguments.update(kwargs)
-            assert not any(isinstance(a, np.ndarray)
-                           for a in full_arguments.values()), 'Calling a OpenCL kernel with a Numpy array!'
-            assert not any('pycuda' in str(type(a))
-                           for a in full_arguments.values()), 'Calling a OpenCL kernel with a PyCUDA array!'
-            shape = _check_arguments(parameters, full_arguments)
-            indexing = kernel_function_node.indexing
-            block_and_thread_numbers = indexing.call_parameters(shape)
-            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
-            block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
-                                                                                  block_and_thread_numbers['grid']))
-            args = _build_numpy_argument_list(parameters, full_arguments)
-            args = [a.data if hasattr(a, 'data') else a for a in args]
-            cache[key] = (args, block_and_thread_numbers)
-            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
-    wrapper.ast = kernel_function_node
-    wrapper.parameters = kernel_function_node.get_parameters()
-    wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
-    return wrapper
--- a/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
+++ b/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
-# FIXME
-# FIXME performance counters might be wrong. This will only affect the Benchmark model
-# FIXME bandwidth measurements need validation
-# FIXME
-kerncraft version: 0.7.2
-model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
-model type: Intel Core Skylake SP
-sockets: 2
-cores per socket: 4
-threads per core: 2
-NUMA domains per socket: 1
-cores per NUMA domain: 4
-clock: 3.6 GHz
-FLOPs per cycle:
-  SP:
-    total: 64
-    FMA: 64
-    ADD: 32
-    MUL: 32
-  DP:
-    total: 32
-    FMA: 32
-    ADD: 16
-    MUL: 16
-micro-architecture: SKX
-compiler:
-  !!omap
-  - icc: -O3 -fno-alias -xCORE-AVX512
-  - clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
-  - gcc: -O3 -march=skylake-avx512
-cacheline size: 64 B
-overlapping model:
-  ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
-  performance counter metric:
-          Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
-non-overlapping model:
-  ports: ["2D", "3D"]
-  performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
-memory hierarchy:
- level: L1
-  performance counter metrics:
-    accesses:  MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
-    misses: L1D_REPLACEMENT:PMC[0-3]
-    evicts: L2_TRANS_L1D_WB:PMC[0-3]
-  cache per group:
-    sets: 64
-    ways: 8
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: True
-    write_back: True
-    load_from: L2
-    store_to: L2
-  size per group: 32.00 kB
-  groups: 8
-  cores per group: 1
-  threads per group: 2
- level: L2
-  non-overlap upstream throughput: [64 B/cy, 'half-duplex']
-  performance counter metrics:
-    accesses: L1D_REPLACEMENT:PMC[0-3]
-    misses: L2_LINES_IN_ALL:PMC[0-3]
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-  cache per group:
-    sets: 1024
-    ways: 16
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: True
-    write_back: True
-    load_from: null  # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
-    victims_to: L3  # all victims, modified or not are passed onto L3
-    store_to: L3
-  size per group: 1.00 MB
-  groups: 8
-  cores per group: 1
-  threads per group: 2
- level: L3
-  non-overlap upstream throughput: [16 B/cy, 'full-duplex']
-  performance counter metrics:
-    accesses: L2_LINES_IN_ALL:PMC[0-3]
-    # FIXME not all misses in L2 lead to loads from L3, only the hits do
-    misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
-             CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
-             CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
-             CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
-             CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
-             CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-  cache per group:
-    sets: 16896
-    # TODO is actuall something else, but necessary to get to 16.5 MB
-    ways: 16
-    # TODO is actually 11, but pycachesim only supports powers of two
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: False
-    write_back: True
-  size per group: 16.50 MB
-  groups: 2
-  cores per group: 4
-  threads per group: 8
- level: MEM
-  cores per group: 4
-  threads per group: 8
-  non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
-  penalty cycles per read stream: 0
-  size per group:
-benchmarks:
-  kernels:
-    load:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 0
-        bytes: 0.00 B
-      FLOPs per iteration: 0
-    copy:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 0
-    update:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 1
-        bytes: 8.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 0
-    triad:
-      read streams:
-        streams: 3
-        bytes: 24.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 2
-    daxpy:
-      read streams:
-        streams: 2
-        bytes: 16.00 B
-      read+write streams:
-        streams: 1
-        bytes: 8.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 2
-  measurements:
-    L1:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        size per thread:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        total size:
-        - 21.12 kB
-        - 42.24 kB
-        - 63.36 kB
-        - 84.48 kB
-        results:
-          load:
-          - 42.98 GB/s
-          - 85.08 GB/s
-          - 127.45 GB/s
-          - 169.92 GB/s
-          copy:
-          - 56.07 GB/s
-          - 111.50 GB/s
-          - 164.90 GB/s
-          - 221.50 GB/s
-          update:
-          - 56.54 GB/s
-          - 112.25 GB/s
-          - 168.50 GB/s
-          - 224.75 GB/s
-          triad:
-          - 45.90 GB/s
-          - 89.81 GB/s
-          - 127.29 GB/s
-          - 169.57 GB/s
-          daxpy:
-          - 36.62 GB/s
-          - 71.30 GB/s
-          - 103.52 GB/s
-          - 135.26 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        size per thread:
-        - 10.56 kB
-        - 10.56 kB
-        - 10.56 kB
-        - 10.56 kB
-        total size:
-        - 21.12 kB
-        - 42.24 kB
-        - 63.36 kB
-        - 84.48 kB
-        results:
-          load:
-          - 49.61 GB/s
-          - 98.80 GB/s
-          - 147.98 GB/s
-          - 198.22 GB/s
-          copy:
-          - 55.98 GB/s
-          - 111.56 GB/s
-          - 167.08 GB/s
-          - 220.42 GB/s
-          update:
-          - 56.53 GB/s
-          - 112.72 GB/s
-          - 168.95 GB/s
-          - 225.31 GB/s
-          triad:
-          - 54.01 GB/s
-          - 104.58 GB/s
-          - 153.02 GB/s
-          - 200.93 GB/s
-          daxpy:
-          - 41.11 GB/s
-          - 80.28 GB/s
-          - 115.71 GB/s
-          - 152.81 GB/s
-    L2:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        size per thread:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        total size:
-        - 660.00 kB
-        - 1.32 MB
-        - 1.98 MB
-        - 2.64 MB
-        results:
-          load:
-          - 27.15 GB/s
-          - 54.09 GB/s
-          - 80.61 GB/s
-          - 106.41 GB/s
-          copy:
-          - 43.53 GB/s
-          - 90.07 GB/s
-          - 127.73 GB/s
-          - 171.81 GB/s
-          update:
-          - 50.38 GB/s
-          - 98.47 GB/s
-          - 147.91 GB/s
-          - 197.20 GB/s
-          triad:
-          - 43.38 GB/s
-          - 83.72 GB/s
-          - 124.83 GB/s
-          - 166.04 GB/s
-          daxpy:
-          - 36.29 GB/s
-          - 71.29 GB/s
-          - 103.33 GB/s
-          - 136.48 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        size per thread:
-        - 330.00 kB
-        - 330.00 kB
-        - 330.00 kB
-        - 330.00 kB
-        total size:
-        - 660.00 kB
-        - 1.32 MB
-        - 1.98 MB
-        - 2.64 MB
-        results:
-          load:
-          - 35.29 GB/s
-          - 70.28 GB/s
-          - 104.67 GB/s
-          - 139.63 GB/s
-          copy:
-          - 42.23 GB/s
-          - 83.70 GB/s
-          - 124.33 GB/s
-          - 167.50 GB/s
-          update:
-          - 50.09 GB/s
-          - 99.77 GB/s
-          - 149.87 GB/s
-          - 198.82 GB/s
-          triad:
-          - 52.38 GB/s
-          - 100.00 GB/s
-          - 147.40 GB/s
-          - 193.31 GB/s
-          daxpy:
-          - 41.14 GB/s
-          - 80.22 GB/s
-          - 116.23 GB/s
-          - 155.08 GB/s
-    L3:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        size per thread:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        total size:
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        results:
-          load:
-          - 22.40 GB/s
-          - 44.77 GB/s
-          - 65.71 GB/s
-          - 89.26 GB/s
-          copy:
-          - 25.32 GB/s
-          - 49.70 GB/s
-          - 72.89 GB/s
-          - 98.62 GB/s
-          update:
-          - 41.24 GB/s
-          - 81.14 GB/s
-          - 122.22 GB/s
-          - 166.44 GB/s
-          triad:
-          - 25.61 GB/s
-          - 50.02 GB/s
-          - 73.23 GB/s
-          - 98.95 GB/s
-          daxpy:
-          - 32.07 GB/s
-          - 62.65 GB/s
-          - 89.91 GB/s
-          - 120.65 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        size per thread:
-        - 5.28 MB
-        - 2.64 MB
-        - 1.76 MB
-        - 1.32 MB
-        total size:
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        results:
-          load:
-          - 26.18 GB/s
-          - 51.85 GB/s
-          - 75.82 GB/s
-          - 101.39 GB/s
-          copy:
-          - 26.22 GB/s
-          - 51.83 GB/s
-          - 76.40 GB/s
-          - 102.84 GB/s
-          update:
-          - 43.51 GB/s
-          - 86.75 GB/s
-          - 129.86 GB/s
-          - 174.54 GB/s
-          triad:
-          - 26.39 GB/s
-          - 51.80 GB/s
-          - 76.27 GB/s
-          - 102.66 GB/s
-          daxpy:
-          - 37.43 GB/s
-          - 73.16 GB/s
-          - 106.53 GB/s
-          - 142.76 GB/s
-    MEM:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        size per thread:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        total size:
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        results:
-          load:
-          - 12.03 GB/s
-          - 24.38 GB/s
-          - 34.83 GB/s
-          - 45.05 GB/s
-          copy:
-          - 12.32 GB/s
-          - 24.40 GB/s
-          - 32.82 GB/s
-          - 37.00 GB/s
-          update:
-          - 20.83 GB/s
-          - 40.25 GB/s
-          - 48.81 GB/s
-          - 54.84 GB/s
-          triad:
-          - 11.64 GB/s
-          - 23.17 GB/s
-          - 34.78 GB/s
-          - 42.97 GB/s
-          daxpy:
-          - 17.69 GB/s
-          - 34.02 GB/s
-          - 48.12 GB/s
-          - 55.73 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        size per thread:
-        - 120.00 MB
-        - 60.00 MB
-        - 40.00 MB
-        - 30.00 MB
-        total size:
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        results:
-          load:
-          - 15.33 GB/s
-          - 28.32 GB/s
-          - 41.34 GB/s
-          - 53.02 GB/s
-          copy:
-          - 13.96 GB/s
-          - 26.61 GB/s
-          - 34.39 GB/s
-          - 38.96 GB/s
-          update:
-          - 26.47 GB/s
-          - 47.82 GB/s
-          - 56.70 GB/s
-          - 62.78 GB/s
-          triad:
-          - 14.42 GB/s
-          - 26.66 GB/s
-          - 36.94 GB/s
-          - 44.01 GB/s
-          daxpy:
-          - 20.96 GB/s
-          - 39.12 GB/s
-          - 51.55 GB/s
-          - 58.37 GB/s
--- a/pystencils_tests/benchmark/benchmark.py
+++ b/pystencils_tests/benchmark/benchmark.py
-import math
-import os
-import time
-import numpy as np
-import sympy as sp
-from git import Repo
-from influxdb import InfluxDBClient
-from kerncraft.machinemodel import MachineModel
-from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
-from kerncraft.prefixedunit import PrefixedUnit
-from pystencils import Assignment, Field, create_kernel
-from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
-def output_benchmark(analysis):
-    output = {}
-    keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
-            'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
-            'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-    for cache, metrics in analysis['data transfers'].items():
-        for metric_name, metric_value in metrics.items():
-            fixed = metric_value.with_prefix('')
-            output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
-    for level, value in analysis['ECM'].items():
-        output['Phenomenological ECM ' + level + ' cy/CL'] = value
-    return output
-def output_ecm(analysis):
-    output = {}
-    keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-    if 'memory bandwidth kernel' in analysis:
-        output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
-               analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
-    output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
-    for key, value in analysis['cycles']:
-        output[key] = value
-    return output
-def output_roofline(analysis):
-    output = {}
-    keys = ['min performance']  # 'bottleneck level'
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-    # TODO save bottleneck information (compute it here)
-    # fixed = analysis['max_flops'].with_prefix('G')
-    # output['max GFlop/s'] = fixed.value
-    # if analysis['min performance'] > max_flops:
-    #    # CPU bound
-    #    print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
-    #    print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
-    # else:
-    # Memory bound
-    bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
-    output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
-    output['bottleneck level'] = bottleneck['level']
-    output['bottleneck bw kernel'] = bottleneck['bw kernel']
-    output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
-    for i, level in enumerate(analysis['mem bottlenecks']):
-        if level is None:
-            continue
-        for key, value in level.items():
-            if isinstance(value, PrefixedUnit):
-                fixed = value.with_prefix('G')
-                output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
-                    fixed.value, float) and math.isinf(fixed.value) else fixed.value
-            else:
-                output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
-                    value) else value
-    return output
-def output_roofline_iaca(analysis):
-    output = {}
-    keys = ['min performance']  # 'bottleneck level'
-    copies = {key: analysis[key] for key in keys}
-    # output.update(copies)
-    # TODO save bottleneck information (compute it here)
-    # fixed = analysis['max_flops'].with_prefix('G')
-    # output['max GFlop/s'] = fixed.value
-    # if analysis['min performance'] > max_flops:
-    #    # CPU bound
-    #    print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
-    #    print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
-    # else:
-    # Memory bound
-    bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
-    output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
-    output['bottleneck level'] = bottleneck['level']
-    output['bottleneck bw kernel'] = bottleneck['bw kernel']
-    output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
-    for i, level in enumerate(analysis['mem bottlenecks']):
-        if level is None:
-            continue
-        for key, value in level.items():
-            if isinstance(value, PrefixedUnit):
-                fixed = value.with_prefix('G')
-                output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
-                    fixed.value, float) and math.isinf(fixed.value) else fixed.value
-            else:
-                output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
-                    value) else value
-    return output
-def report_analysis(ast, models, machine, tags, fields=None):
-    kernel = PyStencilsKerncraftKernel(ast, machine)
-    client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
-                            'roggan', 'pystencils')
-    repo = Repo(search_parent_directories=True)
-    commit = repo.head.commit
-    point_time = int(time.time())
-    for model in models:
-        benchmark = model(kernel, machine, KerncraftParameters())
-        benchmark.analyze()
-        analysis = benchmark.results
-        if model is Benchmark:
-            output = output_benchmark(analysis)
-        elif model is ECM:
-            output = output_ecm(analysis)
-        elif model is Roofline:
-            output = output_roofline(analysis)
-        elif model is RooflineIACA:
-            output = output_roofline_iaca(analysis)
-        else:
-            raise ValueError('No valid model for analysis given!')
-        if fields is not None:
-            output.update(fields)
-        output['commit'] = commit.hexsha
-        json_body = [
-            {
-                'measurement': model.__name__,
-                'tags': tags,
-                'time': point_time,
-                'fields': output
-            }
-        ]
-        client.write_points(json_body, time_precision='s')
-def main():
-    size = [20, 200, 200]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
-    s = sp.Symbol("s")
-    rhs = a[0, -1, 0] + a[0, 1, 0] + \
-          a[-1, 0, 0] + a[1, 0, 0] + \
-          a[0, 0, -1] + a[0, 0, 1]
-    update_rule = Assignment(b[0, 0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    input_folder = "./"
-    machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
-    machine = MachineModel(path_to_yaml=machine_file_path)
-    tags = {
-        'host': os.uname()[1],
-        'project': 'pystencils',
-        'kernel': 'jacobi_3D ' + str(size)
-    }
-    report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
-if __name__ == '__main__':
-    main()
--- a/pystencils_tests/benchmark/generate.py
+++ b/pystencils_tests/benchmark/generate.py
-import numpy as np
-import sympy as sp
-from pystencils import Assignment, Field, create_kernel
-def meassure():
-    """Build a small stencil kernel and run kerncraft's ECMData model on it (analyze, then report)."""
-    size = [30, 50, 3]
-    arr = np.zeros(size)
-    # both fields are created from the same zero array with one index dimension
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
-    s = sp.Symbol("s")
-    # NOTE(review): only the first access selects an index via (0); the other three do not - confirm
-    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
-    updateRule = Assignment(b[0, 0], s * rhs)
-    print(updateRule)
-    ast = create_kernel([updateRule])
-    # Disabled: writing the generated benchmark sources to disk
-    # benchmark = generate_benchmark(ast)
-    # main = benchmark[0]
-    # kernel = benchmark[1]
-    # with open('src/main.cpp', 'w') as file:
-    #     file.write(main)
-    # with open('src/kernel.cpp', 'w') as file:
-    #     file.write(kernel)
-    # NOTE(review): the result is never used, and 'omega' does not occur in the update rule above
-    func = ast.compile({'omega': 2/3})
-    # imports are done locally, after the kernel was created and compiled
-    from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
-    from pystencils.kerncraft_coupling import BenchmarkAnalysis
-    from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
-    from kerncraft.machinemodel import MachineModel
-    from kerncraft.models import ECMData
-    # relative path, resolved against the current working directory
-    machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
-    machine = MachineModel(path_to_yaml=machineFilePath)
-    benchmark = BenchmarkAnalysis(ast, machine)
-    # TODO: decide what to do with benchmark; it is currently unused
-    # NOTE(review): the kernel wrapper is constructed without a machine argument - confirm against
-    # the constructor signature
-    kernel = PyStencilsKerncraftKernel(ast)
-    model = ECMData(kernel, machine, KerncraftParameters())
-    model.analyze()
-    model.report()
-if __name__ == "__main__":
-    meassure()
--- a/pystencils_tests/benchmark/iacaMarks.h
+++ b/pystencils_tests/benchmark/iacaMarks.h
-/*
-* Copyright (2008-2009) Intel Corporation All Rights Reserved. 
-* The source code contained or described herein and all documents 
-* related to the source code ("Material") are owned by Intel Corporation 
-* or its suppliers or licensors. Title to the Material remains with 
-* Intel Corporation or its suppliers and licensors. The Material 
-* contains trade secrets and proprietary and confidential information 
-* of Intel or its suppliers and licensors. The Material is protected 
-* by worldwide copyright and trade secret laws and treaty provisions. 
-* No part of the Material may be used, copied, reproduced, modified, 
-* published, uploaded, posted, transmitted, distributed, or disclosed 
-* in any way without Intel(R)s prior express written permission.
-*
-* No license under any patent, copyright, trade secret or other 
-* intellectual property right is granted to or conferred upon you by 
-* disclosure or delivery of the Materials, either expressly, by implication,
-* inducement, estoppel or otherwise. Any license under such intellectual 
-* property rights must be express and approved by Intel in writing.
-*/
-/* IACA_SSC_MARK(id): emits a marker sequence - id is loaded into ebx, followed by the
-   bytes 0x64 0x67 0x90. */
-#if defined (__GNUC__) 
-/* GNU-style inline assembly variant; "memory" is the only entry in the clobber list.
-   NOTE(review): ebx is overwritten but not listed as clobbered - confirm this is intended. */
-#define IACA_SSC_MARK( MARK_ID )						\
-__asm__ __volatile__ (									\
-					  "\n\t  movl $"#MARK_ID", %%ebx"	\
-					  "\n\t  .byte 0x64, 0x67, 0x90"	\
-					  : : : "memory" );
-#else
-/* Variant for all other compilers, using __asm / _emit syntax (presumably MSVC - confirm). */
-#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
-	__asm  _emit 0x64 \
-	__asm  _emit 0x67 \
-	__asm  _emit 0x90 }
-#endif
-/* Region markers: id 111 marks the start, id 222 the end. */
-#define IACA_START {IACA_SSC_MARK(111)}
-#define IACA_END {IACA_SSC_MARK(222)}
-/* 64-bit Windows: additional markers based on the __writegsbyte intrinsic from <intrin.h>. */
-#ifdef _WIN64
-#include <intrin.h>
-#define IACA_VC64_START __writegsbyte(111, 111);
-#define IACA_VC64_END   __writegsbyte(222, 222);
-#endif
-/**************** asm *****************
-;START_MARKER
-mov ebx, 111
-db 0x64, 0x67, 0x90
-;END_MARKER
-mov ebx, 222
-db 0x64, 0x67, 0x90
-**************************************/
--- a/pystencils_tests/benchmark/main.c
+++ b/pystencils_tests/benchmark/main.c
-#include "iacaMarks.h"
-/*
- * Minimal program exercising the IACA markers: IACA_START sits at the top of the loop body,
- * IACA_END directly after the loop. The sum is returned so that the loop has an observable result.
- */
-int main(int argc, char * argv[]){
-	/* Unsigned accumulator: 0 + 1 + ... + 100000 is larger than INT_MAX, and overflowing a
-	   signed int is undefined behaviour, whereas unsigned arithmetic wraps around. */
-	unsigned int a = 0;
-	/* the bound depends on argc, so it is not a compile-time constant */
-	for(int i = 0; i < argc+100000; i++){
-		IACA_START
-		a += (unsigned int)i;
-	}
-	IACA_END
-	return (int)a;
-}
--- a/pystencils_tests/kerncraft_inputs/2d-5pt.c
+++ b/pystencils_tests/kerncraft_inputs/2d-5pt.c
-/* 2D 5-point stencil: every interior b[j][i] becomes s times the sum of the four direct
-   neighbours of a[j][i]; the first and last index of both loops are skipped. */
-/* NOTE(review): a and b are declared with three dimensions but indexed with two subscripts -
-   confirm that the consumer of this file expects that. */
-double a[30][50][3];
-double b[30][50][3];
-double s;
-for(int j=1; j<30-1; ++j)
-    for(int i=1; i<50-1; ++i)
-        b[j][i] = ( a[j][i-1] + a[j][i+1]
-                  + a[j-1][i] + a[j+1][i]) * s;
--- a/pystencils_tests/kerncraft_inputs/3d-7pt.c
+++ b/pystencils_tests/kerncraft_inputs/3d-7pt.c
-/* 3D 7-point stencil: every interior b[k][j][i] becomes s times the sum of the six direct
-   neighbours of a[k][j][i]; the first and last index of all three loops are skipped. */
-/* M and N are not defined in this file - presumably supplied by the tool consuming it
-   (TODO confirm). */
-double a[M][N][N];
-double b[M][N][N];
-double s;
-for(int k=1; k<M-1; ++k)
-    for(int j=1; j<N-1; ++j)
-        for(int i=1; i<N-1; ++i)
-            b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
-                         + a[k][j-1][i] + a[k][j+1][i]
-                         + a[k-1][j][i] + a[k+1][j][i]) * s;
--- a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
+++ b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
-kerncraft version: 0.8.6.dev0
-model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
-model type: Intel Xeon SandyBridge EN/EP processor
-clock: 2.7 GHz
-sockets: 2
-cores per socket: 8
-threads per core: 2
-NUMA domains per socket: 1
-cores per NUMA domain: 8
-transparent hugepage: always
-in-core model: !!omap
-  - IACA: SNB
-  - OSACA: SNB
-  - LLVM-MCA: -mcpu=sandybridge
-isa: x86
-FLOPs per cycle:
-  SP: {total: 16, ADD: 8, MUL: 8}
-  DP: {total: 8, ADD: 4, MUL: 4}
-compiler: !!omap
- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
-overlapping model:
-  ports: 
-    IACA: ['0', 0DV, '1', '2', '3', '4', '5']
-    OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
-    LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
-  performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
-non-overlapping model:
-  ports: 
-    IACA: [2D, 3D]
-    OSACA: [2D, 3D]
-    LLVM-MCA: [SBPort23]
-  performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
-cacheline size: 64 B
-memory hierarchy:
- level: L1
-  cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
-    write_back: true, load_from: L2, store_to: L2}
-  cores per group: 1
-  threads per group: 2
-  groups: 16
-  performance counter metrics:
-    accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3]
-    misses: L1D_REPLACEMENT:PMC[0-3]
-    evicts: L1D_M_EVICT:PMC[0-3]
-  upstream throughput: [architecture code analyzer, [2D, 3D]]
-  transfers overlap: false
- level: L2
-  cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
-    write_back: true, load_from: L3, store_to: L3}
-  cores per group: 1
-  threads per group: 2
-  groups: 16
-  upstream throughput: [32 B/cy, half-duplex]
-  transfers overlap: false
-  performance counter metrics:
-    accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3]
-    misses: L2_LINES_IN_ALL:PMC[0-3]
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
-  cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true,
-    write_back: true}
-  cores per group: 8
-  threads per group: 16
-  groups: 2
-  upstream throughput: [32 B/cy, half-duplex]
-  transfers overlap: false
-  performance counter metrics:
-    accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3]
-    misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01]
-      + CAS_COUNT_RD:MBOX3C[01])
-    evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01]
-      + CAS_COUNT_WR:MBOX3C[01])
- level: MEM
-  cores per group: 8
-  upstream throughput: [full socket memory bandwidth, half-duplex]
-  transfers overlap: false
-  size per group:
-  threads per group: 16
-benchmarks:
-  kernels:
-    copy:
-      FLOPs per iteration: 0
-      fastest bench kernel: copy_avx
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 8.00 B, streams: 1}
-    daxpy:
-      FLOPs per iteration: 2
-      fastest bench kernel: daxpy_avx
-      read streams: {bytes: 16.00 B, streams: 2}
-      read+write streams: {bytes: 8.00 B, streams: 1}
-      write streams: {bytes: 8.00 B, streams: 1}
-    load:
-      FLOPs per iteration: 0
-      fastest bench kernel: load_avx
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 0.00 B, streams: 0}
-    triad:
-      FLOPs per iteration: 2
-      fastest bench kernel: triad_avx
-      read streams: {bytes: 24.00 B, streams: 3}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 8.00 B, streams: 1}
-    update:
-      FLOPs per iteration: 0
-      fastest bench kernel: update_avx
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 8.00 B, streams: 1}
-      write streams: {bytes: 8.00 B, streams: 1}
-  measurements:
-    L1:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96
-              GB/s, 578.56 GB/s, 660.60 GB/s]
-          daxpy: [116.88 GB/s, 233.68 GB/s, 311.60 GB/s, 409.72 GB/s, 509.79 GB/s,
-            559.65 GB/s, 612.77 GB/s, 719.71 GB/s]
-          load: [84.07 GB/s, 168.13 GB/s, 252.21 GB/s, 336.04 GB/s, 420.34 GB/s, 504.02
-              GB/s, 588.04 GB/s, 668.37 GB/s]
-          triad: [100.24 GB/s, 211.57 GB/s, 314.53 GB/s, 392.73 GB/s, 506.87 GB/s,
-            589.51 GB/s, 687.28 GB/s, 782.17 GB/s]
-          update: [84.77 GB/s, 160.10 GB/s, 237.12 GB/s, 312.74 GB/s, 392.54 GB/s,
-            465.53 GB/s, 516.02 GB/s, 567.27 GB/s]
-        size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
-          21.12 kB, 21.12 kB]
-        size per thread: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12
-            kB, 21.12 kB, 21.12 kB]
-        stats:
-          copy:
-          - [83.24 GB/s, 83.25 GB/s, 83.26 GB/s, 83.26 GB/s, 83.27 GB/s, 83.26 GB/s,
-            83.25 GB/s, 83.23 GB/s, 83.24 GB/s, 83.25 GB/s]
-          - [166.49 GB/s, 166.47 GB/s, 166.51 GB/s, 166.49 GB/s, 166.48 GB/s, 166.52
-              GB/s, 166.51 GB/s, 166.51 GB/s, 166.51 GB/s, 166.50 GB/s]
-          - [249.78 GB/s, 249.75 GB/s, 249.73 GB/s, 249.72 GB/s, 249.74 GB/s, 249.76
-              GB/s, 249.76 GB/s, 249.74 GB/s, 249.73 GB/s, 249.75 GB/s]
-          - [332.98 GB/s, 327.92 GB/s, 332.30 GB/s, 332.95 GB/s, 333.00 GB/s, 333.01
-              GB/s, 332.95 GB/s, 333.00 GB/s, 332.99 GB/s, 333.02 GB/s]
-          - [416.26 GB/s, 416.23 GB/s, 416.28 GB/s, 416.27 GB/s, 416.23 GB/s, 416.27
-              GB/s, 416.34 GB/s, 416.26 GB/s, 416.16 GB/s, 416.23 GB/s]
-          - [495.84 GB/s, 495.93 GB/s, 495.88 GB/s, 495.91 GB/s, 495.96 GB/s, 495.92
-              GB/s, 495.89 GB/s, 495.87 GB/s, 495.96 GB/s, 495.92 GB/s]
-          - [578.51 GB/s, 578.52 GB/s, 578.39 GB/s, 578.56 GB/s, 578.48 GB/s, 578.44
-              GB/s, 578.51 GB/s, 578.48 GB/s, 578.51 GB/s, 578.53 GB/s]
-          - [422.14 GB/s, 660.55 GB/s, 660.60 GB/s, 660.49 GB/s, 660.52 GB/s, 660.48
-              GB/s, 660.56 GB/s, 660.56 GB/s, 660.52 GB/s, 651.64 GB/s]
-          daxpy:
-          - [116.87 GB/s, 116.82 GB/s, 116.85 GB/s, 116.84 GB/s, 116.83 GB/s, 116.85
-              GB/s, 116.88 GB/s, 116.87 GB/s, 116.86 GB/s, 116.82 GB/s]
-          - [214.69 GB/s, 229.83 GB/s, 221.16 GB/s, 233.60 GB/s, 232.90 GB/s, 233.68
-              GB/s, 207.83 GB/s, 233.65 GB/s, 212.71 GB/s, 214.07 GB/s]
-          - [282.77 GB/s, 307.63 GB/s, 307.09 GB/s, 310.67 GB/s, 307.50 GB/s, 311.40
-              GB/s, 307.06 GB/s, 305.89 GB/s, 311.60 GB/s, 308.47 GB/s]
-          - [404.96 GB/s, 408.54 GB/s, 395.76 GB/s, 409.72 GB/s, 316.70 GB/s, 408.07
-              GB/s, 347.34 GB/s, 406.03 GB/s, 391.75 GB/s, 385.10 GB/s]
-          - [479.84 GB/s, 509.24 GB/s, 502.60 GB/s, 449.79 GB/s, 402.46 GB/s, 489.18
-              GB/s, 491.15 GB/s, 491.20 GB/s, 384.36 GB/s, 509.79 GB/s]
-          - [515.12 GB/s, 496.21 GB/s, 517.52 GB/s, 540.00 GB/s, 501.82 GB/s, 507.84
-              GB/s, 496.71 GB/s, 479.42 GB/s, 559.65 GB/s, 519.55 GB/s]
-          - [584.86 GB/s, 580.10 GB/s, 583.34 GB/s, 612.77 GB/s, 607.15 GB/s, 607.89
-              GB/s, 589.85 GB/s, 609.59 GB/s, 592.86 GB/s, 568.07 GB/s]
-          - [719.71 GB/s, 660.98 GB/s, 675.88 GB/s, 679.51 GB/s, 696.97 GB/s, 635.23
-              GB/s, 644.06 GB/s, 694.74 GB/s, 654.01 GB/s, 656.57 GB/s]
-          load:
-          - [84.04 GB/s, 84.06 GB/s, 84.06 GB/s, 84.04 GB/s, 84.05 GB/s, 84.05 GB/s,
-            84.07 GB/s, 84.04 GB/s, 84.05 GB/s, 84.06 GB/s]
-          - [168.09 GB/s, 168.12 GB/s, 168.06 GB/s, 168.11 GB/s, 168.12 GB/s, 168.13
-              GB/s, 168.13 GB/s, 168.12 GB/s, 168.10 GB/s, 168.13 GB/s]
-          - [252.16 GB/s, 252.21 GB/s, 252.07 GB/s, 252.07 GB/s, 252.18 GB/s, 252.16
-              GB/s, 252.21 GB/s, 252.20 GB/s, 252.20 GB/s, 252.17 GB/s]
-          - [335.94 GB/s, 336.03 GB/s, 335.99 GB/s, 336.04 GB/s, 336.00 GB/s, 335.98
-              GB/s, 335.97 GB/s, 335.89 GB/s, 335.99 GB/s, 336.03 GB/s]
-          - [420.30 GB/s, 420.18 GB/s, 420.30 GB/s, 420.33 GB/s, 420.25 GB/s, 420.28
-              GB/s, 420.31 GB/s, 420.31 GB/s, 420.34 GB/s, 420.33 GB/s]
-          - [503.98 GB/s, 503.99 GB/s, 503.97 GB/s, 503.98 GB/s, 504.02 GB/s, 503.99
-              GB/s, 503.92 GB/s, 503.98 GB/s, 503.94 GB/s, 503.97 GB/s]
-          - [587.93 GB/s, 588.01 GB/s, 588.04 GB/s, 587.94 GB/s, 587.97 GB/s, 588.01
-              GB/s, 588.00 GB/s, 587.92 GB/s, 588.04 GB/s, 588.02 GB/s]
-          - [668.21 GB/s, 668.22 GB/s, 668.29 GB/s, 668.24 GB/s, 668.27 GB/s, 668.37
-              GB/s, 668.28 GB/s, 668.14 GB/s, 668.19 GB/s, 668.19 GB/s]
-          triad:
-          - [100.00 GB/s, 99.71 GB/s, 99.74 GB/s, 100.24 GB/s, 99.72 GB/s, 99.62 GB/s,
-            99.54 GB/s, 99.61 GB/s, 99.72 GB/s, 99.71 GB/s]
-          - [208.08 GB/s, 210.33 GB/s, 211.57 GB/s, 208.34 GB/s, 210.03 GB/s, 209.16
-              GB/s, 210.21 GB/s, 209.48 GB/s, 210.03 GB/s, 208.80 GB/s]
-          - [311.43 GB/s, 311.08 GB/s, 311.41 GB/s, 311.10 GB/s, 313.13 GB/s, 314.53
-              GB/s, 311.59 GB/s, 311.80 GB/s, 311.57 GB/s, 311.89 GB/s]
-          - [391.65 GB/s, 392.34 GB/s, 391.84 GB/s, 392.07 GB/s, 391.96 GB/s, 392.73
-              GB/s, 391.66 GB/s, 391.83 GB/s, 392.09 GB/s, 391.88 GB/s]
-          - [504.20 GB/s, 506.77 GB/s, 503.22 GB/s, 506.74 GB/s, 502.78 GB/s, 506.15
-              GB/s, 506.87 GB/s, 502.85 GB/s, 505.82 GB/s, 506.57 GB/s]
-          - [587.75 GB/s, 589.51 GB/s, 588.01 GB/s, 587.29 GB/s, 588.04 GB/s, 587.92
-              GB/s, 588.08 GB/s, 587.94 GB/s, 587.82 GB/s, 587.55 GB/s]
-          - [686.03 GB/s, 685.97 GB/s, 685.01 GB/s, 685.88 GB/s, 685.61 GB/s, 687.12
-              GB/s, 684.97 GB/s, 686.09 GB/s, 685.81 GB/s, 687.28 GB/s]
-          - [782.05 GB/s, 781.73 GB/s, 781.13 GB/s, 781.87 GB/s, 782.17 GB/s, 781.24
-              GB/s, 781.82 GB/s, 781.92 GB/s, 781.90 GB/s, 781.66 GB/s]
-          update:
-          - [84.76 GB/s, 84.76 GB/s, 84.77 GB/s, 84.75 GB/s, 84.75 GB/s, 84.75 GB/s,
-            84.75 GB/s, 84.75 GB/s, 84.74 GB/s, 57.21 GB/s]
-          - [157.73 GB/s, 155.29 GB/s, 147.91 GB/s, 160.10 GB/s, 156.33 GB/s, 158.06
-              GB/s, 159.23 GB/s, 156.16 GB/s, 155.30 GB/s, 159.15 GB/s]
-          - [232.07 GB/s, 230.40 GB/s, 234.05 GB/s, 232.69 GB/s, 215.80 GB/s, 232.76
-              GB/s, 236.01 GB/s, 237.12 GB/s, 234.66 GB/s, 234.86 GB/s]
-          - [303.60 GB/s, 304.21 GB/s, 306.83 GB/s, 309.43 GB/s, 312.69 GB/s, 311.75
-              GB/s, 301.74 GB/s, 307.54 GB/s, 312.74 GB/s, 312.19 GB/s]
-          - [386.45 GB/s, 382.41 GB/s, 387.87 GB/s, 392.54 GB/s, 369.42 GB/s, 341.87
-              GB/s, 352.85 GB/s, 390.87 GB/s, 382.44 GB/s, 383.50 GB/s]
-          - [459.60 GB/s, 384.27 GB/s, 437.39 GB/s, 459.42 GB/s, 465.53 GB/s, 447.31
-              GB/s, 440.00 GB/s, 409.94 GB/s, 412.94 GB/s, 446.74 GB/s]
-          - [489.85 GB/s, 489.35 GB/s, 435.92 GB/s, 492.39 GB/s, 446.44 GB/s, 501.71
-              GB/s, 516.02 GB/s, 478.87 GB/s, 494.52 GB/s, 493.04 GB/s]
-          - [521.08 GB/s, 553.73 GB/s, 541.34 GB/s, 527.75 GB/s, 554.87 GB/s, 536.30
-              GB/s, 540.66 GB/s, 551.02 GB/s, 567.27 GB/s, 565.31 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
-          147.84 kB, 168.96 kB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [80.41 GB/s, 160.83 GB/s, 240.43 GB/s, 320.63 GB/s, 401.66 GB/s, 454.32
-              GB/s, 539.77 GB/s, 628.51 GB/s]
-          daxpy: [95.87 GB/s, 187.75 GB/s, 270.68 GB/s, 371.80 GB/s, 454.05 GB/s,
-            503.46 GB/s, 606.85 GB/s, 689.34 GB/s]
-          load: [82.30 GB/s, 164.06 GB/s, 244.78 GB/s, 326.21 GB/s, 408.56 GB/s, 490.13
-              GB/s, 569.95 GB/s, 651.79 GB/s]
-          triad: [93.22 GB/s, 186.75 GB/s, 288.55 GB/s, 340.91 GB/s, 442.20 GB/s,
-            534.62 GB/s, 597.98 GB/s, 707.54 GB/s]
-          update: [83.25 GB/s, 166.04 GB/s, 248.21 GB/s, 330.58 GB/s, 414.71 GB/s,
-            496.97 GB/s, 578.67 GB/s, 656.56 GB/s]
-        size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
-          21.12 kB, 21.12 kB]
-        size per thread: [10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56
-            kB, 10.56 kB, 10.56 kB]
-        stats:
-          copy:
-          - [80.37 GB/s, 79.07 GB/s, 80.39 GB/s, 80.39 GB/s, 80.41 GB/s, 80.29 GB/s,
-            80.36 GB/s, 79.05 GB/s, 77.87 GB/s, 80.37 GB/s]
-          - [160.76 GB/s, 160.63 GB/s, 160.76 GB/s, 160.71 GB/s, 160.80 GB/s, 160.74
-              GB/s, 160.83 GB/s, 160.69 GB/s, 160.79 GB/s, 160.78 GB/s]
-          - [240.43 GB/s, 240.20 GB/s, 240.36 GB/s, 240.37 GB/s, 237.17 GB/s, 240.39
-              GB/s, 240.14 GB/s, 240.24 GB/s, 240.26 GB/s, 240.10 GB/s]
-          - [320.46 GB/s, 320.47 GB/s, 320.63 GB/s, 320.52 GB/s, 320.40 GB/s, 320.40
-              GB/s, 320.51 GB/s, 320.46 GB/s, 319.72 GB/s, 320.44 GB/s]
-          - [401.40 GB/s, 399.28 GB/s, 401.66 GB/s, 401.53 GB/s, 401.52 GB/s, 401.55
-              GB/s, 401.60 GB/s, 401.47 GB/s, 401.47 GB/s, 401.35 GB/s]
-          - [447.24 GB/s, 453.65 GB/s, 453.54 GB/s, 453.86 GB/s, 453.82 GB/s, 453.62
-              GB/s, 453.48 GB/s, 454.32 GB/s, 453.86 GB/s, 446.79 GB/s]
-          - [538.79 GB/s, 538.47 GB/s, 539.02 GB/s, 538.25 GB/s, 538.72 GB/s, 538.89
-              GB/s, 539.37 GB/s, 539.41 GB/s, 539.77 GB/s, 538.49 GB/s]
-          - [628.14 GB/s, 618.54 GB/s, 628.12 GB/s, 623.90 GB/s, 628.27 GB/s, 623.78
-              GB/s, 618.17 GB/s, 623.43 GB/s, 628.51 GB/s, 628.43 GB/s]
-          daxpy:
-          - [95.77 GB/s, 93.25 GB/s, 92.87 GB/s, 95.87 GB/s, 95.84 GB/s, 95.81 GB/s,
-            95.80 GB/s, 94.99 GB/s, 95.81 GB/s, 95.86 GB/s]
-          - [184.53 GB/s, 186.60 GB/s, 183.99 GB/s, 187.48 GB/s, 187.75 GB/s, 181.53
-              GB/s, 183.82 GB/s, 187.75 GB/s, 184.13 GB/s, 180.61 GB/s]
-          - [258.46 GB/s, 270.13 GB/s, 264.76 GB/s, 262.23 GB/s, 265.05 GB/s, 267.25
-              GB/s, 270.68 GB/s, 268.08 GB/s, 266.20 GB/s, 265.66 GB/s]
-          - [367.99 GB/s, 367.15 GB/s, 361.68 GB/s, 364.86 GB/s, 368.76 GB/s, 363.27
-              GB/s, 364.95 GB/s, 366.97 GB/s, 371.80 GB/s, 366.55 GB/s]
-          - [441.95 GB/s, 442.77 GB/s, 444.97 GB/s, 454.05 GB/s, 441.02 GB/s, 445.96
-              GB/s, 442.49 GB/s, 440.23 GB/s, 449.29 GB/s, 452.66 GB/s]
-          - [501.31 GB/s, 489.91 GB/s, 495.43 GB/s, 503.39 GB/s, 488.03 GB/s, 497.71
-              GB/s, 503.46 GB/s, 496.85 GB/s, 497.38 GB/s, 468.90 GB/s]
-          - [604.57 GB/s, 580.51 GB/s, 587.67 GB/s, 594.32 GB/s, 561.32 GB/s, 588.09
-              GB/s, 606.85 GB/s, 600.91 GB/s, 599.40 GB/s, 598.24 GB/s]
-          - [646.48 GB/s, 655.06 GB/s, 684.70 GB/s, 653.61 GB/s, 671.61 GB/s, 689.34
-              GB/s, 673.74 GB/s, 685.49 GB/s, 681.48 GB/s, 683.23 GB/s]
-          load:
-          - [82.19 GB/s, 82.08 GB/s, 82.22 GB/s, 82.10 GB/s, 82.14 GB/s, 82.17 GB/s,
-            82.22 GB/s, 82.28 GB/s, 82.30 GB/s, 81.98 GB/s]
-          - [163.22 GB/s, 163.43 GB/s, 164.06 GB/s, 164.03 GB/s, 163.19 GB/s, 163.83
-              GB/s, 163.29 GB/s, 163.88 GB/s, 163.83 GB/s, 163.11 GB/s]
-          - [244.32 GB/s, 244.47 GB/s, 244.65 GB/s, 244.29 GB/s, 243.96 GB/s, 244.50
-              GB/s, 244.78 GB/s, 244.52 GB/s, 244.48 GB/s, 244.72 GB/s]
-          - [325.18 GB/s, 326.21 GB/s, 325.49 GB/s, 325.86 GB/s, 325.73 GB/s, 325.72
-              GB/s, 326.00 GB/s, 325.41 GB/s, 325.63 GB/s, 325.82 GB/s]
-          - [407.81 GB/s, 407.96 GB/s, 407.59 GB/s, 408.56 GB/s, 407.64 GB/s, 407.61
-              GB/s, 408.09 GB/s, 407.95 GB/s, 408.30 GB/s, 408.32 GB/s]
-          - [488.65 GB/s, 489.73 GB/s, 489.38 GB/s, 489.81 GB/s, 490.13 GB/s, 489.31
-              GB/s, 488.74 GB/s, 489.38 GB/s, 488.17 GB/s, 489.51 GB/s]
-          - [569.95 GB/s, 567.21 GB/s, 566.08 GB/s, 567.88 GB/s, 567.69 GB/s, 569.58
-              GB/s, 568.61 GB/s, 568.35 GB/s, 569.70 GB/s, 568.87 GB/s]
-          - [650.43 GB/s, 651.58 GB/s, 650.86 GB/s, 651.34 GB/s, 651.04 GB/s, 651.79
-              GB/s, 650.28 GB/s, 650.31 GB/s, 650.81 GB/s, 651.09 GB/s]
-          triad:
-          - [93.22 GB/s, 90.73 GB/s, 92.48 GB/s, 92.53 GB/s, 92.37 GB/s, 92.50 GB/s,
-            92.48 GB/s, 90.28 GB/s, 92.35 GB/s, 92.51 GB/s]
-          - [186.75 GB/s, 184.51 GB/s, 184.17 GB/s, 186.66 GB/s, 186.43 GB/s, 184.59
-              GB/s, 186.71 GB/s, 186.30 GB/s, 186.64 GB/s, 186.12 GB/s]
-          - [287.77 GB/s, 288.55 GB/s, 287.76 GB/s, 287.76 GB/s, 288.19 GB/s, 287.70
-              GB/s, 287.42 GB/s, 288.12 GB/s, 287.66 GB/s, 288.01 GB/s]
-          - [339.82 GB/s, 338.95 GB/s, 340.11 GB/s, 340.11 GB/s, 340.25 GB/s, 340.20
-              GB/s, 339.90 GB/s, 340.22 GB/s, 340.91 GB/s, 340.01 GB/s]
-          - [440.41 GB/s, 440.65 GB/s, 441.59 GB/s, 442.20 GB/s, 441.67 GB/s, 432.59
-              GB/s, 440.20 GB/s, 440.81 GB/s, 440.24 GB/s, 441.38 GB/s]
-          - [534.30 GB/s, 527.60 GB/s, 528.52 GB/s, 509.55 GB/s, 527.68 GB/s, 527.63
-              GB/s, 533.66 GB/s, 534.62 GB/s, 534.60 GB/s, 534.19 GB/s]
-          - [595.90 GB/s, 595.94 GB/s, 597.91 GB/s, 580.22 GB/s, 597.98 GB/s, 597.66
-              GB/s, 596.16 GB/s, 567.03 GB/s, 580.88 GB/s, 578.29 GB/s]
-          - [703.80 GB/s, 705.57 GB/s, 694.84 GB/s, 682.59 GB/s, 694.37 GB/s, 696.56
-              GB/s, 704.50 GB/s, 704.95 GB/s, 694.52 GB/s, 707.54 GB/s]
-          update:
-          - [83.18 GB/s, 83.24 GB/s, 83.25 GB/s, 83.16 GB/s, 83.22 GB/s, 83.23 GB/s,
-            83.22 GB/s, 83.21 GB/s, 83.20 GB/s, 83.17 GB/s]
-          - [165.65 GB/s, 165.76 GB/s, 165.99 GB/s, 166.04 GB/s, 165.49 GB/s, 165.87
-              GB/s, 165.58 GB/s, 165.96 GB/s, 165.67 GB/s, 165.66 GB/s]
-          - [247.30 GB/s, 248.14 GB/s, 247.84 GB/s, 247.90 GB/s, 247.77 GB/s, 247.60
-              GB/s, 248.21 GB/s, 247.95 GB/s, 248.05 GB/s, 247.83 GB/s]
-          - [330.49 GB/s, 330.07 GB/s, 329.91 GB/s, 329.90 GB/s, 330.58 GB/s, 329.30
-              GB/s, 329.92 GB/s, 330.03 GB/s, 330.04 GB/s, 330.12 GB/s]
-          - [413.89 GB/s, 414.04 GB/s, 413.56 GB/s, 414.06 GB/s, 414.15 GB/s, 413.94
-              GB/s, 414.04 GB/s, 414.71 GB/s, 414.32 GB/s, 413.93 GB/s]
-          - [496.97 GB/s, 496.80 GB/s, 496.17 GB/s, 495.42 GB/s, 496.17 GB/s, 496.66
-              GB/s, 495.55 GB/s, 496.27 GB/s, 495.52 GB/s, 496.80 GB/s]
-          - [564.44 GB/s, 577.86 GB/s, 574.38 GB/s, 571.96 GB/s, 564.76 GB/s, 578.67
-              GB/s, 565.89 GB/s, 572.49 GB/s, 571.80 GB/s, 572.01 GB/s]
-          - [647.68 GB/s, 656.56 GB/s, 655.56 GB/s, 644.04 GB/s, 655.30 GB/s, 648.80
-              GB/s, 654.77 GB/s, 653.58 GB/s, 656.27 GB/s, 653.79 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
-          147.84 kB, 168.96 kB]
-    L2:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [36.74 GB/s, 73.65 GB/s, 107.11 GB/s, 141.43 GB/s, 179.70 GB/s, 215.63
-              GB/s, 247.20 GB/s, 282.42 GB/s]
-          daxpy: [44.59 GB/s, 88.24 GB/s, 132.21 GB/s, 175.78 GB/s, 219.11 GB/s, 259.95
-              GB/s, 305.84 GB/s, 346.83 GB/s]
-          load: [31.46 GB/s, 62.97 GB/s, 93.73 GB/s, 125.46 GB/s, 157.32 GB/s, 183.63
-              GB/s, 214.02 GB/s, 245.17 GB/s]
-          triad: [37.79 GB/s, 75.08 GB/s, 111.43 GB/s, 148.90 GB/s, 185.54 GB/s, 223.72
-              GB/s, 258.53 GB/s, 299.32 GB/s]
-          update: [48.46 GB/s, 96.10 GB/s, 141.97 GB/s, 189.18 GB/s, 234.73 GB/s,
-            280.47 GB/s, 330.94 GB/s, 365.43 GB/s]
-        size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
-            kB, 168.96 kB, 168.96 kB]
-        size per thread: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
-            kB, 168.96 kB, 168.96 kB]
-        stats:
-          copy:
-          - [36.38 GB/s, 36.59 GB/s, 36.18 GB/s, 36.57 GB/s, 36.26 GB/s, 34.61 GB/s,
-            35.96 GB/s, 35.84 GB/s, 36.74 GB/s, 36.53 GB/s]
-          - [68.97 GB/s, 70.42 GB/s, 69.88 GB/s, 71.40 GB/s, 69.05 GB/s, 72.46 GB/s,
-            70.32 GB/s, 73.65 GB/s, 72.14 GB/s, 69.81 GB/s]
-          - [107.08 GB/s, 103.53 GB/s, 107.11 GB/s, 103.66 GB/s, 103.88 GB/s, 106.48
-              GB/s, 97.32 GB/s, 105.92 GB/s, 104.16 GB/s, 104.84 GB/s]
-          - [138.97 GB/s, 136.86 GB/s, 140.88 GB/s, 138.96 GB/s, 140.58 GB/s, 138.51
-              GB/s, 141.43 GB/s, 139.53 GB/s, 141.20 GB/s, 139.43 GB/s]
-          - [158.20 GB/s, 171.06 GB/s, 179.70 GB/s, 171.43 GB/s, 174.27 GB/s, 175.01
-              GB/s, 165.20 GB/s, 170.89 GB/s, 173.01 GB/s, 175.17 GB/s]
-          - [209.74 GB/s, 204.59 GB/s, 215.27 GB/s, 215.63 GB/s, 210.59 GB/s, 206.94
-              GB/s, 211.03 GB/s, 201.61 GB/s, 214.45 GB/s, 208.15 GB/s]
-          - [241.38 GB/s, 246.88 GB/s, 246.90 GB/s, 247.20 GB/s, 235.27 GB/s, 227.39
-              GB/s, 239.48 GB/s, 244.45 GB/s, 246.68 GB/s, 235.87 GB/s]
-          - [271.07 GB/s, 282.42 GB/s, 282.38 GB/s, 276.20 GB/s, 269.85 GB/s, 276.96
-              GB/s, 268.64 GB/s, 269.61 GB/s, 279.68 GB/s, 280.63 GB/s]
-          daxpy:
-          - [44.54 GB/s, 44.59 GB/s, 44.50 GB/s, 44.42 GB/s, 44.41 GB/s, 44.06 GB/s,
-            43.39 GB/s, 44.02 GB/s, 44.34 GB/s, 44.28 GB/s]
-          - [85.35 GB/s, 87.05 GB/s, 86.47 GB/s, 86.90 GB/s, 86.92 GB/s, 88.24 GB/s,
-            87.39 GB/s, 87.60 GB/s, 87.55 GB/s, 84.19 GB/s]
-          - [129.21 GB/s, 130.47 GB/s, 123.29 GB/s, 127.92 GB/s, 132.21 GB/s, 128.37
-              GB/s, 127.09 GB/s, 128.72 GB/s, 129.34 GB/s, 128.69 GB/s]
-          - [171.53 GB/s, 169.64 GB/s, 173.92 GB/s, 173.74 GB/s, 168.53 GB/s, 171.54
-              GB/s, 173.96 GB/s, 175.78 GB/s, 171.29 GB/s, 171.33 GB/s]
-          - [219.11 GB/s, 208.86 GB/s, 211.66 GB/s, 216.47 GB/s, 212.73 GB/s, 204.90
-              GB/s, 208.87 GB/s, 215.75 GB/s, 213.61 GB/s, 214.56 GB/s]
-          - [250.69 GB/s, 241.36 GB/s, 255.22 GB/s, 250.29 GB/s, 253.80 GB/s, 256.34
-              GB/s, 254.38 GB/s, 259.95 GB/s, 245.69 GB/s, 259.12 GB/s]
-          - [296.08 GB/s, 301.77 GB/s, 297.40 GB/s, 305.84 GB/s, 288.62 GB/s, 283.76
-              GB/s, 293.61 GB/s, 291.93 GB/s, 299.74 GB/s, 289.76 GB/s]
-          - [344.46 GB/s, 334.36 GB/s, 339.31 GB/s, 330.88 GB/s, 343.26 GB/s, 327.28
-              GB/s, 344.53 GB/s, 346.83 GB/s, 344.29 GB/s, 346.28 GB/s]
-          load:
-          - [31.40 GB/s, 31.23 GB/s, 31.29 GB/s, 31.24 GB/s, 31.46 GB/s, 31.20 GB/s,
-            31.33 GB/s, 30.01 GB/s, 30.08 GB/s, 31.40 GB/s]
-          - [61.20 GB/s, 60.74 GB/s, 61.93 GB/s, 61.22 GB/s, 61.20 GB/s, 60.03 GB/s,
-            59.33 GB/s, 59.94 GB/s, 58.54 GB/s, 62.97 GB/s]
-          - [91.53 GB/s, 93.73 GB/s, 93.05 GB/s, 90.07 GB/s, 91.60 GB/s, 90.11 GB/s,
-            90.21 GB/s, 90.43 GB/s, 89.15 GB/s, 93.10 GB/s]
-          - [122.80 GB/s, 116.57 GB/s, 120.68 GB/s, 122.54 GB/s, 122.75 GB/s, 121.79
-              GB/s, 125.30 GB/s, 125.46 GB/s, 122.28 GB/s, 124.51 GB/s]
-          - [151.01 GB/s, 151.10 GB/s, 148.68 GB/s, 151.17 GB/s, 147.24 GB/s, 153.65
-              GB/s, 146.48 GB/s, 150.48 GB/s, 150.74 GB/s, 157.32 GB/s]
-          - [181.52 GB/s, 173.89 GB/s, 181.58 GB/s, 174.01 GB/s, 176.40 GB/s, 179.73
-              GB/s, 174.06 GB/s, 181.26 GB/s, 180.57 GB/s, 183.63 GB/s]
-          - [214.02 GB/s, 205.69 GB/s, 207.64 GB/s, 204.18 GB/s, 208.42 GB/s, 211.39
-              GB/s, 206.58 GB/s, 204.90 GB/s, 204.75 GB/s, 208.91 GB/s]
-          - [232.16 GB/s, 233.90 GB/s, 241.32 GB/s, 237.45 GB/s, 235.41 GB/s, 241.17
-              GB/s, 237.52 GB/s, 245.17 GB/s, 241.17 GB/s, 234.08 GB/s]
-          triad:
-          - [37.62 GB/s, 37.54 GB/s, 37.79 GB/s, 37.67 GB/s, 37.76 GB/s, 37.77 GB/s,
-            37.68 GB/s, 35.83 GB/s, 37.06 GB/s, 37.50 GB/s]
-          - [72.79 GB/s, 74.76 GB/s, 73.15 GB/s, 74.68 GB/s, 73.88 GB/s, 73.27 GB/s,
-            75.08 GB/s, 73.48 GB/s, 71.27 GB/s, 72.05 GB/s]
-          - [106.26 GB/s, 105.22 GB/s, 109.70 GB/s, 109.07 GB/s, 110.84 GB/s, 111.43
-              GB/s, 106.32 GB/s, 109.73 GB/s, 106.22 GB/s, 107.20 GB/s]
-          - [142.10 GB/s, 148.90 GB/s, 148.11 GB/s, 144.38 GB/s, 144.77 GB/s, 145.42
-              GB/s, 147.36 GB/s, 142.94 GB/s, 145.39 GB/s, 139.42 GB/s]
-          - [182.07 GB/s, 176.75 GB/s, 181.39 GB/s, 183.31 GB/s, 181.87 GB/s, 183.71
-              GB/s, 180.48 GB/s, 178.11 GB/s, 181.36 GB/s, 185.54 GB/s]
-          - [219.85 GB/s, 217.02 GB/s, 218.86 GB/s, 217.09 GB/s, 212.24 GB/s, 212.22
-              GB/s, 219.33 GB/s, 208.81 GB/s, 215.84 GB/s, 223.72 GB/s]
-          - [258.06 GB/s, 232.27 GB/s, 247.04 GB/s, 240.55 GB/s, 236.11 GB/s, 251.88
-              GB/s, 258.53 GB/s, 247.32 GB/s, 251.53 GB/s, 245.10 GB/s]
-          - [273.67 GB/s, 292.81 GB/s, 288.67 GB/s, 289.75 GB/s, 293.98 GB/s, 283.56
-              GB/s, 295.33 GB/s, 280.11 GB/s, 299.32 GB/s, 285.18 GB/s]
-          update:
-          - [47.30 GB/s, 48.33 GB/s, 48.17 GB/s, 47.38 GB/s, 48.16 GB/s, 46.99 GB/s,
-            48.46 GB/s, 47.51 GB/s, 46.20 GB/s, 48.26 GB/s]
-          - [92.10 GB/s, 92.30 GB/s, 95.73 GB/s, 95.53 GB/s, 86.95 GB/s, 96.10 GB/s,
-            94.16 GB/s, 89.72 GB/s, 92.00 GB/s, 93.10 GB/s]
-          - [137.06 GB/s, 140.40 GB/s, 136.20 GB/s, 139.57 GB/s, 140.69 GB/s, 136.20
-              GB/s, 141.53 GB/s, 129.76 GB/s, 136.47 GB/s, 141.97 GB/s]
-          - [184.84 GB/s, 177.96 GB/s, 178.61 GB/s, 179.03 GB/s, 176.59 GB/s, 180.62
-              GB/s, 182.26 GB/s, 182.27 GB/s, 189.18 GB/s, 185.49 GB/s]
-          - [232.17 GB/s, 217.86 GB/s, 232.40 GB/s, 223.10 GB/s, 228.52 GB/s, 234.73
-              GB/s, 232.00 GB/s, 233.14 GB/s, 231.69 GB/s, 225.01 GB/s]
-          - [276.16 GB/s, 274.80 GB/s, 272.58 GB/s, 272.43 GB/s, 280.47 GB/s, 276.90
-              GB/s, 264.76 GB/s, 272.47 GB/s, 277.77 GB/s, 271.42 GB/s]
-          - [330.94 GB/s, 312.06 GB/s, 312.83 GB/s, 312.62 GB/s, 292.44 GB/s, 315.68
-              GB/s, 316.67 GB/s, 321.25 GB/s, 321.71 GB/s, 315.05 GB/s]
-          - [362.85 GB/s, 356.49 GB/s, 365.43 GB/s, 332.52 GB/s, 354.30 GB/s, 354.68
-              GB/s, 335.54 GB/s, 358.54 GB/s, 363.22 GB/s, 360.01 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
-          1.18 MB, 1.35 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [36.83 GB/s, 72.70 GB/s, 108.11 GB/s, 142.21 GB/s, 178.07 GB/s, 213.30
-              GB/s, 251.98 GB/s, 283.06 GB/s]
-          daxpy: [45.34 GB/s, 90.11 GB/s, 134.85 GB/s, 180.06 GB/s, 224.22 GB/s, 268.27
-              GB/s, 312.15 GB/s, 358.38 GB/s]
-          load: [33.99 GB/s, 67.65 GB/s, 100.93 GB/s, 134.81 GB/s, 165.89 GB/s, 196.09
-              GB/s, 233.31 GB/s, 262.05 GB/s]
-          triad: [38.60 GB/s, 76.58 GB/s, 114.50 GB/s, 150.54 GB/s, 189.60 GB/s, 227.05
-              GB/s, 263.75 GB/s, 301.02 GB/s]
-          update: [49.25 GB/s, 97.34 GB/s, 146.81 GB/s, 194.71 GB/s, 239.97 GB/s,
-            287.14 GB/s, 330.84 GB/s, 384.71 GB/s]
-        size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
-            kB, 168.96 kB, 168.96 kB]
-        size per thread: [84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48
-            kB, 84.48 kB, 84.48 kB]
-        stats:
-          copy:
-          - [36.83 GB/s, 36.67 GB/s, 34.90 GB/s, 36.44 GB/s, 35.13 GB/s, 35.07 GB/s,
-            35.53 GB/s, 36.15 GB/s, 35.85 GB/s, 36.23 GB/s]
-          - [71.52 GB/s, 70.16 GB/s, 70.67 GB/s, 71.20 GB/s, 72.70 GB/s, 70.14 GB/s,
-            70.53 GB/s, 69.17 GB/s, 71.57 GB/s, 70.22 GB/s]
-          - [104.39 GB/s, 104.74 GB/s, 103.12 GB/s, 108.11 GB/s, 105.30 GB/s, 102.80
-              GB/s, 102.90 GB/s, 107.06 GB/s, 103.45 GB/s, 105.45 GB/s]
-          - [139.02 GB/s, 134.63 GB/s, 140.72 GB/s, 141.32 GB/s, 140.35 GB/s, 141.19
-              GB/s, 135.44 GB/s, 142.21 GB/s, 140.96 GB/s, 142.05 GB/s]
-          - [177.86 GB/s, 177.74 GB/s, 177.42 GB/s, 175.35 GB/s, 176.42 GB/s, 173.13
-              GB/s, 174.32 GB/s, 170.24 GB/s, 178.07 GB/s, 177.88 GB/s]
-          - [206.27 GB/s, 211.63 GB/s, 209.06 GB/s, 210.54 GB/s, 208.80 GB/s, 209.99
-              GB/s, 208.77 GB/s, 206.41 GB/s, 213.30 GB/s, 206.39 GB/s]
-          - [240.18 GB/s, 238.36 GB/s, 244.16 GB/s, 236.26 GB/s, 244.12 GB/s, 238.49
-              GB/s, 242.23 GB/s, 244.46 GB/s, 251.98 GB/s, 242.55 GB/s]
-          - [279.77 GB/s, 282.91 GB/s, 278.73 GB/s, 276.91 GB/s, 283.06 GB/s, 273.23
-              GB/s, 278.33 GB/s, 280.88 GB/s, 277.54 GB/s, 281.83 GB/s]
-          daxpy:
-          - [45.32 GB/s, 44.62 GB/s, 45.29 GB/s, 45.18 GB/s, 45.17 GB/s, 45.07 GB/s,
-            44.69 GB/s, 45.17 GB/s, 45.11 GB/s, 45.34 GB/s]
-          - [89.94 GB/s, 89.97 GB/s, 89.37 GB/s, 89.90 GB/s, 88.37 GB/s, 89.13 GB/s,
-            90.11 GB/s, 89.67 GB/s, 89.90 GB/s, 89.93 GB/s]
-          - [134.83 GB/s, 134.85 GB/s, 132.02 GB/s, 134.33 GB/s, 133.82 GB/s, 132.39
-              GB/s, 131.67 GB/s, 134.62 GB/s, 132.71 GB/s, 131.67 GB/s]
-          - [175.52 GB/s, 173.36 GB/s, 176.83 GB/s, 177.98 GB/s, 175.73 GB/s, 173.42
-              GB/s, 180.06 GB/s, 179.55 GB/s, 176.71 GB/s, 175.85 GB/s]
-          - [222.00 GB/s, 216.86 GB/s, 220.17 GB/s, 218.14 GB/s, 220.60 GB/s, 219.43
-              GB/s, 220.58 GB/s, 224.22 GB/s, 220.89 GB/s, 222.28 GB/s]
-          - [258.75 GB/s, 262.88 GB/s, 261.77 GB/s, 268.27 GB/s, 263.66 GB/s, 262.59
-              GB/s, 266.54 GB/s, 261.67 GB/s, 262.80 GB/s, 263.72 GB/s]
-          - [298.65 GB/s, 312.15 GB/s, 308.52 GB/s, 304.22 GB/s, 301.87 GB/s, 305.53
-              GB/s, 309.84 GB/s, 310.67 GB/s, 310.49 GB/s, 311.99 GB/s]
-          - [347.55 GB/s, 350.67 GB/s, 348.93 GB/s, 358.38 GB/s, 352.35 GB/s, 352.05
-              GB/s, 353.82 GB/s, 356.00 GB/s, 348.07 GB/s, 349.87 GB/s]
-          load:
-          - [33.99 GB/s, 32.54 GB/s, 32.94 GB/s, 33.17 GB/s, 33.83 GB/s, 31.55 GB/s,
-            31.91 GB/s, 33.86 GB/s, 33.93 GB/s, 33.75 GB/s]
-          - [66.22 GB/s, 64.94 GB/s, 67.64 GB/s, 67.52 GB/s, 65.01 GB/s, 67.21 GB/s,
-            66.07 GB/s, 66.43 GB/s, 67.65 GB/s, 64.84 GB/s]
-          - [98.58 GB/s, 97.97 GB/s, 98.39 GB/s, 98.50 GB/s, 98.77 GB/s, 97.84 GB/s,
-            99.58 GB/s, 100.93 GB/s, 100.50 GB/s, 99.94 GB/s]
-          - [130.23 GB/s, 131.10 GB/s, 131.04 GB/s, 127.83 GB/s, 134.81 GB/s, 132.68
-              GB/s, 131.80 GB/s, 129.42 GB/s, 130.76 GB/s, 126.96 GB/s]
-          - [164.90 GB/s, 165.18 GB/s, 161.19 GB/s, 164.33 GB/s, 162.76 GB/s, 165.04
-              GB/s, 162.20 GB/s, 165.89 GB/s, 164.34 GB/s, 159.66 GB/s]
-          - [192.69 GB/s, 193.33 GB/s, 188.88 GB/s, 190.70 GB/s, 194.60 GB/s, 190.92
-              GB/s, 191.36 GB/s, 192.89 GB/s, 191.85 GB/s, 196.09 GB/s]
-          - [227.70 GB/s, 223.95 GB/s, 222.79 GB/s, 227.09 GB/s, 227.04 GB/s, 229.45
-              GB/s, 228.09 GB/s, 227.83 GB/s, 233.31 GB/s, 227.49 GB/s]
-          - [257.94 GB/s, 261.47 GB/s, 262.05 GB/s, 257.70 GB/s, 259.70 GB/s, 259.23
-              GB/s, 261.09 GB/s, 253.81 GB/s, 254.21 GB/s, 259.34 GB/s]
-          triad:
-          - [38.60 GB/s, 36.68 GB/s, 38.07 GB/s, 38.10 GB/s, 37.89 GB/s, 36.48 GB/s,
-            38.33 GB/s, 38.12 GB/s, 37.43 GB/s, 37.87 GB/s]
-          - [76.58 GB/s, 74.97 GB/s, 75.74 GB/s, 76.02 GB/s, 72.66 GB/s, 74.73 GB/s,
-            76.37 GB/s, 76.18 GB/s, 74.59 GB/s, 75.75 GB/s]
-          - [111.71 GB/s, 114.50 GB/s, 108.96 GB/s, 111.49 GB/s, 111.56 GB/s, 111.66
-              GB/s, 113.43 GB/s, 114.37 GB/s, 111.67 GB/s, 108.14 GB/s]
-          - [146.29 GB/s, 147.84 GB/s, 149.09 GB/s, 149.93 GB/s, 150.54 GB/s, 145.50
-              GB/s, 145.16 GB/s, 149.47 GB/s, 146.30 GB/s, 149.32 GB/s]
-          - [186.73 GB/s, 186.46 GB/s, 180.47 GB/s, 187.32 GB/s, 184.34 GB/s, 187.34
-              GB/s, 186.55 GB/s, 183.81 GB/s, 189.60 GB/s, 188.70 GB/s]
-          - [224.81 GB/s, 219.69 GB/s, 227.05 GB/s, 224.25 GB/s, 223.36 GB/s, 225.86
-              GB/s, 216.09 GB/s, 221.98 GB/s, 218.47 GB/s, 226.37 GB/s]
-          - [263.29 GB/s, 259.28 GB/s, 258.81 GB/s, 258.77 GB/s, 256.56 GB/s, 256.49
-              GB/s, 256.39 GB/s, 263.75 GB/s, 262.00 GB/s, 261.48 GB/s]
-          - [299.28 GB/s, 292.80 GB/s, 293.63 GB/s, 297.93 GB/s, 293.02 GB/s, 295.95
-              GB/s, 287.92 GB/s, 301.02 GB/s, 300.76 GB/s, 297.01 GB/s]
-          update:
-          - [49.07 GB/s, 47.17 GB/s, 47.56 GB/s, 49.25 GB/s, 46.44 GB/s, 49.04 GB/s,
-            48.91 GB/s, 49.20 GB/s, 48.30 GB/s, 48.85 GB/s]
-          - [96.45 GB/s, 97.11 GB/s, 94.03 GB/s, 92.56 GB/s, 95.39 GB/s, 97.34 GB/s,
-            96.06 GB/s, 92.25 GB/s, 95.53 GB/s, 97.08 GB/s]
-          - [137.54 GB/s, 135.13 GB/s, 145.80 GB/s, 141.29 GB/s, 138.99 GB/s, 143.44
-              GB/s, 146.81 GB/s, 142.94 GB/s, 133.84 GB/s, 146.33 GB/s]
-          - [190.64 GB/s, 185.02 GB/s, 194.24 GB/s, 187.48 GB/s, 194.52 GB/s, 188.51
-              GB/s, 189.17 GB/s, 194.71 GB/s, 194.37 GB/s, 190.83 GB/s]
-          - [239.97 GB/s, 219.74 GB/s, 233.72 GB/s, 234.38 GB/s, 235.78 GB/s, 235.11
-              GB/s, 235.62 GB/s, 226.09 GB/s, 235.93 GB/s, 230.51 GB/s]
-          - [280.16 GB/s, 275.22 GB/s, 260.15 GB/s, 286.01 GB/s, 280.61 GB/s, 287.14
-              GB/s, 283.75 GB/s, 275.23 GB/s, 283.71 GB/s, 285.38 GB/s]
-          - [311.15 GB/s, 318.00 GB/s, 325.21 GB/s, 328.34 GB/s, 318.09 GB/s, 328.66
-              GB/s, 329.69 GB/s, 316.97 GB/s, 328.51 GB/s, 330.84 GB/s]
-          - [374.41 GB/s, 369.73 GB/s, 358.15 GB/s, 375.54 GB/s, 384.71 GB/s, 357.66
-              GB/s, 369.71 GB/s, 375.35 GB/s, 370.25 GB/s, 364.01 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
-          1.18 MB, 1.35 MB]
-    L3:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [21.93 GB/s, 43.10 GB/s, 65.38 GB/s, 85.69 GB/s, 105.64 GB/s, 127.34
-              GB/s, 148.22 GB/s, 171.52 GB/s]
-          daxpy: [30.98 GB/s, 62.27 GB/s, 93.13 GB/s, 123.27 GB/s, 153.64 GB/s, 185.97
-              GB/s, 216.67 GB/s, 247.41 GB/s]
-          load: [23.47 GB/s, 46.84 GB/s, 69.74 GB/s, 92.76 GB/s, 115.37 GB/s, 139.23
-              GB/s, 163.12 GB/s, 186.65 GB/s]
-          triad: [24.72 GB/s, 49.11 GB/s, 72.42 GB/s, 95.36 GB/s, 119.46 GB/s, 144.60
-              GB/s, 168.66 GB/s, 189.45 GB/s]
-          update: [31.39 GB/s, 62.11 GB/s, 91.95 GB/s, 122.24 GB/s, 151.40 GB/s, 182.28
-              GB/s, 216.07 GB/s, 239.92 GB/s]
-        size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
-            MB, 1.65 MB]
-        size per thread: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
-            MB, 1.65 MB]
-        stats:
-          copy:
-          - [21.64 GB/s, 20.85 GB/s, 20.56 GB/s, 21.69 GB/s, 21.06 GB/s, 21.46 GB/s,
-            21.93 GB/s, 21.73 GB/s, 21.83 GB/s, 21.69 GB/s]
-          - [42.86 GB/s, 42.70 GB/s, 42.72 GB/s, 38.47 GB/s, 42.82 GB/s, 43.10 GB/s,
-            42.66 GB/s, 42.44 GB/s, 42.61 GB/s, 42.48 GB/s]
-          - [64.95 GB/s, 64.34 GB/s, 63.93 GB/s, 65.38 GB/s, 64.36 GB/s, 63.60 GB/s,
-            62.65 GB/s, 63.66 GB/s, 63.51 GB/s, 63.75 GB/s]
-          - [84.07 GB/s, 83.97 GB/s, 83.34 GB/s, 83.91 GB/s, 81.16 GB/s, 85.69 GB/s,
-            85.40 GB/s, 85.37 GB/s, 85.42 GB/s, 84.48 GB/s]
-          - [102.83 GB/s, 104.24 GB/s, 105.42 GB/s, 103.68 GB/s, 105.22 GB/s, 105.64
-              GB/s, 103.15 GB/s, 102.02 GB/s, 100.60 GB/s, 105.09 GB/s]
-          - [125.46 GB/s, 122.23 GB/s, 123.56 GB/s, 124.59 GB/s, 127.03 GB/s, 125.39
-              GB/s, 124.50 GB/s, 127.02 GB/s, 126.95 GB/s, 127.34 GB/s]
-          - [147.99 GB/s, 146.65 GB/s, 139.23 GB/s, 147.69 GB/s, 146.42 GB/s, 145.65
-              GB/s, 148.22 GB/s, 143.77 GB/s, 147.96 GB/s, 147.70 GB/s]
-          - [168.36 GB/s, 168.24 GB/s, 164.99 GB/s, 165.32 GB/s, 167.08 GB/s, 165.98
-              GB/s, 165.39 GB/s, 165.84 GB/s, 166.15 GB/s, 171.52 GB/s]
-          daxpy:
-          - [30.92 GB/s, 30.74 GB/s, 30.87 GB/s, 30.98 GB/s, 30.45 GB/s, 29.62 GB/s,
-            29.54 GB/s, 30.04 GB/s, 30.94 GB/s, 30.93 GB/s]
-          - [61.96 GB/s, 61.38 GB/s, 61.27 GB/s, 62.27 GB/s, 61.36 GB/s, 61.27 GB/s,
-            62.06 GB/s, 60.01 GB/s, 61.49 GB/s, 62.16 GB/s]
-          - [92.26 GB/s, 93.06 GB/s, 88.45 GB/s, 92.18 GB/s, 93.13 GB/s, 92.11 GB/s,
-            92.28 GB/s, 92.28 GB/s, 93.03 GB/s, 92.78 GB/s]
-          - [123.22 GB/s, 123.06 GB/s, 123.27 GB/s, 119.42 GB/s, 122.94 GB/s, 122.54
-              GB/s, 123.24 GB/s, 115.90 GB/s, 121.65 GB/s, 122.47 GB/s]
-          - [151.70 GB/s, 145.65 GB/s, 149.53 GB/s, 152.52 GB/s, 153.64 GB/s, 152.93
-              GB/s, 152.81 GB/s, 153.01 GB/s, 153.04 GB/s, 152.06 GB/s]
-          - [184.04 GB/s, 171.51 GB/s, 184.83 GB/s, 184.09 GB/s, 185.97 GB/s, 183.75
-              GB/s, 184.66 GB/s, 182.54 GB/s, 184.39 GB/s, 184.40 GB/s]
-          - [198.70 GB/s, 216.51 GB/s, 216.17 GB/s, 203.10 GB/s, 211.40 GB/s, 215.04
-              GB/s, 215.48 GB/s, 216.03 GB/s, 216.24 GB/s, 216.67 GB/s]
-          - [246.02 GB/s, 247.35 GB/s, 245.00 GB/s, 244.65 GB/s, 229.12 GB/s, 243.37
-              GB/s, 247.22 GB/s, 247.41 GB/s, 246.03 GB/s, 244.83 GB/s]
-          load:
-          - [23.08 GB/s, 23.38 GB/s, 22.88 GB/s, 23.43 GB/s, 23.05 GB/s, 23.23 GB/s,
-            22.97 GB/s, 22.39 GB/s, 23.47 GB/s, 23.33 GB/s]
-          - [46.39 GB/s, 46.40 GB/s, 46.45 GB/s, 46.36 GB/s, 46.69 GB/s, 46.62 GB/s,
-            46.84 GB/s, 45.98 GB/s, 46.73 GB/s, 46.80 GB/s]
-          - [69.18 GB/s, 68.61 GB/s, 69.74 GB/s, 69.34 GB/s, 68.39 GB/s, 69.73 GB/s,
-            67.76 GB/s, 69.65 GB/s, 69.70 GB/s, 69.16 GB/s]
-          - [92.29 GB/s, 91.67 GB/s, 92.76 GB/s, 90.78 GB/s, 92.76 GB/s, 90.76 GB/s,
-            91.58 GB/s, 91.60 GB/s, 91.03 GB/s, 92.72 GB/s]
-          - [114.04 GB/s, 113.82 GB/s, 112.26 GB/s, 112.65 GB/s, 114.09 GB/s, 113.81
-              GB/s, 113.72 GB/s, 114.70 GB/s, 115.37 GB/s, 112.57 GB/s]
-          - [136.42 GB/s, 135.83 GB/s, 134.93 GB/s, 135.43 GB/s, 135.94 GB/s, 139.23
-              GB/s, 137.52 GB/s, 137.59 GB/s, 135.97 GB/s, 136.96 GB/s]
-          - [157.88 GB/s, 163.12 GB/s, 159.53 GB/s, 160.16 GB/s, 162.18 GB/s, 159.58
-              GB/s, 161.55 GB/s, 159.81 GB/s, 162.97 GB/s, 163.10 GB/s]
-          - [183.41 GB/s, 181.86 GB/s, 183.55 GB/s, 183.38 GB/s, 181.66 GB/s, 186.65
-              GB/s, 179.62 GB/s, 174.70 GB/s, 180.10 GB/s, 181.49 GB/s]
-          triad:
-          - [24.72 GB/s, 23.66 GB/s, 23.58 GB/s, 23.75 GB/s, 23.62 GB/s, 24.37 GB/s,
-            24.44 GB/s, 23.57 GB/s, 23.30 GB/s, 23.57 GB/s]
-          - [49.11 GB/s, 46.87 GB/s, 47.13 GB/s, 46.83 GB/s, 46.58 GB/s, 46.73 GB/s,
-            46.32 GB/s, 47.22 GB/s, 46.79 GB/s, 48.73 GB/s]
-          - [72.29 GB/s, 69.87 GB/s, 70.57 GB/s, 68.89 GB/s, 68.56 GB/s, 69.02 GB/s,
-            72.42 GB/s, 69.37 GB/s, 72.34 GB/s, 69.44 GB/s]
-          - [94.95 GB/s, 94.67 GB/s, 91.05 GB/s, 90.46 GB/s, 95.36 GB/s, 91.63 GB/s,
-            94.06 GB/s, 95.30 GB/s, 93.99 GB/s, 94.71 GB/s]
-          - [119.32 GB/s, 117.99 GB/s, 119.46 GB/s, 117.28 GB/s, 118.97 GB/s, 115.67
-              GB/s, 116.64 GB/s, 117.99 GB/s, 119.02 GB/s, 117.75 GB/s]
-          - [138.63 GB/s, 144.53 GB/s, 144.60 GB/s, 135.72 GB/s, 141.86 GB/s, 139.64
-              GB/s, 142.95 GB/s, 140.89 GB/s, 142.10 GB/s, 143.97 GB/s]
-          - [168.66 GB/s, 166.77 GB/s, 157.10 GB/s, 164.75 GB/s, 164.00 GB/s, 164.38
-              GB/s, 163.94 GB/s, 158.58 GB/s, 165.60 GB/s, 164.39 GB/s]
-          - [184.53 GB/s, 187.00 GB/s, 186.87 GB/s, 179.43 GB/s, 185.70 GB/s, 187.49
-              GB/s, 189.45 GB/s, 186.82 GB/s, 188.50 GB/s, 185.96 GB/s]
-          update:
-          - [30.60 GB/s, 31.20 GB/s, 30.65 GB/s, 31.39 GB/s, 30.89 GB/s, 30.75 GB/s,
-            30.58 GB/s, 30.99 GB/s, 30.69 GB/s, 31.34 GB/s]
-          - [60.99 GB/s, 62.11 GB/s, 61.42 GB/s, 61.55 GB/s, 61.79 GB/s, 61.24 GB/s,
-            61.37 GB/s, 61.74 GB/s, 61.45 GB/s, 61.58 GB/s]
-          - [91.11 GB/s, 91.21 GB/s, 91.95 GB/s, 91.19 GB/s, 91.14 GB/s, 91.36 GB/s,
-            91.30 GB/s, 91.70 GB/s, 90.84 GB/s, 91.09 GB/s]
-          - [120.90 GB/s, 120.49 GB/s, 121.35 GB/s, 122.24 GB/s, 120.37 GB/s, 119.83
-              GB/s, 119.32 GB/s, 119.48 GB/s, 119.11 GB/s, 119.76 GB/s]
-          - [146.72 GB/s, 147.18 GB/s, 147.81 GB/s, 151.40 GB/s, 147.81 GB/s, 146.84
-              GB/s, 147.51 GB/s, 148.15 GB/s, 146.89 GB/s, 148.41 GB/s]
-          - [179.93 GB/s, 179.68 GB/s, 182.28 GB/s, 179.65 GB/s, 179.06 GB/s, 182.25
-              GB/s, 182.03 GB/s, 179.10 GB/s, 178.82 GB/s, 177.84 GB/s]
-          - [208.84 GB/s, 210.17 GB/s, 210.20 GB/s, 210.81 GB/s, 209.88 GB/s, 211.16
-              GB/s, 216.07 GB/s, 211.77 GB/s, 208.89 GB/s, 210.47 GB/s]
-          - [236.56 GB/s, 239.05 GB/s, 237.81 GB/s, 237.20 GB/s, 238.68 GB/s, 237.69
-              GB/s, 239.05 GB/s, 239.38 GB/s, 239.92 GB/s, 238.63 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
-            MB, 13.20 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [23.35 GB/s, 45.63 GB/s, 68.10 GB/s, 89.46 GB/s, 111.10 GB/s, 134.20
-              GB/s, 154.44 GB/s, 174.89 GB/s]
-          daxpy: [32.32 GB/s, 64.16 GB/s, 96.12 GB/s, 126.75 GB/s, 156.91 GB/s, 188.57
-              GB/s, 221.57 GB/s, 251.65 GB/s]
-          load: [25.14 GB/s, 50.38 GB/s, 75.49 GB/s, 101.06 GB/s, 126.04 GB/s, 151.12
-              GB/s, 172.57 GB/s, 196.91 GB/s]
-          triad: [25.15 GB/s, 50.37 GB/s, 75.31 GB/s, 99.12 GB/s, 123.25 GB/s, 150.29
-              GB/s, 171.60 GB/s, 197.81 GB/s]
-          update: [32.98 GB/s, 65.60 GB/s, 97.60 GB/s, 130.34 GB/s, 162.76 GB/s, 194.12
-              GB/s, 229.02 GB/s, 260.35 GB/s]
-        size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
-            MB, 1.65 MB]
-        size per thread: [6.60 MB, 3.30 MB, 2.20 MB, 1.65 MB, 1.32 MB, 1.10 MB, 0.94
-            MB, 825.00 kB]
-        stats:
-          copy:
-          - [22.79 GB/s, 22.55 GB/s, 22.86 GB/s, 22.74 GB/s, 23.09 GB/s, 22.51 GB/s,
-            23.35 GB/s, 23.32 GB/s, 23.02 GB/s, 22.75 GB/s]
-          - [45.32 GB/s, 45.15 GB/s, 45.63 GB/s, 44.84 GB/s, 44.54 GB/s, 44.33 GB/s,
-            44.68 GB/s, 44.98 GB/s, 44.64 GB/s, 44.75 GB/s]
-          - [68.10 GB/s, 67.88 GB/s, 67.98 GB/s, 67.32 GB/s, 67.02 GB/s, 67.14 GB/s,
-            67.71 GB/s, 67.19 GB/s, 63.08 GB/s, 68.04 GB/s]
-          - [89.46 GB/s, 88.53 GB/s, 88.51 GB/s, 89.13 GB/s, 89.32 GB/s, 84.53 GB/s,
-            87.51 GB/s, 88.95 GB/s, 88.91 GB/s, 87.62 GB/s]
-          - [108.72 GB/s, 110.42 GB/s, 106.02 GB/s, 111.08 GB/s, 110.70 GB/s, 111.10
-              GB/s, 110.24 GB/s, 109.68 GB/s, 109.55 GB/s, 108.86 GB/s]
-          - [133.21 GB/s, 127.37 GB/s, 132.83 GB/s, 132.67 GB/s, 133.02 GB/s, 132.65
-              GB/s, 134.20 GB/s, 132.96 GB/s, 118.86 GB/s, 131.20 GB/s]
-          - [152.95 GB/s, 153.90 GB/s, 153.80 GB/s, 153.22 GB/s, 153.32 GB/s, 142.75
-              GB/s, 152.99 GB/s, 154.44 GB/s, 154.43 GB/s, 152.24 GB/s]
-          - [174.89 GB/s, 171.49 GB/s, 157.46 GB/s, 172.90 GB/s, 173.42 GB/s, 171.07
-              GB/s, 171.82 GB/s, 170.68 GB/s, 172.19 GB/s, 161.38 GB/s]
-          daxpy:
-          - [31.88 GB/s, 32.27 GB/s, 31.11 GB/s, 32.20 GB/s, 32.17 GB/s, 32.32 GB/s,
-            32.20 GB/s, 32.32 GB/s, 30.76 GB/s, 32.03 GB/s]
-          - [64.16 GB/s, 63.70 GB/s, 64.04 GB/s, 63.55 GB/s, 60.64 GB/s, 64.05 GB/s,
-            63.56 GB/s, 63.36 GB/s, 63.94 GB/s, 63.86 GB/s]
-          - [96.12 GB/s, 95.66 GB/s, 95.93 GB/s, 95.93 GB/s, 96.10 GB/s, 95.94 GB/s,
-            95.78 GB/s, 95.79 GB/s, 95.17 GB/s, 89.44 GB/s]
-          - [126.04 GB/s, 126.43 GB/s, 126.09 GB/s, 124.90 GB/s, 125.07 GB/s, 125.74
-              GB/s, 118.86 GB/s, 125.80 GB/s, 125.10 GB/s, 126.75 GB/s]
-          - [155.92 GB/s, 155.99 GB/s, 156.32 GB/s, 151.54 GB/s, 156.49 GB/s, 156.91
-              GB/s, 154.92 GB/s, 155.92 GB/s, 156.20 GB/s, 154.49 GB/s]
-          - [185.57 GB/s, 180.38 GB/s, 187.51 GB/s, 187.10 GB/s, 186.44 GB/s, 187.13
-              GB/s, 187.31 GB/s, 188.10 GB/s, 187.91 GB/s, 188.57 GB/s]
-          - [207.55 GB/s, 219.63 GB/s, 219.38 GB/s, 219.81 GB/s, 220.29 GB/s, 219.72
-              GB/s, 221.05 GB/s, 216.76 GB/s, 221.57 GB/s, 220.75 GB/s]
-          - [250.81 GB/s, 250.78 GB/s, 251.19 GB/s, 251.28 GB/s, 249.10 GB/s, 250.42
-              GB/s, 251.65 GB/s, 244.31 GB/s, 250.40 GB/s, 250.19 GB/s]
-          load:
-          - [24.84 GB/s, 24.86 GB/s, 25.09 GB/s, 25.04 GB/s, 24.74 GB/s, 24.87 GB/s,
-            25.01 GB/s, 25.08 GB/s, 25.14 GB/s, 25.00 GB/s]
-          - [50.03 GB/s, 49.40 GB/s, 50.28 GB/s, 50.08 GB/s, 50.37 GB/s, 49.75 GB/s,
-            50.01 GB/s, 50.38 GB/s, 49.89 GB/s, 50.24 GB/s]
-          - [74.37 GB/s, 74.65 GB/s, 74.40 GB/s, 73.45 GB/s, 73.31 GB/s, 73.00 GB/s,
-            75.49 GB/s, 73.94 GB/s, 74.42 GB/s, 74.80 GB/s]
-          - [99.51 GB/s, 99.43 GB/s, 98.90 GB/s, 99.83 GB/s, 98.74 GB/s, 100.75 GB/s,
-            99.33 GB/s, 99.81 GB/s, 100.00 GB/s, 101.06 GB/s]
-          - [126.04 GB/s, 126.03 GB/s, 124.70 GB/s, 124.86 GB/s, 125.31 GB/s, 124.78
-              GB/s, 125.99 GB/s, 123.52 GB/s, 124.45 GB/s, 123.01 GB/s]
-          - [146.95 GB/s, 150.27 GB/s, 151.12 GB/s, 150.93 GB/s, 150.68 GB/s, 149.75
-              GB/s, 150.67 GB/s, 146.01 GB/s, 148.34 GB/s, 149.15 GB/s]
-          - [169.40 GB/s, 172.12 GB/s, 172.40 GB/s, 171.99 GB/s, 172.57 GB/s, 171.95
-              GB/s, 167.06 GB/s, 169.66 GB/s, 168.34 GB/s, 169.45 GB/s]
-          - [192.68 GB/s, 191.98 GB/s, 192.82 GB/s, 191.84 GB/s, 191.97 GB/s, 196.91
-              GB/s, 193.36 GB/s, 190.12 GB/s, 192.04 GB/s, 193.93 GB/s]
-          triad:
-          - [24.78 GB/s, 25.03 GB/s, 25.07 GB/s, 24.81 GB/s, 24.65 GB/s, 24.80 GB/s,
-            24.71 GB/s, 25.15 GB/s, 24.70 GB/s, 24.25 GB/s]
-          - [49.63 GB/s, 48.68 GB/s, 49.73 GB/s, 49.97 GB/s, 50.37 GB/s, 49.89 GB/s,
-            49.59 GB/s, 49.00 GB/s, 49.96 GB/s, 49.61 GB/s]
-          - [74.88 GB/s, 74.99 GB/s, 75.31 GB/s, 73.20 GB/s, 74.50 GB/s, 72.88 GB/s,
-            73.43 GB/s, 73.74 GB/s, 74.59 GB/s, 74.60 GB/s]
-          - [95.80 GB/s, 97.67 GB/s, 98.93 GB/s, 97.79 GB/s, 98.74 GB/s, 97.74 GB/s,
-            98.87 GB/s, 99.12 GB/s, 97.90 GB/s, 97.96 GB/s]
-          - [121.15 GB/s, 120.28 GB/s, 120.66 GB/s, 121.19 GB/s, 121.09 GB/s, 121.68
-              GB/s, 121.30 GB/s, 123.22 GB/s, 122.51 GB/s, 123.25 GB/s]
-          - [146.72 GB/s, 146.38 GB/s, 146.25 GB/s, 146.49 GB/s, 146.29 GB/s, 144.30
-              GB/s, 142.89 GB/s, 150.29 GB/s, 146.37 GB/s, 146.30 GB/s]
-          - [166.36 GB/s, 168.18 GB/s, 168.79 GB/s, 170.27 GB/s, 169.26 GB/s, 170.98
-              GB/s, 170.77 GB/s, 171.43 GB/s, 169.53 GB/s, 171.60 GB/s]
-          - [190.83 GB/s, 197.81 GB/s, 196.29 GB/s, 197.12 GB/s, 196.21 GB/s, 188.40
-              GB/s, 191.07 GB/s, 195.14 GB/s, 192.48 GB/s, 194.23 GB/s]
-          update:
-          - [32.74 GB/s, 32.98 GB/s, 32.73 GB/s, 32.57 GB/s, 32.63 GB/s, 32.41 GB/s,
-            32.61 GB/s, 32.24 GB/s, 32.52 GB/s, 32.49 GB/s]
-          - [65.22 GB/s, 65.07 GB/s, 64.65 GB/s, 65.26 GB/s, 63.70 GB/s, 64.19 GB/s,
-            64.35 GB/s, 64.83 GB/s, 65.60 GB/s, 63.99 GB/s]
-          - [97.60 GB/s, 96.65 GB/s, 97.50 GB/s, 96.07 GB/s, 97.12 GB/s, 96.41 GB/s,
-            96.85 GB/s, 96.80 GB/s, 97.10 GB/s, 97.10 GB/s]
-          - [129.18 GB/s, 127.79 GB/s, 129.50 GB/s, 129.46 GB/s, 128.85 GB/s, 128.69
-              GB/s, 129.02 GB/s, 130.34 GB/s, 129.92 GB/s, 129.11 GB/s]
-          - [160.00 GB/s, 161.81 GB/s, 160.37 GB/s, 159.56 GB/s, 160.38 GB/s, 161.91
-              GB/s, 160.54 GB/s, 161.43 GB/s, 160.59 GB/s, 162.76 GB/s]
-          - [192.24 GB/s, 193.69 GB/s, 191.11 GB/s, 190.65 GB/s, 193.10 GB/s, 191.30
-              GB/s, 192.50 GB/s, 193.37 GB/s, 191.98 GB/s, 194.12 GB/s]
-          - [221.45 GB/s, 229.02 GB/s, 226.33 GB/s, 224.81 GB/s, 225.62 GB/s, 224.79
-              GB/s, 226.03 GB/s, 227.09 GB/s, 226.46 GB/s, 225.88 GB/s]
-          - [255.45 GB/s, 256.52 GB/s, 254.06 GB/s, 257.76 GB/s, 256.85 GB/s, 256.27
-              GB/s, 260.35 GB/s, 259.96 GB/s, 258.40 GB/s, 255.79 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
-            MB, 13.20 MB]
-    MEM:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [11.12 GB/s, 20.53 GB/s, 24.86 GB/s, 26.20 GB/s, 26.47 GB/s, 26.35
-              GB/s, 26.24 GB/s, 26.17 GB/s]
-          daxpy: [16.10 GB/s, 30.00 GB/s, 36.88 GB/s, 38.86 GB/s, 39.36 GB/s, 39.19
-              GB/s, 39.02 GB/s, 38.88 GB/s]
-          load: [12.30 GB/s, 23.50 GB/s, 33.04 GB/s, 40.59 GB/s, 44.03 GB/s, 44.56
-              GB/s, 44.26 GB/s, 43.77 GB/s]
-          triad: [12.41 GB/s, 24.13 GB/s, 29.24 GB/s, 30.73 GB/s, 30.68 GB/s, 30.58
-              GB/s, 30.54 GB/s, 30.63 GB/s]
-          update: [17.40 GB/s, 31.16 GB/s, 36.80 GB/s, 39.06 GB/s, 39.80 GB/s, 39.77
-              GB/s, 39.50 GB/s, 39.24 GB/s]
-        size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
-            MB, 42.86 MB, 37.50 MB]
-        size per thread: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
-            MB, 42.86 MB, 37.50 MB]
-        stats:
-          copy:
-          - [10.83 GB/s, 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 10.82 GB/s, 10.82 GB/s,
-            10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 11.12 GB/s]
-          - [20.34 GB/s, 20.38 GB/s, 20.37 GB/s, 20.34 GB/s, 20.41 GB/s, 20.39 GB/s,
-            20.39 GB/s, 20.39 GB/s, 20.53 GB/s, 20.35 GB/s]
-          - [24.70 GB/s, 24.76 GB/s, 24.80 GB/s, 24.86 GB/s, 24.75 GB/s, 24.80 GB/s,
-            24.77 GB/s, 24.82 GB/s, 24.81 GB/s, 24.73 GB/s]
-          - [26.10 GB/s, 26.16 GB/s, 26.14 GB/s, 26.16 GB/s, 26.10 GB/s, 26.15 GB/s,
-            26.10 GB/s, 26.15 GB/s, 26.11 GB/s, 26.20 GB/s]
-          - [26.45 GB/s, 26.44 GB/s, 26.41 GB/s, 26.43 GB/s, 26.45 GB/s, 26.44 GB/s,
-            26.46 GB/s, 26.47 GB/s, 26.45 GB/s, 26.44 GB/s]
-          - [26.34 GB/s, 26.30 GB/s, 26.31 GB/s, 26.33 GB/s, 26.26 GB/s, 26.35 GB/s,
-            26.30 GB/s, 26.30 GB/s, 26.30 GB/s, 26.34 GB/s]
-          - [26.20 GB/s, 26.24 GB/s, 26.21 GB/s, 26.22 GB/s, 26.22 GB/s, 26.20 GB/s,
-            26.20 GB/s, 26.23 GB/s, 26.22 GB/s, 26.23 GB/s]
-          - [26.15 GB/s, 26.17 GB/s, 26.12 GB/s, 26.15 GB/s, 26.15 GB/s, 26.15 GB/s,
-            26.12 GB/s, 26.14 GB/s, 26.14 GB/s, 26.17 GB/s]
-          daxpy:
-          - [15.77 GB/s, 15.77 GB/s, 16.04 GB/s, 15.68 GB/s, 15.72 GB/s, 15.76 GB/s,
-            15.91 GB/s, 15.77 GB/s, 16.10 GB/s, 16.04 GB/s]
-          - [29.88 GB/s, 29.80 GB/s, 30.00 GB/s, 29.87 GB/s, 29.87 GB/s, 30.00 GB/s,
-            29.79 GB/s, 29.80 GB/s, 29.80 GB/s, 29.82 GB/s]
-          - [36.63 GB/s, 36.73 GB/s, 36.64 GB/s, 36.64 GB/s, 36.81 GB/s, 36.88 GB/s,
-            36.62 GB/s, 36.65 GB/s, 36.74 GB/s, 36.71 GB/s]
-          - [38.82 GB/s, 38.83 GB/s, 38.86 GB/s, 38.81 GB/s, 38.81 GB/s, 38.82 GB/s,
-            38.85 GB/s, 38.80 GB/s, 38.84 GB/s, 38.73 GB/s]
-          - [39.32 GB/s, 39.30 GB/s, 39.34 GB/s, 39.36 GB/s, 39.28 GB/s, 39.33 GB/s,
-            39.31 GB/s, 39.25 GB/s, 39.32 GB/s, 39.33 GB/s]
-          - [39.10 GB/s, 39.12 GB/s, 39.14 GB/s, 39.16 GB/s, 39.17 GB/s, 39.17 GB/s,
-            39.13 GB/s, 39.15 GB/s, 39.14 GB/s, 39.19 GB/s]
-          - [39.01 GB/s, 39.01 GB/s, 39.02 GB/s, 39.02 GB/s, 39.00 GB/s, 39.00 GB/s,
-            38.97 GB/s, 39.02 GB/s, 38.98 GB/s, 39.01 GB/s]
-          - [38.76 GB/s, 38.86 GB/s, 38.83 GB/s, 38.82 GB/s, 38.87 GB/s, 38.88 GB/s,
-            38.81 GB/s, 38.83 GB/s, 38.88 GB/s, 38.88 GB/s]
-          load:
-          - [11.97 GB/s, 11.96 GB/s, 11.98 GB/s, 11.97 GB/s, 11.96 GB/s, 12.05 GB/s,
-            12.30 GB/s, 12.18 GB/s, 11.97 GB/s, 11.96 GB/s]
-          - [22.85 GB/s, 22.85 GB/s, 22.87 GB/s, 22.94 GB/s, 23.50 GB/s, 22.86 GB/s,
-            22.86 GB/s, 23.25 GB/s, 22.85 GB/s, 22.86 GB/s]
-          - [33.04 GB/s, 32.43 GB/s, 32.51 GB/s, 32.52 GB/s, 32.52 GB/s, 32.81 GB/s,
-            32.77 GB/s, 32.54 GB/s, 32.53 GB/s, 32.53 GB/s]
-          - [39.95 GB/s, 39.94 GB/s, 39.93 GB/s, 40.15 GB/s, 40.59 GB/s, 40.36 GB/s,
-            40.28 GB/s, 39.93 GB/s, 39.94 GB/s, 39.98 GB/s]
-          - [43.98 GB/s, 43.86 GB/s, 43.90 GB/s, 43.80 GB/s, 43.83 GB/s, 43.86 GB/s,
-            44.03 GB/s, 43.94 GB/s, 43.83 GB/s, 43.92 GB/s]
-          - [44.46 GB/s, 44.34 GB/s, 44.56 GB/s, 44.51 GB/s, 44.32 GB/s, 44.32 GB/s,
-            44.51 GB/s, 44.48 GB/s, 44.32 GB/s, 44.34 GB/s]
-          - [44.03 GB/s, 44.26 GB/s, 44.08 GB/s, 44.18 GB/s, 44.10 GB/s, 43.99 GB/s,
-            44.07 GB/s, 44.06 GB/s, 43.94 GB/s, 43.97 GB/s]
-          - [43.48 GB/s, 43.77 GB/s, 43.51 GB/s, 43.49 GB/s, 43.47 GB/s, 43.73 GB/s,
-            43.55 GB/s, 43.68 GB/s, 43.49 GB/s, 43.50 GB/s]
-          triad:
-          - [12.11 GB/s, 12.02 GB/s, 12.03 GB/s, 12.10 GB/s, 12.03 GB/s, 12.04 GB/s,
-            12.05 GB/s, 12.17 GB/s, 12.02 GB/s, 12.41 GB/s]
-          - [23.43 GB/s, 23.25 GB/s, 23.25 GB/s, 23.36 GB/s, 23.28 GB/s, 23.24 GB/s,
-            23.61 GB/s, 23.29 GB/s, 23.31 GB/s, 24.13 GB/s]
-          - [28.92 GB/s, 29.10 GB/s, 29.17 GB/s, 29.04 GB/s, 28.91 GB/s, 29.16 GB/s,
-            28.82 GB/s, 29.01 GB/s, 29.24 GB/s, 28.88 GB/s]
-          - [30.65 GB/s, 30.62 GB/s, 30.73 GB/s, 30.59 GB/s, 30.69 GB/s, 30.68 GB/s,
-            30.59 GB/s, 30.59 GB/s, 30.57 GB/s, 30.67 GB/s]
-          - [30.53 GB/s, 30.67 GB/s, 30.65 GB/s, 30.53 GB/s, 30.63 GB/s, 30.68 GB/s,
-            30.50 GB/s, 30.67 GB/s, 30.64 GB/s, 30.67 GB/s]
-          - [30.45 GB/s, 30.58 GB/s, 30.51 GB/s, 30.49 GB/s, 30.52 GB/s, 30.49 GB/s,
-            30.56 GB/s, 30.55 GB/s, 30.47 GB/s, 30.47 GB/s]
-          - [30.51 GB/s, 30.47 GB/s, 30.50 GB/s, 30.47 GB/s, 30.52 GB/s, 30.54 GB/s,
-            30.54 GB/s, 30.50 GB/s, 30.49 GB/s, 30.50 GB/s]
-          - [30.58 GB/s, 30.34 GB/s, 30.56 GB/s, 30.54 GB/s, 30.63 GB/s, 30.53 GB/s,
-            30.59 GB/s, 30.50 GB/s, 30.54 GB/s, 30.47 GB/s]
-          update:
-          - [17.33 GB/s, 17.32 GB/s, 17.34 GB/s, 17.35 GB/s, 17.40 GB/s, 17.35 GB/s,
-            17.36 GB/s, 17.39 GB/s, 17.35 GB/s, 17.35 GB/s]
-          - [31.12 GB/s, 31.15 GB/s, 31.10 GB/s, 31.16 GB/s, 31.07 GB/s, 31.08 GB/s,
-            31.09 GB/s, 31.12 GB/s, 31.12 GB/s, 31.08 GB/s]
-          - [36.80 GB/s, 36.42 GB/s, 35.92 GB/s, 36.39 GB/s, 35.99 GB/s, 35.98 GB/s,
-            36.37 GB/s, 36.39 GB/s, 36.38 GB/s, 36.44 GB/s]
-          - [39.03 GB/s, 39.05 GB/s, 39.02 GB/s, 39.06 GB/s, 39.01 GB/s, 39.02 GB/s,
-            39.02 GB/s, 39.00 GB/s, 39.00 GB/s, 39.00 GB/s]
-          - [39.76 GB/s, 39.80 GB/s, 39.80 GB/s, 39.78 GB/s, 39.76 GB/s, 39.79 GB/s,
-            39.79 GB/s, 39.77 GB/s, 39.77 GB/s, 39.71 GB/s]
-          - [39.71 GB/s, 39.72 GB/s, 39.72 GB/s, 39.66 GB/s, 39.74 GB/s, 39.70 GB/s,
-            39.76 GB/s, 39.74 GB/s, 39.77 GB/s, 39.74 GB/s]
-          - [39.50 GB/s, 39.47 GB/s, 39.45 GB/s, 39.43 GB/s, 39.46 GB/s, 39.45 GB/s,
-            39.45 GB/s, 39.40 GB/s, 39.43 GB/s, 39.47 GB/s]
-          - [39.21 GB/s, 39.18 GB/s, 39.19 GB/s, 39.19 GB/s, 39.21 GB/s, 39.19 GB/s,
-            39.18 GB/s, 39.21 GB/s, 39.20 GB/s, 39.24 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
-            MB, 300.00 MB, 300.00 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [10.79 GB/s, 20.46 GB/s, 24.69 GB/s, 25.42 GB/s, 25.63 GB/s, 25.45
-              GB/s, 25.32 GB/s, 25.06 GB/s]
-          daxpy: [15.97 GB/s, 29.70 GB/s, 35.95 GB/s, 37.55 GB/s, 37.81 GB/s, 37.78
-              GB/s, 37.64 GB/s, 37.33 GB/s]
-          load: [13.46 GB/s, 25.84 GB/s, 35.75 GB/s, 40.54 GB/s, 42.38 GB/s, 42.30
-              GB/s, 41.85 GB/s, 41.19 GB/s]
-          triad: [12.05 GB/s, 22.53 GB/s, 27.53 GB/s, 29.10 GB/s, 29.68 GB/s, 29.79
-              GB/s, 29.85 GB/s, 29.64 GB/s]
-          update: [19.12 GB/s, 33.86 GB/s, 38.51 GB/s, 39.38 GB/s, 39.20 GB/s, 38.80
-              GB/s, 38.39 GB/s, 38.02 GB/s]
-        size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
-            MB, 42.86 MB, 37.50 MB]
-        size per thread: [150.00 MB, 75.00 MB, 50.00 MB, 37.50 MB, 30.00 MB, 25.00
-            MB, 21.43 MB, 18.75 MB]
-        stats:
-          copy:
-          - [10.71 GB/s, 10.69 GB/s, 10.71 GB/s, 10.70 GB/s, 10.79 GB/s, 10.58 GB/s,
-            10.70 GB/s, 10.69 GB/s, 10.69 GB/s, 10.70 GB/s]
-          - [20.27 GB/s, 20.31 GB/s, 20.27 GB/s, 20.26 GB/s, 20.31 GB/s, 20.26 GB/s,
-            20.24 GB/s, 20.26 GB/s, 20.26 GB/s, 20.46 GB/s]
-          - [24.69 GB/s, 24.66 GB/s, 24.64 GB/s, 24.63 GB/s, 24.67 GB/s, 24.64 GB/s,
-            24.64 GB/s, 24.68 GB/s, 24.61 GB/s, 24.63 GB/s]
-          - [25.42 GB/s, 25.41 GB/s, 25.40 GB/s, 25.36 GB/s, 25.40 GB/s, 25.39 GB/s,
-            25.40 GB/s, 25.38 GB/s, 25.41 GB/s, 25.39 GB/s]
-          - [25.55 GB/s, 25.57 GB/s, 25.58 GB/s, 25.63 GB/s, 25.57 GB/s, 25.57 GB/s,
-            25.58 GB/s, 25.55 GB/s, 25.57 GB/s, 25.49 GB/s]
-          - [25.42 GB/s, 25.42 GB/s, 25.41 GB/s, 25.39 GB/s, 25.40 GB/s, 25.43 GB/s,
-            25.45 GB/s, 25.44 GB/s, 25.43 GB/s, 25.43 GB/s]
-          - [25.27 GB/s, 25.31 GB/s, 25.28 GB/s, 25.31 GB/s, 25.32 GB/s, 25.31 GB/s,
-            25.29 GB/s, 25.30 GB/s, 25.25 GB/s, 25.28 GB/s]
-          - [25.03 GB/s, 25.01 GB/s, 25.01 GB/s, 25.04 GB/s, 25.00 GB/s, 25.03 GB/s,
-            25.06 GB/s, 25.04 GB/s, 25.04 GB/s, 25.04 GB/s]
-          daxpy:
-          - [15.81 GB/s, 15.81 GB/s, 15.97 GB/s, 15.62 GB/s, 15.64 GB/s, 15.83 GB/s,
-            15.63 GB/s, 15.82 GB/s, 15.81 GB/s, 15.63 GB/s]
-          - [29.62 GB/s, 29.56 GB/s, 29.61 GB/s, 29.59 GB/s, 29.70 GB/s, 29.61 GB/s,
-            29.65 GB/s, 29.65 GB/s, 29.58 GB/s, 29.59 GB/s]
-          - [35.95 GB/s, 35.89 GB/s, 35.92 GB/s, 35.92 GB/s, 35.95 GB/s, 35.90 GB/s,
-            35.87 GB/s, 35.90 GB/s, 35.92 GB/s, 35.82 GB/s]
-          - [37.55 GB/s, 37.46 GB/s, 37.52 GB/s, 37.51 GB/s, 37.55 GB/s, 37.51 GB/s,
-            37.44 GB/s, 37.41 GB/s, 37.50 GB/s, 37.40 GB/s]
-          - [37.79 GB/s, 37.76 GB/s, 37.80 GB/s, 37.77 GB/s, 37.76 GB/s, 37.81 GB/s,
-            37.78 GB/s, 37.81 GB/s, 37.79 GB/s, 37.78 GB/s]
-          - [37.71 GB/s, 37.68 GB/s, 37.68 GB/s, 37.73 GB/s, 37.74 GB/s, 37.66 GB/s,
-            37.78 GB/s, 37.74 GB/s, 37.71 GB/s, 37.70 GB/s]
-          - [37.61 GB/s, 37.60 GB/s, 37.61 GB/s, 37.62 GB/s, 37.64 GB/s, 37.61 GB/s,
-            37.60 GB/s, 37.59 GB/s, 37.63 GB/s, 37.60 GB/s]
-          - [37.23 GB/s, 37.21 GB/s, 37.26 GB/s, 37.27 GB/s, 37.28 GB/s, 37.33 GB/s,
-            37.29 GB/s, 37.31 GB/s, 37.26 GB/s, 37.29 GB/s]
-          load:
-          - [13.34 GB/s, 13.36 GB/s, 13.35 GB/s, 13.34 GB/s, 13.35 GB/s, 13.38 GB/s,
-            13.46 GB/s, 13.35 GB/s, 13.35 GB/s, 13.35 GB/s]
-          - [25.63 GB/s, 25.64 GB/s, 25.84 GB/s, 25.64 GB/s, 25.74 GB/s, 25.63 GB/s,
-            25.64 GB/s, 25.63 GB/s, 25.64 GB/s, 25.68 GB/s]
-          - [35.38 GB/s, 35.56 GB/s, 35.50 GB/s, 35.75 GB/s, 35.50 GB/s, 35.39 GB/s,
-            35.46 GB/s, 35.39 GB/s, 35.75 GB/s, 35.40 GB/s]
-          - [40.37 GB/s, 40.37 GB/s, 40.49 GB/s, 40.49 GB/s, 40.42 GB/s, 40.37 GB/s,
-            40.54 GB/s, 40.39 GB/s, 40.37 GB/s, 40.51 GB/s]
-          - [42.34 GB/s, 42.14 GB/s, 42.26 GB/s, 42.17 GB/s, 42.10 GB/s, 42.13 GB/s,
-            42.38 GB/s, 42.13 GB/s, 42.21 GB/s, 42.15 GB/s]
-          - [42.30 GB/s, 42.13 GB/s, 42.20 GB/s, 42.11 GB/s, 42.12 GB/s, 42.12 GB/s,
-            42.18 GB/s, 42.25 GB/s, 42.19 GB/s, 42.21 GB/s]
-          - [41.70 GB/s, 41.76 GB/s, 41.85 GB/s, 41.80 GB/s, 41.71 GB/s, 41.71 GB/s,
-            41.80 GB/s, 41.70 GB/s, 41.76 GB/s, 41.75 GB/s]
-          - [41.02 GB/s, 41.01 GB/s, 41.17 GB/s, 41.12 GB/s, 41.13 GB/s, 41.15 GB/s,
-            41.19 GB/s, 41.01 GB/s, 41.10 GB/s, 41.06 GB/s]
-          triad:
-          - [11.87 GB/s, 11.89 GB/s, 11.91 GB/s, 11.81 GB/s, 11.83 GB/s, 11.85 GB/s,
-            11.90 GB/s, 11.80 GB/s, 11.85 GB/s, 12.05 GB/s]
-          - [22.53 GB/s, 22.47 GB/s, 22.44 GB/s, 22.46 GB/s, 22.43 GB/s, 22.52 GB/s,
-            22.41 GB/s, 22.52 GB/s, 22.48 GB/s, 22.41 GB/s]
-          - [27.43 GB/s, 27.42 GB/s, 27.47 GB/s, 27.47 GB/s, 27.52 GB/s, 27.49 GB/s,
-            27.41 GB/s, 27.42 GB/s, 27.51 GB/s, 27.53 GB/s]
-          - [29.02 GB/s, 29.03 GB/s, 29.03 GB/s, 29.04 GB/s, 28.89 GB/s, 29.10 GB/s,
-            29.02 GB/s, 29.05 GB/s, 28.93 GB/s, 29.01 GB/s]
-          - [29.66 GB/s, 29.68 GB/s, 29.60 GB/s, 29.62 GB/s, 29.60 GB/s, 29.67 GB/s,
-            29.66 GB/s, 29.62 GB/s, 29.62 GB/s, 29.62 GB/s]
-          - [29.78 GB/s, 29.76 GB/s, 29.77 GB/s, 29.77 GB/s, 29.75 GB/s, 29.79 GB/s,
-            29.75 GB/s, 29.77 GB/s, 29.76 GB/s, 29.78 GB/s]
-          - [29.82 GB/s, 29.85 GB/s, 29.85 GB/s, 29.83 GB/s, 29.82 GB/s, 29.83 GB/s,
-            29.83 GB/s, 29.81 GB/s, 29.81 GB/s, 29.80 GB/s]
-          - [29.54 GB/s, 29.63 GB/s, 29.57 GB/s, 29.56 GB/s, 29.55 GB/s, 29.64 GB/s,
-            29.60 GB/s, 29.53 GB/s, 29.54 GB/s, 29.57 GB/s]
-          update:
-          - [18.66 GB/s, 18.67 GB/s, 18.66 GB/s, 19.12 GB/s, 18.67 GB/s, 18.67 GB/s,
-            18.67 GB/s, 18.67 GB/s, 18.70 GB/s, 18.67 GB/s]
-          - [33.61 GB/s, 33.34 GB/s, 33.71 GB/s, 33.31 GB/s, 33.34 GB/s, 33.86 GB/s,
-            33.62 GB/s, 33.35 GB/s, 33.54 GB/s, 33.34 GB/s]
-          - [38.51 GB/s, 38.46 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s, 38.46 GB/s,
-            38.41 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s]
-          - [39.37 GB/s, 39.34 GB/s, 39.36 GB/s, 39.35 GB/s, 39.37 GB/s, 39.38 GB/s,
-            39.36 GB/s, 39.35 GB/s, 39.31 GB/s, 39.32 GB/s]
-          - [39.17 GB/s, 39.17 GB/s, 39.16 GB/s, 39.20 GB/s, 39.18 GB/s, 39.17 GB/s,
-            39.18 GB/s, 39.15 GB/s, 39.20 GB/s, 39.17 GB/s]
-          - [38.79 GB/s, 38.79 GB/s, 38.80 GB/s, 38.78 GB/s, 38.78 GB/s, 38.75 GB/s,
-            38.80 GB/s, 38.77 GB/s, 38.78 GB/s, 38.78 GB/s]
-          - [38.36 GB/s, 38.37 GB/s, 38.37 GB/s, 38.39 GB/s, 38.36 GB/s, 38.37 GB/s,
-            38.38 GB/s, 38.37 GB/s, 38.35 GB/s, 38.39 GB/s]
-          - [37.98 GB/s, 37.99 GB/s, 38.02 GB/s, 38.01 GB/s, 38.01 GB/s, 38.00 GB/s,
-            38.02 GB/s, 38.00 GB/s, 38.02 GB/s, 38.02 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
-            MB, 300.00 MB, 300.00 MB]
--- a/pystencils_tests/test_Min_Max.py
+++ b/pystencils_tests/test_Min_Max.py
-import sympy
-import numpy
-import pystencils
-from pystencils.datahandling import create_data_handling
-def test_max():
-    dh = create_data_handling(domain_size=(10, 10), periodicity=True)
-    x = dh.add_array('x', values_per_cell=1)
-    dh.fill("x", 0.0, ghost_layers=True)
-    y = dh.add_array('y', values_per_cell=1)
-    dh.fill("y", 1.0, ghost_layers=True)
-    z = dh.add_array('z', values_per_cell=1)
-    dh.fill("z", 2.0, ghost_layers=True)
-    # test sp.Max with one argument
-    assignment_1 = pystencils.Assignment(x.center, sympy.Max(y.center + 3.3))
-    ast_1 = pystencils.create_kernel(assignment_1)
-    kernel_1 = ast_1.compile()
-    # test sp.Max with two arguments
-    assignment_2 = pystencils.Assignment(x.center, sympy.Max(0.5, y.center - 1.5))
-    ast_2 = pystencils.create_kernel(assignment_2)
-    kernel_2 = ast_2.compile()
-    # test sp.Max with many arguments
-    assignment_3 = pystencils.Assignment(x.center, sympy.Max(z.center, 4.5, y.center - 1.5, y.center + z.center))
-    ast_3 = pystencils.create_kernel(assignment_3)
-    kernel_3 = ast_3.compile()
-    dh.run_kernel(kernel_1)
-    assert numpy.all(dh.cpu_arrays["x"] == 4.3)
-    dh.run_kernel(kernel_2)
-    assert numpy.all(dh.cpu_arrays["x"] == 0.5)
-    dh.run_kernel(kernel_3)
-    assert numpy.all(dh.cpu_arrays["x"] == 4.5)
-def test_min():
-    dh = create_data_handling(domain_size=(10, 10), periodicity=True)
-    x = dh.add_array('x', values_per_cell=1)
-    dh.fill("x", 0.0, ghost_layers=True)
-    y = dh.add_array('y', values_per_cell=1)
-    dh.fill("y", 1.0, ghost_layers=True)
-    z = dh.add_array('z', values_per_cell=1)
-    dh.fill("z", 2.0, ghost_layers=True)
-    # test sp.Min with one argument
-    assignment_1 = pystencils.Assignment(x.center, sympy.Min(y.center + 3.3))
-    ast_1 = pystencils.create_kernel(assignment_1)
-    kernel_1 = ast_1.compile()
-    # test sp.Min with two arguments
-    assignment_2 = pystencils.Assignment(x.center, sympy.Min(0.5, y.center - 1.5))
-    ast_2 = pystencils.create_kernel(assignment_2)
-    kernel_2 = ast_2.compile()
-    # test sp.Min with many arguments
-    assignment_3 = pystencils.Assignment(x.center, sympy.Min(z.center, 4.5, y.center - 1.5, y.center + z.center))
-    ast_3 = pystencils.create_kernel(assignment_3)
-    kernel_3 = ast_3.compile()
-    dh.run_kernel(kernel_1)
-    assert numpy.all(dh.cpu_arrays["x"] == 4.3)
-    dh.run_kernel(kernel_2)
-    assert numpy.all(dh.cpu_arrays["x"] == - 0.5)
-    dh.run_kernel(kernel_3)
-    assert numpy.all(dh.cpu_arrays["x"] == - 0.5)
No results found