From 2082ef802a38f828365a0bbe9467743c65a5eb78 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Thu, 25 Nov 2021 11:34:56 +0000
Subject: [PATCH] Removed kerncraft

---
 README.md                                     |   2 -
 conftest.py                                   |  16 -
 pystencils/__init__.py                        |   7 -
 pystencils/autodiff.py                        |  12 -
 pystencils/kerncraft_coupling/__init__.py     |   4 -
 .../kerncraft_coupling/generate_benchmark.py  | 146 ---
 .../kerncraft_coupling/kerncraft_interface.py | 373 -------
 .../kerncraft_coupling/templates/benchmark.c  | 157 ---
 .../kerncraft_coupling/templates/kernel.c     |  18 -
 .../kerncraft_coupling/templates/kernel.h     |   3 -
 .../SkylakeSP_Gold-5122_allinclusive.yaml     | 600 -----------
 pystencils_tests/benchmark/benchmark.py       | 188 ----
 pystencils_tests/benchmark/generate.py        |  50 -
 pystencils_tests/benchmark/iacaMarks.h        |  53 -
 pystencils_tests/benchmark/main.c             |  11 -
 pystencils_tests/kerncraft_inputs/2d-5pt.c    |   8 -
 pystencils_tests/kerncraft_inputs/3d-7pt.c    |  10 -
 .../Example_SandyBridgeEP_E5-2680.yml         | 980 ------------------
 pystencils_tests/test_kerncraft_coupling.py   | 197 ----
 pytest.ini                                    |   3 +-
 setup.py                                      |   5 +-
 21 files changed, 2 insertions(+), 2841 deletions(-)
 delete mode 100644 pystencils/autodiff.py
 delete mode 100644 pystencils/kerncraft_coupling/__init__.py
 delete mode 100644 pystencils/kerncraft_coupling/generate_benchmark.py
 delete mode 100644 pystencils/kerncraft_coupling/kerncraft_interface.py
 delete mode 100644 pystencils/kerncraft_coupling/templates/benchmark.c
 delete mode 100644 pystencils/kerncraft_coupling/templates/kernel.c
 delete mode 100644 pystencils/kerncraft_coupling/templates/kernel.h
 delete mode 100644 pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
 delete mode 100644 pystencils_tests/benchmark/benchmark.py
 delete mode 100644 pystencils_tests/benchmark/generate.py
 delete mode 100644 pystencils_tests/benchmark/iacaMarks.h
 delete mode 100644 pystencils_tests/benchmark/main.c
 delete mode 100644 pystencils_tests/kerncraft_inputs/2d-5pt.c
 delete mode 100644 pystencils_tests/kerncraft_inputs/3d-7pt.c
 delete mode 100644 pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
 delete mode 100644 pystencils_tests/test_kerncraft_coupling.py

diff --git a/README.md b/README.md
index 62105e71d..5ab85add4 100644
--- a/README.md
+++ b/README.md
@@ -56,9 +56,7 @@ All options:
 - `alltrafos`: pulls in additional dependencies for loop simplification e.g. libisl
 - `bench_db`: functionality to store benchmark result in object databases
 - `interactive`: installs dependencies to work in Jupyter including image I/O, plotting etc.
-- `autodiff`: enable derivation of adjoint kernels and generation of Torch/Tensorflow operations
 - `doc`: packages to build documentation
-- `kerncraft`: use kerncraft for automatic performance analysis
 
 Options can be combined e.g.
 ```bash
diff --git a/conftest.py b/conftest.py
index b7c535a59..131167994 100644
--- a/conftest.py
+++ b/conftest.py
@@ -45,28 +45,12 @@ add_path_to_ignore('pystencils_tests/benchmark')
 add_path_to_ignore('_local_tmp')
 
 
-collect_ignore += [os.path.join(SCRIPT_FOLDER, "pystencils/autodiff.py")]
-
 try:
     import pycuda
 except ImportError:
     collect_ignore += [os.path.join(SCRIPT_FOLDER, "pystencils_tests/test_cudagpu.py")]
     add_path_to_ignore('pystencils/gpucuda')
 
-try:
-    import llvmlite
-except ImportError:
-    collect_ignore += [os.path.join(SCRIPT_FOLDER, 'pystencils_tests/backends/llvm.py')]
-    collect_ignore += [os.path.join(SCRIPT_FOLDER, 'pystencils_tests/test_basic_usage_llvm.ipynb')]
-    add_path_to_ignore('pystencils/llvm')
-
-try:
-    import kerncraft
-except ImportError:
-    collect_ignore += [os.path.join(SCRIPT_FOLDER, "pystencils_tests/test_kerncraft_coupling.py"),
-                       os.path.join(SCRIPT_FOLDER, "pystencils_tests/benchmark/benchmark.py")]
-    add_path_to_ignore('pystencils/kerncraft_coupling')
-
 try:
     import waLBerla
 except ImportError:
diff --git a/pystencils/__init__.py b/pystencils/__init__.py
index 4d97202bd..56b9c9e5c 100644
--- a/pystencils/__init__.py
+++ b/pystencils/__init__.py
@@ -15,13 +15,6 @@ from .slicing import make_slice
 from .spatial_coordinates import x_, x_staggered, x_staggered_vector, x_vector, y_, y_staggered, z_, z_staggered
 from .sympyextensions import SymbolCreator
 
-try:
-    import pystencils_autodiff
-
-    autodiff = pystencils_autodiff
-except ImportError:
-    pass
-
 __all__ = ['Field', 'FieldType', 'fields',
            'TypedSymbol',
            'make_slice',
diff --git a/pystencils/autodiff.py b/pystencils/autodiff.py
deleted file mode 100644
index 96ac836c8..000000000
--- a/pystencils/autodiff.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-Provides tools for generation of auto-differentiable operations.
-
-See https://github.com/theHamsta/pystencils_autodiff
-
-Installation:
-
-.. code-block:: bash
-    pip install pystencils-autodiff
-"""
-
-raise NotImplementedError('pystencils-autodiff is not installed. Run `pip install pystencils-autodiff`')
diff --git a/pystencils/kerncraft_coupling/__init__.py b/pystencils/kerncraft_coupling/__init__.py
deleted file mode 100644
index 8d2c3ad50..000000000
--- a/pystencils/kerncraft_coupling/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .generate_benchmark import generate_benchmark, run_c_benchmark
-from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
-
-__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py
deleted file mode 100644
index 8d8d7d1da..000000000
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import subprocess
-import warnings
-import tempfile
-from pathlib import Path
-
-from jinja2 import Environment, PackageLoader, StrictUndefined
-
-from pystencils.astnodes import PragmaBlock
-from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
-from pystencils.typing import get_base_type
-from pystencils.enums import Backend
-from pystencils.include import get_pystencils_include_path
-from pystencils.integer_functions import modulo_ceil
-from pystencils.sympyextensions import prod
-
-import numpy as np
-
-
-def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
-    """Return C code of a benchmark program for the given kernel.
-
-    Args:
-        ast: the pystencils AST object as returned by create_kernel
-        likwid: if True likwid markers are added to the code
-        openmp: relevant only if likwid=True, to generated correct likwid initialization code
-        timing: add timing output to the code, prints time per iteration to stdout
-
-    Returns:
-        C code as string
-    """
-    accessed_fields = {f.name: f for f in ast.fields_accessed}
-    constants = []
-    fields = []
-    call_parameters = []
-    for p in ast.get_parameters():
-        if not p.is_field_parameter:
-            constants.append((p.symbol.name, str(p.symbol.dtype)))
-            call_parameters.append(p.symbol.name)
-        else:
-            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
-            field = accessed_fields[p.field_name]
-            dtype = str(get_base_type(p.symbol.dtype))
-            np_dtype = get_base_type(p.symbol.dtype).numpy_dtype
-            size_data_type = np_dtype.itemsize
-
-            dim0_size = field.shape[-1]
-            dim1_size = np.prod(field.shape[:-1])
-            elements = prod(field.shape)
-
-            if ast.instruction_set:
-                align = ast.instruction_set['width'] * size_data_type
-                padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size
-                padding_bytes = padding_elements * size_data_type
-                ghost_layers = max(max(ast.ghost_layers))
-
-                size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type
-
-                assert align % np_dtype.itemsize == 0
-                offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type
-
-                fields.append((p.field_name, dtype, elements, size, offset, align))
-                call_parameters.append(p.field_name)
-            else:
-                size = elements * size_data_type
-                fields.append((p.field_name, dtype, elements, size, 0, 0))
-                call_parameters.append(p.field_name)
-
-    header_list = get_headers(ast)
-    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-    # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
-    # when likwid and openmp are enabled
-    if likwid and openmp:
-        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
-            ast.body.args[0].pragma_line = ''
-
-    jinja_context = {
-        'likwid': likwid,
-        'openmp': openmp,
-        'kernel_code': generate_c(ast, dialect=Backend.C),
-        'kernelName': ast.function_name,
-        'fields': fields,
-        'constants': constants,
-        'call_argument_list': ",".join(call_parameters),
-        'includes': includes,
-        'timing': timing,
-    }
-
-    env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-
-    return env.get_template('benchmark.c').render(**jinja_context)
-
-
-def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
-    """Runs the given kernel with outer loop in C
-
-    Args:
-        ast: pystencils ast which is used to compile the benchmark file
-        inner_iterations: timings are recorded around this many iterations
-        outer_iterations: number of timings recorded
-        path: path where the benchmark file is stored. If None a tmp folder is created
-
-    Returns:
-        list of times per iterations for each outer iteration
-    """
-    import kerncraft
-
-    benchmark_code = generate_benchmark(ast, timing=True)
-
-    if path is None:
-        path = tempfile.mkdtemp()
-
-    if isinstance(path, str):
-        path = Path(path)
-
-    with open(path / 'bench.c', 'w') as f:
-        f.write(benchmark_code)
-
-    kerncraft_path = Path(kerncraft.__file__).parent
-
-    extra_flags = ['-I' + get_pystencils_include_path(),
-                   '-I' + str(kerncraft_path / 'headers')]
-
-    compiler_config = get_compiler_config()
-    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
-    compile_cmd += [*extra_flags,
-                    str(kerncraft_path / 'headers' / 'timing.c'),
-                    str(kerncraft_path / 'headers' / 'dummy.c'),
-                    str(path / 'bench.c'),
-                    '-o', str(path / 'bench'),
-                    ]
-    run_compile_step(compile_cmd)
-
-    time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
-    benchmark_time_limit = 20
-    if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
-        warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
-                f"{benchmark_time_limit} seconds for this kernel")
-        warnings.warn(warn)
-
-    results = []
-    for _ in range(outer_iterations):
-        benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
-        results.append(benchmark_time)
-    return results
diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py
deleted file mode 100644
index bfb5a2d6a..000000000
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ /dev/null
@@ -1,373 +0,0 @@
-import warnings
-import fcntl
-from collections import defaultdict
-from tempfile import TemporaryDirectory
-import textwrap
-import itertools
-import string
-
-from jinja2 import Environment, PackageLoader, StrictUndefined, Template
-import sympy as sp
-from kerncraft.kerncraft import KernelCode
-from kerncraft.kernel import symbol_pos_int
-from kerncraft.machinemodel import MachineModel
-
-from pystencils.astnodes import \
-    KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment
-from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.enums import Backend
-from pystencils.field import get_layout_from_strides
-from pystencils.sympyextensions import count_operations_in_ast
-from pystencils.transformations import filtered_tree_iteration
-from pystencils.utils import DotDict
-from pystencils.cpu.kernelcreation import add_openmp
-from pystencils.typing.utilities import get_base_type
-from pystencils.sympyextensions import prod
-
-
-class PyStencilsKerncraftKernel(KernelCode):
-    """
-    Implementation of kerncraft's kernel interface for pystencils CPU kernels.
-    Analyses a list of equations assuming they will be executed on a CPU
-    """
-    LIKWID_BASE = '/usr/local/likwid'
-
-    def __init__(self, ast: KernelFunction, machine: MachineModel,
-                 assumed_layout='SoA', debug_print=False, filename=None):
-        """Create a kerncraft kernel using a pystencils AST
-
-        Args:
-            ast: pystencils ast
-            machine: kerncraft machine model - specify this if kernel needs to be compiled
-            assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
-                    coordinates is not known. In this case either a structures of array (SoA) or
-                    array of structures (AoS) layout is assumed
-            debug_print: print debug information
-            filename: used for caching
-        """
-        super(KernelCode, self).__init__(machine=machine)
-
-        # Initialize state
-        self.asm_block = None
-        self._filename = filename
-        self._keep_intermediates = False
-
-        self.kernel_ast = ast
-        self.temporary_dir = TemporaryDirectory()
-        self._keep_intermediates = debug_print
-
-        # Loops
-        inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
-                       if l.is_innermost_loop]
-        if len(inner_loops) == 0:
-            raise ValueError("No loop found in pystencils AST")
-        else:
-            if len(inner_loops) > 1:
-                warnings.warn("pystencils AST contains multiple inner loops. "
-                              "Only one can be analyzed - choosing first one")
-            inner_loop = inner_loops[0]
-
-        self._loop_stack = []
-        cur_node = inner_loop
-        while cur_node is not None:
-            if isinstance(cur_node, LoopOverCoordinate):
-                loop_counter_sym = cur_node.loop_counter_symbol
-                loop_info = (loop_counter_sym.name,
-                             sp.Integer(cur_node.start),
-                             sp.Integer(cur_node.stop),
-                             sp.Integer(1))
-                # If the correct step were to be provided, all access within that step length will
-                # also need to be passed to kerncraft: cur_node.step)
-                self._loop_stack.append(loop_info)
-            cur_node = cur_node.parent
-        self._loop_stack = list(reversed(self._loop_stack))
-
-        def get_layout_tuple(f):
-            if f.has_fixed_shape:
-                return get_layout_from_strides(f.strides)
-            else:
-                layout_list = list(f.layout)
-                for _ in range(f.index_dimensions):
-                    layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
-                return layout_list
-
-        # Variables (arrays) and Constants (scalar sizes)
-        const_names_iter = itertools.product(string.ascii_uppercase, repeat=1)
-        constants_reversed = {}
-        fields_accessed = self.kernel_ast.fields_accessed
-        for field in fields_accessed:
-            layout = get_layout_tuple(field)
-            permuted_shape = list(field.shape[i] for i in layout)
-            # Replace shape dimensions with constant variables (necessary for layer condition
-            # analysis)
-            for i, d in enumerate(permuted_shape):
-                if d not in self.constants.values():
-                    const_symbol = symbol_pos_int(''.join(next(const_names_iter)))
-                    self.set_constant(const_symbol, d)
-                    constants_reversed[d] = const_symbol
-                permuted_shape[i] = constants_reversed[d]
-            self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
-
-        # Data sources & destinations
-        self.sources = defaultdict(list)
-        self.destinations = defaultdict(list)
-
-        reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
-        for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
-            for fa in accesses:
-                coord = [symbol_pos_int(LoopOverCoordinate.get_loop_counter_name(i)) + off
-                         for i, off in enumerate(fa.offsets)]
-                coord += list(fa.idx_coordinate_values)
-                layout = get_layout_tuple(fa.field)
-                permuted_coord = [sp.sympify(coord[i]) for i in layout]
-                target_dict[fa.field.name].append(permuted_coord)
-
-        # data type
-        self.datatype = list(self.variables.values())[0][0]
-
-        # flops
-        operation_count = count_operations_in_ast(inner_loop)
-        self._flops = {
-            '+': operation_count['adds'],
-            '*': operation_count['muls'],
-            '/': operation_count['divs'],
-        }
-        for k in [k for k, v in self._flops.items() if v == 0]:
-            del self._flops[k]
-        self.check()
-
-        if debug_print:
-            from pprint import pprint
-            print("-----------------------------  Loop Stack --------------------------")
-            pprint(self._loop_stack)
-            print("-----------------------------  Sources -----------------------------")
-            pprint(self.sources)
-            print("-----------------------------  Destinations ------------------------")
-            pprint(self.destinations)
-            print("-----------------------------  FLOPS -------------------------------")
-            pprint(self._flops)
-
-    def get_kernel_header(self, name='pystencils_kernel'):
-        file_name = "pystencils_kernel.h"
-        file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
-        lock_mode, lock_fp = self.lock_intermediate(file_path)
-
-        if lock_mode == fcntl.LOCK_SH:
-            # use cache
-            pass
-        else:  # lock_mode == fcntl.LOCK_EX:
-            function_signature = generate_c(self.kernel_ast, dialect=Backend.C, signature_only=True)
-
-            jinja_context = {
-                'function_signature': function_signature,
-            }
-
-            env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-            file_header = env.get_template('kernel.h').render(**jinja_context)
-            with open(file_path, 'w') as f:
-                f.write(file_header)
-
-            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
-        return file_path, lock_fp
-
-    def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
-        """
-        Generate and return compilable source code from AST.
-
-        Args:
-            openmp: if true, openmp code will be generated
-            name: kernel name
-        """
-        filename = 'pystencils_kernl'
-        if openmp:
-            filename += '-omp'
-        filename += '.c'
-        file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
-        lock_mode, lock_fp = self.lock_intermediate(file_path)
-
-        if lock_mode == fcntl.LOCK_SH:
-            # use cache
-            with open(file_path) as f:
-                code = f.read()
-        else:  # lock_mode == fcntl.LOCK_EX:
-            header_list = get_headers(self.kernel_ast)
-            includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-            if openmp:
-                add_openmp(self.kernel_ast)
-
-            kernel_code = generate_c(self.kernel_ast, dialect=Backend.C)
-
-            jinja_context = {
-                'includes': includes,
-                'kernel_code': kernel_code,
-            }
-
-            env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-            code = env.get_template('kernel.c').render(**jinja_context)
-            with open(file_path, 'w') as f:
-                f.write(code)
-
-            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
-        return file_path, lock_fp
-
-    CODE_TEMPLATE = Template(textwrap.dedent("""
-        #include <likwid.h>
-        #include <stdlib.h>
-        #include <stdint.h>
-        #include <stdbool.h>
-        #include <math.h>
-        #include "kerncraft.h"
-        #include "kernel.h"
-
-        #define RESTRICT __restrict__
-        #define FUNC_PREFIX
-        void dummy(void *);
-        extern int var_false;
-
-        int main(int argc, char **argv) {
-          {%- for constantName, dataType in constants %}
-          // Constant {{constantName}}
-          {{dataType}} {{constantName}};
-          {{constantName}} = 0.23;
-          {%- endfor %}
-
-          // Declaring arrays
-          {%- for field_name, dataType, size in fields %}
-
-          // Initialization {{field_name}}
-          double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
-          // TODO initialize in parallel context in same order as they are touched
-          for (unsigned long long i = 0; i < {{size}}; ++i)
-            {{field_name}}[i] = 0.23;
-          {%- endfor %}
-
-          likwid_markerInit();
-          #pragma omp parallel
-          {
-            likwid_markerRegisterRegion("loop");
-            #pragma omp barrier
-
-            // Initializing arrays in same order as touched in kernel loop nest
-            //INIT_ARRAYS;
-
-            // Dummy call
-            {%- for field_name, dataType, size in fields %}
-            if(var_false) dummy({{field_name}});
-            {%- endfor %}
-            {%- for constantName, dataType in constants %}
-            if(var_false) dummy(&{{constantName}});
-            {%- endfor %}
-
-            for(int warmup = 1; warmup >= 0; --warmup) {
-              int repeat = 2;
-              if(warmup == 0) {
-                repeat = atoi(argv[1]);
-                likwid_markerStartRegion("loop");
-              }
-
-              for(; repeat > 0; --repeat) {
-                {{kernelName}}({{call_argument_list}});
-
-                {%- for field_name, dataType, size in fields %}
-                if(var_false) dummy({{field_name}});
-                {%- endfor %}
-                {%- for constantName, dataType in constants %}
-                if(var_false) dummy(&{{constantName}});
-                {%- endfor %}
-              }
-
-            }
-            likwid_markerStopRegion("loop");
-          }
-          likwid_markerClose();
-          return 0;
-        }
-        """))
-
-    def get_main_code(self, kernel_function_name='kernel'):
-        """
-        Generate and return compilable source code from AST.
-
-        :return: tuple of filename and shared lock file pointer
-        """
-        # TODO produce nicer code, including help text and other "comfort features".
-        assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
-                                            "based on a kernel description rather than code."
-
-        file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
-        lock_mode, lock_fp = self.lock_intermediate(file_path)
-
-        if lock_mode == fcntl.LOCK_SH:
-            # use cache
-            with open(file_path) as f:
-                code = f.read()
-        else:  # lock_mode == fcntl.LOCK_EX
-            # needs update
-            accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed}
-            constants = []
-            fields = []
-            call_parameters = []
-            for p in self.kernel_ast.get_parameters():
-                if not p.is_field_parameter:
-                    constants.append((p.symbol.name, str(p.symbol.dtype)))
-                    call_parameters.append(p.symbol.name)
-                else:
-                    assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
-                    field = accessed_fields[p.field_name]
-                    dtype = str(get_base_type(p.symbol.dtype))
-                    fields.append((p.field_name, dtype, prod(field.shape)))
-                    call_parameters.append(p.field_name)
-
-            header_list = get_headers(self.kernel_ast)
-            includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-            # Generate code
-            code = self.CODE_TEMPLATE.render(
-                kernelName=self.kernel_ast.function_name,
-                fields=fields,
-                constants=constants,
-                call_agument_list=','.join(call_parameters),
-                includes=includes)
-
-            # Store to file
-            with open(file_path, 'w') as f:
-                f.write(code)
-            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
-
-        return file_path, lock_fp
-
-
-class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
-        super(KerncraftParameters, self).__init__()
-        self['asm_block'] = 'auto'
-        self['asm_increment'] = 0
-        self['cores'] = 1
-        self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
-        self['pointer_increment'] = 'auto'
-        self['iterations'] = 10
-        self['unit'] = 'cy/CL'
-        self['ignore_warnings'] = True
-        self['incore_model'] = 'OSACA'
-        self.update(**kwargs)
-
-
-# ------------------------------------------- Helper functions ---------------------------------------------------------
-
-
-def search_resolved_field_accesses_in_ast(ast):
-    def visit(node, reads, writes):
-        if not isinstance(node, SympyAssignment):
-            for a in node.args:
-                visit(a, reads, writes)
-            return
-
-        for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
-            accesses.update(expr.atoms(ResolvedFieldAccess))
-
-    read_accesses = set()
-    write_accesses = set()
-    visit(ast, read_accesses, write_accesses)
-    return read_accesses, write_accesses
diff --git a/pystencils/kerncraft_coupling/templates/benchmark.c b/pystencils/kerncraft_coupling/templates/benchmark.c
deleted file mode 100644
index 79daaffd9..000000000
--- a/pystencils/kerncraft_coupling/templates/benchmark.c
+++ /dev/null
@@ -1,157 +0,0 @@
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-#include <assert.h>
-
-{{ includes }}
-
-{%- if likwid %}
-#include <likwid.h>
-{%- endif %}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-
-/* see waLBerla src/field/allocation/AlignedMalloc */
-void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
-{
-    // With 0 alignment this function makes no sense
-    // use normal malloc instead
-    assert( alignment > 0 );
-    // Tests if alignment is power of two (assuming alignment>0)
-    assert( !(alignment & (alignment - 1)) );
-    assert( offset < alignment );
-
-    void *pa;  // pointer to allocated memory
-    void *ptr; // pointer to usable aligned memory
-
-    pa=std::malloc( (size+2*alignment-1 )+sizeof(void *));
-    if(!pa)
-        return nullptr;
-
-    // Find next aligned position, starting at pa+sizeof(void*)-1
-    ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
-    ptr=(void*) ( (char*)(ptr) + alignment - offset);
-
-    // Store pointer to real allocated chunk just before usable chunk
-    *((void **)ptr-1)=pa;
-
-    assert( ((size_t)ptr+offset) % alignment == 0 );
-
-    return ptr;
-}
-
-void aligned_free( void *ptr )
-{
-    // assume that pointer to real allocated chunk is stored just before
-    // chunk that was given to user
-    if(ptr)
-        std::free(*((void **)ptr-1));
-}
-
-
-{{kernel_code}}
-
-
-int main(int argc, char **argv)
-{
-  {%- if likwid %}
-  likwid_markerInit();
-  {%- endif %}
-
-  {%- for field_name, dataType, elements, size, offset, alignment in fields %}
-  // Initialization {{field_name}}
-  {%- if alignment > 0 %}
-  {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
-  {%- else %}
-  {{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
-  {%- endif %}
-  for (unsigned long long i = 0; i < {{elements}}; ++i)
-    {{field_name}}[i] = 0.23;
-
-  if(var_false)
-    dummy({{field_name}});
-
-  {%- endfor %}
-
-
-
-  {%- for constantName, dataType in constants %}
-
-  // Constant {{constantName}}
-  {{dataType}} {{constantName}};
-  {{constantName}} = 0.23;
-  if(var_false)
-      dummy(& {{constantName}});
-
-  {%- endfor %}
-
-  {%- if likwid and openmp %}
-  #pragma omp parallel
-  {
-  likwid_markerRegisterRegion("loop");
-  #pragma omp barrier
-  {%- elif likwid %}
-  likwid_markerRegisterRegion("loop");
-  {%- endif %}
-
-  for(int warmup = 1; warmup >= 0; --warmup) {
-    int repeat = 2;
-    if(warmup == 0) {
-      repeat = atoi(argv[1]);
-      {%- if likwid %}
-      likwid_markerStartRegion("loop");
-      {%- endif %}
-    }
-
-    {%- if timing %}
-    double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
-    timing(&wcStartTime, &cpuStartTime);
-    {%- endif %}
-
-    for (; repeat > 0; --repeat)
-    {
-      {{kernelName}}({{call_argument_list}});
-
-      // Dummy calls
-      {%- for field_name, dataType, elements, size, offset, alignment in fields %}
-      if(var_false) dummy((void*){{field_name}});
-      {%- endfor %}
-      {%- for constantName, dataType in constants %}
-      if(var_false) dummy((void*)&{{constantName}});
-      {%- endfor %}
-    }
-    {%- if timing %}
-    timing(&wcEndTime, &cpuEndTime);
-    if( warmup == 0)
-        printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
-    {%- endif %}
-
-  }
-
-  {%- if likwid %}
-  likwid_markerStopRegion("loop");
-  {%- if openmp %}
-  }
-  {%- endif %}
-  {%- endif %}
-
-  {%- if likwid %}
-  likwid_markerClose();
-  {%- endif %}
-
-  {%- for field_name, dataType, elements, size, offset, alignment in fields %}
-  {%- if alignment > 0 %}
-  aligned_free({{field_name}});
-  {%- else %}
-  delete[] {{field_name}};
-  {%- endif %}
-
-  {%- endfor %}
-}
diff --git a/pystencils/kerncraft_coupling/templates/kernel.c b/pystencils/kerncraft_coupling/templates/kernel.c
deleted file mode 100644
index 47fbf7cf2..000000000
--- a/pystencils/kerncraft_coupling/templates/kernel.c
+++ /dev/null
@@ -1,18 +0,0 @@
-
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-
-{{ includes }}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-
-
-{{kernel_code}}
\ No newline at end of file
diff --git a/pystencils/kerncraft_coupling/templates/kernel.h b/pystencils/kerncraft_coupling/templates/kernel.h
deleted file mode 100644
index 539d51f92..000000000
--- a/pystencils/kerncraft_coupling/templates/kernel.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define FUNC_PREFIX
-
-{{function_signature}}
\ No newline at end of file
diff --git a/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml b/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
deleted file mode 100644
index 75370ecd2..000000000
--- a/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml
+++ /dev/null
@@ -1,600 +0,0 @@
-# FIXME
-# FIXME performance counters might be wrong. This will only affect the Benchmark model
-# FIXME bandwidth measurements need validation
-# FIXME
-
-kerncraft version: 0.7.2
-model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
-model type: Intel Core Skylake SP
-sockets: 2
-cores per socket: 4
-threads per core: 2
-NUMA domains per socket: 1
-cores per NUMA domain: 4
-clock: 3.6 GHz
-FLOPs per cycle:
-  SP:
-    total: 64
-    FMA: 64
-    ADD: 32
-    MUL: 32
-  DP:
-    total: 32
-    FMA: 32
-    ADD: 16
-    MUL: 16
-micro-architecture: SKX
-compiler:
-  !!omap
-  - icc: -O3 -fno-alias -xCORE-AVX512
-  - clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
-  - gcc: -O3 -march=skylake-avx512
-cacheline size: 64 B
-overlapping model:
-  ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
-  performance counter metric:
-          Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
-          UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
-non-overlapping model:
-  ports: ["2D", "3D"]
-  performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
-memory hierarchy:
-- level: L1
-  performance counter metrics:
-    accesses:  MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
-    misses: L1D_REPLACEMENT:PMC[0-3]
-    evicts: L2_TRANS_L1D_WB:PMC[0-3]
-  cache per group:
-    sets: 64
-    ways: 8
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: True
-    write_back: True
-    load_from: L2
-    store_to: L2
-  size per group: 32.00 kB
-  groups: 8
-  cores per group: 1
-  threads per group: 2
-- level: L2
-  non-overlap upstream throughput: [64 B/cy, 'half-duplex']
-  performance counter metrics:
-    accesses: L1D_REPLACEMENT:PMC[0-3]
-    misses: L2_LINES_IN_ALL:PMC[0-3]
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-  cache per group:
-    sets: 1024
-    ways: 16
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: True
-    write_back: True
-    load_from: null  # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
-    victims_to: L3  # all victims, modified or not are passed onto L3
-    store_to: L3
-  size per group: 1.00 MB
-  groups: 8
-  cores per group: 1
-  threads per group: 2
-- level: L3
-  non-overlap upstream throughput: [16 B/cy, 'full-duplex']
-  performance counter metrics:
-    accesses: L2_LINES_IN_ALL:PMC[0-3]
-    # FIXME not all misses in L2 lead to loads from L3, only the hits do
-    misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
-             CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
-             CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
-             CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
-             CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
-             CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-  cache per group:
-    sets: 16896
-    # TODO is actuall something else, but necessary to get to 16.5 MB
-    ways: 16
-    # TODO is actually 11, but pycachesim only supports powers of two
-    cl_size: 64
-    replacement_policy: 'LRU'
-    write_allocate: False
-    write_back: True
-  size per group: 16.50 MB
-  groups: 2
-  cores per group: 4
-  threads per group: 8
-- level: MEM
-  cores per group: 4
-  threads per group: 8
-  non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
-  penalty cycles per read stream: 0
-  size per group:
-benchmarks:
-  kernels:
-    load:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 0
-        bytes: 0.00 B
-      FLOPs per iteration: 0
-    copy:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 0
-    update:
-      read streams:
-        streams: 1
-        bytes: 8.00 B
-      read+write streams:
-        streams: 1
-        bytes: 8.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 0
-    triad:
-      read streams:
-        streams: 3
-        bytes: 24.00 B
-      read+write streams:
-        streams: 0
-        bytes: 0.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 2
-    daxpy:
-      read streams:
-        streams: 2
-        bytes: 16.00 B
-      read+write streams:
-        streams: 1
-        bytes: 8.00 B
-      write streams:
-        streams: 1
-        bytes: 8.00 B
-      FLOPs per iteration: 2
-  measurements:
-    L1:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        size per thread:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        total size:
-        - 21.12 kB
-        - 42.24 kB
-        - 63.36 kB
-        - 84.48 kB
-        results:
-          load:
-          - 42.98 GB/s
-          - 85.08 GB/s
-          - 127.45 GB/s
-          - 169.92 GB/s
-          copy:
-          - 56.07 GB/s
-          - 111.50 GB/s
-          - 164.90 GB/s
-          - 221.50 GB/s
-          update:
-          - 56.54 GB/s
-          - 112.25 GB/s
-          - 168.50 GB/s
-          - 224.75 GB/s
-          triad:
-          - 45.90 GB/s
-          - 89.81 GB/s
-          - 127.29 GB/s
-          - 169.57 GB/s
-          daxpy:
-          - 36.62 GB/s
-          - 71.30 GB/s
-          - 103.52 GB/s
-          - 135.26 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        - 21.12 kB
-        size per thread:
-        - 10.56 kB
-        - 10.56 kB
-        - 10.56 kB
-        - 10.56 kB
-        total size:
-        - 21.12 kB
-        - 42.24 kB
-        - 63.36 kB
-        - 84.48 kB
-        results:
-          load:
-          - 49.61 GB/s
-          - 98.80 GB/s
-          - 147.98 GB/s
-          - 198.22 GB/s
-          copy:
-          - 55.98 GB/s
-          - 111.56 GB/s
-          - 167.08 GB/s
-          - 220.42 GB/s
-          update:
-          - 56.53 GB/s
-          - 112.72 GB/s
-          - 168.95 GB/s
-          - 225.31 GB/s
-          triad:
-          - 54.01 GB/s
-          - 104.58 GB/s
-          - 153.02 GB/s
-          - 200.93 GB/s
-          daxpy:
-          - 41.11 GB/s
-          - 80.28 GB/s
-          - 115.71 GB/s
-          - 152.81 GB/s
-    L2:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        size per thread:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        total size:
-        - 660.00 kB
-        - 1.32 MB
-        - 1.98 MB
-        - 2.64 MB
-        results:
-          load:
-          - 27.15 GB/s
-          - 54.09 GB/s
-          - 80.61 GB/s
-          - 106.41 GB/s
-          copy:
-          - 43.53 GB/s
-          - 90.07 GB/s
-          - 127.73 GB/s
-          - 171.81 GB/s
-          update:
-          - 50.38 GB/s
-          - 98.47 GB/s
-          - 147.91 GB/s
-          - 197.20 GB/s
-          triad:
-          - 43.38 GB/s
-          - 83.72 GB/s
-          - 124.83 GB/s
-          - 166.04 GB/s
-          daxpy:
-          - 36.29 GB/s
-          - 71.29 GB/s
-          - 103.33 GB/s
-          - 136.48 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        - 660.00 kB
-        size per thread:
-        - 330.00 kB
-        - 330.00 kB
-        - 330.00 kB
-        - 330.00 kB
-        total size:
-        - 660.00 kB
-        - 1.32 MB
-        - 1.98 MB
-        - 2.64 MB
-        results:
-          load:
-          - 35.29 GB/s
-          - 70.28 GB/s
-          - 104.67 GB/s
-          - 139.63 GB/s
-          copy:
-          - 42.23 GB/s
-          - 83.70 GB/s
-          - 124.33 GB/s
-          - 167.50 GB/s
-          update:
-          - 50.09 GB/s
-          - 99.77 GB/s
-          - 149.87 GB/s
-          - 198.82 GB/s
-          triad:
-          - 52.38 GB/s
-          - 100.00 GB/s
-          - 147.40 GB/s
-          - 193.31 GB/s
-          daxpy:
-          - 41.14 GB/s
-          - 80.22 GB/s
-          - 116.23 GB/s
-          - 155.08 GB/s
-    L3:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        size per thread:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        total size:
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        results:
-          load:
-          - 22.40 GB/s
-          - 44.77 GB/s
-          - 65.71 GB/s
-          - 89.26 GB/s
-          copy:
-          - 25.32 GB/s
-          - 49.70 GB/s
-          - 72.89 GB/s
-          - 98.62 GB/s
-          update:
-          - 41.24 GB/s
-          - 81.14 GB/s
-          - 122.22 GB/s
-          - 166.44 GB/s
-          triad:
-          - 25.61 GB/s
-          - 50.02 GB/s
-          - 73.23 GB/s
-          - 98.95 GB/s
-          daxpy:
-          - 32.07 GB/s
-          - 62.65 GB/s
-          - 89.91 GB/s
-          - 120.65 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 10.56 MB
-        - 5.28 MB
-        - 3.52 MB
-        - 2.64 MB
-        size per thread:
-        - 5.28 MB
-        - 2.64 MB
-        - 1.76 MB
-        - 1.32 MB
-        total size:
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        - 10.56 MB
-        results:
-          load:
-          - 26.18 GB/s
-          - 51.85 GB/s
-          - 75.82 GB/s
-          - 101.39 GB/s
-          copy:
-          - 26.22 GB/s
-          - 51.83 GB/s
-          - 76.40 GB/s
-          - 102.84 GB/s
-          update:
-          - 43.51 GB/s
-          - 86.75 GB/s
-          - 129.86 GB/s
-          - 174.54 GB/s
-          triad:
-          - 26.39 GB/s
-          - 51.80 GB/s
-          - 76.27 GB/s
-          - 102.66 GB/s
-          daxpy:
-          - 37.43 GB/s
-          - 73.16 GB/s
-          - 106.53 GB/s
-          - 142.76 GB/s
-    MEM:
-      1:
-        threads per core: 1
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 1
-        - 2
-        - 3
-        - 4
-        size per core:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        size per thread:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        total size:
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        results:
-          load:
-          - 12.03 GB/s
-          - 24.38 GB/s
-          - 34.83 GB/s
-          - 45.05 GB/s
-          copy:
-          - 12.32 GB/s
-          - 24.40 GB/s
-          - 32.82 GB/s
-          - 37.00 GB/s
-          update:
-          - 20.83 GB/s
-          - 40.25 GB/s
-          - 48.81 GB/s
-          - 54.84 GB/s
-          triad:
-          - 11.64 GB/s
-          - 23.17 GB/s
-          - 34.78 GB/s
-          - 42.97 GB/s
-          daxpy:
-          - 17.69 GB/s
-          - 34.02 GB/s
-          - 48.12 GB/s
-          - 55.73 GB/s
-      2:
-        threads per core: 2
-        cores:
-        - 1
-        - 2
-        - 3
-        - 4
-        threads:
-        - 2
-        - 4
-        - 6
-        - 8
-        size per core:
-        - 240.00 MB
-        - 120.00 MB
-        - 80.00 MB
-        - 60.00 MB
-        size per thread:
-        - 120.00 MB
-        - 60.00 MB
-        - 40.00 MB
-        - 30.00 MB
-        total size:
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        - 240.00 MB
-        results:
-          load:
-          - 15.33 GB/s
-          - 28.32 GB/s
-          - 41.34 GB/s
-          - 53.02 GB/s
-          copy:
-          - 13.96 GB/s
-          - 26.61 GB/s
-          - 34.39 GB/s
-          - 38.96 GB/s
-          update:
-          - 26.47 GB/s
-          - 47.82 GB/s
-          - 56.70 GB/s
-          - 62.78 GB/s
-          triad:
-          - 14.42 GB/s
-          - 26.66 GB/s
-          - 36.94 GB/s
-          - 44.01 GB/s
-          daxpy:
-          - 20.96 GB/s
-          - 39.12 GB/s
-          - 51.55 GB/s
-          - 58.37 GB/s
diff --git a/pystencils_tests/benchmark/benchmark.py b/pystencils_tests/benchmark/benchmark.py
deleted file mode 100644
index 29066127c..000000000
--- a/pystencils_tests/benchmark/benchmark.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import math
-import os
-import time
-
-import numpy as np
-import sympy as sp
-from git import Repo
-from influxdb import InfluxDBClient
-from kerncraft.machinemodel import MachineModel
-from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
-from kerncraft.prefixedunit import PrefixedUnit
-
-from pystencils import Assignment, Field, create_kernel
-from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
-
-
-def output_benchmark(analysis):
-    output = {}
-    keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
-            'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
-            'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-
-    for cache, metrics in analysis['data transfers'].items():
-        for metric_name, metric_value in metrics.items():
-            fixed = metric_value.with_prefix('')
-            output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
-
-    for level, value in analysis['ECM'].items():
-        output['Phenomenological ECM ' + level + ' cy/CL'] = value
-    return output
-
-
-def output_ecm(analysis):
-    output = {}
-    keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-
-    if 'memory bandwidth kernel' in analysis:
-        output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
-               analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
-
-    output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
-
-    for key, value in analysis['cycles']:
-        output[key] = value
-    return output
-
-
-def output_roofline(analysis):
-    output = {}
-    keys = ['min performance']  # 'bottleneck level'
-    copies = {key: analysis[key] for key in keys}
-    output.update(copies)
-    # TODO save bottleneck information (compute it here)
-
-    # fixed = analysis['max_flops'].with_prefix('G')
-    # output['max GFlop/s'] = fixed.value
-
-    # if analysis['min performance'] > max_flops:
-    #    # CPU bound
-    #    print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
-    #    print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
-    # else:
-    # Memory bound
-    bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
-    output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
-    output['bottleneck level'] = bottleneck['level']
-    output['bottleneck bw kernel'] = bottleneck['bw kernel']
-    output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
-
-    for i, level in enumerate(analysis['mem bottlenecks']):
-        if level is None:
-            continue
-        for key, value in level.items():
-            if isinstance(value, PrefixedUnit):
-                fixed = value.with_prefix('G')
-                output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
-                    fixed.value, float) and math.isinf(fixed.value) else fixed.value
-            else:
-                output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
-                    value) else value
-    return output
-
-
-def output_roofline_iaca(analysis):
-    output = {}
-    keys = ['min performance']  # 'bottleneck level'
-    copies = {key: analysis[key] for key in keys}
-    # output.update(copies)
-    # TODO save bottleneck information (compute it here)
-
-    # fixed = analysis['max_flops'].with_prefix('G')
-    # output['max GFlop/s'] = fixed.value
-
-    # if analysis['min performance'] > max_flops:
-    #    # CPU bound
-    #    print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
-    #    print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
-    # else:
-    # Memory bound
-    bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
-    output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
-    output['bottleneck level'] = bottleneck['level']
-    output['bottleneck bw kernel'] = bottleneck['bw kernel']
-    output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
-
-    for i, level in enumerate(analysis['mem bottlenecks']):
-        if level is None:
-            continue
-        for key, value in level.items():
-            if isinstance(value, PrefixedUnit):
-                fixed = value.with_prefix('G')
-                output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
-                    fixed.value, float) and math.isinf(fixed.value) else fixed.value
-            else:
-                output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
-                    value) else value
-    return output
-
-
-def report_analysis(ast, models, machine, tags, fields=None):
-    kernel = PyStencilsKerncraftKernel(ast, machine)
-    client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
-                            'roggan', 'pystencils')
-    repo = Repo(search_parent_directories=True)
-    commit = repo.head.commit
-    point_time = int(time.time())
-
-    for model in models:
-        benchmark = model(kernel, machine, KerncraftParameters())
-        benchmark.analyze()
-        analysis = benchmark.results
-        if model is Benchmark:
-            output = output_benchmark(analysis)
-        elif model is ECM:
-            output = output_ecm(analysis)
-        elif model is Roofline:
-            output = output_roofline(analysis)
-        elif model is RooflineIACA:
-            output = output_roofline_iaca(analysis)
-        else:
-            raise ValueError('No valid model for analysis given!')
-
-        if fields is not None:
-            output.update(fields)
-
-        output['commit'] = commit.hexsha
-
-        json_body = [
-            {
-                'measurement': model.__name__,
-                'tags': tags,
-                'time': point_time,
-                'fields': output
-            }
-        ]
-        client.write_points(json_body, time_precision='s')
-
-
-def main():
-    size = [20, 200, 200]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
-    s = sp.Symbol("s")
-    rhs = a[0, -1, 0] + a[0, 1, 0] + \
-          a[-1, 0, 0] + a[1, 0, 0] + \
-          a[0, 0, -1] + a[0, 0, 1]
-
-    update_rule = Assignment(b[0, 0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    input_folder = "./"
-    machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
-    machine = MachineModel(path_to_yaml=machine_file_path)
-    tags = {
-        'host': os.uname()[1],
-        'project': 'pystencils',
-        'kernel': 'jacobi_3D ' + str(size)
-    }
-
-    report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/pystencils_tests/benchmark/generate.py b/pystencils_tests/benchmark/generate.py
deleted file mode 100644
index fba398489..000000000
--- a/pystencils_tests/benchmark/generate.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import numpy as np
-import sympy as sp
-
-from pystencils import Assignment, Field, create_kernel
-
-
-def meassure():
-    size = [30, 50, 3]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
-    s = sp.Symbol("s")
-    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
-    updateRule = Assignment(b[0, 0], s * rhs)
-    print(updateRule)
-
-    ast = create_kernel([updateRule])
-
-    # benchmark = generate_benchmark(ast)
-    # main = benchmark[0]
-    # kernel = benchmark[1]
-    # with open('src/main.cpp', 'w') as file:
-    #     file.write(main)
-    # with open('src/kernel.cpp', 'w') as file:
-    #     file.write(kernel)
-
-    func = ast.compile({'omega': 2/3})
-
-    from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
-    from pystencils.kerncraft_coupling import BenchmarkAnalysis
-    from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
-    from kerncraft.machinemodel import MachineModel
-    from kerncraft.models import ECMData
-
-
-    machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
-    machine = MachineModel(path_to_yaml=machineFilePath)
-
-
-    benchmark = BenchmarkAnalysis(ast, machine)
-    #TODO what do i want to do with benchmark?
-
-    kernel = PyStencilsKerncraftKernel(ast)
-    model = ECMData(kernel, machine, KerncraftParameters())
-    model.analyze()
-    model.report()
-
-
-if __name__ == "__main__":
-    meassure()
diff --git a/pystencils_tests/benchmark/iacaMarks.h b/pystencils_tests/benchmark/iacaMarks.h
deleted file mode 100644
index be1973eb2..000000000
--- a/pystencils_tests/benchmark/iacaMarks.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-* Copyright (2008-2009) Intel Corporation All Rights Reserved. 
-* The source code contained or described herein and all documents 
-* related to the source code ("Material") are owned by Intel Corporation 
-* or its suppliers or licensors. Title to the Material remains with 
-* Intel Corporation or its suppliers and licensors. The Material 
-* contains trade secrets and proprietary and confidential information 
-* of Intel or its suppliers and licensors. The Material is protected 
-* by worldwide copyright and trade secret laws and treaty provisions. 
-* No part of the Material may be used, copied, reproduced, modified, 
-* published, uploaded, posted, transmitted, distributed, or disclosed 
-* in any way without Intel(R)s prior express written permission.
-*
-* No license under any patent, copyright, trade secret or other 
-* intellectual property right is granted to or conferred upon you by 
-* disclosure or delivery of the Materials, either expressly, by implication,
-* inducement, estoppel or otherwise. Any license under such intellectual 
-* property rights must be express and approved by Intel in writing.
-*/
-
-#if defined (__GNUC__) 
-#define IACA_SSC_MARK( MARK_ID )						\
-__asm__ __volatile__ (									\
-					  "\n\t  movl $"#MARK_ID", %%ebx"	\
-					  "\n\t  .byte 0x64, 0x67, 0x90"	\
-					  : : : "memory" );
-
-#else
-#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
-	__asm  _emit 0x64 \
-	__asm  _emit 0x67 \
-	__asm  _emit 0x90 }
-#endif
-
-#define IACA_START {IACA_SSC_MARK(111)}
-#define IACA_END {IACA_SSC_MARK(222)}
-
-#ifdef _WIN64
-#include <intrin.h>
-#define IACA_VC64_START __writegsbyte(111, 111);
-#define IACA_VC64_END   __writegsbyte(222, 222);
-#endif
-
-/**************** asm *****************
-;START_MARKER
-mov ebx, 111
-db 0x64, 0x67, 0x90
-
-;END_MARKER
-mov ebx, 222
-db 0x64, 0x67, 0x90
-
-**************************************/
diff --git a/pystencils_tests/benchmark/main.c b/pystencils_tests/benchmark/main.c
deleted file mode 100644
index 6bd57f91c..000000000
--- a/pystencils_tests/benchmark/main.c
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "iacaMarks.h"
-
-int main(int argc, char * argv[]){
-	int a = 0;
-	for(int i = 0; i < argc+100000; i++){
-		IACA_START
-		a += i;
-	}
-	IACA_END
-	return a;
-}
diff --git a/pystencils_tests/kerncraft_inputs/2d-5pt.c b/pystencils_tests/kerncraft_inputs/2d-5pt.c
deleted file mode 100644
index 0f2b99cf2..000000000
--- a/pystencils_tests/kerncraft_inputs/2d-5pt.c
+++ /dev/null
@@ -1,8 +0,0 @@
-double a[30][50][3];
-double b[30][50][3];
-double s;
-
-for(int j=1; j<30-1; ++j)
-    for(int i=1; i<50-1; ++i)
-        b[j][i] = ( a[j][i-1] + a[j][i+1]
-                  + a[j-1][i] + a[j+1][i]) * s;
diff --git a/pystencils_tests/kerncraft_inputs/3d-7pt.c b/pystencils_tests/kerncraft_inputs/3d-7pt.c
deleted file mode 100644
index 0e9ff901d..000000000
--- a/pystencils_tests/kerncraft_inputs/3d-7pt.c
+++ /dev/null
@@ -1,10 +0,0 @@
-double a[M][N][N];
-double b[M][N][N];
-double s;
-
-for(int k=1; k<M-1; ++k)
-    for(int j=1; j<N-1; ++j)
-        for(int i=1; i<N-1; ++i)
-            b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
-                         + a[k][j-1][i] + a[k][j+1][i]
-                         + a[k-1][j][i] + a[k+1][j][i]) * s;
diff --git a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
deleted file mode 100644
index 890e2e895..000000000
--- a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
+++ /dev/null
@@ -1,980 +0,0 @@
-kerncraft version: 0.8.6.dev0
-model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
-model type: Intel Xeon SandyBridge EN/EP processor
-clock: 2.7 GHz
-
-sockets: 2
-cores per socket: 8
-threads per core: 2
-NUMA domains per socket: 1
-cores per NUMA domain: 8
-transparent hugepage: always
-
-in-core model: !!omap
-  - IACA: SNB
-  - OSACA: SNB
-  - LLVM-MCA: -mcpu=sandybridge
-isa: x86
-
-FLOPs per cycle:
-  SP: {total: 16, ADD: 8, MUL: 8}
-  DP: {total: 8, ADD: 4, MUL: 4}
-
-compiler: !!omap
-- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
-- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
-- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
-
-overlapping model:
-  ports: 
-    IACA: ['0', 0DV, '1', '2', '3', '4', '5']
-    OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
-    LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
-  performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
-non-overlapping model:
-  ports: 
-    IACA: [2D, 3D]
-    OSACA: [2D, 3D]
-    LLVM-MCA: [SBPort23]
-  performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
-
-cacheline size: 64 B
-memory hierarchy:
-- level: L1
-  cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
-    write_back: true, load_from: L2, store_to: L2}
-  cores per group: 1
-  threads per group: 2
-  groups: 16
-  performance counter metrics:
-    accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3]
-    misses: L1D_REPLACEMENT:PMC[0-3]
-    evicts: L1D_M_EVICT:PMC[0-3]
-  upstream throughput: [architecture code analyzer, [2D, 3D]]
-  transfers overlap: false
-- level: L2
-  cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
-    write_back: true, load_from: L3, store_to: L3}
-  cores per group: 1
-  threads per group: 2
-  groups: 16
-  upstream throughput: [32 B/cy, half-duplex]
-  transfers overlap: false
-  performance counter metrics:
-    accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3]
-    misses: L2_LINES_IN_ALL:PMC[0-3]
-    evicts: L2_TRANS_L2_WB:PMC[0-3]
-- level: L3
-  cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true,
-    write_back: true}
-  cores per group: 8
-  threads per group: 16
-  groups: 2
-  upstream throughput: [32 B/cy, half-duplex]
-  transfers overlap: false
-  performance counter metrics:
-    accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3]
-    misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01]
-      + CAS_COUNT_RD:MBOX3C[01])
-    evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01]
-      + CAS_COUNT_WR:MBOX3C[01])
-- level: MEM
-  cores per group: 8
-  upstream throughput: [full socket memory bandwidth, half-duplex]
-  transfers overlap: false
-  size per group:
-  threads per group: 16
-
-benchmarks:
-  kernels:
-    copy:
-      FLOPs per iteration: 0
-      fastest bench kernel: copy_avx
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 8.00 B, streams: 1}
-    daxpy:
-      FLOPs per iteration: 2
-      fastest bench kernel: daxpy_avx
-      read streams: {bytes: 16.00 B, streams: 2}
-      read+write streams: {bytes: 8.00 B, streams: 1}
-      write streams: {bytes: 8.00 B, streams: 1}
-    load:
-      FLOPs per iteration: 0
-      fastest bench kernel: load_avx
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 0.00 B, streams: 0}
-    triad:
-      FLOPs per iteration: 2
-      fastest bench kernel: triad_avx
-      read streams: {bytes: 24.00 B, streams: 3}
-      read+write streams: {bytes: 0.00 B, streams: 0}
-      write streams: {bytes: 8.00 B, streams: 1}
-    update:
-      FLOPs per iteration: 0
-      fastest bench kernel: update_avx
-      read streams: {bytes: 8.00 B, streams: 1}
-      read+write streams: {bytes: 8.00 B, streams: 1}
-      write streams: {bytes: 8.00 B, streams: 1}
-  measurements:
-    L1:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96
-              GB/s, 578.56 GB/s, 660.60 GB/s]
-          daxpy: [116.88 GB/s, 233.68 GB/s, 311.60 GB/s, 409.72 GB/s, 509.79 GB/s,
-            559.65 GB/s, 612.77 GB/s, 719.71 GB/s]
-          load: [84.07 GB/s, 168.13 GB/s, 252.21 GB/s, 336.04 GB/s, 420.34 GB/s, 504.02
-              GB/s, 588.04 GB/s, 668.37 GB/s]
-          triad: [100.24 GB/s, 211.57 GB/s, 314.53 GB/s, 392.73 GB/s, 506.87 GB/s,
-            589.51 GB/s, 687.28 GB/s, 782.17 GB/s]
-          update: [84.77 GB/s, 160.10 GB/s, 237.12 GB/s, 312.74 GB/s, 392.54 GB/s,
-            465.53 GB/s, 516.02 GB/s, 567.27 GB/s]
-        size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
-          21.12 kB, 21.12 kB]
-        size per thread: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12
-            kB, 21.12 kB, 21.12 kB]
-        stats:
-          copy:
-          - [83.24 GB/s, 83.25 GB/s, 83.26 GB/s, 83.26 GB/s, 83.27 GB/s, 83.26 GB/s,
-            83.25 GB/s, 83.23 GB/s, 83.24 GB/s, 83.25 GB/s]
-          - [166.49 GB/s, 166.47 GB/s, 166.51 GB/s, 166.49 GB/s, 166.48 GB/s, 166.52
-              GB/s, 166.51 GB/s, 166.51 GB/s, 166.51 GB/s, 166.50 GB/s]
-          - [249.78 GB/s, 249.75 GB/s, 249.73 GB/s, 249.72 GB/s, 249.74 GB/s, 249.76
-              GB/s, 249.76 GB/s, 249.74 GB/s, 249.73 GB/s, 249.75 GB/s]
-          - [332.98 GB/s, 327.92 GB/s, 332.30 GB/s, 332.95 GB/s, 333.00 GB/s, 333.01
-              GB/s, 332.95 GB/s, 333.00 GB/s, 332.99 GB/s, 333.02 GB/s]
-          - [416.26 GB/s, 416.23 GB/s, 416.28 GB/s, 416.27 GB/s, 416.23 GB/s, 416.27
-              GB/s, 416.34 GB/s, 416.26 GB/s, 416.16 GB/s, 416.23 GB/s]
-          - [495.84 GB/s, 495.93 GB/s, 495.88 GB/s, 495.91 GB/s, 495.96 GB/s, 495.92
-              GB/s, 495.89 GB/s, 495.87 GB/s, 495.96 GB/s, 495.92 GB/s]
-          - [578.51 GB/s, 578.52 GB/s, 578.39 GB/s, 578.56 GB/s, 578.48 GB/s, 578.44
-              GB/s, 578.51 GB/s, 578.48 GB/s, 578.51 GB/s, 578.53 GB/s]
-          - [422.14 GB/s, 660.55 GB/s, 660.60 GB/s, 660.49 GB/s, 660.52 GB/s, 660.48
-              GB/s, 660.56 GB/s, 660.56 GB/s, 660.52 GB/s, 651.64 GB/s]
-          daxpy:
-          - [116.87 GB/s, 116.82 GB/s, 116.85 GB/s, 116.84 GB/s, 116.83 GB/s, 116.85
-              GB/s, 116.88 GB/s, 116.87 GB/s, 116.86 GB/s, 116.82 GB/s]
-          - [214.69 GB/s, 229.83 GB/s, 221.16 GB/s, 233.60 GB/s, 232.90 GB/s, 233.68
-              GB/s, 207.83 GB/s, 233.65 GB/s, 212.71 GB/s, 214.07 GB/s]
-          - [282.77 GB/s, 307.63 GB/s, 307.09 GB/s, 310.67 GB/s, 307.50 GB/s, 311.40
-              GB/s, 307.06 GB/s, 305.89 GB/s, 311.60 GB/s, 308.47 GB/s]
-          - [404.96 GB/s, 408.54 GB/s, 395.76 GB/s, 409.72 GB/s, 316.70 GB/s, 408.07
-              GB/s, 347.34 GB/s, 406.03 GB/s, 391.75 GB/s, 385.10 GB/s]
-          - [479.84 GB/s, 509.24 GB/s, 502.60 GB/s, 449.79 GB/s, 402.46 GB/s, 489.18
-              GB/s, 491.15 GB/s, 491.20 GB/s, 384.36 GB/s, 509.79 GB/s]
-          - [515.12 GB/s, 496.21 GB/s, 517.52 GB/s, 540.00 GB/s, 501.82 GB/s, 507.84
-              GB/s, 496.71 GB/s, 479.42 GB/s, 559.65 GB/s, 519.55 GB/s]
-          - [584.86 GB/s, 580.10 GB/s, 583.34 GB/s, 612.77 GB/s, 607.15 GB/s, 607.89
-              GB/s, 589.85 GB/s, 609.59 GB/s, 592.86 GB/s, 568.07 GB/s]
-          - [719.71 GB/s, 660.98 GB/s, 675.88 GB/s, 679.51 GB/s, 696.97 GB/s, 635.23
-              GB/s, 644.06 GB/s, 694.74 GB/s, 654.01 GB/s, 656.57 GB/s]
-          load:
-          - [84.04 GB/s, 84.06 GB/s, 84.06 GB/s, 84.04 GB/s, 84.05 GB/s, 84.05 GB/s,
-            84.07 GB/s, 84.04 GB/s, 84.05 GB/s, 84.06 GB/s]
-          - [168.09 GB/s, 168.12 GB/s, 168.06 GB/s, 168.11 GB/s, 168.12 GB/s, 168.13
-              GB/s, 168.13 GB/s, 168.12 GB/s, 168.10 GB/s, 168.13 GB/s]
-          - [252.16 GB/s, 252.21 GB/s, 252.07 GB/s, 252.07 GB/s, 252.18 GB/s, 252.16
-              GB/s, 252.21 GB/s, 252.20 GB/s, 252.20 GB/s, 252.17 GB/s]
-          - [335.94 GB/s, 336.03 GB/s, 335.99 GB/s, 336.04 GB/s, 336.00 GB/s, 335.98
-              GB/s, 335.97 GB/s, 335.89 GB/s, 335.99 GB/s, 336.03 GB/s]
-          - [420.30 GB/s, 420.18 GB/s, 420.30 GB/s, 420.33 GB/s, 420.25 GB/s, 420.28
-              GB/s, 420.31 GB/s, 420.31 GB/s, 420.34 GB/s, 420.33 GB/s]
-          - [503.98 GB/s, 503.99 GB/s, 503.97 GB/s, 503.98 GB/s, 504.02 GB/s, 503.99
-              GB/s, 503.92 GB/s, 503.98 GB/s, 503.94 GB/s, 503.97 GB/s]
-          - [587.93 GB/s, 588.01 GB/s, 588.04 GB/s, 587.94 GB/s, 587.97 GB/s, 588.01
-              GB/s, 588.00 GB/s, 587.92 GB/s, 588.04 GB/s, 588.02 GB/s]
-          - [668.21 GB/s, 668.22 GB/s, 668.29 GB/s, 668.24 GB/s, 668.27 GB/s, 668.37
-              GB/s, 668.28 GB/s, 668.14 GB/s, 668.19 GB/s, 668.19 GB/s]
-          triad:
-          - [100.00 GB/s, 99.71 GB/s, 99.74 GB/s, 100.24 GB/s, 99.72 GB/s, 99.62 GB/s,
-            99.54 GB/s, 99.61 GB/s, 99.72 GB/s, 99.71 GB/s]
-          - [208.08 GB/s, 210.33 GB/s, 211.57 GB/s, 208.34 GB/s, 210.03 GB/s, 209.16
-              GB/s, 210.21 GB/s, 209.48 GB/s, 210.03 GB/s, 208.80 GB/s]
-          - [311.43 GB/s, 311.08 GB/s, 311.41 GB/s, 311.10 GB/s, 313.13 GB/s, 314.53
-              GB/s, 311.59 GB/s, 311.80 GB/s, 311.57 GB/s, 311.89 GB/s]
-          - [391.65 GB/s, 392.34 GB/s, 391.84 GB/s, 392.07 GB/s, 391.96 GB/s, 392.73
-              GB/s, 391.66 GB/s, 391.83 GB/s, 392.09 GB/s, 391.88 GB/s]
-          - [504.20 GB/s, 506.77 GB/s, 503.22 GB/s, 506.74 GB/s, 502.78 GB/s, 506.15
-              GB/s, 506.87 GB/s, 502.85 GB/s, 505.82 GB/s, 506.57 GB/s]
-          - [587.75 GB/s, 589.51 GB/s, 588.01 GB/s, 587.29 GB/s, 588.04 GB/s, 587.92
-              GB/s, 588.08 GB/s, 587.94 GB/s, 587.82 GB/s, 587.55 GB/s]
-          - [686.03 GB/s, 685.97 GB/s, 685.01 GB/s, 685.88 GB/s, 685.61 GB/s, 687.12
-              GB/s, 684.97 GB/s, 686.09 GB/s, 685.81 GB/s, 687.28 GB/s]
-          - [782.05 GB/s, 781.73 GB/s, 781.13 GB/s, 781.87 GB/s, 782.17 GB/s, 781.24
-              GB/s, 781.82 GB/s, 781.92 GB/s, 781.90 GB/s, 781.66 GB/s]
-          update:
-          - [84.76 GB/s, 84.76 GB/s, 84.77 GB/s, 84.75 GB/s, 84.75 GB/s, 84.75 GB/s,
-            84.75 GB/s, 84.75 GB/s, 84.74 GB/s, 57.21 GB/s]
-          - [157.73 GB/s, 155.29 GB/s, 147.91 GB/s, 160.10 GB/s, 156.33 GB/s, 158.06
-              GB/s, 159.23 GB/s, 156.16 GB/s, 155.30 GB/s, 159.15 GB/s]
-          - [232.07 GB/s, 230.40 GB/s, 234.05 GB/s, 232.69 GB/s, 215.80 GB/s, 232.76
-              GB/s, 236.01 GB/s, 237.12 GB/s, 234.66 GB/s, 234.86 GB/s]
-          - [303.60 GB/s, 304.21 GB/s, 306.83 GB/s, 309.43 GB/s, 312.69 GB/s, 311.75
-              GB/s, 301.74 GB/s, 307.54 GB/s, 312.74 GB/s, 312.19 GB/s]
-          - [386.45 GB/s, 382.41 GB/s, 387.87 GB/s, 392.54 GB/s, 369.42 GB/s, 341.87
-              GB/s, 352.85 GB/s, 390.87 GB/s, 382.44 GB/s, 383.50 GB/s]
-          - [459.60 GB/s, 384.27 GB/s, 437.39 GB/s, 459.42 GB/s, 465.53 GB/s, 447.31
-              GB/s, 440.00 GB/s, 409.94 GB/s, 412.94 GB/s, 446.74 GB/s]
-          - [489.85 GB/s, 489.35 GB/s, 435.92 GB/s, 492.39 GB/s, 446.44 GB/s, 501.71
-              GB/s, 516.02 GB/s, 478.87 GB/s, 494.52 GB/s, 493.04 GB/s]
-          - [521.08 GB/s, 553.73 GB/s, 541.34 GB/s, 527.75 GB/s, 554.87 GB/s, 536.30
-              GB/s, 540.66 GB/s, 551.02 GB/s, 567.27 GB/s, 565.31 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
-          147.84 kB, 168.96 kB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [80.41 GB/s, 160.83 GB/s, 240.43 GB/s, 320.63 GB/s, 401.66 GB/s, 454.32
-              GB/s, 539.77 GB/s, 628.51 GB/s]
-          daxpy: [95.87 GB/s, 187.75 GB/s, 270.68 GB/s, 371.80 GB/s, 454.05 GB/s,
-            503.46 GB/s, 606.85 GB/s, 689.34 GB/s]
-          load: [82.30 GB/s, 164.06 GB/s, 244.78 GB/s, 326.21 GB/s, 408.56 GB/s, 490.13
-              GB/s, 569.95 GB/s, 651.79 GB/s]
-          triad: [93.22 GB/s, 186.75 GB/s, 288.55 GB/s, 340.91 GB/s, 442.20 GB/s,
-            534.62 GB/s, 597.98 GB/s, 707.54 GB/s]
-          update: [83.25 GB/s, 166.04 GB/s, 248.21 GB/s, 330.58 GB/s, 414.71 GB/s,
-            496.97 GB/s, 578.67 GB/s, 656.56 GB/s]
-        size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
-          21.12 kB, 21.12 kB]
-        size per thread: [10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56
-            kB, 10.56 kB, 10.56 kB]
-        stats:
-          copy:
-          - [80.37 GB/s, 79.07 GB/s, 80.39 GB/s, 80.39 GB/s, 80.41 GB/s, 80.29 GB/s,
-            80.36 GB/s, 79.05 GB/s, 77.87 GB/s, 80.37 GB/s]
-          - [160.76 GB/s, 160.63 GB/s, 160.76 GB/s, 160.71 GB/s, 160.80 GB/s, 160.74
-              GB/s, 160.83 GB/s, 160.69 GB/s, 160.79 GB/s, 160.78 GB/s]
-          - [240.43 GB/s, 240.20 GB/s, 240.36 GB/s, 240.37 GB/s, 237.17 GB/s, 240.39
-              GB/s, 240.14 GB/s, 240.24 GB/s, 240.26 GB/s, 240.10 GB/s]
-          - [320.46 GB/s, 320.47 GB/s, 320.63 GB/s, 320.52 GB/s, 320.40 GB/s, 320.40
-              GB/s, 320.51 GB/s, 320.46 GB/s, 319.72 GB/s, 320.44 GB/s]
-          - [401.40 GB/s, 399.28 GB/s, 401.66 GB/s, 401.53 GB/s, 401.52 GB/s, 401.55
-              GB/s, 401.60 GB/s, 401.47 GB/s, 401.47 GB/s, 401.35 GB/s]
-          - [447.24 GB/s, 453.65 GB/s, 453.54 GB/s, 453.86 GB/s, 453.82 GB/s, 453.62
-              GB/s, 453.48 GB/s, 454.32 GB/s, 453.86 GB/s, 446.79 GB/s]
-          - [538.79 GB/s, 538.47 GB/s, 539.02 GB/s, 538.25 GB/s, 538.72 GB/s, 538.89
-              GB/s, 539.37 GB/s, 539.41 GB/s, 539.77 GB/s, 538.49 GB/s]
-          - [628.14 GB/s, 618.54 GB/s, 628.12 GB/s, 623.90 GB/s, 628.27 GB/s, 623.78
-              GB/s, 618.17 GB/s, 623.43 GB/s, 628.51 GB/s, 628.43 GB/s]
-          daxpy:
-          - [95.77 GB/s, 93.25 GB/s, 92.87 GB/s, 95.87 GB/s, 95.84 GB/s, 95.81 GB/s,
-            95.80 GB/s, 94.99 GB/s, 95.81 GB/s, 95.86 GB/s]
-          - [184.53 GB/s, 186.60 GB/s, 183.99 GB/s, 187.48 GB/s, 187.75 GB/s, 181.53
-              GB/s, 183.82 GB/s, 187.75 GB/s, 184.13 GB/s, 180.61 GB/s]
-          - [258.46 GB/s, 270.13 GB/s, 264.76 GB/s, 262.23 GB/s, 265.05 GB/s, 267.25
-              GB/s, 270.68 GB/s, 268.08 GB/s, 266.20 GB/s, 265.66 GB/s]
-          - [367.99 GB/s, 367.15 GB/s, 361.68 GB/s, 364.86 GB/s, 368.76 GB/s, 363.27
-              GB/s, 364.95 GB/s, 366.97 GB/s, 371.80 GB/s, 366.55 GB/s]
-          - [441.95 GB/s, 442.77 GB/s, 444.97 GB/s, 454.05 GB/s, 441.02 GB/s, 445.96
-              GB/s, 442.49 GB/s, 440.23 GB/s, 449.29 GB/s, 452.66 GB/s]
-          - [501.31 GB/s, 489.91 GB/s, 495.43 GB/s, 503.39 GB/s, 488.03 GB/s, 497.71
-              GB/s, 503.46 GB/s, 496.85 GB/s, 497.38 GB/s, 468.90 GB/s]
-          - [604.57 GB/s, 580.51 GB/s, 587.67 GB/s, 594.32 GB/s, 561.32 GB/s, 588.09
-              GB/s, 606.85 GB/s, 600.91 GB/s, 599.40 GB/s, 598.24 GB/s]
-          - [646.48 GB/s, 655.06 GB/s, 684.70 GB/s, 653.61 GB/s, 671.61 GB/s, 689.34
-              GB/s, 673.74 GB/s, 685.49 GB/s, 681.48 GB/s, 683.23 GB/s]
-          load:
-          - [82.19 GB/s, 82.08 GB/s, 82.22 GB/s, 82.10 GB/s, 82.14 GB/s, 82.17 GB/s,
-            82.22 GB/s, 82.28 GB/s, 82.30 GB/s, 81.98 GB/s]
-          - [163.22 GB/s, 163.43 GB/s, 164.06 GB/s, 164.03 GB/s, 163.19 GB/s, 163.83
-              GB/s, 163.29 GB/s, 163.88 GB/s, 163.83 GB/s, 163.11 GB/s]
-          - [244.32 GB/s, 244.47 GB/s, 244.65 GB/s, 244.29 GB/s, 243.96 GB/s, 244.50
-              GB/s, 244.78 GB/s, 244.52 GB/s, 244.48 GB/s, 244.72 GB/s]
-          - [325.18 GB/s, 326.21 GB/s, 325.49 GB/s, 325.86 GB/s, 325.73 GB/s, 325.72
-              GB/s, 326.00 GB/s, 325.41 GB/s, 325.63 GB/s, 325.82 GB/s]
-          - [407.81 GB/s, 407.96 GB/s, 407.59 GB/s, 408.56 GB/s, 407.64 GB/s, 407.61
-              GB/s, 408.09 GB/s, 407.95 GB/s, 408.30 GB/s, 408.32 GB/s]
-          - [488.65 GB/s, 489.73 GB/s, 489.38 GB/s, 489.81 GB/s, 490.13 GB/s, 489.31
-              GB/s, 488.74 GB/s, 489.38 GB/s, 488.17 GB/s, 489.51 GB/s]
-          - [569.95 GB/s, 567.21 GB/s, 566.08 GB/s, 567.88 GB/s, 567.69 GB/s, 569.58
-              GB/s, 568.61 GB/s, 568.35 GB/s, 569.70 GB/s, 568.87 GB/s]
-          - [650.43 GB/s, 651.58 GB/s, 650.86 GB/s, 651.34 GB/s, 651.04 GB/s, 651.79
-              GB/s, 650.28 GB/s, 650.31 GB/s, 650.81 GB/s, 651.09 GB/s]
-          triad:
-          - [93.22 GB/s, 90.73 GB/s, 92.48 GB/s, 92.53 GB/s, 92.37 GB/s, 92.50 GB/s,
-            92.48 GB/s, 90.28 GB/s, 92.35 GB/s, 92.51 GB/s]
-          - [186.75 GB/s, 184.51 GB/s, 184.17 GB/s, 186.66 GB/s, 186.43 GB/s, 184.59
-              GB/s, 186.71 GB/s, 186.30 GB/s, 186.64 GB/s, 186.12 GB/s]
-          - [287.77 GB/s, 288.55 GB/s, 287.76 GB/s, 287.76 GB/s, 288.19 GB/s, 287.70
-              GB/s, 287.42 GB/s, 288.12 GB/s, 287.66 GB/s, 288.01 GB/s]
-          - [339.82 GB/s, 338.95 GB/s, 340.11 GB/s, 340.11 GB/s, 340.25 GB/s, 340.20
-              GB/s, 339.90 GB/s, 340.22 GB/s, 340.91 GB/s, 340.01 GB/s]
-          - [440.41 GB/s, 440.65 GB/s, 441.59 GB/s, 442.20 GB/s, 441.67 GB/s, 432.59
-              GB/s, 440.20 GB/s, 440.81 GB/s, 440.24 GB/s, 441.38 GB/s]
-          - [534.30 GB/s, 527.60 GB/s, 528.52 GB/s, 509.55 GB/s, 527.68 GB/s, 527.63
-              GB/s, 533.66 GB/s, 534.62 GB/s, 534.60 GB/s, 534.19 GB/s]
-          - [595.90 GB/s, 595.94 GB/s, 597.91 GB/s, 580.22 GB/s, 597.98 GB/s, 597.66
-              GB/s, 596.16 GB/s, 567.03 GB/s, 580.88 GB/s, 578.29 GB/s]
-          - [703.80 GB/s, 705.57 GB/s, 694.84 GB/s, 682.59 GB/s, 694.37 GB/s, 696.56
-              GB/s, 704.50 GB/s, 704.95 GB/s, 694.52 GB/s, 707.54 GB/s]
-          update:
-          - [83.18 GB/s, 83.24 GB/s, 83.25 GB/s, 83.16 GB/s, 83.22 GB/s, 83.23 GB/s,
-            83.22 GB/s, 83.21 GB/s, 83.20 GB/s, 83.17 GB/s]
-          - [165.65 GB/s, 165.76 GB/s, 165.99 GB/s, 166.04 GB/s, 165.49 GB/s, 165.87
-              GB/s, 165.58 GB/s, 165.96 GB/s, 165.67 GB/s, 165.66 GB/s]
-          - [247.30 GB/s, 248.14 GB/s, 247.84 GB/s, 247.90 GB/s, 247.77 GB/s, 247.60
-              GB/s, 248.21 GB/s, 247.95 GB/s, 248.05 GB/s, 247.83 GB/s]
-          - [330.49 GB/s, 330.07 GB/s, 329.91 GB/s, 329.90 GB/s, 330.58 GB/s, 329.30
-              GB/s, 329.92 GB/s, 330.03 GB/s, 330.04 GB/s, 330.12 GB/s]
-          - [413.89 GB/s, 414.04 GB/s, 413.56 GB/s, 414.06 GB/s, 414.15 GB/s, 413.94
-              GB/s, 414.04 GB/s, 414.71 GB/s, 414.32 GB/s, 413.93 GB/s]
-          - [496.97 GB/s, 496.80 GB/s, 496.17 GB/s, 495.42 GB/s, 496.17 GB/s, 496.66
-              GB/s, 495.55 GB/s, 496.27 GB/s, 495.52 GB/s, 496.80 GB/s]
-          - [564.44 GB/s, 577.86 GB/s, 574.38 GB/s, 571.96 GB/s, 564.76 GB/s, 578.67
-              GB/s, 565.89 GB/s, 572.49 GB/s, 571.80 GB/s, 572.01 GB/s]
-          - [647.68 GB/s, 656.56 GB/s, 655.56 GB/s, 644.04 GB/s, 655.30 GB/s, 648.80
-              GB/s, 654.77 GB/s, 653.58 GB/s, 656.27 GB/s, 653.79 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
-          147.84 kB, 168.96 kB]
-    L2:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [36.74 GB/s, 73.65 GB/s, 107.11 GB/s, 141.43 GB/s, 179.70 GB/s, 215.63
-              GB/s, 247.20 GB/s, 282.42 GB/s]
-          daxpy: [44.59 GB/s, 88.24 GB/s, 132.21 GB/s, 175.78 GB/s, 219.11 GB/s, 259.95
-              GB/s, 305.84 GB/s, 346.83 GB/s]
-          load: [31.46 GB/s, 62.97 GB/s, 93.73 GB/s, 125.46 GB/s, 157.32 GB/s, 183.63
-              GB/s, 214.02 GB/s, 245.17 GB/s]
-          triad: [37.79 GB/s, 75.08 GB/s, 111.43 GB/s, 148.90 GB/s, 185.54 GB/s, 223.72
-              GB/s, 258.53 GB/s, 299.32 GB/s]
-          update: [48.46 GB/s, 96.10 GB/s, 141.97 GB/s, 189.18 GB/s, 234.73 GB/s,
-            280.47 GB/s, 330.94 GB/s, 365.43 GB/s]
-        size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
-            kB, 168.96 kB, 168.96 kB]
-        size per thread: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
-            kB, 168.96 kB, 168.96 kB]
-        stats:
-          copy:
-          - [36.38 GB/s, 36.59 GB/s, 36.18 GB/s, 36.57 GB/s, 36.26 GB/s, 34.61 GB/s,
-            35.96 GB/s, 35.84 GB/s, 36.74 GB/s, 36.53 GB/s]
-          - [68.97 GB/s, 70.42 GB/s, 69.88 GB/s, 71.40 GB/s, 69.05 GB/s, 72.46 GB/s,
-            70.32 GB/s, 73.65 GB/s, 72.14 GB/s, 69.81 GB/s]
-          - [107.08 GB/s, 103.53 GB/s, 107.11 GB/s, 103.66 GB/s, 103.88 GB/s, 106.48
-              GB/s, 97.32 GB/s, 105.92 GB/s, 104.16 GB/s, 104.84 GB/s]
-          - [138.97 GB/s, 136.86 GB/s, 140.88 GB/s, 138.96 GB/s, 140.58 GB/s, 138.51
-              GB/s, 141.43 GB/s, 139.53 GB/s, 141.20 GB/s, 139.43 GB/s]
-          - [158.20 GB/s, 171.06 GB/s, 179.70 GB/s, 171.43 GB/s, 174.27 GB/s, 175.01
-              GB/s, 165.20 GB/s, 170.89 GB/s, 173.01 GB/s, 175.17 GB/s]
-          - [209.74 GB/s, 204.59 GB/s, 215.27 GB/s, 215.63 GB/s, 210.59 GB/s, 206.94
-              GB/s, 211.03 GB/s, 201.61 GB/s, 214.45 GB/s, 208.15 GB/s]
-          - [241.38 GB/s, 246.88 GB/s, 246.90 GB/s, 247.20 GB/s, 235.27 GB/s, 227.39
-              GB/s, 239.48 GB/s, 244.45 GB/s, 246.68 GB/s, 235.87 GB/s]
-          - [271.07 GB/s, 282.42 GB/s, 282.38 GB/s, 276.20 GB/s, 269.85 GB/s, 276.96
-              GB/s, 268.64 GB/s, 269.61 GB/s, 279.68 GB/s, 280.63 GB/s]
-          daxpy:
-          - [44.54 GB/s, 44.59 GB/s, 44.50 GB/s, 44.42 GB/s, 44.41 GB/s, 44.06 GB/s,
-            43.39 GB/s, 44.02 GB/s, 44.34 GB/s, 44.28 GB/s]
-          - [85.35 GB/s, 87.05 GB/s, 86.47 GB/s, 86.90 GB/s, 86.92 GB/s, 88.24 GB/s,
-            87.39 GB/s, 87.60 GB/s, 87.55 GB/s, 84.19 GB/s]
-          - [129.21 GB/s, 130.47 GB/s, 123.29 GB/s, 127.92 GB/s, 132.21 GB/s, 128.37
-              GB/s, 127.09 GB/s, 128.72 GB/s, 129.34 GB/s, 128.69 GB/s]
-          - [171.53 GB/s, 169.64 GB/s, 173.92 GB/s, 173.74 GB/s, 168.53 GB/s, 171.54
-              GB/s, 173.96 GB/s, 175.78 GB/s, 171.29 GB/s, 171.33 GB/s]
-          - [219.11 GB/s, 208.86 GB/s, 211.66 GB/s, 216.47 GB/s, 212.73 GB/s, 204.90
-              GB/s, 208.87 GB/s, 215.75 GB/s, 213.61 GB/s, 214.56 GB/s]
-          - [250.69 GB/s, 241.36 GB/s, 255.22 GB/s, 250.29 GB/s, 253.80 GB/s, 256.34
-              GB/s, 254.38 GB/s, 259.95 GB/s, 245.69 GB/s, 259.12 GB/s]
-          - [296.08 GB/s, 301.77 GB/s, 297.40 GB/s, 305.84 GB/s, 288.62 GB/s, 283.76
-              GB/s, 293.61 GB/s, 291.93 GB/s, 299.74 GB/s, 289.76 GB/s]
-          - [344.46 GB/s, 334.36 GB/s, 339.31 GB/s, 330.88 GB/s, 343.26 GB/s, 327.28
-              GB/s, 344.53 GB/s, 346.83 GB/s, 344.29 GB/s, 346.28 GB/s]
-          load:
-          - [31.40 GB/s, 31.23 GB/s, 31.29 GB/s, 31.24 GB/s, 31.46 GB/s, 31.20 GB/s,
-            31.33 GB/s, 30.01 GB/s, 30.08 GB/s, 31.40 GB/s]
-          - [61.20 GB/s, 60.74 GB/s, 61.93 GB/s, 61.22 GB/s, 61.20 GB/s, 60.03 GB/s,
-            59.33 GB/s, 59.94 GB/s, 58.54 GB/s, 62.97 GB/s]
-          - [91.53 GB/s, 93.73 GB/s, 93.05 GB/s, 90.07 GB/s, 91.60 GB/s, 90.11 GB/s,
-            90.21 GB/s, 90.43 GB/s, 89.15 GB/s, 93.10 GB/s]
-          - [122.80 GB/s, 116.57 GB/s, 120.68 GB/s, 122.54 GB/s, 122.75 GB/s, 121.79
-              GB/s, 125.30 GB/s, 125.46 GB/s, 122.28 GB/s, 124.51 GB/s]
-          - [151.01 GB/s, 151.10 GB/s, 148.68 GB/s, 151.17 GB/s, 147.24 GB/s, 153.65
-              GB/s, 146.48 GB/s, 150.48 GB/s, 150.74 GB/s, 157.32 GB/s]
-          - [181.52 GB/s, 173.89 GB/s, 181.58 GB/s, 174.01 GB/s, 176.40 GB/s, 179.73
-              GB/s, 174.06 GB/s, 181.26 GB/s, 180.57 GB/s, 183.63 GB/s]
-          - [214.02 GB/s, 205.69 GB/s, 207.64 GB/s, 204.18 GB/s, 208.42 GB/s, 211.39
-              GB/s, 206.58 GB/s, 204.90 GB/s, 204.75 GB/s, 208.91 GB/s]
-          - [232.16 GB/s, 233.90 GB/s, 241.32 GB/s, 237.45 GB/s, 235.41 GB/s, 241.17
-              GB/s, 237.52 GB/s, 245.17 GB/s, 241.17 GB/s, 234.08 GB/s]
-          triad:
-          - [37.62 GB/s, 37.54 GB/s, 37.79 GB/s, 37.67 GB/s, 37.76 GB/s, 37.77 GB/s,
-            37.68 GB/s, 35.83 GB/s, 37.06 GB/s, 37.50 GB/s]
-          - [72.79 GB/s, 74.76 GB/s, 73.15 GB/s, 74.68 GB/s, 73.88 GB/s, 73.27 GB/s,
-            75.08 GB/s, 73.48 GB/s, 71.27 GB/s, 72.05 GB/s]
-          - [106.26 GB/s, 105.22 GB/s, 109.70 GB/s, 109.07 GB/s, 110.84 GB/s, 111.43
-              GB/s, 106.32 GB/s, 109.73 GB/s, 106.22 GB/s, 107.20 GB/s]
-          - [142.10 GB/s, 148.90 GB/s, 148.11 GB/s, 144.38 GB/s, 144.77 GB/s, 145.42
-              GB/s, 147.36 GB/s, 142.94 GB/s, 145.39 GB/s, 139.42 GB/s]
-          - [182.07 GB/s, 176.75 GB/s, 181.39 GB/s, 183.31 GB/s, 181.87 GB/s, 183.71
-              GB/s, 180.48 GB/s, 178.11 GB/s, 181.36 GB/s, 185.54 GB/s]
-          - [219.85 GB/s, 217.02 GB/s, 218.86 GB/s, 217.09 GB/s, 212.24 GB/s, 212.22
-              GB/s, 219.33 GB/s, 208.81 GB/s, 215.84 GB/s, 223.72 GB/s]
-          - [258.06 GB/s, 232.27 GB/s, 247.04 GB/s, 240.55 GB/s, 236.11 GB/s, 251.88
-              GB/s, 258.53 GB/s, 247.32 GB/s, 251.53 GB/s, 245.10 GB/s]
-          - [273.67 GB/s, 292.81 GB/s, 288.67 GB/s, 289.75 GB/s, 293.98 GB/s, 283.56
-              GB/s, 295.33 GB/s, 280.11 GB/s, 299.32 GB/s, 285.18 GB/s]
-          update:
-          - [47.30 GB/s, 48.33 GB/s, 48.17 GB/s, 47.38 GB/s, 48.16 GB/s, 46.99 GB/s,
-            48.46 GB/s, 47.51 GB/s, 46.20 GB/s, 48.26 GB/s]
-          - [92.10 GB/s, 92.30 GB/s, 95.73 GB/s, 95.53 GB/s, 86.95 GB/s, 96.10 GB/s,
-            94.16 GB/s, 89.72 GB/s, 92.00 GB/s, 93.10 GB/s]
-          - [137.06 GB/s, 140.40 GB/s, 136.20 GB/s, 139.57 GB/s, 140.69 GB/s, 136.20
-              GB/s, 141.53 GB/s, 129.76 GB/s, 136.47 GB/s, 141.97 GB/s]
-          - [184.84 GB/s, 177.96 GB/s, 178.61 GB/s, 179.03 GB/s, 176.59 GB/s, 180.62
-              GB/s, 182.26 GB/s, 182.27 GB/s, 189.18 GB/s, 185.49 GB/s]
-          - [232.17 GB/s, 217.86 GB/s, 232.40 GB/s, 223.10 GB/s, 228.52 GB/s, 234.73
-              GB/s, 232.00 GB/s, 233.14 GB/s, 231.69 GB/s, 225.01 GB/s]
-          - [276.16 GB/s, 274.80 GB/s, 272.58 GB/s, 272.43 GB/s, 280.47 GB/s, 276.90
-              GB/s, 264.76 GB/s, 272.47 GB/s, 277.77 GB/s, 271.42 GB/s]
-          - [330.94 GB/s, 312.06 GB/s, 312.83 GB/s, 312.62 GB/s, 292.44 GB/s, 315.68
-              GB/s, 316.67 GB/s, 321.25 GB/s, 321.71 GB/s, 315.05 GB/s]
-          - [362.85 GB/s, 356.49 GB/s, 365.43 GB/s, 332.52 GB/s, 354.30 GB/s, 354.68
-              GB/s, 335.54 GB/s, 358.54 GB/s, 363.22 GB/s, 360.01 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
-          1.18 MB, 1.35 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [36.83 GB/s, 72.70 GB/s, 108.11 GB/s, 142.21 GB/s, 178.07 GB/s, 213.30
-              GB/s, 251.98 GB/s, 283.06 GB/s]
-          daxpy: [45.34 GB/s, 90.11 GB/s, 134.85 GB/s, 180.06 GB/s, 224.22 GB/s, 268.27
-              GB/s, 312.15 GB/s, 358.38 GB/s]
-          load: [33.99 GB/s, 67.65 GB/s, 100.93 GB/s, 134.81 GB/s, 165.89 GB/s, 196.09
-              GB/s, 233.31 GB/s, 262.05 GB/s]
-          triad: [38.60 GB/s, 76.58 GB/s, 114.50 GB/s, 150.54 GB/s, 189.60 GB/s, 227.05
-              GB/s, 263.75 GB/s, 301.02 GB/s]
-          update: [49.25 GB/s, 97.34 GB/s, 146.81 GB/s, 194.71 GB/s, 239.97 GB/s,
-            287.14 GB/s, 330.84 GB/s, 384.71 GB/s]
-        size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
-            kB, 168.96 kB, 168.96 kB]
-        size per thread: [84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48
-            kB, 84.48 kB, 84.48 kB]
-        stats:
-          copy:
-          - [36.83 GB/s, 36.67 GB/s, 34.90 GB/s, 36.44 GB/s, 35.13 GB/s, 35.07 GB/s,
-            35.53 GB/s, 36.15 GB/s, 35.85 GB/s, 36.23 GB/s]
-          - [71.52 GB/s, 70.16 GB/s, 70.67 GB/s, 71.20 GB/s, 72.70 GB/s, 70.14 GB/s,
-            70.53 GB/s, 69.17 GB/s, 71.57 GB/s, 70.22 GB/s]
-          - [104.39 GB/s, 104.74 GB/s, 103.12 GB/s, 108.11 GB/s, 105.30 GB/s, 102.80
-              GB/s, 102.90 GB/s, 107.06 GB/s, 103.45 GB/s, 105.45 GB/s]
-          - [139.02 GB/s, 134.63 GB/s, 140.72 GB/s, 141.32 GB/s, 140.35 GB/s, 141.19
-              GB/s, 135.44 GB/s, 142.21 GB/s, 140.96 GB/s, 142.05 GB/s]
-          - [177.86 GB/s, 177.74 GB/s, 177.42 GB/s, 175.35 GB/s, 176.42 GB/s, 173.13
-              GB/s, 174.32 GB/s, 170.24 GB/s, 178.07 GB/s, 177.88 GB/s]
-          - [206.27 GB/s, 211.63 GB/s, 209.06 GB/s, 210.54 GB/s, 208.80 GB/s, 209.99
-              GB/s, 208.77 GB/s, 206.41 GB/s, 213.30 GB/s, 206.39 GB/s]
-          - [240.18 GB/s, 238.36 GB/s, 244.16 GB/s, 236.26 GB/s, 244.12 GB/s, 238.49
-              GB/s, 242.23 GB/s, 244.46 GB/s, 251.98 GB/s, 242.55 GB/s]
-          - [279.77 GB/s, 282.91 GB/s, 278.73 GB/s, 276.91 GB/s, 283.06 GB/s, 273.23
-              GB/s, 278.33 GB/s, 280.88 GB/s, 277.54 GB/s, 281.83 GB/s]
-          daxpy:
-          - [45.32 GB/s, 44.62 GB/s, 45.29 GB/s, 45.18 GB/s, 45.17 GB/s, 45.07 GB/s,
-            44.69 GB/s, 45.17 GB/s, 45.11 GB/s, 45.34 GB/s]
-          - [89.94 GB/s, 89.97 GB/s, 89.37 GB/s, 89.90 GB/s, 88.37 GB/s, 89.13 GB/s,
-            90.11 GB/s, 89.67 GB/s, 89.90 GB/s, 89.93 GB/s]
-          - [134.83 GB/s, 134.85 GB/s, 132.02 GB/s, 134.33 GB/s, 133.82 GB/s, 132.39
-              GB/s, 131.67 GB/s, 134.62 GB/s, 132.71 GB/s, 131.67 GB/s]
-          - [175.52 GB/s, 173.36 GB/s, 176.83 GB/s, 177.98 GB/s, 175.73 GB/s, 173.42
-              GB/s, 180.06 GB/s, 179.55 GB/s, 176.71 GB/s, 175.85 GB/s]
-          - [222.00 GB/s, 216.86 GB/s, 220.17 GB/s, 218.14 GB/s, 220.60 GB/s, 219.43
-              GB/s, 220.58 GB/s, 224.22 GB/s, 220.89 GB/s, 222.28 GB/s]
-          - [258.75 GB/s, 262.88 GB/s, 261.77 GB/s, 268.27 GB/s, 263.66 GB/s, 262.59
-              GB/s, 266.54 GB/s, 261.67 GB/s, 262.80 GB/s, 263.72 GB/s]
-          - [298.65 GB/s, 312.15 GB/s, 308.52 GB/s, 304.22 GB/s, 301.87 GB/s, 305.53
-              GB/s, 309.84 GB/s, 310.67 GB/s, 310.49 GB/s, 311.99 GB/s]
-          - [347.55 GB/s, 350.67 GB/s, 348.93 GB/s, 358.38 GB/s, 352.35 GB/s, 352.05
-              GB/s, 353.82 GB/s, 356.00 GB/s, 348.07 GB/s, 349.87 GB/s]
-          load:
-          - [33.99 GB/s, 32.54 GB/s, 32.94 GB/s, 33.17 GB/s, 33.83 GB/s, 31.55 GB/s,
-            31.91 GB/s, 33.86 GB/s, 33.93 GB/s, 33.75 GB/s]
-          - [66.22 GB/s, 64.94 GB/s, 67.64 GB/s, 67.52 GB/s, 65.01 GB/s, 67.21 GB/s,
-            66.07 GB/s, 66.43 GB/s, 67.65 GB/s, 64.84 GB/s]
-          - [98.58 GB/s, 97.97 GB/s, 98.39 GB/s, 98.50 GB/s, 98.77 GB/s, 97.84 GB/s,
-            99.58 GB/s, 100.93 GB/s, 100.50 GB/s, 99.94 GB/s]
-          - [130.23 GB/s, 131.10 GB/s, 131.04 GB/s, 127.83 GB/s, 134.81 GB/s, 132.68
-              GB/s, 131.80 GB/s, 129.42 GB/s, 130.76 GB/s, 126.96 GB/s]
-          - [164.90 GB/s, 165.18 GB/s, 161.19 GB/s, 164.33 GB/s, 162.76 GB/s, 165.04
-              GB/s, 162.20 GB/s, 165.89 GB/s, 164.34 GB/s, 159.66 GB/s]
-          - [192.69 GB/s, 193.33 GB/s, 188.88 GB/s, 190.70 GB/s, 194.60 GB/s, 190.92
-              GB/s, 191.36 GB/s, 192.89 GB/s, 191.85 GB/s, 196.09 GB/s]
-          - [227.70 GB/s, 223.95 GB/s, 222.79 GB/s, 227.09 GB/s, 227.04 GB/s, 229.45
-              GB/s, 228.09 GB/s, 227.83 GB/s, 233.31 GB/s, 227.49 GB/s]
-          - [257.94 GB/s, 261.47 GB/s, 262.05 GB/s, 257.70 GB/s, 259.70 GB/s, 259.23
-              GB/s, 261.09 GB/s, 253.81 GB/s, 254.21 GB/s, 259.34 GB/s]
-          triad:
-          - [38.60 GB/s, 36.68 GB/s, 38.07 GB/s, 38.10 GB/s, 37.89 GB/s, 36.48 GB/s,
-            38.33 GB/s, 38.12 GB/s, 37.43 GB/s, 37.87 GB/s]
-          - [76.58 GB/s, 74.97 GB/s, 75.74 GB/s, 76.02 GB/s, 72.66 GB/s, 74.73 GB/s,
-            76.37 GB/s, 76.18 GB/s, 74.59 GB/s, 75.75 GB/s]
-          - [111.71 GB/s, 114.50 GB/s, 108.96 GB/s, 111.49 GB/s, 111.56 GB/s, 111.66
-              GB/s, 113.43 GB/s, 114.37 GB/s, 111.67 GB/s, 108.14 GB/s]
-          - [146.29 GB/s, 147.84 GB/s, 149.09 GB/s, 149.93 GB/s, 150.54 GB/s, 145.50
-              GB/s, 145.16 GB/s, 149.47 GB/s, 146.30 GB/s, 149.32 GB/s]
-          - [186.73 GB/s, 186.46 GB/s, 180.47 GB/s, 187.32 GB/s, 184.34 GB/s, 187.34
-              GB/s, 186.55 GB/s, 183.81 GB/s, 189.60 GB/s, 188.70 GB/s]
-          - [224.81 GB/s, 219.69 GB/s, 227.05 GB/s, 224.25 GB/s, 223.36 GB/s, 225.86
-              GB/s, 216.09 GB/s, 221.98 GB/s, 218.47 GB/s, 226.37 GB/s]
-          - [263.29 GB/s, 259.28 GB/s, 258.81 GB/s, 258.77 GB/s, 256.56 GB/s, 256.49
-              GB/s, 256.39 GB/s, 263.75 GB/s, 262.00 GB/s, 261.48 GB/s]
-          - [299.28 GB/s, 292.80 GB/s, 293.63 GB/s, 297.93 GB/s, 293.02 GB/s, 295.95
-              GB/s, 287.92 GB/s, 301.02 GB/s, 300.76 GB/s, 297.01 GB/s]
-          update:
-          - [49.07 GB/s, 47.17 GB/s, 47.56 GB/s, 49.25 GB/s, 46.44 GB/s, 49.04 GB/s,
-            48.91 GB/s, 49.20 GB/s, 48.30 GB/s, 48.85 GB/s]
-          - [96.45 GB/s, 97.11 GB/s, 94.03 GB/s, 92.56 GB/s, 95.39 GB/s, 97.34 GB/s,
-            96.06 GB/s, 92.25 GB/s, 95.53 GB/s, 97.08 GB/s]
-          - [137.54 GB/s, 135.13 GB/s, 145.80 GB/s, 141.29 GB/s, 138.99 GB/s, 143.44
-              GB/s, 146.81 GB/s, 142.94 GB/s, 133.84 GB/s, 146.33 GB/s]
-          - [190.64 GB/s, 185.02 GB/s, 194.24 GB/s, 187.48 GB/s, 194.52 GB/s, 188.51
-              GB/s, 189.17 GB/s, 194.71 GB/s, 194.37 GB/s, 190.83 GB/s]
-          - [239.97 GB/s, 219.74 GB/s, 233.72 GB/s, 234.38 GB/s, 235.78 GB/s, 235.11
-              GB/s, 235.62 GB/s, 226.09 GB/s, 235.93 GB/s, 230.51 GB/s]
-          - [280.16 GB/s, 275.22 GB/s, 260.15 GB/s, 286.01 GB/s, 280.61 GB/s, 287.14
-              GB/s, 283.75 GB/s, 275.23 GB/s, 283.71 GB/s, 285.38 GB/s]
-          - [311.15 GB/s, 318.00 GB/s, 325.21 GB/s, 328.34 GB/s, 318.09 GB/s, 328.66
-              GB/s, 329.69 GB/s, 316.97 GB/s, 328.51 GB/s, 330.84 GB/s]
-          - [374.41 GB/s, 369.73 GB/s, 358.15 GB/s, 375.54 GB/s, 384.71 GB/s, 357.66
-              GB/s, 369.71 GB/s, 375.35 GB/s, 370.25 GB/s, 364.01 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
-          1.18 MB, 1.35 MB]
-    L3:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [21.93 GB/s, 43.10 GB/s, 65.38 GB/s, 85.69 GB/s, 105.64 GB/s, 127.34
-              GB/s, 148.22 GB/s, 171.52 GB/s]
-          daxpy: [30.98 GB/s, 62.27 GB/s, 93.13 GB/s, 123.27 GB/s, 153.64 GB/s, 185.97
-              GB/s, 216.67 GB/s, 247.41 GB/s]
-          load: [23.47 GB/s, 46.84 GB/s, 69.74 GB/s, 92.76 GB/s, 115.37 GB/s, 139.23
-              GB/s, 163.12 GB/s, 186.65 GB/s]
-          triad: [24.72 GB/s, 49.11 GB/s, 72.42 GB/s, 95.36 GB/s, 119.46 GB/s, 144.60
-              GB/s, 168.66 GB/s, 189.45 GB/s]
-          update: [31.39 GB/s, 62.11 GB/s, 91.95 GB/s, 122.24 GB/s, 151.40 GB/s, 182.28
-              GB/s, 216.07 GB/s, 239.92 GB/s]
-        size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
-            MB, 1.65 MB]
-        size per thread: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
-            MB, 1.65 MB]
-        stats:
-          copy:
-          - [21.64 GB/s, 20.85 GB/s, 20.56 GB/s, 21.69 GB/s, 21.06 GB/s, 21.46 GB/s,
-            21.93 GB/s, 21.73 GB/s, 21.83 GB/s, 21.69 GB/s]
-          - [42.86 GB/s, 42.70 GB/s, 42.72 GB/s, 38.47 GB/s, 42.82 GB/s, 43.10 GB/s,
-            42.66 GB/s, 42.44 GB/s, 42.61 GB/s, 42.48 GB/s]
-          - [64.95 GB/s, 64.34 GB/s, 63.93 GB/s, 65.38 GB/s, 64.36 GB/s, 63.60 GB/s,
-            62.65 GB/s, 63.66 GB/s, 63.51 GB/s, 63.75 GB/s]
-          - [84.07 GB/s, 83.97 GB/s, 83.34 GB/s, 83.91 GB/s, 81.16 GB/s, 85.69 GB/s,
-            85.40 GB/s, 85.37 GB/s, 85.42 GB/s, 84.48 GB/s]
-          - [102.83 GB/s, 104.24 GB/s, 105.42 GB/s, 103.68 GB/s, 105.22 GB/s, 105.64
-              GB/s, 103.15 GB/s, 102.02 GB/s, 100.60 GB/s, 105.09 GB/s]
-          - [125.46 GB/s, 122.23 GB/s, 123.56 GB/s, 124.59 GB/s, 127.03 GB/s, 125.39
-              GB/s, 124.50 GB/s, 127.02 GB/s, 126.95 GB/s, 127.34 GB/s]
-          - [147.99 GB/s, 146.65 GB/s, 139.23 GB/s, 147.69 GB/s, 146.42 GB/s, 145.65
-              GB/s, 148.22 GB/s, 143.77 GB/s, 147.96 GB/s, 147.70 GB/s]
-          - [168.36 GB/s, 168.24 GB/s, 164.99 GB/s, 165.32 GB/s, 167.08 GB/s, 165.98
-              GB/s, 165.39 GB/s, 165.84 GB/s, 166.15 GB/s, 171.52 GB/s]
-          daxpy:
-          - [30.92 GB/s, 30.74 GB/s, 30.87 GB/s, 30.98 GB/s, 30.45 GB/s, 29.62 GB/s,
-            29.54 GB/s, 30.04 GB/s, 30.94 GB/s, 30.93 GB/s]
-          - [61.96 GB/s, 61.38 GB/s, 61.27 GB/s, 62.27 GB/s, 61.36 GB/s, 61.27 GB/s,
-            62.06 GB/s, 60.01 GB/s, 61.49 GB/s, 62.16 GB/s]
-          - [92.26 GB/s, 93.06 GB/s, 88.45 GB/s, 92.18 GB/s, 93.13 GB/s, 92.11 GB/s,
-            92.28 GB/s, 92.28 GB/s, 93.03 GB/s, 92.78 GB/s]
-          - [123.22 GB/s, 123.06 GB/s, 123.27 GB/s, 119.42 GB/s, 122.94 GB/s, 122.54
-              GB/s, 123.24 GB/s, 115.90 GB/s, 121.65 GB/s, 122.47 GB/s]
-          - [151.70 GB/s, 145.65 GB/s, 149.53 GB/s, 152.52 GB/s, 153.64 GB/s, 152.93
-              GB/s, 152.81 GB/s, 153.01 GB/s, 153.04 GB/s, 152.06 GB/s]
-          - [184.04 GB/s, 171.51 GB/s, 184.83 GB/s, 184.09 GB/s, 185.97 GB/s, 183.75
-              GB/s, 184.66 GB/s, 182.54 GB/s, 184.39 GB/s, 184.40 GB/s]
-          - [198.70 GB/s, 216.51 GB/s, 216.17 GB/s, 203.10 GB/s, 211.40 GB/s, 215.04
-              GB/s, 215.48 GB/s, 216.03 GB/s, 216.24 GB/s, 216.67 GB/s]
-          - [246.02 GB/s, 247.35 GB/s, 245.00 GB/s, 244.65 GB/s, 229.12 GB/s, 243.37
-              GB/s, 247.22 GB/s, 247.41 GB/s, 246.03 GB/s, 244.83 GB/s]
-          load:
-          - [23.08 GB/s, 23.38 GB/s, 22.88 GB/s, 23.43 GB/s, 23.05 GB/s, 23.23 GB/s,
-            22.97 GB/s, 22.39 GB/s, 23.47 GB/s, 23.33 GB/s]
-          - [46.39 GB/s, 46.40 GB/s, 46.45 GB/s, 46.36 GB/s, 46.69 GB/s, 46.62 GB/s,
-            46.84 GB/s, 45.98 GB/s, 46.73 GB/s, 46.80 GB/s]
-          - [69.18 GB/s, 68.61 GB/s, 69.74 GB/s, 69.34 GB/s, 68.39 GB/s, 69.73 GB/s,
-            67.76 GB/s, 69.65 GB/s, 69.70 GB/s, 69.16 GB/s]
-          - [92.29 GB/s, 91.67 GB/s, 92.76 GB/s, 90.78 GB/s, 92.76 GB/s, 90.76 GB/s,
-            91.58 GB/s, 91.60 GB/s, 91.03 GB/s, 92.72 GB/s]
-          - [114.04 GB/s, 113.82 GB/s, 112.26 GB/s, 112.65 GB/s, 114.09 GB/s, 113.81
-              GB/s, 113.72 GB/s, 114.70 GB/s, 115.37 GB/s, 112.57 GB/s]
-          - [136.42 GB/s, 135.83 GB/s, 134.93 GB/s, 135.43 GB/s, 135.94 GB/s, 139.23
-              GB/s, 137.52 GB/s, 137.59 GB/s, 135.97 GB/s, 136.96 GB/s]
-          - [157.88 GB/s, 163.12 GB/s, 159.53 GB/s, 160.16 GB/s, 162.18 GB/s, 159.58
-              GB/s, 161.55 GB/s, 159.81 GB/s, 162.97 GB/s, 163.10 GB/s]
-          - [183.41 GB/s, 181.86 GB/s, 183.55 GB/s, 183.38 GB/s, 181.66 GB/s, 186.65
-              GB/s, 179.62 GB/s, 174.70 GB/s, 180.10 GB/s, 181.49 GB/s]
-          triad:
-          - [24.72 GB/s, 23.66 GB/s, 23.58 GB/s, 23.75 GB/s, 23.62 GB/s, 24.37 GB/s,
-            24.44 GB/s, 23.57 GB/s, 23.30 GB/s, 23.57 GB/s]
-          - [49.11 GB/s, 46.87 GB/s, 47.13 GB/s, 46.83 GB/s, 46.58 GB/s, 46.73 GB/s,
-            46.32 GB/s, 47.22 GB/s, 46.79 GB/s, 48.73 GB/s]
-          - [72.29 GB/s, 69.87 GB/s, 70.57 GB/s, 68.89 GB/s, 68.56 GB/s, 69.02 GB/s,
-            72.42 GB/s, 69.37 GB/s, 72.34 GB/s, 69.44 GB/s]
-          - [94.95 GB/s, 94.67 GB/s, 91.05 GB/s, 90.46 GB/s, 95.36 GB/s, 91.63 GB/s,
-            94.06 GB/s, 95.30 GB/s, 93.99 GB/s, 94.71 GB/s]
-          - [119.32 GB/s, 117.99 GB/s, 119.46 GB/s, 117.28 GB/s, 118.97 GB/s, 115.67
-              GB/s, 116.64 GB/s, 117.99 GB/s, 119.02 GB/s, 117.75 GB/s]
-          - [138.63 GB/s, 144.53 GB/s, 144.60 GB/s, 135.72 GB/s, 141.86 GB/s, 139.64
-              GB/s, 142.95 GB/s, 140.89 GB/s, 142.10 GB/s, 143.97 GB/s]
-          - [168.66 GB/s, 166.77 GB/s, 157.10 GB/s, 164.75 GB/s, 164.00 GB/s, 164.38
-              GB/s, 163.94 GB/s, 158.58 GB/s, 165.60 GB/s, 164.39 GB/s]
-          - [184.53 GB/s, 187.00 GB/s, 186.87 GB/s, 179.43 GB/s, 185.70 GB/s, 187.49
-              GB/s, 189.45 GB/s, 186.82 GB/s, 188.50 GB/s, 185.96 GB/s]
-          update:
-          - [30.60 GB/s, 31.20 GB/s, 30.65 GB/s, 31.39 GB/s, 30.89 GB/s, 30.75 GB/s,
-            30.58 GB/s, 30.99 GB/s, 30.69 GB/s, 31.34 GB/s]
-          - [60.99 GB/s, 62.11 GB/s, 61.42 GB/s, 61.55 GB/s, 61.79 GB/s, 61.24 GB/s,
-            61.37 GB/s, 61.74 GB/s, 61.45 GB/s, 61.58 GB/s]
-          - [91.11 GB/s, 91.21 GB/s, 91.95 GB/s, 91.19 GB/s, 91.14 GB/s, 91.36 GB/s,
-            91.30 GB/s, 91.70 GB/s, 90.84 GB/s, 91.09 GB/s]
-          - [120.90 GB/s, 120.49 GB/s, 121.35 GB/s, 122.24 GB/s, 120.37 GB/s, 119.83
-              GB/s, 119.32 GB/s, 119.48 GB/s, 119.11 GB/s, 119.76 GB/s]
-          - [146.72 GB/s, 147.18 GB/s, 147.81 GB/s, 151.40 GB/s, 147.81 GB/s, 146.84
-              GB/s, 147.51 GB/s, 148.15 GB/s, 146.89 GB/s, 148.41 GB/s]
-          - [179.93 GB/s, 179.68 GB/s, 182.28 GB/s, 179.65 GB/s, 179.06 GB/s, 182.25
-              GB/s, 182.03 GB/s, 179.10 GB/s, 178.82 GB/s, 177.84 GB/s]
-          - [208.84 GB/s, 210.17 GB/s, 210.20 GB/s, 210.81 GB/s, 209.88 GB/s, 211.16
-              GB/s, 216.07 GB/s, 211.77 GB/s, 208.89 GB/s, 210.47 GB/s]
-          - [236.56 GB/s, 239.05 GB/s, 237.81 GB/s, 237.20 GB/s, 238.68 GB/s, 237.69
-              GB/s, 239.05 GB/s, 239.38 GB/s, 239.92 GB/s, 238.63 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
-            MB, 13.20 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [23.35 GB/s, 45.63 GB/s, 68.10 GB/s, 89.46 GB/s, 111.10 GB/s, 134.20
-              GB/s, 154.44 GB/s, 174.89 GB/s]
-          daxpy: [32.32 GB/s, 64.16 GB/s, 96.12 GB/s, 126.75 GB/s, 156.91 GB/s, 188.57
-              GB/s, 221.57 GB/s, 251.65 GB/s]
-          load: [25.14 GB/s, 50.38 GB/s, 75.49 GB/s, 101.06 GB/s, 126.04 GB/s, 151.12
-              GB/s, 172.57 GB/s, 196.91 GB/s]
-          triad: [25.15 GB/s, 50.37 GB/s, 75.31 GB/s, 99.12 GB/s, 123.25 GB/s, 150.29
-              GB/s, 171.60 GB/s, 197.81 GB/s]
-          update: [32.98 GB/s, 65.60 GB/s, 97.60 GB/s, 130.34 GB/s, 162.76 GB/s, 194.12
-              GB/s, 229.02 GB/s, 260.35 GB/s]
-        size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
-            MB, 1.65 MB]
-        size per thread: [6.60 MB, 3.30 MB, 2.20 MB, 1.65 MB, 1.32 MB, 1.10 MB, 0.94
-            MB, 825.00 kB]
-        stats:
-          copy:
-          - [22.79 GB/s, 22.55 GB/s, 22.86 GB/s, 22.74 GB/s, 23.09 GB/s, 22.51 GB/s,
-            23.35 GB/s, 23.32 GB/s, 23.02 GB/s, 22.75 GB/s]
-          - [45.32 GB/s, 45.15 GB/s, 45.63 GB/s, 44.84 GB/s, 44.54 GB/s, 44.33 GB/s,
-            44.68 GB/s, 44.98 GB/s, 44.64 GB/s, 44.75 GB/s]
-          - [68.10 GB/s, 67.88 GB/s, 67.98 GB/s, 67.32 GB/s, 67.02 GB/s, 67.14 GB/s,
-            67.71 GB/s, 67.19 GB/s, 63.08 GB/s, 68.04 GB/s]
-          - [89.46 GB/s, 88.53 GB/s, 88.51 GB/s, 89.13 GB/s, 89.32 GB/s, 84.53 GB/s,
-            87.51 GB/s, 88.95 GB/s, 88.91 GB/s, 87.62 GB/s]
-          - [108.72 GB/s, 110.42 GB/s, 106.02 GB/s, 111.08 GB/s, 110.70 GB/s, 111.10
-              GB/s, 110.24 GB/s, 109.68 GB/s, 109.55 GB/s, 108.86 GB/s]
-          - [133.21 GB/s, 127.37 GB/s, 132.83 GB/s, 132.67 GB/s, 133.02 GB/s, 132.65
-              GB/s, 134.20 GB/s, 132.96 GB/s, 118.86 GB/s, 131.20 GB/s]
-          - [152.95 GB/s, 153.90 GB/s, 153.80 GB/s, 153.22 GB/s, 153.32 GB/s, 142.75
-              GB/s, 152.99 GB/s, 154.44 GB/s, 154.43 GB/s, 152.24 GB/s]
-          - [174.89 GB/s, 171.49 GB/s, 157.46 GB/s, 172.90 GB/s, 173.42 GB/s, 171.07
-              GB/s, 171.82 GB/s, 170.68 GB/s, 172.19 GB/s, 161.38 GB/s]
-          daxpy:
-          - [31.88 GB/s, 32.27 GB/s, 31.11 GB/s, 32.20 GB/s, 32.17 GB/s, 32.32 GB/s,
-            32.20 GB/s, 32.32 GB/s, 30.76 GB/s, 32.03 GB/s]
-          - [64.16 GB/s, 63.70 GB/s, 64.04 GB/s, 63.55 GB/s, 60.64 GB/s, 64.05 GB/s,
-            63.56 GB/s, 63.36 GB/s, 63.94 GB/s, 63.86 GB/s]
-          - [96.12 GB/s, 95.66 GB/s, 95.93 GB/s, 95.93 GB/s, 96.10 GB/s, 95.94 GB/s,
-            95.78 GB/s, 95.79 GB/s, 95.17 GB/s, 89.44 GB/s]
-          - [126.04 GB/s, 126.43 GB/s, 126.09 GB/s, 124.90 GB/s, 125.07 GB/s, 125.74
-              GB/s, 118.86 GB/s, 125.80 GB/s, 125.10 GB/s, 126.75 GB/s]
-          - [155.92 GB/s, 155.99 GB/s, 156.32 GB/s, 151.54 GB/s, 156.49 GB/s, 156.91
-              GB/s, 154.92 GB/s, 155.92 GB/s, 156.20 GB/s, 154.49 GB/s]
-          - [185.57 GB/s, 180.38 GB/s, 187.51 GB/s, 187.10 GB/s, 186.44 GB/s, 187.13
-              GB/s, 187.31 GB/s, 188.10 GB/s, 187.91 GB/s, 188.57 GB/s]
-          - [207.55 GB/s, 219.63 GB/s, 219.38 GB/s, 219.81 GB/s, 220.29 GB/s, 219.72
-              GB/s, 221.05 GB/s, 216.76 GB/s, 221.57 GB/s, 220.75 GB/s]
-          - [250.81 GB/s, 250.78 GB/s, 251.19 GB/s, 251.28 GB/s, 249.10 GB/s, 250.42
-              GB/s, 251.65 GB/s, 244.31 GB/s, 250.40 GB/s, 250.19 GB/s]
-          load:
-          - [24.84 GB/s, 24.86 GB/s, 25.09 GB/s, 25.04 GB/s, 24.74 GB/s, 24.87 GB/s,
-            25.01 GB/s, 25.08 GB/s, 25.14 GB/s, 25.00 GB/s]
-          - [50.03 GB/s, 49.40 GB/s, 50.28 GB/s, 50.08 GB/s, 50.37 GB/s, 49.75 GB/s,
-            50.01 GB/s, 50.38 GB/s, 49.89 GB/s, 50.24 GB/s]
-          - [74.37 GB/s, 74.65 GB/s, 74.40 GB/s, 73.45 GB/s, 73.31 GB/s, 73.00 GB/s,
-            75.49 GB/s, 73.94 GB/s, 74.42 GB/s, 74.80 GB/s]
-          - [99.51 GB/s, 99.43 GB/s, 98.90 GB/s, 99.83 GB/s, 98.74 GB/s, 100.75 GB/s,
-            99.33 GB/s, 99.81 GB/s, 100.00 GB/s, 101.06 GB/s]
-          - [126.04 GB/s, 126.03 GB/s, 124.70 GB/s, 124.86 GB/s, 125.31 GB/s, 124.78
-              GB/s, 125.99 GB/s, 123.52 GB/s, 124.45 GB/s, 123.01 GB/s]
-          - [146.95 GB/s, 150.27 GB/s, 151.12 GB/s, 150.93 GB/s, 150.68 GB/s, 149.75
-              GB/s, 150.67 GB/s, 146.01 GB/s, 148.34 GB/s, 149.15 GB/s]
-          - [169.40 GB/s, 172.12 GB/s, 172.40 GB/s, 171.99 GB/s, 172.57 GB/s, 171.95
-              GB/s, 167.06 GB/s, 169.66 GB/s, 168.34 GB/s, 169.45 GB/s]
-          - [192.68 GB/s, 191.98 GB/s, 192.82 GB/s, 191.84 GB/s, 191.97 GB/s, 196.91
-              GB/s, 193.36 GB/s, 190.12 GB/s, 192.04 GB/s, 193.93 GB/s]
-          triad:
-          - [24.78 GB/s, 25.03 GB/s, 25.07 GB/s, 24.81 GB/s, 24.65 GB/s, 24.80 GB/s,
-            24.71 GB/s, 25.15 GB/s, 24.70 GB/s, 24.25 GB/s]
-          - [49.63 GB/s, 48.68 GB/s, 49.73 GB/s, 49.97 GB/s, 50.37 GB/s, 49.89 GB/s,
-            49.59 GB/s, 49.00 GB/s, 49.96 GB/s, 49.61 GB/s]
-          - [74.88 GB/s, 74.99 GB/s, 75.31 GB/s, 73.20 GB/s, 74.50 GB/s, 72.88 GB/s,
-            73.43 GB/s, 73.74 GB/s, 74.59 GB/s, 74.60 GB/s]
-          - [95.80 GB/s, 97.67 GB/s, 98.93 GB/s, 97.79 GB/s, 98.74 GB/s, 97.74 GB/s,
-            98.87 GB/s, 99.12 GB/s, 97.90 GB/s, 97.96 GB/s]
-          - [121.15 GB/s, 120.28 GB/s, 120.66 GB/s, 121.19 GB/s, 121.09 GB/s, 121.68
-              GB/s, 121.30 GB/s, 123.22 GB/s, 122.51 GB/s, 123.25 GB/s]
-          - [146.72 GB/s, 146.38 GB/s, 146.25 GB/s, 146.49 GB/s, 146.29 GB/s, 144.30
-              GB/s, 142.89 GB/s, 150.29 GB/s, 146.37 GB/s, 146.30 GB/s]
-          - [166.36 GB/s, 168.18 GB/s, 168.79 GB/s, 170.27 GB/s, 169.26 GB/s, 170.98
-              GB/s, 170.77 GB/s, 171.43 GB/s, 169.53 GB/s, 171.60 GB/s]
-          - [190.83 GB/s, 197.81 GB/s, 196.29 GB/s, 197.12 GB/s, 196.21 GB/s, 188.40
-              GB/s, 191.07 GB/s, 195.14 GB/s, 192.48 GB/s, 194.23 GB/s]
-          update:
-          - [32.74 GB/s, 32.98 GB/s, 32.73 GB/s, 32.57 GB/s, 32.63 GB/s, 32.41 GB/s,
-            32.61 GB/s, 32.24 GB/s, 32.52 GB/s, 32.49 GB/s]
-          - [65.22 GB/s, 65.07 GB/s, 64.65 GB/s, 65.26 GB/s, 63.70 GB/s, 64.19 GB/s,
-            64.35 GB/s, 64.83 GB/s, 65.60 GB/s, 63.99 GB/s]
-          - [97.60 GB/s, 96.65 GB/s, 97.50 GB/s, 96.07 GB/s, 97.12 GB/s, 96.41 GB/s,
-            96.85 GB/s, 96.80 GB/s, 97.10 GB/s, 97.10 GB/s]
-          - [129.18 GB/s, 127.79 GB/s, 129.50 GB/s, 129.46 GB/s, 128.85 GB/s, 128.69
-              GB/s, 129.02 GB/s, 130.34 GB/s, 129.92 GB/s, 129.11 GB/s]
-          - [160.00 GB/s, 161.81 GB/s, 160.37 GB/s, 159.56 GB/s, 160.38 GB/s, 161.91
-              GB/s, 160.54 GB/s, 161.43 GB/s, 160.59 GB/s, 162.76 GB/s]
-          - [192.24 GB/s, 193.69 GB/s, 191.11 GB/s, 190.65 GB/s, 193.10 GB/s, 191.30
-              GB/s, 192.50 GB/s, 193.37 GB/s, 191.98 GB/s, 194.12 GB/s]
-          - [221.45 GB/s, 229.02 GB/s, 226.33 GB/s, 224.81 GB/s, 225.62 GB/s, 224.79
-              GB/s, 226.03 GB/s, 227.09 GB/s, 226.46 GB/s, 225.88 GB/s]
-          - [255.45 GB/s, 256.52 GB/s, 254.06 GB/s, 257.76 GB/s, 256.85 GB/s, 256.27
-              GB/s, 260.35 GB/s, 259.96 GB/s, 258.40 GB/s, 255.79 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
-            MB, 13.20 MB]
-    MEM:
-      1:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [11.12 GB/s, 20.53 GB/s, 24.86 GB/s, 26.20 GB/s, 26.47 GB/s, 26.35
-              GB/s, 26.24 GB/s, 26.17 GB/s]
-          daxpy: [16.10 GB/s, 30.00 GB/s, 36.88 GB/s, 38.86 GB/s, 39.36 GB/s, 39.19
-              GB/s, 39.02 GB/s, 38.88 GB/s]
-          load: [12.30 GB/s, 23.50 GB/s, 33.04 GB/s, 40.59 GB/s, 44.03 GB/s, 44.56
-              GB/s, 44.26 GB/s, 43.77 GB/s]
-          triad: [12.41 GB/s, 24.13 GB/s, 29.24 GB/s, 30.73 GB/s, 30.68 GB/s, 30.58
-              GB/s, 30.54 GB/s, 30.63 GB/s]
-          update: [17.40 GB/s, 31.16 GB/s, 36.80 GB/s, 39.06 GB/s, 39.80 GB/s, 39.77
-              GB/s, 39.50 GB/s, 39.24 GB/s]
-        size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
-            MB, 42.86 MB, 37.50 MB]
-        size per thread: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
-            MB, 42.86 MB, 37.50 MB]
-        stats:
-          copy:
-          - [10.83 GB/s, 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 10.82 GB/s, 10.82 GB/s,
-            10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 11.12 GB/s]
-          - [20.34 GB/s, 20.38 GB/s, 20.37 GB/s, 20.34 GB/s, 20.41 GB/s, 20.39 GB/s,
-            20.39 GB/s, 20.39 GB/s, 20.53 GB/s, 20.35 GB/s]
-          - [24.70 GB/s, 24.76 GB/s, 24.80 GB/s, 24.86 GB/s, 24.75 GB/s, 24.80 GB/s,
-            24.77 GB/s, 24.82 GB/s, 24.81 GB/s, 24.73 GB/s]
-          - [26.10 GB/s, 26.16 GB/s, 26.14 GB/s, 26.16 GB/s, 26.10 GB/s, 26.15 GB/s,
-            26.10 GB/s, 26.15 GB/s, 26.11 GB/s, 26.20 GB/s]
-          - [26.45 GB/s, 26.44 GB/s, 26.41 GB/s, 26.43 GB/s, 26.45 GB/s, 26.44 GB/s,
-            26.46 GB/s, 26.47 GB/s, 26.45 GB/s, 26.44 GB/s]
-          - [26.34 GB/s, 26.30 GB/s, 26.31 GB/s, 26.33 GB/s, 26.26 GB/s, 26.35 GB/s,
-            26.30 GB/s, 26.30 GB/s, 26.30 GB/s, 26.34 GB/s]
-          - [26.20 GB/s, 26.24 GB/s, 26.21 GB/s, 26.22 GB/s, 26.22 GB/s, 26.20 GB/s,
-            26.20 GB/s, 26.23 GB/s, 26.22 GB/s, 26.23 GB/s]
-          - [26.15 GB/s, 26.17 GB/s, 26.12 GB/s, 26.15 GB/s, 26.15 GB/s, 26.15 GB/s,
-            26.12 GB/s, 26.14 GB/s, 26.14 GB/s, 26.17 GB/s]
-          daxpy:
-          - [15.77 GB/s, 15.77 GB/s, 16.04 GB/s, 15.68 GB/s, 15.72 GB/s, 15.76 GB/s,
-            15.91 GB/s, 15.77 GB/s, 16.10 GB/s, 16.04 GB/s]
-          - [29.88 GB/s, 29.80 GB/s, 30.00 GB/s, 29.87 GB/s, 29.87 GB/s, 30.00 GB/s,
-            29.79 GB/s, 29.80 GB/s, 29.80 GB/s, 29.82 GB/s]
-          - [36.63 GB/s, 36.73 GB/s, 36.64 GB/s, 36.64 GB/s, 36.81 GB/s, 36.88 GB/s,
-            36.62 GB/s, 36.65 GB/s, 36.74 GB/s, 36.71 GB/s]
-          - [38.82 GB/s, 38.83 GB/s, 38.86 GB/s, 38.81 GB/s, 38.81 GB/s, 38.82 GB/s,
-            38.85 GB/s, 38.80 GB/s, 38.84 GB/s, 38.73 GB/s]
-          - [39.32 GB/s, 39.30 GB/s, 39.34 GB/s, 39.36 GB/s, 39.28 GB/s, 39.33 GB/s,
-            39.31 GB/s, 39.25 GB/s, 39.32 GB/s, 39.33 GB/s]
-          - [39.10 GB/s, 39.12 GB/s, 39.14 GB/s, 39.16 GB/s, 39.17 GB/s, 39.17 GB/s,
-            39.13 GB/s, 39.15 GB/s, 39.14 GB/s, 39.19 GB/s]
-          - [39.01 GB/s, 39.01 GB/s, 39.02 GB/s, 39.02 GB/s, 39.00 GB/s, 39.00 GB/s,
-            38.97 GB/s, 39.02 GB/s, 38.98 GB/s, 39.01 GB/s]
-          - [38.76 GB/s, 38.86 GB/s, 38.83 GB/s, 38.82 GB/s, 38.87 GB/s, 38.88 GB/s,
-            38.81 GB/s, 38.83 GB/s, 38.88 GB/s, 38.88 GB/s]
-          load:
-          - [11.97 GB/s, 11.96 GB/s, 11.98 GB/s, 11.97 GB/s, 11.96 GB/s, 12.05 GB/s,
-            12.30 GB/s, 12.18 GB/s, 11.97 GB/s, 11.96 GB/s]
-          - [22.85 GB/s, 22.85 GB/s, 22.87 GB/s, 22.94 GB/s, 23.50 GB/s, 22.86 GB/s,
-            22.86 GB/s, 23.25 GB/s, 22.85 GB/s, 22.86 GB/s]
-          - [33.04 GB/s, 32.43 GB/s, 32.51 GB/s, 32.52 GB/s, 32.52 GB/s, 32.81 GB/s,
-            32.77 GB/s, 32.54 GB/s, 32.53 GB/s, 32.53 GB/s]
-          - [39.95 GB/s, 39.94 GB/s, 39.93 GB/s, 40.15 GB/s, 40.59 GB/s, 40.36 GB/s,
-            40.28 GB/s, 39.93 GB/s, 39.94 GB/s, 39.98 GB/s]
-          - [43.98 GB/s, 43.86 GB/s, 43.90 GB/s, 43.80 GB/s, 43.83 GB/s, 43.86 GB/s,
-            44.03 GB/s, 43.94 GB/s, 43.83 GB/s, 43.92 GB/s]
-          - [44.46 GB/s, 44.34 GB/s, 44.56 GB/s, 44.51 GB/s, 44.32 GB/s, 44.32 GB/s,
-            44.51 GB/s, 44.48 GB/s, 44.32 GB/s, 44.34 GB/s]
-          - [44.03 GB/s, 44.26 GB/s, 44.08 GB/s, 44.18 GB/s, 44.10 GB/s, 43.99 GB/s,
-            44.07 GB/s, 44.06 GB/s, 43.94 GB/s, 43.97 GB/s]
-          - [43.48 GB/s, 43.77 GB/s, 43.51 GB/s, 43.49 GB/s, 43.47 GB/s, 43.73 GB/s,
-            43.55 GB/s, 43.68 GB/s, 43.49 GB/s, 43.50 GB/s]
-          triad:
-          - [12.11 GB/s, 12.02 GB/s, 12.03 GB/s, 12.10 GB/s, 12.03 GB/s, 12.04 GB/s,
-            12.05 GB/s, 12.17 GB/s, 12.02 GB/s, 12.41 GB/s]
-          - [23.43 GB/s, 23.25 GB/s, 23.25 GB/s, 23.36 GB/s, 23.28 GB/s, 23.24 GB/s,
-            23.61 GB/s, 23.29 GB/s, 23.31 GB/s, 24.13 GB/s]
-          - [28.92 GB/s, 29.10 GB/s, 29.17 GB/s, 29.04 GB/s, 28.91 GB/s, 29.16 GB/s,
-            28.82 GB/s, 29.01 GB/s, 29.24 GB/s, 28.88 GB/s]
-          - [30.65 GB/s, 30.62 GB/s, 30.73 GB/s, 30.59 GB/s, 30.69 GB/s, 30.68 GB/s,
-            30.59 GB/s, 30.59 GB/s, 30.57 GB/s, 30.67 GB/s]
-          - [30.53 GB/s, 30.67 GB/s, 30.65 GB/s, 30.53 GB/s, 30.63 GB/s, 30.68 GB/s,
-            30.50 GB/s, 30.67 GB/s, 30.64 GB/s, 30.67 GB/s]
-          - [30.45 GB/s, 30.58 GB/s, 30.51 GB/s, 30.49 GB/s, 30.52 GB/s, 30.49 GB/s,
-            30.56 GB/s, 30.55 GB/s, 30.47 GB/s, 30.47 GB/s]
-          - [30.51 GB/s, 30.47 GB/s, 30.50 GB/s, 30.47 GB/s, 30.52 GB/s, 30.54 GB/s,
-            30.54 GB/s, 30.50 GB/s, 30.49 GB/s, 30.50 GB/s]
-          - [30.58 GB/s, 30.34 GB/s, 30.56 GB/s, 30.54 GB/s, 30.63 GB/s, 30.53 GB/s,
-            30.59 GB/s, 30.50 GB/s, 30.54 GB/s, 30.47 GB/s]
-          update:
-          - [17.33 GB/s, 17.32 GB/s, 17.34 GB/s, 17.35 GB/s, 17.40 GB/s, 17.35 GB/s,
-            17.36 GB/s, 17.39 GB/s, 17.35 GB/s, 17.35 GB/s]
-          - [31.12 GB/s, 31.15 GB/s, 31.10 GB/s, 31.16 GB/s, 31.07 GB/s, 31.08 GB/s,
-            31.09 GB/s, 31.12 GB/s, 31.12 GB/s, 31.08 GB/s]
-          - [36.80 GB/s, 36.42 GB/s, 35.92 GB/s, 36.39 GB/s, 35.99 GB/s, 35.98 GB/s,
-            36.37 GB/s, 36.39 GB/s, 36.38 GB/s, 36.44 GB/s]
-          - [39.03 GB/s, 39.05 GB/s, 39.02 GB/s, 39.06 GB/s, 39.01 GB/s, 39.02 GB/s,
-            39.02 GB/s, 39.00 GB/s, 39.00 GB/s, 39.00 GB/s]
-          - [39.76 GB/s, 39.80 GB/s, 39.80 GB/s, 39.78 GB/s, 39.76 GB/s, 39.79 GB/s,
-            39.79 GB/s, 39.77 GB/s, 39.77 GB/s, 39.71 GB/s]
-          - [39.71 GB/s, 39.72 GB/s, 39.72 GB/s, 39.66 GB/s, 39.74 GB/s, 39.70 GB/s,
-            39.76 GB/s, 39.74 GB/s, 39.77 GB/s, 39.74 GB/s]
-          - [39.50 GB/s, 39.47 GB/s, 39.45 GB/s, 39.43 GB/s, 39.46 GB/s, 39.45 GB/s,
-            39.45 GB/s, 39.40 GB/s, 39.43 GB/s, 39.47 GB/s]
-          - [39.21 GB/s, 39.18 GB/s, 39.19 GB/s, 39.19 GB/s, 39.21 GB/s, 39.19 GB/s,
-            39.18 GB/s, 39.21 GB/s, 39.20 GB/s, 39.24 GB/s]
-        threads: [1, 2, 3, 4, 5, 6, 7, 8]
-        threads per core: 1
-        total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
-            MB, 300.00 MB, 300.00 MB]
-      2:
-        cores: [1, 2, 3, 4, 5, 6, 7, 8]
-        results:
-          copy: [10.79 GB/s, 20.46 GB/s, 24.69 GB/s, 25.42 GB/s, 25.63 GB/s, 25.45
-              GB/s, 25.32 GB/s, 25.06 GB/s]
-          daxpy: [15.97 GB/s, 29.70 GB/s, 35.95 GB/s, 37.55 GB/s, 37.81 GB/s, 37.78
-              GB/s, 37.64 GB/s, 37.33 GB/s]
-          load: [13.46 GB/s, 25.84 GB/s, 35.75 GB/s, 40.54 GB/s, 42.38 GB/s, 42.30
-              GB/s, 41.85 GB/s, 41.19 GB/s]
-          triad: [12.05 GB/s, 22.53 GB/s, 27.53 GB/s, 29.10 GB/s, 29.68 GB/s, 29.79
-              GB/s, 29.85 GB/s, 29.64 GB/s]
-          update: [19.12 GB/s, 33.86 GB/s, 38.51 GB/s, 39.38 GB/s, 39.20 GB/s, 38.80
-              GB/s, 38.39 GB/s, 38.02 GB/s]
-        size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
-            MB, 42.86 MB, 37.50 MB]
-        size per thread: [150.00 MB, 75.00 MB, 50.00 MB, 37.50 MB, 30.00 MB, 25.00
-            MB, 21.43 MB, 18.75 MB]
-        stats:
-          copy:
-          - [10.71 GB/s, 10.69 GB/s, 10.71 GB/s, 10.70 GB/s, 10.79 GB/s, 10.58 GB/s,
-            10.70 GB/s, 10.69 GB/s, 10.69 GB/s, 10.70 GB/s]
-          - [20.27 GB/s, 20.31 GB/s, 20.27 GB/s, 20.26 GB/s, 20.31 GB/s, 20.26 GB/s,
-            20.24 GB/s, 20.26 GB/s, 20.26 GB/s, 20.46 GB/s]
-          - [24.69 GB/s, 24.66 GB/s, 24.64 GB/s, 24.63 GB/s, 24.67 GB/s, 24.64 GB/s,
-            24.64 GB/s, 24.68 GB/s, 24.61 GB/s, 24.63 GB/s]
-          - [25.42 GB/s, 25.41 GB/s, 25.40 GB/s, 25.36 GB/s, 25.40 GB/s, 25.39 GB/s,
-            25.40 GB/s, 25.38 GB/s, 25.41 GB/s, 25.39 GB/s]
-          - [25.55 GB/s, 25.57 GB/s, 25.58 GB/s, 25.63 GB/s, 25.57 GB/s, 25.57 GB/s,
-            25.58 GB/s, 25.55 GB/s, 25.57 GB/s, 25.49 GB/s]
-          - [25.42 GB/s, 25.42 GB/s, 25.41 GB/s, 25.39 GB/s, 25.40 GB/s, 25.43 GB/s,
-            25.45 GB/s, 25.44 GB/s, 25.43 GB/s, 25.43 GB/s]
-          - [25.27 GB/s, 25.31 GB/s, 25.28 GB/s, 25.31 GB/s, 25.32 GB/s, 25.31 GB/s,
-            25.29 GB/s, 25.30 GB/s, 25.25 GB/s, 25.28 GB/s]
-          - [25.03 GB/s, 25.01 GB/s, 25.01 GB/s, 25.04 GB/s, 25.00 GB/s, 25.03 GB/s,
-            25.06 GB/s, 25.04 GB/s, 25.04 GB/s, 25.04 GB/s]
-          daxpy:
-          - [15.81 GB/s, 15.81 GB/s, 15.97 GB/s, 15.62 GB/s, 15.64 GB/s, 15.83 GB/s,
-            15.63 GB/s, 15.82 GB/s, 15.81 GB/s, 15.63 GB/s]
-          - [29.62 GB/s, 29.56 GB/s, 29.61 GB/s, 29.59 GB/s, 29.70 GB/s, 29.61 GB/s,
-            29.65 GB/s, 29.65 GB/s, 29.58 GB/s, 29.59 GB/s]
-          - [35.95 GB/s, 35.89 GB/s, 35.92 GB/s, 35.92 GB/s, 35.95 GB/s, 35.90 GB/s,
-            35.87 GB/s, 35.90 GB/s, 35.92 GB/s, 35.82 GB/s]
-          - [37.55 GB/s, 37.46 GB/s, 37.52 GB/s, 37.51 GB/s, 37.55 GB/s, 37.51 GB/s,
-            37.44 GB/s, 37.41 GB/s, 37.50 GB/s, 37.40 GB/s]
-          - [37.79 GB/s, 37.76 GB/s, 37.80 GB/s, 37.77 GB/s, 37.76 GB/s, 37.81 GB/s,
-            37.78 GB/s, 37.81 GB/s, 37.79 GB/s, 37.78 GB/s]
-          - [37.71 GB/s, 37.68 GB/s, 37.68 GB/s, 37.73 GB/s, 37.74 GB/s, 37.66 GB/s,
-            37.78 GB/s, 37.74 GB/s, 37.71 GB/s, 37.70 GB/s]
-          - [37.61 GB/s, 37.60 GB/s, 37.61 GB/s, 37.62 GB/s, 37.64 GB/s, 37.61 GB/s,
-            37.60 GB/s, 37.59 GB/s, 37.63 GB/s, 37.60 GB/s]
-          - [37.23 GB/s, 37.21 GB/s, 37.26 GB/s, 37.27 GB/s, 37.28 GB/s, 37.33 GB/s,
-            37.29 GB/s, 37.31 GB/s, 37.26 GB/s, 37.29 GB/s]
-          load:
-          - [13.34 GB/s, 13.36 GB/s, 13.35 GB/s, 13.34 GB/s, 13.35 GB/s, 13.38 GB/s,
-            13.46 GB/s, 13.35 GB/s, 13.35 GB/s, 13.35 GB/s]
-          - [25.63 GB/s, 25.64 GB/s, 25.84 GB/s, 25.64 GB/s, 25.74 GB/s, 25.63 GB/s,
-            25.64 GB/s, 25.63 GB/s, 25.64 GB/s, 25.68 GB/s]
-          - [35.38 GB/s, 35.56 GB/s, 35.50 GB/s, 35.75 GB/s, 35.50 GB/s, 35.39 GB/s,
-            35.46 GB/s, 35.39 GB/s, 35.75 GB/s, 35.40 GB/s]
-          - [40.37 GB/s, 40.37 GB/s, 40.49 GB/s, 40.49 GB/s, 40.42 GB/s, 40.37 GB/s,
-            40.54 GB/s, 40.39 GB/s, 40.37 GB/s, 40.51 GB/s]
-          - [42.34 GB/s, 42.14 GB/s, 42.26 GB/s, 42.17 GB/s, 42.10 GB/s, 42.13 GB/s,
-            42.38 GB/s, 42.13 GB/s, 42.21 GB/s, 42.15 GB/s]
-          - [42.30 GB/s, 42.13 GB/s, 42.20 GB/s, 42.11 GB/s, 42.12 GB/s, 42.12 GB/s,
-            42.18 GB/s, 42.25 GB/s, 42.19 GB/s, 42.21 GB/s]
-          - [41.70 GB/s, 41.76 GB/s, 41.85 GB/s, 41.80 GB/s, 41.71 GB/s, 41.71 GB/s,
-            41.80 GB/s, 41.70 GB/s, 41.76 GB/s, 41.75 GB/s]
-          - [41.02 GB/s, 41.01 GB/s, 41.17 GB/s, 41.12 GB/s, 41.13 GB/s, 41.15 GB/s,
-            41.19 GB/s, 41.01 GB/s, 41.10 GB/s, 41.06 GB/s]
-          triad:
-          - [11.87 GB/s, 11.89 GB/s, 11.91 GB/s, 11.81 GB/s, 11.83 GB/s, 11.85 GB/s,
-            11.90 GB/s, 11.80 GB/s, 11.85 GB/s, 12.05 GB/s]
-          - [22.53 GB/s, 22.47 GB/s, 22.44 GB/s, 22.46 GB/s, 22.43 GB/s, 22.52 GB/s,
-            22.41 GB/s, 22.52 GB/s, 22.48 GB/s, 22.41 GB/s]
-          - [27.43 GB/s, 27.42 GB/s, 27.47 GB/s, 27.47 GB/s, 27.52 GB/s, 27.49 GB/s,
-            27.41 GB/s, 27.42 GB/s, 27.51 GB/s, 27.53 GB/s]
-          - [29.02 GB/s, 29.03 GB/s, 29.03 GB/s, 29.04 GB/s, 28.89 GB/s, 29.10 GB/s,
-            29.02 GB/s, 29.05 GB/s, 28.93 GB/s, 29.01 GB/s]
-          - [29.66 GB/s, 29.68 GB/s, 29.60 GB/s, 29.62 GB/s, 29.60 GB/s, 29.67 GB/s,
-            29.66 GB/s, 29.62 GB/s, 29.62 GB/s, 29.62 GB/s]
-          - [29.78 GB/s, 29.76 GB/s, 29.77 GB/s, 29.77 GB/s, 29.75 GB/s, 29.79 GB/s,
-            29.75 GB/s, 29.77 GB/s, 29.76 GB/s, 29.78 GB/s]
-          - [29.82 GB/s, 29.85 GB/s, 29.85 GB/s, 29.83 GB/s, 29.82 GB/s, 29.83 GB/s,
-            29.83 GB/s, 29.81 GB/s, 29.81 GB/s, 29.80 GB/s]
-          - [29.54 GB/s, 29.63 GB/s, 29.57 GB/s, 29.56 GB/s, 29.55 GB/s, 29.64 GB/s,
-            29.60 GB/s, 29.53 GB/s, 29.54 GB/s, 29.57 GB/s]
-          update:
-          - [18.66 GB/s, 18.67 GB/s, 18.66 GB/s, 19.12 GB/s, 18.67 GB/s, 18.67 GB/s,
-            18.67 GB/s, 18.67 GB/s, 18.70 GB/s, 18.67 GB/s]
-          - [33.61 GB/s, 33.34 GB/s, 33.71 GB/s, 33.31 GB/s, 33.34 GB/s, 33.86 GB/s,
-            33.62 GB/s, 33.35 GB/s, 33.54 GB/s, 33.34 GB/s]
-          - [38.51 GB/s, 38.46 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s, 38.46 GB/s,
-            38.41 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s]
-          - [39.37 GB/s, 39.34 GB/s, 39.36 GB/s, 39.35 GB/s, 39.37 GB/s, 39.38 GB/s,
-            39.36 GB/s, 39.35 GB/s, 39.31 GB/s, 39.32 GB/s]
-          - [39.17 GB/s, 39.17 GB/s, 39.16 GB/s, 39.20 GB/s, 39.18 GB/s, 39.17 GB/s,
-            39.18 GB/s, 39.15 GB/s, 39.20 GB/s, 39.17 GB/s]
-          - [38.79 GB/s, 38.79 GB/s, 38.80 GB/s, 38.78 GB/s, 38.78 GB/s, 38.75 GB/s,
-            38.80 GB/s, 38.77 GB/s, 38.78 GB/s, 38.78 GB/s]
-          - [38.36 GB/s, 38.37 GB/s, 38.37 GB/s, 38.39 GB/s, 38.36 GB/s, 38.37 GB/s,
-            38.38 GB/s, 38.37 GB/s, 38.35 GB/s, 38.39 GB/s]
-          - [37.98 GB/s, 37.99 GB/s, 38.02 GB/s, 38.01 GB/s, 38.01 GB/s, 38.00 GB/s,
-            38.02 GB/s, 38.00 GB/s, 38.02 GB/s, 38.02 GB/s]
-        threads: [2, 4, 6, 8, 10, 12, 14, 16]
-        threads per core: 2
-        total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
-            MB, 300.00 MB, 300.00 MB]
diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py
deleted file mode 100644
index 5f3757ba6..000000000
--- a/pystencils_tests/test_kerncraft_coupling.py
+++ /dev/null
@@ -1,197 +0,0 @@
-import numpy as np
-import pytest
-import sympy as sp
-from pathlib import Path
-
-from kerncraft.kernel import KernelCode
-from kerncraft.machinemodel import MachineModel
-from kerncraft.models import ECM, ECMData, Benchmark
-
-import pystencils as ps
-from pystencils import Assignment, Field
-from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
-from pystencils.cpu import create_kernel
-from pystencils.datahandling import create_data_handling
-from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
-from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
-from pystencils.timeloop import TimeLoop
-
-SCRIPT_FOLDER = Path(__file__).parent
-INPUT_FOLDER = SCRIPT_FOLDER / "kerncraft_inputs"
-
-
-@pytest.mark.kerncraft
-def test_compilation():
-    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
-    machine = MachineModel(path_to_yaml=machine_file_path)
-
-    kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
-    with open(kernel_file_path) as kernel_file:
-        reference_kernel = KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path)
-        reference_kernel.get_kernel_header(name='test_kernel')
-        reference_kernel.get_kernel_code(name='test_kernel')
-        reference_kernel.get_main_code(kernel_function_name='test_kernel')
-
-    size = [30, 50, 3]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
-    s = sp.Symbol("s")
-    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
-    update_rule = Assignment(b[0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    mine = generate_benchmark(ast, likwid=False)
-    print(mine)
-
-
-def analysis(kernel, machine, model='ecmdata'):
-    if model == 'ecmdata':
-        model = ECMData(kernel, machine, KerncraftParameters())
-    elif model == 'ecm':
-        model = ECM(kernel, machine, KerncraftParameters())
-    elif model == 'benchmark':
-        model = Benchmark(kernel, machine, KerncraftParameters())
-    else:
-        model = ECM(kernel, machine, KerncraftParameters())
-    model.analyze()
-    return model
-
-
-@pytest.mark.kerncraft
-def test_3d_7pt_osaca():
-
-    size = [20, 200, 200]
-    kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
-    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
-    machine_model = MachineModel(path_to_yaml=machine_file_path)
-    with open(kernel_file_path) as kernel_file:
-        reference_kernel = KernelCode(kernel_file.read(), machine=machine_model, filename=kernel_file_path)
-    reference_kernel.set_constant('M', size[0])
-    reference_kernel.set_constant('N', size[1])
-    assert size[1] == size[2]
-    analysis(reference_kernel, machine_model, model='ecm')
-
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
-    s = sp.Symbol("s")
-    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
-
-    update_rule = Assignment(b[0, 0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast, machine=machine_model, debug_print=True)
-    analysis(k, machine_model, model='ecm')
-    assert reference_kernel._flops == k._flops
-
-    path, lock = k.get_kernel_code(openmp=True)
-    with open(path) as kernel_file:
-        assert "#pragma omp parallel" in kernel_file.read()
-
-    path, lock = k.get_main_code()
-    with open(path) as kernel_file:
-        assert "likwid_markerInit();" in kernel_file.read()
-
-
-@pytest.mark.kerncraft
-def test_2d_5pt():
-    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
-    machine = MachineModel(path_to_yaml=machine_file_path)
-
-    size = [30, 50, 3]
-    kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
-    with open(kernel_file_path) as kernel_file:
-        reference_kernel = KernelCode(kernel_file.read(), machine=machine, 
-                                      filename=kernel_file_path)
-    reference = analysis(reference_kernel, machine)
-
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
-    s = sp.Symbol("s")
-    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
-    update_rule = Assignment(b[0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast, machine)
-    result = analysis(k, machine)
-
-    for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
-        assert e1 == e2
-
-
-@pytest.mark.kerncraft
-def test_3d_7pt():
-    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
-    machine = MachineModel(path_to_yaml=machine_file_path)
-
-    size = [30, 50, 50]
-    kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
-    with open(kernel_file_path) as kernel_file:
-        reference_kernel = KernelCode(kernel_file.read(), machine=machine,
-                                      filename=kernel_file_path)
-    reference_kernel.set_constant('M', size[0])
-    reference_kernel.set_constant('N', size[1])
-    assert size[1] == size[2]
-    reference = analysis(reference_kernel, machine)
-
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
-    s = sp.Symbol("s")
-    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
-
-    update_rule = Assignment(b[0, 0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast, machine)
-    result = analysis(k, machine)
-
-    for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
-        assert e1 == e2
-
-
-@pytest.mark.kerncraft
-def test_benchmark():
-    size = [30, 50, 50]
-    arr = np.zeros(size)
-    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
-    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
-    s = sp.Symbol("s")
-    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
-
-    update_rule = Assignment(b[0, 0, 0], s * rhs)
-    ast = create_kernel([update_rule])
-
-    c_benchmark_run = run_c_benchmark(ast, inner_iterations=1000, outer_iterations=1)
-
-    kernel = ast.compile()
-    a = np.full(size, fill_value=0.23)
-    b = np.full(size, fill_value=0.23)
-
-    timeloop = TimeLoop(steps=1)
-    timeloop.add_call(kernel, {'a': a, 'b': b, 's': 0.23})
-
-    timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)
-
-    np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)
-
-
-@pytest.mark.kerncraft
-def test_benchmark_vectorized():
-    instruction_sets = get_supported_instruction_sets()
-    if not instruction_sets:
-        pytest.skip("cannot detect CPU instruction set")
-
-    for vec in instruction_sets:
-        dh = create_data_handling((20, 20, 20), periodicity=True)
-
-        width = get_vector_instruction_set(instruction_set=vec)['width'] * 8
-
-        a = dh.add_array("a", values_per_cell=1, alignment=width)
-        b = dh.add_array("b", values_per_cell=1, alignment=width)
-
-        rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
-        update_rule = Assignment(b[0, 0, 0], rhs)
-
-        opt = {'instruction_set': vec, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True}
-        ast = ps.create_kernel(update_rule, cpu_vectorize_info=opt)
-
-        run_c_benchmark(ast, 5)
diff --git a/pytest.ini b/pytest.ini
index f77dd5ea5..b9b5db388 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -4,7 +4,6 @@ norecursedirs = *.egg-info .git .cache .ipynb_checkpoints htmlcov
 addopts = --doctest-modules --durations=20  --cov-config pytest.ini
 markers =
        longrun: tests only run at night since they have large execution time
-       kerncraft: tests depending on kerncraft
        notebook: mark for notebooks
 # these warnings all come from third party libraries.
 filterwarnings =
@@ -54,7 +53,7 @@ exclude_lines =
        if __name__ == .__main__.:
 
 skip_covered = True
-fail_under = 87
+fail_under = 86
 
 [html]
 directory = coverage_report
diff --git a/setup.py b/setup.py
index 4643a3883..a2053b422 100644
--- a/setup.py
+++ b/setup.py
@@ -93,7 +93,6 @@ setuptools.setup(name='pystencils',
                  packages=['pystencils'] + ['pystencils.' + s for s in setuptools.find_packages('pystencils')],
                  install_requires=['sympy>=1.5.1,<=1.9', 'numpy>=1.8.0', 'appdirs', 'joblib'],
                  package_data={'pystencils': ['include/*.h',
-                                              'kerncraft_coupling/templates/*',
                                               'backends/cuda_known_functions.txt',
                                               'backends/opencl1.1_known_functions.txt',
                                               'boundaries/createindexlistcython.c',
@@ -118,11 +117,9 @@ setuptools.setup(name='pystencils',
                      'alltrafos': ['islpy', 'py-cpuinfo'],
                      'bench_db': ['blitzdb', 'pymongo', 'pandas'],
                      'interactive': ['matplotlib', 'ipy_table', 'imageio', 'jupyter', 'pyevtk', 'rich', 'graphviz'],
-                     'autodiff': ['pystencils-autodiff'],
                      'doc': ['sphinx', 'sphinx_rtd_theme', 'nbsphinx',
                              'sphinxcontrib-bibtex', 'sphinx_autodoc_typehints', 'pandoc'],
-                     'use_cython': ['Cython'],
-                     'kerncraft': ['osaca', 'kerncraft']
+                     'use_cython': ['Cython']
                  },
                  tests_require=['pytest',
                                 'pytest-cov',
-- 
GitLab