From 2082ef802a38f828365a0bbe9467743c65a5eb78 Mon Sep 17 00:00:00 2001 From: Markus Holzer <markus.holzer@fau.de> Date: Thu, 25 Nov 2021 11:34:56 +0000 Subject: [PATCH] Removed kerncraft --- README.md | 2 - conftest.py | 16 - pystencils/__init__.py | 7 - pystencils/autodiff.py | 12 - pystencils/kerncraft_coupling/__init__.py | 4 - .../kerncraft_coupling/generate_benchmark.py | 146 --- .../kerncraft_coupling/kerncraft_interface.py | 373 ------- .../kerncraft_coupling/templates/benchmark.c | 157 --- .../kerncraft_coupling/templates/kernel.c | 18 - .../kerncraft_coupling/templates/kernel.h | 3 - .../SkylakeSP_Gold-5122_allinclusive.yaml | 600 ----------- pystencils_tests/benchmark/benchmark.py | 188 ---- pystencils_tests/benchmark/generate.py | 50 - pystencils_tests/benchmark/iacaMarks.h | 53 - pystencils_tests/benchmark/main.c | 11 - pystencils_tests/kerncraft_inputs/2d-5pt.c | 8 - pystencils_tests/kerncraft_inputs/3d-7pt.c | 10 - .../Example_SandyBridgeEP_E5-2680.yml | 980 ------------------ pystencils_tests/test_kerncraft_coupling.py | 197 ---- pytest.ini | 3 +- setup.py | 5 +- 21 files changed, 2 insertions(+), 2841 deletions(-) delete mode 100644 pystencils/autodiff.py delete mode 100644 pystencils/kerncraft_coupling/__init__.py delete mode 100644 pystencils/kerncraft_coupling/generate_benchmark.py delete mode 100644 pystencils/kerncraft_coupling/kerncraft_interface.py delete mode 100644 pystencils/kerncraft_coupling/templates/benchmark.c delete mode 100644 pystencils/kerncraft_coupling/templates/kernel.c delete mode 100644 pystencils/kerncraft_coupling/templates/kernel.h delete mode 100644 pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml delete mode 100644 pystencils_tests/benchmark/benchmark.py delete mode 100644 pystencils_tests/benchmark/generate.py delete mode 100644 pystencils_tests/benchmark/iacaMarks.h delete mode 100644 pystencils_tests/benchmark/main.c delete mode 100644 pystencils_tests/kerncraft_inputs/2d-5pt.c delete mode 100644 pystencils_tests/kerncraft_inputs/3d-7pt.c delete mode 100644 pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml delete mode 100644 pystencils_tests/test_kerncraft_coupling.py diff --git a/README.md b/README.md index 62105e71d..5ab85add4 100644 --- a/README.md +++ b/README.md @@ -56,9 +56,7 @@ All options: - `alltrafos`: pulls in additional dependencies for loop simplification e.g. libisl - `bench_db`: functionality to store benchmark result in object databases - `interactive`: installs dependencies to work in Jupyter including image I/O, plotting etc. -- `autodiff`: enable derivation of adjoint kernels and generation of Torch/Tensorflow operations - `doc`: packages to build documentation -- `kerncraft`: use kerncraft for automatic performance analysis Options can be combined e.g. ```bash diff --git a/conftest.py b/conftest.py index b7c535a59..131167994 100644 --- a/conftest.py +++ b/conftest.py @@ -45,28 +45,12 @@ add_path_to_ignore('pystencils_tests/benchmark') add_path_to_ignore('_local_tmp') -collect_ignore += [os.path.join(SCRIPT_FOLDER, "pystencils/autodiff.py")] - try: import pycuda except ImportError: collect_ignore += [os.path.join(SCRIPT_FOLDER, "pystencils_tests/test_cudagpu.py")] add_path_to_ignore('pystencils/gpucuda') -try: - import llvmlite -except ImportError: - collect_ignore += [os.path.join(SCRIPT_FOLDER, 'pystencils_tests/backends/llvm.py')] - collect_ignore += [os.path.join(SCRIPT_FOLDER, 'pystencils_tests/test_basic_usage_llvm.ipynb')] - add_path_to_ignore('pystencils/llvm') - -try: - import kerncraft -except ImportError: - collect_ignore += [os.path.join(SCRIPT_FOLDER, "pystencils_tests/test_kerncraft_coupling.py"), - os.path.join(SCRIPT_FOLDER, "pystencils_tests/benchmark/benchmark.py")] - add_path_to_ignore('pystencils/kerncraft_coupling') - try: import waLBerla except ImportError: diff --git a/pystencils/__init__.py b/pystencils/__init__.py index 4d97202bd..56b9c9e5c 100644 --- a/pystencils/__init__.py +++ b/pystencils/__init__.py @@ -15,13 +15,6 @@ from .slicing import make_slice from .spatial_coordinates import x_, x_staggered, x_staggered_vector, x_vector, y_, y_staggered, z_, z_staggered from .sympyextensions import SymbolCreator -try: - import pystencils_autodiff - - autodiff = pystencils_autodiff -except ImportError: - pass - __all__ = ['Field', 'FieldType', 'fields', 'TypedSymbol', 'make_slice', diff --git a/pystencils/autodiff.py b/pystencils/autodiff.py deleted file mode 100644 index 96ac836c8..000000000 --- a/pystencils/autodiff.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Provides tools for generation of auto-differentiable operations. - -See https://github.com/theHamsta/pystencils_autodiff - -Installation: - -.. code-block:: bash - pip install pystencils-autodiff -""" - -raise NotImplementedError('pystencils-autodiff is not installed. Run `pip install pystencils-autodiff`') diff --git a/pystencils/kerncraft_coupling/__init__.py b/pystencils/kerncraft_coupling/__init__.py deleted file mode 100644 index 8d2c3ad50..000000000 --- a/pystencils/kerncraft_coupling/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .generate_benchmark import generate_benchmark, run_c_benchmark -from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel - -__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark'] diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py deleted file mode 100644 index 8d8d7d1da..000000000 --- a/pystencils/kerncraft_coupling/generate_benchmark.py +++ /dev/null @@ -1,146 +0,0 @@ -import subprocess -import warnings -import tempfile -from pathlib import Path - -from jinja2 import Environment, PackageLoader, StrictUndefined - -from pystencils.astnodes import PragmaBlock -from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.cpu.cpujit import get_compiler_config, run_compile_step -from pystencils.typing import get_base_type -from pystencils.enums import Backend -from pystencils.include import get_pystencils_include_path -from pystencils.integer_functions import modulo_ceil -from pystencils.sympyextensions import prod - -import numpy as np - - -def generate_benchmark(ast, likwid=False, openmp=False, timing=False): - """Return C code of a benchmark program for the given kernel. - - Args: - ast: the pystencils AST object as returned by create_kernel - likwid: if True likwid markers are added to the code - openmp: relevant only if likwid=True, to generated correct likwid initialization code - timing: add timing output to the code, prints time per iteration to stdout - - Returns: - C code as string - """ - accessed_fields = {f.name: f for f in ast.fields_accessed} - constants = [] - fields = [] - call_parameters = [] - for p in ast.get_parameters(): - if not p.is_field_parameter: - constants.append((p.symbol.name, str(p.symbol.dtype))) - call_parameters.append(p.symbol.name) - else: - assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" - field = accessed_fields[p.field_name] - dtype = str(get_base_type(p.symbol.dtype)) - np_dtype = get_base_type(p.symbol.dtype).numpy_dtype - size_data_type = np_dtype.itemsize - - dim0_size = field.shape[-1] - dim1_size = np.prod(field.shape[:-1]) - elements = prod(field.shape) - - if ast.instruction_set: - align = ast.instruction_set['width'] * size_data_type - padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size - padding_bytes = padding_elements * size_data_type - ghost_layers = max(max(ast.ghost_layers)) - - size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type - - assert align % np_dtype.itemsize == 0 - offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type - - fields.append((p.field_name, dtype, elements, size, offset, align)) - call_parameters.append(p.field_name) - else: - size = elements * size_data_type - fields.append((p.field_name, dtype, elements, size, 0, 0)) - call_parameters.append(p.field_name) - - header_list = get_headers(ast) - includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) - - # Strip "#pragma omp parallel" from within kernel, because main function takes care of that - # when likwid and openmp are enabled - if likwid and openmp: - if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock): - ast.body.args[0].pragma_line = '' - - jinja_context = { - 'likwid': likwid, - 'openmp': openmp, - 'kernel_code': generate_c(ast, dialect=Backend.C), - 'kernelName': ast.function_name, - 'fields': fields, - 'constants': constants, - 'call_argument_list': ",".join(call_parameters), - 'includes': includes, - 'timing': timing, - } - - env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) - - return env.get_template('benchmark.c').render(**jinja_context) - - -def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None): - """Runs the given kernel with outer loop in C - - Args: - ast: pystencils ast which is used to compile the benchmark file - inner_iterations: timings are recorded around this many iterations - outer_iterations: number of timings recorded - path: path where the benchmark file is stored. If None a tmp folder is created - - Returns: - list of times per iterations for each outer iteration - """ - import kerncraft - - benchmark_code = generate_benchmark(ast, timing=True) - - if path is None: - path = tempfile.mkdtemp() - - if isinstance(path, str): - path = Path(path) - - with open(path / 'bench.c', 'w') as f: - f.write(benchmark_code) - - kerncraft_path = Path(kerncraft.__file__).parent - - extra_flags = ['-I' + get_pystencils_include_path(), - '-I' + str(kerncraft_path / 'headers')] - - compiler_config = get_compiler_config() - compile_cmd = [compiler_config['command']] + compiler_config['flags'].split() - compile_cmd += [*extra_flags, - str(kerncraft_path / 'headers' / 'timing.c'), - str(kerncraft_path / 'headers' / 'dummy.c'), - str(path / 'bench.c'), - '-o', str(path / 'bench'), - ] - run_compile_step(compile_cmd) - - time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)])) - benchmark_time_limit = 20 - if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations: - warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than " - f"{benchmark_time_limit} seconds for this kernel") - warnings.warn(warn) - - results = [] - for _ in range(outer_iterations): - benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)])) - results.append(benchmark_time) - return results diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py deleted file mode 100644 index bfb5a2d6a..000000000 --- a/pystencils/kerncraft_coupling/kerncraft_interface.py +++ /dev/null @@ -1,373 +0,0 @@ -import warnings -import fcntl -from collections import defaultdict -from tempfile import TemporaryDirectory -import textwrap -import itertools -import string - -from jinja2 import Environment, PackageLoader, StrictUndefined, Template -import sympy as sp -from kerncraft.kerncraft import KernelCode -from kerncraft.kernel import symbol_pos_int -from kerncraft.machinemodel import MachineModel - -from pystencils.astnodes import \ - KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment -from pystencils.backends.cbackend import generate_c, get_headers -from pystencils.enums import Backend -from pystencils.field import get_layout_from_strides -from pystencils.sympyextensions import count_operations_in_ast -from pystencils.transformations import filtered_tree_iteration -from pystencils.utils import DotDict -from pystencils.cpu.kernelcreation import add_openmp -from pystencils.typing.utilities import get_base_type -from pystencils.sympyextensions import prod - - -class PyStencilsKerncraftKernel(KernelCode): - """ - Implementation of kerncraft's kernel interface for pystencils CPU kernels. - Analyses a list of equations assuming they will be executed on a CPU - """ - LIKWID_BASE = '/usr/local/likwid' - - def __init__(self, ast: KernelFunction, machine: MachineModel, - assumed_layout='SoA', debug_print=False, filename=None): - """Create a kerncraft kernel using a pystencils AST - - Args: - ast: pystencils ast - machine: kerncraft machine model - specify this if kernel needs to be compiled - assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index - coordinates is not known. In this case either a structures of array (SoA) or - array of structures (AoS) layout is assumed - debug_print: print debug information - filename: used for caching - """ - super(KernelCode, self).__init__(machine=machine) - - # Initialize state - self.asm_block = None - self._filename = filename - self._keep_intermediates = False - - self.kernel_ast = ast - self.temporary_dir = TemporaryDirectory() - self._keep_intermediates = debug_print - - # Loops - inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment) - if l.is_innermost_loop] - if len(inner_loops) == 0: - raise ValueError("No loop found in pystencils AST") - else: - if len(inner_loops) > 1: - warnings.warn("pystencils AST contains multiple inner loops. " - "Only one can be analyzed - choosing first one") - inner_loop = inner_loops[0] - - self._loop_stack = [] - cur_node = inner_loop - while cur_node is not None: - if isinstance(cur_node, LoopOverCoordinate): - loop_counter_sym = cur_node.loop_counter_symbol - loop_info = (loop_counter_sym.name, - sp.Integer(cur_node.start), - sp.Integer(cur_node.stop), - sp.Integer(1)) - # If the correct step were to be provided, all access within that step length will - # also need to be passed to kerncraft: cur_node.step) - self._loop_stack.append(loop_info) - cur_node = cur_node.parent - self._loop_stack = list(reversed(self._loop_stack)) - - def get_layout_tuple(f): - if f.has_fixed_shape: - return get_layout_from_strides(f.strides) - else: - layout_list = list(f.layout) - for _ in range(f.index_dimensions): - layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1) - return layout_list - - # Variables (arrays) and Constants (scalar sizes) - const_names_iter = itertools.product(string.ascii_uppercase, repeat=1) - constants_reversed = {} - fields_accessed = self.kernel_ast.fields_accessed - for field in fields_accessed: - layout = get_layout_tuple(field) - permuted_shape = list(field.shape[i] for i in layout) - # Replace shape dimensions with constant variables (necessary for layer condition - # analysis) - for i, d in enumerate(permuted_shape): - if d not in self.constants.values(): - const_symbol = symbol_pos_int(''.join(next(const_names_iter))) - self.set_constant(const_symbol, d) - constants_reversed[d] = const_symbol - permuted_shape[i] = constants_reversed[d] - self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape)) - - # Data sources & destinations - self.sources = defaultdict(list) - self.destinations = defaultdict(list) - - reads, writes = search_resolved_field_accesses_in_ast(inner_loop) - for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]: - for fa in accesses: - coord = [symbol_pos_int(LoopOverCoordinate.get_loop_counter_name(i)) + off - for i, off in enumerate(fa.offsets)] - coord += list(fa.idx_coordinate_values) - layout = get_layout_tuple(fa.field) - permuted_coord = [sp.sympify(coord[i]) for i in layout] - target_dict[fa.field.name].append(permuted_coord) - - # data type - self.datatype = list(self.variables.values())[0][0] - - # flops - operation_count = count_operations_in_ast(inner_loop) - self._flops = { - '+': operation_count['adds'], - '*': operation_count['muls'], - '/': operation_count['divs'], - } - for k in [k for k, v in self._flops.items() if v == 0]: - del self._flops[k] - self.check() - - if debug_print: - from pprint import pprint - print("----------------------------- Loop Stack --------------------------") - pprint(self._loop_stack) - print("----------------------------- Sources -----------------------------") - pprint(self.sources) - print("----------------------------- Destinations ------------------------") - pprint(self.destinations) - print("----------------------------- FLOPS -------------------------------") - pprint(self._flops) - - def get_kernel_header(self, name='pystencils_kernel'): - file_name = "pystencils_kernel.h" - file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False) - lock_mode, lock_fp = self.lock_intermediate(file_path) - - if lock_mode == fcntl.LOCK_SH: - # use cache - pass - else: # lock_mode == fcntl.LOCK_EX: - function_signature = generate_c(self.kernel_ast, dialect=Backend.C, signature_only=True) - - jinja_context = { - 'function_signature': function_signature, - } - - env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) - file_header = env.get_template('kernel.h').render(**jinja_context) - with open(file_path, 'w') as f: - f.write(file_header) - - self.release_exclusive_lock(lock_fp) # degrade to shared lock - return file_path, lock_fp - - def get_kernel_code(self, openmp=False, name='pystencils_kernl'): - """ - Generate and return compilable source code from AST. - - Args: - openmp: if true, openmp code will be generated - name: kernel name - """ - filename = 'pystencils_kernl' - if openmp: - filename += '-omp' - filename += '.c' - file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False) - lock_mode, lock_fp = self.lock_intermediate(file_path) - - if lock_mode == fcntl.LOCK_SH: - # use cache - with open(file_path) as f: - code = f.read() - else: # lock_mode == fcntl.LOCK_EX: - header_list = get_headers(self.kernel_ast) - includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) - - if openmp: - add_openmp(self.kernel_ast) - - kernel_code = generate_c(self.kernel_ast, dialect=Backend.C) - - jinja_context = { - 'includes': includes, - 'kernel_code': kernel_code, - } - - env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) - code = env.get_template('kernel.c').render(**jinja_context) - with open(file_path, 'w') as f: - f.write(code) - - self.release_exclusive_lock(lock_fp) # degrade to shared lock - return file_path, lock_fp - - CODE_TEMPLATE = Template(textwrap.dedent(""" - #include <likwid.h> - #include <stdlib.h> - #include <stdint.h> - #include <stdbool.h> - #include <math.h> - #include "kerncraft.h" - #include "kernel.h" - - #define RESTRICT __restrict__ - #define FUNC_PREFIX - void dummy(void *); - extern int var_false; - - int main(int argc, char **argv) { - {%- for constantName, dataType in constants %} - // Constant {{constantName}} - {{dataType}} {{constantName}}; - {{constantName}} = 0.23; - {%- endfor %} - - // Declaring arrays - {%- for field_name, dataType, size in fields %} - - // Initialization {{field_name}} - double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); - // TODO initialize in parallel context in same order as they are touched - for (unsigned long long i = 0; i < {{size}}; ++i) - {{field_name}}[i] = 0.23; - {%- endfor %} - - likwid_markerInit(); - #pragma omp parallel - { - likwid_markerRegisterRegion("loop"); - #pragma omp barrier - - // Initializing arrays in same order as touched in kernel loop nest - //INIT_ARRAYS; - - // Dummy call - {%- for field_name, dataType, size in fields %} - if(var_false) dummy({{field_name}}); - {%- endfor %} - {%- for constantName, dataType in constants %} - if(var_false) dummy(&{{constantName}}); - {%- endfor %} - - for(int warmup = 1; warmup >= 0; --warmup) { - int repeat = 2; - if(warmup == 0) { - repeat = atoi(argv[1]); - likwid_markerStartRegion("loop"); - } - - for(; repeat > 0; --repeat) { - {{kernelName}}({{call_argument_list}}); - - {%- for field_name, dataType, size in fields %} - if(var_false) dummy({{field_name}}); - {%- endfor %} - {%- for constantName, dataType in constants %} - if(var_false) dummy(&{{constantName}}); - {%- endfor %} - } - - } - likwid_markerStopRegion("loop"); - } - likwid_markerClose(); - return 0; - } - """)) - - def get_main_code(self, kernel_function_name='kernel'): - """ - Generate and return compilable source code from AST. - - :return: tuple of filename and shared lock file pointer - """ - # TODO produce nicer code, including help text and other "comfort features". - assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \ - "based on a kernel description rather than code." - - file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False) - lock_mode, lock_fp = self.lock_intermediate(file_path) - - if lock_mode == fcntl.LOCK_SH: - # use cache - with open(file_path) as f: - code = f.read() - else: # lock_mode == fcntl.LOCK_EX - # needs update - accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed} - constants = [] - fields = [] - call_parameters = [] - for p in self.kernel_ast.get_parameters(): - if not p.is_field_parameter: - constants.append((p.symbol.name, str(p.symbol.dtype))) - call_parameters.append(p.symbol.name) - else: - assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" - field = accessed_fields[p.field_name] - dtype = str(get_base_type(p.symbol.dtype)) - fields.append((p.field_name, dtype, prod(field.shape))) - call_parameters.append(p.field_name) - - header_list = get_headers(self.kernel_ast) - includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) - - # Generate code - code = self.CODE_TEMPLATE.render( - kernelName=self.kernel_ast.function_name, - fields=fields, - constants=constants, - call_agument_list=','.join(call_parameters), - includes=includes) - - # Store to file - with open(file_path, 'w') as f: - f.write(code) - self.release_exclusive_lock(lock_fp) # degrade to shared lock - - return file_path, lock_fp - - -class KerncraftParameters(DotDict): - def __init__(self, **kwargs): - super(KerncraftParameters, self).__init__() - self['asm_block'] = 'auto' - self['asm_increment'] = 0 - self['cores'] = 1 - self['cache_predictor'] = 'SIM' - self['verbose'] = 0 - self['pointer_increment'] = 'auto' - self['iterations'] = 10 - self['unit'] = 'cy/CL' - self['ignore_warnings'] = True - self['incore_model'] = 'OSACA' - self.update(**kwargs) - - -# ------------------------------------------- Helper functions --------------------------------------------------------- - - -def search_resolved_field_accesses_in_ast(ast): - def visit(node, reads, writes): - if not isinstance(node, SympyAssignment): - for a in node.args: - visit(a, reads, writes) - return - - for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]: - accesses.update(expr.atoms(ResolvedFieldAccess)) - - read_accesses = set() - write_accesses = set() - visit(ast, read_accesses, write_accesses) - return read_accesses, write_accesses diff --git a/pystencils/kerncraft_coupling/templates/benchmark.c b/pystencils/kerncraft_coupling/templates/benchmark.c deleted file mode 100644 index 79daaffd9..000000000 --- a/pystencils/kerncraft_coupling/templates/benchmark.c +++ /dev/null @@ -1,157 +0,0 @@ -#include "kerncraft.h" -#include <stdlib.h> -#include <stdint.h> -#include <stdbool.h> -#include <math.h> -#include <stdio.h> -#include <assert.h> - -{{ includes }} - -{%- if likwid %} -#include <likwid.h> -{%- endif %} - -#define RESTRICT __restrict__ -#define FUNC_PREFIX -void dummy(void *); -void timing(double* wcTime, double* cpuTime); -extern int var_false; - -/* see waLBerla src/field/allocation/AlignedMalloc */ -void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset ) -{ - // With 0 alignment this function makes no sense - // use normal malloc instead - assert( alignment > 0 ); - // Tests if alignment is power of two (assuming alignment>0) - assert( !(alignment & (alignment - 1)) ); - assert( offset < alignment ); - - void *pa; // pointer to allocated memory - void *ptr; // pointer to usable aligned memory - - pa=std::malloc( (size+2*alignment-1 )+sizeof(void *)); - if(!pa) - return nullptr; - - // Find next aligned position, starting at pa+sizeof(void*)-1 - ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1)); - ptr=(void*) ( (char*)(ptr) + alignment - offset); - - // Store pointer to real allocated chunk just before usable chunk - *((void **)ptr-1)=pa; - - assert( ((size_t)ptr+offset) % alignment == 0 ); - - return ptr; -} - -void aligned_free( void *ptr ) -{ - // assume that pointer to real allocated chunk is stored just before - // chunk that was given to user - if(ptr) - std::free(*((void **)ptr-1)); -} - - -{{kernel_code}} - - -int main(int argc, char **argv) -{ - {%- if likwid %} - likwid_markerInit(); - {%- endif %} - - {%- for field_name, dataType, elements, size, offset, alignment in fields %} - // Initialization {{field_name}} - {%- if alignment > 0 %} - {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}}); - {%- else %} - {{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}]; - {%- endif %} - for (unsigned long long i = 0; i < {{elements}}; ++i) - {{field_name}}[i] = 0.23; - - if(var_false) - dummy({{field_name}}); - - {%- endfor %} - - - - {%- for constantName, dataType in constants %} - - // Constant {{constantName}} - {{dataType}} {{constantName}}; - {{constantName}} = 0.23; - if(var_false) - dummy(& {{constantName}}); - - {%- endfor %} - - {%- if likwid and openmp %} - #pragma omp parallel - { - likwid_markerRegisterRegion("loop"); - #pragma omp barrier - {%- elif likwid %} - likwid_markerRegisterRegion("loop"); - {%- endif %} - - for(int warmup = 1; warmup >= 0; --warmup) { - int repeat = 2; - if(warmup == 0) { - repeat = atoi(argv[1]); - {%- if likwid %} - likwid_markerStartRegion("loop"); - {%- endif %} - } - - {%- if timing %} - double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; - timing(&wcStartTime, &cpuStartTime); - {%- endif %} - - for (; repeat > 0; --repeat) - { - {{kernelName}}({{call_argument_list}}); - - // Dummy calls - {%- for field_name, dataType, elements, size, offset, alignment in fields %} - if(var_false) dummy((void*){{field_name}}); - {%- endfor %} - {%- for constantName, dataType in constants %} - if(var_false) dummy((void*)&{{constantName}}); - {%- endfor %} - } - {%- if timing %} - timing(&wcEndTime, &cpuEndTime); - if( warmup == 0) - printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) ); - {%- endif %} - - } - - {%- if likwid %} - likwid_markerStopRegion("loop"); - {%- if openmp %} - } - {%- endif %} - {%- endif %} - - {%- if likwid %} - likwid_markerClose(); - {%- endif %} - - {%- for field_name, dataType, elements, size, offset, alignment in fields %} - {%- if alignment > 0 %} - aligned_free({{field_name}}); - {%- else %} - delete[] {{field_name}}; - {%- endif %} - - {%- endfor %} -} diff --git a/pystencils/kerncraft_coupling/templates/kernel.c b/pystencils/kerncraft_coupling/templates/kernel.c deleted file mode 100644 index 47fbf7cf2..000000000 --- a/pystencils/kerncraft_coupling/templates/kernel.c +++ /dev/null @@ -1,18 +0,0 @@ - -#include "kerncraft.h" -#include <stdlib.h> -#include <stdint.h> -#include <stdbool.h> -#include <math.h> -#include <stdio.h> - -{{ includes }} - -#define RESTRICT __restrict__ -#define FUNC_PREFIX -void dummy(void *); -void timing(double* wcTime, double* cpuTime); -extern int var_false; - - -{{kernel_code}} \ No newline at end of file diff --git a/pystencils/kerncraft_coupling/templates/kernel.h b/pystencils/kerncraft_coupling/templates/kernel.h deleted file mode 100644 index 539d51f92..000000000 --- a/pystencils/kerncraft_coupling/templates/kernel.h +++ /dev/null @@ -1,3 +0,0 @@ -#define FUNC_PREFIX - -{{function_signature}} \ No newline at end of file diff --git a/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml b/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml deleted file mode 100644 index 75370ecd2..000000000 --- a/pystencils_tests/benchmark/SkylakeSP_Gold-5122_allinclusive.yaml +++ /dev/null @@ -1,600 +0,0 @@ -# FIXME -# FIXME performance counters might be wrong. This will only affect the Benchmark model -# FIXME bandwidth measurements need validation -# FIXME - -kerncraft version: 0.7.2 -model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz -model type: Intel Core Skylake SP -sockets: 2 -cores per socket: 4 -threads per core: 2 -NUMA domains per socket: 1 -cores per NUMA domain: 4 -clock: 3.6 GHz -FLOPs per cycle: - SP: - total: 64 - FMA: 64 - ADD: 32 - MUL: 32 - DP: - total: 32 - FMA: 32 - ADD: 16 - MUL: 16 -micro-architecture: SKX -compiler: - !!omap - - icc: -O3 -fno-alias -xCORE-AVX512 - - clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L - - gcc: -O3 -march=skylake-avx512 -cacheline size: 64 B -overlapping model: - ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"] - performance counter metric: - Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3]) -non-overlapping model: - ports: ["2D", "3D"] - performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM -memory hierarchy: -- level: L1 - performance counter metrics: - accesses: MEM_INST_RETIRED_ALL_LOADS:PMC[0-3] - misses: L1D_REPLACEMENT:PMC[0-3] - evicts: L2_TRANS_L1D_WB:PMC[0-3] - cache per group: - sets: 64 - ways: 8 - cl_size: 64 - replacement_policy: 'LRU' - write_allocate: True - write_back: True - load_from: L2 - store_to: L2 - size per group: 32.00 kB - groups: 8 - cores per group: 1 - threads per group: 2 -- level: L2 - non-overlap upstream throughput: [64 B/cy, 'half-duplex'] - performance counter metrics: - accesses: L1D_REPLACEMENT:PMC[0-3] - misses: L2_LINES_IN_ALL:PMC[0-3] - evicts: L2_TRANS_L2_WB:PMC[0-3] - cache per group: - sets: 1024 - ways: 16 - cl_size: 64 - replacement_policy: 'LRU' - write_allocate: True - write_back: True - load_from: null # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM - victims_to: L3 # all victims, modified or not are passed onto L3 - store_to: L3 - size per group: 1.00 MB - groups: 8 - cores per group: 1 - threads per group: 2 -- level: L3 - non-overlap upstream throughput: [16 B/cy, 'full-duplex'] - performance counter metrics: - accesses: L2_LINES_IN_ALL:PMC[0-3] - # FIXME not all misses in L2 lead to loads from L3, only the hits do - misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] + - CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] + - CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] + - CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] + - CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] + - CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01]) - evicts: L2_TRANS_L2_WB:PMC[0-3] - cache per group: - sets: 16896 - # TODO is actuall something else, but necessary to get to 16.5 MB - ways: 16 - # TODO is actually 11, but pycachesim only supports powers of two - cl_size: 64 - replacement_policy: 'LRU' - write_allocate: False - write_back: True - size per group: 16.50 MB - groups: 2 - cores per group: 4 - threads per group: 8 -- level: MEM - cores per group: 4 - threads per group: 8 - non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex'] - penalty cycles per read stream: 0 - size per group: -benchmarks: - kernels: - load: - read streams: - streams: 1 - bytes: 8.00 B - read+write streams: - streams: 0 - bytes: 0.00 B - write streams: - streams: 0 - bytes: 0.00 B - FLOPs per iteration: 0 - copy: - read streams: - streams: 1 - bytes: 8.00 B - read+write streams: - streams: 0 - bytes: 0.00 B - write streams: - streams: 1 - bytes: 8.00 B - FLOPs per iteration: 0 - update: - read streams: - streams: 1 - bytes: 8.00 B - read+write streams: - streams: 1 - bytes: 8.00 B - write streams: - streams: 1 - bytes: 8.00 B - FLOPs per iteration: 0 - triad: - read streams: - streams: 3 - bytes: 24.00 B - read+write streams: - streams: 0 - bytes: 0.00 B - write streams: - streams: 1 - bytes: 8.00 B - FLOPs per iteration: 2 - daxpy: - read streams: - streams: 2 - bytes: 16.00 B - read+write streams: - streams: 1 - bytes: 8.00 B - write streams: - streams: 1 - bytes: 8.00 B - FLOPs per iteration: 2 - measurements: - L1: - 1: - threads per core: 1 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 1 - - 2 - - 3 - - 4 - size per core: - - 21.12 kB - - 21.12 kB - - 21.12 kB - - 21.12 kB - size per thread: - - 21.12 kB - - 21.12 kB - - 21.12 kB - - 21.12 kB - total size: - - 21.12 kB - - 42.24 kB - - 63.36 kB - - 84.48 kB - results: - load: - - 42.98 GB/s - - 85.08 GB/s - - 127.45 GB/s - - 169.92 GB/s - copy: - - 56.07 GB/s - - 111.50 GB/s - - 164.90 GB/s - - 221.50 GB/s - update: - - 56.54 GB/s - - 112.25 GB/s - - 168.50 GB/s - - 224.75 GB/s - triad: - - 45.90 GB/s - - 89.81 GB/s - - 127.29 GB/s - - 169.57 GB/s - daxpy: - - 36.62 GB/s - - 71.30 GB/s - - 103.52 GB/s - - 135.26 GB/s - 2: - threads per core: 2 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 2 - - 4 - - 6 - - 8 - size per core: - - 21.12 kB - - 21.12 kB - - 21.12 kB - - 21.12 kB - size per thread: - - 10.56 kB - - 10.56 kB - - 10.56 kB - - 10.56 kB - total size: - - 21.12 kB - - 42.24 kB - - 63.36 kB - - 84.48 kB - results: - load: - - 49.61 GB/s - - 98.80 GB/s - - 147.98 GB/s - - 198.22 GB/s - copy: - - 55.98 GB/s - - 111.56 GB/s - - 167.08 GB/s - - 220.42 GB/s - update: - - 56.53 GB/s - - 112.72 GB/s - - 168.95 GB/s - - 225.31 GB/s - triad: - - 54.01 GB/s - - 104.58 GB/s - - 153.02 GB/s - - 200.93 GB/s - daxpy: - - 41.11 GB/s - - 80.28 GB/s - - 115.71 GB/s - - 152.81 GB/s - L2: - 1: - threads per core: 1 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 1 - - 2 - - 3 - - 4 - size per core: - - 660.00 kB - - 660.00 kB - - 660.00 kB - - 660.00 kB - size per thread: - - 660.00 kB - - 660.00 kB - - 660.00 kB - - 660.00 kB - total size: - - 660.00 kB - - 1.32 MB - - 1.98 MB - - 2.64 MB - results: - load: - - 27.15 GB/s - - 54.09 GB/s - - 80.61 GB/s - - 106.41 GB/s - copy: - - 43.53 GB/s - - 90.07 GB/s - - 127.73 GB/s - - 171.81 GB/s - update: - - 50.38 GB/s - - 98.47 GB/s - - 147.91 GB/s - - 197.20 GB/s - triad: - - 43.38 GB/s - - 83.72 GB/s - - 124.83 GB/s - - 166.04 GB/s - daxpy: - - 36.29 GB/s - - 71.29 GB/s - - 103.33 GB/s - - 136.48 GB/s - 2: - threads per core: 2 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 2 - - 4 - - 6 - - 8 - size per core: - - 660.00 kB - - 660.00 kB - - 660.00 kB - - 660.00 kB - size per thread: - - 330.00 kB - - 330.00 kB - - 330.00 kB - - 330.00 kB - total size: - - 660.00 kB - - 1.32 MB - - 1.98 MB - - 2.64 MB - results: - load: - - 35.29 GB/s - - 70.28 GB/s - - 104.67 GB/s - - 139.63 GB/s - copy: - - 42.23 GB/s - - 83.70 GB/s - - 124.33 GB/s - - 167.50 GB/s - update: - - 50.09 GB/s - - 99.77 GB/s - - 149.87 GB/s - - 198.82 GB/s - triad: - - 52.38 GB/s - - 100.00 GB/s - - 147.40 GB/s - - 193.31 GB/s - daxpy: - - 41.14 GB/s - - 80.22 GB/s - - 116.23 GB/s - - 155.08 GB/s - L3: - 1: - threads per core: 1 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 1 - - 2 - - 3 - - 4 - size per core: - - 10.56 MB - - 5.28 MB - - 3.52 MB - - 2.64 MB - size per thread: - - 10.56 MB - - 5.28 MB - - 3.52 MB - - 2.64 MB - total size: - - 10.56 MB - - 10.56 MB - - 10.56 MB - - 10.56 MB - results: - load: - - 22.40 GB/s - - 44.77 GB/s - - 65.71 GB/s - - 89.26 GB/s - copy: - - 25.32 GB/s - - 49.70 GB/s - - 72.89 GB/s - - 98.62 GB/s - update: - - 41.24 GB/s - - 81.14 GB/s - - 122.22 GB/s - - 166.44 GB/s - triad: - - 25.61 GB/s - - 50.02 GB/s - - 73.23 GB/s - - 98.95 GB/s - daxpy: - - 32.07 GB/s - - 62.65 GB/s - - 89.91 GB/s - - 120.65 GB/s - 2: - threads per core: 2 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 2 - - 4 - - 6 - - 8 - size per core: - - 10.56 MB - - 5.28 MB - - 3.52 MB - - 2.64 MB - size per thread: - - 5.28 MB - - 2.64 MB - - 1.76 MB - - 1.32 MB - total size: - - 10.56 MB - - 10.56 MB - - 10.56 MB - - 10.56 MB - results: - load: - - 26.18 GB/s - - 51.85 GB/s - - 75.82 GB/s - - 101.39 GB/s - copy: - - 26.22 GB/s - - 51.83 GB/s - - 76.40 GB/s - - 102.84 GB/s - update: - - 43.51 GB/s - - 86.75 GB/s - - 129.86 GB/s - - 174.54 GB/s - triad: - - 26.39 GB/s - - 51.80 GB/s - - 76.27 GB/s - - 102.66 GB/s - daxpy: - - 37.43 GB/s - - 73.16 GB/s - - 106.53 GB/s - - 142.76 GB/s - MEM: - 1: - threads per core: 1 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 1 - - 2 - - 3 - - 4 - size per core: - - 240.00 MB - - 120.00 MB - - 80.00 MB - - 60.00 MB - size per thread: - - 240.00 MB - - 120.00 MB - - 80.00 MB - - 60.00 MB - total size: - - 240.00 MB - - 240.00 MB - - 240.00 MB - - 240.00 MB - results: - load: - - 12.03 GB/s - - 24.38 GB/s - - 34.83 GB/s - - 45.05 GB/s - copy: - - 12.32 GB/s - - 24.40 GB/s - - 32.82 GB/s - - 37.00 GB/s - update: - - 20.83 GB/s - - 40.25 GB/s - - 48.81 GB/s - - 54.84 GB/s - triad: - - 11.64 GB/s - - 23.17 GB/s - - 34.78 GB/s - - 42.97 GB/s - daxpy: - - 17.69 GB/s - - 34.02 GB/s - - 48.12 GB/s - - 55.73 GB/s - 2: - threads per core: 2 - cores: - - 1 - - 2 - - 3 - - 4 - threads: - - 2 - - 4 - - 6 - - 8 - size per core: - - 240.00 MB - - 120.00 MB - - 80.00 MB - - 60.00 MB - size per thread: - - 120.00 MB - - 60.00 MB - - 40.00 MB - - 30.00 MB - total size: - - 240.00 MB - - 240.00 MB - - 240.00 MB - - 240.00 MB - results: - load: - - 15.33 GB/s - - 28.32 GB/s - - 41.34 GB/s - - 53.02 GB/s - copy: - - 13.96 GB/s - - 26.61 GB/s - - 34.39 GB/s - - 38.96 GB/s - update: - - 26.47 GB/s - - 47.82 GB/s - - 56.70 GB/s - - 62.78 GB/s - triad: - - 14.42 GB/s - - 26.66 GB/s - - 36.94 GB/s - - 44.01 GB/s - daxpy: - - 20.96 GB/s - - 39.12 GB/s - - 51.55 GB/s - - 58.37 GB/s diff --git a/pystencils_tests/benchmark/benchmark.py b/pystencils_tests/benchmark/benchmark.py deleted file mode 100644 index 29066127c..000000000 --- a/pystencils_tests/benchmark/benchmark.py +++ /dev/null @@ -1,188 +0,0 @@ -import math -import os -import time - -import numpy as np -import sympy as sp -from git import Repo -from influxdb import InfluxDBClient -from kerncraft.machinemodel import MachineModel -from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA -from kerncraft.prefixedunit import PrefixedUnit - -from pystencils import Assignment, Field, create_kernel -from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel - - -def output_benchmark(analysis): - output = {} - keys = ['Runtime (per repetition) [s]', 'Iterations per repetition', - 'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]', - 'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]'] - copies = {key: analysis[key] for key in keys} - output.update(copies) - - for cache, metrics in analysis['data transfers'].items(): - for metric_name, metric_value in metrics.items(): - fixed = metric_value.with_prefix('') - output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value - - for level, value in analysis['ECM'].items(): - output['Phenomenological ECM ' + level + ' cy/CL'] = value - return output - - -def output_ecm(analysis): - output = {} - keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops'] - copies = {key: analysis[key] for key in keys} - output.update(copies) - - if 'memory bandwidth kernel' in analysis: - output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix + - analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value - - output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1 - - for key, value in analysis['cycles']: - output[key] = value - return output - - -def output_roofline(analysis): - output = {} - keys = ['min performance'] # 'bottleneck level' - copies = {key: analysis[key] for key in keys} - output.update(copies) - # TODO save bottleneck information (compute it here) - - # fixed = analysis['max_flops'].with_prefix('G') - # output['max GFlop/s'] = fixed.value - - # if analysis['min performance'] > max_flops: - # # CPU bound - # print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file) - # print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file) - # else: - # Memory bound - bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']] - output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value - output['bottleneck level'] = bottleneck['level'] - output['bottleneck bw kernel'] = bottleneck['bw kernel'] - output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity'] - - for i, level in enumerate(analysis['mem bottlenecks']): - if level is None: - continue - for key, value in level.items(): - if isinstance(value, PrefixedUnit): - fixed = value.with_prefix('G') - output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance( - fixed.value, float) and math.isinf(fixed.value) else fixed.value - else: - output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf( - value) else value - return output - - -def output_roofline_iaca(analysis): - output = {} - keys = ['min performance'] # 'bottleneck level' - copies = {key: analysis[key] for key in keys} - # output.update(copies) - # TODO save bottleneck information (compute it here) - - # fixed = analysis['max_flops'].with_prefix('G') - # output['max GFlop/s'] = fixed.value - - # if analysis['min performance'] > max_flops: - # # CPU bound - # print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file) - # print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file) - # else: - # Memory bound - bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']] - output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value - output['bottleneck level'] = bottleneck['level'] - output['bottleneck bw kernel'] = bottleneck['bw kernel'] - output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity'] - - for i, level in enumerate(analysis['mem bottlenecks']): - if level is None: - continue - for key, value in level.items(): - if isinstance(value, PrefixedUnit): - fixed = value.with_prefix('G') - output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance( - fixed.value, float) and math.isinf(fixed.value) else fixed.value - else: - output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf( - value) else value - return output - - -def report_analysis(ast, models, machine, tags, fields=None): - kernel = PyStencilsKerncraftKernel(ast, machine) - client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils', - 'roggan', 'pystencils') - repo = Repo(search_parent_directories=True) - commit = repo.head.commit - point_time = int(time.time()) - - for model in models: - benchmark = model(kernel, machine, KerncraftParameters()) - benchmark.analyze() - analysis = benchmark.results - if model is Benchmark: - output = output_benchmark(analysis) - elif model is ECM: - output = output_ecm(analysis) - elif model is Roofline: - output = output_roofline(analysis) - elif model is RooflineIACA: - output = output_roofline_iaca(analysis) - else: - raise ValueError('No valid model for analysis given!') - - if fields is not None: - output.update(fields) - - output['commit'] = commit.hexsha - - json_body = [ - { - 'measurement': model.__name__, - 'tags': tags, - 'time': point_time, - 'fields': output - } - ] - client.write_points(json_body, time_precision='s') - - -def main(): - size = [20, 200, 200] - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=0) - b = Field.create_from_numpy_array('b', arr, index_dimensions=0) - s = sp.Symbol("s") - rhs = a[0, -1, 0] + a[0, 1, 0] + \ - a[-1, 0, 0] + a[1, 0, 0] + \ - a[0, 0, -1] + a[0, 0, 1] - - update_rule = Assignment(b[0, 0, 0], s * rhs) - ast = create_kernel([update_rule]) - input_folder = "./" - machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml") - machine = MachineModel(path_to_yaml=machine_file_path) - tags = { - 'host': os.uname()[1], - 'project': 'pystencils', - 'kernel': 'jacobi_3D ' + str(size) - } - - report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags) - - -if __name__ == '__main__': - main() diff --git a/pystencils_tests/benchmark/generate.py b/pystencils_tests/benchmark/generate.py deleted file mode 100644 index fba398489..000000000 --- a/pystencils_tests/benchmark/generate.py +++ /dev/null @@ -1,50 +0,0 @@ -import numpy as np -import sympy as sp - -from pystencils import Assignment, Field, create_kernel - - -def meassure(): - size = [30, 50, 3] - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=1) - b = Field.create_from_numpy_array('b', arr, index_dimensions=1) - s = sp.Symbol("s") - rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0] - updateRule = Assignment(b[0, 0], s * rhs) - print(updateRule) - - ast = create_kernel([updateRule]) - - # benchmark = generate_benchmark(ast) - # main = benchmark[0] - # kernel = benchmark[1] - # with open('src/main.cpp', 'w') as file: - # file.write(main) - # with open('src/kernel.cpp', 'w') as file: - # file.write(kernel) - - func = ast.compile({'omega': 2/3}) - - from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark - from pystencils.kerncraft_coupling import BenchmarkAnalysis - from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters - from kerncraft.machinemodel import MachineModel - from kerncraft.models import ECMData - - - machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml" - machine = MachineModel(path_to_yaml=machineFilePath) - - - benchmark = BenchmarkAnalysis(ast, machine) - #TODO what do i want to do with benchmark? - - kernel = PyStencilsKerncraftKernel(ast) - model = ECMData(kernel, machine, KerncraftParameters()) - model.analyze() - model.report() - - -if __name__ == "__main__": - meassure() diff --git a/pystencils_tests/benchmark/iacaMarks.h b/pystencils_tests/benchmark/iacaMarks.h deleted file mode 100644 index be1973eb2..000000000 --- a/pystencils_tests/benchmark/iacaMarks.h +++ /dev/null @@ -1,53 +0,0 @@ -/* -* Copyright (2008-2009) Intel Corporation All Rights Reserved. -* The source code contained or described herein and all documents -* related to the source code ("Material") are owned by Intel Corporation -* or its suppliers or licensors. Title to the Material remains with -* Intel Corporation or its suppliers and licensors. The Material -* contains trade secrets and proprietary and confidential information -* of Intel or its suppliers and licensors. The Material is protected -* by worldwide copyright and trade secret laws and treaty provisions. -* No part of the Material may be used, copied, reproduced, modified, -* published, uploaded, posted, transmitted, distributed, or disclosed -* in any way without Intel(R)s prior express written permission. -* -* No license under any patent, copyright, trade secret or other -* intellectual property right is granted to or conferred upon you by -* disclosure or delivery of the Materials, either expressly, by implication, -* inducement, estoppel or otherwise. Any license under such intellectual -* property rights must be express and approved by Intel in writing. -*/ - -#if defined (__GNUC__) -#define IACA_SSC_MARK( MARK_ID ) \ -__asm__ __volatile__ ( \ - "\n\t movl $"#MARK_ID", %%ebx" \ - "\n\t .byte 0x64, 0x67, 0x90" \ - : : : "memory" ); - -#else -#define IACA_SSC_MARK(x) {__asm mov ebx, x\ - __asm _emit 0x64 \ - __asm _emit 0x67 \ - __asm _emit 0x90 } -#endif - -#define IACA_START {IACA_SSC_MARK(111)} -#define IACA_END {IACA_SSC_MARK(222)} - -#ifdef _WIN64 -#include <intrin.h> -#define IACA_VC64_START __writegsbyte(111, 111); -#define IACA_VC64_END __writegsbyte(222, 222); -#endif - -/**************** asm ***************** -;START_MARKER -mov ebx, 111 -db 0x64, 0x67, 0x90 - -;END_MARKER -mov ebx, 222 -db 0x64, 0x67, 0x90 - -**************************************/ diff --git a/pystencils_tests/benchmark/main.c b/pystencils_tests/benchmark/main.c deleted file mode 100644 index 6bd57f91c..000000000 --- a/pystencils_tests/benchmark/main.c +++ /dev/null @@ -1,11 +0,0 @@ -#include "iacaMarks.h" - -int main(int argc, char * argv[]){ - int a = 0; - for(int i = 0; i < argc+100000; i++){ - IACA_START - a += i; - } - IACA_END - return a; -} diff --git a/pystencils_tests/kerncraft_inputs/2d-5pt.c b/pystencils_tests/kerncraft_inputs/2d-5pt.c deleted file mode 100644 index 0f2b99cf2..000000000 --- a/pystencils_tests/kerncraft_inputs/2d-5pt.c +++ /dev/null @@ -1,8 +0,0 @@ -double a[30][50][3]; -double b[30][50][3]; -double s; - -for(int j=1; j<30-1; ++j) - for(int i=1; i<50-1; ++i) - b[j][i] = ( a[j][i-1] + a[j][i+1] - + a[j-1][i] + a[j+1][i]) * s; diff --git a/pystencils_tests/kerncraft_inputs/3d-7pt.c b/pystencils_tests/kerncraft_inputs/3d-7pt.c deleted file mode 100644 index 0e9ff901d..000000000 --- a/pystencils_tests/kerncraft_inputs/3d-7pt.c +++ /dev/null @@ -1,10 +0,0 @@ -double a[M][N][N]; -double b[M][N][N]; -double s; - -for(int k=1; k<M-1; ++k) - for(int j=1; j<N-1; ++j) - for(int i=1; i<N-1; ++i) - b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1] - + a[k][j-1][i] + a[k][j+1][i] - + a[k-1][j][i] + a[k+1][j][i]) * s; diff --git a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml deleted file mode 100644 index 890e2e895..000000000 --- a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml +++ /dev/null @@ -1,980 +0,0 @@ -kerncraft version: 0.8.6.dev0 -model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz -model type: Intel Xeon SandyBridge EN/EP processor -clock: 2.7 GHz - -sockets: 2 -cores per socket: 8 -threads per core: 2 -NUMA domains per socket: 1 -cores per NUMA domain: 8 -transparent hugepage: always - -in-core model: !!omap - - IACA: SNB - - OSACA: SNB - - LLVM-MCA: -mcpu=sandybridge -isa: x86 - -FLOPs per cycle: - SP: {total: 16, ADD: 8, MUL: 8} - DP: {total: 8, ADD: 4, MUL: 4} - -compiler: !!omap -- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline -- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding -- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding - -overlapping model: - ports: - IACA: ['0', 0DV, '1', '2', '3', '4', '5'] - OSACA: ['0', 0DV, '1', '2', '3', '4', '5'] - LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5] - performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3]) -non-overlapping model: - ports: - IACA: [2D, 3D] - OSACA: [2D, 3D] - LLVM-MCA: [SBPort23] - performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM - -cacheline size: 64 B -memory hierarchy: -- level: L1 - cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true, - write_back: true, load_from: L2, store_to: L2} - cores per group: 1 - threads per group: 2 - groups: 16 - performance counter metrics: - accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3] - misses: L1D_REPLACEMENT:PMC[0-3] - evicts: L1D_M_EVICT:PMC[0-3] - upstream throughput: [architecture code analyzer, [2D, 3D]] - transfers overlap: false -- level: L2 - cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true, - write_back: true, load_from: L3, store_to: L3} - cores per group: 1 - threads per group: 2 - groups: 16 - upstream throughput: [32 B/cy, half-duplex] - transfers overlap: false - performance counter metrics: - accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3] - misses: L2_LINES_IN_ALL:PMC[0-3] - evicts: L2_TRANS_L2_WB:PMC[0-3] -- level: L3 - cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true, - write_back: true} - cores per group: 8 - threads per group: 16 - groups: 2 - upstream throughput: [32 B/cy, half-duplex] - transfers overlap: false - performance counter metrics: - accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3] - misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01] - + CAS_COUNT_RD:MBOX3C[01]) - evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01] - + CAS_COUNT_WR:MBOX3C[01]) -- level: MEM - cores per group: 8 - upstream throughput: [full socket memory bandwidth, half-duplex] - transfers overlap: false - size per group: - threads per group: 16 - -benchmarks: - kernels: - copy: - FLOPs per iteration: 0 - fastest bench kernel: copy_avx - read streams: {bytes: 8.00 B, streams: 1} - read+write streams: {bytes: 0.00 B, streams: 0} - write streams: {bytes: 8.00 B, streams: 1} - daxpy: - FLOPs per iteration: 2 - fastest bench kernel: daxpy_avx - read streams: {bytes: 16.00 B, streams: 2} - read+write streams: {bytes: 8.00 B, streams: 1} - write streams: {bytes: 8.00 B, streams: 1} - load: - FLOPs per iteration: 0 - fastest bench kernel: load_avx - read streams: {bytes: 8.00 B, streams: 1} - read+write streams: {bytes: 0.00 B, streams: 0} - write streams: {bytes: 0.00 B, streams: 0} - triad: - FLOPs per iteration: 2 - fastest bench kernel: triad_avx - read streams: {bytes: 24.00 B, streams: 3} - read+write streams: {bytes: 0.00 B, streams: 0} - write streams: {bytes: 8.00 B, streams: 1} - update: - FLOPs per iteration: 0 - fastest bench kernel: update_avx - read streams: {bytes: 8.00 B, streams: 1} - read+write streams: {bytes: 8.00 B, streams: 1} - write streams: {bytes: 8.00 B, streams: 1} - measurements: - L1: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96 - GB/s, 578.56 GB/s, 660.60 GB/s] - daxpy: [116.88 GB/s, 233.68 GB/s, 311.60 GB/s, 409.72 GB/s, 509.79 GB/s, - 559.65 GB/s, 612.77 GB/s, 719.71 GB/s] - load: [84.07 GB/s, 168.13 GB/s, 252.21 GB/s, 336.04 GB/s, 420.34 GB/s, 504.02 - GB/s, 588.04 GB/s, 668.37 GB/s] - triad: [100.24 GB/s, 211.57 GB/s, 314.53 GB/s, 392.73 GB/s, 506.87 GB/s, - 589.51 GB/s, 687.28 GB/s, 782.17 GB/s] - update: [84.77 GB/s, 160.10 GB/s, 237.12 GB/s, 312.74 GB/s, 392.54 GB/s, - 465.53 GB/s, 516.02 GB/s, 567.27 GB/s] - size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, - 21.12 kB, 21.12 kB] - size per thread: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 - kB, 21.12 kB, 21.12 kB] - stats: - copy: - - [83.24 GB/s, 83.25 GB/s, 83.26 GB/s, 83.26 GB/s, 83.27 GB/s, 83.26 GB/s, - 83.25 GB/s, 83.23 GB/s, 83.24 GB/s, 83.25 GB/s] - - [166.49 GB/s, 166.47 GB/s, 166.51 GB/s, 166.49 GB/s, 166.48 GB/s, 166.52 - GB/s, 166.51 GB/s, 166.51 GB/s, 166.51 GB/s, 166.50 GB/s] - - [249.78 GB/s, 249.75 GB/s, 249.73 GB/s, 249.72 GB/s, 249.74 GB/s, 249.76 - GB/s, 249.76 GB/s, 249.74 GB/s, 249.73 GB/s, 249.75 GB/s] - - [332.98 GB/s, 327.92 GB/s, 332.30 GB/s, 332.95 GB/s, 333.00 GB/s, 333.01 - GB/s, 332.95 GB/s, 333.00 GB/s, 332.99 GB/s, 333.02 GB/s] - - [416.26 GB/s, 416.23 GB/s, 416.28 GB/s, 416.27 GB/s, 416.23 GB/s, 416.27 - GB/s, 416.34 GB/s, 416.26 GB/s, 416.16 GB/s, 416.23 GB/s] - - [495.84 GB/s, 495.93 GB/s, 495.88 GB/s, 495.91 GB/s, 495.96 GB/s, 495.92 - GB/s, 495.89 GB/s, 495.87 GB/s, 495.96 GB/s, 495.92 GB/s] - - [578.51 GB/s, 578.52 GB/s, 578.39 GB/s, 578.56 GB/s, 578.48 GB/s, 578.44 - GB/s, 578.51 GB/s, 578.48 GB/s, 578.51 GB/s, 578.53 GB/s] - - [422.14 GB/s, 660.55 GB/s, 660.60 GB/s, 660.49 GB/s, 660.52 GB/s, 660.48 - GB/s, 660.56 GB/s, 660.56 GB/s, 660.52 GB/s, 651.64 GB/s] - daxpy: - - [116.87 GB/s, 116.82 GB/s, 116.85 GB/s, 116.84 GB/s, 116.83 GB/s, 116.85 - GB/s, 116.88 GB/s, 116.87 GB/s, 116.86 GB/s, 116.82 GB/s] - - [214.69 GB/s, 229.83 GB/s, 221.16 GB/s, 233.60 GB/s, 232.90 GB/s, 233.68 - GB/s, 207.83 GB/s, 233.65 GB/s, 212.71 GB/s, 214.07 GB/s] - - [282.77 GB/s, 307.63 GB/s, 307.09 GB/s, 310.67 GB/s, 307.50 GB/s, 311.40 - GB/s, 307.06 GB/s, 305.89 GB/s, 311.60 GB/s, 308.47 GB/s] - - [404.96 GB/s, 408.54 GB/s, 395.76 GB/s, 409.72 GB/s, 316.70 GB/s, 408.07 - GB/s, 347.34 GB/s, 406.03 GB/s, 391.75 GB/s, 385.10 GB/s] - - [479.84 GB/s, 509.24 GB/s, 502.60 GB/s, 449.79 GB/s, 402.46 GB/s, 489.18 - GB/s, 491.15 GB/s, 491.20 GB/s, 384.36 GB/s, 509.79 GB/s] - - [515.12 GB/s, 496.21 GB/s, 517.52 GB/s, 540.00 GB/s, 501.82 GB/s, 507.84 - GB/s, 496.71 GB/s, 479.42 GB/s, 559.65 GB/s, 519.55 GB/s] - - [584.86 GB/s, 580.10 GB/s, 583.34 GB/s, 612.77 GB/s, 607.15 GB/s, 607.89 - GB/s, 589.85 GB/s, 609.59 GB/s, 592.86 GB/s, 568.07 GB/s] - - [719.71 GB/s, 660.98 GB/s, 675.88 GB/s, 679.51 GB/s, 696.97 GB/s, 635.23 - GB/s, 644.06 GB/s, 694.74 GB/s, 654.01 GB/s, 656.57 GB/s] - load: - - [84.04 GB/s, 84.06 GB/s, 84.06 GB/s, 84.04 GB/s, 84.05 GB/s, 84.05 GB/s, - 84.07 GB/s, 84.04 GB/s, 84.05 GB/s, 84.06 GB/s] - - [168.09 GB/s, 168.12 GB/s, 168.06 GB/s, 168.11 GB/s, 168.12 GB/s, 168.13 - GB/s, 168.13 GB/s, 168.12 GB/s, 168.10 GB/s, 168.13 GB/s] - - [252.16 GB/s, 252.21 GB/s, 252.07 GB/s, 252.07 GB/s, 252.18 GB/s, 252.16 - GB/s, 252.21 GB/s, 252.20 GB/s, 252.20 GB/s, 252.17 GB/s] - - [335.94 GB/s, 336.03 GB/s, 335.99 GB/s, 336.04 GB/s, 336.00 GB/s, 335.98 - GB/s, 335.97 GB/s, 335.89 GB/s, 335.99 GB/s, 336.03 GB/s] - - [420.30 GB/s, 420.18 GB/s, 420.30 GB/s, 420.33 GB/s, 420.25 GB/s, 420.28 - GB/s, 420.31 GB/s, 420.31 GB/s, 420.34 GB/s, 420.33 GB/s] - - [503.98 GB/s, 503.99 GB/s, 503.97 GB/s, 503.98 GB/s, 504.02 GB/s, 503.99 - GB/s, 503.92 GB/s, 503.98 GB/s, 503.94 GB/s, 503.97 GB/s] - - [587.93 GB/s, 588.01 GB/s, 588.04 GB/s, 587.94 GB/s, 587.97 GB/s, 588.01 - GB/s, 588.00 GB/s, 587.92 GB/s, 588.04 GB/s, 588.02 GB/s] - - [668.21 GB/s, 668.22 GB/s, 668.29 GB/s, 668.24 GB/s, 668.27 GB/s, 668.37 - GB/s, 668.28 GB/s, 668.14 GB/s, 668.19 GB/s, 668.19 GB/s] - triad: - - [100.00 GB/s, 99.71 GB/s, 99.74 GB/s, 100.24 GB/s, 99.72 GB/s, 99.62 GB/s, - 99.54 GB/s, 99.61 GB/s, 99.72 GB/s, 99.71 GB/s] - - [208.08 GB/s, 210.33 GB/s, 211.57 GB/s, 208.34 GB/s, 210.03 GB/s, 209.16 - GB/s, 210.21 GB/s, 209.48 GB/s, 210.03 GB/s, 208.80 GB/s] - - [311.43 GB/s, 311.08 GB/s, 311.41 GB/s, 311.10 GB/s, 313.13 GB/s, 314.53 - GB/s, 311.59 GB/s, 311.80 GB/s, 311.57 GB/s, 311.89 GB/s] - - [391.65 GB/s, 392.34 GB/s, 391.84 GB/s, 392.07 GB/s, 391.96 GB/s, 392.73 - GB/s, 391.66 GB/s, 391.83 GB/s, 392.09 GB/s, 391.88 GB/s] - - [504.20 GB/s, 506.77 GB/s, 503.22 GB/s, 506.74 GB/s, 502.78 GB/s, 506.15 - GB/s, 506.87 GB/s, 502.85 GB/s, 505.82 GB/s, 506.57 GB/s] - - [587.75 GB/s, 589.51 GB/s, 588.01 GB/s, 587.29 GB/s, 588.04 GB/s, 587.92 - GB/s, 588.08 GB/s, 587.94 GB/s, 587.82 GB/s, 587.55 GB/s] - - [686.03 GB/s, 685.97 GB/s, 685.01 GB/s, 685.88 GB/s, 685.61 GB/s, 687.12 - GB/s, 684.97 GB/s, 686.09 GB/s, 685.81 GB/s, 687.28 GB/s] - - [782.05 GB/s, 781.73 GB/s, 781.13 GB/s, 781.87 GB/s, 782.17 GB/s, 781.24 - GB/s, 781.82 GB/s, 781.92 GB/s, 781.90 GB/s, 781.66 GB/s] - update: - - [84.76 GB/s, 84.76 GB/s, 84.77 GB/s, 84.75 GB/s, 84.75 GB/s, 84.75 GB/s, - 84.75 GB/s, 84.75 GB/s, 84.74 GB/s, 57.21 GB/s] - - [157.73 GB/s, 155.29 GB/s, 147.91 GB/s, 160.10 GB/s, 156.33 GB/s, 158.06 - GB/s, 159.23 GB/s, 156.16 GB/s, 155.30 GB/s, 159.15 GB/s] - - [232.07 GB/s, 230.40 GB/s, 234.05 GB/s, 232.69 GB/s, 215.80 GB/s, 232.76 - GB/s, 236.01 GB/s, 237.12 GB/s, 234.66 GB/s, 234.86 GB/s] - - [303.60 GB/s, 304.21 GB/s, 306.83 GB/s, 309.43 GB/s, 312.69 GB/s, 311.75 - GB/s, 301.74 GB/s, 307.54 GB/s, 312.74 GB/s, 312.19 GB/s] - - [386.45 GB/s, 382.41 GB/s, 387.87 GB/s, 392.54 GB/s, 369.42 GB/s, 341.87 - GB/s, 352.85 GB/s, 390.87 GB/s, 382.44 GB/s, 383.50 GB/s] - - [459.60 GB/s, 384.27 GB/s, 437.39 GB/s, 459.42 GB/s, 465.53 GB/s, 447.31 - GB/s, 440.00 GB/s, 409.94 GB/s, 412.94 GB/s, 446.74 GB/s] - - [489.85 GB/s, 489.35 GB/s, 435.92 GB/s, 492.39 GB/s, 446.44 GB/s, 501.71 - GB/s, 516.02 GB/s, 478.87 GB/s, 494.52 GB/s, 493.04 GB/s] - - [521.08 GB/s, 553.73 GB/s, 541.34 GB/s, 527.75 GB/s, 554.87 GB/s, 536.30 - GB/s, 540.66 GB/s, 551.02 GB/s, 567.27 GB/s, 565.31 GB/s] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB, - 147.84 kB, 168.96 kB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [80.41 GB/s, 160.83 GB/s, 240.43 GB/s, 320.63 GB/s, 401.66 GB/s, 454.32 - GB/s, 539.77 GB/s, 628.51 GB/s] - daxpy: [95.87 GB/s, 187.75 GB/s, 270.68 GB/s, 371.80 GB/s, 454.05 GB/s, - 503.46 GB/s, 606.85 GB/s, 689.34 GB/s] - load: [82.30 GB/s, 164.06 GB/s, 244.78 GB/s, 326.21 GB/s, 408.56 GB/s, 490.13 - GB/s, 569.95 GB/s, 651.79 GB/s] - triad: [93.22 GB/s, 186.75 GB/s, 288.55 GB/s, 340.91 GB/s, 442.20 GB/s, - 534.62 GB/s, 597.98 GB/s, 707.54 GB/s] - update: [83.25 GB/s, 166.04 GB/s, 248.21 GB/s, 330.58 GB/s, 414.71 GB/s, - 496.97 GB/s, 578.67 GB/s, 656.56 GB/s] - size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, - 21.12 kB, 21.12 kB] - size per thread: [10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 - kB, 10.56 kB, 10.56 kB] - stats: - copy: - - [80.37 GB/s, 79.07 GB/s, 80.39 GB/s, 80.39 GB/s, 80.41 GB/s, 80.29 GB/s, - 80.36 GB/s, 79.05 GB/s, 77.87 GB/s, 80.37 GB/s] - - [160.76 GB/s, 160.63 GB/s, 160.76 GB/s, 160.71 GB/s, 160.80 GB/s, 160.74 - GB/s, 160.83 GB/s, 160.69 GB/s, 160.79 GB/s, 160.78 GB/s] - - [240.43 GB/s, 240.20 GB/s, 240.36 GB/s, 240.37 GB/s, 237.17 GB/s, 240.39 - GB/s, 240.14 GB/s, 240.24 GB/s, 240.26 GB/s, 240.10 GB/s] - - [320.46 GB/s, 320.47 GB/s, 320.63 GB/s, 320.52 GB/s, 320.40 GB/s, 320.40 - GB/s, 320.51 GB/s, 320.46 GB/s, 319.72 GB/s, 320.44 GB/s] - - [401.40 GB/s, 399.28 GB/s, 401.66 GB/s, 401.53 GB/s, 401.52 GB/s, 401.55 - GB/s, 401.60 GB/s, 401.47 GB/s, 401.47 GB/s, 401.35 GB/s] - - [447.24 GB/s, 453.65 GB/s, 453.54 GB/s, 453.86 GB/s, 453.82 GB/s, 453.62 - GB/s, 453.48 GB/s, 454.32 GB/s, 453.86 GB/s, 446.79 GB/s] - - [538.79 GB/s, 538.47 GB/s, 539.02 GB/s, 538.25 GB/s, 538.72 GB/s, 538.89 - GB/s, 539.37 GB/s, 539.41 GB/s, 539.77 GB/s, 538.49 GB/s] - - [628.14 GB/s, 618.54 GB/s, 628.12 GB/s, 623.90 GB/s, 628.27 GB/s, 623.78 - GB/s, 618.17 GB/s, 623.43 GB/s, 628.51 GB/s, 628.43 GB/s] - daxpy: - - [95.77 GB/s, 93.25 GB/s, 92.87 GB/s, 95.87 GB/s, 95.84 GB/s, 95.81 GB/s, - 95.80 GB/s, 94.99 GB/s, 95.81 GB/s, 95.86 GB/s] - - [184.53 GB/s, 186.60 GB/s, 183.99 GB/s, 187.48 GB/s, 187.75 GB/s, 181.53 - GB/s, 183.82 GB/s, 187.75 GB/s, 184.13 GB/s, 180.61 GB/s] - - [258.46 GB/s, 270.13 GB/s, 264.76 GB/s, 262.23 GB/s, 265.05 GB/s, 267.25 - GB/s, 270.68 GB/s, 268.08 GB/s, 266.20 GB/s, 265.66 GB/s] - - [367.99 GB/s, 367.15 GB/s, 361.68 GB/s, 364.86 GB/s, 368.76 GB/s, 363.27 - GB/s, 364.95 GB/s, 366.97 GB/s, 371.80 GB/s, 366.55 GB/s] - - [441.95 GB/s, 442.77 GB/s, 444.97 GB/s, 454.05 GB/s, 441.02 GB/s, 445.96 - GB/s, 442.49 GB/s, 440.23 GB/s, 449.29 GB/s, 452.66 GB/s] - - [501.31 GB/s, 489.91 GB/s, 495.43 GB/s, 503.39 GB/s, 488.03 GB/s, 497.71 - GB/s, 503.46 GB/s, 496.85 GB/s, 497.38 GB/s, 468.90 GB/s] - - [604.57 GB/s, 580.51 GB/s, 587.67 GB/s, 594.32 GB/s, 561.32 GB/s, 588.09 - GB/s, 606.85 GB/s, 600.91 GB/s, 599.40 GB/s, 598.24 GB/s] - - [646.48 GB/s, 655.06 GB/s, 684.70 GB/s, 653.61 GB/s, 671.61 GB/s, 689.34 - GB/s, 673.74 GB/s, 685.49 GB/s, 681.48 GB/s, 683.23 GB/s] - load: - - [82.19 GB/s, 82.08 GB/s, 82.22 GB/s, 82.10 GB/s, 82.14 GB/s, 82.17 GB/s, - 82.22 GB/s, 82.28 GB/s, 82.30 GB/s, 81.98 GB/s] - - [163.22 GB/s, 163.43 GB/s, 164.06 GB/s, 164.03 GB/s, 163.19 GB/s, 163.83 - GB/s, 163.29 GB/s, 163.88 GB/s, 163.83 GB/s, 163.11 GB/s] - - [244.32 GB/s, 244.47 GB/s, 244.65 GB/s, 244.29 GB/s, 243.96 GB/s, 244.50 - GB/s, 244.78 GB/s, 244.52 GB/s, 244.48 GB/s, 244.72 GB/s] - - [325.18 GB/s, 326.21 GB/s, 325.49 GB/s, 325.86 GB/s, 325.73 GB/s, 325.72 - GB/s, 326.00 GB/s, 325.41 GB/s, 325.63 GB/s, 325.82 GB/s] - - [407.81 GB/s, 407.96 GB/s, 407.59 GB/s, 408.56 GB/s, 407.64 GB/s, 407.61 - GB/s, 408.09 GB/s, 407.95 GB/s, 408.30 GB/s, 408.32 GB/s] - - [488.65 GB/s, 489.73 GB/s, 489.38 GB/s, 489.81 GB/s, 490.13 GB/s, 489.31 - GB/s, 488.74 GB/s, 489.38 GB/s, 488.17 GB/s, 489.51 GB/s] - - [569.95 GB/s, 567.21 GB/s, 566.08 GB/s, 567.88 GB/s, 567.69 GB/s, 569.58 - GB/s, 568.61 GB/s, 568.35 GB/s, 569.70 GB/s, 568.87 GB/s] - - [650.43 GB/s, 651.58 GB/s, 650.86 GB/s, 651.34 GB/s, 651.04 GB/s, 651.79 - GB/s, 650.28 GB/s, 650.31 GB/s, 650.81 GB/s, 651.09 GB/s] - triad: - - [93.22 GB/s, 90.73 GB/s, 92.48 GB/s, 92.53 GB/s, 92.37 GB/s, 92.50 GB/s, - 92.48 GB/s, 90.28 GB/s, 92.35 GB/s, 92.51 GB/s] - - [186.75 GB/s, 184.51 GB/s, 184.17 GB/s, 186.66 GB/s, 186.43 GB/s, 184.59 - GB/s, 186.71 GB/s, 186.30 GB/s, 186.64 GB/s, 186.12 GB/s] - - [287.77 GB/s, 288.55 GB/s, 287.76 GB/s, 287.76 GB/s, 288.19 GB/s, 287.70 - GB/s, 287.42 GB/s, 288.12 GB/s, 287.66 GB/s, 288.01 GB/s] - - [339.82 GB/s, 338.95 GB/s, 340.11 GB/s, 340.11 GB/s, 340.25 GB/s, 340.20 - GB/s, 339.90 GB/s, 340.22 GB/s, 340.91 GB/s, 340.01 GB/s] - - [440.41 GB/s, 440.65 GB/s, 441.59 GB/s, 442.20 GB/s, 441.67 GB/s, 432.59 - GB/s, 440.20 GB/s, 440.81 GB/s, 440.24 GB/s, 441.38 GB/s] - - [534.30 GB/s, 527.60 GB/s, 528.52 GB/s, 509.55 GB/s, 527.68 GB/s, 527.63 - GB/s, 533.66 GB/s, 534.62 GB/s, 534.60 GB/s, 534.19 GB/s] - - [595.90 GB/s, 595.94 GB/s, 597.91 GB/s, 580.22 GB/s, 597.98 GB/s, 597.66 - GB/s, 596.16 GB/s, 567.03 GB/s, 580.88 GB/s, 578.29 GB/s] - - [703.80 GB/s, 705.57 GB/s, 694.84 GB/s, 682.59 GB/s, 694.37 GB/s, 696.56 - GB/s, 704.50 GB/s, 704.95 GB/s, 694.52 GB/s, 707.54 GB/s] - update: - - [83.18 GB/s, 83.24 GB/s, 83.25 GB/s, 83.16 GB/s, 83.22 GB/s, 83.23 GB/s, - 83.22 GB/s, 83.21 GB/s, 83.20 GB/s, 83.17 GB/s] - - [165.65 GB/s, 165.76 GB/s, 165.99 GB/s, 166.04 GB/s, 165.49 GB/s, 165.87 - GB/s, 165.58 GB/s, 165.96 GB/s, 165.67 GB/s, 165.66 GB/s] - - [247.30 GB/s, 248.14 GB/s, 247.84 GB/s, 247.90 GB/s, 247.77 GB/s, 247.60 - GB/s, 248.21 GB/s, 247.95 GB/s, 248.05 GB/s, 247.83 GB/s] - - [330.49 GB/s, 330.07 GB/s, 329.91 GB/s, 329.90 GB/s, 330.58 GB/s, 329.30 - GB/s, 329.92 GB/s, 330.03 GB/s, 330.04 GB/s, 330.12 GB/s] - - [413.89 GB/s, 414.04 GB/s, 413.56 GB/s, 414.06 GB/s, 414.15 GB/s, 413.94 - GB/s, 414.04 GB/s, 414.71 GB/s, 414.32 GB/s, 413.93 GB/s] - - [496.97 GB/s, 496.80 GB/s, 496.17 GB/s, 495.42 GB/s, 496.17 GB/s, 496.66 - GB/s, 495.55 GB/s, 496.27 GB/s, 495.52 GB/s, 496.80 GB/s] - - [564.44 GB/s, 577.86 GB/s, 574.38 GB/s, 571.96 GB/s, 564.76 GB/s, 578.67 - GB/s, 565.89 GB/s, 572.49 GB/s, 571.80 GB/s, 572.01 GB/s] - - [647.68 GB/s, 656.56 GB/s, 655.56 GB/s, 644.04 GB/s, 655.30 GB/s, 648.80 - GB/s, 654.77 GB/s, 653.58 GB/s, 656.27 GB/s, 653.79 GB/s] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB, - 147.84 kB, 168.96 kB] - L2: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [36.74 GB/s, 73.65 GB/s, 107.11 GB/s, 141.43 GB/s, 179.70 GB/s, 215.63 - GB/s, 247.20 GB/s, 282.42 GB/s] - daxpy: [44.59 GB/s, 88.24 GB/s, 132.21 GB/s, 175.78 GB/s, 219.11 GB/s, 259.95 - GB/s, 305.84 GB/s, 346.83 GB/s] - load: [31.46 GB/s, 62.97 GB/s, 93.73 GB/s, 125.46 GB/s, 157.32 GB/s, 183.63 - GB/s, 214.02 GB/s, 245.17 GB/s] - triad: [37.79 GB/s, 75.08 GB/s, 111.43 GB/s, 148.90 GB/s, 185.54 GB/s, 223.72 - GB/s, 258.53 GB/s, 299.32 GB/s] - update: [48.46 GB/s, 96.10 GB/s, 141.97 GB/s, 189.18 GB/s, 234.73 GB/s, - 280.47 GB/s, 330.94 GB/s, 365.43 GB/s] - size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 - kB, 168.96 kB, 168.96 kB] - size per thread: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 - kB, 168.96 kB, 168.96 kB] - stats: - copy: - - [36.38 GB/s, 36.59 GB/s, 36.18 GB/s, 36.57 GB/s, 36.26 GB/s, 34.61 GB/s, - 35.96 GB/s, 35.84 GB/s, 36.74 GB/s, 36.53 GB/s] - - [68.97 GB/s, 70.42 GB/s, 69.88 GB/s, 71.40 GB/s, 69.05 GB/s, 72.46 GB/s, - 70.32 GB/s, 73.65 GB/s, 72.14 GB/s, 69.81 GB/s] - - [107.08 GB/s, 103.53 GB/s, 107.11 GB/s, 103.66 GB/s, 103.88 GB/s, 106.48 - GB/s, 97.32 GB/s, 105.92 GB/s, 104.16 GB/s, 104.84 GB/s] - - [138.97 GB/s, 136.86 GB/s, 140.88 GB/s, 138.96 GB/s, 140.58 GB/s, 138.51 - GB/s, 141.43 GB/s, 139.53 GB/s, 141.20 GB/s, 139.43 GB/s] - - [158.20 GB/s, 171.06 GB/s, 179.70 GB/s, 171.43 GB/s, 174.27 GB/s, 175.01 - GB/s, 165.20 GB/s, 170.89 GB/s, 173.01 GB/s, 175.17 GB/s] - - [209.74 GB/s, 204.59 GB/s, 215.27 GB/s, 215.63 GB/s, 210.59 GB/s, 206.94 - GB/s, 211.03 GB/s, 201.61 GB/s, 214.45 GB/s, 208.15 GB/s] - - [241.38 GB/s, 246.88 GB/s, 246.90 GB/s, 247.20 GB/s, 235.27 GB/s, 227.39 - GB/s, 239.48 GB/s, 244.45 GB/s, 246.68 GB/s, 235.87 GB/s] - - [271.07 GB/s, 282.42 GB/s, 282.38 GB/s, 276.20 GB/s, 269.85 GB/s, 276.96 - GB/s, 268.64 GB/s, 269.61 GB/s, 279.68 GB/s, 280.63 GB/s] - daxpy: - - [44.54 GB/s, 44.59 GB/s, 44.50 GB/s, 44.42 GB/s, 44.41 GB/s, 44.06 GB/s, - 43.39 GB/s, 44.02 GB/s, 44.34 GB/s, 44.28 GB/s] - - [85.35 GB/s, 87.05 GB/s, 86.47 GB/s, 86.90 GB/s, 86.92 GB/s, 88.24 GB/s, - 87.39 GB/s, 87.60 GB/s, 87.55 GB/s, 84.19 GB/s] - - [129.21 GB/s, 130.47 GB/s, 123.29 GB/s, 127.92 GB/s, 132.21 GB/s, 128.37 - GB/s, 127.09 GB/s, 128.72 GB/s, 129.34 GB/s, 128.69 GB/s] - - [171.53 GB/s, 169.64 GB/s, 173.92 GB/s, 173.74 GB/s, 168.53 GB/s, 171.54 - GB/s, 173.96 GB/s, 175.78 GB/s, 171.29 GB/s, 171.33 GB/s] - - [219.11 GB/s, 208.86 GB/s, 211.66 GB/s, 216.47 GB/s, 212.73 GB/s, 204.90 - GB/s, 208.87 GB/s, 215.75 GB/s, 213.61 GB/s, 214.56 GB/s] - - [250.69 GB/s, 241.36 GB/s, 255.22 GB/s, 250.29 GB/s, 253.80 GB/s, 256.34 - GB/s, 254.38 GB/s, 259.95 GB/s, 245.69 GB/s, 259.12 GB/s] - - [296.08 GB/s, 301.77 GB/s, 297.40 GB/s, 305.84 GB/s, 288.62 GB/s, 283.76 - GB/s, 293.61 GB/s, 291.93 GB/s, 299.74 GB/s, 289.76 GB/s] - - [344.46 GB/s, 334.36 GB/s, 339.31 GB/s, 330.88 GB/s, 343.26 GB/s, 327.28 - GB/s, 344.53 GB/s, 346.83 GB/s, 344.29 GB/s, 346.28 GB/s] - load: - - [31.40 GB/s, 31.23 GB/s, 31.29 GB/s, 31.24 GB/s, 31.46 GB/s, 31.20 GB/s, - 31.33 GB/s, 30.01 GB/s, 30.08 GB/s, 31.40 GB/s] - - [61.20 GB/s, 60.74 GB/s, 61.93 GB/s, 61.22 GB/s, 61.20 GB/s, 60.03 GB/s, - 59.33 GB/s, 59.94 GB/s, 58.54 GB/s, 62.97 GB/s] - - [91.53 GB/s, 93.73 GB/s, 93.05 GB/s, 90.07 GB/s, 91.60 GB/s, 90.11 GB/s, - 90.21 GB/s, 90.43 GB/s, 89.15 GB/s, 93.10 GB/s] - - [122.80 GB/s, 116.57 GB/s, 120.68 GB/s, 122.54 GB/s, 122.75 GB/s, 121.79 - GB/s, 125.30 GB/s, 125.46 GB/s, 122.28 GB/s, 124.51 GB/s] - - [151.01 GB/s, 151.10 GB/s, 148.68 GB/s, 151.17 GB/s, 147.24 GB/s, 153.65 - GB/s, 146.48 GB/s, 150.48 GB/s, 150.74 GB/s, 157.32 GB/s] - - [181.52 GB/s, 173.89 GB/s, 181.58 GB/s, 174.01 GB/s, 176.40 GB/s, 179.73 - GB/s, 174.06 GB/s, 181.26 GB/s, 180.57 GB/s, 183.63 GB/s] - - [214.02 GB/s, 205.69 GB/s, 207.64 GB/s, 204.18 GB/s, 208.42 GB/s, 211.39 - GB/s, 206.58 GB/s, 204.90 GB/s, 204.75 GB/s, 208.91 GB/s] - - [232.16 GB/s, 233.90 GB/s, 241.32 GB/s, 237.45 GB/s, 235.41 GB/s, 241.17 - GB/s, 237.52 GB/s, 245.17 GB/s, 241.17 GB/s, 234.08 GB/s] - triad: - - [37.62 GB/s, 37.54 GB/s, 37.79 GB/s, 37.67 GB/s, 37.76 GB/s, 37.77 GB/s, - 37.68 GB/s, 35.83 GB/s, 37.06 GB/s, 37.50 GB/s] - - [72.79 GB/s, 74.76 GB/s, 73.15 GB/s, 74.68 GB/s, 73.88 GB/s, 73.27 GB/s, - 75.08 GB/s, 73.48 GB/s, 71.27 GB/s, 72.05 GB/s] - - [106.26 GB/s, 105.22 GB/s, 109.70 GB/s, 109.07 GB/s, 110.84 GB/s, 111.43 - GB/s, 106.32 GB/s, 109.73 GB/s, 106.22 GB/s, 107.20 GB/s] - - [142.10 GB/s, 148.90 GB/s, 148.11 GB/s, 144.38 GB/s, 144.77 GB/s, 145.42 - GB/s, 147.36 GB/s, 142.94 GB/s, 145.39 GB/s, 139.42 GB/s] - - [182.07 GB/s, 176.75 GB/s, 181.39 GB/s, 183.31 GB/s, 181.87 GB/s, 183.71 - GB/s, 180.48 GB/s, 178.11 GB/s, 181.36 GB/s, 185.54 GB/s] - - [219.85 GB/s, 217.02 GB/s, 218.86 GB/s, 217.09 GB/s, 212.24 GB/s, 212.22 - GB/s, 219.33 GB/s, 208.81 GB/s, 215.84 GB/s, 223.72 GB/s] - - [258.06 GB/s, 232.27 GB/s, 247.04 GB/s, 240.55 GB/s, 236.11 GB/s, 251.88 - GB/s, 258.53 GB/s, 247.32 GB/s, 251.53 GB/s, 245.10 GB/s] - - [273.67 GB/s, 292.81 GB/s, 288.67 GB/s, 289.75 GB/s, 293.98 GB/s, 283.56 - GB/s, 295.33 GB/s, 280.11 GB/s, 299.32 GB/s, 285.18 GB/s] - update: - - [47.30 GB/s, 48.33 GB/s, 48.17 GB/s, 47.38 GB/s, 48.16 GB/s, 46.99 GB/s, - 48.46 GB/s, 47.51 GB/s, 46.20 GB/s, 48.26 GB/s] - - [92.10 GB/s, 92.30 GB/s, 95.73 GB/s, 95.53 GB/s, 86.95 GB/s, 96.10 GB/s, - 94.16 GB/s, 89.72 GB/s, 92.00 GB/s, 93.10 GB/s] - - [137.06 GB/s, 140.40 GB/s, 136.20 GB/s, 139.57 GB/s, 140.69 GB/s, 136.20 - GB/s, 141.53 GB/s, 129.76 GB/s, 136.47 GB/s, 141.97 GB/s] - - [184.84 GB/s, 177.96 GB/s, 178.61 GB/s, 179.03 GB/s, 176.59 GB/s, 180.62 - GB/s, 182.26 GB/s, 182.27 GB/s, 189.18 GB/s, 185.49 GB/s] - - [232.17 GB/s, 217.86 GB/s, 232.40 GB/s, 223.10 GB/s, 228.52 GB/s, 234.73 - GB/s, 232.00 GB/s, 233.14 GB/s, 231.69 GB/s, 225.01 GB/s] - - [276.16 GB/s, 274.80 GB/s, 272.58 GB/s, 272.43 GB/s, 280.47 GB/s, 276.90 - GB/s, 264.76 GB/s, 272.47 GB/s, 277.77 GB/s, 271.42 GB/s] - - [330.94 GB/s, 312.06 GB/s, 312.83 GB/s, 312.62 GB/s, 292.44 GB/s, 315.68 - GB/s, 316.67 GB/s, 321.25 GB/s, 321.71 GB/s, 315.05 GB/s] - - [362.85 GB/s, 356.49 GB/s, 365.43 GB/s, 332.52 GB/s, 354.30 GB/s, 354.68 - GB/s, 335.54 GB/s, 358.54 GB/s, 363.22 GB/s, 360.01 GB/s] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB, - 1.18 MB, 1.35 MB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [36.83 GB/s, 72.70 GB/s, 108.11 GB/s, 142.21 GB/s, 178.07 GB/s, 213.30 - GB/s, 251.98 GB/s, 283.06 GB/s] - daxpy: [45.34 GB/s, 90.11 GB/s, 134.85 GB/s, 180.06 GB/s, 224.22 GB/s, 268.27 - GB/s, 312.15 GB/s, 358.38 GB/s] - load: [33.99 GB/s, 67.65 GB/s, 100.93 GB/s, 134.81 GB/s, 165.89 GB/s, 196.09 - GB/s, 233.31 GB/s, 262.05 GB/s] - triad: [38.60 GB/s, 76.58 GB/s, 114.50 GB/s, 150.54 GB/s, 189.60 GB/s, 227.05 - GB/s, 263.75 GB/s, 301.02 GB/s] - update: [49.25 GB/s, 97.34 GB/s, 146.81 GB/s, 194.71 GB/s, 239.97 GB/s, - 287.14 GB/s, 330.84 GB/s, 384.71 GB/s] - size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 - kB, 168.96 kB, 168.96 kB] - size per thread: [84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 - kB, 84.48 kB, 84.48 kB] - stats: - copy: - - [36.83 GB/s, 36.67 GB/s, 34.90 GB/s, 36.44 GB/s, 35.13 GB/s, 35.07 GB/s, - 35.53 GB/s, 36.15 GB/s, 35.85 GB/s, 36.23 GB/s] - - [71.52 GB/s, 70.16 GB/s, 70.67 GB/s, 71.20 GB/s, 72.70 GB/s, 70.14 GB/s, - 70.53 GB/s, 69.17 GB/s, 71.57 GB/s, 70.22 GB/s] - - [104.39 GB/s, 104.74 GB/s, 103.12 GB/s, 108.11 GB/s, 105.30 GB/s, 102.80 - GB/s, 102.90 GB/s, 107.06 GB/s, 103.45 GB/s, 105.45 GB/s] - - [139.02 GB/s, 134.63 GB/s, 140.72 GB/s, 141.32 GB/s, 140.35 GB/s, 141.19 - GB/s, 135.44 GB/s, 142.21 GB/s, 140.96 GB/s, 142.05 GB/s] - - [177.86 GB/s, 177.74 GB/s, 177.42 GB/s, 175.35 GB/s, 176.42 GB/s, 173.13 - GB/s, 174.32 GB/s, 170.24 GB/s, 178.07 GB/s, 177.88 GB/s] - - [206.27 GB/s, 211.63 GB/s, 209.06 GB/s, 210.54 GB/s, 208.80 GB/s, 209.99 - GB/s, 208.77 GB/s, 206.41 GB/s, 213.30 GB/s, 206.39 GB/s] - - [240.18 GB/s, 238.36 GB/s, 244.16 GB/s, 236.26 GB/s, 244.12 GB/s, 238.49 - GB/s, 242.23 GB/s, 244.46 GB/s, 251.98 GB/s, 242.55 GB/s] - - [279.77 GB/s, 282.91 GB/s, 278.73 GB/s, 276.91 GB/s, 283.06 GB/s, 273.23 - GB/s, 278.33 GB/s, 280.88 GB/s, 277.54 GB/s, 281.83 GB/s] - daxpy: - - [45.32 GB/s, 44.62 GB/s, 45.29 GB/s, 45.18 GB/s, 45.17 GB/s, 45.07 GB/s, - 44.69 GB/s, 45.17 GB/s, 45.11 GB/s, 45.34 GB/s] - - [89.94 GB/s, 89.97 GB/s, 89.37 GB/s, 89.90 GB/s, 88.37 GB/s, 89.13 GB/s, - 90.11 GB/s, 89.67 GB/s, 89.90 GB/s, 89.93 GB/s] - - [134.83 GB/s, 134.85 GB/s, 132.02 GB/s, 134.33 GB/s, 133.82 GB/s, 132.39 - GB/s, 131.67 GB/s, 134.62 GB/s, 132.71 GB/s, 131.67 GB/s] - - [175.52 GB/s, 173.36 GB/s, 176.83 GB/s, 177.98 GB/s, 175.73 GB/s, 173.42 - GB/s, 180.06 GB/s, 179.55 GB/s, 176.71 GB/s, 175.85 GB/s] - - [222.00 GB/s, 216.86 GB/s, 220.17 GB/s, 218.14 GB/s, 220.60 GB/s, 219.43 - GB/s, 220.58 GB/s, 224.22 GB/s, 220.89 GB/s, 222.28 GB/s] - - [258.75 GB/s, 262.88 GB/s, 261.77 GB/s, 268.27 GB/s, 263.66 GB/s, 262.59 - GB/s, 266.54 GB/s, 261.67 GB/s, 262.80 GB/s, 263.72 GB/s] - - [298.65 GB/s, 312.15 GB/s, 308.52 GB/s, 304.22 GB/s, 301.87 GB/s, 305.53 - GB/s, 309.84 GB/s, 310.67 GB/s, 310.49 GB/s, 311.99 GB/s] - - [347.55 GB/s, 350.67 GB/s, 348.93 GB/s, 358.38 GB/s, 352.35 GB/s, 352.05 - GB/s, 353.82 GB/s, 356.00 GB/s, 348.07 GB/s, 349.87 GB/s] - load: - - [33.99 GB/s, 32.54 GB/s, 32.94 GB/s, 33.17 GB/s, 33.83 GB/s, 31.55 GB/s, - 31.91 GB/s, 33.86 GB/s, 33.93 GB/s, 33.75 GB/s] - - [66.22 GB/s, 64.94 GB/s, 67.64 GB/s, 67.52 GB/s, 65.01 GB/s, 67.21 GB/s, - 66.07 GB/s, 66.43 GB/s, 67.65 GB/s, 64.84 GB/s] - - [98.58 GB/s, 97.97 GB/s, 98.39 GB/s, 98.50 GB/s, 98.77 GB/s, 97.84 GB/s, - 99.58 GB/s, 100.93 GB/s, 100.50 GB/s, 99.94 GB/s] - - [130.23 GB/s, 131.10 GB/s, 131.04 GB/s, 127.83 GB/s, 134.81 GB/s, 132.68 - GB/s, 131.80 GB/s, 129.42 GB/s, 130.76 GB/s, 126.96 GB/s] - - [164.90 GB/s, 165.18 GB/s, 161.19 GB/s, 164.33 GB/s, 162.76 GB/s, 165.04 - GB/s, 162.20 GB/s, 165.89 GB/s, 164.34 GB/s, 159.66 GB/s] - - [192.69 GB/s, 193.33 GB/s, 188.88 GB/s, 190.70 GB/s, 194.60 GB/s, 190.92 - GB/s, 191.36 GB/s, 192.89 GB/s, 191.85 GB/s, 196.09 GB/s] - - [227.70 GB/s, 223.95 GB/s, 222.79 GB/s, 227.09 GB/s, 227.04 GB/s, 229.45 - GB/s, 228.09 GB/s, 227.83 GB/s, 233.31 GB/s, 227.49 GB/s] - - [257.94 GB/s, 261.47 GB/s, 262.05 GB/s, 257.70 GB/s, 259.70 GB/s, 259.23 - GB/s, 261.09 GB/s, 253.81 GB/s, 254.21 GB/s, 259.34 GB/s] - triad: - - [38.60 GB/s, 36.68 GB/s, 38.07 GB/s, 38.10 GB/s, 37.89 GB/s, 36.48 GB/s, - 38.33 GB/s, 38.12 GB/s, 37.43 GB/s, 37.87 GB/s] - - [76.58 GB/s, 74.97 GB/s, 75.74 GB/s, 76.02 GB/s, 72.66 GB/s, 74.73 GB/s, - 76.37 GB/s, 76.18 GB/s, 74.59 GB/s, 75.75 GB/s] - - [111.71 GB/s, 114.50 GB/s, 108.96 GB/s, 111.49 GB/s, 111.56 GB/s, 111.66 - GB/s, 113.43 GB/s, 114.37 GB/s, 111.67 GB/s, 108.14 GB/s] - - [146.29 GB/s, 147.84 GB/s, 149.09 GB/s, 149.93 GB/s, 150.54 GB/s, 145.50 - GB/s, 145.16 GB/s, 149.47 GB/s, 146.30 GB/s, 149.32 GB/s] - - [186.73 GB/s, 186.46 GB/s, 180.47 GB/s, 187.32 GB/s, 184.34 GB/s, 187.34 - GB/s, 186.55 GB/s, 183.81 GB/s, 189.60 GB/s, 188.70 GB/s] - - [224.81 GB/s, 219.69 GB/s, 227.05 GB/s, 224.25 GB/s, 223.36 GB/s, 225.86 - GB/s, 216.09 GB/s, 221.98 GB/s, 218.47 GB/s, 226.37 GB/s] - - [263.29 GB/s, 259.28 GB/s, 258.81 GB/s, 258.77 GB/s, 256.56 GB/s, 256.49 - GB/s, 256.39 GB/s, 263.75 GB/s, 262.00 GB/s, 261.48 GB/s] - - [299.28 GB/s, 292.80 GB/s, 293.63 GB/s, 297.93 GB/s, 293.02 GB/s, 295.95 - GB/s, 287.92 GB/s, 301.02 GB/s, 300.76 GB/s, 297.01 GB/s] - update: - - [49.07 GB/s, 47.17 GB/s, 47.56 GB/s, 49.25 GB/s, 46.44 GB/s, 49.04 GB/s, - 48.91 GB/s, 49.20 GB/s, 48.30 GB/s, 48.85 GB/s] - - [96.45 GB/s, 97.11 GB/s, 94.03 GB/s, 92.56 GB/s, 95.39 GB/s, 97.34 GB/s, - 96.06 GB/s, 92.25 GB/s, 95.53 GB/s, 97.08 GB/s] - - [137.54 GB/s, 135.13 GB/s, 145.80 GB/s, 141.29 GB/s, 138.99 GB/s, 143.44 - GB/s, 146.81 GB/s, 142.94 GB/s, 133.84 GB/s, 146.33 GB/s] - - [190.64 GB/s, 185.02 GB/s, 194.24 GB/s, 187.48 GB/s, 194.52 GB/s, 188.51 - GB/s, 189.17 GB/s, 194.71 GB/s, 194.37 GB/s, 190.83 GB/s] - - [239.97 GB/s, 219.74 GB/s, 233.72 GB/s, 234.38 GB/s, 235.78 GB/s, 235.11 - GB/s, 235.62 GB/s, 226.09 GB/s, 235.93 GB/s, 230.51 GB/s] - - [280.16 GB/s, 275.22 GB/s, 260.15 GB/s, 286.01 GB/s, 280.61 GB/s, 287.14 - GB/s, 283.75 GB/s, 275.23 GB/s, 283.71 GB/s, 285.38 GB/s] - - [311.15 GB/s, 318.00 GB/s, 325.21 GB/s, 328.34 GB/s, 318.09 GB/s, 328.66 - GB/s, 329.69 GB/s, 316.97 GB/s, 328.51 GB/s, 330.84 GB/s] - - [374.41 GB/s, 369.73 GB/s, 358.15 GB/s, 375.54 GB/s, 384.71 GB/s, 357.66 - GB/s, 369.71 GB/s, 375.35 GB/s, 370.25 GB/s, 364.01 GB/s] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB, - 1.18 MB, 1.35 MB] - L3: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [21.93 GB/s, 43.10 GB/s, 65.38 GB/s, 85.69 GB/s, 105.64 GB/s, 127.34 - GB/s, 148.22 GB/s, 171.52 GB/s] - daxpy: [30.98 GB/s, 62.27 GB/s, 93.13 GB/s, 123.27 GB/s, 153.64 GB/s, 185.97 - GB/s, 216.67 GB/s, 247.41 GB/s] - load: [23.47 GB/s, 46.84 GB/s, 69.74 GB/s, 92.76 GB/s, 115.37 GB/s, 139.23 - GB/s, 163.12 GB/s, 186.65 GB/s] - triad: [24.72 GB/s, 49.11 GB/s, 72.42 GB/s, 95.36 GB/s, 119.46 GB/s, 144.60 - GB/s, 168.66 GB/s, 189.45 GB/s] - update: [31.39 GB/s, 62.11 GB/s, 91.95 GB/s, 122.24 GB/s, 151.40 GB/s, 182.28 - GB/s, 216.07 GB/s, 239.92 GB/s] - size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89 - MB, 1.65 MB] - size per thread: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89 - MB, 1.65 MB] - stats: - copy: - - [21.64 GB/s, 20.85 GB/s, 20.56 GB/s, 21.69 GB/s, 21.06 GB/s, 21.46 GB/s, - 21.93 GB/s, 21.73 GB/s, 21.83 GB/s, 21.69 GB/s] - - [42.86 GB/s, 42.70 GB/s, 42.72 GB/s, 38.47 GB/s, 42.82 GB/s, 43.10 GB/s, - 42.66 GB/s, 42.44 GB/s, 42.61 GB/s, 42.48 GB/s] - - [64.95 GB/s, 64.34 GB/s, 63.93 GB/s, 65.38 GB/s, 64.36 GB/s, 63.60 GB/s, - 62.65 GB/s, 63.66 GB/s, 63.51 GB/s, 63.75 GB/s] - - [84.07 GB/s, 83.97 GB/s, 83.34 GB/s, 83.91 GB/s, 81.16 GB/s, 85.69 GB/s, - 85.40 GB/s, 85.37 GB/s, 85.42 GB/s, 84.48 GB/s] - - [102.83 GB/s, 104.24 GB/s, 105.42 GB/s, 103.68 GB/s, 105.22 GB/s, 105.64 - GB/s, 103.15 GB/s, 102.02 GB/s, 100.60 GB/s, 105.09 GB/s] - - [125.46 GB/s, 122.23 GB/s, 123.56 GB/s, 124.59 GB/s, 127.03 GB/s, 125.39 - GB/s, 124.50 GB/s, 127.02 GB/s, 126.95 GB/s, 127.34 GB/s] - - [147.99 GB/s, 146.65 GB/s, 139.23 GB/s, 147.69 GB/s, 146.42 GB/s, 145.65 - GB/s, 148.22 GB/s, 143.77 GB/s, 147.96 GB/s, 147.70 GB/s] - - [168.36 GB/s, 168.24 GB/s, 164.99 GB/s, 165.32 GB/s, 167.08 GB/s, 165.98 - GB/s, 165.39 GB/s, 165.84 GB/s, 166.15 GB/s, 171.52 GB/s] - daxpy: - - [30.92 GB/s, 30.74 GB/s, 30.87 GB/s, 30.98 GB/s, 30.45 GB/s, 29.62 GB/s, - 29.54 GB/s, 30.04 GB/s, 30.94 GB/s, 30.93 GB/s] - - [61.96 GB/s, 61.38 GB/s, 61.27 GB/s, 62.27 GB/s, 61.36 GB/s, 61.27 GB/s, - 62.06 GB/s, 60.01 GB/s, 61.49 GB/s, 62.16 GB/s] - - [92.26 GB/s, 93.06 GB/s, 88.45 GB/s, 92.18 GB/s, 93.13 GB/s, 92.11 GB/s, - 92.28 GB/s, 92.28 GB/s, 93.03 GB/s, 92.78 GB/s] - - [123.22 GB/s, 123.06 GB/s, 123.27 GB/s, 119.42 GB/s, 122.94 GB/s, 122.54 - GB/s, 123.24 GB/s, 115.90 GB/s, 121.65 GB/s, 122.47 GB/s] - - [151.70 GB/s, 145.65 GB/s, 149.53 GB/s, 152.52 GB/s, 153.64 GB/s, 152.93 - GB/s, 152.81 GB/s, 153.01 GB/s, 153.04 GB/s, 152.06 GB/s] - - [184.04 GB/s, 171.51 GB/s, 184.83 GB/s, 184.09 GB/s, 185.97 GB/s, 183.75 - GB/s, 184.66 GB/s, 182.54 GB/s, 184.39 GB/s, 184.40 GB/s] - - [198.70 GB/s, 216.51 GB/s, 216.17 GB/s, 203.10 GB/s, 211.40 GB/s, 215.04 - GB/s, 215.48 GB/s, 216.03 GB/s, 216.24 GB/s, 216.67 GB/s] - - [246.02 GB/s, 247.35 GB/s, 245.00 GB/s, 244.65 GB/s, 229.12 GB/s, 243.37 - GB/s, 247.22 GB/s, 247.41 GB/s, 246.03 GB/s, 244.83 GB/s] - load: - - [23.08 GB/s, 23.38 GB/s, 22.88 GB/s, 23.43 GB/s, 23.05 GB/s, 23.23 GB/s, - 22.97 GB/s, 22.39 GB/s, 23.47 GB/s, 23.33 GB/s] - - [46.39 GB/s, 46.40 GB/s, 46.45 GB/s, 46.36 GB/s, 46.69 GB/s, 46.62 GB/s, - 46.84 GB/s, 45.98 GB/s, 46.73 GB/s, 46.80 GB/s] - - [69.18 GB/s, 68.61 GB/s, 69.74 GB/s, 69.34 GB/s, 68.39 GB/s, 69.73 GB/s, - 67.76 GB/s, 69.65 GB/s, 69.70 GB/s, 69.16 GB/s] - - [92.29 GB/s, 91.67 GB/s, 92.76 GB/s, 90.78 GB/s, 92.76 GB/s, 90.76 GB/s, - 91.58 GB/s, 91.60 GB/s, 91.03 GB/s, 92.72 GB/s] - - [114.04 GB/s, 113.82 GB/s, 112.26 GB/s, 112.65 GB/s, 114.09 GB/s, 113.81 - GB/s, 113.72 GB/s, 114.70 GB/s, 115.37 GB/s, 112.57 GB/s] - - [136.42 GB/s, 135.83 GB/s, 134.93 GB/s, 135.43 GB/s, 135.94 GB/s, 139.23 - GB/s, 137.52 GB/s, 137.59 GB/s, 135.97 GB/s, 136.96 GB/s] - - [157.88 GB/s, 163.12 GB/s, 159.53 GB/s, 160.16 GB/s, 162.18 GB/s, 159.58 - GB/s, 161.55 GB/s, 159.81 GB/s, 162.97 GB/s, 163.10 GB/s] - - [183.41 GB/s, 181.86 GB/s, 183.55 GB/s, 183.38 GB/s, 181.66 GB/s, 186.65 - GB/s, 179.62 GB/s, 174.70 GB/s, 180.10 GB/s, 181.49 GB/s] - triad: - - [24.72 GB/s, 23.66 GB/s, 23.58 GB/s, 23.75 GB/s, 23.62 GB/s, 24.37 GB/s, - 24.44 GB/s, 23.57 GB/s, 23.30 GB/s, 23.57 GB/s] - - [49.11 GB/s, 46.87 GB/s, 47.13 GB/s, 46.83 GB/s, 46.58 GB/s, 46.73 GB/s, - 46.32 GB/s, 47.22 GB/s, 46.79 GB/s, 48.73 GB/s] - - [72.29 GB/s, 69.87 GB/s, 70.57 GB/s, 68.89 GB/s, 68.56 GB/s, 69.02 GB/s, - 72.42 GB/s, 69.37 GB/s, 72.34 GB/s, 69.44 GB/s] - - [94.95 GB/s, 94.67 GB/s, 91.05 GB/s, 90.46 GB/s, 95.36 GB/s, 91.63 GB/s, - 94.06 GB/s, 95.30 GB/s, 93.99 GB/s, 94.71 GB/s] - - [119.32 GB/s, 117.99 GB/s, 119.46 GB/s, 117.28 GB/s, 118.97 GB/s, 115.67 - GB/s, 116.64 GB/s, 117.99 GB/s, 119.02 GB/s, 117.75 GB/s] - - [138.63 GB/s, 144.53 GB/s, 144.60 GB/s, 135.72 GB/s, 141.86 GB/s, 139.64 - GB/s, 142.95 GB/s, 140.89 GB/s, 142.10 GB/s, 143.97 GB/s] - - [168.66 GB/s, 166.77 GB/s, 157.10 GB/s, 164.75 GB/s, 164.00 GB/s, 164.38 - GB/s, 163.94 GB/s, 158.58 GB/s, 165.60 GB/s, 164.39 GB/s] - - [184.53 GB/s, 187.00 GB/s, 186.87 GB/s, 179.43 GB/s, 185.70 GB/s, 187.49 - GB/s, 189.45 GB/s, 186.82 GB/s, 188.50 GB/s, 185.96 GB/s] - update: - - [30.60 GB/s, 31.20 GB/s, 30.65 GB/s, 31.39 GB/s, 30.89 GB/s, 30.75 GB/s, - 30.58 GB/s, 30.99 GB/s, 30.69 GB/s, 31.34 GB/s] - - [60.99 GB/s, 62.11 GB/s, 61.42 GB/s, 61.55 GB/s, 61.79 GB/s, 61.24 GB/s, - 61.37 GB/s, 61.74 GB/s, 61.45 GB/s, 61.58 GB/s] - - [91.11 GB/s, 91.21 GB/s, 91.95 GB/s, 91.19 GB/s, 91.14 GB/s, 91.36 GB/s, - 91.30 GB/s, 91.70 GB/s, 90.84 GB/s, 91.09 GB/s] - - [120.90 GB/s, 120.49 GB/s, 121.35 GB/s, 122.24 GB/s, 120.37 GB/s, 119.83 - GB/s, 119.32 GB/s, 119.48 GB/s, 119.11 GB/s, 119.76 GB/s] - - [146.72 GB/s, 147.18 GB/s, 147.81 GB/s, 151.40 GB/s, 147.81 GB/s, 146.84 - GB/s, 147.51 GB/s, 148.15 GB/s, 146.89 GB/s, 148.41 GB/s] - - [179.93 GB/s, 179.68 GB/s, 182.28 GB/s, 179.65 GB/s, 179.06 GB/s, 182.25 - GB/s, 182.03 GB/s, 179.10 GB/s, 178.82 GB/s, 177.84 GB/s] - - [208.84 GB/s, 210.17 GB/s, 210.20 GB/s, 210.81 GB/s, 209.88 GB/s, 211.16 - GB/s, 216.07 GB/s, 211.77 GB/s, 208.89 GB/s, 210.47 GB/s] - - [236.56 GB/s, 239.05 GB/s, 237.81 GB/s, 237.20 GB/s, 238.68 GB/s, 237.69 - GB/s, 239.05 GB/s, 239.38 GB/s, 239.92 GB/s, 238.63 GB/s] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 - MB, 13.20 MB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [23.35 GB/s, 45.63 GB/s, 68.10 GB/s, 89.46 GB/s, 111.10 GB/s, 134.20 - GB/s, 154.44 GB/s, 174.89 GB/s] - daxpy: [32.32 GB/s, 64.16 GB/s, 96.12 GB/s, 126.75 GB/s, 156.91 GB/s, 188.57 - GB/s, 221.57 GB/s, 251.65 GB/s] - load: [25.14 GB/s, 50.38 GB/s, 75.49 GB/s, 101.06 GB/s, 126.04 GB/s, 151.12 - GB/s, 172.57 GB/s, 196.91 GB/s] - triad: [25.15 GB/s, 50.37 GB/s, 75.31 GB/s, 99.12 GB/s, 123.25 GB/s, 150.29 - GB/s, 171.60 GB/s, 197.81 GB/s] - update: [32.98 GB/s, 65.60 GB/s, 97.60 GB/s, 130.34 GB/s, 162.76 GB/s, 194.12 - GB/s, 229.02 GB/s, 260.35 GB/s] - size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89 - MB, 1.65 MB] - size per thread: [6.60 MB, 3.30 MB, 2.20 MB, 1.65 MB, 1.32 MB, 1.10 MB, 0.94 - MB, 825.00 kB] - stats: - copy: - - [22.79 GB/s, 22.55 GB/s, 22.86 GB/s, 22.74 GB/s, 23.09 GB/s, 22.51 GB/s, - 23.35 GB/s, 23.32 GB/s, 23.02 GB/s, 22.75 GB/s] - - [45.32 GB/s, 45.15 GB/s, 45.63 GB/s, 44.84 GB/s, 44.54 GB/s, 44.33 GB/s, - 44.68 GB/s, 44.98 GB/s, 44.64 GB/s, 44.75 GB/s] - - [68.10 GB/s, 67.88 GB/s, 67.98 GB/s, 67.32 GB/s, 67.02 GB/s, 67.14 GB/s, - 67.71 GB/s, 67.19 GB/s, 63.08 GB/s, 68.04 GB/s] - - [89.46 GB/s, 88.53 GB/s, 88.51 GB/s, 89.13 GB/s, 89.32 GB/s, 84.53 GB/s, - 87.51 GB/s, 88.95 GB/s, 88.91 GB/s, 87.62 GB/s] - - [108.72 GB/s, 110.42 GB/s, 106.02 GB/s, 111.08 GB/s, 110.70 GB/s, 111.10 - GB/s, 110.24 GB/s, 109.68 GB/s, 109.55 GB/s, 108.86 GB/s] - - [133.21 GB/s, 127.37 GB/s, 132.83 GB/s, 132.67 GB/s, 133.02 GB/s, 132.65 - GB/s, 134.20 GB/s, 132.96 GB/s, 118.86 GB/s, 131.20 GB/s] - - [152.95 GB/s, 153.90 GB/s, 153.80 GB/s, 153.22 GB/s, 153.32 GB/s, 142.75 - GB/s, 152.99 GB/s, 154.44 GB/s, 154.43 GB/s, 152.24 GB/s] - - [174.89 GB/s, 171.49 GB/s, 157.46 GB/s, 172.90 GB/s, 173.42 GB/s, 171.07 - GB/s, 171.82 GB/s, 170.68 GB/s, 172.19 GB/s, 161.38 GB/s] - daxpy: - - [31.88 GB/s, 32.27 GB/s, 31.11 GB/s, 32.20 GB/s, 32.17 GB/s, 32.32 GB/s, - 32.20 GB/s, 32.32 GB/s, 30.76 GB/s, 32.03 GB/s] - - [64.16 GB/s, 63.70 GB/s, 64.04 GB/s, 63.55 GB/s, 60.64 GB/s, 64.05 GB/s, - 63.56 GB/s, 63.36 GB/s, 63.94 GB/s, 63.86 GB/s] - - [96.12 GB/s, 95.66 GB/s, 95.93 GB/s, 95.93 GB/s, 96.10 GB/s, 95.94 GB/s, - 95.78 GB/s, 95.79 GB/s, 95.17 GB/s, 89.44 GB/s] - - [126.04 GB/s, 126.43 GB/s, 126.09 GB/s, 124.90 GB/s, 125.07 GB/s, 125.74 - GB/s, 118.86 GB/s, 125.80 GB/s, 125.10 GB/s, 126.75 GB/s] - - [155.92 GB/s, 155.99 GB/s, 156.32 GB/s, 151.54 GB/s, 156.49 GB/s, 156.91 - GB/s, 154.92 GB/s, 155.92 GB/s, 156.20 GB/s, 154.49 GB/s] - - [185.57 GB/s, 180.38 GB/s, 187.51 GB/s, 187.10 GB/s, 186.44 GB/s, 187.13 - GB/s, 187.31 GB/s, 188.10 GB/s, 187.91 GB/s, 188.57 GB/s] - - [207.55 GB/s, 219.63 GB/s, 219.38 GB/s, 219.81 GB/s, 220.29 GB/s, 219.72 - GB/s, 221.05 GB/s, 216.76 GB/s, 221.57 GB/s, 220.75 GB/s] - - [250.81 GB/s, 250.78 GB/s, 251.19 GB/s, 251.28 GB/s, 249.10 GB/s, 250.42 - GB/s, 251.65 GB/s, 244.31 GB/s, 250.40 GB/s, 250.19 GB/s] - load: - - [24.84 GB/s, 24.86 GB/s, 25.09 GB/s, 25.04 GB/s, 24.74 GB/s, 24.87 GB/s, - 25.01 GB/s, 25.08 GB/s, 25.14 GB/s, 25.00 GB/s] - - [50.03 GB/s, 49.40 GB/s, 50.28 GB/s, 50.08 GB/s, 50.37 GB/s, 49.75 GB/s, - 50.01 GB/s, 50.38 GB/s, 49.89 GB/s, 50.24 GB/s] - - [74.37 GB/s, 74.65 GB/s, 74.40 GB/s, 73.45 GB/s, 73.31 GB/s, 73.00 GB/s, - 75.49 GB/s, 73.94 GB/s, 74.42 GB/s, 74.80 GB/s] - - [99.51 GB/s, 99.43 GB/s, 98.90 GB/s, 99.83 GB/s, 98.74 GB/s, 100.75 GB/s, - 99.33 GB/s, 99.81 GB/s, 100.00 GB/s, 101.06 GB/s] - - [126.04 GB/s, 126.03 GB/s, 124.70 GB/s, 124.86 GB/s, 125.31 GB/s, 124.78 - GB/s, 125.99 GB/s, 123.52 GB/s, 124.45 GB/s, 123.01 GB/s] - - [146.95 GB/s, 150.27 GB/s, 151.12 GB/s, 150.93 GB/s, 150.68 GB/s, 149.75 - GB/s, 150.67 GB/s, 146.01 GB/s, 148.34 GB/s, 149.15 GB/s] - - [169.40 GB/s, 172.12 GB/s, 172.40 GB/s, 171.99 GB/s, 172.57 GB/s, 171.95 - GB/s, 167.06 GB/s, 169.66 GB/s, 168.34 GB/s, 169.45 GB/s] - - [192.68 GB/s, 191.98 GB/s, 192.82 GB/s, 191.84 GB/s, 191.97 GB/s, 196.91 - GB/s, 193.36 GB/s, 190.12 GB/s, 192.04 GB/s, 193.93 GB/s] - triad: - - [24.78 GB/s, 25.03 GB/s, 25.07 GB/s, 24.81 GB/s, 24.65 GB/s, 24.80 GB/s, - 24.71 GB/s, 25.15 GB/s, 24.70 GB/s, 24.25 GB/s] - - [49.63 GB/s, 48.68 GB/s, 49.73 GB/s, 49.97 GB/s, 50.37 GB/s, 49.89 GB/s, - 49.59 GB/s, 49.00 GB/s, 49.96 GB/s, 49.61 GB/s] - - [74.88 GB/s, 74.99 GB/s, 75.31 GB/s, 73.20 GB/s, 74.50 GB/s, 72.88 GB/s, - 73.43 GB/s, 73.74 GB/s, 74.59 GB/s, 74.60 GB/s] - - [95.80 GB/s, 97.67 GB/s, 98.93 GB/s, 97.79 GB/s, 98.74 GB/s, 97.74 GB/s, - 98.87 GB/s, 99.12 GB/s, 97.90 GB/s, 97.96 GB/s] - - [121.15 GB/s, 120.28 GB/s, 120.66 GB/s, 121.19 GB/s, 121.09 GB/s, 121.68 - GB/s, 121.30 GB/s, 123.22 GB/s, 122.51 GB/s, 123.25 GB/s] - - [146.72 GB/s, 146.38 GB/s, 146.25 GB/s, 146.49 GB/s, 146.29 GB/s, 144.30 - GB/s, 142.89 GB/s, 150.29 GB/s, 146.37 GB/s, 146.30 GB/s] - - [166.36 GB/s, 168.18 GB/s, 168.79 GB/s, 170.27 GB/s, 169.26 GB/s, 170.98 - GB/s, 170.77 GB/s, 171.43 GB/s, 169.53 GB/s, 171.60 GB/s] - - [190.83 GB/s, 197.81 GB/s, 196.29 GB/s, 197.12 GB/s, 196.21 GB/s, 188.40 - GB/s, 191.07 GB/s, 195.14 GB/s, 192.48 GB/s, 194.23 GB/s] - update: - - [32.74 GB/s, 32.98 GB/s, 32.73 GB/s, 32.57 GB/s, 32.63 GB/s, 32.41 GB/s, - 32.61 GB/s, 32.24 GB/s, 32.52 GB/s, 32.49 GB/s] - - [65.22 GB/s, 65.07 GB/s, 64.65 GB/s, 65.26 GB/s, 63.70 GB/s, 64.19 GB/s, - 64.35 GB/s, 64.83 GB/s, 65.60 GB/s, 63.99 GB/s] - - [97.60 GB/s, 96.65 GB/s, 97.50 GB/s, 96.07 GB/s, 97.12 GB/s, 96.41 GB/s, - 96.85 GB/s, 96.80 GB/s, 97.10 GB/s, 97.10 GB/s] - - [129.18 GB/s, 127.79 GB/s, 129.50 GB/s, 129.46 GB/s, 128.85 GB/s, 128.69 - GB/s, 129.02 GB/s, 130.34 GB/s, 129.92 GB/s, 129.11 GB/s] - - [160.00 GB/s, 161.81 GB/s, 160.37 GB/s, 159.56 GB/s, 160.38 GB/s, 161.91 - GB/s, 160.54 GB/s, 161.43 GB/s, 160.59 GB/s, 162.76 GB/s] - - [192.24 GB/s, 193.69 GB/s, 191.11 GB/s, 190.65 GB/s, 193.10 GB/s, 191.30 - GB/s, 192.50 GB/s, 193.37 GB/s, 191.98 GB/s, 194.12 GB/s] - - [221.45 GB/s, 229.02 GB/s, 226.33 GB/s, 224.81 GB/s, 225.62 GB/s, 224.79 - GB/s, 226.03 GB/s, 227.09 GB/s, 226.46 GB/s, 225.88 GB/s] - - [255.45 GB/s, 256.52 GB/s, 254.06 GB/s, 257.76 GB/s, 256.85 GB/s, 256.27 - GB/s, 260.35 GB/s, 259.96 GB/s, 258.40 GB/s, 255.79 GB/s] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 - MB, 13.20 MB] - MEM: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [11.12 GB/s, 20.53 GB/s, 24.86 GB/s, 26.20 GB/s, 26.47 GB/s, 26.35 - GB/s, 26.24 GB/s, 26.17 GB/s] - daxpy: [16.10 GB/s, 30.00 GB/s, 36.88 GB/s, 38.86 GB/s, 39.36 GB/s, 39.19 - GB/s, 39.02 GB/s, 38.88 GB/s] - load: [12.30 GB/s, 23.50 GB/s, 33.04 GB/s, 40.59 GB/s, 44.03 GB/s, 44.56 - GB/s, 44.26 GB/s, 43.77 GB/s] - triad: [12.41 GB/s, 24.13 GB/s, 29.24 GB/s, 30.73 GB/s, 30.68 GB/s, 30.58 - GB/s, 30.54 GB/s, 30.63 GB/s] - update: [17.40 GB/s, 31.16 GB/s, 36.80 GB/s, 39.06 GB/s, 39.80 GB/s, 39.77 - GB/s, 39.50 GB/s, 39.24 GB/s] - size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00 - MB, 42.86 MB, 37.50 MB] - size per thread: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00 - MB, 42.86 MB, 37.50 MB] - stats: - copy: - - [10.83 GB/s, 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 10.82 GB/s, 10.82 GB/s, - 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 11.12 GB/s] - - [20.34 GB/s, 20.38 GB/s, 20.37 GB/s, 20.34 GB/s, 20.41 GB/s, 20.39 GB/s, - 20.39 GB/s, 20.39 GB/s, 20.53 GB/s, 20.35 GB/s] - - [24.70 GB/s, 24.76 GB/s, 24.80 GB/s, 24.86 GB/s, 24.75 GB/s, 24.80 GB/s, - 24.77 GB/s, 24.82 GB/s, 24.81 GB/s, 24.73 GB/s] - - [26.10 GB/s, 26.16 GB/s, 26.14 GB/s, 26.16 GB/s, 26.10 GB/s, 26.15 GB/s, - 26.10 GB/s, 26.15 GB/s, 26.11 GB/s, 26.20 GB/s] - - [26.45 GB/s, 26.44 GB/s, 26.41 GB/s, 26.43 GB/s, 26.45 GB/s, 26.44 GB/s, - 26.46 GB/s, 26.47 GB/s, 26.45 GB/s, 26.44 GB/s] - - [26.34 GB/s, 26.30 GB/s, 26.31 GB/s, 26.33 GB/s, 26.26 GB/s, 26.35 GB/s, - 26.30 GB/s, 26.30 GB/s, 26.30 GB/s, 26.34 GB/s] - - [26.20 GB/s, 26.24 GB/s, 26.21 GB/s, 26.22 GB/s, 26.22 GB/s, 26.20 GB/s, - 26.20 GB/s, 26.23 GB/s, 26.22 GB/s, 26.23 GB/s] - - [26.15 GB/s, 26.17 GB/s, 26.12 GB/s, 26.15 GB/s, 26.15 GB/s, 26.15 GB/s, - 26.12 GB/s, 26.14 GB/s, 26.14 GB/s, 26.17 GB/s] - daxpy: - - [15.77 GB/s, 15.77 GB/s, 16.04 GB/s, 15.68 GB/s, 15.72 GB/s, 15.76 GB/s, - 15.91 GB/s, 15.77 GB/s, 16.10 GB/s, 16.04 GB/s] - - [29.88 GB/s, 29.80 GB/s, 30.00 GB/s, 29.87 GB/s, 29.87 GB/s, 30.00 GB/s, - 29.79 GB/s, 29.80 GB/s, 29.80 GB/s, 29.82 GB/s] - - [36.63 GB/s, 36.73 GB/s, 36.64 GB/s, 36.64 GB/s, 36.81 GB/s, 36.88 GB/s, - 36.62 GB/s, 36.65 GB/s, 36.74 GB/s, 36.71 GB/s] - - [38.82 GB/s, 38.83 GB/s, 38.86 GB/s, 38.81 GB/s, 38.81 GB/s, 38.82 GB/s, - 38.85 GB/s, 38.80 GB/s, 38.84 GB/s, 38.73 GB/s] - - [39.32 GB/s, 39.30 GB/s, 39.34 GB/s, 39.36 GB/s, 39.28 GB/s, 39.33 GB/s, - 39.31 GB/s, 39.25 GB/s, 39.32 GB/s, 39.33 GB/s] - - [39.10 GB/s, 39.12 GB/s, 39.14 GB/s, 39.16 GB/s, 39.17 GB/s, 39.17 GB/s, - 39.13 GB/s, 39.15 GB/s, 39.14 GB/s, 39.19 GB/s] - - [39.01 GB/s, 39.01 GB/s, 39.02 GB/s, 39.02 GB/s, 39.00 GB/s, 39.00 GB/s, - 38.97 GB/s, 39.02 GB/s, 38.98 GB/s, 39.01 GB/s] - - [38.76 GB/s, 38.86 GB/s, 38.83 GB/s, 38.82 GB/s, 38.87 GB/s, 38.88 GB/s, - 38.81 GB/s, 38.83 GB/s, 38.88 GB/s, 38.88 GB/s] - load: - - [11.97 GB/s, 11.96 GB/s, 11.98 GB/s, 11.97 GB/s, 11.96 GB/s, 12.05 GB/s, - 12.30 GB/s, 12.18 GB/s, 11.97 GB/s, 11.96 GB/s] - - [22.85 GB/s, 22.85 GB/s, 22.87 GB/s, 22.94 GB/s, 23.50 GB/s, 22.86 GB/s, - 22.86 GB/s, 23.25 GB/s, 22.85 GB/s, 22.86 GB/s] - - [33.04 GB/s, 32.43 GB/s, 32.51 GB/s, 32.52 GB/s, 32.52 GB/s, 32.81 GB/s, - 32.77 GB/s, 32.54 GB/s, 32.53 GB/s, 32.53 GB/s] - - [39.95 GB/s, 39.94 GB/s, 39.93 GB/s, 40.15 GB/s, 40.59 GB/s, 40.36 GB/s, - 40.28 GB/s, 39.93 GB/s, 39.94 GB/s, 39.98 GB/s] - - [43.98 GB/s, 43.86 GB/s, 43.90 GB/s, 43.80 GB/s, 43.83 GB/s, 43.86 GB/s, - 44.03 GB/s, 43.94 GB/s, 43.83 GB/s, 43.92 GB/s] - - [44.46 GB/s, 44.34 GB/s, 44.56 GB/s, 44.51 GB/s, 44.32 GB/s, 44.32 GB/s, - 44.51 GB/s, 44.48 GB/s, 44.32 GB/s, 44.34 GB/s] - - [44.03 GB/s, 44.26 GB/s, 44.08 GB/s, 44.18 GB/s, 44.10 GB/s, 43.99 GB/s, - 44.07 GB/s, 44.06 GB/s, 43.94 GB/s, 43.97 GB/s] - - [43.48 GB/s, 43.77 GB/s, 43.51 GB/s, 43.49 GB/s, 43.47 GB/s, 43.73 GB/s, - 43.55 GB/s, 43.68 GB/s, 43.49 GB/s, 43.50 GB/s] - triad: - - [12.11 GB/s, 12.02 GB/s, 12.03 GB/s, 12.10 GB/s, 12.03 GB/s, 12.04 GB/s, - 12.05 GB/s, 12.17 GB/s, 12.02 GB/s, 12.41 GB/s] - - [23.43 GB/s, 23.25 GB/s, 23.25 GB/s, 23.36 GB/s, 23.28 GB/s, 23.24 GB/s, - 23.61 GB/s, 23.29 GB/s, 23.31 GB/s, 24.13 GB/s] - - [28.92 GB/s, 29.10 GB/s, 29.17 GB/s, 29.04 GB/s, 28.91 GB/s, 29.16 GB/s, - 28.82 GB/s, 29.01 GB/s, 29.24 GB/s, 28.88 GB/s] - - [30.65 GB/s, 30.62 GB/s, 30.73 GB/s, 30.59 GB/s, 30.69 GB/s, 30.68 GB/s, - 30.59 GB/s, 30.59 GB/s, 30.57 GB/s, 30.67 GB/s] - - [30.53 GB/s, 30.67 GB/s, 30.65 GB/s, 30.53 GB/s, 30.63 GB/s, 30.68 GB/s, - 30.50 GB/s, 30.67 GB/s, 30.64 GB/s, 30.67 GB/s] - - [30.45 GB/s, 30.58 GB/s, 30.51 GB/s, 30.49 GB/s, 30.52 GB/s, 30.49 GB/s, - 30.56 GB/s, 30.55 GB/s, 30.47 GB/s, 30.47 GB/s] - - [30.51 GB/s, 30.47 GB/s, 30.50 GB/s, 30.47 GB/s, 30.52 GB/s, 30.54 GB/s, - 30.54 GB/s, 30.50 GB/s, 30.49 GB/s, 30.50 GB/s] - - [30.58 GB/s, 30.34 GB/s, 30.56 GB/s, 30.54 GB/s, 30.63 GB/s, 30.53 GB/s, - 30.59 GB/s, 30.50 GB/s, 30.54 GB/s, 30.47 GB/s] - update: - - [17.33 GB/s, 17.32 GB/s, 17.34 GB/s, 17.35 GB/s, 17.40 GB/s, 17.35 GB/s, - 17.36 GB/s, 17.39 GB/s, 17.35 GB/s, 17.35 GB/s] - - [31.12 GB/s, 31.15 GB/s, 31.10 GB/s, 31.16 GB/s, 31.07 GB/s, 31.08 GB/s, - 31.09 GB/s, 31.12 GB/s, 31.12 GB/s, 31.08 GB/s] - - [36.80 GB/s, 36.42 GB/s, 35.92 GB/s, 36.39 GB/s, 35.99 GB/s, 35.98 GB/s, - 36.37 GB/s, 36.39 GB/s, 36.38 GB/s, 36.44 GB/s] - - [39.03 GB/s, 39.05 GB/s, 39.02 GB/s, 39.06 GB/s, 39.01 GB/s, 39.02 GB/s, - 39.02 GB/s, 39.00 GB/s, 39.00 GB/s, 39.00 GB/s] - - [39.76 GB/s, 39.80 GB/s, 39.80 GB/s, 39.78 GB/s, 39.76 GB/s, 39.79 GB/s, - 39.79 GB/s, 39.77 GB/s, 39.77 GB/s, 39.71 GB/s] - - [39.71 GB/s, 39.72 GB/s, 39.72 GB/s, 39.66 GB/s, 39.74 GB/s, 39.70 GB/s, - 39.76 GB/s, 39.74 GB/s, 39.77 GB/s, 39.74 GB/s] - - [39.50 GB/s, 39.47 GB/s, 39.45 GB/s, 39.43 GB/s, 39.46 GB/s, 39.45 GB/s, - 39.45 GB/s, 39.40 GB/s, 39.43 GB/s, 39.47 GB/s] - - [39.21 GB/s, 39.18 GB/s, 39.19 GB/s, 39.19 GB/s, 39.21 GB/s, 39.19 GB/s, - 39.18 GB/s, 39.21 GB/s, 39.20 GB/s, 39.24 GB/s] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 - MB, 300.00 MB, 300.00 MB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [10.79 GB/s, 20.46 GB/s, 24.69 GB/s, 25.42 GB/s, 25.63 GB/s, 25.45 - GB/s, 25.32 GB/s, 25.06 GB/s] - daxpy: [15.97 GB/s, 29.70 GB/s, 35.95 GB/s, 37.55 GB/s, 37.81 GB/s, 37.78 - GB/s, 37.64 GB/s, 37.33 GB/s] - load: [13.46 GB/s, 25.84 GB/s, 35.75 GB/s, 40.54 GB/s, 42.38 GB/s, 42.30 - GB/s, 41.85 GB/s, 41.19 GB/s] - triad: [12.05 GB/s, 22.53 GB/s, 27.53 GB/s, 29.10 GB/s, 29.68 GB/s, 29.79 - GB/s, 29.85 GB/s, 29.64 GB/s] - update: [19.12 GB/s, 33.86 GB/s, 38.51 GB/s, 39.38 GB/s, 39.20 GB/s, 38.80 - GB/s, 38.39 GB/s, 38.02 GB/s] - size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00 - MB, 42.86 MB, 37.50 MB] - size per thread: [150.00 MB, 75.00 MB, 50.00 MB, 37.50 MB, 30.00 MB, 25.00 - MB, 21.43 MB, 18.75 MB] - stats: - copy: - - [10.71 GB/s, 10.69 GB/s, 10.71 GB/s, 10.70 GB/s, 10.79 GB/s, 10.58 GB/s, - 10.70 GB/s, 10.69 GB/s, 10.69 GB/s, 10.70 GB/s] - - [20.27 GB/s, 20.31 GB/s, 20.27 GB/s, 20.26 GB/s, 20.31 GB/s, 20.26 GB/s, - 20.24 GB/s, 20.26 GB/s, 20.26 GB/s, 20.46 GB/s] - - [24.69 GB/s, 24.66 GB/s, 24.64 GB/s, 24.63 GB/s, 24.67 GB/s, 24.64 GB/s, - 24.64 GB/s, 24.68 GB/s, 24.61 GB/s, 24.63 GB/s] - - [25.42 GB/s, 25.41 GB/s, 25.40 GB/s, 25.36 GB/s, 25.40 GB/s, 25.39 GB/s, - 25.40 GB/s, 25.38 GB/s, 25.41 GB/s, 25.39 GB/s] - - [25.55 GB/s, 25.57 GB/s, 25.58 GB/s, 25.63 GB/s, 25.57 GB/s, 25.57 GB/s, - 25.58 GB/s, 25.55 GB/s, 25.57 GB/s, 25.49 GB/s] - - [25.42 GB/s, 25.42 GB/s, 25.41 GB/s, 25.39 GB/s, 25.40 GB/s, 25.43 GB/s, - 25.45 GB/s, 25.44 GB/s, 25.43 GB/s, 25.43 GB/s] - - [25.27 GB/s, 25.31 GB/s, 25.28 GB/s, 25.31 GB/s, 25.32 GB/s, 25.31 GB/s, - 25.29 GB/s, 25.30 GB/s, 25.25 GB/s, 25.28 GB/s] - - [25.03 GB/s, 25.01 GB/s, 25.01 GB/s, 25.04 GB/s, 25.00 GB/s, 25.03 GB/s, - 25.06 GB/s, 25.04 GB/s, 25.04 GB/s, 25.04 GB/s] - daxpy: - - [15.81 GB/s, 15.81 GB/s, 15.97 GB/s, 15.62 GB/s, 15.64 GB/s, 15.83 GB/s, - 15.63 GB/s, 15.82 GB/s, 15.81 GB/s, 15.63 GB/s] - - [29.62 GB/s, 29.56 GB/s, 29.61 GB/s, 29.59 GB/s, 29.70 GB/s, 29.61 GB/s, - 29.65 GB/s, 29.65 GB/s, 29.58 GB/s, 29.59 GB/s] - - [35.95 GB/s, 35.89 GB/s, 35.92 GB/s, 35.92 GB/s, 35.95 GB/s, 35.90 GB/s, - 35.87 GB/s, 35.90 GB/s, 35.92 GB/s, 35.82 GB/s] - - [37.55 GB/s, 37.46 GB/s, 37.52 GB/s, 37.51 GB/s, 37.55 GB/s, 37.51 GB/s, - 37.44 GB/s, 37.41 GB/s, 37.50 GB/s, 37.40 GB/s] - - [37.79 GB/s, 37.76 GB/s, 37.80 GB/s, 37.77 GB/s, 37.76 GB/s, 37.81 GB/s, - 37.78 GB/s, 37.81 GB/s, 37.79 GB/s, 37.78 GB/s] - - [37.71 GB/s, 37.68 GB/s, 37.68 GB/s, 37.73 GB/s, 37.74 GB/s, 37.66 GB/s, - 37.78 GB/s, 37.74 GB/s, 37.71 GB/s, 37.70 GB/s] - - [37.61 GB/s, 37.60 GB/s, 37.61 GB/s, 37.62 GB/s, 37.64 GB/s, 37.61 GB/s, - 37.60 GB/s, 37.59 GB/s, 37.63 GB/s, 37.60 GB/s] - - [37.23 GB/s, 37.21 GB/s, 37.26 GB/s, 37.27 GB/s, 37.28 GB/s, 37.33 GB/s, - 37.29 GB/s, 37.31 GB/s, 37.26 GB/s, 37.29 GB/s] - load: - - [13.34 GB/s, 13.36 GB/s, 13.35 GB/s, 13.34 GB/s, 13.35 GB/s, 13.38 GB/s, - 13.46 GB/s, 13.35 GB/s, 13.35 GB/s, 13.35 GB/s] - - [25.63 GB/s, 25.64 GB/s, 25.84 GB/s, 25.64 GB/s, 25.74 GB/s, 25.63 GB/s, - 25.64 GB/s, 25.63 GB/s, 25.64 GB/s, 25.68 GB/s] - - [35.38 GB/s, 35.56 GB/s, 35.50 GB/s, 35.75 GB/s, 35.50 GB/s, 35.39 GB/s, - 35.46 GB/s, 35.39 GB/s, 35.75 GB/s, 35.40 GB/s] - - [40.37 GB/s, 40.37 GB/s, 40.49 GB/s, 40.49 GB/s, 40.42 GB/s, 40.37 GB/s, - 40.54 GB/s, 40.39 GB/s, 40.37 GB/s, 40.51 GB/s] - - [42.34 GB/s, 42.14 GB/s, 42.26 GB/s, 42.17 GB/s, 42.10 GB/s, 42.13 GB/s, - 42.38 GB/s, 42.13 GB/s, 42.21 GB/s, 42.15 GB/s] - - [42.30 GB/s, 42.13 GB/s, 42.20 GB/s, 42.11 GB/s, 42.12 GB/s, 42.12 GB/s, - 42.18 GB/s, 42.25 GB/s, 42.19 GB/s, 42.21 GB/s] - - [41.70 GB/s, 41.76 GB/s, 41.85 GB/s, 41.80 GB/s, 41.71 GB/s, 41.71 GB/s, - 41.80 GB/s, 41.70 GB/s, 41.76 GB/s, 41.75 GB/s] - - [41.02 GB/s, 41.01 GB/s, 41.17 GB/s, 41.12 GB/s, 41.13 GB/s, 41.15 GB/s, - 41.19 GB/s, 41.01 GB/s, 41.10 GB/s, 41.06 GB/s] - triad: - - [11.87 GB/s, 11.89 GB/s, 11.91 GB/s, 11.81 GB/s, 11.83 GB/s, 11.85 GB/s, - 11.90 GB/s, 11.80 GB/s, 11.85 GB/s, 12.05 GB/s] - - [22.53 GB/s, 22.47 GB/s, 22.44 GB/s, 22.46 GB/s, 22.43 GB/s, 22.52 GB/s, - 22.41 GB/s, 22.52 GB/s, 22.48 GB/s, 22.41 GB/s] - - [27.43 GB/s, 27.42 GB/s, 27.47 GB/s, 27.47 GB/s, 27.52 GB/s, 27.49 GB/s, - 27.41 GB/s, 27.42 GB/s, 27.51 GB/s, 27.53 GB/s] - - [29.02 GB/s, 29.03 GB/s, 29.03 GB/s, 29.04 GB/s, 28.89 GB/s, 29.10 GB/s, - 29.02 GB/s, 29.05 GB/s, 28.93 GB/s, 29.01 GB/s] - - [29.66 GB/s, 29.68 GB/s, 29.60 GB/s, 29.62 GB/s, 29.60 GB/s, 29.67 GB/s, - 29.66 GB/s, 29.62 GB/s, 29.62 GB/s, 29.62 GB/s] - - [29.78 GB/s, 29.76 GB/s, 29.77 GB/s, 29.77 GB/s, 29.75 GB/s, 29.79 GB/s, - 29.75 GB/s, 29.77 GB/s, 29.76 GB/s, 29.78 GB/s] - - [29.82 GB/s, 29.85 GB/s, 29.85 GB/s, 29.83 GB/s, 29.82 GB/s, 29.83 GB/s, - 29.83 GB/s, 29.81 GB/s, 29.81 GB/s, 29.80 GB/s] - - [29.54 GB/s, 29.63 GB/s, 29.57 GB/s, 29.56 GB/s, 29.55 GB/s, 29.64 GB/s, - 29.60 GB/s, 29.53 GB/s, 29.54 GB/s, 29.57 GB/s] - update: - - [18.66 GB/s, 18.67 GB/s, 18.66 GB/s, 19.12 GB/s, 18.67 GB/s, 18.67 GB/s, - 18.67 GB/s, 18.67 GB/s, 18.70 GB/s, 18.67 GB/s] - - [33.61 GB/s, 33.34 GB/s, 33.71 GB/s, 33.31 GB/s, 33.34 GB/s, 33.86 GB/s, - 33.62 GB/s, 33.35 GB/s, 33.54 GB/s, 33.34 GB/s] - - [38.51 GB/s, 38.46 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s, 38.46 GB/s, - 38.41 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s] - - [39.37 GB/s, 39.34 GB/s, 39.36 GB/s, 39.35 GB/s, 39.37 GB/s, 39.38 GB/s, - 39.36 GB/s, 39.35 GB/s, 39.31 GB/s, 39.32 GB/s] - - [39.17 GB/s, 39.17 GB/s, 39.16 GB/s, 39.20 GB/s, 39.18 GB/s, 39.17 GB/s, - 39.18 GB/s, 39.15 GB/s, 39.20 GB/s, 39.17 GB/s] - - [38.79 GB/s, 38.79 GB/s, 38.80 GB/s, 38.78 GB/s, 38.78 GB/s, 38.75 GB/s, - 38.80 GB/s, 38.77 GB/s, 38.78 GB/s, 38.78 GB/s] - - [38.36 GB/s, 38.37 GB/s, 38.37 GB/s, 38.39 GB/s, 38.36 GB/s, 38.37 GB/s, - 38.38 GB/s, 38.37 GB/s, 38.35 GB/s, 38.39 GB/s] - - [37.98 GB/s, 37.99 GB/s, 38.02 GB/s, 38.01 GB/s, 38.01 GB/s, 38.00 GB/s, - 38.02 GB/s, 38.00 GB/s, 38.02 GB/s, 38.02 GB/s] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 - MB, 300.00 MB, 300.00 MB] diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py deleted file mode 100644 index 5f3757ba6..000000000 --- a/pystencils_tests/test_kerncraft_coupling.py +++ /dev/null @@ -1,197 +0,0 @@ -import numpy as np -import pytest -import sympy as sp -from pathlib import Path - -from kerncraft.kernel import KernelCode -from kerncraft.machinemodel import MachineModel -from kerncraft.models import ECM, ECMData, Benchmark - -import pystencils as ps -from pystencils import Assignment, Field -from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set -from pystencils.cpu import create_kernel -from pystencils.datahandling import create_data_handling -from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel -from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark -from pystencils.timeloop import TimeLoop - -SCRIPT_FOLDER = Path(__file__).parent -INPUT_FOLDER = SCRIPT_FOLDER / "kerncraft_inputs" - - -@pytest.mark.kerncraft -def test_compilation(): - machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" - machine = MachineModel(path_to_yaml=machine_file_path) - - kernel_file_path = INPUT_FOLDER / "2d-5pt.c" - with open(kernel_file_path) as kernel_file: - reference_kernel = KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path) - reference_kernel.get_kernel_header(name='test_kernel') - reference_kernel.get_kernel_code(name='test_kernel') - reference_kernel.get_main_code(kernel_function_name='test_kernel') - - size = [30, 50, 3] - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=1) - b = Field.create_from_numpy_array('b', arr, index_dimensions=1) - s = sp.Symbol("s") - rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0] - update_rule = Assignment(b[0, 0], s * rhs) - ast = create_kernel([update_rule]) - mine = generate_benchmark(ast, likwid=False) - print(mine) - - -def analysis(kernel, machine, model='ecmdata'): - if model == 'ecmdata': - model = ECMData(kernel, machine, KerncraftParameters()) - elif model == 'ecm': - model = ECM(kernel, machine, KerncraftParameters()) - elif model == 'benchmark': - model = Benchmark(kernel, machine, KerncraftParameters()) - else: - model = ECM(kernel, machine, KerncraftParameters()) - model.analyze() - return model - - -@pytest.mark.kerncraft -def test_3d_7pt_osaca(): - - size = [20, 200, 200] - kernel_file_path = INPUT_FOLDER / "3d-7pt.c" - machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" - machine_model = MachineModel(path_to_yaml=machine_file_path) - with open(kernel_file_path) as kernel_file: - reference_kernel = KernelCode(kernel_file.read(), machine=machine_model, filename=kernel_file_path) - reference_kernel.set_constant('M', size[0]) - reference_kernel.set_constant('N', size[1]) - assert size[1] == size[2] - analysis(reference_kernel, machine_model, model='ecm') - - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=0) - b = Field.create_from_numpy_array('b', arr, index_dimensions=0) - s = sp.Symbol("s") - rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1] - - update_rule = Assignment(b[0, 0, 0], s * rhs) - ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast, machine=machine_model, debug_print=True) - analysis(k, machine_model, model='ecm') - assert reference_kernel._flops == k._flops - - path, lock = k.get_kernel_code(openmp=True) - with open(path) as kernel_file: - assert "#pragma omp parallel" in kernel_file.read() - - path, lock = k.get_main_code() - with open(path) as kernel_file: - assert "likwid_markerInit();" in kernel_file.read() - - -@pytest.mark.kerncraft -def test_2d_5pt(): - machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" - machine = MachineModel(path_to_yaml=machine_file_path) - - size = [30, 50, 3] - kernel_file_path = INPUT_FOLDER / "2d-5pt.c" - with open(kernel_file_path) as kernel_file: - reference_kernel = KernelCode(kernel_file.read(), machine=machine, - filename=kernel_file_path) - reference = analysis(reference_kernel, machine) - - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=1) - b = Field.create_from_numpy_array('b', arr, index_dimensions=1) - s = sp.Symbol("s") - rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0] - update_rule = Assignment(b[0, 0], s * rhs) - ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast, machine) - result = analysis(k, machine) - - for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): - assert e1 == e2 - - -@pytest.mark.kerncraft -def test_3d_7pt(): - machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" - machine = MachineModel(path_to_yaml=machine_file_path) - - size = [30, 50, 50] - kernel_file_path = INPUT_FOLDER / "3d-7pt.c" - with open(kernel_file_path) as kernel_file: - reference_kernel = KernelCode(kernel_file.read(), machine=machine, - filename=kernel_file_path) - reference_kernel.set_constant('M', size[0]) - reference_kernel.set_constant('N', size[1]) - assert size[1] == size[2] - reference = analysis(reference_kernel, machine) - - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=0) - b = Field.create_from_numpy_array('b', arr, index_dimensions=0) - s = sp.Symbol("s") - rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1] - - update_rule = Assignment(b[0, 0, 0], s * rhs) - ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast, machine) - result = analysis(k, machine) - - for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): - assert e1 == e2 - - -@pytest.mark.kerncraft -def test_benchmark(): - size = [30, 50, 50] - arr = np.zeros(size) - a = Field.create_from_numpy_array('a', arr, index_dimensions=0) - b = Field.create_from_numpy_array('b', arr, index_dimensions=0) - s = sp.Symbol("s") - rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1] - - update_rule = Assignment(b[0, 0, 0], s * rhs) - ast = create_kernel([update_rule]) - - c_benchmark_run = run_c_benchmark(ast, inner_iterations=1000, outer_iterations=1) - - kernel = ast.compile() - a = np.full(size, fill_value=0.23) - b = np.full(size, fill_value=0.23) - - timeloop = TimeLoop(steps=1) - timeloop.add_call(kernel, {'a': a, 'b': b, 's': 0.23}) - - timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1) - - np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4) - - -@pytest.mark.kerncraft -def test_benchmark_vectorized(): - instruction_sets = get_supported_instruction_sets() - if not instruction_sets: - pytest.skip("cannot detect CPU instruction set") - - for vec in instruction_sets: - dh = create_data_handling((20, 20, 20), periodicity=True) - - width = get_vector_instruction_set(instruction_set=vec)['width'] * 8 - - a = dh.add_array("a", values_per_cell=1, alignment=width) - b = dh.add_array("b", values_per_cell=1, alignment=width) - - rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1] - update_rule = Assignment(b[0, 0, 0], rhs) - - opt = {'instruction_set': vec, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True} - ast = ps.create_kernel(update_rule, cpu_vectorize_info=opt) - - run_c_benchmark(ast, 5) diff --git a/pytest.ini b/pytest.ini index f77dd5ea5..b9b5db388 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,7 +4,6 @@ norecursedirs = *.egg-info .git .cache .ipynb_checkpoints htmlcov addopts = --doctest-modules --durations=20 --cov-config pytest.ini markers = longrun: tests only run at night since they have large execution time - kerncraft: tests depending on kerncraft notebook: mark for notebooks # these warnings all come from third party libraries. filterwarnings = @@ -54,7 +53,7 @@ exclude_lines = if __name__ == .__main__.: skip_covered = True -fail_under = 87 +fail_under = 86 [html] directory = coverage_report diff --git a/setup.py b/setup.py index 4643a3883..a2053b422 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,6 @@ setuptools.setup(name='pystencils', packages=['pystencils'] + ['pystencils.' + s for s in setuptools.find_packages('pystencils')], install_requires=['sympy>=1.5.1,<=1.9', 'numpy>=1.8.0', 'appdirs', 'joblib'], package_data={'pystencils': ['include/*.h', - 'kerncraft_coupling/templates/*', 'backends/cuda_known_functions.txt', 'backends/opencl1.1_known_functions.txt', 'boundaries/createindexlistcython.c', @@ -118,11 +117,9 @@ setuptools.setup(name='pystencils', 'alltrafos': ['islpy', 'py-cpuinfo'], 'bench_db': ['blitzdb', 'pymongo', 'pandas'], 'interactive': ['matplotlib', 'ipy_table', 'imageio', 'jupyter', 'pyevtk', 'rich', 'graphviz'], - 'autodiff': ['pystencils-autodiff'], 'doc': ['sphinx', 'sphinx_rtd_theme', 'nbsphinx', 'sphinxcontrib-bibtex', 'sphinx_autodoc_typehints', 'pandoc'], - 'use_cython': ['Cython'], - 'kerncraft': ['osaca', 'kerncraft'] + 'use_cython': ['Cython'] }, tests_require=['pytest', 'pytest-cov', -- GitLab