diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py index c95c420d6c62d4428354f934a4e36fb7e7398b9d..9a012d6c2a75c98faae05e6815dd3883c7d4d2e4 100644 --- a/pystencils/kerncraft_coupling/generate_benchmark.py +++ b/pystencils/kerncraft_coupling/generate_benchmark.py @@ -1,7 +1,9 @@ -import os import subprocess +import warnings +import tempfile +from pathlib import Path -from jinja2 import Template +from jinja2 import Environment, PackageLoader, StrictUndefined from pystencils.astnodes import PragmaBlock from pystencils.backends.cbackend import generate_c, get_headers @@ -10,116 +12,6 @@ from pystencils.data_types import get_base_type from pystencils.include import get_pystencils_include_path from pystencils.sympyextensions import prod -benchmark_template = Template(""" -#include "kerncraft.h" -#include <stdlib.h> -#include <stdint.h> -#include <stdbool.h> -#include <math.h> -#include <stdio.h> - -{{ includes }} - -{%- if likwid %} -#include <likwid.h> -{%- endif %} - -#define RESTRICT __restrict__ -#define FUNC_PREFIX -void dummy(void *); -void timing(double* wcTime, double* cpuTime); -extern int var_false; - - -{{kernel_code}} - - -int main(int argc, char **argv) -{ - {%- if likwid %} - likwid_markerInit(); - {%- endif %} - - {%- for field_name, dataType, size in fields %} - - // Initialization {{field_name}} - double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); - for (unsigned long long i = 0; i < {{size}}; ++i) - {{field_name}}[i] = 0.23; - - if(var_false) - dummy({{field_name}}); - - {%- endfor %} - - - - {%- for constantName, dataType in constants %} - - // Constant {{constantName}} - {{dataType}} {{constantName}}; - {{constantName}} = 0.23; - if(var_false) - dummy(& {{constantName}}); - - {%- endfor %} - - {%- if likwid and openmp %} - #pragma omp parallel - { - likwid_markerRegisterRegion("loop"); - #pragma omp barrier - {%- elif likwid %} - likwid_markerRegisterRegion("loop"); - {%- endif %} - - for(int warmup = 1; warmup >= 0; --warmup) { - int repeat = 2; - if(warmup == 0) { - repeat = atoi(argv[1]); - {%- if likwid %} - likwid_markerStartRegion("loop"); - {%- endif %} - } - - {%- if timing %} - double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; - timing(&wcStartTime, &cpuStartTime); - {%- endif %} - - for (; repeat > 0; --repeat) - { - {{kernelName}}({{call_argument_list}}); - - // Dummy calls - {%- for field_name, dataType, size in fields %} - if(var_false) dummy((void*){{field_name}}); - {%- endfor %} - {%- for constantName, dataType in constants %} - if(var_false) dummy((void*)&{{constantName}}); - {%- endfor %} - } - {%- if timing %} - timing(&wcEndTime, &cpuEndTime); - if( warmup == 0) - printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) ); - {%- endif %} - - } - - {%- if likwid %} - likwid_markerStopRegion("loop"); - {%- if openmp %} - } - {%- endif %} - {%- endif %} - - {%- if likwid %} - likwid_markerClose(); - {%- endif %} -} -""") - def generate_benchmark(ast, likwid=False, openmp=False, timing=False): """Return C code of a benchmark program for the given kernel. @@ -157,7 +49,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False): if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock): ast.body.args[0].pragma_line = '' - args = { + jinja_context = { 'likwid': likwid, 'openmp': openmp, 'kernel_code': generate_c(ast, dialect='c'), @@ -168,16 +60,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False): 'includes': includes, 'timing': timing, } - return benchmark_template.render(**args) + env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) + + return env.get_template('benchmark.c').render(**jinja_context) -def run_c_benchmark(ast, inner_iterations, outer_iterations=3): + +def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None): """Runs the given kernel with outer loop in C Args: - ast: + ast: pystencils ast which is used to compile the benchmark file inner_iterations: timings are recorded around this many iterations outer_iterations: number of timings recorded + path: path where the benchmark file is stored. If None a tmp folder is created Returns: list of times per iterations for each outer iteration @@ -185,26 +81,40 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3): import kerncraft benchmark_code = generate_benchmark(ast, timing=True) - with open('bench.c', 'w') as f: + + if path is None: + path = tempfile.mkdtemp() + + if isinstance(path, str): + path = Path(path) + + with open(path / 'bench.c', 'w') as f: f.write(benchmark_code) - kerncraft_path = os.path.dirname(kerncraft.__file__) + kerncraft_path = Path(kerncraft.__file__).parent extra_flags = ['-I' + get_pystencils_include_path(), - '-I' + os.path.join(kerncraft_path, 'headers')] + '-I' + str(kerncraft_path / 'headers')] compiler_config = get_compiler_config() compile_cmd = [compiler_config['command']] + compiler_config['flags'].split() compile_cmd += [*extra_flags, - os.path.join(kerncraft_path, 'headers', 'timing.c'), - os.path.join(kerncraft_path, 'headers', 'dummy.c'), - 'bench.c', - '-o', 'bench', + kerncraft_path / 'headers' / 'timing.c', + kerncraft_path / 'headers' / 'dummy.c', + path / 'bench.c', + '-o', path / 'bench', ] run_compile_step(compile_cmd) + time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)])) + benchmark_time_limit = 20 + if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations: + warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than " + f"{benchmark_time_limit} seconds for this kernel") + warnings.warn(warn) + results = [] for _ in range(outer_iterations): - benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)])) + benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)])) results.append(benchmark_time) return results diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py index bd1771493434d1166bf899cc8a8188994bbd2101..7564245c049c58288387a4a23918622d974aaaf3 100644 --- a/pystencils/kerncraft_coupling/kerncraft_interface.py +++ b/pystencils/kerncraft_coupling/kerncraft_interface.py @@ -1,20 +1,22 @@ import warnings +import fcntl from collections import defaultdict from tempfile import TemporaryDirectory from typing import Optional -import kerncraft +from jinja2 import Environment, PackageLoader, StrictUndefined + import sympy as sp from kerncraft.kerncraft import KernelCode from kerncraft.machinemodel import MachineModel -from pystencils.astnodes import ( - KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment) +from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment) from pystencils.field import get_layout_from_strides -from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark from pystencils.sympyextensions import count_operations_in_ast from pystencils.transformations import filtered_tree_iteration from pystencils.utils import DotDict +from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.cpu.kernelcreation import add_openmp class PyStencilsKerncraftKernel(KernelCode): @@ -34,8 +36,10 @@ class PyStencilsKerncraftKernel(KernelCode): assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index coordinates is not known. In this case either a structures of array (SoA) or array of structures (AoS) layout is assumed + debug_print: print debug information + filename: used for caching """ - kerncraft.kernel.Kernel.__init__(self, machine) + super(KernelCode, self).__init__(machine=machine) # Initialize state self.asm_block = None @@ -96,7 +100,7 @@ class PyStencilsKerncraftKernel(KernelCode): for field in fields_accessed: layout = get_layout_tuple(field) permuted_shape = list(field.shape[i] for i in layout) - self.set_variable(field.name, str(field.dtype), tuple(permuted_shape)) + self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape)) # Scalars may be safely ignored # for param in ast.get_parameters(): @@ -129,24 +133,64 @@ class PyStencilsKerncraftKernel(KernelCode): print("----------------------------- FLOPS -------------------------------") pprint(self._flops) - def as_code(self, type_='iaca', openmp=False, as_filename=False): + def get_kernel_header(self, name='pystencils_kernel'): + file_name = "pystencils_kernel.h" + file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False) + lock_mode, lock_fp = self.lock_intermediate(file_path) + + if lock_mode == fcntl.LOCK_EX: + function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True) + + jinja_context = { + 'function_signature': function_signature, + } + + env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) + file_header = env.get_template('kernel.h').render(**jinja_context) + with open(file_path, 'w') as f: + f.write(file_header) + + fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock + + return file_path, lock_fp + + def get_kernel_code(self, openmp=False, name='pystencils_kernl'): """ Generate and return compilable source code. Args: - type_: can be iaca or likwid. openmp: if true, openmp code will be generated - as_filename: + name: kernel name """ - code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp) - if as_filename: - fp, already_available = self._get_intermediate_file(f'kernel_{type_}.c', - machine_and_compiler_dependent=False) - if not already_available: - fp.write(code) - return fp.name - else: - return code + filename = 'pystencils_kernl' + if openmp: + filename += '-omp' + filename += '.c' + file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False) + lock_mode, lock_fp = self.lock_intermediate(file_path) + + if lock_mode == fcntl.LOCK_EX: + header_list = get_headers(self.kernel_ast) + includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) + + if openmp: + add_openmp(self.kernel_ast) + + kernel_code = generate_c(self.kernel_ast, dialect='c') + + jinja_context = { + 'includes': includes, + 'kernel_code': kernel_code, + } + + env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) + file_header = env.get_template('kernel.c').render(**jinja_context) + with open(file_path, 'w') as f: + f.write(file_header) + + fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock + + return file_path, lock_fp class KerncraftParameters(DotDict): @@ -161,6 +205,7 @@ class KerncraftParameters(DotDict): self['iterations'] = 10 self['unit'] = 'cy/CL' self['ignore_warnings'] = True + self['incore_model'] = 'OSACA' # ------------------------------------------- Helper functions --------------------------------------------------------- diff --git a/pystencils/kerncraft_coupling/templates/benchmark.c b/pystencils/kerncraft_coupling/templates/benchmark.c new file mode 100644 index 0000000000000000000000000000000000000000..ae70ddd6775a45c0709e95d57cef061da2a4b6b0 --- /dev/null +++ b/pystencils/kerncraft_coupling/templates/benchmark.c @@ -0,0 +1,108 @@ + +#include "kerncraft.h" +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> +#include <math.h> +#include <stdio.h> + +{{ includes }} + +{%- if likwid %} +#include <likwid.h> +{%- endif %} + +#define RESTRICT __restrict__ +#define FUNC_PREFIX +void dummy(void *); +void timing(double* wcTime, double* cpuTime); +extern int var_false; + + +{{kernel_code}} + + +int main(int argc, char **argv) +{ + {%- if likwid %} + likwid_markerInit(); + {%- endif %} + + {%- for field_name, dataType, size in fields %} + + // Initialization {{field_name}} + double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); + for (unsigned long long i = 0; i < {{size}}; ++i) + {{field_name}}[i] = 0.23; + + if(var_false) + dummy({{field_name}}); + + {%- endfor %} + + + + {%- for constantName, dataType in constants %} + + // Constant {{constantName}} + {{dataType}} {{constantName}}; + {{constantName}} = 0.23; + if(var_false) + dummy(& {{constantName}}); + + {%- endfor %} + + {%- if likwid and openmp %} + #pragma omp parallel + { + likwid_markerRegisterRegion("loop"); + #pragma omp barrier + {%- elif likwid %} + likwid_markerRegisterRegion("loop"); + {%- endif %} + + for(int warmup = 1; warmup >= 0; --warmup) { + int repeat = 2; + if(warmup == 0) { + repeat = atoi(argv[1]); + {%- if likwid %} + likwid_markerStartRegion("loop"); + {%- endif %} + } + + {%- if timing %} + double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; + timing(&wcStartTime, &cpuStartTime); + {%- endif %} + + for (; repeat > 0; --repeat) + { + {{kernelName}}({{call_argument_list}}); + + // Dummy calls + {%- for field_name, dataType, size in fields %} + if(var_false) dummy((void*){{field_name}}); + {%- endfor %} + {%- for constantName, dataType in constants %} + if(var_false) dummy((void*)&{{constantName}}); + {%- endfor %} + } + {%- if timing %} + timing(&wcEndTime, &cpuEndTime); + if( warmup == 0) + printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) ); + {%- endif %} + + } + + {%- if likwid %} + likwid_markerStopRegion("loop"); + {%- if openmp %} + } + {%- endif %} + {%- endif %} + + {%- if likwid %} + likwid_markerClose(); + {%- endif %} +} diff --git a/pystencils/kerncraft_coupling/templates/kernel.c b/pystencils/kerncraft_coupling/templates/kernel.c new file mode 100644 index 0000000000000000000000000000000000000000..47fbf7cf25eda318a8fcecffa1477f5738eb1abc --- /dev/null +++ b/pystencils/kerncraft_coupling/templates/kernel.c @@ -0,0 +1,18 @@ + +#include "kerncraft.h" +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> +#include <math.h> +#include <stdio.h> + +{{ includes }} + +#define RESTRICT __restrict__ +#define FUNC_PREFIX +void dummy(void *); +void timing(double* wcTime, double* cpuTime); +extern int var_false; + + +{{kernel_code}} \ No newline at end of file diff --git a/pystencils/kerncraft_coupling/templates/kernel.h b/pystencils/kerncraft_coupling/templates/kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..539d51f928ca5d702dc2dad8de7396a505e5c5ee --- /dev/null +++ b/pystencils/kerncraft_coupling/templates/kernel.h @@ -0,0 +1,3 @@ +#define FUNC_PREFIX + +{{function_signature}} \ No newline at end of file diff --git a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml new file mode 100644 index 0000000000000000000000000000000000000000..37889b8fee94242855e51d5d31a3118dc367bfd6 --- /dev/null +++ b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml @@ -0,0 +1,974 @@ +kerncraft version: 0.8.3.dev0 +model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz +model type: Intel Xeon SandyBridge EN/EP processor +clock: 2.7 GHz + +sockets: 2 +cores per socket: 8 +threads per core: 2 +NUMA domains per socket: 1 +cores per NUMA domain: 8 + +in-core model: !!omap + - IACA: SNB + - OSACA: SNB + - LLVM-MCA: -mcpu=sandybridge +isa: x86 + +FLOPs per cycle: + SP: {total: 16, ADD: 8, MUL: 8} + DP: {total: 8, ADD: 4, MUL: 4} + +compiler: !!omap +- icc: -O3 -xAVX -fno-alias -qopenmp +- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp +- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm + +non-overlapping model: + ports: [2D, 3D] + performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM +overlapping model: + ports: ['0', 0DV, '1', '2', '3', '4', '5'] + performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], + UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3]) + +cacheline size: 64 B +memory hierarchy: +- level: L1 + cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true, + write_back: true, load_from: L2, store_to: L2} + cores per group: 1 + threads per group: 2 + groups: 16 + performance counter metrics: + accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3] + misses: L1D_REPLACEMENT:PMC[0-3] + evicts: L1D_M_EVICT:PMC[0-3] + upstream throughput: [architecture code analyzer, [2D, 3D]] + transfers overlap: false +- level: L2 + cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true, + write_back: true, load_from: L3, store_to: L3} + cores per group: 1 + threads per group: 2 + groups: 16 + upstream throughput: [32 B/cy, half-duplex] + transfers overlap: false + performance counter metrics: + accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3] + misses: L2_LINES_IN_ALL:PMC[0-3] + evicts: L2_TRANS_L2_WB:PMC[0-3] +- level: L3 + cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true, + write_back: true} + cores per group: 8 + threads per group: 16 + groups: 2 + upstream throughput: [32 B/cy, half-duplex] + transfers overlap: false + performance counter metrics: + accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3] + misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01] + + CAS_COUNT_RD:MBOX3C[01]) + evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01] + + CAS_COUNT_WR:MBOX3C[01]) +- level: MEM + cores per group: 8 + upstream throughput: [full socket memory bandwidth, half-duplex] + transfers overlap: false + size per group: + threads per group: 16 + +benchmarks: + kernels: + copy: + FLOPs per iteration: 0 + fastest bench kernel: copy_avx + read streams: {bytes: 8.00 B, streams: 1} + read+write streams: {bytes: 0.00 B, streams: 0} + write streams: {bytes: 8.00 B, streams: 1} + daxpy: + FLOPs per iteration: 2 + fastest bench kernel: daxpy_avx + read streams: {bytes: 16.00 B, streams: 2} + read+write streams: {bytes: 8.00 B, streams: 1} + write streams: {bytes: 8.00 B, streams: 1} + load: + FLOPs per iteration: 0 + fastest bench kernel: load_avx + read streams: {bytes: 8.00 B, streams: 1} + read+write streams: {bytes: 0.00 B, streams: 0} + write streams: {bytes: 0.00 B, streams: 0} + triad: + FLOPs per iteration: 2 + fastest bench kernel: triad_avx + read streams: {bytes: 24.00 B, streams: 3} + read+write streams: {bytes: 0.00 B, streams: 0} + write streams: {bytes: 8.00 B, streams: 1} + update: + FLOPs per iteration: 0 + fastest bench kernel: update_avx + read streams: {bytes: 8.00 B, streams: 1} + read+write streams: {bytes: 8.00 B, streams: 1} + write streams: {bytes: 8.00 B, streams: 1} + measurements: + L1: + 1: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96 + GB/s, 578.56 GB/s, 660.60 GB/s] + daxpy: [116.88 GB/s, 233.68 GB/s, 311.60 GB/s, 409.72 GB/s, 509.79 GB/s, + 559.65 GB/s, 612.77 GB/s, 719.71 GB/s] + load: [84.07 GB/s, 168.13 GB/s, 252.21 GB/s, 336.04 GB/s, 420.34 GB/s, 504.02 + GB/s, 588.04 GB/s, 668.37 GB/s] + triad: [100.24 GB/s, 211.57 GB/s, 314.53 GB/s, 392.73 GB/s, 506.87 GB/s, + 589.51 GB/s, 687.28 GB/s, 782.17 GB/s] + update: [84.77 GB/s, 160.10 GB/s, 237.12 GB/s, 312.74 GB/s, 392.54 GB/s, + 465.53 GB/s, 516.02 GB/s, 567.27 GB/s] + size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, + 21.12 kB, 21.12 kB] + size per thread: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 + kB, 21.12 kB, 21.12 kB] + stats: + copy: + - [83.24 GB/s, 83.25 GB/s, 83.26 GB/s, 83.26 GB/s, 83.27 GB/s, 83.26 GB/s, + 83.25 GB/s, 83.23 GB/s, 83.24 GB/s, 83.25 GB/s] + - [166.49 GB/s, 166.47 GB/s, 166.51 GB/s, 166.49 GB/s, 166.48 GB/s, 166.52 + GB/s, 166.51 GB/s, 166.51 GB/s, 166.51 GB/s, 166.50 GB/s] + - [249.78 GB/s, 249.75 GB/s, 249.73 GB/s, 249.72 GB/s, 249.74 GB/s, 249.76 + GB/s, 249.76 GB/s, 249.74 GB/s, 249.73 GB/s, 249.75 GB/s] + - [332.98 GB/s, 327.92 GB/s, 332.30 GB/s, 332.95 GB/s, 333.00 GB/s, 333.01 + GB/s, 332.95 GB/s, 333.00 GB/s, 332.99 GB/s, 333.02 GB/s] + - [416.26 GB/s, 416.23 GB/s, 416.28 GB/s, 416.27 GB/s, 416.23 GB/s, 416.27 + GB/s, 416.34 GB/s, 416.26 GB/s, 416.16 GB/s, 416.23 GB/s] + - [495.84 GB/s, 495.93 GB/s, 495.88 GB/s, 495.91 GB/s, 495.96 GB/s, 495.92 + GB/s, 495.89 GB/s, 495.87 GB/s, 495.96 GB/s, 495.92 GB/s] + - [578.51 GB/s, 578.52 GB/s, 578.39 GB/s, 578.56 GB/s, 578.48 GB/s, 578.44 + GB/s, 578.51 GB/s, 578.48 GB/s, 578.51 GB/s, 578.53 GB/s] + - [422.14 GB/s, 660.55 GB/s, 660.60 GB/s, 660.49 GB/s, 660.52 GB/s, 660.48 + GB/s, 660.56 GB/s, 660.56 GB/s, 660.52 GB/s, 651.64 GB/s] + daxpy: + - [116.87 GB/s, 116.82 GB/s, 116.85 GB/s, 116.84 GB/s, 116.83 GB/s, 116.85 + GB/s, 116.88 GB/s, 116.87 GB/s, 116.86 GB/s, 116.82 GB/s] + - [214.69 GB/s, 229.83 GB/s, 221.16 GB/s, 233.60 GB/s, 232.90 GB/s, 233.68 + GB/s, 207.83 GB/s, 233.65 GB/s, 212.71 GB/s, 214.07 GB/s] + - [282.77 GB/s, 307.63 GB/s, 307.09 GB/s, 310.67 GB/s, 307.50 GB/s, 311.40 + GB/s, 307.06 GB/s, 305.89 GB/s, 311.60 GB/s, 308.47 GB/s] + - [404.96 GB/s, 408.54 GB/s, 395.76 GB/s, 409.72 GB/s, 316.70 GB/s, 408.07 + GB/s, 347.34 GB/s, 406.03 GB/s, 391.75 GB/s, 385.10 GB/s] + - [479.84 GB/s, 509.24 GB/s, 502.60 GB/s, 449.79 GB/s, 402.46 GB/s, 489.18 + GB/s, 491.15 GB/s, 491.20 GB/s, 384.36 GB/s, 509.79 GB/s] + - [515.12 GB/s, 496.21 GB/s, 517.52 GB/s, 540.00 GB/s, 501.82 GB/s, 507.84 + GB/s, 496.71 GB/s, 479.42 GB/s, 559.65 GB/s, 519.55 GB/s] + - [584.86 GB/s, 580.10 GB/s, 583.34 GB/s, 612.77 GB/s, 607.15 GB/s, 607.89 + GB/s, 589.85 GB/s, 609.59 GB/s, 592.86 GB/s, 568.07 GB/s] + - [719.71 GB/s, 660.98 GB/s, 675.88 GB/s, 679.51 GB/s, 696.97 GB/s, 635.23 + GB/s, 644.06 GB/s, 694.74 GB/s, 654.01 GB/s, 656.57 GB/s] + load: + - [84.04 GB/s, 84.06 GB/s, 84.06 GB/s, 84.04 GB/s, 84.05 GB/s, 84.05 GB/s, + 84.07 GB/s, 84.04 GB/s, 84.05 GB/s, 84.06 GB/s] + - [168.09 GB/s, 168.12 GB/s, 168.06 GB/s, 168.11 GB/s, 168.12 GB/s, 168.13 + GB/s, 168.13 GB/s, 168.12 GB/s, 168.10 GB/s, 168.13 GB/s] + - [252.16 GB/s, 252.21 GB/s, 252.07 GB/s, 252.07 GB/s, 252.18 GB/s, 252.16 + GB/s, 252.21 GB/s, 252.20 GB/s, 252.20 GB/s, 252.17 GB/s] + - [335.94 GB/s, 336.03 GB/s, 335.99 GB/s, 336.04 GB/s, 336.00 GB/s, 335.98 + GB/s, 335.97 GB/s, 335.89 GB/s, 335.99 GB/s, 336.03 GB/s] + - [420.30 GB/s, 420.18 GB/s, 420.30 GB/s, 420.33 GB/s, 420.25 GB/s, 420.28 + GB/s, 420.31 GB/s, 420.31 GB/s, 420.34 GB/s, 420.33 GB/s] + - [503.98 GB/s, 503.99 GB/s, 503.97 GB/s, 503.98 GB/s, 504.02 GB/s, 503.99 + GB/s, 503.92 GB/s, 503.98 GB/s, 503.94 GB/s, 503.97 GB/s] + - [587.93 GB/s, 588.01 GB/s, 588.04 GB/s, 587.94 GB/s, 587.97 GB/s, 588.01 + GB/s, 588.00 GB/s, 587.92 GB/s, 588.04 GB/s, 588.02 GB/s] + - [668.21 GB/s, 668.22 GB/s, 668.29 GB/s, 668.24 GB/s, 668.27 GB/s, 668.37 + GB/s, 668.28 GB/s, 668.14 GB/s, 668.19 GB/s, 668.19 GB/s] + triad: + - [100.00 GB/s, 99.71 GB/s, 99.74 GB/s, 100.24 GB/s, 99.72 GB/s, 99.62 GB/s, + 99.54 GB/s, 99.61 GB/s, 99.72 GB/s, 99.71 GB/s] + - [208.08 GB/s, 210.33 GB/s, 211.57 GB/s, 208.34 GB/s, 210.03 GB/s, 209.16 + GB/s, 210.21 GB/s, 209.48 GB/s, 210.03 GB/s, 208.80 GB/s] + - [311.43 GB/s, 311.08 GB/s, 311.41 GB/s, 311.10 GB/s, 313.13 GB/s, 314.53 + GB/s, 311.59 GB/s, 311.80 GB/s, 311.57 GB/s, 311.89 GB/s] + - [391.65 GB/s, 392.34 GB/s, 391.84 GB/s, 392.07 GB/s, 391.96 GB/s, 392.73 + GB/s, 391.66 GB/s, 391.83 GB/s, 392.09 GB/s, 391.88 GB/s] + - [504.20 GB/s, 506.77 GB/s, 503.22 GB/s, 506.74 GB/s, 502.78 GB/s, 506.15 + GB/s, 506.87 GB/s, 502.85 GB/s, 505.82 GB/s, 506.57 GB/s] + - [587.75 GB/s, 589.51 GB/s, 588.01 GB/s, 587.29 GB/s, 588.04 GB/s, 587.92 + GB/s, 588.08 GB/s, 587.94 GB/s, 587.82 GB/s, 587.55 GB/s] + - [686.03 GB/s, 685.97 GB/s, 685.01 GB/s, 685.88 GB/s, 685.61 GB/s, 687.12 + GB/s, 684.97 GB/s, 686.09 GB/s, 685.81 GB/s, 687.28 GB/s] + - [782.05 GB/s, 781.73 GB/s, 781.13 GB/s, 781.87 GB/s, 782.17 GB/s, 781.24 + GB/s, 781.82 GB/s, 781.92 GB/s, 781.90 GB/s, 781.66 GB/s] + update: + - [84.76 GB/s, 84.76 GB/s, 84.77 GB/s, 84.75 GB/s, 84.75 GB/s, 84.75 GB/s, + 84.75 GB/s, 84.75 GB/s, 84.74 GB/s, 57.21 GB/s] + - [157.73 GB/s, 155.29 GB/s, 147.91 GB/s, 160.10 GB/s, 156.33 GB/s, 158.06 + GB/s, 159.23 GB/s, 156.16 GB/s, 155.30 GB/s, 159.15 GB/s] + - [232.07 GB/s, 230.40 GB/s, 234.05 GB/s, 232.69 GB/s, 215.80 GB/s, 232.76 + GB/s, 236.01 GB/s, 237.12 GB/s, 234.66 GB/s, 234.86 GB/s] + - [303.60 GB/s, 304.21 GB/s, 306.83 GB/s, 309.43 GB/s, 312.69 GB/s, 311.75 + GB/s, 301.74 GB/s, 307.54 GB/s, 312.74 GB/s, 312.19 GB/s] + - [386.45 GB/s, 382.41 GB/s, 387.87 GB/s, 392.54 GB/s, 369.42 GB/s, 341.87 + GB/s, 352.85 GB/s, 390.87 GB/s, 382.44 GB/s, 383.50 GB/s] + - [459.60 GB/s, 384.27 GB/s, 437.39 GB/s, 459.42 GB/s, 465.53 GB/s, 447.31 + GB/s, 440.00 GB/s, 409.94 GB/s, 412.94 GB/s, 446.74 GB/s] + - [489.85 GB/s, 489.35 GB/s, 435.92 GB/s, 492.39 GB/s, 446.44 GB/s, 501.71 + GB/s, 516.02 GB/s, 478.87 GB/s, 494.52 GB/s, 493.04 GB/s] + - [521.08 GB/s, 553.73 GB/s, 541.34 GB/s, 527.75 GB/s, 554.87 GB/s, 536.30 + GB/s, 540.66 GB/s, 551.02 GB/s, 567.27 GB/s, 565.31 GB/s] + threads: [1, 2, 3, 4, 5, 6, 7, 8] + threads per core: 1 + total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB, + 147.84 kB, 168.96 kB] + 2: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [80.41 GB/s, 160.83 GB/s, 240.43 GB/s, 320.63 GB/s, 401.66 GB/s, 454.32 + GB/s, 539.77 GB/s, 628.51 GB/s] + daxpy: [95.87 GB/s, 187.75 GB/s, 270.68 GB/s, 371.80 GB/s, 454.05 GB/s, + 503.46 GB/s, 606.85 GB/s, 689.34 GB/s] + load: [82.30 GB/s, 164.06 GB/s, 244.78 GB/s, 326.21 GB/s, 408.56 GB/s, 490.13 + GB/s, 569.95 GB/s, 651.79 GB/s] + triad: [93.22 GB/s, 186.75 GB/s, 288.55 GB/s, 340.91 GB/s, 442.20 GB/s, + 534.62 GB/s, 597.98 GB/s, 707.54 GB/s] + update: [83.25 GB/s, 166.04 GB/s, 248.21 GB/s, 330.58 GB/s, 414.71 GB/s, + 496.97 GB/s, 578.67 GB/s, 656.56 GB/s] + size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, + 21.12 kB, 21.12 kB] + size per thread: [10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 + kB, 10.56 kB, 10.56 kB] + stats: + copy: + - [80.37 GB/s, 79.07 GB/s, 80.39 GB/s, 80.39 GB/s, 80.41 GB/s, 80.29 GB/s, + 80.36 GB/s, 79.05 GB/s, 77.87 GB/s, 80.37 GB/s] + - [160.76 GB/s, 160.63 GB/s, 160.76 GB/s, 160.71 GB/s, 160.80 GB/s, 160.74 + GB/s, 160.83 GB/s, 160.69 GB/s, 160.79 GB/s, 160.78 GB/s] + - [240.43 GB/s, 240.20 GB/s, 240.36 GB/s, 240.37 GB/s, 237.17 GB/s, 240.39 + GB/s, 240.14 GB/s, 240.24 GB/s, 240.26 GB/s, 240.10 GB/s] + - [320.46 GB/s, 320.47 GB/s, 320.63 GB/s, 320.52 GB/s, 320.40 GB/s, 320.40 + GB/s, 320.51 GB/s, 320.46 GB/s, 319.72 GB/s, 320.44 GB/s] + - [401.40 GB/s, 399.28 GB/s, 401.66 GB/s, 401.53 GB/s, 401.52 GB/s, 401.55 + GB/s, 401.60 GB/s, 401.47 GB/s, 401.47 GB/s, 401.35 GB/s] + - [447.24 GB/s, 453.65 GB/s, 453.54 GB/s, 453.86 GB/s, 453.82 GB/s, 453.62 + GB/s, 453.48 GB/s, 454.32 GB/s, 453.86 GB/s, 446.79 GB/s] + - [538.79 GB/s, 538.47 GB/s, 539.02 GB/s, 538.25 GB/s, 538.72 GB/s, 538.89 + GB/s, 539.37 GB/s, 539.41 GB/s, 539.77 GB/s, 538.49 GB/s] + - [628.14 GB/s, 618.54 GB/s, 628.12 GB/s, 623.90 GB/s, 628.27 GB/s, 623.78 + GB/s, 618.17 GB/s, 623.43 GB/s, 628.51 GB/s, 628.43 GB/s] + daxpy: + - [95.77 GB/s, 93.25 GB/s, 92.87 GB/s, 95.87 GB/s, 95.84 GB/s, 95.81 GB/s, + 95.80 GB/s, 94.99 GB/s, 95.81 GB/s, 95.86 GB/s] + - [184.53 GB/s, 186.60 GB/s, 183.99 GB/s, 187.48 GB/s, 187.75 GB/s, 181.53 + GB/s, 183.82 GB/s, 187.75 GB/s, 184.13 GB/s, 180.61 GB/s] + - [258.46 GB/s, 270.13 GB/s, 264.76 GB/s, 262.23 GB/s, 265.05 GB/s, 267.25 + GB/s, 270.68 GB/s, 268.08 GB/s, 266.20 GB/s, 265.66 GB/s] + - [367.99 GB/s, 367.15 GB/s, 361.68 GB/s, 364.86 GB/s, 368.76 GB/s, 363.27 + GB/s, 364.95 GB/s, 366.97 GB/s, 371.80 GB/s, 366.55 GB/s] + - [441.95 GB/s, 442.77 GB/s, 444.97 GB/s, 454.05 GB/s, 441.02 GB/s, 445.96 + GB/s, 442.49 GB/s, 440.23 GB/s, 449.29 GB/s, 452.66 GB/s] + - [501.31 GB/s, 489.91 GB/s, 495.43 GB/s, 503.39 GB/s, 488.03 GB/s, 497.71 + GB/s, 503.46 GB/s, 496.85 GB/s, 497.38 GB/s, 468.90 GB/s] + - [604.57 GB/s, 580.51 GB/s, 587.67 GB/s, 594.32 GB/s, 561.32 GB/s, 588.09 + GB/s, 606.85 GB/s, 600.91 GB/s, 599.40 GB/s, 598.24 GB/s] + - [646.48 GB/s, 655.06 GB/s, 684.70 GB/s, 653.61 GB/s, 671.61 GB/s, 689.34 + GB/s, 673.74 GB/s, 685.49 GB/s, 681.48 GB/s, 683.23 GB/s] + load: + - [82.19 GB/s, 82.08 GB/s, 82.22 GB/s, 82.10 GB/s, 82.14 GB/s, 82.17 GB/s, + 82.22 GB/s, 82.28 GB/s, 82.30 GB/s, 81.98 GB/s] + - [163.22 GB/s, 163.43 GB/s, 164.06 GB/s, 164.03 GB/s, 163.19 GB/s, 163.83 + GB/s, 163.29 GB/s, 163.88 GB/s, 163.83 GB/s, 163.11 GB/s] + - [244.32 GB/s, 244.47 GB/s, 244.65 GB/s, 244.29 GB/s, 243.96 GB/s, 244.50 + GB/s, 244.78 GB/s, 244.52 GB/s, 244.48 GB/s, 244.72 GB/s] + - [325.18 GB/s, 326.21 GB/s, 325.49 GB/s, 325.86 GB/s, 325.73 GB/s, 325.72 + GB/s, 326.00 GB/s, 325.41 GB/s, 325.63 GB/s, 325.82 GB/s] + - [407.81 GB/s, 407.96 GB/s, 407.59 GB/s, 408.56 GB/s, 407.64 GB/s, 407.61 + GB/s, 408.09 GB/s, 407.95 GB/s, 408.30 GB/s, 408.32 GB/s] + - [488.65 GB/s, 489.73 GB/s, 489.38 GB/s, 489.81 GB/s, 490.13 GB/s, 489.31 + GB/s, 488.74 GB/s, 489.38 GB/s, 488.17 GB/s, 489.51 GB/s] + - [569.95 GB/s, 567.21 GB/s, 566.08 GB/s, 567.88 GB/s, 567.69 GB/s, 569.58 + GB/s, 568.61 GB/s, 568.35 GB/s, 569.70 GB/s, 568.87 GB/s] + - [650.43 GB/s, 651.58 GB/s, 650.86 GB/s, 651.34 GB/s, 651.04 GB/s, 651.79 + GB/s, 650.28 GB/s, 650.31 GB/s, 650.81 GB/s, 651.09 GB/s] + triad: + - [93.22 GB/s, 90.73 GB/s, 92.48 GB/s, 92.53 GB/s, 92.37 GB/s, 92.50 GB/s, + 92.48 GB/s, 90.28 GB/s, 92.35 GB/s, 92.51 GB/s] + - [186.75 GB/s, 184.51 GB/s, 184.17 GB/s, 186.66 GB/s, 186.43 GB/s, 184.59 + GB/s, 186.71 GB/s, 186.30 GB/s, 186.64 GB/s, 186.12 GB/s] + - [287.77 GB/s, 288.55 GB/s, 287.76 GB/s, 287.76 GB/s, 288.19 GB/s, 287.70 + GB/s, 287.42 GB/s, 288.12 GB/s, 287.66 GB/s, 288.01 GB/s] + - [339.82 GB/s, 338.95 GB/s, 340.11 GB/s, 340.11 GB/s, 340.25 GB/s, 340.20 + GB/s, 339.90 GB/s, 340.22 GB/s, 340.91 GB/s, 340.01 GB/s] + - [440.41 GB/s, 440.65 GB/s, 441.59 GB/s, 442.20 GB/s, 441.67 GB/s, 432.59 + GB/s, 440.20 GB/s, 440.81 GB/s, 440.24 GB/s, 441.38 GB/s] + - [534.30 GB/s, 527.60 GB/s, 528.52 GB/s, 509.55 GB/s, 527.68 GB/s, 527.63 + GB/s, 533.66 GB/s, 534.62 GB/s, 534.60 GB/s, 534.19 GB/s] + - [595.90 GB/s, 595.94 GB/s, 597.91 GB/s, 580.22 GB/s, 597.98 GB/s, 597.66 + GB/s, 596.16 GB/s, 567.03 GB/s, 580.88 GB/s, 578.29 GB/s] + - [703.80 GB/s, 705.57 GB/s, 694.84 GB/s, 682.59 GB/s, 694.37 GB/s, 696.56 + GB/s, 704.50 GB/s, 704.95 GB/s, 694.52 GB/s, 707.54 GB/s] + update: + - [83.18 GB/s, 83.24 GB/s, 83.25 GB/s, 83.16 GB/s, 83.22 GB/s, 83.23 GB/s, + 83.22 GB/s, 83.21 GB/s, 83.20 GB/s, 83.17 GB/s] + - [165.65 GB/s, 165.76 GB/s, 165.99 GB/s, 166.04 GB/s, 165.49 GB/s, 165.87 + GB/s, 165.58 GB/s, 165.96 GB/s, 165.67 GB/s, 165.66 GB/s] + - [247.30 GB/s, 248.14 GB/s, 247.84 GB/s, 247.90 GB/s, 247.77 GB/s, 247.60 + GB/s, 248.21 GB/s, 247.95 GB/s, 248.05 GB/s, 247.83 GB/s] + - [330.49 GB/s, 330.07 GB/s, 329.91 GB/s, 329.90 GB/s, 330.58 GB/s, 329.30 + GB/s, 329.92 GB/s, 330.03 GB/s, 330.04 GB/s, 330.12 GB/s] + - [413.89 GB/s, 414.04 GB/s, 413.56 GB/s, 414.06 GB/s, 414.15 GB/s, 413.94 + GB/s, 414.04 GB/s, 414.71 GB/s, 414.32 GB/s, 413.93 GB/s] + - [496.97 GB/s, 496.80 GB/s, 496.17 GB/s, 495.42 GB/s, 496.17 GB/s, 496.66 + GB/s, 495.55 GB/s, 496.27 GB/s, 495.52 GB/s, 496.80 GB/s] + - [564.44 GB/s, 577.86 GB/s, 574.38 GB/s, 571.96 GB/s, 564.76 GB/s, 578.67 + GB/s, 565.89 GB/s, 572.49 GB/s, 571.80 GB/s, 572.01 GB/s] + - [647.68 GB/s, 656.56 GB/s, 655.56 GB/s, 644.04 GB/s, 655.30 GB/s, 648.80 + GB/s, 654.77 GB/s, 653.58 GB/s, 656.27 GB/s, 653.79 GB/s] + threads: [2, 4, 6, 8, 10, 12, 14, 16] + threads per core: 2 + total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB, + 147.84 kB, 168.96 kB] + L2: + 1: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [36.74 GB/s, 73.65 GB/s, 107.11 GB/s, 141.43 GB/s, 179.70 GB/s, 215.63 + GB/s, 247.20 GB/s, 282.42 GB/s] + daxpy: [44.59 GB/s, 88.24 GB/s, 132.21 GB/s, 175.78 GB/s, 219.11 GB/s, 259.95 + GB/s, 305.84 GB/s, 346.83 GB/s] + load: [31.46 GB/s, 62.97 GB/s, 93.73 GB/s, 125.46 GB/s, 157.32 GB/s, 183.63 + GB/s, 214.02 GB/s, 245.17 GB/s] + triad: [37.79 GB/s, 75.08 GB/s, 111.43 GB/s, 148.90 GB/s, 185.54 GB/s, 223.72 + GB/s, 258.53 GB/s, 299.32 GB/s] + update: [48.46 GB/s, 96.10 GB/s, 141.97 GB/s, 189.18 GB/s, 234.73 GB/s, + 280.47 GB/s, 330.94 GB/s, 365.43 GB/s] + size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 + kB, 168.96 kB, 168.96 kB] + size per thread: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 + kB, 168.96 kB, 168.96 kB] + stats: + copy: + - [36.38 GB/s, 36.59 GB/s, 36.18 GB/s, 36.57 GB/s, 36.26 GB/s, 34.61 GB/s, + 35.96 GB/s, 35.84 GB/s, 36.74 GB/s, 36.53 GB/s] + - [68.97 GB/s, 70.42 GB/s, 69.88 GB/s, 71.40 GB/s, 69.05 GB/s, 72.46 GB/s, + 70.32 GB/s, 73.65 GB/s, 72.14 GB/s, 69.81 GB/s] + - [107.08 GB/s, 103.53 GB/s, 107.11 GB/s, 103.66 GB/s, 103.88 GB/s, 106.48 + GB/s, 97.32 GB/s, 105.92 GB/s, 104.16 GB/s, 104.84 GB/s] + - [138.97 GB/s, 136.86 GB/s, 140.88 GB/s, 138.96 GB/s, 140.58 GB/s, 138.51 + GB/s, 141.43 GB/s, 139.53 GB/s, 141.20 GB/s, 139.43 GB/s] + - [158.20 GB/s, 171.06 GB/s, 179.70 GB/s, 171.43 GB/s, 174.27 GB/s, 175.01 + GB/s, 165.20 GB/s, 170.89 GB/s, 173.01 GB/s, 175.17 GB/s] + - [209.74 GB/s, 204.59 GB/s, 215.27 GB/s, 215.63 GB/s, 210.59 GB/s, 206.94 + GB/s, 211.03 GB/s, 201.61 GB/s, 214.45 GB/s, 208.15 GB/s] + - [241.38 GB/s, 246.88 GB/s, 246.90 GB/s, 247.20 GB/s, 235.27 GB/s, 227.39 + GB/s, 239.48 GB/s, 244.45 GB/s, 246.68 GB/s, 235.87 GB/s] + - [271.07 GB/s, 282.42 GB/s, 282.38 GB/s, 276.20 GB/s, 269.85 GB/s, 276.96 + GB/s, 268.64 GB/s, 269.61 GB/s, 279.68 GB/s, 280.63 GB/s] + daxpy: + - [44.54 GB/s, 44.59 GB/s, 44.50 GB/s, 44.42 GB/s, 44.41 GB/s, 44.06 GB/s, + 43.39 GB/s, 44.02 GB/s, 44.34 GB/s, 44.28 GB/s] + - [85.35 GB/s, 87.05 GB/s, 86.47 GB/s, 86.90 GB/s, 86.92 GB/s, 88.24 GB/s, + 87.39 GB/s, 87.60 GB/s, 87.55 GB/s, 84.19 GB/s] + - [129.21 GB/s, 130.47 GB/s, 123.29 GB/s, 127.92 GB/s, 132.21 GB/s, 128.37 + GB/s, 127.09 GB/s, 128.72 GB/s, 129.34 GB/s, 128.69 GB/s] + - [171.53 GB/s, 169.64 GB/s, 173.92 GB/s, 173.74 GB/s, 168.53 GB/s, 171.54 + GB/s, 173.96 GB/s, 175.78 GB/s, 171.29 GB/s, 171.33 GB/s] + - [219.11 GB/s, 208.86 GB/s, 211.66 GB/s, 216.47 GB/s, 212.73 GB/s, 204.90 + GB/s, 208.87 GB/s, 215.75 GB/s, 213.61 GB/s, 214.56 GB/s] + - [250.69 GB/s, 241.36 GB/s, 255.22 GB/s, 250.29 GB/s, 253.80 GB/s, 256.34 + GB/s, 254.38 GB/s, 259.95 GB/s, 245.69 GB/s, 259.12 GB/s] + - [296.08 GB/s, 301.77 GB/s, 297.40 GB/s, 305.84 GB/s, 288.62 GB/s, 283.76 + GB/s, 293.61 GB/s, 291.93 GB/s, 299.74 GB/s, 289.76 GB/s] + - [344.46 GB/s, 334.36 GB/s, 339.31 GB/s, 330.88 GB/s, 343.26 GB/s, 327.28 + GB/s, 344.53 GB/s, 346.83 GB/s, 344.29 GB/s, 346.28 GB/s] + load: + - [31.40 GB/s, 31.23 GB/s, 31.29 GB/s, 31.24 GB/s, 31.46 GB/s, 31.20 GB/s, + 31.33 GB/s, 30.01 GB/s, 30.08 GB/s, 31.40 GB/s] + - [61.20 GB/s, 60.74 GB/s, 61.93 GB/s, 61.22 GB/s, 61.20 GB/s, 60.03 GB/s, + 59.33 GB/s, 59.94 GB/s, 58.54 GB/s, 62.97 GB/s] + - [91.53 GB/s, 93.73 GB/s, 93.05 GB/s, 90.07 GB/s, 91.60 GB/s, 90.11 GB/s, + 90.21 GB/s, 90.43 GB/s, 89.15 GB/s, 93.10 GB/s] + - [122.80 GB/s, 116.57 GB/s, 120.68 GB/s, 122.54 GB/s, 122.75 GB/s, 121.79 + GB/s, 125.30 GB/s, 125.46 GB/s, 122.28 GB/s, 124.51 GB/s] + - [151.01 GB/s, 151.10 GB/s, 148.68 GB/s, 151.17 GB/s, 147.24 GB/s, 153.65 + GB/s, 146.48 GB/s, 150.48 GB/s, 150.74 GB/s, 157.32 GB/s] + - [181.52 GB/s, 173.89 GB/s, 181.58 GB/s, 174.01 GB/s, 176.40 GB/s, 179.73 + GB/s, 174.06 GB/s, 181.26 GB/s, 180.57 GB/s, 183.63 GB/s] + - [214.02 GB/s, 205.69 GB/s, 207.64 GB/s, 204.18 GB/s, 208.42 GB/s, 211.39 + GB/s, 206.58 GB/s, 204.90 GB/s, 204.75 GB/s, 208.91 GB/s] + - [232.16 GB/s, 233.90 GB/s, 241.32 GB/s, 237.45 GB/s, 235.41 GB/s, 241.17 + GB/s, 237.52 GB/s, 245.17 GB/s, 241.17 GB/s, 234.08 GB/s] + triad: + - [37.62 GB/s, 37.54 GB/s, 37.79 GB/s, 37.67 GB/s, 37.76 GB/s, 37.77 GB/s, + 37.68 GB/s, 35.83 GB/s, 37.06 GB/s, 37.50 GB/s] + - [72.79 GB/s, 74.76 GB/s, 73.15 GB/s, 74.68 GB/s, 73.88 GB/s, 73.27 GB/s, + 75.08 GB/s, 73.48 GB/s, 71.27 GB/s, 72.05 GB/s] + - [106.26 GB/s, 105.22 GB/s, 109.70 GB/s, 109.07 GB/s, 110.84 GB/s, 111.43 + GB/s, 106.32 GB/s, 109.73 GB/s, 106.22 GB/s, 107.20 GB/s] + - [142.10 GB/s, 148.90 GB/s, 148.11 GB/s, 144.38 GB/s, 144.77 GB/s, 145.42 + GB/s, 147.36 GB/s, 142.94 GB/s, 145.39 GB/s, 139.42 GB/s] + - [182.07 GB/s, 176.75 GB/s, 181.39 GB/s, 183.31 GB/s, 181.87 GB/s, 183.71 + GB/s, 180.48 GB/s, 178.11 GB/s, 181.36 GB/s, 185.54 GB/s] + - [219.85 GB/s, 217.02 GB/s, 218.86 GB/s, 217.09 GB/s, 212.24 GB/s, 212.22 + GB/s, 219.33 GB/s, 208.81 GB/s, 215.84 GB/s, 223.72 GB/s] + - [258.06 GB/s, 232.27 GB/s, 247.04 GB/s, 240.55 GB/s, 236.11 GB/s, 251.88 + GB/s, 258.53 GB/s, 247.32 GB/s, 251.53 GB/s, 245.10 GB/s] + - [273.67 GB/s, 292.81 GB/s, 288.67 GB/s, 289.75 GB/s, 293.98 GB/s, 283.56 + GB/s, 295.33 GB/s, 280.11 GB/s, 299.32 GB/s, 285.18 GB/s] + update: + - [47.30 GB/s, 48.33 GB/s, 48.17 GB/s, 47.38 GB/s, 48.16 GB/s, 46.99 GB/s, + 48.46 GB/s, 47.51 GB/s, 46.20 GB/s, 48.26 GB/s] + - [92.10 GB/s, 92.30 GB/s, 95.73 GB/s, 95.53 GB/s, 86.95 GB/s, 96.10 GB/s, + 94.16 GB/s, 89.72 GB/s, 92.00 GB/s, 93.10 GB/s] + - [137.06 GB/s, 140.40 GB/s, 136.20 GB/s, 139.57 GB/s, 140.69 GB/s, 136.20 + GB/s, 141.53 GB/s, 129.76 GB/s, 136.47 GB/s, 141.97 GB/s] + - [184.84 GB/s, 177.96 GB/s, 178.61 GB/s, 179.03 GB/s, 176.59 GB/s, 180.62 + GB/s, 182.26 GB/s, 182.27 GB/s, 189.18 GB/s, 185.49 GB/s] + - [232.17 GB/s, 217.86 GB/s, 232.40 GB/s, 223.10 GB/s, 228.52 GB/s, 234.73 + GB/s, 232.00 GB/s, 233.14 GB/s, 231.69 GB/s, 225.01 GB/s] + - [276.16 GB/s, 274.80 GB/s, 272.58 GB/s, 272.43 GB/s, 280.47 GB/s, 276.90 + GB/s, 264.76 GB/s, 272.47 GB/s, 277.77 GB/s, 271.42 GB/s] + - [330.94 GB/s, 312.06 GB/s, 312.83 GB/s, 312.62 GB/s, 292.44 GB/s, 315.68 + GB/s, 316.67 GB/s, 321.25 GB/s, 321.71 GB/s, 315.05 GB/s] + - [362.85 GB/s, 356.49 GB/s, 365.43 GB/s, 332.52 GB/s, 354.30 GB/s, 354.68 + GB/s, 335.54 GB/s, 358.54 GB/s, 363.22 GB/s, 360.01 GB/s] + threads: [1, 2, 3, 4, 5, 6, 7, 8] + threads per core: 1 + total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB, + 1.18 MB, 1.35 MB] + 2: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [36.83 GB/s, 72.70 GB/s, 108.11 GB/s, 142.21 GB/s, 178.07 GB/s, 213.30 + GB/s, 251.98 GB/s, 283.06 GB/s] + daxpy: [45.34 GB/s, 90.11 GB/s, 134.85 GB/s, 180.06 GB/s, 224.22 GB/s, 268.27 + GB/s, 312.15 GB/s, 358.38 GB/s] + load: [33.99 GB/s, 67.65 GB/s, 100.93 GB/s, 134.81 GB/s, 165.89 GB/s, 196.09 + GB/s, 233.31 GB/s, 262.05 GB/s] + triad: [38.60 GB/s, 76.58 GB/s, 114.50 GB/s, 150.54 GB/s, 189.60 GB/s, 227.05 + GB/s, 263.75 GB/s, 301.02 GB/s] + update: [49.25 GB/s, 97.34 GB/s, 146.81 GB/s, 194.71 GB/s, 239.97 GB/s, + 287.14 GB/s, 330.84 GB/s, 384.71 GB/s] + size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 + kB, 168.96 kB, 168.96 kB] + size per thread: [84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 + kB, 84.48 kB, 84.48 kB] + stats: + copy: + - [36.83 GB/s, 36.67 GB/s, 34.90 GB/s, 36.44 GB/s, 35.13 GB/s, 35.07 GB/s, + 35.53 GB/s, 36.15 GB/s, 35.85 GB/s, 36.23 GB/s] + - [71.52 GB/s, 70.16 GB/s, 70.67 GB/s, 71.20 GB/s, 72.70 GB/s, 70.14 GB/s, + 70.53 GB/s, 69.17 GB/s, 71.57 GB/s, 70.22 GB/s] + - [104.39 GB/s, 104.74 GB/s, 103.12 GB/s, 108.11 GB/s, 105.30 GB/s, 102.80 + GB/s, 102.90 GB/s, 107.06 GB/s, 103.45 GB/s, 105.45 GB/s] + - [139.02 GB/s, 134.63 GB/s, 140.72 GB/s, 141.32 GB/s, 140.35 GB/s, 141.19 + GB/s, 135.44 GB/s, 142.21 GB/s, 140.96 GB/s, 142.05 GB/s] + - [177.86 GB/s, 177.74 GB/s, 177.42 GB/s, 175.35 GB/s, 176.42 GB/s, 173.13 + GB/s, 174.32 GB/s, 170.24 GB/s, 178.07 GB/s, 177.88 GB/s] + - [206.27 GB/s, 211.63 GB/s, 209.06 GB/s, 210.54 GB/s, 208.80 GB/s, 209.99 + GB/s, 208.77 GB/s, 206.41 GB/s, 213.30 GB/s, 206.39 GB/s] + - [240.18 GB/s, 238.36 GB/s, 244.16 GB/s, 236.26 GB/s, 244.12 GB/s, 238.49 + GB/s, 242.23 GB/s, 244.46 GB/s, 251.98 GB/s, 242.55 GB/s] + - [279.77 GB/s, 282.91 GB/s, 278.73 GB/s, 276.91 GB/s, 283.06 GB/s, 273.23 + GB/s, 278.33 GB/s, 280.88 GB/s, 277.54 GB/s, 281.83 GB/s] + daxpy: + - [45.32 GB/s, 44.62 GB/s, 45.29 GB/s, 45.18 GB/s, 45.17 GB/s, 45.07 GB/s, + 44.69 GB/s, 45.17 GB/s, 45.11 GB/s, 45.34 GB/s] + - [89.94 GB/s, 89.97 GB/s, 89.37 GB/s, 89.90 GB/s, 88.37 GB/s, 89.13 GB/s, + 90.11 GB/s, 89.67 GB/s, 89.90 GB/s, 89.93 GB/s] + - [134.83 GB/s, 134.85 GB/s, 132.02 GB/s, 134.33 GB/s, 133.82 GB/s, 132.39 + GB/s, 131.67 GB/s, 134.62 GB/s, 132.71 GB/s, 131.67 GB/s] + - [175.52 GB/s, 173.36 GB/s, 176.83 GB/s, 177.98 GB/s, 175.73 GB/s, 173.42 + GB/s, 180.06 GB/s, 179.55 GB/s, 176.71 GB/s, 175.85 GB/s] + - [222.00 GB/s, 216.86 GB/s, 220.17 GB/s, 218.14 GB/s, 220.60 GB/s, 219.43 + GB/s, 220.58 GB/s, 224.22 GB/s, 220.89 GB/s, 222.28 GB/s] + - [258.75 GB/s, 262.88 GB/s, 261.77 GB/s, 268.27 GB/s, 263.66 GB/s, 262.59 + GB/s, 266.54 GB/s, 261.67 GB/s, 262.80 GB/s, 263.72 GB/s] + - [298.65 GB/s, 312.15 GB/s, 308.52 GB/s, 304.22 GB/s, 301.87 GB/s, 305.53 + GB/s, 309.84 GB/s, 310.67 GB/s, 310.49 GB/s, 311.99 GB/s] + - [347.55 GB/s, 350.67 GB/s, 348.93 GB/s, 358.38 GB/s, 352.35 GB/s, 352.05 + GB/s, 353.82 GB/s, 356.00 GB/s, 348.07 GB/s, 349.87 GB/s] + load: + - [33.99 GB/s, 32.54 GB/s, 32.94 GB/s, 33.17 GB/s, 33.83 GB/s, 31.55 GB/s, + 31.91 GB/s, 33.86 GB/s, 33.93 GB/s, 33.75 GB/s] + - [66.22 GB/s, 64.94 GB/s, 67.64 GB/s, 67.52 GB/s, 65.01 GB/s, 67.21 GB/s, + 66.07 GB/s, 66.43 GB/s, 67.65 GB/s, 64.84 GB/s] + - [98.58 GB/s, 97.97 GB/s, 98.39 GB/s, 98.50 GB/s, 98.77 GB/s, 97.84 GB/s, + 99.58 GB/s, 100.93 GB/s, 100.50 GB/s, 99.94 GB/s] + - [130.23 GB/s, 131.10 GB/s, 131.04 GB/s, 127.83 GB/s, 134.81 GB/s, 132.68 + GB/s, 131.80 GB/s, 129.42 GB/s, 130.76 GB/s, 126.96 GB/s] + - [164.90 GB/s, 165.18 GB/s, 161.19 GB/s, 164.33 GB/s, 162.76 GB/s, 165.04 + GB/s, 162.20 GB/s, 165.89 GB/s, 164.34 GB/s, 159.66 GB/s] + - [192.69 GB/s, 193.33 GB/s, 188.88 GB/s, 190.70 GB/s, 194.60 GB/s, 190.92 + GB/s, 191.36 GB/s, 192.89 GB/s, 191.85 GB/s, 196.09 GB/s] + - [227.70 GB/s, 223.95 GB/s, 222.79 GB/s, 227.09 GB/s, 227.04 GB/s, 229.45 + GB/s, 228.09 GB/s, 227.83 GB/s, 233.31 GB/s, 227.49 GB/s] + - [257.94 GB/s, 261.47 GB/s, 262.05 GB/s, 257.70 GB/s, 259.70 GB/s, 259.23 + GB/s, 261.09 GB/s, 253.81 GB/s, 254.21 GB/s, 259.34 GB/s] + triad: + - [38.60 GB/s, 36.68 GB/s, 38.07 GB/s, 38.10 GB/s, 37.89 GB/s, 36.48 GB/s, + 38.33 GB/s, 38.12 GB/s, 37.43 GB/s, 37.87 GB/s] + - [76.58 GB/s, 74.97 GB/s, 75.74 GB/s, 76.02 GB/s, 72.66 GB/s, 74.73 GB/s, + 76.37 GB/s, 76.18 GB/s, 74.59 GB/s, 75.75 GB/s] + - [111.71 GB/s, 114.50 GB/s, 108.96 GB/s, 111.49 GB/s, 111.56 GB/s, 111.66 + GB/s, 113.43 GB/s, 114.37 GB/s, 111.67 GB/s, 108.14 GB/s] + - [146.29 GB/s, 147.84 GB/s, 149.09 GB/s, 149.93 GB/s, 150.54 GB/s, 145.50 + GB/s, 145.16 GB/s, 149.47 GB/s, 146.30 GB/s, 149.32 GB/s] + - [186.73 GB/s, 186.46 GB/s, 180.47 GB/s, 187.32 GB/s, 184.34 GB/s, 187.34 + GB/s, 186.55 GB/s, 183.81 GB/s, 189.60 GB/s, 188.70 GB/s] + - [224.81 GB/s, 219.69 GB/s, 227.05 GB/s, 224.25 GB/s, 223.36 GB/s, 225.86 + GB/s, 216.09 GB/s, 221.98 GB/s, 218.47 GB/s, 226.37 GB/s] + - [263.29 GB/s, 259.28 GB/s, 258.81 GB/s, 258.77 GB/s, 256.56 GB/s, 256.49 + GB/s, 256.39 GB/s, 263.75 GB/s, 262.00 GB/s, 261.48 GB/s] + - [299.28 GB/s, 292.80 GB/s, 293.63 GB/s, 297.93 GB/s, 293.02 GB/s, 295.95 + GB/s, 287.92 GB/s, 301.02 GB/s, 300.76 GB/s, 297.01 GB/s] + update: + - [49.07 GB/s, 47.17 GB/s, 47.56 GB/s, 49.25 GB/s, 46.44 GB/s, 49.04 GB/s, + 48.91 GB/s, 49.20 GB/s, 48.30 GB/s, 48.85 GB/s] + - [96.45 GB/s, 97.11 GB/s, 94.03 GB/s, 92.56 GB/s, 95.39 GB/s, 97.34 GB/s, + 96.06 GB/s, 92.25 GB/s, 95.53 GB/s, 97.08 GB/s] + - [137.54 GB/s, 135.13 GB/s, 145.80 GB/s, 141.29 GB/s, 138.99 GB/s, 143.44 + GB/s, 146.81 GB/s, 142.94 GB/s, 133.84 GB/s, 146.33 GB/s] + - [190.64 GB/s, 185.02 GB/s, 194.24 GB/s, 187.48 GB/s, 194.52 GB/s, 188.51 + GB/s, 189.17 GB/s, 194.71 GB/s, 194.37 GB/s, 190.83 GB/s] + - [239.97 GB/s, 219.74 GB/s, 233.72 GB/s, 234.38 GB/s, 235.78 GB/s, 235.11 + GB/s, 235.62 GB/s, 226.09 GB/s, 235.93 GB/s, 230.51 GB/s] + - [280.16 GB/s, 275.22 GB/s, 260.15 GB/s, 286.01 GB/s, 280.61 GB/s, 287.14 + GB/s, 283.75 GB/s, 275.23 GB/s, 283.71 GB/s, 285.38 GB/s] + - [311.15 GB/s, 318.00 GB/s, 325.21 GB/s, 328.34 GB/s, 318.09 GB/s, 328.66 + GB/s, 329.69 GB/s, 316.97 GB/s, 328.51 GB/s, 330.84 GB/s] + - [374.41 GB/s, 369.73 GB/s, 358.15 GB/s, 375.54 GB/s, 384.71 GB/s, 357.66 + GB/s, 369.71 GB/s, 375.35 GB/s, 370.25 GB/s, 364.01 GB/s] + threads: [2, 4, 6, 8, 10, 12, 14, 16] + threads per core: 2 + total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB, + 1.18 MB, 1.35 MB] + L3: + 1: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [21.93 GB/s, 43.10 GB/s, 65.38 GB/s, 85.69 GB/s, 105.64 GB/s, 127.34 + GB/s, 148.22 GB/s, 171.52 GB/s] + daxpy: [30.98 GB/s, 62.27 GB/s, 93.13 GB/s, 123.27 GB/s, 153.64 GB/s, 185.97 + GB/s, 216.67 GB/s, 247.41 GB/s] + load: [23.47 GB/s, 46.84 GB/s, 69.74 GB/s, 92.76 GB/s, 115.37 GB/s, 139.23 + GB/s, 163.12 GB/s, 186.65 GB/s] + triad: [24.72 GB/s, 49.11 GB/s, 72.42 GB/s, 95.36 GB/s, 119.46 GB/s, 144.60 + GB/s, 168.66 GB/s, 189.45 GB/s] + update: [31.39 GB/s, 62.11 GB/s, 91.95 GB/s, 122.24 GB/s, 151.40 GB/s, 182.28 + GB/s, 216.07 GB/s, 239.92 GB/s] + size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89 + MB, 1.65 MB] + size per thread: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89 + MB, 1.65 MB] + stats: + copy: + - [21.64 GB/s, 20.85 GB/s, 20.56 GB/s, 21.69 GB/s, 21.06 GB/s, 21.46 GB/s, + 21.93 GB/s, 21.73 GB/s, 21.83 GB/s, 21.69 GB/s] + - [42.86 GB/s, 42.70 GB/s, 42.72 GB/s, 38.47 GB/s, 42.82 GB/s, 43.10 GB/s, + 42.66 GB/s, 42.44 GB/s, 42.61 GB/s, 42.48 GB/s] + - [64.95 GB/s, 64.34 GB/s, 63.93 GB/s, 65.38 GB/s, 64.36 GB/s, 63.60 GB/s, + 62.65 GB/s, 63.66 GB/s, 63.51 GB/s, 63.75 GB/s] + - [84.07 GB/s, 83.97 GB/s, 83.34 GB/s, 83.91 GB/s, 81.16 GB/s, 85.69 GB/s, + 85.40 GB/s, 85.37 GB/s, 85.42 GB/s, 84.48 GB/s] + - [102.83 GB/s, 104.24 GB/s, 105.42 GB/s, 103.68 GB/s, 105.22 GB/s, 105.64 + GB/s, 103.15 GB/s, 102.02 GB/s, 100.60 GB/s, 105.09 GB/s] + - [125.46 GB/s, 122.23 GB/s, 123.56 GB/s, 124.59 GB/s, 127.03 GB/s, 125.39 + GB/s, 124.50 GB/s, 127.02 GB/s, 126.95 GB/s, 127.34 GB/s] + - [147.99 GB/s, 146.65 GB/s, 139.23 GB/s, 147.69 GB/s, 146.42 GB/s, 145.65 + GB/s, 148.22 GB/s, 143.77 GB/s, 147.96 GB/s, 147.70 GB/s] + - [168.36 GB/s, 168.24 GB/s, 164.99 GB/s, 165.32 GB/s, 167.08 GB/s, 165.98 + GB/s, 165.39 GB/s, 165.84 GB/s, 166.15 GB/s, 171.52 GB/s] + daxpy: + - [30.92 GB/s, 30.74 GB/s, 30.87 GB/s, 30.98 GB/s, 30.45 GB/s, 29.62 GB/s, + 29.54 GB/s, 30.04 GB/s, 30.94 GB/s, 30.93 GB/s] + - [61.96 GB/s, 61.38 GB/s, 61.27 GB/s, 62.27 GB/s, 61.36 GB/s, 61.27 GB/s, + 62.06 GB/s, 60.01 GB/s, 61.49 GB/s, 62.16 GB/s] + - [92.26 GB/s, 93.06 GB/s, 88.45 GB/s, 92.18 GB/s, 93.13 GB/s, 92.11 GB/s, + 92.28 GB/s, 92.28 GB/s, 93.03 GB/s, 92.78 GB/s] + - [123.22 GB/s, 123.06 GB/s, 123.27 GB/s, 119.42 GB/s, 122.94 GB/s, 122.54 + GB/s, 123.24 GB/s, 115.90 GB/s, 121.65 GB/s, 122.47 GB/s] + - [151.70 GB/s, 145.65 GB/s, 149.53 GB/s, 152.52 GB/s, 153.64 GB/s, 152.93 + GB/s, 152.81 GB/s, 153.01 GB/s, 153.04 GB/s, 152.06 GB/s] + - [184.04 GB/s, 171.51 GB/s, 184.83 GB/s, 184.09 GB/s, 185.97 GB/s, 183.75 + GB/s, 184.66 GB/s, 182.54 GB/s, 184.39 GB/s, 184.40 GB/s] + - [198.70 GB/s, 216.51 GB/s, 216.17 GB/s, 203.10 GB/s, 211.40 GB/s, 215.04 + GB/s, 215.48 GB/s, 216.03 GB/s, 216.24 GB/s, 216.67 GB/s] + - [246.02 GB/s, 247.35 GB/s, 245.00 GB/s, 244.65 GB/s, 229.12 GB/s, 243.37 + GB/s, 247.22 GB/s, 247.41 GB/s, 246.03 GB/s, 244.83 GB/s] + load: + - [23.08 GB/s, 23.38 GB/s, 22.88 GB/s, 23.43 GB/s, 23.05 GB/s, 23.23 GB/s, + 22.97 GB/s, 22.39 GB/s, 23.47 GB/s, 23.33 GB/s] + - [46.39 GB/s, 46.40 GB/s, 46.45 GB/s, 46.36 GB/s, 46.69 GB/s, 46.62 GB/s, + 46.84 GB/s, 45.98 GB/s, 46.73 GB/s, 46.80 GB/s] + - [69.18 GB/s, 68.61 GB/s, 69.74 GB/s, 69.34 GB/s, 68.39 GB/s, 69.73 GB/s, + 67.76 GB/s, 69.65 GB/s, 69.70 GB/s, 69.16 GB/s] + - [92.29 GB/s, 91.67 GB/s, 92.76 GB/s, 90.78 GB/s, 92.76 GB/s, 90.76 GB/s, + 91.58 GB/s, 91.60 GB/s, 91.03 GB/s, 92.72 GB/s] + - [114.04 GB/s, 113.82 GB/s, 112.26 GB/s, 112.65 GB/s, 114.09 GB/s, 113.81 + GB/s, 113.72 GB/s, 114.70 GB/s, 115.37 GB/s, 112.57 GB/s] + - [136.42 GB/s, 135.83 GB/s, 134.93 GB/s, 135.43 GB/s, 135.94 GB/s, 139.23 + GB/s, 137.52 GB/s, 137.59 GB/s, 135.97 GB/s, 136.96 GB/s] + - [157.88 GB/s, 163.12 GB/s, 159.53 GB/s, 160.16 GB/s, 162.18 GB/s, 159.58 + GB/s, 161.55 GB/s, 159.81 GB/s, 162.97 GB/s, 163.10 GB/s] + - [183.41 GB/s, 181.86 GB/s, 183.55 GB/s, 183.38 GB/s, 181.66 GB/s, 186.65 + GB/s, 179.62 GB/s, 174.70 GB/s, 180.10 GB/s, 181.49 GB/s] + triad: + - [24.72 GB/s, 23.66 GB/s, 23.58 GB/s, 23.75 GB/s, 23.62 GB/s, 24.37 GB/s, + 24.44 GB/s, 23.57 GB/s, 23.30 GB/s, 23.57 GB/s] + - [49.11 GB/s, 46.87 GB/s, 47.13 GB/s, 46.83 GB/s, 46.58 GB/s, 46.73 GB/s, + 46.32 GB/s, 47.22 GB/s, 46.79 GB/s, 48.73 GB/s] + - [72.29 GB/s, 69.87 GB/s, 70.57 GB/s, 68.89 GB/s, 68.56 GB/s, 69.02 GB/s, + 72.42 GB/s, 69.37 GB/s, 72.34 GB/s, 69.44 GB/s] + - [94.95 GB/s, 94.67 GB/s, 91.05 GB/s, 90.46 GB/s, 95.36 GB/s, 91.63 GB/s, + 94.06 GB/s, 95.30 GB/s, 93.99 GB/s, 94.71 GB/s] + - [119.32 GB/s, 117.99 GB/s, 119.46 GB/s, 117.28 GB/s, 118.97 GB/s, 115.67 + GB/s, 116.64 GB/s, 117.99 GB/s, 119.02 GB/s, 117.75 GB/s] + - [138.63 GB/s, 144.53 GB/s, 144.60 GB/s, 135.72 GB/s, 141.86 GB/s, 139.64 + GB/s, 142.95 GB/s, 140.89 GB/s, 142.10 GB/s, 143.97 GB/s] + - [168.66 GB/s, 166.77 GB/s, 157.10 GB/s, 164.75 GB/s, 164.00 GB/s, 164.38 + GB/s, 163.94 GB/s, 158.58 GB/s, 165.60 GB/s, 164.39 GB/s] + - [184.53 GB/s, 187.00 GB/s, 186.87 GB/s, 179.43 GB/s, 185.70 GB/s, 187.49 + GB/s, 189.45 GB/s, 186.82 GB/s, 188.50 GB/s, 185.96 GB/s] + update: + - [30.60 GB/s, 31.20 GB/s, 30.65 GB/s, 31.39 GB/s, 30.89 GB/s, 30.75 GB/s, + 30.58 GB/s, 30.99 GB/s, 30.69 GB/s, 31.34 GB/s] + - [60.99 GB/s, 62.11 GB/s, 61.42 GB/s, 61.55 GB/s, 61.79 GB/s, 61.24 GB/s, + 61.37 GB/s, 61.74 GB/s, 61.45 GB/s, 61.58 GB/s] + - [91.11 GB/s, 91.21 GB/s, 91.95 GB/s, 91.19 GB/s, 91.14 GB/s, 91.36 GB/s, + 91.30 GB/s, 91.70 GB/s, 90.84 GB/s, 91.09 GB/s] + - [120.90 GB/s, 120.49 GB/s, 121.35 GB/s, 122.24 GB/s, 120.37 GB/s, 119.83 + GB/s, 119.32 GB/s, 119.48 GB/s, 119.11 GB/s, 119.76 GB/s] + - [146.72 GB/s, 147.18 GB/s, 147.81 GB/s, 151.40 GB/s, 147.81 GB/s, 146.84 + GB/s, 147.51 GB/s, 148.15 GB/s, 146.89 GB/s, 148.41 GB/s] + - [179.93 GB/s, 179.68 GB/s, 182.28 GB/s, 179.65 GB/s, 179.06 GB/s, 182.25 + GB/s, 182.03 GB/s, 179.10 GB/s, 178.82 GB/s, 177.84 GB/s] + - [208.84 GB/s, 210.17 GB/s, 210.20 GB/s, 210.81 GB/s, 209.88 GB/s, 211.16 + GB/s, 216.07 GB/s, 211.77 GB/s, 208.89 GB/s, 210.47 GB/s] + - [236.56 GB/s, 239.05 GB/s, 237.81 GB/s, 237.20 GB/s, 238.68 GB/s, 237.69 + GB/s, 239.05 GB/s, 239.38 GB/s, 239.92 GB/s, 238.63 GB/s] + threads: [1, 2, 3, 4, 5, 6, 7, 8] + threads per core: 1 + total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 + MB, 13.20 MB] + 2: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [23.35 GB/s, 45.63 GB/s, 68.10 GB/s, 89.46 GB/s, 111.10 GB/s, 134.20 + GB/s, 154.44 GB/s, 174.89 GB/s] + daxpy: [32.32 GB/s, 64.16 GB/s, 96.12 GB/s, 126.75 GB/s, 156.91 GB/s, 188.57 + GB/s, 221.57 GB/s, 251.65 GB/s] + load: [25.14 GB/s, 50.38 GB/s, 75.49 GB/s, 101.06 GB/s, 126.04 GB/s, 151.12 + GB/s, 172.57 GB/s, 196.91 GB/s] + triad: [25.15 GB/s, 50.37 GB/s, 75.31 GB/s, 99.12 GB/s, 123.25 GB/s, 150.29 + GB/s, 171.60 GB/s, 197.81 GB/s] + update: [32.98 GB/s, 65.60 GB/s, 97.60 GB/s, 130.34 GB/s, 162.76 GB/s, 194.12 + GB/s, 229.02 GB/s, 260.35 GB/s] + size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89 + MB, 1.65 MB] + size per thread: [6.60 MB, 3.30 MB, 2.20 MB, 1.65 MB, 1.32 MB, 1.10 MB, 0.94 + MB, 825.00 kB] + stats: + copy: + - [22.79 GB/s, 22.55 GB/s, 22.86 GB/s, 22.74 GB/s, 23.09 GB/s, 22.51 GB/s, + 23.35 GB/s, 23.32 GB/s, 23.02 GB/s, 22.75 GB/s] + - [45.32 GB/s, 45.15 GB/s, 45.63 GB/s, 44.84 GB/s, 44.54 GB/s, 44.33 GB/s, + 44.68 GB/s, 44.98 GB/s, 44.64 GB/s, 44.75 GB/s] + - [68.10 GB/s, 67.88 GB/s, 67.98 GB/s, 67.32 GB/s, 67.02 GB/s, 67.14 GB/s, + 67.71 GB/s, 67.19 GB/s, 63.08 GB/s, 68.04 GB/s] + - [89.46 GB/s, 88.53 GB/s, 88.51 GB/s, 89.13 GB/s, 89.32 GB/s, 84.53 GB/s, + 87.51 GB/s, 88.95 GB/s, 88.91 GB/s, 87.62 GB/s] + - [108.72 GB/s, 110.42 GB/s, 106.02 GB/s, 111.08 GB/s, 110.70 GB/s, 111.10 + GB/s, 110.24 GB/s, 109.68 GB/s, 109.55 GB/s, 108.86 GB/s] + - [133.21 GB/s, 127.37 GB/s, 132.83 GB/s, 132.67 GB/s, 133.02 GB/s, 132.65 + GB/s, 134.20 GB/s, 132.96 GB/s, 118.86 GB/s, 131.20 GB/s] + - [152.95 GB/s, 153.90 GB/s, 153.80 GB/s, 153.22 GB/s, 153.32 GB/s, 142.75 + GB/s, 152.99 GB/s, 154.44 GB/s, 154.43 GB/s, 152.24 GB/s] + - [174.89 GB/s, 171.49 GB/s, 157.46 GB/s, 172.90 GB/s, 173.42 GB/s, 171.07 + GB/s, 171.82 GB/s, 170.68 GB/s, 172.19 GB/s, 161.38 GB/s] + daxpy: + - [31.88 GB/s, 32.27 GB/s, 31.11 GB/s, 32.20 GB/s, 32.17 GB/s, 32.32 GB/s, + 32.20 GB/s, 32.32 GB/s, 30.76 GB/s, 32.03 GB/s] + - [64.16 GB/s, 63.70 GB/s, 64.04 GB/s, 63.55 GB/s, 60.64 GB/s, 64.05 GB/s, + 63.56 GB/s, 63.36 GB/s, 63.94 GB/s, 63.86 GB/s] + - [96.12 GB/s, 95.66 GB/s, 95.93 GB/s, 95.93 GB/s, 96.10 GB/s, 95.94 GB/s, + 95.78 GB/s, 95.79 GB/s, 95.17 GB/s, 89.44 GB/s] + - [126.04 GB/s, 126.43 GB/s, 126.09 GB/s, 124.90 GB/s, 125.07 GB/s, 125.74 + GB/s, 118.86 GB/s, 125.80 GB/s, 125.10 GB/s, 126.75 GB/s] + - [155.92 GB/s, 155.99 GB/s, 156.32 GB/s, 151.54 GB/s, 156.49 GB/s, 156.91 + GB/s, 154.92 GB/s, 155.92 GB/s, 156.20 GB/s, 154.49 GB/s] + - [185.57 GB/s, 180.38 GB/s, 187.51 GB/s, 187.10 GB/s, 186.44 GB/s, 187.13 + GB/s, 187.31 GB/s, 188.10 GB/s, 187.91 GB/s, 188.57 GB/s] + - [207.55 GB/s, 219.63 GB/s, 219.38 GB/s, 219.81 GB/s, 220.29 GB/s, 219.72 + GB/s, 221.05 GB/s, 216.76 GB/s, 221.57 GB/s, 220.75 GB/s] + - [250.81 GB/s, 250.78 GB/s, 251.19 GB/s, 251.28 GB/s, 249.10 GB/s, 250.42 + GB/s, 251.65 GB/s, 244.31 GB/s, 250.40 GB/s, 250.19 GB/s] + load: + - [24.84 GB/s, 24.86 GB/s, 25.09 GB/s, 25.04 GB/s, 24.74 GB/s, 24.87 GB/s, + 25.01 GB/s, 25.08 GB/s, 25.14 GB/s, 25.00 GB/s] + - [50.03 GB/s, 49.40 GB/s, 50.28 GB/s, 50.08 GB/s, 50.37 GB/s, 49.75 GB/s, + 50.01 GB/s, 50.38 GB/s, 49.89 GB/s, 50.24 GB/s] + - [74.37 GB/s, 74.65 GB/s, 74.40 GB/s, 73.45 GB/s, 73.31 GB/s, 73.00 GB/s, + 75.49 GB/s, 73.94 GB/s, 74.42 GB/s, 74.80 GB/s] + - [99.51 GB/s, 99.43 GB/s, 98.90 GB/s, 99.83 GB/s, 98.74 GB/s, 100.75 GB/s, + 99.33 GB/s, 99.81 GB/s, 100.00 GB/s, 101.06 GB/s] + - [126.04 GB/s, 126.03 GB/s, 124.70 GB/s, 124.86 GB/s, 125.31 GB/s, 124.78 + GB/s, 125.99 GB/s, 123.52 GB/s, 124.45 GB/s, 123.01 GB/s] + - [146.95 GB/s, 150.27 GB/s, 151.12 GB/s, 150.93 GB/s, 150.68 GB/s, 149.75 + GB/s, 150.67 GB/s, 146.01 GB/s, 148.34 GB/s, 149.15 GB/s] + - [169.40 GB/s, 172.12 GB/s, 172.40 GB/s, 171.99 GB/s, 172.57 GB/s, 171.95 + GB/s, 167.06 GB/s, 169.66 GB/s, 168.34 GB/s, 169.45 GB/s] + - [192.68 GB/s, 191.98 GB/s, 192.82 GB/s, 191.84 GB/s, 191.97 GB/s, 196.91 + GB/s, 193.36 GB/s, 190.12 GB/s, 192.04 GB/s, 193.93 GB/s] + triad: + - [24.78 GB/s, 25.03 GB/s, 25.07 GB/s, 24.81 GB/s, 24.65 GB/s, 24.80 GB/s, + 24.71 GB/s, 25.15 GB/s, 24.70 GB/s, 24.25 GB/s] + - [49.63 GB/s, 48.68 GB/s, 49.73 GB/s, 49.97 GB/s, 50.37 GB/s, 49.89 GB/s, + 49.59 GB/s, 49.00 GB/s, 49.96 GB/s, 49.61 GB/s] + - [74.88 GB/s, 74.99 GB/s, 75.31 GB/s, 73.20 GB/s, 74.50 GB/s, 72.88 GB/s, + 73.43 GB/s, 73.74 GB/s, 74.59 GB/s, 74.60 GB/s] + - [95.80 GB/s, 97.67 GB/s, 98.93 GB/s, 97.79 GB/s, 98.74 GB/s, 97.74 GB/s, + 98.87 GB/s, 99.12 GB/s, 97.90 GB/s, 97.96 GB/s] + - [121.15 GB/s, 120.28 GB/s, 120.66 GB/s, 121.19 GB/s, 121.09 GB/s, 121.68 + GB/s, 121.30 GB/s, 123.22 GB/s, 122.51 GB/s, 123.25 GB/s] + - [146.72 GB/s, 146.38 GB/s, 146.25 GB/s, 146.49 GB/s, 146.29 GB/s, 144.30 + GB/s, 142.89 GB/s, 150.29 GB/s, 146.37 GB/s, 146.30 GB/s] + - [166.36 GB/s, 168.18 GB/s, 168.79 GB/s, 170.27 GB/s, 169.26 GB/s, 170.98 + GB/s, 170.77 GB/s, 171.43 GB/s, 169.53 GB/s, 171.60 GB/s] + - [190.83 GB/s, 197.81 GB/s, 196.29 GB/s, 197.12 GB/s, 196.21 GB/s, 188.40 + GB/s, 191.07 GB/s, 195.14 GB/s, 192.48 GB/s, 194.23 GB/s] + update: + - [32.74 GB/s, 32.98 GB/s, 32.73 GB/s, 32.57 GB/s, 32.63 GB/s, 32.41 GB/s, + 32.61 GB/s, 32.24 GB/s, 32.52 GB/s, 32.49 GB/s] + - [65.22 GB/s, 65.07 GB/s, 64.65 GB/s, 65.26 GB/s, 63.70 GB/s, 64.19 GB/s, + 64.35 GB/s, 64.83 GB/s, 65.60 GB/s, 63.99 GB/s] + - [97.60 GB/s, 96.65 GB/s, 97.50 GB/s, 96.07 GB/s, 97.12 GB/s, 96.41 GB/s, + 96.85 GB/s, 96.80 GB/s, 97.10 GB/s, 97.10 GB/s] + - [129.18 GB/s, 127.79 GB/s, 129.50 GB/s, 129.46 GB/s, 128.85 GB/s, 128.69 + GB/s, 129.02 GB/s, 130.34 GB/s, 129.92 GB/s, 129.11 GB/s] + - [160.00 GB/s, 161.81 GB/s, 160.37 GB/s, 159.56 GB/s, 160.38 GB/s, 161.91 + GB/s, 160.54 GB/s, 161.43 GB/s, 160.59 GB/s, 162.76 GB/s] + - [192.24 GB/s, 193.69 GB/s, 191.11 GB/s, 190.65 GB/s, 193.10 GB/s, 191.30 + GB/s, 192.50 GB/s, 193.37 GB/s, 191.98 GB/s, 194.12 GB/s] + - [221.45 GB/s, 229.02 GB/s, 226.33 GB/s, 224.81 GB/s, 225.62 GB/s, 224.79 + GB/s, 226.03 GB/s, 227.09 GB/s, 226.46 GB/s, 225.88 GB/s] + - [255.45 GB/s, 256.52 GB/s, 254.06 GB/s, 257.76 GB/s, 256.85 GB/s, 256.27 + GB/s, 260.35 GB/s, 259.96 GB/s, 258.40 GB/s, 255.79 GB/s] + threads: [2, 4, 6, 8, 10, 12, 14, 16] + threads per core: 2 + total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 + MB, 13.20 MB] + MEM: + 1: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [11.12 GB/s, 20.53 GB/s, 24.86 GB/s, 26.20 GB/s, 26.47 GB/s, 26.35 + GB/s, 26.24 GB/s, 26.17 GB/s] + daxpy: [16.10 GB/s, 30.00 GB/s, 36.88 GB/s, 38.86 GB/s, 39.36 GB/s, 39.19 + GB/s, 39.02 GB/s, 38.88 GB/s] + load: [12.30 GB/s, 23.50 GB/s, 33.04 GB/s, 40.59 GB/s, 44.03 GB/s, 44.56 + GB/s, 44.26 GB/s, 43.77 GB/s] + triad: [12.41 GB/s, 24.13 GB/s, 29.24 GB/s, 30.73 GB/s, 30.68 GB/s, 30.58 + GB/s, 30.54 GB/s, 30.63 GB/s] + update: [17.40 GB/s, 31.16 GB/s, 36.80 GB/s, 39.06 GB/s, 39.80 GB/s, 39.77 + GB/s, 39.50 GB/s, 39.24 GB/s] + size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00 + MB, 42.86 MB, 37.50 MB] + size per thread: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00 + MB, 42.86 MB, 37.50 MB] + stats: + copy: + - [10.83 GB/s, 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 10.82 GB/s, 10.82 GB/s, + 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 11.12 GB/s] + - [20.34 GB/s, 20.38 GB/s, 20.37 GB/s, 20.34 GB/s, 20.41 GB/s, 20.39 GB/s, + 20.39 GB/s, 20.39 GB/s, 20.53 GB/s, 20.35 GB/s] + - [24.70 GB/s, 24.76 GB/s, 24.80 GB/s, 24.86 GB/s, 24.75 GB/s, 24.80 GB/s, + 24.77 GB/s, 24.82 GB/s, 24.81 GB/s, 24.73 GB/s] + - [26.10 GB/s, 26.16 GB/s, 26.14 GB/s, 26.16 GB/s, 26.10 GB/s, 26.15 GB/s, + 26.10 GB/s, 26.15 GB/s, 26.11 GB/s, 26.20 GB/s] + - [26.45 GB/s, 26.44 GB/s, 26.41 GB/s, 26.43 GB/s, 26.45 GB/s, 26.44 GB/s, + 26.46 GB/s, 26.47 GB/s, 26.45 GB/s, 26.44 GB/s] + - [26.34 GB/s, 26.30 GB/s, 26.31 GB/s, 26.33 GB/s, 26.26 GB/s, 26.35 GB/s, + 26.30 GB/s, 26.30 GB/s, 26.30 GB/s, 26.34 GB/s] + - [26.20 GB/s, 26.24 GB/s, 26.21 GB/s, 26.22 GB/s, 26.22 GB/s, 26.20 GB/s, + 26.20 GB/s, 26.23 GB/s, 26.22 GB/s, 26.23 GB/s] + - [26.15 GB/s, 26.17 GB/s, 26.12 GB/s, 26.15 GB/s, 26.15 GB/s, 26.15 GB/s, + 26.12 GB/s, 26.14 GB/s, 26.14 GB/s, 26.17 GB/s] + daxpy: + - [15.77 GB/s, 15.77 GB/s, 16.04 GB/s, 15.68 GB/s, 15.72 GB/s, 15.76 GB/s, + 15.91 GB/s, 15.77 GB/s, 16.10 GB/s, 16.04 GB/s] + - [29.88 GB/s, 29.80 GB/s, 30.00 GB/s, 29.87 GB/s, 29.87 GB/s, 30.00 GB/s, + 29.79 GB/s, 29.80 GB/s, 29.80 GB/s, 29.82 GB/s] + - [36.63 GB/s, 36.73 GB/s, 36.64 GB/s, 36.64 GB/s, 36.81 GB/s, 36.88 GB/s, + 36.62 GB/s, 36.65 GB/s, 36.74 GB/s, 36.71 GB/s] + - [38.82 GB/s, 38.83 GB/s, 38.86 GB/s, 38.81 GB/s, 38.81 GB/s, 38.82 GB/s, + 38.85 GB/s, 38.80 GB/s, 38.84 GB/s, 38.73 GB/s] + - [39.32 GB/s, 39.30 GB/s, 39.34 GB/s, 39.36 GB/s, 39.28 GB/s, 39.33 GB/s, + 39.31 GB/s, 39.25 GB/s, 39.32 GB/s, 39.33 GB/s] + - [39.10 GB/s, 39.12 GB/s, 39.14 GB/s, 39.16 GB/s, 39.17 GB/s, 39.17 GB/s, + 39.13 GB/s, 39.15 GB/s, 39.14 GB/s, 39.19 GB/s] + - [39.01 GB/s, 39.01 GB/s, 39.02 GB/s, 39.02 GB/s, 39.00 GB/s, 39.00 GB/s, + 38.97 GB/s, 39.02 GB/s, 38.98 GB/s, 39.01 GB/s] + - [38.76 GB/s, 38.86 GB/s, 38.83 GB/s, 38.82 GB/s, 38.87 GB/s, 38.88 GB/s, + 38.81 GB/s, 38.83 GB/s, 38.88 GB/s, 38.88 GB/s] + load: + - [11.97 GB/s, 11.96 GB/s, 11.98 GB/s, 11.97 GB/s, 11.96 GB/s, 12.05 GB/s, + 12.30 GB/s, 12.18 GB/s, 11.97 GB/s, 11.96 GB/s] + - [22.85 GB/s, 22.85 GB/s, 22.87 GB/s, 22.94 GB/s, 23.50 GB/s, 22.86 GB/s, + 22.86 GB/s, 23.25 GB/s, 22.85 GB/s, 22.86 GB/s] + - [33.04 GB/s, 32.43 GB/s, 32.51 GB/s, 32.52 GB/s, 32.52 GB/s, 32.81 GB/s, + 32.77 GB/s, 32.54 GB/s, 32.53 GB/s, 32.53 GB/s] + - [39.95 GB/s, 39.94 GB/s, 39.93 GB/s, 40.15 GB/s, 40.59 GB/s, 40.36 GB/s, + 40.28 GB/s, 39.93 GB/s, 39.94 GB/s, 39.98 GB/s] + - [43.98 GB/s, 43.86 GB/s, 43.90 GB/s, 43.80 GB/s, 43.83 GB/s, 43.86 GB/s, + 44.03 GB/s, 43.94 GB/s, 43.83 GB/s, 43.92 GB/s] + - [44.46 GB/s, 44.34 GB/s, 44.56 GB/s, 44.51 GB/s, 44.32 GB/s, 44.32 GB/s, + 44.51 GB/s, 44.48 GB/s, 44.32 GB/s, 44.34 GB/s] + - [44.03 GB/s, 44.26 GB/s, 44.08 GB/s, 44.18 GB/s, 44.10 GB/s, 43.99 GB/s, + 44.07 GB/s, 44.06 GB/s, 43.94 GB/s, 43.97 GB/s] + - [43.48 GB/s, 43.77 GB/s, 43.51 GB/s, 43.49 GB/s, 43.47 GB/s, 43.73 GB/s, + 43.55 GB/s, 43.68 GB/s, 43.49 GB/s, 43.50 GB/s] + triad: + - [12.11 GB/s, 12.02 GB/s, 12.03 GB/s, 12.10 GB/s, 12.03 GB/s, 12.04 GB/s, + 12.05 GB/s, 12.17 GB/s, 12.02 GB/s, 12.41 GB/s] + - [23.43 GB/s, 23.25 GB/s, 23.25 GB/s, 23.36 GB/s, 23.28 GB/s, 23.24 GB/s, + 23.61 GB/s, 23.29 GB/s, 23.31 GB/s, 24.13 GB/s] + - [28.92 GB/s, 29.10 GB/s, 29.17 GB/s, 29.04 GB/s, 28.91 GB/s, 29.16 GB/s, + 28.82 GB/s, 29.01 GB/s, 29.24 GB/s, 28.88 GB/s] + - [30.65 GB/s, 30.62 GB/s, 30.73 GB/s, 30.59 GB/s, 30.69 GB/s, 30.68 GB/s, + 30.59 GB/s, 30.59 GB/s, 30.57 GB/s, 30.67 GB/s] + - [30.53 GB/s, 30.67 GB/s, 30.65 GB/s, 30.53 GB/s, 30.63 GB/s, 30.68 GB/s, + 30.50 GB/s, 30.67 GB/s, 30.64 GB/s, 30.67 GB/s] + - [30.45 GB/s, 30.58 GB/s, 30.51 GB/s, 30.49 GB/s, 30.52 GB/s, 30.49 GB/s, + 30.56 GB/s, 30.55 GB/s, 30.47 GB/s, 30.47 GB/s] + - [30.51 GB/s, 30.47 GB/s, 30.50 GB/s, 30.47 GB/s, 30.52 GB/s, 30.54 GB/s, + 30.54 GB/s, 30.50 GB/s, 30.49 GB/s, 30.50 GB/s] + - [30.58 GB/s, 30.34 GB/s, 30.56 GB/s, 30.54 GB/s, 30.63 GB/s, 30.53 GB/s, + 30.59 GB/s, 30.50 GB/s, 30.54 GB/s, 30.47 GB/s] + update: + - [17.33 GB/s, 17.32 GB/s, 17.34 GB/s, 17.35 GB/s, 17.40 GB/s, 17.35 GB/s, + 17.36 GB/s, 17.39 GB/s, 17.35 GB/s, 17.35 GB/s] + - [31.12 GB/s, 31.15 GB/s, 31.10 GB/s, 31.16 GB/s, 31.07 GB/s, 31.08 GB/s, + 31.09 GB/s, 31.12 GB/s, 31.12 GB/s, 31.08 GB/s] + - [36.80 GB/s, 36.42 GB/s, 35.92 GB/s, 36.39 GB/s, 35.99 GB/s, 35.98 GB/s, + 36.37 GB/s, 36.39 GB/s, 36.38 GB/s, 36.44 GB/s] + - [39.03 GB/s, 39.05 GB/s, 39.02 GB/s, 39.06 GB/s, 39.01 GB/s, 39.02 GB/s, + 39.02 GB/s, 39.00 GB/s, 39.00 GB/s, 39.00 GB/s] + - [39.76 GB/s, 39.80 GB/s, 39.80 GB/s, 39.78 GB/s, 39.76 GB/s, 39.79 GB/s, + 39.79 GB/s, 39.77 GB/s, 39.77 GB/s, 39.71 GB/s] + - [39.71 GB/s, 39.72 GB/s, 39.72 GB/s, 39.66 GB/s, 39.74 GB/s, 39.70 GB/s, + 39.76 GB/s, 39.74 GB/s, 39.77 GB/s, 39.74 GB/s] + - [39.50 GB/s, 39.47 GB/s, 39.45 GB/s, 39.43 GB/s, 39.46 GB/s, 39.45 GB/s, + 39.45 GB/s, 39.40 GB/s, 39.43 GB/s, 39.47 GB/s] + - [39.21 GB/s, 39.18 GB/s, 39.19 GB/s, 39.19 GB/s, 39.21 GB/s, 39.19 GB/s, + 39.18 GB/s, 39.21 GB/s, 39.20 GB/s, 39.24 GB/s] + threads: [1, 2, 3, 4, 5, 6, 7, 8] + threads per core: 1 + total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 + MB, 300.00 MB, 300.00 MB] + 2: + cores: [1, 2, 3, 4, 5, 6, 7, 8] + results: + copy: [10.79 GB/s, 20.46 GB/s, 24.69 GB/s, 25.42 GB/s, 25.63 GB/s, 25.45 + GB/s, 25.32 GB/s, 25.06 GB/s] + daxpy: [15.97 GB/s, 29.70 GB/s, 35.95 GB/s, 37.55 GB/s, 37.81 GB/s, 37.78 + GB/s, 37.64 GB/s, 37.33 GB/s] + load: [13.46 GB/s, 25.84 GB/s, 35.75 GB/s, 40.54 GB/s, 42.38 GB/s, 42.30 + GB/s, 41.85 GB/s, 41.19 GB/s] + triad: [12.05 GB/s, 22.53 GB/s, 27.53 GB/s, 29.10 GB/s, 29.68 GB/s, 29.79 + GB/s, 29.85 GB/s, 29.64 GB/s] + update: [19.12 GB/s, 33.86 GB/s, 38.51 GB/s, 39.38 GB/s, 39.20 GB/s, 38.80 + GB/s, 38.39 GB/s, 38.02 GB/s] + size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00 + MB, 42.86 MB, 37.50 MB] + size per thread: [150.00 MB, 75.00 MB, 50.00 MB, 37.50 MB, 30.00 MB, 25.00 + MB, 21.43 MB, 18.75 MB] + stats: + copy: + - [10.71 GB/s, 10.69 GB/s, 10.71 GB/s, 10.70 GB/s, 10.79 GB/s, 10.58 GB/s, + 10.70 GB/s, 10.69 GB/s, 10.69 GB/s, 10.70 GB/s] + - [20.27 GB/s, 20.31 GB/s, 20.27 GB/s, 20.26 GB/s, 20.31 GB/s, 20.26 GB/s, + 20.24 GB/s, 20.26 GB/s, 20.26 GB/s, 20.46 GB/s] + - [24.69 GB/s, 24.66 GB/s, 24.64 GB/s, 24.63 GB/s, 24.67 GB/s, 24.64 GB/s, + 24.64 GB/s, 24.68 GB/s, 24.61 GB/s, 24.63 GB/s] + - [25.42 GB/s, 25.41 GB/s, 25.40 GB/s, 25.36 GB/s, 25.40 GB/s, 25.39 GB/s, + 25.40 GB/s, 25.38 GB/s, 25.41 GB/s, 25.39 GB/s] + - [25.55 GB/s, 25.57 GB/s, 25.58 GB/s, 25.63 GB/s, 25.57 GB/s, 25.57 GB/s, + 25.58 GB/s, 25.55 GB/s, 25.57 GB/s, 25.49 GB/s] + - [25.42 GB/s, 25.42 GB/s, 25.41 GB/s, 25.39 GB/s, 25.40 GB/s, 25.43 GB/s, + 25.45 GB/s, 25.44 GB/s, 25.43 GB/s, 25.43 GB/s] + - [25.27 GB/s, 25.31 GB/s, 25.28 GB/s, 25.31 GB/s, 25.32 GB/s, 25.31 GB/s, + 25.29 GB/s, 25.30 GB/s, 25.25 GB/s, 25.28 GB/s] + - [25.03 GB/s, 25.01 GB/s, 25.01 GB/s, 25.04 GB/s, 25.00 GB/s, 25.03 GB/s, + 25.06 GB/s, 25.04 GB/s, 25.04 GB/s, 25.04 GB/s] + daxpy: + - [15.81 GB/s, 15.81 GB/s, 15.97 GB/s, 15.62 GB/s, 15.64 GB/s, 15.83 GB/s, + 15.63 GB/s, 15.82 GB/s, 15.81 GB/s, 15.63 GB/s] + - [29.62 GB/s, 29.56 GB/s, 29.61 GB/s, 29.59 GB/s, 29.70 GB/s, 29.61 GB/s, + 29.65 GB/s, 29.65 GB/s, 29.58 GB/s, 29.59 GB/s] + - [35.95 GB/s, 35.89 GB/s, 35.92 GB/s, 35.92 GB/s, 35.95 GB/s, 35.90 GB/s, + 35.87 GB/s, 35.90 GB/s, 35.92 GB/s, 35.82 GB/s] + - [37.55 GB/s, 37.46 GB/s, 37.52 GB/s, 37.51 GB/s, 37.55 GB/s, 37.51 GB/s, + 37.44 GB/s, 37.41 GB/s, 37.50 GB/s, 37.40 GB/s] + - [37.79 GB/s, 37.76 GB/s, 37.80 GB/s, 37.77 GB/s, 37.76 GB/s, 37.81 GB/s, + 37.78 GB/s, 37.81 GB/s, 37.79 GB/s, 37.78 GB/s] + - [37.71 GB/s, 37.68 GB/s, 37.68 GB/s, 37.73 GB/s, 37.74 GB/s, 37.66 GB/s, + 37.78 GB/s, 37.74 GB/s, 37.71 GB/s, 37.70 GB/s] + - [37.61 GB/s, 37.60 GB/s, 37.61 GB/s, 37.62 GB/s, 37.64 GB/s, 37.61 GB/s, + 37.60 GB/s, 37.59 GB/s, 37.63 GB/s, 37.60 GB/s] + - [37.23 GB/s, 37.21 GB/s, 37.26 GB/s, 37.27 GB/s, 37.28 GB/s, 37.33 GB/s, + 37.29 GB/s, 37.31 GB/s, 37.26 GB/s, 37.29 GB/s] + load: + - [13.34 GB/s, 13.36 GB/s, 13.35 GB/s, 13.34 GB/s, 13.35 GB/s, 13.38 GB/s, + 13.46 GB/s, 13.35 GB/s, 13.35 GB/s, 13.35 GB/s] + - [25.63 GB/s, 25.64 GB/s, 25.84 GB/s, 25.64 GB/s, 25.74 GB/s, 25.63 GB/s, + 25.64 GB/s, 25.63 GB/s, 25.64 GB/s, 25.68 GB/s] + - [35.38 GB/s, 35.56 GB/s, 35.50 GB/s, 35.75 GB/s, 35.50 GB/s, 35.39 GB/s, + 35.46 GB/s, 35.39 GB/s, 35.75 GB/s, 35.40 GB/s] + - [40.37 GB/s, 40.37 GB/s, 40.49 GB/s, 40.49 GB/s, 40.42 GB/s, 40.37 GB/s, + 40.54 GB/s, 40.39 GB/s, 40.37 GB/s, 40.51 GB/s] + - [42.34 GB/s, 42.14 GB/s, 42.26 GB/s, 42.17 GB/s, 42.10 GB/s, 42.13 GB/s, + 42.38 GB/s, 42.13 GB/s, 42.21 GB/s, 42.15 GB/s] + - [42.30 GB/s, 42.13 GB/s, 42.20 GB/s, 42.11 GB/s, 42.12 GB/s, 42.12 GB/s, + 42.18 GB/s, 42.25 GB/s, 42.19 GB/s, 42.21 GB/s] + - [41.70 GB/s, 41.76 GB/s, 41.85 GB/s, 41.80 GB/s, 41.71 GB/s, 41.71 GB/s, + 41.80 GB/s, 41.70 GB/s, 41.76 GB/s, 41.75 GB/s] + - [41.02 GB/s, 41.01 GB/s, 41.17 GB/s, 41.12 GB/s, 41.13 GB/s, 41.15 GB/s, + 41.19 GB/s, 41.01 GB/s, 41.10 GB/s, 41.06 GB/s] + triad: + - [11.87 GB/s, 11.89 GB/s, 11.91 GB/s, 11.81 GB/s, 11.83 GB/s, 11.85 GB/s, + 11.90 GB/s, 11.80 GB/s, 11.85 GB/s, 12.05 GB/s] + - [22.53 GB/s, 22.47 GB/s, 22.44 GB/s, 22.46 GB/s, 22.43 GB/s, 22.52 GB/s, + 22.41 GB/s, 22.52 GB/s, 22.48 GB/s, 22.41 GB/s] + - [27.43 GB/s, 27.42 GB/s, 27.47 GB/s, 27.47 GB/s, 27.52 GB/s, 27.49 GB/s, + 27.41 GB/s, 27.42 GB/s, 27.51 GB/s, 27.53 GB/s] + - [29.02 GB/s, 29.03 GB/s, 29.03 GB/s, 29.04 GB/s, 28.89 GB/s, 29.10 GB/s, + 29.02 GB/s, 29.05 GB/s, 28.93 GB/s, 29.01 GB/s] + - [29.66 GB/s, 29.68 GB/s, 29.60 GB/s, 29.62 GB/s, 29.60 GB/s, 29.67 GB/s, + 29.66 GB/s, 29.62 GB/s, 29.62 GB/s, 29.62 GB/s] + - [29.78 GB/s, 29.76 GB/s, 29.77 GB/s, 29.77 GB/s, 29.75 GB/s, 29.79 GB/s, + 29.75 GB/s, 29.77 GB/s, 29.76 GB/s, 29.78 GB/s] + - [29.82 GB/s, 29.85 GB/s, 29.85 GB/s, 29.83 GB/s, 29.82 GB/s, 29.83 GB/s, + 29.83 GB/s, 29.81 GB/s, 29.81 GB/s, 29.80 GB/s] + - [29.54 GB/s, 29.63 GB/s, 29.57 GB/s, 29.56 GB/s, 29.55 GB/s, 29.64 GB/s, + 29.60 GB/s, 29.53 GB/s, 29.54 GB/s, 29.57 GB/s] + update: + - [18.66 GB/s, 18.67 GB/s, 18.66 GB/s, 19.12 GB/s, 18.67 GB/s, 18.67 GB/s, + 18.67 GB/s, 18.67 GB/s, 18.70 GB/s, 18.67 GB/s] + - [33.61 GB/s, 33.34 GB/s, 33.71 GB/s, 33.31 GB/s, 33.34 GB/s, 33.86 GB/s, + 33.62 GB/s, 33.35 GB/s, 33.54 GB/s, 33.34 GB/s] + - [38.51 GB/s, 38.46 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s, 38.46 GB/s, + 38.41 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s] + - [39.37 GB/s, 39.34 GB/s, 39.36 GB/s, 39.35 GB/s, 39.37 GB/s, 39.38 GB/s, + 39.36 GB/s, 39.35 GB/s, 39.31 GB/s, 39.32 GB/s] + - [39.17 GB/s, 39.17 GB/s, 39.16 GB/s, 39.20 GB/s, 39.18 GB/s, 39.17 GB/s, + 39.18 GB/s, 39.15 GB/s, 39.20 GB/s, 39.17 GB/s] + - [38.79 GB/s, 38.79 GB/s, 38.80 GB/s, 38.78 GB/s, 38.78 GB/s, 38.75 GB/s, + 38.80 GB/s, 38.77 GB/s, 38.78 GB/s, 38.78 GB/s] + - [38.36 GB/s, 38.37 GB/s, 38.37 GB/s, 38.39 GB/s, 38.36 GB/s, 38.37 GB/s, + 38.38 GB/s, 38.37 GB/s, 38.35 GB/s, 38.39 GB/s] + - [37.98 GB/s, 37.99 GB/s, 38.02 GB/s, 38.01 GB/s, 38.01 GB/s, 38.00 GB/s, + 38.02 GB/s, 38.00 GB/s, 38.02 GB/s, 38.02 GB/s] + threads: [2, 4, 6, 8, 10, 12, 14, 16] + threads per core: 2 + total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 + MB, 300.00 MB, 300.00 MB] diff --git a/pystencils_tests/kerncraft_inputs/default_machine_file.yaml b/pystencils_tests/kerncraft_inputs/default_machine_file.yaml deleted file mode 100644 index edec1eef99dfcc153c7c6e933b60d1b6edca74be..0000000000000000000000000000000000000000 --- a/pystencils_tests/kerncraft_inputs/default_machine_file.yaml +++ /dev/null @@ -1,277 +0,0 @@ -kerncraft version: 0.7.3 -clock: 2.7 GHz -cores per socket: 8 -cores per NUMA domain: 8 -NUMA domains per socket: 1 -model type: Intel Core SandyBridge EP processor -model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz -sockets: 2 -threads per core: 2 -cacheline size: 64 B -compiler: - !!omap - - icc: -O3 -xAVX -fno-alias -qopenmp - - clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp - - gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp -micro-architecture: SNB -FLOPs per cycle: - SP: {total: 16, ADD: 8, MUL: 8} - DP: {total: 8, ADD: 4, MUL: 4} -overlapping model: - ports: ["0", "0DV", "1", "2", "3", "4", "5"] - performance counter metric: - Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3]) -non-overlapping model: - ports: ["2D", "3D"] - performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM -write-allocate: True -memory hierarchy: - - level: L1 - cache per group: { - 'sets': 64, 'ways': 8, 'cl_size': 64, # 32 kB - 'replacement_policy': 'LRU', - 'write_allocate': True, 'write_back': True, - 'load_from': 'L2', 'store_to': 'L2'} - cores per group: 1 - threads per group: 2 - groups: 16 - performance counter metrics: - accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] - misses: L1D_REPLACEMENT:PMC[0-3] - evicts: L1D_M_EVICT:PMC[0-3] - - level: L2 - cache per group: { - 'sets': 512, 'ways': 8, 'cl_size': 64, # 256 kB - 'replacement_policy': 'LRU', - 'write_allocate': True, 'write_back': True, - 'load_from': 'L3', 'store_to': 'L3'} - cores per group: 1 - threads per group: 2 - groups: 16 - non-overlap upstream throughput: [32 B/cy, 'half-duplex'] - performance counter metrics: - accesses: L1D_REPLACEMENT:PMC[0-3] - misses: L2_LINES_IN_ALL:PMC[0-3] - evicts: L2_TRANS_L2_WB:PMC[0-3] - - level: L3 - cache per group: { - 'sets': 20480, 'ways': 16, 'cl_size': 64, # 20 MB - 'replacement_policy': 'LRU', - 'write_allocate': True, 'write_back': True} - cores per group: 8 - threads per group: 16 - groups: 2 - non-overlap upstream throughput: [32 B/cy, 'half-duplex'] - performance counter metrics: - accesses: L2_LINES_IN_ALL:PMC[0-3] - misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + - CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01]) - evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + - CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01]) - - level: MEM - cores per group: 8 - non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex'] - size per group: null - threads per group: 16 -benchmarks: - kernels: - copy: - FLOPs per iteration: 0 - read streams: {bytes: 8.00 B, streams: 1} - read+write streams: {bytes: 0.00 B, streams: 0} - write streams: {bytes: 8.00 B, streams: 1} - daxpy: - FLOPs per iteration: 2 - read streams: {bytes: 16.00 B, streams: 2} - read+write streams: {bytes: 8.00 B, streams: 1} - write streams: {bytes: 8.00 B, streams: 1} - load: - FLOPs per iteration: 0 - read streams: {bytes: 8.00 B, streams: 1} - read+write streams: {bytes: 0.00 B, streams: 0} - write streams: {bytes: 0.00 B, streams: 0} - triad: - FLOPs per iteration: 2 - read streams: {bytes: 24.00 B, streams: 3} - read+write streams: {bytes: 0.00 B, streams: 0} - write streams: {bytes: 8.00 B, streams: 1} - update: - FLOPs per iteration: 0 - read streams: {bytes: 8.00 B, streams: 1} - read+write streams: {bytes: 8.00 B, streams: 1} - write streams: {bytes: 8.00 B, streams: 1} - measurements: - L1: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [81.98 GB/s, 163.75 GB/s, 245.62 GB/s, 327.69 GB/s, 409.41 GB/s, 489.83 - GB/s, 571.67 GB/s, 653.50 GB/s] - daxpy: [71.55 GB/s, 143.01 GB/s, 214.86 GB/s, 286.26 GB/s, 355.60 GB/s, - 426.71 GB/s, 497.45 GB/s, 568.97 GB/s] - load: [61.92 GB/s, 122.79 GB/s, 183.01 GB/s, 244.30 GB/s, 306.76 GB/s, 368.46 - GB/s, 427.41 GB/s, 490.88 GB/s] - triad: [81.61 GB/s, 163.25 GB/s, 244.92 GB/s, 326.65 GB/s, 406.69 GB/s, - 487.76 GB/s, 569.10 GB/s, 650.39 GB/s] - update: [84.03 GB/s, 168.02 GB/s, 252.10 GB/s, 335.94 GB/s, 419.90 GB/s, - 503.88 GB/s, 587.86 GB/s, 671.88 GB/s] - size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, - 16.00 kB, 16.00 kB] - size per thread: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 - kB, 16.00 kB, 16.00 kB] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00 - kB, 128.00 kB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [79.53 GB/s, 158.70 GB/s, 238.20 GB/s, 317.62 GB/s, 397.09 GB/s, 476.33 - GB/s, 555.69 GB/s, 634.96 GB/s] - daxpy: [70.94 GB/s, 141.90 GB/s, 212.97 GB/s, 283.91 GB/s, 354.93 GB/s, - 425.85 GB/s, 496.74 GB/s, 567.40 GB/s] - load: [57.01 GB/s, 114.11 GB/s, 171.11 GB/s, 228.13 GB/s, 285.15 GB/s, 342.11 - GB/s, 399.11 GB/s, 456.11 GB/s] - triad: [79.48 GB/s, 159.03 GB/s, 238.53 GB/s, 318.04 GB/s, 392.11 GB/s, - 477.10 GB/s, 538.36 GB/s, 636.02 GB/s] - update: [82.75 GB/s, 165.55 GB/s, 248.50 GB/s, 331.32 GB/s, 414.06 GB/s, - 496.82 GB/s, 579.83 GB/s, 662.36 GB/s] - size per core: [16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, 16.00 kB, - 16.00 kB, 16.00 kB] - size per thread: [8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 kB, 8.00 - kB, 8.00 kB] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [16.00 kB, 32.00 kB, 48.00 kB, 64.00 kB, 80.00 kB, 96.00 kB, 112.00 - kB, 128.00 kB] - L2: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [41.28 GB/s, 81.96 GB/s, 120.28 GB/s, 160.70 GB/s, 203.22 GB/s, 239.97 - GB/s, 271.13 GB/s, 307.01 GB/s] - daxpy: [48.85 GB/s, 98.62 GB/s, 143.29 GB/s, 197.76 GB/s, 230.58 GB/s, 284.98 - GB/s, 334.22 GB/s, 385.72 GB/s] - load: [38.51 GB/s, 76.67 GB/s, 114.73 GB/s, 152.90 GB/s, 188.69 GB/s, 223.64 - GB/s, 265.21 GB/s, 289.41 GB/s] - triad: [40.92 GB/s, 83.49 GB/s, 124.48 GB/s, 165.24 GB/s, 206.74 GB/s, 237.90 - GB/s, 274.96 GB/s, 329.09 GB/s] - update: [50.37 GB/s, 100.05 GB/s, 145.43 GB/s, 196.82 GB/s, 244.07 GB/s, - 301.62 GB/s, 336.88 GB/s, 403.78 GB/s] - size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 - kB, 128.00 kB, 128.00 kB] - size per thread: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 - kB, 128.00 kB, 128.00 kB] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00 - kB, 0.90 MB, 1.02 MB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [42.17 GB/s, 83.47 GB/s, 124.57 GB/s, 163.78 GB/s, 202.56 GB/s, 242.80 - GB/s, 276.95 GB/s, 311.36 GB/s] - daxpy: [50.87 GB/s, 98.72 GB/s, 152.12 GB/s, 193.48 GB/s, 251.36 GB/s, 301.72 - GB/s, 352.55 GB/s, 365.28 GB/s] - load: [39.62 GB/s, 79.03 GB/s, 118.03 GB/s, 157.85 GB/s, 196.48 GB/s, 237.44 - GB/s, 276.81 GB/s, 309.71 GB/s] - triad: [44.80 GB/s, 88.35 GB/s, 125.13 GB/s, 169.94 GB/s, 209.60 GB/s, 260.15 - GB/s, 300.75 GB/s, 333.08 GB/s] - update: [49.80 GB/s, 100.70 GB/s, 150.56 GB/s, 196.44 GB/s, 251.90 GB/s, - 280.93 GB/s, 352.74 GB/s, 399.27 GB/s] - size per core: [128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 kB, 128.00 - kB, 128.00 kB, 128.00 kB] - size per thread: [64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00 kB, 64.00 - kB, 64.00 kB, 64.00 kB] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [128.00 kB, 256.00 kB, 384.00 kB, 512.00 kB, 640.00 kB, 768.00 - kB, 0.90 MB, 1.02 MB] - L3: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [23.21 GB/s, 46.01 GB/s, 67.96 GB/s, 90.17 GB/s, 111.47 GB/s, 133.14 - GB/s, 153.84 GB/s, 174.92 GB/s] - daxpy: [30.35 GB/s, 60.32 GB/s, 90.00 GB/s, 119.71 GB/s, 148.87 GB/s, 178.39 - GB/s, 207.10 GB/s, 236.25 GB/s] - load: [23.35 GB/s, 46.52 GB/s, 69.57 GB/s, 92.60 GB/s, 115.77 GB/s, 138.89 - GB/s, 161.82 GB/s, 184.11 GB/s] - triad: [25.18 GB/s, 50.08 GB/s, 74.33 GB/s, 98.78 GB/s, 122.66 GB/s, 146.78 - GB/s, 170.52 GB/s, 194.47 GB/s] - update: [32.67 GB/s, 64.65 GB/s, 95.98 GB/s, 127.29 GB/s, 157.67 GB/s, 188.22 - GB/s, 217.41 GB/s, 246.99 GB/s] - size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 - MB, 1.25 MB] - size per thread: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 - MB, 1.25 MB] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB, - 10.00 MB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [23.83 GB/s, 47.25 GB/s, 69.84 GB/s, 92.61 GB/s, 114.31 GB/s, 136.48 - GB/s, 157.55 GB/s, 178.99 GB/s] - daxpy: [31.52 GB/s, 62.72 GB/s, 93.43 GB/s, 124.29 GB/s, 154.55 GB/s, 185.18 - GB/s, 215.10 GB/s, 245.24 GB/s] - load: [27.63 GB/s, 54.93 GB/s, 81.57 GB/s, 108.63 GB/s, 134.91 GB/s, 161.72 - GB/s, 188.15 GB/s, 214.94 GB/s] - triad: [25.90 GB/s, 51.76 GB/s, 76.73 GB/s, 102.29 GB/s, 126.17 GB/s, 152.10 - GB/s, 176.71 GB/s, 200.64 GB/s] - update: [34.10 GB/s, 67.67 GB/s, 100.62 GB/s, 133.50 GB/s, 165.61 GB/s, - 197.74 GB/s, 228.73 GB/s, 259.05 GB/s] - size per core: [1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 MB, 1.25 - MB, 1.25 MB] - size per thread: [625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00 kB, 625.00 - kB, 625.00 kB, 625.00 kB] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [1.25 MB, 2.50 MB, 3.75 MB, 5.00 MB, 6.25 MB, 7.50 MB, 8.75 MB, - 10.00 MB] - MEM: - 1: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [11.60 GB/s, 21.29 GB/s, 25.94 GB/s, 27.28 GB/s, 27.47 GB/s, 27.36 - GB/s, 27.21 GB/s, 27.12 GB/s] - daxpy: [17.33 GB/s, 31.89 GB/s, 38.65 GB/s, 40.50 GB/s, 40.81 GB/s, 40.62 - GB/s, 40.59 GB/s, 40.26 GB/s] - load: [12.01 GB/s, 23.04 GB/s, 32.79 GB/s, 40.21 GB/s, 43.39 GB/s, 44.14 - GB/s, 44.42 GB/s, 44.40 GB/s] - triad: [12.73 GB/s, 24.27 GB/s, 30.43 GB/s, 31.46 GB/s, 31.77 GB/s, 31.74 - GB/s, 31.65 GB/s, 31.52 GB/s] - update: [18.91 GB/s, 32.43 GB/s, 37.28 GB/s, 39.98 GB/s, 40.99 GB/s, 40.92 - GB/s, 40.61 GB/s, 40.34 GB/s] - size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB, - 5.71 MB, 5.00 MB] - size per thread: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB, - 5.71 MB, 5.00 MB] - threads: [1, 2, 3, 4, 5, 6, 7, 8] - threads per core: 1 - total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB] - 2: - cores: [1, 2, 3, 4, 5, 6, 7, 8] - results: - copy: [10.92 GB/s, 20.62 GB/s, 25.34 GB/s, 26.22 GB/s, 26.32 GB/s, 26.31 - GB/s, 26.22 GB/s, 26.16 GB/s] - daxpy: [17.15 GB/s, 31.96 GB/s, 38.12 GB/s, 39.19 GB/s, 39.38 GB/s, 39.16 - GB/s, 39.06 GB/s, 38.87 GB/s] - load: [13.49 GB/s, 25.92 GB/s, 36.16 GB/s, 41.56 GB/s, 43.34 GB/s, 43.40 - GB/s, 43.01 GB/s, 42.66 GB/s] - triad: [12.38 GB/s, 23.17 GB/s, 28.69 GB/s, 29.98 GB/s, 30.50 GB/s, 30.59 - GB/s, 30.75 GB/s, 30.70 GB/s] - update: [19.67 GB/s, 34.93 GB/s, 39.93 GB/s, 40.79 GB/s, 40.43 GB/s, 40.03 - GB/s, 39.62 GB/s, 39.33 GB/s] - size per core: [40.00 MB, 20.00 MB, 13.33 MB, 10.00 MB, 8.00 MB, 6.67 MB, - 5.71 MB, 5.00 MB] - size per thread: [20.00 MB, 10.00 MB, 6.67 MB, 5.00 MB, 4.00 MB, 3.33 MB, - 2.86 MB, 2.50 MB] - threads: [2, 4, 6, 8, 10, 12, 14, 16] - threads per core: 2 - total size: [40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB, 40.00 MB] - diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py index 0040006097bc5f48461105cb1d0462313c18bd1a..653ed34d90e6ecd45a7a5785fb71d522cc3734f5 100644 --- a/pystencils_tests/test_kerncraft_coupling.py +++ b/pystencils_tests/test_kerncraft_coupling.py @@ -1,28 +1,33 @@ -import os - import numpy as np import pytest import sympy as sp -import kerncraft +from pathlib import Path + +from kerncraft.kernel import KernelCode +from kerncraft.machinemodel import MachineModel +from kerncraft.models import ECM, ECMData, Benchmark from pystencils import Assignment, Field from pystencils.cpu import create_kernel from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel -from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark +from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark +from pystencils.timeloop import TimeLoop -SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__)) -INPUT_FOLDER = os.path.join(SCRIPT_FOLDER, "kerncraft_inputs") +SCRIPT_FOLDER = Path(__file__).parent +INPUT_FOLDER = SCRIPT_FOLDER / "kerncraft_inputs" @pytest.mark.kerncraft def test_compilation(): - machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml") - machine = kerncraft.machinemodel.MachineModel(path_to_yaml=machine_file_path) + machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" + machine = MachineModel(path_to_yaml=machine_file_path) - kernel_file_path = os.path.join(INPUT_FOLDER, "2d-5pt.c") + kernel_file_path = INPUT_FOLDER / "2d-5pt.c" with open(kernel_file_path) as kernel_file: - reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path) - reference_kernel.as_code('likwid') + reference_kernel = KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path) + reference_kernel.get_kernel_header(name='test_kernel') + reference_kernel.get_kernel_code(name='test_kernel') + reference_kernel.get_main_code(kernel_function_name='test_kernel') size = [30, 50, 3] arr = np.zeros(size) @@ -38,31 +43,31 @@ def test_compilation(): @pytest.mark.kerncraft def analysis(kernel, model='ecmdata'): - machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml") - machine = kerncraft.machinemodel.MachineModel(path_to_yaml=machine_file_path) + machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" + machine = MachineModel(path_to_yaml=machine_file_path) if model == 'ecmdata': - model = kerncraft.models.ECMData(kernel, machine, KerncraftParameters()) + model = ECMData(kernel, machine, KerncraftParameters()) elif model == 'ecm': - model = kerncraft.models.ECM(kernel, machine, KerncraftParameters()) + model = ECM(kernel, machine, KerncraftParameters()) # model.analyze() # model.plot() elif model == 'benchmark': - model = kerncraft.models.Benchmark(kernel, machine, KerncraftParameters()) + model = Benchmark(kernel, machine, KerncraftParameters()) else: - model = kerncraft.models.ECM(kernel, machine, KerncraftParameters()) + model = ECM(kernel, machine, KerncraftParameters()) model.analyze() return model @pytest.mark.kerncraft -def test_3d_7pt_iaca(): - # Make sure you use the intel compiler +def test_3d_7pt_osaca(): + size = [20, 200, 200] - kernel_file_path = os.path.join(INPUT_FOLDER, "3d-7pt.c") - machine_file_path = os.path.join(INPUT_FOLDER, "default_machine_file.yaml") - machine = kerncraft.machinemodel.MachineModel(path_to_yaml=machine_file_path) + kernel_file_path = INPUT_FOLDER / "3d-7pt.c" + machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" + machine_model = MachineModel(path_to_yaml=machine_file_path) with open(kernel_file_path) as kernel_file: - reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=machine, filename=kernel_file_path) + reference_kernel = KernelCode(kernel_file.read(), machine=machine_model, filename=kernel_file_path) reference_kernel.set_constant('M', size[0]) reference_kernel.set_constant('N', size[1]) assert size[1] == size[2] @@ -76,7 +81,7 @@ def test_3d_7pt_iaca(): update_rule = Assignment(b[0, 0, 0], s * rhs) ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast, machine) + k = PyStencilsKerncraftKernel(ast, machine=machine_model) analysis(k, model='ecm') assert reference_kernel._flops == k._flops # assert reference.results['cl throughput'] == analysis.results['cl throughput'] @@ -85,9 +90,9 @@ def test_3d_7pt_iaca(): @pytest.mark.kerncraft def test_2d_5pt(): size = [30, 50, 3] - kernel_file_path = os.path.join(INPUT_FOLDER, "2d-5pt.c") + kernel_file_path = INPUT_FOLDER / "2d-5pt.c" with open(kernel_file_path) as kernel_file: - reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) + reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) reference = analysis(reference_kernel) arr = np.zeros(size) @@ -107,9 +112,9 @@ def test_2d_5pt(): @pytest.mark.kerncraft def test_3d_7pt(): size = [30, 50, 50] - kernel_file_path = os.path.join(INPUT_FOLDER, "3d-7pt.c") + kernel_file_path = INPUT_FOLDER / "3d-7pt.c" with open(kernel_file_path) as kernel_file: - reference_kernel = kerncraft.kernel.KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) + reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) reference_kernel.set_constant('M', size[0]) reference_kernel.set_constant('N', size[1]) assert size[1] == size[2] @@ -128,3 +133,29 @@ def test_3d_7pt(): for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): assert e1 == e2 + + +@pytest.mark.kerncraft +def test_benchmark(): + size = [30, 50, 50] + arr = np.zeros(size) + a = Field.create_from_numpy_array('a', arr, index_dimensions=0) + b = Field.create_from_numpy_array('b', arr, index_dimensions=0) + s = sp.Symbol("s") + rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1] + + update_rule = Assignment(b[0, 0, 0], s * rhs) + ast = create_kernel([update_rule]) + + c_benchmark_run = run_c_benchmark(ast, inner_iterations=1000, outer_iterations=1) + + kernel = ast.compile() + a = np.full(size, fill_value=0.23) + b = np.full(size, fill_value=0.23) + + timeloop = TimeLoop(steps=1) + timeloop.add_call(kernel, {'a': a, 'b': b, 's': 0.23}) + + timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1) + + np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)