Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 0 additions and 3124 deletions
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
#include <assert.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
/* see waLBerla src/field/allocation/AlignedMalloc */
void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
{
// With 0 alignment this function makes no sense
// use normal malloc instead
assert( alignment > 0 );
// Tests if alignment is power of two (assuming alignment>0)
assert( !(alignment & (alignment - 1)) );
assert( offset < alignment );
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
pa=std::malloc( (size+2*alignment-1 )+sizeof(void *));
if(!pa)
return nullptr;
// Find next aligned position, starting at pa+sizeof(void*)-1
ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
ptr=(void*) ( (char*)(ptr) + alignment - offset);
// Store pointer to real allocated chunk just before usable chunk
*((void **)ptr-1)=pa;
assert( ((size_t)ptr+offset) % alignment == 0 );
return ptr;
}
void aligned_free( void *ptr )
{
// assume that pointer to real allocated chunk is stored just before
// chunk that was given to user
if(ptr)
std::free(*((void **)ptr-1));
}
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
// Initialization {{field_name}}
{%- if alignment > 0 %}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
{%- else %}
{{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
{%- endif %}
for (unsigned long long i = 0; i < {{elements}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
{%- if alignment > 0 %}
aligned_free({{field_name}});
{%- else %}
delete[] {{field_name}};
{%- endif %}
{%- endfor %}
}
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
\ No newline at end of file
#define FUNC_PREFIX
{{function_signature}}
\ No newline at end of file
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
# TODO size of int??? unisgned???
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
from pystencils.enums import Target, Backend
from pystencils.llvm.llvmjit import make_python_function
from pystencils.transformations import insert_casts
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None, target=Target.CPU):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
Args:
assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
function_name: name of the generated function - only important if generated code is written out
type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean which are assumed to be 'bool' .
split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
iteration_slice: if not None, iteration is done only over this slice of the field
ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
if None, the number of ghost layers is determined automatically and assumed to be equal for a
all dimensions
:return: :class:`pystencils.ast.KernelFunction` node
"""
if target == Target.CPU:
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
elif target == Target.GPU:
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
code = create_cuda_kernel(assignments,
function_name,
type_info,
iteration_slice=iteration_slice,
ghost_layers=ghost_layers)
else:
NotImplementedError(f'{target} is not implemented for LLVM kernel creation')
code._backend = Backend.LLVM
code.body = insert_casts(code.body)
code._compile_function = make_python_function
return code
import functools
import llvmlite.ir as ir
import llvmlite.llvmpy.core as lc
import sympy as sp
from sympy import Indexed, S
from sympy.printing.printer import Printer
from pystencils import Target
from pystencils.assignment import Assignment
from pystencils.data_types import (
collate_types, create_composite_type_from_string, create_type, get_type_of_expression,
to_llvm_type)
from pystencils.llvm.control_flow import Loop
# From Numba
def set_cuda_kernel(lfunc):
from llvmlite.llvmpy.core import MetaData, MetaDataString, Constant, Type
m = lfunc.module
ops = lfunc, MetaDataString.get(m, "kernel"), Constant.int(Type.int(), 1)
md = MetaData.get(m, ops)
nmd = m.get_or_insert_named_metadata('nvvm.annotations')
nmd.add(md)
# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
m.add_named_metadata('nvvmir.version', md_ver)
# From Numba
def _call_sreg(builder, name):
module = builder.module
fnty = lc.Type.function(lc.Type.int(), ())
fn = module.get_or_insert_function(fnty, name=name)
return builder.call(fn, ())
def generate_llvm(ast_node, module=None, builder=None, target=Target.CPU):
"""Prints the ast as llvm code."""
if module is None:
module = lc.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder, target=target)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, target=Target.CPU, *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
self.target = target
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError(f"Symbol not found: {s}")
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
# KernelFunction does not posses a return type
return_type = self.void
parameter_type = []
parameters = func.get_parameters()
for parameter in parameters:
parameter_type.append(to_llvm_type(parameter.symbol.dtype, nvvm_target=self.target == Target.GPU))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = parameters[i].symbol.name
self.func_arg_map[parameters[i].symbol.name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
if self.target == Target.GPU:
set_cuda_kernel(fn)
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_boolean_cast_func(self, conversion):
return self._print_cast_func(conversion)
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int32"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): functools.partial(self.builder.zext, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
if piece.has(Assignment):
raise NotImplementedError('The llvm-backend does not support assignments'
'in the Piecewise function. It is questionable'
'whether to implement it. So far there is no'
'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece), nvvm_target=self.target == Target.GPU))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
def _print_Conditional(self, node):
cond = self._print(node.condition_expr)
with self.builder.if_else(cond) as (then, otherwise):
with then:
self._print(node.true_block) # emit instructions for when the predicate is true
with otherwise:
self._print(node.false_block) # emit instructions for when the predicate is true
# No return!
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
mro = inspect.getmro(expr)
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
# from: https://llvm.org/docs/NVPTXUsage.html#nvptx-intrinsics
INDEXING_FUNCTION_MAPPING = {
'blockIdx': 'llvm.nvvm.read.ptx.sreg.ctaid',
'threadIdx': 'llvm.nvvm.read.ptx.sreg.tid',
'blockDim': 'llvm.nvvm.read.ptx.sreg.ntid',
'gridDim': 'llvm.nvvm.read.ptx.sreg.nctaid'
}
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
function_name = self.INDEXING_FUNCTION_MAPPING[function_name]
name = f"{function_name}.{dimension}"
return self.builder.zext(_call_sreg(self.builder, name), self.integer)
import ctypes as ct
import subprocess
from functools import partial
from itertools import chain
from os.path import exists, join
import llvmlite.binding as llvm
import llvmlite.ir as ir
import numpy as np
from pystencils.data_types import create_composite_type_from_string
from pystencils.enums import Target
from pystencils.field import FieldType
from ..data_types import StructType, ctypes_from_llvm, to_ctypes
from .llvm import generate_llvm
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if param.is_field_parameter:
try:
field_arr = argument_dict[param.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + param.field_name)
symbolic_field = param.fields[0]
if param.is_field_pointer:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(param.symbol.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
(param.field_name, str(field_arr.shape), str(symbolic_field.shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(param.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif param.is_field_shape:
data_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(data_type(field_arr.shape[param.symbol.coordinate]))
elif param.is_field_stride:
data_type = to_ctypes(param.symbol.dtype)
assert field_arr.strides[param.symbol.coordinate] % field_arr.itemsize == 0
item_stride = field_arr.strides[param.symbol.coordinate] // field_arr.itemsize
ct_arguments.append(data_type(item_stride))
else:
assert False
else:
try:
value = argument_dict[param.symbol.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + param.symbol.name)
expected_type = to_ctypes(param.symbol.dtype)
ct_arguments.append(expected_type(value))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
return wrapper
def generate_and_jit(ast):
target = ast.target
gen = generate_llvm(ast, target=target)
if isinstance(gen, ir.Module):
return compile_llvm(gen, target, ast)
else:
return compile_llvm(gen.module, target, ast)
def make_python_function(ast, argument_dict=None, func=None):
if argument_dict is None:
argument_dict = {}
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.get_parameters(), argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module, target=Target.CPU, ast=None):
jit = CudaJit(ast) if target == Target.GPU else Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
try:
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
except RuntimeError:
self.target_machine = self.target.create_target_machine(cpu=self.cpu, opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
# Following code more or less from numba
class CudaJit(Jit):
CUDA_TRIPLE = {32: 'nvptx-nvidia-cuda',
64: 'nvptx64-nvidia-cuda'}
MACHINE_BITS = tuple.__itemsize__ * 8
data_layout = {
32: ('e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64'),
64: ('e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-'
'f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64')}
default_data_layout = data_layout[MACHINE_BITS]
def __init__(self, ast):
# super().__init__()
# self.target = llvm.Target.from_triple(self.CUDA_TRIPLE[self.MACHINE_BITS])
self._data_layout = self.default_data_layout[self.MACHINE_BITS]
# self._target_data = llvm.create_target_data(self._data_layout)
self.indexing = ast.indexing
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = False
pmb.slp_vectorize = False
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
pm.run(self.llvmmod)
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def parse(self, module):
llvmmod = module
llvmmod.triple = self.CUDA_TRIPLE[self.MACHINE_BITS]
llvmmod.data_layout = self.default_data_layout
llvmmod.verify()
llvmmod.name = 'module'
self._llvmmod = llvm.parse_assembly(str(llvmmod))
def compile(self):
from pystencils.cpu.cpujit import get_cache_config, get_compiler_config, get_llc_command
import hashlib
compiler_cache = get_cache_config()['object_cache']
ir_file = join(compiler_cache, hashlib.md5(str(self._llvmmod).encode()).hexdigest() + '.ll')
ptx_file = ir_file.replace('.ll', '.ptx')
try:
from pycuda.driver import Context
arch = "sm_%d%d" % Context.get_device().compute_capability()
except Exception:
arch = "sm_35"
if not exists(ptx_file):
self.write_ll(ir_file)
if 'llc' in get_compiler_config():
llc_command = get_compiler_config()['llc']
else:
llc_command = get_llc_command() or 'llc'
subprocess.check_call([llc_command, '-mcpu=' + arch, ir_file, '-o', ptx_file])
# cubin_file = ir_file.replace('.ll', '.cubin')
# if not exists(cubin_file):
# subprocess.check_call(['ptxas', '--gpu-name', arch, ptx_file, '-o', cubin_file])
import pycuda.driver
cuda_module = pycuda.driver.module_from_file(ptx_file) # also works: cubin_file
self.cuda_module = cuda_module
def __call__(self, func, *args, **kwargs):
shape = [a.shape for a in chain(args, kwargs.values()) if hasattr(a, 'shape')][0]
block_and_thread_numbers = self.indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
self.cuda_module.get_function(func)(*args, **kwargs, **block_and_thread_numbers)
def get_function_ptr(self, name):
return partial(self._call__, name)
"""
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
"""
Automatically initializes OpenCL context using any device.
Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
try:
init_globally()
except Exception as e:
import warnings
warnings.warn(str(e))
import numpy as np
from pystencils.backends.cbackend import get_headers
from pystencils.backends.opencl_backend import generate_opencl
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
USE_FAST_MATH = True
_global_cl_ctx = None
_global_cl_queue = None
def get_global_cl_queue():
return _global_cl_queue
def get_global_cl_ctx():
return _global_cl_ctx
def init_globally(device_index=0):
import pyopencl as cl
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = cl.create_some_context(device_index)
_global_cl_queue = cl.CommandQueue(_global_cl_ctx)
def init_globally_with_context(opencl_ctx, opencl_queue):
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = opencl_ctx
_global_cl_queue = opencl_queue
def clear_global_ctx():
global _global_cl_ctx
global _global_cl_queue
_global_cl_ctx = None
_global_cl_queue = None
def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
"""
Creates a **OpenCL** kernel function from an abstract syntax tree which
was created for the ``target='Target.GPU'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
Args:
opencl_queue: a valid :class:`pyopencl.CommandQueue`
opencl_ctx: a valid :class:`pyopencl.Context`
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
Returns:
compiled kernel as Python function
"""
import pyopencl as cl
if not opencl_ctx:
opencl_ctx = _global_cl_ctx
if not opencl_queue:
opencl_queue = _global_cl_queue
assert opencl_ctx, "No valid OpenCL context!\n" \
"Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
assert opencl_queue, "No valid OpenCL queue!\n" \
"Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
if argument_dict is None:
argument_dict = {}
# check if double precision is supported and required
if any([d.double_fp_config == 0 for d in opencl_ctx.devices]):
for param in kernel_function_node.get_parameters():
if param.symbol.dtype.base_type:
if param.symbol.dtype.base_type.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
else:
if param.symbol.dtype.numpy_dtype == np.float64:
raise ValueError('OpenCL device does not support double precision')
# Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!)
kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __kernel\n"
code += "#define RESTRICT restrict\n\n"
code += str(generate_opencl(kernel_function_node, custom_backend=custom_backend))
options = []
if USE_FAST_MATH:
options.append("-cl-unsafe-math-optimizations")
options.append("-cl-mad-enable")
options.append("-cl-fast-relaxed-math")
options.append("-cl-finite-math-only")
options.append("-I")
options.append(get_pystencils_include_path())
mod = cl.Program(opencl_ctx, code).build(options=options)
func = getattr(mod, kernel_function_node.function_name)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
assert not any(isinstance(a, np.ndarray)
for a in full_arguments.values()), 'Calling a OpenCL kernel with a Numpy array!'
assert not any('pycuda' in str(type(a))
for a in full_arguments.values()), 'Calling a OpenCL kernel with a PyCUDA array!'
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
block_and_thread_numbers['grid']))
args = _build_numpy_argument_list(parameters, full_arguments)
args = [a.data if hasattr(a, 'data') else a for a in args]
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.get_parameters()
wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
return wrapper
# FIXME
# FIXME performance counters might be wrong. This will only affect the Benchmark model
# FIXME bandwidth measurements need validation
# FIXME
kerncraft version: 0.7.2
model name: Intel(R) Xeon(R) Gold 5122 CPU @ 3.60GHz
model type: Intel Core Skylake SP
sockets: 2
cores per socket: 4
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 4
clock: 3.6 GHz
FLOPs per cycle:
SP:
total: 64
FMA: 64
ADD: 32
MUL: 32
DP:
total: 32
FMA: 32
ADD: 16
MUL: 16
micro-architecture: SKX
compiler:
!!omap
- icc: -O3 -fno-alias -xCORE-AVX512
- clang: -O3 -march=skylake-avx512 -D_POSIX_C_SOURCE=200112L
- gcc: -O3 -march=skylake-avx512
cacheline size: 64 B
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5", "6", "7"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_6:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_7:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
memory hierarchy:
- level: L1
performance counter metrics:
accesses: MEM_INST_RETIRED_ALL_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L2_TRANS_L1D_WB:PMC[0-3]
cache per group:
sets: 64
ways: 8
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: L2
store_to: L2
size per group: 32.00 kB
groups: 8
cores per group: 1
threads per group: 2
- level: L2
non-overlap upstream throughput: [64 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 1024
ways: 16
cl_size: 64
replacement_policy: 'LRU'
write_allocate: True
write_back: True
load_from: null # L3 is a victim cache, thus unless a hit in L3, misses get forwarded to MEM
victims_to: L3 # all victims, modified or not are passed onto L3
store_to: L3
size per group: 1.00 MB
groups: 8
cores per group: 1
threads per group: 2
- level: L3
non-overlap upstream throughput: [16 B/cy, 'full-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
# FIXME not all misses in L2 lead to loads from L3, only the hits do
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_WR:MBOX0C[01] +
CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_WR:MBOX2C[01] +
CAS_COUNT_RD:MBOX3C[01] + CAS_COUNT_WR:MBOX3C[01] +
CAS_COUNT_RD:MBOX4C[01] + CAS_COUNT_WR:MBOX4C[01] +
CAS_COUNT_RD:MBOX5C[01] + CAS_COUNT_WR:MBOX5C[01])
evicts: L2_TRANS_L2_WB:PMC[0-3]
cache per group:
sets: 16896
# TODO is actuall something else, but necessary to get to 16.5 MB
ways: 16
# TODO is actually 11, but pycachesim only supports powers of two
cl_size: 64
replacement_policy: 'LRU'
write_allocate: False
write_back: True
size per group: 16.50 MB
groups: 2
cores per group: 4
threads per group: 8
- level: MEM
cores per group: 4
threads per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
penalty cycles per read stream: 0
size per group:
benchmarks:
kernels:
load:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 0
bytes: 0.00 B
FLOPs per iteration: 0
copy:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
update:
read streams:
streams: 1
bytes: 8.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 0
triad:
read streams:
streams: 3
bytes: 24.00 B
read+write streams:
streams: 0
bytes: 0.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
daxpy:
read streams:
streams: 2
bytes: 16.00 B
read+write streams:
streams: 1
bytes: 8.00 B
write streams:
streams: 1
bytes: 8.00 B
FLOPs per iteration: 2
measurements:
L1:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 42.98 GB/s
- 85.08 GB/s
- 127.45 GB/s
- 169.92 GB/s
copy:
- 56.07 GB/s
- 111.50 GB/s
- 164.90 GB/s
- 221.50 GB/s
update:
- 56.54 GB/s
- 112.25 GB/s
- 168.50 GB/s
- 224.75 GB/s
triad:
- 45.90 GB/s
- 89.81 GB/s
- 127.29 GB/s
- 169.57 GB/s
daxpy:
- 36.62 GB/s
- 71.30 GB/s
- 103.52 GB/s
- 135.26 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 21.12 kB
- 21.12 kB
- 21.12 kB
- 21.12 kB
size per thread:
- 10.56 kB
- 10.56 kB
- 10.56 kB
- 10.56 kB
total size:
- 21.12 kB
- 42.24 kB
- 63.36 kB
- 84.48 kB
results:
load:
- 49.61 GB/s
- 98.80 GB/s
- 147.98 GB/s
- 198.22 GB/s
copy:
- 55.98 GB/s
- 111.56 GB/s
- 167.08 GB/s
- 220.42 GB/s
update:
- 56.53 GB/s
- 112.72 GB/s
- 168.95 GB/s
- 225.31 GB/s
triad:
- 54.01 GB/s
- 104.58 GB/s
- 153.02 GB/s
- 200.93 GB/s
daxpy:
- 41.11 GB/s
- 80.28 GB/s
- 115.71 GB/s
- 152.81 GB/s
L2:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 27.15 GB/s
- 54.09 GB/s
- 80.61 GB/s
- 106.41 GB/s
copy:
- 43.53 GB/s
- 90.07 GB/s
- 127.73 GB/s
- 171.81 GB/s
update:
- 50.38 GB/s
- 98.47 GB/s
- 147.91 GB/s
- 197.20 GB/s
triad:
- 43.38 GB/s
- 83.72 GB/s
- 124.83 GB/s
- 166.04 GB/s
daxpy:
- 36.29 GB/s
- 71.29 GB/s
- 103.33 GB/s
- 136.48 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 660.00 kB
- 660.00 kB
- 660.00 kB
- 660.00 kB
size per thread:
- 330.00 kB
- 330.00 kB
- 330.00 kB
- 330.00 kB
total size:
- 660.00 kB
- 1.32 MB
- 1.98 MB
- 2.64 MB
results:
load:
- 35.29 GB/s
- 70.28 GB/s
- 104.67 GB/s
- 139.63 GB/s
copy:
- 42.23 GB/s
- 83.70 GB/s
- 124.33 GB/s
- 167.50 GB/s
update:
- 50.09 GB/s
- 99.77 GB/s
- 149.87 GB/s
- 198.82 GB/s
triad:
- 52.38 GB/s
- 100.00 GB/s
- 147.40 GB/s
- 193.31 GB/s
daxpy:
- 41.14 GB/s
- 80.22 GB/s
- 116.23 GB/s
- 155.08 GB/s
L3:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 22.40 GB/s
- 44.77 GB/s
- 65.71 GB/s
- 89.26 GB/s
copy:
- 25.32 GB/s
- 49.70 GB/s
- 72.89 GB/s
- 98.62 GB/s
update:
- 41.24 GB/s
- 81.14 GB/s
- 122.22 GB/s
- 166.44 GB/s
triad:
- 25.61 GB/s
- 50.02 GB/s
- 73.23 GB/s
- 98.95 GB/s
daxpy:
- 32.07 GB/s
- 62.65 GB/s
- 89.91 GB/s
- 120.65 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 10.56 MB
- 5.28 MB
- 3.52 MB
- 2.64 MB
size per thread:
- 5.28 MB
- 2.64 MB
- 1.76 MB
- 1.32 MB
total size:
- 10.56 MB
- 10.56 MB
- 10.56 MB
- 10.56 MB
results:
load:
- 26.18 GB/s
- 51.85 GB/s
- 75.82 GB/s
- 101.39 GB/s
copy:
- 26.22 GB/s
- 51.83 GB/s
- 76.40 GB/s
- 102.84 GB/s
update:
- 43.51 GB/s
- 86.75 GB/s
- 129.86 GB/s
- 174.54 GB/s
triad:
- 26.39 GB/s
- 51.80 GB/s
- 76.27 GB/s
- 102.66 GB/s
daxpy:
- 37.43 GB/s
- 73.16 GB/s
- 106.53 GB/s
- 142.76 GB/s
MEM:
1:
threads per core: 1
cores:
- 1
- 2
- 3
- 4
threads:
- 1
- 2
- 3
- 4
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 12.03 GB/s
- 24.38 GB/s
- 34.83 GB/s
- 45.05 GB/s
copy:
- 12.32 GB/s
- 24.40 GB/s
- 32.82 GB/s
- 37.00 GB/s
update:
- 20.83 GB/s
- 40.25 GB/s
- 48.81 GB/s
- 54.84 GB/s
triad:
- 11.64 GB/s
- 23.17 GB/s
- 34.78 GB/s
- 42.97 GB/s
daxpy:
- 17.69 GB/s
- 34.02 GB/s
- 48.12 GB/s
- 55.73 GB/s
2:
threads per core: 2
cores:
- 1
- 2
- 3
- 4
threads:
- 2
- 4
- 6
- 8
size per core:
- 240.00 MB
- 120.00 MB
- 80.00 MB
- 60.00 MB
size per thread:
- 120.00 MB
- 60.00 MB
- 40.00 MB
- 30.00 MB
total size:
- 240.00 MB
- 240.00 MB
- 240.00 MB
- 240.00 MB
results:
load:
- 15.33 GB/s
- 28.32 GB/s
- 41.34 GB/s
- 53.02 GB/s
copy:
- 13.96 GB/s
- 26.61 GB/s
- 34.39 GB/s
- 38.96 GB/s
update:
- 26.47 GB/s
- 47.82 GB/s
- 56.70 GB/s
- 62.78 GB/s
triad:
- 14.42 GB/s
- 26.66 GB/s
- 36.94 GB/s
- 44.01 GB/s
daxpy:
- 20.96 GB/s
- 39.12 GB/s
- 51.55 GB/s
- 58.37 GB/s
import math
import os
import time
import numpy as np
import sympy as sp
from git import Repo
from influxdb import InfluxDBClient
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, Benchmark, Roofline, RooflineIACA
from kerncraft.prefixedunit import PrefixedUnit
from pystencils import Assignment, Field, create_kernel
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
def output_benchmark(analysis):
output = {}
keys = ['Runtime (per repetition) [s]', 'Iterations per repetition',
'Runtime (per cacheline update) [cy/CL]', 'MEM volume (per repetition) [B]',
'Performance [MFLOP/s]', 'Performance [MLUP/s]', 'Performance [MIt/s]', 'MEM BW [MByte/s]']
copies = {key: analysis[key] for key in keys}
output.update(copies)
for cache, metrics in analysis['data transfers'].items():
for metric_name, metric_value in metrics.items():
fixed = metric_value.with_prefix('')
output[cache + ' ' + metric_name + ' ' + fixed.prefix + fixed.unit] = fixed.value
for level, value in analysis['ECM'].items():
output['Phenomenological ECM ' + level + ' cy/CL'] = value
return output
def output_ecm(analysis):
output = {}
keys = ['T_nOL', 'T_OL', 'cl throughput', 'uops']
copies = {key: analysis[key] for key in keys}
output.update(copies)
if 'memory bandwidth kernel' in analysis:
output['memory bandwidth kernel' + analysis['memory bandwidth kernel'] + analysis['memory bandwidth'].prefix +
analysis['memory bandwidth'].unit] = analysis['memory bandwidth'].value
output['scaling cores'] = int(analysis['scaling cores']) if not math.isinf(analysis['scaling cores']) else -1
for key, value in analysis['cycles']:
output[key] = value
return output
def output_roofline(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def output_roofline_iaca(analysis):
output = {}
keys = ['min performance'] # 'bottleneck level'
copies = {key: analysis[key] for key in keys}
# output.update(copies)
# TODO save bottleneck information (compute it here)
# fixed = analysis['max_flops'].with_prefix('G')
# output['max GFlop/s'] = fixed.value
# if analysis['min performance'] > max_flops:
# # CPU bound
# print('CPU bound with {} cores(s)'.format(self._args.cores), file=output_file)
# print('{!s} due to CPU max. FLOP/s'.format(max_flops), file=output_file)
# else:
# Memory bound
bottleneck = analysis['mem bottlenecks'][analysis['bottleneck level']]
output['bottleneck GFlop/s'] = bottleneck['performance'].with_prefix('G').value
output['bottleneck level'] = bottleneck['level']
output['bottleneck bw kernel'] = bottleneck['bw kernel']
output['bottleneck arithmetic intensity'] = bottleneck['arithmetic intensity']
for i, level in enumerate(analysis['mem bottlenecks']):
if level is None:
continue
for key, value in level.items():
if isinstance(value, PrefixedUnit):
fixed = value.with_prefix('G')
output['level ' + str(i) + ' ' + key + ' [' + fixed.prefix + fixed.unit + ']'] = 'inf' if isinstance(
fixed.value, float) and math.isinf(fixed.value) else fixed.value
else:
output['level ' + str(i) + ' ' + key] = 'inf' if isinstance(value, float) and math.isinf(
value) else value
return output
def report_analysis(ast, models, machine, tags, fields=None):
kernel = PyStencilsKerncraftKernel(ast, machine)
client = InfluxDBClient('i10grafana.informatik.uni-erlangen.de', 8086, 'pystencils',
'roggan', 'pystencils')
repo = Repo(search_parent_directories=True)
commit = repo.head.commit
point_time = int(time.time())
for model in models:
benchmark = model(kernel, machine, KerncraftParameters())
benchmark.analyze()
analysis = benchmark.results
if model is Benchmark:
output = output_benchmark(analysis)
elif model is ECM:
output = output_ecm(analysis)
elif model is Roofline:
output = output_roofline(analysis)
elif model is RooflineIACA:
output = output_roofline_iaca(analysis)
else:
raise ValueError('No valid model for analysis given!')
if fields is not None:
output.update(fields)
output['commit'] = commit.hexsha
json_body = [
{
'measurement': model.__name__,
'tags': tags,
'time': point_time,
'fields': output
}
]
client.write_points(json_body, time_precision='s')
def main():
size = [20, 200, 200]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
s = sp.Symbol("s")
rhs = a[0, -1, 0] + a[0, 1, 0] + \
a[-1, 0, 0] + a[1, 0, 0] + \
a[0, 0, -1] + a[0, 0, 1]
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
input_folder = "./"
machine_file_path = os.path.join(input_folder, "SkylakeSP_Gold-5122_allinclusive.yaml")
machine = MachineModel(path_to_yaml=machine_file_path)
tags = {
'host': os.uname()[1],
'project': 'pystencils',
'kernel': 'jacobi_3D ' + str(size)
}
report_analysis(ast, [ECM, Roofline, RooflineIACA, Benchmark], machine, tags)
if __name__ == '__main__':
main()
import numpy as np
import sympy as sp
from pystencils import Assignment, Field, create_kernel
def meassure():
size = [30, 50, 3]
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
s = sp.Symbol("s")
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
updateRule = Assignment(b[0, 0], s * rhs)
print(updateRule)
ast = create_kernel([updateRule])
# benchmark = generate_benchmark(ast)
# main = benchmark[0]
# kernel = benchmark[1]
# with open('src/main.cpp', 'w') as file:
# file.write(main)
# with open('src/kernel.cpp', 'w') as file:
# file.write(kernel)
func = ast.compile({'omega': 2/3})
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.kerncraft_coupling import BenchmarkAnalysis
from pystencils.kerncraft_coupling.kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECMData
machineFilePath = "../pystencils_tests/kerncraft_inputs/default_machine_file.yaml"
machine = MachineModel(path_to_yaml=machineFilePath)
benchmark = BenchmarkAnalysis(ast, machine)
#TODO what do i want to do with benchmark?
kernel = PyStencilsKerncraftKernel(ast)
model = ECMData(kernel, machine, KerncraftParameters())
model.analyze()
model.report()
if __name__ == "__main__":
meassure()
/*
* Copyright (2008-2009) Intel Corporation All Rights Reserved.
* The source code contained or described herein and all documents
* related to the source code ("Material") are owned by Intel Corporation
* or its suppliers or licensors. Title to the Material remains with
* Intel Corporation or its suppliers and licensors. The Material
* contains trade secrets and proprietary and confidential information
* of Intel or its suppliers and licensors. The Material is protected
* by worldwide copyright and trade secret laws and treaty provisions.
* No part of the Material may be used, copied, reproduced, modified,
* published, uploaded, posted, transmitted, distributed, or disclosed
* in any way without Intel(R)s prior express written permission.
*
* No license under any patent, copyright, trade secret or other
* intellectual property right is granted to or conferred upon you by
* disclosure or delivery of the Materials, either expressly, by implication,
* inducement, estoppel or otherwise. Any license under such intellectual
* property rights must be express and approved by Intel in writing.
*/
#if defined (__GNUC__)
#define IACA_SSC_MARK( MARK_ID ) \
__asm__ __volatile__ ( \
"\n\t movl $"#MARK_ID", %%ebx" \
"\n\t .byte 0x64, 0x67, 0x90" \
: : : "memory" );
#else
#define IACA_SSC_MARK(x) {__asm mov ebx, x\
__asm _emit 0x64 \
__asm _emit 0x67 \
__asm _emit 0x90 }
#endif
#define IACA_START {IACA_SSC_MARK(111)}
#define IACA_END {IACA_SSC_MARK(222)}
#ifdef _WIN64
#include <intrin.h>
#define IACA_VC64_START __writegsbyte(111, 111);
#define IACA_VC64_END __writegsbyte(222, 222);
#endif
/**************** asm *****************
;START_MARKER
mov ebx, 111
db 0x64, 0x67, 0x90
;END_MARKER
mov ebx, 222
db 0x64, 0x67, 0x90
**************************************/
#include "iacaMarks.h"
int main(int argc, char * argv[]){
int a = 0;
for(int i = 0; i < argc+100000; i++){
IACA_START
a += i;
}
IACA_END
return a;
}
double a[30][50][3];
double b[30][50][3];
double s;
for(int j=1; j<30-1; ++j)
for(int i=1; i<50-1; ++i)
b[j][i] = ( a[j][i-1] + a[j][i+1]
+ a[j-1][i] + a[j+1][i]) * s;
double a[M][N][N];
double b[M][N][N];
double s;
for(int k=1; k<M-1; ++k)
for(int j=1; j<N-1; ++j)
for(int i=1; i<N-1; ++i)
b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
+ a[k][j-1][i] + a[k][j+1][i]
+ a[k-1][j][i] + a[k+1][j][i]) * s;
kerncraft version: 0.8.6.dev0
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
model type: Intel Xeon SandyBridge EN/EP processor
clock: 2.7 GHz
sockets: 2
cores per socket: 8
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 8
transparent hugepage: always
in-core model: !!omap
- IACA: SNB
- OSACA: SNB
- LLVM-MCA: -mcpu=sandybridge
isa: x86
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
compiler: !!omap
- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
overlapping model:
ports:
IACA: ['0', 0DV, '1', '2', '3', '4', '5']
OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports:
IACA: [2D, 3D]
OSACA: [2D, 3D]
LLVM-MCA: [SBPort23]
performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
cacheline size: 64 B
memory hierarchy:
- level: L1
cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true, load_from: L2, store_to: L2}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
upstream throughput: [architecture code analyzer, [2D, 3D]]
transfers overlap: false
- level: L2
cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true, load_from: L3, store_to: L3}
cores per group: 1
threads per group: 2
groups: 16
upstream throughput: [32 B/cy, half-duplex]
transfers overlap: false
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true}
cores per group: 8
threads per group: 16
groups: 2
upstream throughput: [32 B/cy, half-duplex]
transfers overlap: false
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01]
+ CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01]
+ CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
upstream throughput: [full socket memory bandwidth, half-duplex]
transfers overlap: false
size per group:
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
fastest bench kernel: copy_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
fastest bench kernel: daxpy_avx
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
fastest bench kernel: load_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
fastest bench kernel: triad_avx
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
fastest bench kernel: update_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96
GB/s, 578.56 GB/s, 660.60 GB/s]
daxpy: [116.88 GB/s, 233.68 GB/s, 311.60 GB/s, 409.72 GB/s, 509.79 GB/s,
559.65 GB/s, 612.77 GB/s, 719.71 GB/s]
load: [84.07 GB/s, 168.13 GB/s, 252.21 GB/s, 336.04 GB/s, 420.34 GB/s, 504.02
GB/s, 588.04 GB/s, 668.37 GB/s]
triad: [100.24 GB/s, 211.57 GB/s, 314.53 GB/s, 392.73 GB/s, 506.87 GB/s,
589.51 GB/s, 687.28 GB/s, 782.17 GB/s]
update: [84.77 GB/s, 160.10 GB/s, 237.12 GB/s, 312.74 GB/s, 392.54 GB/s,
465.53 GB/s, 516.02 GB/s, 567.27 GB/s]
size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
21.12 kB, 21.12 kB]
size per thread: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12
kB, 21.12 kB, 21.12 kB]
stats:
copy:
- [83.24 GB/s, 83.25 GB/s, 83.26 GB/s, 83.26 GB/s, 83.27 GB/s, 83.26 GB/s,
83.25 GB/s, 83.23 GB/s, 83.24 GB/s, 83.25 GB/s]
- [166.49 GB/s, 166.47 GB/s, 166.51 GB/s, 166.49 GB/s, 166.48 GB/s, 166.52
GB/s, 166.51 GB/s, 166.51 GB/s, 166.51 GB/s, 166.50 GB/s]
- [249.78 GB/s, 249.75 GB/s, 249.73 GB/s, 249.72 GB/s, 249.74 GB/s, 249.76
GB/s, 249.76 GB/s, 249.74 GB/s, 249.73 GB/s, 249.75 GB/s]
- [332.98 GB/s, 327.92 GB/s, 332.30 GB/s, 332.95 GB/s, 333.00 GB/s, 333.01
GB/s, 332.95 GB/s, 333.00 GB/s, 332.99 GB/s, 333.02 GB/s]
- [416.26 GB/s, 416.23 GB/s, 416.28 GB/s, 416.27 GB/s, 416.23 GB/s, 416.27
GB/s, 416.34 GB/s, 416.26 GB/s, 416.16 GB/s, 416.23 GB/s]
- [495.84 GB/s, 495.93 GB/s, 495.88 GB/s, 495.91 GB/s, 495.96 GB/s, 495.92
GB/s, 495.89 GB/s, 495.87 GB/s, 495.96 GB/s, 495.92 GB/s]
- [578.51 GB/s, 578.52 GB/s, 578.39 GB/s, 578.56 GB/s, 578.48 GB/s, 578.44
GB/s, 578.51 GB/s, 578.48 GB/s, 578.51 GB/s, 578.53 GB/s]
- [422.14 GB/s, 660.55 GB/s, 660.60 GB/s, 660.49 GB/s, 660.52 GB/s, 660.48
GB/s, 660.56 GB/s, 660.56 GB/s, 660.52 GB/s, 651.64 GB/s]
daxpy:
- [116.87 GB/s, 116.82 GB/s, 116.85 GB/s, 116.84 GB/s, 116.83 GB/s, 116.85
GB/s, 116.88 GB/s, 116.87 GB/s, 116.86 GB/s, 116.82 GB/s]
- [214.69 GB/s, 229.83 GB/s, 221.16 GB/s, 233.60 GB/s, 232.90 GB/s, 233.68
GB/s, 207.83 GB/s, 233.65 GB/s, 212.71 GB/s, 214.07 GB/s]
- [282.77 GB/s, 307.63 GB/s, 307.09 GB/s, 310.67 GB/s, 307.50 GB/s, 311.40
GB/s, 307.06 GB/s, 305.89 GB/s, 311.60 GB/s, 308.47 GB/s]
- [404.96 GB/s, 408.54 GB/s, 395.76 GB/s, 409.72 GB/s, 316.70 GB/s, 408.07
GB/s, 347.34 GB/s, 406.03 GB/s, 391.75 GB/s, 385.10 GB/s]
- [479.84 GB/s, 509.24 GB/s, 502.60 GB/s, 449.79 GB/s, 402.46 GB/s, 489.18
GB/s, 491.15 GB/s, 491.20 GB/s, 384.36 GB/s, 509.79 GB/s]
- [515.12 GB/s, 496.21 GB/s, 517.52 GB/s, 540.00 GB/s, 501.82 GB/s, 507.84
GB/s, 496.71 GB/s, 479.42 GB/s, 559.65 GB/s, 519.55 GB/s]
- [584.86 GB/s, 580.10 GB/s, 583.34 GB/s, 612.77 GB/s, 607.15 GB/s, 607.89
GB/s, 589.85 GB/s, 609.59 GB/s, 592.86 GB/s, 568.07 GB/s]
- [719.71 GB/s, 660.98 GB/s, 675.88 GB/s, 679.51 GB/s, 696.97 GB/s, 635.23
GB/s, 644.06 GB/s, 694.74 GB/s, 654.01 GB/s, 656.57 GB/s]
load:
- [84.04 GB/s, 84.06 GB/s, 84.06 GB/s, 84.04 GB/s, 84.05 GB/s, 84.05 GB/s,
84.07 GB/s, 84.04 GB/s, 84.05 GB/s, 84.06 GB/s]
- [168.09 GB/s, 168.12 GB/s, 168.06 GB/s, 168.11 GB/s, 168.12 GB/s, 168.13
GB/s, 168.13 GB/s, 168.12 GB/s, 168.10 GB/s, 168.13 GB/s]
- [252.16 GB/s, 252.21 GB/s, 252.07 GB/s, 252.07 GB/s, 252.18 GB/s, 252.16
GB/s, 252.21 GB/s, 252.20 GB/s, 252.20 GB/s, 252.17 GB/s]
- [335.94 GB/s, 336.03 GB/s, 335.99 GB/s, 336.04 GB/s, 336.00 GB/s, 335.98
GB/s, 335.97 GB/s, 335.89 GB/s, 335.99 GB/s, 336.03 GB/s]
- [420.30 GB/s, 420.18 GB/s, 420.30 GB/s, 420.33 GB/s, 420.25 GB/s, 420.28
GB/s, 420.31 GB/s, 420.31 GB/s, 420.34 GB/s, 420.33 GB/s]
- [503.98 GB/s, 503.99 GB/s, 503.97 GB/s, 503.98 GB/s, 504.02 GB/s, 503.99
GB/s, 503.92 GB/s, 503.98 GB/s, 503.94 GB/s, 503.97 GB/s]
- [587.93 GB/s, 588.01 GB/s, 588.04 GB/s, 587.94 GB/s, 587.97 GB/s, 588.01
GB/s, 588.00 GB/s, 587.92 GB/s, 588.04 GB/s, 588.02 GB/s]
- [668.21 GB/s, 668.22 GB/s, 668.29 GB/s, 668.24 GB/s, 668.27 GB/s, 668.37
GB/s, 668.28 GB/s, 668.14 GB/s, 668.19 GB/s, 668.19 GB/s]
triad:
- [100.00 GB/s, 99.71 GB/s, 99.74 GB/s, 100.24 GB/s, 99.72 GB/s, 99.62 GB/s,
99.54 GB/s, 99.61 GB/s, 99.72 GB/s, 99.71 GB/s]
- [208.08 GB/s, 210.33 GB/s, 211.57 GB/s, 208.34 GB/s, 210.03 GB/s, 209.16
GB/s, 210.21 GB/s, 209.48 GB/s, 210.03 GB/s, 208.80 GB/s]
- [311.43 GB/s, 311.08 GB/s, 311.41 GB/s, 311.10 GB/s, 313.13 GB/s, 314.53
GB/s, 311.59 GB/s, 311.80 GB/s, 311.57 GB/s, 311.89 GB/s]
- [391.65 GB/s, 392.34 GB/s, 391.84 GB/s, 392.07 GB/s, 391.96 GB/s, 392.73
GB/s, 391.66 GB/s, 391.83 GB/s, 392.09 GB/s, 391.88 GB/s]
- [504.20 GB/s, 506.77 GB/s, 503.22 GB/s, 506.74 GB/s, 502.78 GB/s, 506.15
GB/s, 506.87 GB/s, 502.85 GB/s, 505.82 GB/s, 506.57 GB/s]
- [587.75 GB/s, 589.51 GB/s, 588.01 GB/s, 587.29 GB/s, 588.04 GB/s, 587.92
GB/s, 588.08 GB/s, 587.94 GB/s, 587.82 GB/s, 587.55 GB/s]
- [686.03 GB/s, 685.97 GB/s, 685.01 GB/s, 685.88 GB/s, 685.61 GB/s, 687.12
GB/s, 684.97 GB/s, 686.09 GB/s, 685.81 GB/s, 687.28 GB/s]
- [782.05 GB/s, 781.73 GB/s, 781.13 GB/s, 781.87 GB/s, 782.17 GB/s, 781.24
GB/s, 781.82 GB/s, 781.92 GB/s, 781.90 GB/s, 781.66 GB/s]
update:
- [84.76 GB/s, 84.76 GB/s, 84.77 GB/s, 84.75 GB/s, 84.75 GB/s, 84.75 GB/s,
84.75 GB/s, 84.75 GB/s, 84.74 GB/s, 57.21 GB/s]
- [157.73 GB/s, 155.29 GB/s, 147.91 GB/s, 160.10 GB/s, 156.33 GB/s, 158.06
GB/s, 159.23 GB/s, 156.16 GB/s, 155.30 GB/s, 159.15 GB/s]
- [232.07 GB/s, 230.40 GB/s, 234.05 GB/s, 232.69 GB/s, 215.80 GB/s, 232.76
GB/s, 236.01 GB/s, 237.12 GB/s, 234.66 GB/s, 234.86 GB/s]
- [303.60 GB/s, 304.21 GB/s, 306.83 GB/s, 309.43 GB/s, 312.69 GB/s, 311.75
GB/s, 301.74 GB/s, 307.54 GB/s, 312.74 GB/s, 312.19 GB/s]
- [386.45 GB/s, 382.41 GB/s, 387.87 GB/s, 392.54 GB/s, 369.42 GB/s, 341.87
GB/s, 352.85 GB/s, 390.87 GB/s, 382.44 GB/s, 383.50 GB/s]
- [459.60 GB/s, 384.27 GB/s, 437.39 GB/s, 459.42 GB/s, 465.53 GB/s, 447.31
GB/s, 440.00 GB/s, 409.94 GB/s, 412.94 GB/s, 446.74 GB/s]
- [489.85 GB/s, 489.35 GB/s, 435.92 GB/s, 492.39 GB/s, 446.44 GB/s, 501.71
GB/s, 516.02 GB/s, 478.87 GB/s, 494.52 GB/s, 493.04 GB/s]
- [521.08 GB/s, 553.73 GB/s, 541.34 GB/s, 527.75 GB/s, 554.87 GB/s, 536.30
GB/s, 540.66 GB/s, 551.02 GB/s, 567.27 GB/s, 565.31 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
147.84 kB, 168.96 kB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [80.41 GB/s, 160.83 GB/s, 240.43 GB/s, 320.63 GB/s, 401.66 GB/s, 454.32
GB/s, 539.77 GB/s, 628.51 GB/s]
daxpy: [95.87 GB/s, 187.75 GB/s, 270.68 GB/s, 371.80 GB/s, 454.05 GB/s,
503.46 GB/s, 606.85 GB/s, 689.34 GB/s]
load: [82.30 GB/s, 164.06 GB/s, 244.78 GB/s, 326.21 GB/s, 408.56 GB/s, 490.13
GB/s, 569.95 GB/s, 651.79 GB/s]
triad: [93.22 GB/s, 186.75 GB/s, 288.55 GB/s, 340.91 GB/s, 442.20 GB/s,
534.62 GB/s, 597.98 GB/s, 707.54 GB/s]
update: [83.25 GB/s, 166.04 GB/s, 248.21 GB/s, 330.58 GB/s, 414.71 GB/s,
496.97 GB/s, 578.67 GB/s, 656.56 GB/s]
size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
21.12 kB, 21.12 kB]
size per thread: [10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56
kB, 10.56 kB, 10.56 kB]
stats:
copy:
- [80.37 GB/s, 79.07 GB/s, 80.39 GB/s, 80.39 GB/s, 80.41 GB/s, 80.29 GB/s,
80.36 GB/s, 79.05 GB/s, 77.87 GB/s, 80.37 GB/s]
- [160.76 GB/s, 160.63 GB/s, 160.76 GB/s, 160.71 GB/s, 160.80 GB/s, 160.74
GB/s, 160.83 GB/s, 160.69 GB/s, 160.79 GB/s, 160.78 GB/s]
- [240.43 GB/s, 240.20 GB/s, 240.36 GB/s, 240.37 GB/s, 237.17 GB/s, 240.39
GB/s, 240.14 GB/s, 240.24 GB/s, 240.26 GB/s, 240.10 GB/s]
- [320.46 GB/s, 320.47 GB/s, 320.63 GB/s, 320.52 GB/s, 320.40 GB/s, 320.40
GB/s, 320.51 GB/s, 320.46 GB/s, 319.72 GB/s, 320.44 GB/s]
- [401.40 GB/s, 399.28 GB/s, 401.66 GB/s, 401.53 GB/s, 401.52 GB/s, 401.55
GB/s, 401.60 GB/s, 401.47 GB/s, 401.47 GB/s, 401.35 GB/s]
- [447.24 GB/s, 453.65 GB/s, 453.54 GB/s, 453.86 GB/s, 453.82 GB/s, 453.62
GB/s, 453.48 GB/s, 454.32 GB/s, 453.86 GB/s, 446.79 GB/s]
- [538.79 GB/s, 538.47 GB/s, 539.02 GB/s, 538.25 GB/s, 538.72 GB/s, 538.89
GB/s, 539.37 GB/s, 539.41 GB/s, 539.77 GB/s, 538.49 GB/s]
- [628.14 GB/s, 618.54 GB/s, 628.12 GB/s, 623.90 GB/s, 628.27 GB/s, 623.78
GB/s, 618.17 GB/s, 623.43 GB/s, 628.51 GB/s, 628.43 GB/s]
daxpy:
- [95.77 GB/s, 93.25 GB/s, 92.87 GB/s, 95.87 GB/s, 95.84 GB/s, 95.81 GB/s,
95.80 GB/s, 94.99 GB/s, 95.81 GB/s, 95.86 GB/s]
- [184.53 GB/s, 186.60 GB/s, 183.99 GB/s, 187.48 GB/s, 187.75 GB/s, 181.53
GB/s, 183.82 GB/s, 187.75 GB/s, 184.13 GB/s, 180.61 GB/s]
- [258.46 GB/s, 270.13 GB/s, 264.76 GB/s, 262.23 GB/s, 265.05 GB/s, 267.25
GB/s, 270.68 GB/s, 268.08 GB/s, 266.20 GB/s, 265.66 GB/s]
- [367.99 GB/s, 367.15 GB/s, 361.68 GB/s, 364.86 GB/s, 368.76 GB/s, 363.27
GB/s, 364.95 GB/s, 366.97 GB/s, 371.80 GB/s, 366.55 GB/s]
- [441.95 GB/s, 442.77 GB/s, 444.97 GB/s, 454.05 GB/s, 441.02 GB/s, 445.96
GB/s, 442.49 GB/s, 440.23 GB/s, 449.29 GB/s, 452.66 GB/s]
- [501.31 GB/s, 489.91 GB/s, 495.43 GB/s, 503.39 GB/s, 488.03 GB/s, 497.71
GB/s, 503.46 GB/s, 496.85 GB/s, 497.38 GB/s, 468.90 GB/s]
- [604.57 GB/s, 580.51 GB/s, 587.67 GB/s, 594.32 GB/s, 561.32 GB/s, 588.09
GB/s, 606.85 GB/s, 600.91 GB/s, 599.40 GB/s, 598.24 GB/s]
- [646.48 GB/s, 655.06 GB/s, 684.70 GB/s, 653.61 GB/s, 671.61 GB/s, 689.34
GB/s, 673.74 GB/s, 685.49 GB/s, 681.48 GB/s, 683.23 GB/s]
load:
- [82.19 GB/s, 82.08 GB/s, 82.22 GB/s, 82.10 GB/s, 82.14 GB/s, 82.17 GB/s,
82.22 GB/s, 82.28 GB/s, 82.30 GB/s, 81.98 GB/s]
- [163.22 GB/s, 163.43 GB/s, 164.06 GB/s, 164.03 GB/s, 163.19 GB/s, 163.83
GB/s, 163.29 GB/s, 163.88 GB/s, 163.83 GB/s, 163.11 GB/s]
- [244.32 GB/s, 244.47 GB/s, 244.65 GB/s, 244.29 GB/s, 243.96 GB/s, 244.50
GB/s, 244.78 GB/s, 244.52 GB/s, 244.48 GB/s, 244.72 GB/s]
- [325.18 GB/s, 326.21 GB/s, 325.49 GB/s, 325.86 GB/s, 325.73 GB/s, 325.72
GB/s, 326.00 GB/s, 325.41 GB/s, 325.63 GB/s, 325.82 GB/s]
- [407.81 GB/s, 407.96 GB/s, 407.59 GB/s, 408.56 GB/s, 407.64 GB/s, 407.61
GB/s, 408.09 GB/s, 407.95 GB/s, 408.30 GB/s, 408.32 GB/s]
- [488.65 GB/s, 489.73 GB/s, 489.38 GB/s, 489.81 GB/s, 490.13 GB/s, 489.31
GB/s, 488.74 GB/s, 489.38 GB/s, 488.17 GB/s, 489.51 GB/s]
- [569.95 GB/s, 567.21 GB/s, 566.08 GB/s, 567.88 GB/s, 567.69 GB/s, 569.58
GB/s, 568.61 GB/s, 568.35 GB/s, 569.70 GB/s, 568.87 GB/s]
- [650.43 GB/s, 651.58 GB/s, 650.86 GB/s, 651.34 GB/s, 651.04 GB/s, 651.79
GB/s, 650.28 GB/s, 650.31 GB/s, 650.81 GB/s, 651.09 GB/s]
triad:
- [93.22 GB/s, 90.73 GB/s, 92.48 GB/s, 92.53 GB/s, 92.37 GB/s, 92.50 GB/s,
92.48 GB/s, 90.28 GB/s, 92.35 GB/s, 92.51 GB/s]
- [186.75 GB/s, 184.51 GB/s, 184.17 GB/s, 186.66 GB/s, 186.43 GB/s, 184.59
GB/s, 186.71 GB/s, 186.30 GB/s, 186.64 GB/s, 186.12 GB/s]
- [287.77 GB/s, 288.55 GB/s, 287.76 GB/s, 287.76 GB/s, 288.19 GB/s, 287.70
GB/s, 287.42 GB/s, 288.12 GB/s, 287.66 GB/s, 288.01 GB/s]
- [339.82 GB/s, 338.95 GB/s, 340.11 GB/s, 340.11 GB/s, 340.25 GB/s, 340.20
GB/s, 339.90 GB/s, 340.22 GB/s, 340.91 GB/s, 340.01 GB/s]
- [440.41 GB/s, 440.65 GB/s, 441.59 GB/s, 442.20 GB/s, 441.67 GB/s, 432.59
GB/s, 440.20 GB/s, 440.81 GB/s, 440.24 GB/s, 441.38 GB/s]
- [534.30 GB/s, 527.60 GB/s, 528.52 GB/s, 509.55 GB/s, 527.68 GB/s, 527.63
GB/s, 533.66 GB/s, 534.62 GB/s, 534.60 GB/s, 534.19 GB/s]
- [595.90 GB/s, 595.94 GB/s, 597.91 GB/s, 580.22 GB/s, 597.98 GB/s, 597.66
GB/s, 596.16 GB/s, 567.03 GB/s, 580.88 GB/s, 578.29 GB/s]
- [703.80 GB/s, 705.57 GB/s, 694.84 GB/s, 682.59 GB/s, 694.37 GB/s, 696.56
GB/s, 704.50 GB/s, 704.95 GB/s, 694.52 GB/s, 707.54 GB/s]
update:
- [83.18 GB/s, 83.24 GB/s, 83.25 GB/s, 83.16 GB/s, 83.22 GB/s, 83.23 GB/s,
83.22 GB/s, 83.21 GB/s, 83.20 GB/s, 83.17 GB/s]
- [165.65 GB/s, 165.76 GB/s, 165.99 GB/s, 166.04 GB/s, 165.49 GB/s, 165.87
GB/s, 165.58 GB/s, 165.96 GB/s, 165.67 GB/s, 165.66 GB/s]
- [247.30 GB/s, 248.14 GB/s, 247.84 GB/s, 247.90 GB/s, 247.77 GB/s, 247.60
GB/s, 248.21 GB/s, 247.95 GB/s, 248.05 GB/s, 247.83 GB/s]
- [330.49 GB/s, 330.07 GB/s, 329.91 GB/s, 329.90 GB/s, 330.58 GB/s, 329.30
GB/s, 329.92 GB/s, 330.03 GB/s, 330.04 GB/s, 330.12 GB/s]
- [413.89 GB/s, 414.04 GB/s, 413.56 GB/s, 414.06 GB/s, 414.15 GB/s, 413.94
GB/s, 414.04 GB/s, 414.71 GB/s, 414.32 GB/s, 413.93 GB/s]
- [496.97 GB/s, 496.80 GB/s, 496.17 GB/s, 495.42 GB/s, 496.17 GB/s, 496.66
GB/s, 495.55 GB/s, 496.27 GB/s, 495.52 GB/s, 496.80 GB/s]
- [564.44 GB/s, 577.86 GB/s, 574.38 GB/s, 571.96 GB/s, 564.76 GB/s, 578.67
GB/s, 565.89 GB/s, 572.49 GB/s, 571.80 GB/s, 572.01 GB/s]
- [647.68 GB/s, 656.56 GB/s, 655.56 GB/s, 644.04 GB/s, 655.30 GB/s, 648.80
GB/s, 654.77 GB/s, 653.58 GB/s, 656.27 GB/s, 653.79 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
147.84 kB, 168.96 kB]
L2:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [36.74 GB/s, 73.65 GB/s, 107.11 GB/s, 141.43 GB/s, 179.70 GB/s, 215.63
GB/s, 247.20 GB/s, 282.42 GB/s]
daxpy: [44.59 GB/s, 88.24 GB/s, 132.21 GB/s, 175.78 GB/s, 219.11 GB/s, 259.95
GB/s, 305.84 GB/s, 346.83 GB/s]
load: [31.46 GB/s, 62.97 GB/s, 93.73 GB/s, 125.46 GB/s, 157.32 GB/s, 183.63
GB/s, 214.02 GB/s, 245.17 GB/s]
triad: [37.79 GB/s, 75.08 GB/s, 111.43 GB/s, 148.90 GB/s, 185.54 GB/s, 223.72
GB/s, 258.53 GB/s, 299.32 GB/s]
update: [48.46 GB/s, 96.10 GB/s, 141.97 GB/s, 189.18 GB/s, 234.73 GB/s,
280.47 GB/s, 330.94 GB/s, 365.43 GB/s]
size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
kB, 168.96 kB, 168.96 kB]
size per thread: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
kB, 168.96 kB, 168.96 kB]
stats:
copy:
- [36.38 GB/s, 36.59 GB/s, 36.18 GB/s, 36.57 GB/s, 36.26 GB/s, 34.61 GB/s,
35.96 GB/s, 35.84 GB/s, 36.74 GB/s, 36.53 GB/s]
- [68.97 GB/s, 70.42 GB/s, 69.88 GB/s, 71.40 GB/s, 69.05 GB/s, 72.46 GB/s,
70.32 GB/s, 73.65 GB/s, 72.14 GB/s, 69.81 GB/s]
- [107.08 GB/s, 103.53 GB/s, 107.11 GB/s, 103.66 GB/s, 103.88 GB/s, 106.48
GB/s, 97.32 GB/s, 105.92 GB/s, 104.16 GB/s, 104.84 GB/s]
- [138.97 GB/s, 136.86 GB/s, 140.88 GB/s, 138.96 GB/s, 140.58 GB/s, 138.51
GB/s, 141.43 GB/s, 139.53 GB/s, 141.20 GB/s, 139.43 GB/s]
- [158.20 GB/s, 171.06 GB/s, 179.70 GB/s, 171.43 GB/s, 174.27 GB/s, 175.01
GB/s, 165.20 GB/s, 170.89 GB/s, 173.01 GB/s, 175.17 GB/s]
- [209.74 GB/s, 204.59 GB/s, 215.27 GB/s, 215.63 GB/s, 210.59 GB/s, 206.94
GB/s, 211.03 GB/s, 201.61 GB/s, 214.45 GB/s, 208.15 GB/s]
- [241.38 GB/s, 246.88 GB/s, 246.90 GB/s, 247.20 GB/s, 235.27 GB/s, 227.39
GB/s, 239.48 GB/s, 244.45 GB/s, 246.68 GB/s, 235.87 GB/s]
- [271.07 GB/s, 282.42 GB/s, 282.38 GB/s, 276.20 GB/s, 269.85 GB/s, 276.96
GB/s, 268.64 GB/s, 269.61 GB/s, 279.68 GB/s, 280.63 GB/s]
daxpy:
- [44.54 GB/s, 44.59 GB/s, 44.50 GB/s, 44.42 GB/s, 44.41 GB/s, 44.06 GB/s,
43.39 GB/s, 44.02 GB/s, 44.34 GB/s, 44.28 GB/s]
- [85.35 GB/s, 87.05 GB/s, 86.47 GB/s, 86.90 GB/s, 86.92 GB/s, 88.24 GB/s,
87.39 GB/s, 87.60 GB/s, 87.55 GB/s, 84.19 GB/s]
- [129.21 GB/s, 130.47 GB/s, 123.29 GB/s, 127.92 GB/s, 132.21 GB/s, 128.37
GB/s, 127.09 GB/s, 128.72 GB/s, 129.34 GB/s, 128.69 GB/s]
- [171.53 GB/s, 169.64 GB/s, 173.92 GB/s, 173.74 GB/s, 168.53 GB/s, 171.54
GB/s, 173.96 GB/s, 175.78 GB/s, 171.29 GB/s, 171.33 GB/s]
- [219.11 GB/s, 208.86 GB/s, 211.66 GB/s, 216.47 GB/s, 212.73 GB/s, 204.90
GB/s, 208.87 GB/s, 215.75 GB/s, 213.61 GB/s, 214.56 GB/s]
- [250.69 GB/s, 241.36 GB/s, 255.22 GB/s, 250.29 GB/s, 253.80 GB/s, 256.34
GB/s, 254.38 GB/s, 259.95 GB/s, 245.69 GB/s, 259.12 GB/s]
- [296.08 GB/s, 301.77 GB/s, 297.40 GB/s, 305.84 GB/s, 288.62 GB/s, 283.76
GB/s, 293.61 GB/s, 291.93 GB/s, 299.74 GB/s, 289.76 GB/s]
- [344.46 GB/s, 334.36 GB/s, 339.31 GB/s, 330.88 GB/s, 343.26 GB/s, 327.28
GB/s, 344.53 GB/s, 346.83 GB/s, 344.29 GB/s, 346.28 GB/s]
load:
- [31.40 GB/s, 31.23 GB/s, 31.29 GB/s, 31.24 GB/s, 31.46 GB/s, 31.20 GB/s,
31.33 GB/s, 30.01 GB/s, 30.08 GB/s, 31.40 GB/s]
- [61.20 GB/s, 60.74 GB/s, 61.93 GB/s, 61.22 GB/s, 61.20 GB/s, 60.03 GB/s,
59.33 GB/s, 59.94 GB/s, 58.54 GB/s, 62.97 GB/s]
- [91.53 GB/s, 93.73 GB/s, 93.05 GB/s, 90.07 GB/s, 91.60 GB/s, 90.11 GB/s,
90.21 GB/s, 90.43 GB/s, 89.15 GB/s, 93.10 GB/s]
- [122.80 GB/s, 116.57 GB/s, 120.68 GB/s, 122.54 GB/s, 122.75 GB/s, 121.79
GB/s, 125.30 GB/s, 125.46 GB/s, 122.28 GB/s, 124.51 GB/s]
- [151.01 GB/s, 151.10 GB/s, 148.68 GB/s, 151.17 GB/s, 147.24 GB/s, 153.65
GB/s, 146.48 GB/s, 150.48 GB/s, 150.74 GB/s, 157.32 GB/s]
- [181.52 GB/s, 173.89 GB/s, 181.58 GB/s, 174.01 GB/s, 176.40 GB/s, 179.73
GB/s, 174.06 GB/s, 181.26 GB/s, 180.57 GB/s, 183.63 GB/s]
- [214.02 GB/s, 205.69 GB/s, 207.64 GB/s, 204.18 GB/s, 208.42 GB/s, 211.39
GB/s, 206.58 GB/s, 204.90 GB/s, 204.75 GB/s, 208.91 GB/s]
- [232.16 GB/s, 233.90 GB/s, 241.32 GB/s, 237.45 GB/s, 235.41 GB/s, 241.17
GB/s, 237.52 GB/s, 245.17 GB/s, 241.17 GB/s, 234.08 GB/s]
triad:
- [37.62 GB/s, 37.54 GB/s, 37.79 GB/s, 37.67 GB/s, 37.76 GB/s, 37.77 GB/s,
37.68 GB/s, 35.83 GB/s, 37.06 GB/s, 37.50 GB/s]
- [72.79 GB/s, 74.76 GB/s, 73.15 GB/s, 74.68 GB/s, 73.88 GB/s, 73.27 GB/s,
75.08 GB/s, 73.48 GB/s, 71.27 GB/s, 72.05 GB/s]
- [106.26 GB/s, 105.22 GB/s, 109.70 GB/s, 109.07 GB/s, 110.84 GB/s, 111.43
GB/s, 106.32 GB/s, 109.73 GB/s, 106.22 GB/s, 107.20 GB/s]
- [142.10 GB/s, 148.90 GB/s, 148.11 GB/s, 144.38 GB/s, 144.77 GB/s, 145.42
GB/s, 147.36 GB/s, 142.94 GB/s, 145.39 GB/s, 139.42 GB/s]
- [182.07 GB/s, 176.75 GB/s, 181.39 GB/s, 183.31 GB/s, 181.87 GB/s, 183.71
GB/s, 180.48 GB/s, 178.11 GB/s, 181.36 GB/s, 185.54 GB/s]
- [219.85 GB/s, 217.02 GB/s, 218.86 GB/s, 217.09 GB/s, 212.24 GB/s, 212.22
GB/s, 219.33 GB/s, 208.81 GB/s, 215.84 GB/s, 223.72 GB/s]
- [258.06 GB/s, 232.27 GB/s, 247.04 GB/s, 240.55 GB/s, 236.11 GB/s, 251.88
GB/s, 258.53 GB/s, 247.32 GB/s, 251.53 GB/s, 245.10 GB/s]
- [273.67 GB/s, 292.81 GB/s, 288.67 GB/s, 289.75 GB/s, 293.98 GB/s, 283.56
GB/s, 295.33 GB/s, 280.11 GB/s, 299.32 GB/s, 285.18 GB/s]
update:
- [47.30 GB/s, 48.33 GB/s, 48.17 GB/s, 47.38 GB/s, 48.16 GB/s, 46.99 GB/s,
48.46 GB/s, 47.51 GB/s, 46.20 GB/s, 48.26 GB/s]
- [92.10 GB/s, 92.30 GB/s, 95.73 GB/s, 95.53 GB/s, 86.95 GB/s, 96.10 GB/s,
94.16 GB/s, 89.72 GB/s, 92.00 GB/s, 93.10 GB/s]
- [137.06 GB/s, 140.40 GB/s, 136.20 GB/s, 139.57 GB/s, 140.69 GB/s, 136.20
GB/s, 141.53 GB/s, 129.76 GB/s, 136.47 GB/s, 141.97 GB/s]
- [184.84 GB/s, 177.96 GB/s, 178.61 GB/s, 179.03 GB/s, 176.59 GB/s, 180.62
GB/s, 182.26 GB/s, 182.27 GB/s, 189.18 GB/s, 185.49 GB/s]
- [232.17 GB/s, 217.86 GB/s, 232.40 GB/s, 223.10 GB/s, 228.52 GB/s, 234.73
GB/s, 232.00 GB/s, 233.14 GB/s, 231.69 GB/s, 225.01 GB/s]
- [276.16 GB/s, 274.80 GB/s, 272.58 GB/s, 272.43 GB/s, 280.47 GB/s, 276.90
GB/s, 264.76 GB/s, 272.47 GB/s, 277.77 GB/s, 271.42 GB/s]
- [330.94 GB/s, 312.06 GB/s, 312.83 GB/s, 312.62 GB/s, 292.44 GB/s, 315.68
GB/s, 316.67 GB/s, 321.25 GB/s, 321.71 GB/s, 315.05 GB/s]
- [362.85 GB/s, 356.49 GB/s, 365.43 GB/s, 332.52 GB/s, 354.30 GB/s, 354.68
GB/s, 335.54 GB/s, 358.54 GB/s, 363.22 GB/s, 360.01 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
1.18 MB, 1.35 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [36.83 GB/s, 72.70 GB/s, 108.11 GB/s, 142.21 GB/s, 178.07 GB/s, 213.30
GB/s, 251.98 GB/s, 283.06 GB/s]
daxpy: [45.34 GB/s, 90.11 GB/s, 134.85 GB/s, 180.06 GB/s, 224.22 GB/s, 268.27
GB/s, 312.15 GB/s, 358.38 GB/s]
load: [33.99 GB/s, 67.65 GB/s, 100.93 GB/s, 134.81 GB/s, 165.89 GB/s, 196.09
GB/s, 233.31 GB/s, 262.05 GB/s]
triad: [38.60 GB/s, 76.58 GB/s, 114.50 GB/s, 150.54 GB/s, 189.60 GB/s, 227.05
GB/s, 263.75 GB/s, 301.02 GB/s]
update: [49.25 GB/s, 97.34 GB/s, 146.81 GB/s, 194.71 GB/s, 239.97 GB/s,
287.14 GB/s, 330.84 GB/s, 384.71 GB/s]
size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
kB, 168.96 kB, 168.96 kB]
size per thread: [84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48
kB, 84.48 kB, 84.48 kB]
stats:
copy:
- [36.83 GB/s, 36.67 GB/s, 34.90 GB/s, 36.44 GB/s, 35.13 GB/s, 35.07 GB/s,
35.53 GB/s, 36.15 GB/s, 35.85 GB/s, 36.23 GB/s]
- [71.52 GB/s, 70.16 GB/s, 70.67 GB/s, 71.20 GB/s, 72.70 GB/s, 70.14 GB/s,
70.53 GB/s, 69.17 GB/s, 71.57 GB/s, 70.22 GB/s]
- [104.39 GB/s, 104.74 GB/s, 103.12 GB/s, 108.11 GB/s, 105.30 GB/s, 102.80
GB/s, 102.90 GB/s, 107.06 GB/s, 103.45 GB/s, 105.45 GB/s]
- [139.02 GB/s, 134.63 GB/s, 140.72 GB/s, 141.32 GB/s, 140.35 GB/s, 141.19
GB/s, 135.44 GB/s, 142.21 GB/s, 140.96 GB/s, 142.05 GB/s]
- [177.86 GB/s, 177.74 GB/s, 177.42 GB/s, 175.35 GB/s, 176.42 GB/s, 173.13
GB/s, 174.32 GB/s, 170.24 GB/s, 178.07 GB/s, 177.88 GB/s]
- [206.27 GB/s, 211.63 GB/s, 209.06 GB/s, 210.54 GB/s, 208.80 GB/s, 209.99
GB/s, 208.77 GB/s, 206.41 GB/s, 213.30 GB/s, 206.39 GB/s]
- [240.18 GB/s, 238.36 GB/s, 244.16 GB/s, 236.26 GB/s, 244.12 GB/s, 238.49
GB/s, 242.23 GB/s, 244.46 GB/s, 251.98 GB/s, 242.55 GB/s]
- [279.77 GB/s, 282.91 GB/s, 278.73 GB/s, 276.91 GB/s, 283.06 GB/s, 273.23
GB/s, 278.33 GB/s, 280.88 GB/s, 277.54 GB/s, 281.83 GB/s]
daxpy:
- [45.32 GB/s, 44.62 GB/s, 45.29 GB/s, 45.18 GB/s, 45.17 GB/s, 45.07 GB/s,
44.69 GB/s, 45.17 GB/s, 45.11 GB/s, 45.34 GB/s]
- [89.94 GB/s, 89.97 GB/s, 89.37 GB/s, 89.90 GB/s, 88.37 GB/s, 89.13 GB/s,
90.11 GB/s, 89.67 GB/s, 89.90 GB/s, 89.93 GB/s]
- [134.83 GB/s, 134.85 GB/s, 132.02 GB/s, 134.33 GB/s, 133.82 GB/s, 132.39
GB/s, 131.67 GB/s, 134.62 GB/s, 132.71 GB/s, 131.67 GB/s]
- [175.52 GB/s, 173.36 GB/s, 176.83 GB/s, 177.98 GB/s, 175.73 GB/s, 173.42
GB/s, 180.06 GB/s, 179.55 GB/s, 176.71 GB/s, 175.85 GB/s]
- [222.00 GB/s, 216.86 GB/s, 220.17 GB/s, 218.14 GB/s, 220.60 GB/s, 219.43
GB/s, 220.58 GB/s, 224.22 GB/s, 220.89 GB/s, 222.28 GB/s]
- [258.75 GB/s, 262.88 GB/s, 261.77 GB/s, 268.27 GB/s, 263.66 GB/s, 262.59
GB/s, 266.54 GB/s, 261.67 GB/s, 262.80 GB/s, 263.72 GB/s]
- [298.65 GB/s, 312.15 GB/s, 308.52 GB/s, 304.22 GB/s, 301.87 GB/s, 305.53
GB/s, 309.84 GB/s, 310.67 GB/s, 310.49 GB/s, 311.99 GB/s]
- [347.55 GB/s, 350.67 GB/s, 348.93 GB/s, 358.38 GB/s, 352.35 GB/s, 352.05
GB/s, 353.82 GB/s, 356.00 GB/s, 348.07 GB/s, 349.87 GB/s]
load:
- [33.99 GB/s, 32.54 GB/s, 32.94 GB/s, 33.17 GB/s, 33.83 GB/s, 31.55 GB/s,
31.91 GB/s, 33.86 GB/s, 33.93 GB/s, 33.75 GB/s]
- [66.22 GB/s, 64.94 GB/s, 67.64 GB/s, 67.52 GB/s, 65.01 GB/s, 67.21 GB/s,
66.07 GB/s, 66.43 GB/s, 67.65 GB/s, 64.84 GB/s]
- [98.58 GB/s, 97.97 GB/s, 98.39 GB/s, 98.50 GB/s, 98.77 GB/s, 97.84 GB/s,
99.58 GB/s, 100.93 GB/s, 100.50 GB/s, 99.94 GB/s]
- [130.23 GB/s, 131.10 GB/s, 131.04 GB/s, 127.83 GB/s, 134.81 GB/s, 132.68
GB/s, 131.80 GB/s, 129.42 GB/s, 130.76 GB/s, 126.96 GB/s]
- [164.90 GB/s, 165.18 GB/s, 161.19 GB/s, 164.33 GB/s, 162.76 GB/s, 165.04
GB/s, 162.20 GB/s, 165.89 GB/s, 164.34 GB/s, 159.66 GB/s]
- [192.69 GB/s, 193.33 GB/s, 188.88 GB/s, 190.70 GB/s, 194.60 GB/s, 190.92
GB/s, 191.36 GB/s, 192.89 GB/s, 191.85 GB/s, 196.09 GB/s]
- [227.70 GB/s, 223.95 GB/s, 222.79 GB/s, 227.09 GB/s, 227.04 GB/s, 229.45
GB/s, 228.09 GB/s, 227.83 GB/s, 233.31 GB/s, 227.49 GB/s]
- [257.94 GB/s, 261.47 GB/s, 262.05 GB/s, 257.70 GB/s, 259.70 GB/s, 259.23
GB/s, 261.09 GB/s, 253.81 GB/s, 254.21 GB/s, 259.34 GB/s]
triad:
- [38.60 GB/s, 36.68 GB/s, 38.07 GB/s, 38.10 GB/s, 37.89 GB/s, 36.48 GB/s,
38.33 GB/s, 38.12 GB/s, 37.43 GB/s, 37.87 GB/s]
- [76.58 GB/s, 74.97 GB/s, 75.74 GB/s, 76.02 GB/s, 72.66 GB/s, 74.73 GB/s,
76.37 GB/s, 76.18 GB/s, 74.59 GB/s, 75.75 GB/s]
- [111.71 GB/s, 114.50 GB/s, 108.96 GB/s, 111.49 GB/s, 111.56 GB/s, 111.66
GB/s, 113.43 GB/s, 114.37 GB/s, 111.67 GB/s, 108.14 GB/s]
- [146.29 GB/s, 147.84 GB/s, 149.09 GB/s, 149.93 GB/s, 150.54 GB/s, 145.50
GB/s, 145.16 GB/s, 149.47 GB/s, 146.30 GB/s, 149.32 GB/s]
- [186.73 GB/s, 186.46 GB/s, 180.47 GB/s, 187.32 GB/s, 184.34 GB/s, 187.34
GB/s, 186.55 GB/s, 183.81 GB/s, 189.60 GB/s, 188.70 GB/s]
- [224.81 GB/s, 219.69 GB/s, 227.05 GB/s, 224.25 GB/s, 223.36 GB/s, 225.86
GB/s, 216.09 GB/s, 221.98 GB/s, 218.47 GB/s, 226.37 GB/s]
- [263.29 GB/s, 259.28 GB/s, 258.81 GB/s, 258.77 GB/s, 256.56 GB/s, 256.49
GB/s, 256.39 GB/s, 263.75 GB/s, 262.00 GB/s, 261.48 GB/s]
- [299.28 GB/s, 292.80 GB/s, 293.63 GB/s, 297.93 GB/s, 293.02 GB/s, 295.95
GB/s, 287.92 GB/s, 301.02 GB/s, 300.76 GB/s, 297.01 GB/s]
update:
- [49.07 GB/s, 47.17 GB/s, 47.56 GB/s, 49.25 GB/s, 46.44 GB/s, 49.04 GB/s,
48.91 GB/s, 49.20 GB/s, 48.30 GB/s, 48.85 GB/s]
- [96.45 GB/s, 97.11 GB/s, 94.03 GB/s, 92.56 GB/s, 95.39 GB/s, 97.34 GB/s,
96.06 GB/s, 92.25 GB/s, 95.53 GB/s, 97.08 GB/s]
- [137.54 GB/s, 135.13 GB/s, 145.80 GB/s, 141.29 GB/s, 138.99 GB/s, 143.44
GB/s, 146.81 GB/s, 142.94 GB/s, 133.84 GB/s, 146.33 GB/s]
- [190.64 GB/s, 185.02 GB/s, 194.24 GB/s, 187.48 GB/s, 194.52 GB/s, 188.51
GB/s, 189.17 GB/s, 194.71 GB/s, 194.37 GB/s, 190.83 GB/s]
- [239.97 GB/s, 219.74 GB/s, 233.72 GB/s, 234.38 GB/s, 235.78 GB/s, 235.11
GB/s, 235.62 GB/s, 226.09 GB/s, 235.93 GB/s, 230.51 GB/s]
- [280.16 GB/s, 275.22 GB/s, 260.15 GB/s, 286.01 GB/s, 280.61 GB/s, 287.14
GB/s, 283.75 GB/s, 275.23 GB/s, 283.71 GB/s, 285.38 GB/s]
- [311.15 GB/s, 318.00 GB/s, 325.21 GB/s, 328.34 GB/s, 318.09 GB/s, 328.66
GB/s, 329.69 GB/s, 316.97 GB/s, 328.51 GB/s, 330.84 GB/s]
- [374.41 GB/s, 369.73 GB/s, 358.15 GB/s, 375.54 GB/s, 384.71 GB/s, 357.66
GB/s, 369.71 GB/s, 375.35 GB/s, 370.25 GB/s, 364.01 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
1.18 MB, 1.35 MB]
L3:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [21.93 GB/s, 43.10 GB/s, 65.38 GB/s, 85.69 GB/s, 105.64 GB/s, 127.34
GB/s, 148.22 GB/s, 171.52 GB/s]
daxpy: [30.98 GB/s, 62.27 GB/s, 93.13 GB/s, 123.27 GB/s, 153.64 GB/s, 185.97
GB/s, 216.67 GB/s, 247.41 GB/s]
load: [23.47 GB/s, 46.84 GB/s, 69.74 GB/s, 92.76 GB/s, 115.37 GB/s, 139.23
GB/s, 163.12 GB/s, 186.65 GB/s]
triad: [24.72 GB/s, 49.11 GB/s, 72.42 GB/s, 95.36 GB/s, 119.46 GB/s, 144.60
GB/s, 168.66 GB/s, 189.45 GB/s]
update: [31.39 GB/s, 62.11 GB/s, 91.95 GB/s, 122.24 GB/s, 151.40 GB/s, 182.28
GB/s, 216.07 GB/s, 239.92 GB/s]
size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
MB, 1.65 MB]
size per thread: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
MB, 1.65 MB]
stats:
copy:
- [21.64 GB/s, 20.85 GB/s, 20.56 GB/s, 21.69 GB/s, 21.06 GB/s, 21.46 GB/s,
21.93 GB/s, 21.73 GB/s, 21.83 GB/s, 21.69 GB/s]
- [42.86 GB/s, 42.70 GB/s, 42.72 GB/s, 38.47 GB/s, 42.82 GB/s, 43.10 GB/s,
42.66 GB/s, 42.44 GB/s, 42.61 GB/s, 42.48 GB/s]
- [64.95 GB/s, 64.34 GB/s, 63.93 GB/s, 65.38 GB/s, 64.36 GB/s, 63.60 GB/s,
62.65 GB/s, 63.66 GB/s, 63.51 GB/s, 63.75 GB/s]
- [84.07 GB/s, 83.97 GB/s, 83.34 GB/s, 83.91 GB/s, 81.16 GB/s, 85.69 GB/s,
85.40 GB/s, 85.37 GB/s, 85.42 GB/s, 84.48 GB/s]
- [102.83 GB/s, 104.24 GB/s, 105.42 GB/s, 103.68 GB/s, 105.22 GB/s, 105.64
GB/s, 103.15 GB/s, 102.02 GB/s, 100.60 GB/s, 105.09 GB/s]
- [125.46 GB/s, 122.23 GB/s, 123.56 GB/s, 124.59 GB/s, 127.03 GB/s, 125.39
GB/s, 124.50 GB/s, 127.02 GB/s, 126.95 GB/s, 127.34 GB/s]
- [147.99 GB/s, 146.65 GB/s, 139.23 GB/s, 147.69 GB/s, 146.42 GB/s, 145.65
GB/s, 148.22 GB/s, 143.77 GB/s, 147.96 GB/s, 147.70 GB/s]
- [168.36 GB/s, 168.24 GB/s, 164.99 GB/s, 165.32 GB/s, 167.08 GB/s, 165.98
GB/s, 165.39 GB/s, 165.84 GB/s, 166.15 GB/s, 171.52 GB/s]
daxpy:
- [30.92 GB/s, 30.74 GB/s, 30.87 GB/s, 30.98 GB/s, 30.45 GB/s, 29.62 GB/s,
29.54 GB/s, 30.04 GB/s, 30.94 GB/s, 30.93 GB/s]
- [61.96 GB/s, 61.38 GB/s, 61.27 GB/s, 62.27 GB/s, 61.36 GB/s, 61.27 GB/s,
62.06 GB/s, 60.01 GB/s, 61.49 GB/s, 62.16 GB/s]
- [92.26 GB/s, 93.06 GB/s, 88.45 GB/s, 92.18 GB/s, 93.13 GB/s, 92.11 GB/s,
92.28 GB/s, 92.28 GB/s, 93.03 GB/s, 92.78 GB/s]
- [123.22 GB/s, 123.06 GB/s, 123.27 GB/s, 119.42 GB/s, 122.94 GB/s, 122.54
GB/s, 123.24 GB/s, 115.90 GB/s, 121.65 GB/s, 122.47 GB/s]
- [151.70 GB/s, 145.65 GB/s, 149.53 GB/s, 152.52 GB/s, 153.64 GB/s, 152.93
GB/s, 152.81 GB/s, 153.01 GB/s, 153.04 GB/s, 152.06 GB/s]
- [184.04 GB/s, 171.51 GB/s, 184.83 GB/s, 184.09 GB/s, 185.97 GB/s, 183.75
GB/s, 184.66 GB/s, 182.54 GB/s, 184.39 GB/s, 184.40 GB/s]
- [198.70 GB/s, 216.51 GB/s, 216.17 GB/s, 203.10 GB/s, 211.40 GB/s, 215.04
GB/s, 215.48 GB/s, 216.03 GB/s, 216.24 GB/s, 216.67 GB/s]
- [246.02 GB/s, 247.35 GB/s, 245.00 GB/s, 244.65 GB/s, 229.12 GB/s, 243.37
GB/s, 247.22 GB/s, 247.41 GB/s, 246.03 GB/s, 244.83 GB/s]
load:
- [23.08 GB/s, 23.38 GB/s, 22.88 GB/s, 23.43 GB/s, 23.05 GB/s, 23.23 GB/s,
22.97 GB/s, 22.39 GB/s, 23.47 GB/s, 23.33 GB/s]
- [46.39 GB/s, 46.40 GB/s, 46.45 GB/s, 46.36 GB/s, 46.69 GB/s, 46.62 GB/s,
46.84 GB/s, 45.98 GB/s, 46.73 GB/s, 46.80 GB/s]
- [69.18 GB/s, 68.61 GB/s, 69.74 GB/s, 69.34 GB/s, 68.39 GB/s, 69.73 GB/s,
67.76 GB/s, 69.65 GB/s, 69.70 GB/s, 69.16 GB/s]
- [92.29 GB/s, 91.67 GB/s, 92.76 GB/s, 90.78 GB/s, 92.76 GB/s, 90.76 GB/s,
91.58 GB/s, 91.60 GB/s, 91.03 GB/s, 92.72 GB/s]
- [114.04 GB/s, 113.82 GB/s, 112.26 GB/s, 112.65 GB/s, 114.09 GB/s, 113.81
GB/s, 113.72 GB/s, 114.70 GB/s, 115.37 GB/s, 112.57 GB/s]
- [136.42 GB/s, 135.83 GB/s, 134.93 GB/s, 135.43 GB/s, 135.94 GB/s, 139.23
GB/s, 137.52 GB/s, 137.59 GB/s, 135.97 GB/s, 136.96 GB/s]
- [157.88 GB/s, 163.12 GB/s, 159.53 GB/s, 160.16 GB/s, 162.18 GB/s, 159.58
GB/s, 161.55 GB/s, 159.81 GB/s, 162.97 GB/s, 163.10 GB/s]
- [183.41 GB/s, 181.86 GB/s, 183.55 GB/s, 183.38 GB/s, 181.66 GB/s, 186.65
GB/s, 179.62 GB/s, 174.70 GB/s, 180.10 GB/s, 181.49 GB/s]
triad:
- [24.72 GB/s, 23.66 GB/s, 23.58 GB/s, 23.75 GB/s, 23.62 GB/s, 24.37 GB/s,
24.44 GB/s, 23.57 GB/s, 23.30 GB/s, 23.57 GB/s]
- [49.11 GB/s, 46.87 GB/s, 47.13 GB/s, 46.83 GB/s, 46.58 GB/s, 46.73 GB/s,
46.32 GB/s, 47.22 GB/s, 46.79 GB/s, 48.73 GB/s]
- [72.29 GB/s, 69.87 GB/s, 70.57 GB/s, 68.89 GB/s, 68.56 GB/s, 69.02 GB/s,
72.42 GB/s, 69.37 GB/s, 72.34 GB/s, 69.44 GB/s]
- [94.95 GB/s, 94.67 GB/s, 91.05 GB/s, 90.46 GB/s, 95.36 GB/s, 91.63 GB/s,
94.06 GB/s, 95.30 GB/s, 93.99 GB/s, 94.71 GB/s]
- [119.32 GB/s, 117.99 GB/s, 119.46 GB/s, 117.28 GB/s, 118.97 GB/s, 115.67
GB/s, 116.64 GB/s, 117.99 GB/s, 119.02 GB/s, 117.75 GB/s]
- [138.63 GB/s, 144.53 GB/s, 144.60 GB/s, 135.72 GB/s, 141.86 GB/s, 139.64
GB/s, 142.95 GB/s, 140.89 GB/s, 142.10 GB/s, 143.97 GB/s]
- [168.66 GB/s, 166.77 GB/s, 157.10 GB/s, 164.75 GB/s, 164.00 GB/s, 164.38
GB/s, 163.94 GB/s, 158.58 GB/s, 165.60 GB/s, 164.39 GB/s]
- [184.53 GB/s, 187.00 GB/s, 186.87 GB/s, 179.43 GB/s, 185.70 GB/s, 187.49
GB/s, 189.45 GB/s, 186.82 GB/s, 188.50 GB/s, 185.96 GB/s]
update:
- [30.60 GB/s, 31.20 GB/s, 30.65 GB/s, 31.39 GB/s, 30.89 GB/s, 30.75 GB/s,
30.58 GB/s, 30.99 GB/s, 30.69 GB/s, 31.34 GB/s]
- [60.99 GB/s, 62.11 GB/s, 61.42 GB/s, 61.55 GB/s, 61.79 GB/s, 61.24 GB/s,
61.37 GB/s, 61.74 GB/s, 61.45 GB/s, 61.58 GB/s]
- [91.11 GB/s, 91.21 GB/s, 91.95 GB/s, 91.19 GB/s, 91.14 GB/s, 91.36 GB/s,
91.30 GB/s, 91.70 GB/s, 90.84 GB/s, 91.09 GB/s]
- [120.90 GB/s, 120.49 GB/s, 121.35 GB/s, 122.24 GB/s, 120.37 GB/s, 119.83
GB/s, 119.32 GB/s, 119.48 GB/s, 119.11 GB/s, 119.76 GB/s]
- [146.72 GB/s, 147.18 GB/s, 147.81 GB/s, 151.40 GB/s, 147.81 GB/s, 146.84
GB/s, 147.51 GB/s, 148.15 GB/s, 146.89 GB/s, 148.41 GB/s]
- [179.93 GB/s, 179.68 GB/s, 182.28 GB/s, 179.65 GB/s, 179.06 GB/s, 182.25
GB/s, 182.03 GB/s, 179.10 GB/s, 178.82 GB/s, 177.84 GB/s]
- [208.84 GB/s, 210.17 GB/s, 210.20 GB/s, 210.81 GB/s, 209.88 GB/s, 211.16
GB/s, 216.07 GB/s, 211.77 GB/s, 208.89 GB/s, 210.47 GB/s]
- [236.56 GB/s, 239.05 GB/s, 237.81 GB/s, 237.20 GB/s, 238.68 GB/s, 237.69
GB/s, 239.05 GB/s, 239.38 GB/s, 239.92 GB/s, 238.63 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
MB, 13.20 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.35 GB/s, 45.63 GB/s, 68.10 GB/s, 89.46 GB/s, 111.10 GB/s, 134.20
GB/s, 154.44 GB/s, 174.89 GB/s]
daxpy: [32.32 GB/s, 64.16 GB/s, 96.12 GB/s, 126.75 GB/s, 156.91 GB/s, 188.57
GB/s, 221.57 GB/s, 251.65 GB/s]
load: [25.14 GB/s, 50.38 GB/s, 75.49 GB/s, 101.06 GB/s, 126.04 GB/s, 151.12
GB/s, 172.57 GB/s, 196.91 GB/s]
triad: [25.15 GB/s, 50.37 GB/s, 75.31 GB/s, 99.12 GB/s, 123.25 GB/s, 150.29
GB/s, 171.60 GB/s, 197.81 GB/s]
update: [32.98 GB/s, 65.60 GB/s, 97.60 GB/s, 130.34 GB/s, 162.76 GB/s, 194.12
GB/s, 229.02 GB/s, 260.35 GB/s]
size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
MB, 1.65 MB]
size per thread: [6.60 MB, 3.30 MB, 2.20 MB, 1.65 MB, 1.32 MB, 1.10 MB, 0.94
MB, 825.00 kB]
stats:
copy:
- [22.79 GB/s, 22.55 GB/s, 22.86 GB/s, 22.74 GB/s, 23.09 GB/s, 22.51 GB/s,
23.35 GB/s, 23.32 GB/s, 23.02 GB/s, 22.75 GB/s]
- [45.32 GB/s, 45.15 GB/s, 45.63 GB/s, 44.84 GB/s, 44.54 GB/s, 44.33 GB/s,
44.68 GB/s, 44.98 GB/s, 44.64 GB/s, 44.75 GB/s]
- [68.10 GB/s, 67.88 GB/s, 67.98 GB/s, 67.32 GB/s, 67.02 GB/s, 67.14 GB/s,
67.71 GB/s, 67.19 GB/s, 63.08 GB/s, 68.04 GB/s]
- [89.46 GB/s, 88.53 GB/s, 88.51 GB/s, 89.13 GB/s, 89.32 GB/s, 84.53 GB/s,
87.51 GB/s, 88.95 GB/s, 88.91 GB/s, 87.62 GB/s]
- [108.72 GB/s, 110.42 GB/s, 106.02 GB/s, 111.08 GB/s, 110.70 GB/s, 111.10
GB/s, 110.24 GB/s, 109.68 GB/s, 109.55 GB/s, 108.86 GB/s]
- [133.21 GB/s, 127.37 GB/s, 132.83 GB/s, 132.67 GB/s, 133.02 GB/s, 132.65
GB/s, 134.20 GB/s, 132.96 GB/s, 118.86 GB/s, 131.20 GB/s]
- [152.95 GB/s, 153.90 GB/s, 153.80 GB/s, 153.22 GB/s, 153.32 GB/s, 142.75
GB/s, 152.99 GB/s, 154.44 GB/s, 154.43 GB/s, 152.24 GB/s]
- [174.89 GB/s, 171.49 GB/s, 157.46 GB/s, 172.90 GB/s, 173.42 GB/s, 171.07
GB/s, 171.82 GB/s, 170.68 GB/s, 172.19 GB/s, 161.38 GB/s]
daxpy:
- [31.88 GB/s, 32.27 GB/s, 31.11 GB/s, 32.20 GB/s, 32.17 GB/s, 32.32 GB/s,
32.20 GB/s, 32.32 GB/s, 30.76 GB/s, 32.03 GB/s]
- [64.16 GB/s, 63.70 GB/s, 64.04 GB/s, 63.55 GB/s, 60.64 GB/s, 64.05 GB/s,
63.56 GB/s, 63.36 GB/s, 63.94 GB/s, 63.86 GB/s]
- [96.12 GB/s, 95.66 GB/s, 95.93 GB/s, 95.93 GB/s, 96.10 GB/s, 95.94 GB/s,
95.78 GB/s, 95.79 GB/s, 95.17 GB/s, 89.44 GB/s]
- [126.04 GB/s, 126.43 GB/s, 126.09 GB/s, 124.90 GB/s, 125.07 GB/s, 125.74
GB/s, 118.86 GB/s, 125.80 GB/s, 125.10 GB/s, 126.75 GB/s]
- [155.92 GB/s, 155.99 GB/s, 156.32 GB/s, 151.54 GB/s, 156.49 GB/s, 156.91
GB/s, 154.92 GB/s, 155.92 GB/s, 156.20 GB/s, 154.49 GB/s]
- [185.57 GB/s, 180.38 GB/s, 187.51 GB/s, 187.10 GB/s, 186.44 GB/s, 187.13
GB/s, 187.31 GB/s, 188.10 GB/s, 187.91 GB/s, 188.57 GB/s]
- [207.55 GB/s, 219.63 GB/s, 219.38 GB/s, 219.81 GB/s, 220.29 GB/s, 219.72
GB/s, 221.05 GB/s, 216.76 GB/s, 221.57 GB/s, 220.75 GB/s]
- [250.81 GB/s, 250.78 GB/s, 251.19 GB/s, 251.28 GB/s, 249.10 GB/s, 250.42
GB/s, 251.65 GB/s, 244.31 GB/s, 250.40 GB/s, 250.19 GB/s]
load:
- [24.84 GB/s, 24.86 GB/s, 25.09 GB/s, 25.04 GB/s, 24.74 GB/s, 24.87 GB/s,
25.01 GB/s, 25.08 GB/s, 25.14 GB/s, 25.00 GB/s]
- [50.03 GB/s, 49.40 GB/s, 50.28 GB/s, 50.08 GB/s, 50.37 GB/s, 49.75 GB/s,
50.01 GB/s, 50.38 GB/s, 49.89 GB/s, 50.24 GB/s]
- [74.37 GB/s, 74.65 GB/s, 74.40 GB/s, 73.45 GB/s, 73.31 GB/s, 73.00 GB/s,
75.49 GB/s, 73.94 GB/s, 74.42 GB/s, 74.80 GB/s]
- [99.51 GB/s, 99.43 GB/s, 98.90 GB/s, 99.83 GB/s, 98.74 GB/s, 100.75 GB/s,
99.33 GB/s, 99.81 GB/s, 100.00 GB/s, 101.06 GB/s]
- [126.04 GB/s, 126.03 GB/s, 124.70 GB/s, 124.86 GB/s, 125.31 GB/s, 124.78
GB/s, 125.99 GB/s, 123.52 GB/s, 124.45 GB/s, 123.01 GB/s]
- [146.95 GB/s, 150.27 GB/s, 151.12 GB/s, 150.93 GB/s, 150.68 GB/s, 149.75
GB/s, 150.67 GB/s, 146.01 GB/s, 148.34 GB/s, 149.15 GB/s]
- [169.40 GB/s, 172.12 GB/s, 172.40 GB/s, 171.99 GB/s, 172.57 GB/s, 171.95
GB/s, 167.06 GB/s, 169.66 GB/s, 168.34 GB/s, 169.45 GB/s]
- [192.68 GB/s, 191.98 GB/s, 192.82 GB/s, 191.84 GB/s, 191.97 GB/s, 196.91
GB/s, 193.36 GB/s, 190.12 GB/s, 192.04 GB/s, 193.93 GB/s]
triad:
- [24.78 GB/s, 25.03 GB/s, 25.07 GB/s, 24.81 GB/s, 24.65 GB/s, 24.80 GB/s,
24.71 GB/s, 25.15 GB/s, 24.70 GB/s, 24.25 GB/s]
- [49.63 GB/s, 48.68 GB/s, 49.73 GB/s, 49.97 GB/s, 50.37 GB/s, 49.89 GB/s,
49.59 GB/s, 49.00 GB/s, 49.96 GB/s, 49.61 GB/s]
- [74.88 GB/s, 74.99 GB/s, 75.31 GB/s, 73.20 GB/s, 74.50 GB/s, 72.88 GB/s,
73.43 GB/s, 73.74 GB/s, 74.59 GB/s, 74.60 GB/s]
- [95.80 GB/s, 97.67 GB/s, 98.93 GB/s, 97.79 GB/s, 98.74 GB/s, 97.74 GB/s,
98.87 GB/s, 99.12 GB/s, 97.90 GB/s, 97.96 GB/s]
- [121.15 GB/s, 120.28 GB/s, 120.66 GB/s, 121.19 GB/s, 121.09 GB/s, 121.68
GB/s, 121.30 GB/s, 123.22 GB/s, 122.51 GB/s, 123.25 GB/s]
- [146.72 GB/s, 146.38 GB/s, 146.25 GB/s, 146.49 GB/s, 146.29 GB/s, 144.30
GB/s, 142.89 GB/s, 150.29 GB/s, 146.37 GB/s, 146.30 GB/s]
- [166.36 GB/s, 168.18 GB/s, 168.79 GB/s, 170.27 GB/s, 169.26 GB/s, 170.98
GB/s, 170.77 GB/s, 171.43 GB/s, 169.53 GB/s, 171.60 GB/s]
- [190.83 GB/s, 197.81 GB/s, 196.29 GB/s, 197.12 GB/s, 196.21 GB/s, 188.40
GB/s, 191.07 GB/s, 195.14 GB/s, 192.48 GB/s, 194.23 GB/s]
update:
- [32.74 GB/s, 32.98 GB/s, 32.73 GB/s, 32.57 GB/s, 32.63 GB/s, 32.41 GB/s,
32.61 GB/s, 32.24 GB/s, 32.52 GB/s, 32.49 GB/s]
- [65.22 GB/s, 65.07 GB/s, 64.65 GB/s, 65.26 GB/s, 63.70 GB/s, 64.19 GB/s,
64.35 GB/s, 64.83 GB/s, 65.60 GB/s, 63.99 GB/s]
- [97.60 GB/s, 96.65 GB/s, 97.50 GB/s, 96.07 GB/s, 97.12 GB/s, 96.41 GB/s,
96.85 GB/s, 96.80 GB/s, 97.10 GB/s, 97.10 GB/s]
- [129.18 GB/s, 127.79 GB/s, 129.50 GB/s, 129.46 GB/s, 128.85 GB/s, 128.69
GB/s, 129.02 GB/s, 130.34 GB/s, 129.92 GB/s, 129.11 GB/s]
- [160.00 GB/s, 161.81 GB/s, 160.37 GB/s, 159.56 GB/s, 160.38 GB/s, 161.91
GB/s, 160.54 GB/s, 161.43 GB/s, 160.59 GB/s, 162.76 GB/s]
- [192.24 GB/s, 193.69 GB/s, 191.11 GB/s, 190.65 GB/s, 193.10 GB/s, 191.30
GB/s, 192.50 GB/s, 193.37 GB/s, 191.98 GB/s, 194.12 GB/s]
- [221.45 GB/s, 229.02 GB/s, 226.33 GB/s, 224.81 GB/s, 225.62 GB/s, 224.79
GB/s, 226.03 GB/s, 227.09 GB/s, 226.46 GB/s, 225.88 GB/s]
- [255.45 GB/s, 256.52 GB/s, 254.06 GB/s, 257.76 GB/s, 256.85 GB/s, 256.27
GB/s, 260.35 GB/s, 259.96 GB/s, 258.40 GB/s, 255.79 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
MB, 13.20 MB]
MEM:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [11.12 GB/s, 20.53 GB/s, 24.86 GB/s, 26.20 GB/s, 26.47 GB/s, 26.35
GB/s, 26.24 GB/s, 26.17 GB/s]
daxpy: [16.10 GB/s, 30.00 GB/s, 36.88 GB/s, 38.86 GB/s, 39.36 GB/s, 39.19
GB/s, 39.02 GB/s, 38.88 GB/s]
load: [12.30 GB/s, 23.50 GB/s, 33.04 GB/s, 40.59 GB/s, 44.03 GB/s, 44.56
GB/s, 44.26 GB/s, 43.77 GB/s]
triad: [12.41 GB/s, 24.13 GB/s, 29.24 GB/s, 30.73 GB/s, 30.68 GB/s, 30.58
GB/s, 30.54 GB/s, 30.63 GB/s]
update: [17.40 GB/s, 31.16 GB/s, 36.80 GB/s, 39.06 GB/s, 39.80 GB/s, 39.77
GB/s, 39.50 GB/s, 39.24 GB/s]
size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
MB, 42.86 MB, 37.50 MB]
size per thread: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
MB, 42.86 MB, 37.50 MB]
stats:
copy:
- [10.83 GB/s, 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 10.82 GB/s, 10.82 GB/s,
10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 11.12 GB/s]
- [20.34 GB/s, 20.38 GB/s, 20.37 GB/s, 20.34 GB/s, 20.41 GB/s, 20.39 GB/s,
20.39 GB/s, 20.39 GB/s, 20.53 GB/s, 20.35 GB/s]
- [24.70 GB/s, 24.76 GB/s, 24.80 GB/s, 24.86 GB/s, 24.75 GB/s, 24.80 GB/s,
24.77 GB/s, 24.82 GB/s, 24.81 GB/s, 24.73 GB/s]
- [26.10 GB/s, 26.16 GB/s, 26.14 GB/s, 26.16 GB/s, 26.10 GB/s, 26.15 GB/s,
26.10 GB/s, 26.15 GB/s, 26.11 GB/s, 26.20 GB/s]
- [26.45 GB/s, 26.44 GB/s, 26.41 GB/s, 26.43 GB/s, 26.45 GB/s, 26.44 GB/s,
26.46 GB/s, 26.47 GB/s, 26.45 GB/s, 26.44 GB/s]
- [26.34 GB/s, 26.30 GB/s, 26.31 GB/s, 26.33 GB/s, 26.26 GB/s, 26.35 GB/s,
26.30 GB/s, 26.30 GB/s, 26.30 GB/s, 26.34 GB/s]
- [26.20 GB/s, 26.24 GB/s, 26.21 GB/s, 26.22 GB/s, 26.22 GB/s, 26.20 GB/s,
26.20 GB/s, 26.23 GB/s, 26.22 GB/s, 26.23 GB/s]
- [26.15 GB/s, 26.17 GB/s, 26.12 GB/s, 26.15 GB/s, 26.15 GB/s, 26.15 GB/s,
26.12 GB/s, 26.14 GB/s, 26.14 GB/s, 26.17 GB/s]
daxpy:
- [15.77 GB/s, 15.77 GB/s, 16.04 GB/s, 15.68 GB/s, 15.72 GB/s, 15.76 GB/s,
15.91 GB/s, 15.77 GB/s, 16.10 GB/s, 16.04 GB/s]
- [29.88 GB/s, 29.80 GB/s, 30.00 GB/s, 29.87 GB/s, 29.87 GB/s, 30.00 GB/s,
29.79 GB/s, 29.80 GB/s, 29.80 GB/s, 29.82 GB/s]
- [36.63 GB/s, 36.73 GB/s, 36.64 GB/s, 36.64 GB/s, 36.81 GB/s, 36.88 GB/s,
36.62 GB/s, 36.65 GB/s, 36.74 GB/s, 36.71 GB/s]
- [38.82 GB/s, 38.83 GB/s, 38.86 GB/s, 38.81 GB/s, 38.81 GB/s, 38.82 GB/s,
38.85 GB/s, 38.80 GB/s, 38.84 GB/s, 38.73 GB/s]
- [39.32 GB/s, 39.30 GB/s, 39.34 GB/s, 39.36 GB/s, 39.28 GB/s, 39.33 GB/s,
39.31 GB/s, 39.25 GB/s, 39.32 GB/s, 39.33 GB/s]
- [39.10 GB/s, 39.12 GB/s, 39.14 GB/s, 39.16 GB/s, 39.17 GB/s, 39.17 GB/s,
39.13 GB/s, 39.15 GB/s, 39.14 GB/s, 39.19 GB/s]
- [39.01 GB/s, 39.01 GB/s, 39.02 GB/s, 39.02 GB/s, 39.00 GB/s, 39.00 GB/s,
38.97 GB/s, 39.02 GB/s, 38.98 GB/s, 39.01 GB/s]
- [38.76 GB/s, 38.86 GB/s, 38.83 GB/s, 38.82 GB/s, 38.87 GB/s, 38.88 GB/s,
38.81 GB/s, 38.83 GB/s, 38.88 GB/s, 38.88 GB/s]
load:
- [11.97 GB/s, 11.96 GB/s, 11.98 GB/s, 11.97 GB/s, 11.96 GB/s, 12.05 GB/s,
12.30 GB/s, 12.18 GB/s, 11.97 GB/s, 11.96 GB/s]
- [22.85 GB/s, 22.85 GB/s, 22.87 GB/s, 22.94 GB/s, 23.50 GB/s, 22.86 GB/s,
22.86 GB/s, 23.25 GB/s, 22.85 GB/s, 22.86 GB/s]
- [33.04 GB/s, 32.43 GB/s, 32.51 GB/s, 32.52 GB/s, 32.52 GB/s, 32.81 GB/s,
32.77 GB/s, 32.54 GB/s, 32.53 GB/s, 32.53 GB/s]
- [39.95 GB/s, 39.94 GB/s, 39.93 GB/s, 40.15 GB/s, 40.59 GB/s, 40.36 GB/s,
40.28 GB/s, 39.93 GB/s, 39.94 GB/s, 39.98 GB/s]
- [43.98 GB/s, 43.86 GB/s, 43.90 GB/s, 43.80 GB/s, 43.83 GB/s, 43.86 GB/s,
44.03 GB/s, 43.94 GB/s, 43.83 GB/s, 43.92 GB/s]
- [44.46 GB/s, 44.34 GB/s, 44.56 GB/s, 44.51 GB/s, 44.32 GB/s, 44.32 GB/s,
44.51 GB/s, 44.48 GB/s, 44.32 GB/s, 44.34 GB/s]
- [44.03 GB/s, 44.26 GB/s, 44.08 GB/s, 44.18 GB/s, 44.10 GB/s, 43.99 GB/s,
44.07 GB/s, 44.06 GB/s, 43.94 GB/s, 43.97 GB/s]
- [43.48 GB/s, 43.77 GB/s, 43.51 GB/s, 43.49 GB/s, 43.47 GB/s, 43.73 GB/s,
43.55 GB/s, 43.68 GB/s, 43.49 GB/s, 43.50 GB/s]
triad:
- [12.11 GB/s, 12.02 GB/s, 12.03 GB/s, 12.10 GB/s, 12.03 GB/s, 12.04 GB/s,
12.05 GB/s, 12.17 GB/s, 12.02 GB/s, 12.41 GB/s]
- [23.43 GB/s, 23.25 GB/s, 23.25 GB/s, 23.36 GB/s, 23.28 GB/s, 23.24 GB/s,
23.61 GB/s, 23.29 GB/s, 23.31 GB/s, 24.13 GB/s]
- [28.92 GB/s, 29.10 GB/s, 29.17 GB/s, 29.04 GB/s, 28.91 GB/s, 29.16 GB/s,
28.82 GB/s, 29.01 GB/s, 29.24 GB/s, 28.88 GB/s]
- [30.65 GB/s, 30.62 GB/s, 30.73 GB/s, 30.59 GB/s, 30.69 GB/s, 30.68 GB/s,
30.59 GB/s, 30.59 GB/s, 30.57 GB/s, 30.67 GB/s]
- [30.53 GB/s, 30.67 GB/s, 30.65 GB/s, 30.53 GB/s, 30.63 GB/s, 30.68 GB/s,
30.50 GB/s, 30.67 GB/s, 30.64 GB/s, 30.67 GB/s]
- [30.45 GB/s, 30.58 GB/s, 30.51 GB/s, 30.49 GB/s, 30.52 GB/s, 30.49 GB/s,
30.56 GB/s, 30.55 GB/s, 30.47 GB/s, 30.47 GB/s]
- [30.51 GB/s, 30.47 GB/s, 30.50 GB/s, 30.47 GB/s, 30.52 GB/s, 30.54 GB/s,
30.54 GB/s, 30.50 GB/s, 30.49 GB/s, 30.50 GB/s]
- [30.58 GB/s, 30.34 GB/s, 30.56 GB/s, 30.54 GB/s, 30.63 GB/s, 30.53 GB/s,
30.59 GB/s, 30.50 GB/s, 30.54 GB/s, 30.47 GB/s]
update:
- [17.33 GB/s, 17.32 GB/s, 17.34 GB/s, 17.35 GB/s, 17.40 GB/s, 17.35 GB/s,
17.36 GB/s, 17.39 GB/s, 17.35 GB/s, 17.35 GB/s]
- [31.12 GB/s, 31.15 GB/s, 31.10 GB/s, 31.16 GB/s, 31.07 GB/s, 31.08 GB/s,
31.09 GB/s, 31.12 GB/s, 31.12 GB/s, 31.08 GB/s]
- [36.80 GB/s, 36.42 GB/s, 35.92 GB/s, 36.39 GB/s, 35.99 GB/s, 35.98 GB/s,
36.37 GB/s, 36.39 GB/s, 36.38 GB/s, 36.44 GB/s]
- [39.03 GB/s, 39.05 GB/s, 39.02 GB/s, 39.06 GB/s, 39.01 GB/s, 39.02 GB/s,
39.02 GB/s, 39.00 GB/s, 39.00 GB/s, 39.00 GB/s]
- [39.76 GB/s, 39.80 GB/s, 39.80 GB/s, 39.78 GB/s, 39.76 GB/s, 39.79 GB/s,
39.79 GB/s, 39.77 GB/s, 39.77 GB/s, 39.71 GB/s]
- [39.71 GB/s, 39.72 GB/s, 39.72 GB/s, 39.66 GB/s, 39.74 GB/s, 39.70 GB/s,
39.76 GB/s, 39.74 GB/s, 39.77 GB/s, 39.74 GB/s]
- [39.50 GB/s, 39.47 GB/s, 39.45 GB/s, 39.43 GB/s, 39.46 GB/s, 39.45 GB/s,
39.45 GB/s, 39.40 GB/s, 39.43 GB/s, 39.47 GB/s]
- [39.21 GB/s, 39.18 GB/s, 39.19 GB/s, 39.19 GB/s, 39.21 GB/s, 39.19 GB/s,
39.18 GB/s, 39.21 GB/s, 39.20 GB/s, 39.24 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
MB, 300.00 MB, 300.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [10.79 GB/s, 20.46 GB/s, 24.69 GB/s, 25.42 GB/s, 25.63 GB/s, 25.45
GB/s, 25.32 GB/s, 25.06 GB/s]
daxpy: [15.97 GB/s, 29.70 GB/s, 35.95 GB/s, 37.55 GB/s, 37.81 GB/s, 37.78
GB/s, 37.64 GB/s, 37.33 GB/s]
load: [13.46 GB/s, 25.84 GB/s, 35.75 GB/s, 40.54 GB/s, 42.38 GB/s, 42.30
GB/s, 41.85 GB/s, 41.19 GB/s]
triad: [12.05 GB/s, 22.53 GB/s, 27.53 GB/s, 29.10 GB/s, 29.68 GB/s, 29.79
GB/s, 29.85 GB/s, 29.64 GB/s]
update: [19.12 GB/s, 33.86 GB/s, 38.51 GB/s, 39.38 GB/s, 39.20 GB/s, 38.80
GB/s, 38.39 GB/s, 38.02 GB/s]
size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
MB, 42.86 MB, 37.50 MB]
size per thread: [150.00 MB, 75.00 MB, 50.00 MB, 37.50 MB, 30.00 MB, 25.00
MB, 21.43 MB, 18.75 MB]
stats:
copy:
- [10.71 GB/s, 10.69 GB/s, 10.71 GB/s, 10.70 GB/s, 10.79 GB/s, 10.58 GB/s,
10.70 GB/s, 10.69 GB/s, 10.69 GB/s, 10.70 GB/s]
- [20.27 GB/s, 20.31 GB/s, 20.27 GB/s, 20.26 GB/s, 20.31 GB/s, 20.26 GB/s,
20.24 GB/s, 20.26 GB/s, 20.26 GB/s, 20.46 GB/s]
- [24.69 GB/s, 24.66 GB/s, 24.64 GB/s, 24.63 GB/s, 24.67 GB/s, 24.64 GB/s,
24.64 GB/s, 24.68 GB/s, 24.61 GB/s, 24.63 GB/s]
- [25.42 GB/s, 25.41 GB/s, 25.40 GB/s, 25.36 GB/s, 25.40 GB/s, 25.39 GB/s,
25.40 GB/s, 25.38 GB/s, 25.41 GB/s, 25.39 GB/s]
- [25.55 GB/s, 25.57 GB/s, 25.58 GB/s, 25.63 GB/s, 25.57 GB/s, 25.57 GB/s,
25.58 GB/s, 25.55 GB/s, 25.57 GB/s, 25.49 GB/s]
- [25.42 GB/s, 25.42 GB/s, 25.41 GB/s, 25.39 GB/s, 25.40 GB/s, 25.43 GB/s,
25.45 GB/s, 25.44 GB/s, 25.43 GB/s, 25.43 GB/s]
- [25.27 GB/s, 25.31 GB/s, 25.28 GB/s, 25.31 GB/s, 25.32 GB/s, 25.31 GB/s,
25.29 GB/s, 25.30 GB/s, 25.25 GB/s, 25.28 GB/s]
- [25.03 GB/s, 25.01 GB/s, 25.01 GB/s, 25.04 GB/s, 25.00 GB/s, 25.03 GB/s,
25.06 GB/s, 25.04 GB/s, 25.04 GB/s, 25.04 GB/s]
daxpy:
- [15.81 GB/s, 15.81 GB/s, 15.97 GB/s, 15.62 GB/s, 15.64 GB/s, 15.83 GB/s,
15.63 GB/s, 15.82 GB/s, 15.81 GB/s, 15.63 GB/s]
- [29.62 GB/s, 29.56 GB/s, 29.61 GB/s, 29.59 GB/s, 29.70 GB/s, 29.61 GB/s,
29.65 GB/s, 29.65 GB/s, 29.58 GB/s, 29.59 GB/s]
- [35.95 GB/s, 35.89 GB/s, 35.92 GB/s, 35.92 GB/s, 35.95 GB/s, 35.90 GB/s,
35.87 GB/s, 35.90 GB/s, 35.92 GB/s, 35.82 GB/s]
- [37.55 GB/s, 37.46 GB/s, 37.52 GB/s, 37.51 GB/s, 37.55 GB/s, 37.51 GB/s,
37.44 GB/s, 37.41 GB/s, 37.50 GB/s, 37.40 GB/s]
- [37.79 GB/s, 37.76 GB/s, 37.80 GB/s, 37.77 GB/s, 37.76 GB/s, 37.81 GB/s,
37.78 GB/s, 37.81 GB/s, 37.79 GB/s, 37.78 GB/s]
- [37.71 GB/s, 37.68 GB/s, 37.68 GB/s, 37.73 GB/s, 37.74 GB/s, 37.66 GB/s,
37.78 GB/s, 37.74 GB/s, 37.71 GB/s, 37.70 GB/s]
- [37.61 GB/s, 37.60 GB/s, 37.61 GB/s, 37.62 GB/s, 37.64 GB/s, 37.61 GB/s,
37.60 GB/s, 37.59 GB/s, 37.63 GB/s, 37.60 GB/s]
- [37.23 GB/s, 37.21 GB/s, 37.26 GB/s, 37.27 GB/s, 37.28 GB/s, 37.33 GB/s,
37.29 GB/s, 37.31 GB/s, 37.26 GB/s, 37.29 GB/s]
load:
- [13.34 GB/s, 13.36 GB/s, 13.35 GB/s, 13.34 GB/s, 13.35 GB/s, 13.38 GB/s,
13.46 GB/s, 13.35 GB/s, 13.35 GB/s, 13.35 GB/s]
- [25.63 GB/s, 25.64 GB/s, 25.84 GB/s, 25.64 GB/s, 25.74 GB/s, 25.63 GB/s,
25.64 GB/s, 25.63 GB/s, 25.64 GB/s, 25.68 GB/s]
- [35.38 GB/s, 35.56 GB/s, 35.50 GB/s, 35.75 GB/s, 35.50 GB/s, 35.39 GB/s,
35.46 GB/s, 35.39 GB/s, 35.75 GB/s, 35.40 GB/s]
- [40.37 GB/s, 40.37 GB/s, 40.49 GB/s, 40.49 GB/s, 40.42 GB/s, 40.37 GB/s,
40.54 GB/s, 40.39 GB/s, 40.37 GB/s, 40.51 GB/s]
- [42.34 GB/s, 42.14 GB/s, 42.26 GB/s, 42.17 GB/s, 42.10 GB/s, 42.13 GB/s,
42.38 GB/s, 42.13 GB/s, 42.21 GB/s, 42.15 GB/s]
- [42.30 GB/s, 42.13 GB/s, 42.20 GB/s, 42.11 GB/s, 42.12 GB/s, 42.12 GB/s,
42.18 GB/s, 42.25 GB/s, 42.19 GB/s, 42.21 GB/s]
- [41.70 GB/s, 41.76 GB/s, 41.85 GB/s, 41.80 GB/s, 41.71 GB/s, 41.71 GB/s,
41.80 GB/s, 41.70 GB/s, 41.76 GB/s, 41.75 GB/s]
- [41.02 GB/s, 41.01 GB/s, 41.17 GB/s, 41.12 GB/s, 41.13 GB/s, 41.15 GB/s,
41.19 GB/s, 41.01 GB/s, 41.10 GB/s, 41.06 GB/s]
triad:
- [11.87 GB/s, 11.89 GB/s, 11.91 GB/s, 11.81 GB/s, 11.83 GB/s, 11.85 GB/s,
11.90 GB/s, 11.80 GB/s, 11.85 GB/s, 12.05 GB/s]
- [22.53 GB/s, 22.47 GB/s, 22.44 GB/s, 22.46 GB/s, 22.43 GB/s, 22.52 GB/s,
22.41 GB/s, 22.52 GB/s, 22.48 GB/s, 22.41 GB/s]
- [27.43 GB/s, 27.42 GB/s, 27.47 GB/s, 27.47 GB/s, 27.52 GB/s, 27.49 GB/s,
27.41 GB/s, 27.42 GB/s, 27.51 GB/s, 27.53 GB/s]
- [29.02 GB/s, 29.03 GB/s, 29.03 GB/s, 29.04 GB/s, 28.89 GB/s, 29.10 GB/s,
29.02 GB/s, 29.05 GB/s, 28.93 GB/s, 29.01 GB/s]
- [29.66 GB/s, 29.68 GB/s, 29.60 GB/s, 29.62 GB/s, 29.60 GB/s, 29.67 GB/s,
29.66 GB/s, 29.62 GB/s, 29.62 GB/s, 29.62 GB/s]
- [29.78 GB/s, 29.76 GB/s, 29.77 GB/s, 29.77 GB/s, 29.75 GB/s, 29.79 GB/s,
29.75 GB/s, 29.77 GB/s, 29.76 GB/s, 29.78 GB/s]
- [29.82 GB/s, 29.85 GB/s, 29.85 GB/s, 29.83 GB/s, 29.82 GB/s, 29.83 GB/s,
29.83 GB/s, 29.81 GB/s, 29.81 GB/s, 29.80 GB/s]
- [29.54 GB/s, 29.63 GB/s, 29.57 GB/s, 29.56 GB/s, 29.55 GB/s, 29.64 GB/s,
29.60 GB/s, 29.53 GB/s, 29.54 GB/s, 29.57 GB/s]
update:
- [18.66 GB/s, 18.67 GB/s, 18.66 GB/s, 19.12 GB/s, 18.67 GB/s, 18.67 GB/s,
18.67 GB/s, 18.67 GB/s, 18.70 GB/s, 18.67 GB/s]
- [33.61 GB/s, 33.34 GB/s, 33.71 GB/s, 33.31 GB/s, 33.34 GB/s, 33.86 GB/s,
33.62 GB/s, 33.35 GB/s, 33.54 GB/s, 33.34 GB/s]
- [38.51 GB/s, 38.46 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s, 38.46 GB/s,
38.41 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s]
- [39.37 GB/s, 39.34 GB/s, 39.36 GB/s, 39.35 GB/s, 39.37 GB/s, 39.38 GB/s,
39.36 GB/s, 39.35 GB/s, 39.31 GB/s, 39.32 GB/s]
- [39.17 GB/s, 39.17 GB/s, 39.16 GB/s, 39.20 GB/s, 39.18 GB/s, 39.17 GB/s,
39.18 GB/s, 39.15 GB/s, 39.20 GB/s, 39.17 GB/s]
- [38.79 GB/s, 38.79 GB/s, 38.80 GB/s, 38.78 GB/s, 38.78 GB/s, 38.75 GB/s,
38.80 GB/s, 38.77 GB/s, 38.78 GB/s, 38.78 GB/s]
- [38.36 GB/s, 38.37 GB/s, 38.37 GB/s, 38.39 GB/s, 38.36 GB/s, 38.37 GB/s,
38.38 GB/s, 38.37 GB/s, 38.35 GB/s, 38.39 GB/s]
- [37.98 GB/s, 37.99 GB/s, 38.02 GB/s, 38.01 GB/s, 38.01 GB/s, 38.00 GB/s,
38.02 GB/s, 38.00 GB/s, 38.02 GB/s, 38.02 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
MB, 300.00 MB, 300.00 MB]
import sympy
import numpy
import pystencils
from pystencils.datahandling import create_data_handling
def test_max():
dh = create_data_handling(domain_size=(10, 10), periodicity=True)
x = dh.add_array('x', values_per_cell=1)
dh.fill("x", 0.0, ghost_layers=True)
y = dh.add_array('y', values_per_cell=1)
dh.fill("y", 1.0, ghost_layers=True)
z = dh.add_array('z', values_per_cell=1)
dh.fill("z", 2.0, ghost_layers=True)
# test sp.Max with one argument
assignment_1 = pystencils.Assignment(x.center, sympy.Max(y.center + 3.3))
ast_1 = pystencils.create_kernel(assignment_1)
kernel_1 = ast_1.compile()
# test sp.Max with two arguments
assignment_2 = pystencils.Assignment(x.center, sympy.Max(0.5, y.center - 1.5))
ast_2 = pystencils.create_kernel(assignment_2)
kernel_2 = ast_2.compile()
# test sp.Max with many arguments
assignment_3 = pystencils.Assignment(x.center, sympy.Max(z.center, 4.5, y.center - 1.5, y.center + z.center))
ast_3 = pystencils.create_kernel(assignment_3)
kernel_3 = ast_3.compile()
dh.run_kernel(kernel_1)
assert numpy.all(dh.cpu_arrays["x"] == 4.3)
dh.run_kernel(kernel_2)
assert numpy.all(dh.cpu_arrays["x"] == 0.5)
dh.run_kernel(kernel_3)
assert numpy.all(dh.cpu_arrays["x"] == 4.5)
def test_min():
dh = create_data_handling(domain_size=(10, 10), periodicity=True)
x = dh.add_array('x', values_per_cell=1)
dh.fill("x", 0.0, ghost_layers=True)
y = dh.add_array('y', values_per_cell=1)
dh.fill("y", 1.0, ghost_layers=True)
z = dh.add_array('z', values_per_cell=1)
dh.fill("z", 2.0, ghost_layers=True)
# test sp.Min with one argument
assignment_1 = pystencils.Assignment(x.center, sympy.Min(y.center + 3.3))
ast_1 = pystencils.create_kernel(assignment_1)
kernel_1 = ast_1.compile()
# test sp.Min with two arguments
assignment_2 = pystencils.Assignment(x.center, sympy.Min(0.5, y.center - 1.5))
ast_2 = pystencils.create_kernel(assignment_2)
kernel_2 = ast_2.compile()
# test sp.Min with many arguments
assignment_3 = pystencils.Assignment(x.center, sympy.Min(z.center, 4.5, y.center - 1.5, y.center + z.center))
ast_3 = pystencils.create_kernel(assignment_3)
kernel_3 = ast_3.compile()
dh.run_kernel(kernel_1)
assert numpy.all(dh.cpu_arrays["x"] == 4.3)
dh.run_kernel(kernel_2)
assert numpy.all(dh.cpu_arrays["x"] == - 0.5)
dh.run_kernel(kernel_3)
assert numpy.all(dh.cpu_arrays["x"] == - 0.5)