Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Stephan Seitz
pystencils
Commits
2d758462
Commit
2d758462
authored
Jul 14, 2020
by
Jan Hönig
Browse files
Merge branch 'Enable_OSACA_usage' into 'master'
Enable osaca usage See merge request
pycodegen/pystencils!165
parents
cfd770cf
0391c91d
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
pystencils/kerncraft_coupling/generate_benchmark.py
View file @
2d758462
import
os
import
subprocess
import
warnings
import
tempfile
from
pathlib
import
Path
from
jinja2
import
Template
from
jinja2
import
Environment
,
PackageLoader
,
StrictUndefined
from
pystencils.astnodes
import
PragmaBlock
from
pystencils.backends.cbackend
import
generate_c
,
get_headers
...
...
@@ -10,116 +12,6 @@ from pystencils.data_types import get_base_type
from
pystencils.include
import
get_pystencils_include_path
from
pystencils.sympyextensions
import
prod
benchmark_template
=
Template
(
"""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e
\\
n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
"""
)
def
generate_benchmark
(
ast
,
likwid
=
False
,
openmp
=
False
,
timing
=
False
):
"""Return C code of a benchmark program for the given kernel.
...
...
@@ -157,7 +49,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
if
len
(
ast
.
body
.
args
)
>
0
and
isinstance
(
ast
.
body
.
args
[
0
],
PragmaBlock
):
ast
.
body
.
args
[
0
].
pragma_line
=
''
args
=
{
jinja_context
=
{
'likwid'
:
likwid
,
'openmp'
:
openmp
,
'kernel_code'
:
generate_c
(
ast
,
dialect
=
'c'
),
...
...
@@ -168,16 +60,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
'includes'
:
includes
,
'timing'
:
timing
,
}
return
benchmark_template
.
render
(
**
args
)
env
=
Environment
(
loader
=
PackageLoader
(
'pystencils.kerncraft_coupling'
),
undefined
=
StrictUndefined
)
return
env
.
get_template
(
'benchmark.c'
).
render
(
**
jinja_context
)
def
run_c_benchmark
(
ast
,
inner_iterations
,
outer_iterations
=
3
):
def
run_c_benchmark
(
ast
,
inner_iterations
,
outer_iterations
=
3
,
path
=
None
):
"""Runs the given kernel with outer loop in C
Args:
ast:
ast:
pystencils ast which is used to compile the benchmark file
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
path: path where the benchmark file is stored. If None a tmp folder is created
Returns:
list of times per iterations for each outer iteration
...
...
@@ -185,26 +81,40 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
import
kerncraft
benchmark_code
=
generate_benchmark
(
ast
,
timing
=
True
)
with
open
(
'bench.c'
,
'w'
)
as
f
:
if
path
is
None
:
path
=
tempfile
.
mkdtemp
()
if
isinstance
(
path
,
str
):
path
=
Path
(
path
)
with
open
(
path
/
'bench.c'
,
'w'
)
as
f
:
f
.
write
(
benchmark_code
)
kerncraft_path
=
os
.
path
.
dirname
(
kerncraft
.
__file__
)
kerncraft_path
=
Path
(
kerncraft
.
__file__
)
.
parent
extra_flags
=
[
'-I'
+
get_pystencils_include_path
(),
'-I'
+
os
.
path
.
join
(
kerncraft_path
,
'headers'
)]
'-I'
+
str
(
kerncraft_path
/
'headers'
)]
compiler_config
=
get_compiler_config
()
compile_cmd
=
[
compiler_config
[
'command'
]]
+
compiler_config
[
'flags'
].
split
()
compile_cmd
+=
[
*
extra_flags
,
os
.
path
.
join
(
kerncraft_path
,
'headers'
,
'timing.c'
)
,
os
.
path
.
join
(
kerncraft_path
,
'headers'
,
'dummy.c'
)
,
'bench.c'
,
'-o'
,
'bench'
,
kerncraft_path
/
'headers'
/
'timing.c'
,
kerncraft_path
/
'headers'
/
'dummy.c'
,
path
/
'bench.c'
,
'-o'
,
path
/
'bench'
,
]
run_compile_step
(
compile_cmd
)
time_pre_estimation_per_iteration
=
float
(
subprocess
.
check_output
([
'./'
/
path
/
'bench'
,
str
(
10
)]))
benchmark_time_limit
=
20
if
benchmark_time_limit
/
time_pre_estimation_per_iteration
<
inner_iterations
:
warn
=
(
f
"A benchmark run with
{
inner_iterations
}
inner_iterations will probably take longer than "
f
"
{
benchmark_time_limit
}
seconds for this kernel"
)
warnings
.
warn
(
warn
)
results
=
[]
for
_
in
range
(
outer_iterations
):
benchmark_time
=
float
(
subprocess
.
check_output
([
'./bench'
,
str
(
inner_iterations
)]))
benchmark_time
=
float
(
subprocess
.
check_output
([
'./
'
/
path
/
'
bench'
,
str
(
inner_iterations
)]))
results
.
append
(
benchmark_time
)
return
results
pystencils/kerncraft_coupling/kerncraft_interface.py
View file @
2d758462
import
warnings
import
fcntl
from
collections
import
defaultdict
from
tempfile
import
TemporaryDirectory
from
typing
import
Optional
import
kerncraft
from
jinja2
import
Environment
,
PackageLoader
,
StrictUndefined
import
sympy
as
sp
from
kerncraft.kerncraft
import
KernelCode
from
kerncraft.machinemodel
import
MachineModel
from
pystencils.astnodes
import
(
KernelFunction
,
LoopOverCoordinate
,
ResolvedFieldAccess
,
SympyAssignment
)
from
pystencils.astnodes
import
(
KernelFunction
,
LoopOverCoordinate
,
ResolvedFieldAccess
,
SympyAssignment
)
from
pystencils.field
import
get_layout_from_strides
from
pystencils.kerncraft_coupling.generate_benchmark
import
generate_benchmark
from
pystencils.sympyextensions
import
count_operations_in_ast
from
pystencils.transformations
import
filtered_tree_iteration
from
pystencils.utils
import
DotDict
from
pystencils.backends.cbackend
import
generate_c
,
get_headers
from
pystencils.cpu.kernelcreation
import
add_openmp
class
PyStencilsKerncraftKernel
(
KernelCode
):
...
...
@@ -34,8 +36,10 @@ class PyStencilsKerncraftKernel(KernelCode):
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structures of array (SoA) or
array of structures (AoS) layout is assumed
debug_print: print debug information
filename: used for caching
"""
kerncraft
.
kernel
.
Kernel
.
__init__
(
self
,
machine
)
super
(
KernelCode
,
self
)
.
__init__
(
machine
=
machine
)
# Initialize state
self
.
asm_block
=
None
...
...
@@ -96,7 +100,7 @@ class PyStencilsKerncraftKernel(KernelCode):
for
field
in
fields_accessed
:
layout
=
get_layout_tuple
(
field
)
permuted_shape
=
list
(
field
.
shape
[
i
]
for
i
in
layout
)
self
.
set_variable
(
field
.
name
,
str
(
field
.
dtype
),
tuple
(
permuted_shape
))
self
.
set_variable
(
field
.
name
,
tuple
([
str
(
field
.
dtype
)
])
,
tuple
(
permuted_shape
))
# Scalars may be safely ignored
# for param in ast.get_parameters():
...
...
@@ -129,24 +133,64 @@ class PyStencilsKerncraftKernel(KernelCode):
print
(
"----------------------------- FLOPS -------------------------------"
)
pprint
(
self
.
_flops
)
def
as_code
(
self
,
type_
=
'iaca'
,
openmp
=
False
,
as_filename
=
False
):
def
get_kernel_header
(
self
,
name
=
'pystencils_kernel'
):
file_name
=
"pystencils_kernel.h"
file_path
=
self
.
get_intermediate_location
(
file_name
,
machine_and_compiler_dependent
=
False
)
lock_mode
,
lock_fp
=
self
.
lock_intermediate
(
file_path
)
if
lock_mode
==
fcntl
.
LOCK_EX
:
function_signature
=
generate_c
(
self
.
kernel_ast
,
dialect
=
'c'
,
signature_only
=
True
)
jinja_context
=
{
'function_signature'
:
function_signature
,
}
env
=
Environment
(
loader
=
PackageLoader
(
'pystencils.kerncraft_coupling'
),
undefined
=
StrictUndefined
)
file_header
=
env
.
get_template
(
'kernel.h'
).
render
(
**
jinja_context
)
with
open
(
file_path
,
'w'
)
as
f
:
f
.
write
(
file_header
)
fcntl
.
flock
(
lock_fp
,
fcntl
.
LOCK_SH
)
# degrade to shared lock
return
file_path
,
lock_fp
def
get_kernel_code
(
self
,
openmp
=
False
,
name
=
'pystencils_kernl'
):
"""
Generate and return compilable source code.
Args:
type_: can be iaca or likwid.
openmp: if true, openmp code will be generated
as_file
name
:
name: kernel
name
"""
code
=
generate_benchmark
(
self
.
kernel_ast
,
likwid
=
type_
==
'likwid'
,
openmp
=
openmp
)
if
as_filename
:
fp
,
already_available
=
self
.
_get_intermediate_file
(
f
'kernel_
{
type_
}
.c'
,
machine_and_compiler_dependent
=
False
)
if
not
already_available
:
fp
.
write
(
code
)
return
fp
.
name
else
:
return
code
filename
=
'pystencils_kernl'
if
openmp
:
filename
+=
'-omp'
filename
+=
'.c'
file_path
=
self
.
get_intermediate_location
(
filename
,
machine_and_compiler_dependent
=
False
)
lock_mode
,
lock_fp
=
self
.
lock_intermediate
(
file_path
)
if
lock_mode
==
fcntl
.
LOCK_EX
:
header_list
=
get_headers
(
self
.
kernel_ast
)
includes
=
"
\n
"
.
join
([
"#include %s"
%
(
include_file
,)
for
include_file
in
header_list
])
if
openmp
:
add_openmp
(
self
.
kernel_ast
)
kernel_code
=
generate_c
(
self
.
kernel_ast
,
dialect
=
'c'
)
jinja_context
=
{
'includes'
:
includes
,
'kernel_code'
:
kernel_code
,
}
env
=
Environment
(
loader
=
PackageLoader
(
'pystencils.kerncraft_coupling'
),
undefined
=
StrictUndefined
)
file_header
=
env
.
get_template
(
'kernel.c'
).
render
(
**
jinja_context
)
with
open
(
file_path
,
'w'
)
as
f
:
f
.
write
(
file_header
)
fcntl
.
flock
(
lock_fp
,
fcntl
.
LOCK_SH
)
# degrade to shared lock
return
file_path
,
lock_fp
class
KerncraftParameters
(
DotDict
):
...
...
@@ -161,6 +205,7 @@ class KerncraftParameters(DotDict):
self
[
'iterations'
]
=
10
self
[
'unit'
]
=
'cy/CL'
self
[
'ignore_warnings'
]
=
True
self
[
'incore_model'
]
=
'OSACA'
# ------------------------------------------- Helper functions ---------------------------------------------------------
...
...
pystencils/kerncraft_coupling/templates/benchmark.c
0 → 100644
View file @
2d758462
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{
includes
}}
{
%-
if
likwid
%
}
#include <likwid.h>
{
%-
endif
%
}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void
dummy
(
void
*
);
void
timing
(
double
*
wcTime
,
double
*
cpuTime
);
extern
int
var_false
;
{{
kernel_code
}}
int
main
(
int
argc
,
char
**
argv
)
{
{
%-
if
likwid
%
}
likwid_markerInit
();
{
%-
endif
%
}
{
%-
for
field_name
,
dataType
,
size
in
fields
%
}
// Initialization {{field_name}}
double
*
{{
field_name
}}
=
(
double
*
)
aligned_malloc
(
sizeof
({{
dataType
}})
*
{{
size
}},
64
);
for
(
unsigned
long
long
i
=
0
;
i
<
{{
size
}};
++
i
)
{{
field_name
}}[
i
]
=
0
.
23
;
if
(
var_false
)
dummy
({{
field_name
}});
{
%-
endfor
%
}
{
%-
for
constantName
,
dataType
in
constants
%
}
// Constant {{constantName}}
{{
dataType
}}
{{
constantName
}};
{{
constantName
}}
=
0
.
23
;
if
(
var_false
)
dummy
(
&
{{
constantName
}});
{
%-
endfor
%
}
{
%-
if
likwid
and
openmp
%
}
#pragma omp parallel
{
likwid_markerRegisterRegion
(
"loop"
);
#pragma omp barrier
{
%-
elif
likwid
%
}
likwid_markerRegisterRegion
(
"loop"
);
{
%-
endif
%
}
for
(
int
warmup
=
1
;
warmup
>=
0
;
--
warmup
)
{
int
repeat
=
2
;
if
(
warmup
==
0
)
{
repeat
=
atoi
(
argv
[
1
]);
{
%-
if
likwid
%
}
likwid_markerStartRegion
(
"loop"
);
{
%-
endif
%
}
}
{
%-
if
timing
%
}
double
wcStartTime
,
cpuStartTime
,
wcEndTime
,
cpuEndTime
;
timing
(
&
wcStartTime
,
&
cpuStartTime
);
{
%-
endif
%
}
for
(;
repeat
>
0
;
--
repeat
)
{
{{
kernelName
}}({{
call_argument_list
}});
// Dummy calls
{
%-
for
field_name
,
dataType
,
size
in
fields
%
}
if
(
var_false
)
dummy
((
void
*
){{
field_name
}});
{
%-
endfor
%
}
{
%-
for
constantName
,
dataType
in
constants
%
}
if
(
var_false
)
dummy
((
void
*
)
&
{{
constantName
}});
{
%-
endfor
%
}
}
{
%-
if
timing
%
}
timing
(
&
wcEndTime
,
&
cpuEndTime
);
if
(
warmup
==
0
)
printf
(
"%e
\n
"
,
(
wcEndTime
-
wcStartTime
)
/
atoi
(
argv
[
1
])
);
{
%-
endif
%
}
}
{
%-
if
likwid
%
}
likwid_markerStopRegion
(
"loop"
);
{
%-
if
openmp
%
}
}
{
%-
endif
%
}
{
%-
endif
%
}
{
%-
if
likwid
%
}
likwid_markerClose
();
{
%-
endif
%
}
}
pystencils/kerncraft_coupling/templates/kernel.c
0 → 100644
View file @
2d758462
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{
includes
}}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void
dummy
(
void
*
);
void
timing
(
double
*
wcTime
,
double
*
cpuTime
);
extern
int
var_false
;
{{
kernel_code
}}
\ No newline at end of file
pystencils/kerncraft_coupling/templates/kernel.h
0 → 100644
View file @
2d758462
#define FUNC_PREFIX
{{
function_signature
}}
\ No newline at end of file
pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
0 → 100644
View file @
2d758462
This diff is collapsed.
Click to expand it.
pystencils_tests/kerncraft_inputs/default_machine_file.yaml
deleted
100644 → 0
View file @
cfd770cf
kerncraft version
:
0.7.3
clock
:
2.7 GHz
cores per socket
:
8
cores per NUMA domain
:
8
NUMA domains per socket
:
1
model type
:
Intel Core SandyBridge EP processor
model name
:
Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
sockets
:
2
threads per core
:
2
cacheline size
:
64 B
compiler
:
!!omap
-
icc
:
-O3 -xAVX -fno-alias -qopenmp
-
clang
:
-O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
-
gcc
:
-O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
micro-architecture
:
SNB
FLOPs per cycle
:
SP
:
{
total
:
16
,
ADD
:
8
,
MUL
:
8
}
DP
:
{
total
:
8
,
ADD
:
4
,
MUL
:
4
}
overlapping model
:
ports
:
[
"
0"
,
"
0DV"
,
"
1"
,
"
2"
,
"
3"
,
"
4"
,
"
5"
]
performance counter metric
:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model
:
ports
:
[
"
2D"
,
"
3D"
]
performance counter metric
:
T_OL + T_L1L2 + T_L2L3 + T_L3MEM
write-allocate
:
True
memory hierarchy
:
-
level
:
L1
cache per group
:
{
'
sets'
:
64
,
'
ways'
:
8
,
'
cl_size'
:
64
,
# 32 kB
'
replacement_policy'
:
'
LRU'
,
'
write_allocate'
:
True
,
'
write_back'
:
True
,
'
load_from'
:
'
L2'
,
'
store_to'
:
'
L2'
}
cores per group
:
1
threads per group
:
2
groups
:
16
performance counter metrics
:
accesses
:
MEM_UOPS_RETIRED_LOADS:PMC[0-3]
misses
:
L1D_REPLACEMENT:PMC[0-3]
evicts
:
L1D_M_EVICT:PMC[0-3]
-
level
:
L2
cache per group
:
{
'
sets'
:
512
,
'
ways'
:
8
,
'
cl_size'
:
64
,
# 256 kB
'
replacement_policy'
:
'
LRU'
,
'
write_allocate'
:
True
,
'
write_back'
:
True
,
'
load_from'
:
'
L3'
,
'
store_to'
:
'
L3'
}
cores per group
:
1
threads per group
:
2
groups
:
16
non-overlap upstream throughput
:
[
32 B/cy
,
'
half-duplex'
]
performance counter metrics
:
accesses
:
L1D_REPLACEMENT:PMC[0-3]
misses
:
L2_LINES_IN_ALL:PMC[0-3]
evicts
:
L2_TRANS_L2_WB:PMC[0-3]
-
level
:
L3
cache per group
:
{
'
sets'
:
20480
,
'
ways'
:
16
,
'
cl_size'
:
64
,
# 20 MB
'
replacement_policy'
:
'
LRU'
,
'
write_allocate'
:
True
,
'
write_back'
:
True
}
cores per group
:
8
threads per group
:
16
groups
:
2
non-overlap upstream throughput
:
[
32 B/cy
,
'
half-duplex'
]
performance counter metrics
:
accesses
:
L2_LINES_IN_ALL:PMC[0-3]
misses
:
(CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01])
evicts
:
(CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01])
-
level
:
MEM
cores per group
:
8
non-overlap upstream throughput
:
[
'
full
socket
memory
bandwidth'
,
'
half-duplex'
]
size per group
:
null
threads per group
:
16
benchmarks
:
kernels
:
copy
:
FLOPs per iteration
:
0
read streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
read+write streams
:
{
bytes
:
0.00 B
,
streams
:
0
}
write streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
daxpy
:
FLOPs per iteration
:
2
read streams
:
{
bytes
:
16.00 B
,
streams
:
2
}
read+write streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
write streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
load
:
FLOPs per iteration
:
0
read streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
read+write streams
:
{
bytes
:
0.00 B
,
streams
:
0
}
write streams
:
{
bytes
:
0.00 B
,
streams
:
0
}
triad
:
FLOPs per iteration
:
2
read streams
:
{
bytes
:
24.00 B
,
streams
:
3
}
read+write streams
:
{
bytes
:
0.00 B
,
streams
:
0
}
write streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
update
:
FLOPs per iteration
:
0
read streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
read+write streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
write streams
:
{
bytes
:
8.00 B
,
streams
:
1
}
measurements
:
L1
:
1
:
cores
:
[
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
]
results
:
copy
:
[
81.98 GB/s
,
163.75 GB/s
,
245.62 GB/s
,
327.69 GB/s
,
409.41 GB/s
,
489.83
GB/s
,
571.67 GB/s
,
653.50 GB/s
]
daxpy
:
[
71.55 GB/s
,
143.01 GB/s
,
214.86 GB/s
,
286.26 GB/s
,
355.60 GB/s
,
426.71 GB/s
,
497.45 GB/s
,
568.97 GB/s
]
load
:
[
61.92 GB/s
,
122.79 GB/s
,
183.01 GB/s
,
244.30 GB/s
,
306.76 GB/s
,
368.46
GB/s
,
427.41 GB/s
,
490.88 GB/s
]
triad
:
[
81.61 GB/s
,
163.25 GB/s
,
244.92 GB/s
,
326.65 GB/s
,
406.69 GB/s
,
487.76 GB/s
,
569.10 GB/s
,
650.39 GB/s
]
update
:
[
84.03 GB/s
,
168.02 GB/s
,
252.10 GB/s
,
335.94 GB/s
,
419.90 G