Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Showing
with 0 additions and 3113 deletions
/*
* Copyright (2008-2009) Intel Corporation All Rights Reserved.
* The source code contained or described herein and all documents
* related to the source code ("Material") are owned by Intel Corporation
* or its suppliers or licensors. Title to the Material remains with
* Intel Corporation or its suppliers and licensors. The Material
* contains trade secrets and proprietary and confidential information
* of Intel or its suppliers and licensors. The Material is protected
* by worldwide copyright and trade secret laws and treaty provisions.
* No part of the Material may be used, copied, reproduced, modified,
* published, uploaded, posted, transmitted, distributed, or disclosed
* in any way without Intel(R)s prior express written permission.
*
* No license under any patent, copyright, trade secret or other
* intellectual property right is granted to or conferred upon you by
* disclosure or delivery of the Materials, either expressly, by implication,
* inducement, estoppel or otherwise. Any license under such intellectual
* property rights must be express and approved by Intel in writing.
*/
/*
 * IACA_SSC_MARK(id): emit a marker into the instruction stream.
 *
 * Both variants load the numeric marker id into ebx and then emit the three
 * raw bytes 0x64, 0x67, 0x90 (two prefix bytes followed by the NOP opcode).
 * The pair "mov ebx, id" + these bytes is the pattern shown in the assembly
 * comment further below.
 *
 * GCC-compatible compilers: extended inline asm. MARK_ID is stringified into
 * the instruction text, so it must be a literal/macro that expands to an
 * integer constant. The only declared clobber is "memory".
 * NOTE(review): ebx is written by the asm but is not listed as a clobber;
 * presumably this is tolerated because the marked code is only analyzed,
 * not relied upon for correct results -- confirm before using the macros in
 * code whose output matters.
 * The macro body already ends in ';', so call sites need no trailing
 * semicolon.
 */
#if defined (__GNUC__)
#define IACA_SSC_MARK( MARK_ID ) \
__asm__ __volatile__ ( \
"\n\t movl $"#MARK_ID", %%ebx" \
"\n\t .byte 0x64, 0x67, 0x90" \
: : : "memory" );
#else
/* Other compilers: __asm / _emit inline-assembler syntax, same byte sequence. */
#define IACA_SSC_MARK(x) {__asm mov ebx, x\
__asm _emit 0x64 \
__asm _emit 0x67 \
__asm _emit 0x90 }
#endif
/* Start marker uses id 111, end marker uses id 222. */
#define IACA_START {IACA_SSC_MARK(111)}
#define IACA_END {IACA_SSC_MARK(222)}
/*
 * 64-bit Windows builds: markers are expressed through the __writegsbyte
 * intrinsic from <intrin.h> instead of inline assembly, with the same ids
 * (111 = start, 222 = end).
 * NOTE(review): presumably provided because inline __asm is unavailable in
 * that configuration -- confirm against the compiler documentation.
 */
#ifdef _WIN64
#include <intrin.h>
#define IACA_VC64_START __writegsbyte(111, 111);
#define IACA_VC64_END __writegsbyte(222, 222);
#endif
/**************** asm *****************
;START_MARKER
mov ebx, 111
db 0x64, 0x67, 0x90
;END_MARKER
mov ebx, 222
db 0x64, 0x67, 0x90
**************************************/
#include "iacaMarks.h"
int main(int argc, char * argv[]){
int a = 0;
for(int i = 0; i < argc+100000; i++){
IACA_START
a += i;
}
IACA_END
return a;
}
/*
 * 2D stencil sweep: for every interior index pair (j in 1..28, i in 1..48)
 * b[j][i] is assigned the sum of the four direct neighbours of a[j][i]
 * (left, right, up, down) multiplied by s. The centre element a[j][i]
 * itself is not part of the sum, and the outermost rows/columns of b are
 * never written.
 *
 * This is a fragment, not a complete translation unit: the loops appear at
 * file scope and s is never assigned here.
 * NOTE(review): a and b are declared with three dimensions ([30][50][3])
 * but are indexed with only two subscripts, so each operand is a double[3]
 * rather than a double -- presumably intentional input for a tool that
 * parses this fragment; confirm before treating it as compilable C.
 */
double a[30][50][3];
double b[30][50][3];
double s;
for(int j=1; j<30-1; ++j)
for(int i=1; i<50-1; ++i)
b[j][i] = ( a[j][i-1] + a[j][i+1]
+ a[j-1][i] + a[j+1][i]) * s;
/*
 * 3D stencil sweep: for every interior index triple (k in 1..M-2,
 * j in 1..N-2, i in 1..N-2) b[k][j][i] is assigned the sum of the six
 * direct neighbours of a[k][j][i] (+/-1 along each of the three axes)
 * multiplied by s. The centre element is not part of the sum, and the
 * boundary layer of b is never written.
 *
 * This is a fragment, not a complete translation unit: the loops appear at
 * file scope and s is never assigned here.
 * NOTE(review): M and N are not defined in this fragment -- presumably
 * they are supplied externally (e.g. as preprocessor constants); confirm
 * against whatever consumes this file.
 */
double a[M][N][N];
double b[M][N][N];
double s;
for(int k=1; k<M-1; ++k)
for(int j=1; j<N-1; ++j)
for(int i=1; i<N-1; ++i)
b[k][j][i] = ( a[k][j][i-1] + a[k][j][i+1]
+ a[k][j-1][i] + a[k][j+1][i]
+ a[k-1][j][i] + a[k+1][j][i]) * s;
kerncraft version: 0.8.6.dev0
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
model type: Intel Xeon SandyBridge EN/EP processor
clock: 2.7 GHz
sockets: 2
cores per socket: 8
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 8
transparent hugepage: always
in-core model: !!omap
- IACA: SNB
- OSACA: SNB
- LLVM-MCA: -mcpu=sandybridge
isa: x86
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
compiler: !!omap
- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
overlapping model:
ports:
IACA: ['0', 0DV, '1', '2', '3', '4', '5']
OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports:
IACA: [2D, 3D]
OSACA: [2D, 3D]
LLVM-MCA: [SBPort23]
performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
cacheline size: 64 B
memory hierarchy:
- level: L1
cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true, load_from: L2, store_to: L2}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
upstream throughput: [architecture code analyzer, [2D, 3D]]
transfers overlap: false
- level: L2
cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true, load_from: L3, store_to: L3}
cores per group: 1
threads per group: 2
groups: 16
upstream throughput: [32 B/cy, half-duplex]
transfers overlap: false
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true}
cores per group: 8
threads per group: 16
groups: 2
upstream throughput: [32 B/cy, half-duplex]
transfers overlap: false
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01]
+ CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01]
+ CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
upstream throughput: [full socket memory bandwidth, half-duplex]
transfers overlap: false
size per group:
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
fastest bench kernel: copy_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
fastest bench kernel: daxpy_avx
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
fastest bench kernel: load_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
fastest bench kernel: triad_avx
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
fastest bench kernel: update_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96
GB/s, 578.56 GB/s, 660.60 GB/s]
daxpy: [116.88 GB/s, 233.68 GB/s, 311.60 GB/s, 409.72 GB/s, 509.79 GB/s,
559.65 GB/s, 612.77 GB/s, 719.71 GB/s]
load: [84.07 GB/s, 168.13 GB/s, 252.21 GB/s, 336.04 GB/s, 420.34 GB/s, 504.02
GB/s, 588.04 GB/s, 668.37 GB/s]
triad: [100.24 GB/s, 211.57 GB/s, 314.53 GB/s, 392.73 GB/s, 506.87 GB/s,
589.51 GB/s, 687.28 GB/s, 782.17 GB/s]
update: [84.77 GB/s, 160.10 GB/s, 237.12 GB/s, 312.74 GB/s, 392.54 GB/s,
465.53 GB/s, 516.02 GB/s, 567.27 GB/s]
size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
21.12 kB, 21.12 kB]
size per thread: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12
kB, 21.12 kB, 21.12 kB]
stats:
copy:
- [83.24 GB/s, 83.25 GB/s, 83.26 GB/s, 83.26 GB/s, 83.27 GB/s, 83.26 GB/s,
83.25 GB/s, 83.23 GB/s, 83.24 GB/s, 83.25 GB/s]
- [166.49 GB/s, 166.47 GB/s, 166.51 GB/s, 166.49 GB/s, 166.48 GB/s, 166.52
GB/s, 166.51 GB/s, 166.51 GB/s, 166.51 GB/s, 166.50 GB/s]
- [249.78 GB/s, 249.75 GB/s, 249.73 GB/s, 249.72 GB/s, 249.74 GB/s, 249.76
GB/s, 249.76 GB/s, 249.74 GB/s, 249.73 GB/s, 249.75 GB/s]
- [332.98 GB/s, 327.92 GB/s, 332.30 GB/s, 332.95 GB/s, 333.00 GB/s, 333.01
GB/s, 332.95 GB/s, 333.00 GB/s, 332.99 GB/s, 333.02 GB/s]
- [416.26 GB/s, 416.23 GB/s, 416.28 GB/s, 416.27 GB/s, 416.23 GB/s, 416.27
GB/s, 416.34 GB/s, 416.26 GB/s, 416.16 GB/s, 416.23 GB/s]
- [495.84 GB/s, 495.93 GB/s, 495.88 GB/s, 495.91 GB/s, 495.96 GB/s, 495.92
GB/s, 495.89 GB/s, 495.87 GB/s, 495.96 GB/s, 495.92 GB/s]
- [578.51 GB/s, 578.52 GB/s, 578.39 GB/s, 578.56 GB/s, 578.48 GB/s, 578.44
GB/s, 578.51 GB/s, 578.48 GB/s, 578.51 GB/s, 578.53 GB/s]
- [422.14 GB/s, 660.55 GB/s, 660.60 GB/s, 660.49 GB/s, 660.52 GB/s, 660.48
GB/s, 660.56 GB/s, 660.56 GB/s, 660.52 GB/s, 651.64 GB/s]
daxpy:
- [116.87 GB/s, 116.82 GB/s, 116.85 GB/s, 116.84 GB/s, 116.83 GB/s, 116.85
GB/s, 116.88 GB/s, 116.87 GB/s, 116.86 GB/s, 116.82 GB/s]
- [214.69 GB/s, 229.83 GB/s, 221.16 GB/s, 233.60 GB/s, 232.90 GB/s, 233.68
GB/s, 207.83 GB/s, 233.65 GB/s, 212.71 GB/s, 214.07 GB/s]
- [282.77 GB/s, 307.63 GB/s, 307.09 GB/s, 310.67 GB/s, 307.50 GB/s, 311.40
GB/s, 307.06 GB/s, 305.89 GB/s, 311.60 GB/s, 308.47 GB/s]
- [404.96 GB/s, 408.54 GB/s, 395.76 GB/s, 409.72 GB/s, 316.70 GB/s, 408.07
GB/s, 347.34 GB/s, 406.03 GB/s, 391.75 GB/s, 385.10 GB/s]
- [479.84 GB/s, 509.24 GB/s, 502.60 GB/s, 449.79 GB/s, 402.46 GB/s, 489.18
GB/s, 491.15 GB/s, 491.20 GB/s, 384.36 GB/s, 509.79 GB/s]
- [515.12 GB/s, 496.21 GB/s, 517.52 GB/s, 540.00 GB/s, 501.82 GB/s, 507.84
GB/s, 496.71 GB/s, 479.42 GB/s, 559.65 GB/s, 519.55 GB/s]
- [584.86 GB/s, 580.10 GB/s, 583.34 GB/s, 612.77 GB/s, 607.15 GB/s, 607.89
GB/s, 589.85 GB/s, 609.59 GB/s, 592.86 GB/s, 568.07 GB/s]
- [719.71 GB/s, 660.98 GB/s, 675.88 GB/s, 679.51 GB/s, 696.97 GB/s, 635.23
GB/s, 644.06 GB/s, 694.74 GB/s, 654.01 GB/s, 656.57 GB/s]
load:
- [84.04 GB/s, 84.06 GB/s, 84.06 GB/s, 84.04 GB/s, 84.05 GB/s, 84.05 GB/s,
84.07 GB/s, 84.04 GB/s, 84.05 GB/s, 84.06 GB/s]
- [168.09 GB/s, 168.12 GB/s, 168.06 GB/s, 168.11 GB/s, 168.12 GB/s, 168.13
GB/s, 168.13 GB/s, 168.12 GB/s, 168.10 GB/s, 168.13 GB/s]
- [252.16 GB/s, 252.21 GB/s, 252.07 GB/s, 252.07 GB/s, 252.18 GB/s, 252.16
GB/s, 252.21 GB/s, 252.20 GB/s, 252.20 GB/s, 252.17 GB/s]
- [335.94 GB/s, 336.03 GB/s, 335.99 GB/s, 336.04 GB/s, 336.00 GB/s, 335.98
GB/s, 335.97 GB/s, 335.89 GB/s, 335.99 GB/s, 336.03 GB/s]
- [420.30 GB/s, 420.18 GB/s, 420.30 GB/s, 420.33 GB/s, 420.25 GB/s, 420.28
GB/s, 420.31 GB/s, 420.31 GB/s, 420.34 GB/s, 420.33 GB/s]
- [503.98 GB/s, 503.99 GB/s, 503.97 GB/s, 503.98 GB/s, 504.02 GB/s, 503.99
GB/s, 503.92 GB/s, 503.98 GB/s, 503.94 GB/s, 503.97 GB/s]
- [587.93 GB/s, 588.01 GB/s, 588.04 GB/s, 587.94 GB/s, 587.97 GB/s, 588.01
GB/s, 588.00 GB/s, 587.92 GB/s, 588.04 GB/s, 588.02 GB/s]
- [668.21 GB/s, 668.22 GB/s, 668.29 GB/s, 668.24 GB/s, 668.27 GB/s, 668.37
GB/s, 668.28 GB/s, 668.14 GB/s, 668.19 GB/s, 668.19 GB/s]
triad:
- [100.00 GB/s, 99.71 GB/s, 99.74 GB/s, 100.24 GB/s, 99.72 GB/s, 99.62 GB/s,
99.54 GB/s, 99.61 GB/s, 99.72 GB/s, 99.71 GB/s]
- [208.08 GB/s, 210.33 GB/s, 211.57 GB/s, 208.34 GB/s, 210.03 GB/s, 209.16
GB/s, 210.21 GB/s, 209.48 GB/s, 210.03 GB/s, 208.80 GB/s]
- [311.43 GB/s, 311.08 GB/s, 311.41 GB/s, 311.10 GB/s, 313.13 GB/s, 314.53
GB/s, 311.59 GB/s, 311.80 GB/s, 311.57 GB/s, 311.89 GB/s]
- [391.65 GB/s, 392.34 GB/s, 391.84 GB/s, 392.07 GB/s, 391.96 GB/s, 392.73
GB/s, 391.66 GB/s, 391.83 GB/s, 392.09 GB/s, 391.88 GB/s]
- [504.20 GB/s, 506.77 GB/s, 503.22 GB/s, 506.74 GB/s, 502.78 GB/s, 506.15
GB/s, 506.87 GB/s, 502.85 GB/s, 505.82 GB/s, 506.57 GB/s]
- [587.75 GB/s, 589.51 GB/s, 588.01 GB/s, 587.29 GB/s, 588.04 GB/s, 587.92
GB/s, 588.08 GB/s, 587.94 GB/s, 587.82 GB/s, 587.55 GB/s]
- [686.03 GB/s, 685.97 GB/s, 685.01 GB/s, 685.88 GB/s, 685.61 GB/s, 687.12
GB/s, 684.97 GB/s, 686.09 GB/s, 685.81 GB/s, 687.28 GB/s]
- [782.05 GB/s, 781.73 GB/s, 781.13 GB/s, 781.87 GB/s, 782.17 GB/s, 781.24
GB/s, 781.82 GB/s, 781.92 GB/s, 781.90 GB/s, 781.66 GB/s]
update:
- [84.76 GB/s, 84.76 GB/s, 84.77 GB/s, 84.75 GB/s, 84.75 GB/s, 84.75 GB/s,
84.75 GB/s, 84.75 GB/s, 84.74 GB/s, 57.21 GB/s]
- [157.73 GB/s, 155.29 GB/s, 147.91 GB/s, 160.10 GB/s, 156.33 GB/s, 158.06
GB/s, 159.23 GB/s, 156.16 GB/s, 155.30 GB/s, 159.15 GB/s]
- [232.07 GB/s, 230.40 GB/s, 234.05 GB/s, 232.69 GB/s, 215.80 GB/s, 232.76
GB/s, 236.01 GB/s, 237.12 GB/s, 234.66 GB/s, 234.86 GB/s]
- [303.60 GB/s, 304.21 GB/s, 306.83 GB/s, 309.43 GB/s, 312.69 GB/s, 311.75
GB/s, 301.74 GB/s, 307.54 GB/s, 312.74 GB/s, 312.19 GB/s]
- [386.45 GB/s, 382.41 GB/s, 387.87 GB/s, 392.54 GB/s, 369.42 GB/s, 341.87
GB/s, 352.85 GB/s, 390.87 GB/s, 382.44 GB/s, 383.50 GB/s]
- [459.60 GB/s, 384.27 GB/s, 437.39 GB/s, 459.42 GB/s, 465.53 GB/s, 447.31
GB/s, 440.00 GB/s, 409.94 GB/s, 412.94 GB/s, 446.74 GB/s]
- [489.85 GB/s, 489.35 GB/s, 435.92 GB/s, 492.39 GB/s, 446.44 GB/s, 501.71
GB/s, 516.02 GB/s, 478.87 GB/s, 494.52 GB/s, 493.04 GB/s]
- [521.08 GB/s, 553.73 GB/s, 541.34 GB/s, 527.75 GB/s, 554.87 GB/s, 536.30
GB/s, 540.66 GB/s, 551.02 GB/s, 567.27 GB/s, 565.31 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
147.84 kB, 168.96 kB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [80.41 GB/s, 160.83 GB/s, 240.43 GB/s, 320.63 GB/s, 401.66 GB/s, 454.32
GB/s, 539.77 GB/s, 628.51 GB/s]
daxpy: [95.87 GB/s, 187.75 GB/s, 270.68 GB/s, 371.80 GB/s, 454.05 GB/s,
503.46 GB/s, 606.85 GB/s, 689.34 GB/s]
load: [82.30 GB/s, 164.06 GB/s, 244.78 GB/s, 326.21 GB/s, 408.56 GB/s, 490.13
GB/s, 569.95 GB/s, 651.79 GB/s]
triad: [93.22 GB/s, 186.75 GB/s, 288.55 GB/s, 340.91 GB/s, 442.20 GB/s,
534.62 GB/s, 597.98 GB/s, 707.54 GB/s]
update: [83.25 GB/s, 166.04 GB/s, 248.21 GB/s, 330.58 GB/s, 414.71 GB/s,
496.97 GB/s, 578.67 GB/s, 656.56 GB/s]
size per core: [21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB, 21.12 kB,
21.12 kB, 21.12 kB]
size per thread: [10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56 kB, 10.56
kB, 10.56 kB, 10.56 kB]
stats:
copy:
- [80.37 GB/s, 79.07 GB/s, 80.39 GB/s, 80.39 GB/s, 80.41 GB/s, 80.29 GB/s,
80.36 GB/s, 79.05 GB/s, 77.87 GB/s, 80.37 GB/s]
- [160.76 GB/s, 160.63 GB/s, 160.76 GB/s, 160.71 GB/s, 160.80 GB/s, 160.74
GB/s, 160.83 GB/s, 160.69 GB/s, 160.79 GB/s, 160.78 GB/s]
- [240.43 GB/s, 240.20 GB/s, 240.36 GB/s, 240.37 GB/s, 237.17 GB/s, 240.39
GB/s, 240.14 GB/s, 240.24 GB/s, 240.26 GB/s, 240.10 GB/s]
- [320.46 GB/s, 320.47 GB/s, 320.63 GB/s, 320.52 GB/s, 320.40 GB/s, 320.40
GB/s, 320.51 GB/s, 320.46 GB/s, 319.72 GB/s, 320.44 GB/s]
- [401.40 GB/s, 399.28 GB/s, 401.66 GB/s, 401.53 GB/s, 401.52 GB/s, 401.55
GB/s, 401.60 GB/s, 401.47 GB/s, 401.47 GB/s, 401.35 GB/s]
- [447.24 GB/s, 453.65 GB/s, 453.54 GB/s, 453.86 GB/s, 453.82 GB/s, 453.62
GB/s, 453.48 GB/s, 454.32 GB/s, 453.86 GB/s, 446.79 GB/s]
- [538.79 GB/s, 538.47 GB/s, 539.02 GB/s, 538.25 GB/s, 538.72 GB/s, 538.89
GB/s, 539.37 GB/s, 539.41 GB/s, 539.77 GB/s, 538.49 GB/s]
- [628.14 GB/s, 618.54 GB/s, 628.12 GB/s, 623.90 GB/s, 628.27 GB/s, 623.78
GB/s, 618.17 GB/s, 623.43 GB/s, 628.51 GB/s, 628.43 GB/s]
daxpy:
- [95.77 GB/s, 93.25 GB/s, 92.87 GB/s, 95.87 GB/s, 95.84 GB/s, 95.81 GB/s,
95.80 GB/s, 94.99 GB/s, 95.81 GB/s, 95.86 GB/s]
- [184.53 GB/s, 186.60 GB/s, 183.99 GB/s, 187.48 GB/s, 187.75 GB/s, 181.53
GB/s, 183.82 GB/s, 187.75 GB/s, 184.13 GB/s, 180.61 GB/s]
- [258.46 GB/s, 270.13 GB/s, 264.76 GB/s, 262.23 GB/s, 265.05 GB/s, 267.25
GB/s, 270.68 GB/s, 268.08 GB/s, 266.20 GB/s, 265.66 GB/s]
- [367.99 GB/s, 367.15 GB/s, 361.68 GB/s, 364.86 GB/s, 368.76 GB/s, 363.27
GB/s, 364.95 GB/s, 366.97 GB/s, 371.80 GB/s, 366.55 GB/s]
- [441.95 GB/s, 442.77 GB/s, 444.97 GB/s, 454.05 GB/s, 441.02 GB/s, 445.96
GB/s, 442.49 GB/s, 440.23 GB/s, 449.29 GB/s, 452.66 GB/s]
- [501.31 GB/s, 489.91 GB/s, 495.43 GB/s, 503.39 GB/s, 488.03 GB/s, 497.71
GB/s, 503.46 GB/s, 496.85 GB/s, 497.38 GB/s, 468.90 GB/s]
- [604.57 GB/s, 580.51 GB/s, 587.67 GB/s, 594.32 GB/s, 561.32 GB/s, 588.09
GB/s, 606.85 GB/s, 600.91 GB/s, 599.40 GB/s, 598.24 GB/s]
- [646.48 GB/s, 655.06 GB/s, 684.70 GB/s, 653.61 GB/s, 671.61 GB/s, 689.34
GB/s, 673.74 GB/s, 685.49 GB/s, 681.48 GB/s, 683.23 GB/s]
load:
- [82.19 GB/s, 82.08 GB/s, 82.22 GB/s, 82.10 GB/s, 82.14 GB/s, 82.17 GB/s,
82.22 GB/s, 82.28 GB/s, 82.30 GB/s, 81.98 GB/s]
- [163.22 GB/s, 163.43 GB/s, 164.06 GB/s, 164.03 GB/s, 163.19 GB/s, 163.83
GB/s, 163.29 GB/s, 163.88 GB/s, 163.83 GB/s, 163.11 GB/s]
- [244.32 GB/s, 244.47 GB/s, 244.65 GB/s, 244.29 GB/s, 243.96 GB/s, 244.50
GB/s, 244.78 GB/s, 244.52 GB/s, 244.48 GB/s, 244.72 GB/s]
- [325.18 GB/s, 326.21 GB/s, 325.49 GB/s, 325.86 GB/s, 325.73 GB/s, 325.72
GB/s, 326.00 GB/s, 325.41 GB/s, 325.63 GB/s, 325.82 GB/s]
- [407.81 GB/s, 407.96 GB/s, 407.59 GB/s, 408.56 GB/s, 407.64 GB/s, 407.61
GB/s, 408.09 GB/s, 407.95 GB/s, 408.30 GB/s, 408.32 GB/s]
- [488.65 GB/s, 489.73 GB/s, 489.38 GB/s, 489.81 GB/s, 490.13 GB/s, 489.31
GB/s, 488.74 GB/s, 489.38 GB/s, 488.17 GB/s, 489.51 GB/s]
- [569.95 GB/s, 567.21 GB/s, 566.08 GB/s, 567.88 GB/s, 567.69 GB/s, 569.58
GB/s, 568.61 GB/s, 568.35 GB/s, 569.70 GB/s, 568.87 GB/s]
- [650.43 GB/s, 651.58 GB/s, 650.86 GB/s, 651.34 GB/s, 651.04 GB/s, 651.79
GB/s, 650.28 GB/s, 650.31 GB/s, 650.81 GB/s, 651.09 GB/s]
triad:
- [93.22 GB/s, 90.73 GB/s, 92.48 GB/s, 92.53 GB/s, 92.37 GB/s, 92.50 GB/s,
92.48 GB/s, 90.28 GB/s, 92.35 GB/s, 92.51 GB/s]
- [186.75 GB/s, 184.51 GB/s, 184.17 GB/s, 186.66 GB/s, 186.43 GB/s, 184.59
GB/s, 186.71 GB/s, 186.30 GB/s, 186.64 GB/s, 186.12 GB/s]
- [287.77 GB/s, 288.55 GB/s, 287.76 GB/s, 287.76 GB/s, 288.19 GB/s, 287.70
GB/s, 287.42 GB/s, 288.12 GB/s, 287.66 GB/s, 288.01 GB/s]
- [339.82 GB/s, 338.95 GB/s, 340.11 GB/s, 340.11 GB/s, 340.25 GB/s, 340.20
GB/s, 339.90 GB/s, 340.22 GB/s, 340.91 GB/s, 340.01 GB/s]
- [440.41 GB/s, 440.65 GB/s, 441.59 GB/s, 442.20 GB/s, 441.67 GB/s, 432.59
GB/s, 440.20 GB/s, 440.81 GB/s, 440.24 GB/s, 441.38 GB/s]
- [534.30 GB/s, 527.60 GB/s, 528.52 GB/s, 509.55 GB/s, 527.68 GB/s, 527.63
GB/s, 533.66 GB/s, 534.62 GB/s, 534.60 GB/s, 534.19 GB/s]
- [595.90 GB/s, 595.94 GB/s, 597.91 GB/s, 580.22 GB/s, 597.98 GB/s, 597.66
GB/s, 596.16 GB/s, 567.03 GB/s, 580.88 GB/s, 578.29 GB/s]
- [703.80 GB/s, 705.57 GB/s, 694.84 GB/s, 682.59 GB/s, 694.37 GB/s, 696.56
GB/s, 704.50 GB/s, 704.95 GB/s, 694.52 GB/s, 707.54 GB/s]
update:
- [83.18 GB/s, 83.24 GB/s, 83.25 GB/s, 83.16 GB/s, 83.22 GB/s, 83.23 GB/s,
83.22 GB/s, 83.21 GB/s, 83.20 GB/s, 83.17 GB/s]
- [165.65 GB/s, 165.76 GB/s, 165.99 GB/s, 166.04 GB/s, 165.49 GB/s, 165.87
GB/s, 165.58 GB/s, 165.96 GB/s, 165.67 GB/s, 165.66 GB/s]
- [247.30 GB/s, 248.14 GB/s, 247.84 GB/s, 247.90 GB/s, 247.77 GB/s, 247.60
GB/s, 248.21 GB/s, 247.95 GB/s, 248.05 GB/s, 247.83 GB/s]
- [330.49 GB/s, 330.07 GB/s, 329.91 GB/s, 329.90 GB/s, 330.58 GB/s, 329.30
GB/s, 329.92 GB/s, 330.03 GB/s, 330.04 GB/s, 330.12 GB/s]
- [413.89 GB/s, 414.04 GB/s, 413.56 GB/s, 414.06 GB/s, 414.15 GB/s, 413.94
GB/s, 414.04 GB/s, 414.71 GB/s, 414.32 GB/s, 413.93 GB/s]
- [496.97 GB/s, 496.80 GB/s, 496.17 GB/s, 495.42 GB/s, 496.17 GB/s, 496.66
GB/s, 495.55 GB/s, 496.27 GB/s, 495.52 GB/s, 496.80 GB/s]
- [564.44 GB/s, 577.86 GB/s, 574.38 GB/s, 571.96 GB/s, 564.76 GB/s, 578.67
GB/s, 565.89 GB/s, 572.49 GB/s, 571.80 GB/s, 572.01 GB/s]
- [647.68 GB/s, 656.56 GB/s, 655.56 GB/s, 644.04 GB/s, 655.30 GB/s, 648.80
GB/s, 654.77 GB/s, 653.58 GB/s, 656.27 GB/s, 653.79 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [21.12 kB, 42.24 kB, 63.36 kB, 84.48 kB, 105.60 kB, 126.72 kB,
147.84 kB, 168.96 kB]
L2:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [36.74 GB/s, 73.65 GB/s, 107.11 GB/s, 141.43 GB/s, 179.70 GB/s, 215.63
GB/s, 247.20 GB/s, 282.42 GB/s]
daxpy: [44.59 GB/s, 88.24 GB/s, 132.21 GB/s, 175.78 GB/s, 219.11 GB/s, 259.95
GB/s, 305.84 GB/s, 346.83 GB/s]
load: [31.46 GB/s, 62.97 GB/s, 93.73 GB/s, 125.46 GB/s, 157.32 GB/s, 183.63
GB/s, 214.02 GB/s, 245.17 GB/s]
triad: [37.79 GB/s, 75.08 GB/s, 111.43 GB/s, 148.90 GB/s, 185.54 GB/s, 223.72
GB/s, 258.53 GB/s, 299.32 GB/s]
update: [48.46 GB/s, 96.10 GB/s, 141.97 GB/s, 189.18 GB/s, 234.73 GB/s,
280.47 GB/s, 330.94 GB/s, 365.43 GB/s]
size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
kB, 168.96 kB, 168.96 kB]
size per thread: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
kB, 168.96 kB, 168.96 kB]
stats:
copy:
- [36.38 GB/s, 36.59 GB/s, 36.18 GB/s, 36.57 GB/s, 36.26 GB/s, 34.61 GB/s,
35.96 GB/s, 35.84 GB/s, 36.74 GB/s, 36.53 GB/s]
- [68.97 GB/s, 70.42 GB/s, 69.88 GB/s, 71.40 GB/s, 69.05 GB/s, 72.46 GB/s,
70.32 GB/s, 73.65 GB/s, 72.14 GB/s, 69.81 GB/s]
- [107.08 GB/s, 103.53 GB/s, 107.11 GB/s, 103.66 GB/s, 103.88 GB/s, 106.48
GB/s, 97.32 GB/s, 105.92 GB/s, 104.16 GB/s, 104.84 GB/s]
- [138.97 GB/s, 136.86 GB/s, 140.88 GB/s, 138.96 GB/s, 140.58 GB/s, 138.51
GB/s, 141.43 GB/s, 139.53 GB/s, 141.20 GB/s, 139.43 GB/s]
- [158.20 GB/s, 171.06 GB/s, 179.70 GB/s, 171.43 GB/s, 174.27 GB/s, 175.01
GB/s, 165.20 GB/s, 170.89 GB/s, 173.01 GB/s, 175.17 GB/s]
- [209.74 GB/s, 204.59 GB/s, 215.27 GB/s, 215.63 GB/s, 210.59 GB/s, 206.94
GB/s, 211.03 GB/s, 201.61 GB/s, 214.45 GB/s, 208.15 GB/s]
- [241.38 GB/s, 246.88 GB/s, 246.90 GB/s, 247.20 GB/s, 235.27 GB/s, 227.39
GB/s, 239.48 GB/s, 244.45 GB/s, 246.68 GB/s, 235.87 GB/s]
- [271.07 GB/s, 282.42 GB/s, 282.38 GB/s, 276.20 GB/s, 269.85 GB/s, 276.96
GB/s, 268.64 GB/s, 269.61 GB/s, 279.68 GB/s, 280.63 GB/s]
daxpy:
- [44.54 GB/s, 44.59 GB/s, 44.50 GB/s, 44.42 GB/s, 44.41 GB/s, 44.06 GB/s,
43.39 GB/s, 44.02 GB/s, 44.34 GB/s, 44.28 GB/s]
- [85.35 GB/s, 87.05 GB/s, 86.47 GB/s, 86.90 GB/s, 86.92 GB/s, 88.24 GB/s,
87.39 GB/s, 87.60 GB/s, 87.55 GB/s, 84.19 GB/s]
- [129.21 GB/s, 130.47 GB/s, 123.29 GB/s, 127.92 GB/s, 132.21 GB/s, 128.37
GB/s, 127.09 GB/s, 128.72 GB/s, 129.34 GB/s, 128.69 GB/s]
- [171.53 GB/s, 169.64 GB/s, 173.92 GB/s, 173.74 GB/s, 168.53 GB/s, 171.54
GB/s, 173.96 GB/s, 175.78 GB/s, 171.29 GB/s, 171.33 GB/s]
- [219.11 GB/s, 208.86 GB/s, 211.66 GB/s, 216.47 GB/s, 212.73 GB/s, 204.90
GB/s, 208.87 GB/s, 215.75 GB/s, 213.61 GB/s, 214.56 GB/s]
- [250.69 GB/s, 241.36 GB/s, 255.22 GB/s, 250.29 GB/s, 253.80 GB/s, 256.34
GB/s, 254.38 GB/s, 259.95 GB/s, 245.69 GB/s, 259.12 GB/s]
- [296.08 GB/s, 301.77 GB/s, 297.40 GB/s, 305.84 GB/s, 288.62 GB/s, 283.76
GB/s, 293.61 GB/s, 291.93 GB/s, 299.74 GB/s, 289.76 GB/s]
- [344.46 GB/s, 334.36 GB/s, 339.31 GB/s, 330.88 GB/s, 343.26 GB/s, 327.28
GB/s, 344.53 GB/s, 346.83 GB/s, 344.29 GB/s, 346.28 GB/s]
load:
- [31.40 GB/s, 31.23 GB/s, 31.29 GB/s, 31.24 GB/s, 31.46 GB/s, 31.20 GB/s,
31.33 GB/s, 30.01 GB/s, 30.08 GB/s, 31.40 GB/s]
- [61.20 GB/s, 60.74 GB/s, 61.93 GB/s, 61.22 GB/s, 61.20 GB/s, 60.03 GB/s,
59.33 GB/s, 59.94 GB/s, 58.54 GB/s, 62.97 GB/s]
- [91.53 GB/s, 93.73 GB/s, 93.05 GB/s, 90.07 GB/s, 91.60 GB/s, 90.11 GB/s,
90.21 GB/s, 90.43 GB/s, 89.15 GB/s, 93.10 GB/s]
- [122.80 GB/s, 116.57 GB/s, 120.68 GB/s, 122.54 GB/s, 122.75 GB/s, 121.79
GB/s, 125.30 GB/s, 125.46 GB/s, 122.28 GB/s, 124.51 GB/s]
- [151.01 GB/s, 151.10 GB/s, 148.68 GB/s, 151.17 GB/s, 147.24 GB/s, 153.65
GB/s, 146.48 GB/s, 150.48 GB/s, 150.74 GB/s, 157.32 GB/s]
- [181.52 GB/s, 173.89 GB/s, 181.58 GB/s, 174.01 GB/s, 176.40 GB/s, 179.73
GB/s, 174.06 GB/s, 181.26 GB/s, 180.57 GB/s, 183.63 GB/s]
- [214.02 GB/s, 205.69 GB/s, 207.64 GB/s, 204.18 GB/s, 208.42 GB/s, 211.39
GB/s, 206.58 GB/s, 204.90 GB/s, 204.75 GB/s, 208.91 GB/s]
- [232.16 GB/s, 233.90 GB/s, 241.32 GB/s, 237.45 GB/s, 235.41 GB/s, 241.17
GB/s, 237.52 GB/s, 245.17 GB/s, 241.17 GB/s, 234.08 GB/s]
triad:
- [37.62 GB/s, 37.54 GB/s, 37.79 GB/s, 37.67 GB/s, 37.76 GB/s, 37.77 GB/s,
37.68 GB/s, 35.83 GB/s, 37.06 GB/s, 37.50 GB/s]
- [72.79 GB/s, 74.76 GB/s, 73.15 GB/s, 74.68 GB/s, 73.88 GB/s, 73.27 GB/s,
75.08 GB/s, 73.48 GB/s, 71.27 GB/s, 72.05 GB/s]
- [106.26 GB/s, 105.22 GB/s, 109.70 GB/s, 109.07 GB/s, 110.84 GB/s, 111.43
GB/s, 106.32 GB/s, 109.73 GB/s, 106.22 GB/s, 107.20 GB/s]
- [142.10 GB/s, 148.90 GB/s, 148.11 GB/s, 144.38 GB/s, 144.77 GB/s, 145.42
GB/s, 147.36 GB/s, 142.94 GB/s, 145.39 GB/s, 139.42 GB/s]
- [182.07 GB/s, 176.75 GB/s, 181.39 GB/s, 183.31 GB/s, 181.87 GB/s, 183.71
GB/s, 180.48 GB/s, 178.11 GB/s, 181.36 GB/s, 185.54 GB/s]
- [219.85 GB/s, 217.02 GB/s, 218.86 GB/s, 217.09 GB/s, 212.24 GB/s, 212.22
GB/s, 219.33 GB/s, 208.81 GB/s, 215.84 GB/s, 223.72 GB/s]
- [258.06 GB/s, 232.27 GB/s, 247.04 GB/s, 240.55 GB/s, 236.11 GB/s, 251.88
GB/s, 258.53 GB/s, 247.32 GB/s, 251.53 GB/s, 245.10 GB/s]
- [273.67 GB/s, 292.81 GB/s, 288.67 GB/s, 289.75 GB/s, 293.98 GB/s, 283.56
GB/s, 295.33 GB/s, 280.11 GB/s, 299.32 GB/s, 285.18 GB/s]
update:
- [47.30 GB/s, 48.33 GB/s, 48.17 GB/s, 47.38 GB/s, 48.16 GB/s, 46.99 GB/s,
48.46 GB/s, 47.51 GB/s, 46.20 GB/s, 48.26 GB/s]
- [92.10 GB/s, 92.30 GB/s, 95.73 GB/s, 95.53 GB/s, 86.95 GB/s, 96.10 GB/s,
94.16 GB/s, 89.72 GB/s, 92.00 GB/s, 93.10 GB/s]
- [137.06 GB/s, 140.40 GB/s, 136.20 GB/s, 139.57 GB/s, 140.69 GB/s, 136.20
GB/s, 141.53 GB/s, 129.76 GB/s, 136.47 GB/s, 141.97 GB/s]
- [184.84 GB/s, 177.96 GB/s, 178.61 GB/s, 179.03 GB/s, 176.59 GB/s, 180.62
GB/s, 182.26 GB/s, 182.27 GB/s, 189.18 GB/s, 185.49 GB/s]
- [232.17 GB/s, 217.86 GB/s, 232.40 GB/s, 223.10 GB/s, 228.52 GB/s, 234.73
GB/s, 232.00 GB/s, 233.14 GB/s, 231.69 GB/s, 225.01 GB/s]
- [276.16 GB/s, 274.80 GB/s, 272.58 GB/s, 272.43 GB/s, 280.47 GB/s, 276.90
GB/s, 264.76 GB/s, 272.47 GB/s, 277.77 GB/s, 271.42 GB/s]
- [330.94 GB/s, 312.06 GB/s, 312.83 GB/s, 312.62 GB/s, 292.44 GB/s, 315.68
GB/s, 316.67 GB/s, 321.25 GB/s, 321.71 GB/s, 315.05 GB/s]
- [362.85 GB/s, 356.49 GB/s, 365.43 GB/s, 332.52 GB/s, 354.30 GB/s, 354.68
GB/s, 335.54 GB/s, 358.54 GB/s, 363.22 GB/s, 360.01 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
1.18 MB, 1.35 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [36.83 GB/s, 72.70 GB/s, 108.11 GB/s, 142.21 GB/s, 178.07 GB/s, 213.30
GB/s, 251.98 GB/s, 283.06 GB/s]
daxpy: [45.34 GB/s, 90.11 GB/s, 134.85 GB/s, 180.06 GB/s, 224.22 GB/s, 268.27
GB/s, 312.15 GB/s, 358.38 GB/s]
load: [33.99 GB/s, 67.65 GB/s, 100.93 GB/s, 134.81 GB/s, 165.89 GB/s, 196.09
GB/s, 233.31 GB/s, 262.05 GB/s]
triad: [38.60 GB/s, 76.58 GB/s, 114.50 GB/s, 150.54 GB/s, 189.60 GB/s, 227.05
GB/s, 263.75 GB/s, 301.02 GB/s]
update: [49.25 GB/s, 97.34 GB/s, 146.81 GB/s, 194.71 GB/s, 239.97 GB/s,
287.14 GB/s, 330.84 GB/s, 384.71 GB/s]
size per core: [168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96 kB, 168.96
kB, 168.96 kB, 168.96 kB]
size per thread: [84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48 kB, 84.48
kB, 84.48 kB, 84.48 kB]
stats:
copy:
- [36.83 GB/s, 36.67 GB/s, 34.90 GB/s, 36.44 GB/s, 35.13 GB/s, 35.07 GB/s,
35.53 GB/s, 36.15 GB/s, 35.85 GB/s, 36.23 GB/s]
- [71.52 GB/s, 70.16 GB/s, 70.67 GB/s, 71.20 GB/s, 72.70 GB/s, 70.14 GB/s,
70.53 GB/s, 69.17 GB/s, 71.57 GB/s, 70.22 GB/s]
- [104.39 GB/s, 104.74 GB/s, 103.12 GB/s, 108.11 GB/s, 105.30 GB/s, 102.80
GB/s, 102.90 GB/s, 107.06 GB/s, 103.45 GB/s, 105.45 GB/s]
- [139.02 GB/s, 134.63 GB/s, 140.72 GB/s, 141.32 GB/s, 140.35 GB/s, 141.19
GB/s, 135.44 GB/s, 142.21 GB/s, 140.96 GB/s, 142.05 GB/s]
- [177.86 GB/s, 177.74 GB/s, 177.42 GB/s, 175.35 GB/s, 176.42 GB/s, 173.13
GB/s, 174.32 GB/s, 170.24 GB/s, 178.07 GB/s, 177.88 GB/s]
- [206.27 GB/s, 211.63 GB/s, 209.06 GB/s, 210.54 GB/s, 208.80 GB/s, 209.99
GB/s, 208.77 GB/s, 206.41 GB/s, 213.30 GB/s, 206.39 GB/s]
- [240.18 GB/s, 238.36 GB/s, 244.16 GB/s, 236.26 GB/s, 244.12 GB/s, 238.49
GB/s, 242.23 GB/s, 244.46 GB/s, 251.98 GB/s, 242.55 GB/s]
- [279.77 GB/s, 282.91 GB/s, 278.73 GB/s, 276.91 GB/s, 283.06 GB/s, 273.23
GB/s, 278.33 GB/s, 280.88 GB/s, 277.54 GB/s, 281.83 GB/s]
daxpy:
- [45.32 GB/s, 44.62 GB/s, 45.29 GB/s, 45.18 GB/s, 45.17 GB/s, 45.07 GB/s,
44.69 GB/s, 45.17 GB/s, 45.11 GB/s, 45.34 GB/s]
- [89.94 GB/s, 89.97 GB/s, 89.37 GB/s, 89.90 GB/s, 88.37 GB/s, 89.13 GB/s,
90.11 GB/s, 89.67 GB/s, 89.90 GB/s, 89.93 GB/s]
- [134.83 GB/s, 134.85 GB/s, 132.02 GB/s, 134.33 GB/s, 133.82 GB/s, 132.39
GB/s, 131.67 GB/s, 134.62 GB/s, 132.71 GB/s, 131.67 GB/s]
- [175.52 GB/s, 173.36 GB/s, 176.83 GB/s, 177.98 GB/s, 175.73 GB/s, 173.42
GB/s, 180.06 GB/s, 179.55 GB/s, 176.71 GB/s, 175.85 GB/s]
- [222.00 GB/s, 216.86 GB/s, 220.17 GB/s, 218.14 GB/s, 220.60 GB/s, 219.43
GB/s, 220.58 GB/s, 224.22 GB/s, 220.89 GB/s, 222.28 GB/s]
- [258.75 GB/s, 262.88 GB/s, 261.77 GB/s, 268.27 GB/s, 263.66 GB/s, 262.59
GB/s, 266.54 GB/s, 261.67 GB/s, 262.80 GB/s, 263.72 GB/s]
- [298.65 GB/s, 312.15 GB/s, 308.52 GB/s, 304.22 GB/s, 301.87 GB/s, 305.53
GB/s, 309.84 GB/s, 310.67 GB/s, 310.49 GB/s, 311.99 GB/s]
- [347.55 GB/s, 350.67 GB/s, 348.93 GB/s, 358.38 GB/s, 352.35 GB/s, 352.05
GB/s, 353.82 GB/s, 356.00 GB/s, 348.07 GB/s, 349.87 GB/s]
load:
- [33.99 GB/s, 32.54 GB/s, 32.94 GB/s, 33.17 GB/s, 33.83 GB/s, 31.55 GB/s,
31.91 GB/s, 33.86 GB/s, 33.93 GB/s, 33.75 GB/s]
- [66.22 GB/s, 64.94 GB/s, 67.64 GB/s, 67.52 GB/s, 65.01 GB/s, 67.21 GB/s,
66.07 GB/s, 66.43 GB/s, 67.65 GB/s, 64.84 GB/s]
- [98.58 GB/s, 97.97 GB/s, 98.39 GB/s, 98.50 GB/s, 98.77 GB/s, 97.84 GB/s,
99.58 GB/s, 100.93 GB/s, 100.50 GB/s, 99.94 GB/s]
- [130.23 GB/s, 131.10 GB/s, 131.04 GB/s, 127.83 GB/s, 134.81 GB/s, 132.68
GB/s, 131.80 GB/s, 129.42 GB/s, 130.76 GB/s, 126.96 GB/s]
- [164.90 GB/s, 165.18 GB/s, 161.19 GB/s, 164.33 GB/s, 162.76 GB/s, 165.04
GB/s, 162.20 GB/s, 165.89 GB/s, 164.34 GB/s, 159.66 GB/s]
- [192.69 GB/s, 193.33 GB/s, 188.88 GB/s, 190.70 GB/s, 194.60 GB/s, 190.92
GB/s, 191.36 GB/s, 192.89 GB/s, 191.85 GB/s, 196.09 GB/s]
- [227.70 GB/s, 223.95 GB/s, 222.79 GB/s, 227.09 GB/s, 227.04 GB/s, 229.45
GB/s, 228.09 GB/s, 227.83 GB/s, 233.31 GB/s, 227.49 GB/s]
- [257.94 GB/s, 261.47 GB/s, 262.05 GB/s, 257.70 GB/s, 259.70 GB/s, 259.23
GB/s, 261.09 GB/s, 253.81 GB/s, 254.21 GB/s, 259.34 GB/s]
triad:
- [38.60 GB/s, 36.68 GB/s, 38.07 GB/s, 38.10 GB/s, 37.89 GB/s, 36.48 GB/s,
38.33 GB/s, 38.12 GB/s, 37.43 GB/s, 37.87 GB/s]
- [76.58 GB/s, 74.97 GB/s, 75.74 GB/s, 76.02 GB/s, 72.66 GB/s, 74.73 GB/s,
76.37 GB/s, 76.18 GB/s, 74.59 GB/s, 75.75 GB/s]
- [111.71 GB/s, 114.50 GB/s, 108.96 GB/s, 111.49 GB/s, 111.56 GB/s, 111.66
GB/s, 113.43 GB/s, 114.37 GB/s, 111.67 GB/s, 108.14 GB/s]
- [146.29 GB/s, 147.84 GB/s, 149.09 GB/s, 149.93 GB/s, 150.54 GB/s, 145.50
GB/s, 145.16 GB/s, 149.47 GB/s, 146.30 GB/s, 149.32 GB/s]
- [186.73 GB/s, 186.46 GB/s, 180.47 GB/s, 187.32 GB/s, 184.34 GB/s, 187.34
GB/s, 186.55 GB/s, 183.81 GB/s, 189.60 GB/s, 188.70 GB/s]
- [224.81 GB/s, 219.69 GB/s, 227.05 GB/s, 224.25 GB/s, 223.36 GB/s, 225.86
GB/s, 216.09 GB/s, 221.98 GB/s, 218.47 GB/s, 226.37 GB/s]
- [263.29 GB/s, 259.28 GB/s, 258.81 GB/s, 258.77 GB/s, 256.56 GB/s, 256.49
GB/s, 256.39 GB/s, 263.75 GB/s, 262.00 GB/s, 261.48 GB/s]
- [299.28 GB/s, 292.80 GB/s, 293.63 GB/s, 297.93 GB/s, 293.02 GB/s, 295.95
GB/s, 287.92 GB/s, 301.02 GB/s, 300.76 GB/s, 297.01 GB/s]
update:
- [49.07 GB/s, 47.17 GB/s, 47.56 GB/s, 49.25 GB/s, 46.44 GB/s, 49.04 GB/s,
48.91 GB/s, 49.20 GB/s, 48.30 GB/s, 48.85 GB/s]
- [96.45 GB/s, 97.11 GB/s, 94.03 GB/s, 92.56 GB/s, 95.39 GB/s, 97.34 GB/s,
96.06 GB/s, 92.25 GB/s, 95.53 GB/s, 97.08 GB/s]
- [137.54 GB/s, 135.13 GB/s, 145.80 GB/s, 141.29 GB/s, 138.99 GB/s, 143.44
GB/s, 146.81 GB/s, 142.94 GB/s, 133.84 GB/s, 146.33 GB/s]
- [190.64 GB/s, 185.02 GB/s, 194.24 GB/s, 187.48 GB/s, 194.52 GB/s, 188.51
GB/s, 189.17 GB/s, 194.71 GB/s, 194.37 GB/s, 190.83 GB/s]
- [239.97 GB/s, 219.74 GB/s, 233.72 GB/s, 234.38 GB/s, 235.78 GB/s, 235.11
GB/s, 235.62 GB/s, 226.09 GB/s, 235.93 GB/s, 230.51 GB/s]
- [280.16 GB/s, 275.22 GB/s, 260.15 GB/s, 286.01 GB/s, 280.61 GB/s, 287.14
GB/s, 283.75 GB/s, 275.23 GB/s, 283.71 GB/s, 285.38 GB/s]
- [311.15 GB/s, 318.00 GB/s, 325.21 GB/s, 328.34 GB/s, 318.09 GB/s, 328.66
GB/s, 329.69 GB/s, 316.97 GB/s, 328.51 GB/s, 330.84 GB/s]
- [374.41 GB/s, 369.73 GB/s, 358.15 GB/s, 375.54 GB/s, 384.71 GB/s, 357.66
GB/s, 369.71 GB/s, 375.35 GB/s, 370.25 GB/s, 364.01 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [168.96 kB, 337.92 kB, 506.88 kB, 675.84 kB, 844.80 kB, 1.01 MB,
1.18 MB, 1.35 MB]
L3:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [21.93 GB/s, 43.10 GB/s, 65.38 GB/s, 85.69 GB/s, 105.64 GB/s, 127.34
GB/s, 148.22 GB/s, 171.52 GB/s]
daxpy: [30.98 GB/s, 62.27 GB/s, 93.13 GB/s, 123.27 GB/s, 153.64 GB/s, 185.97
GB/s, 216.67 GB/s, 247.41 GB/s]
load: [23.47 GB/s, 46.84 GB/s, 69.74 GB/s, 92.76 GB/s, 115.37 GB/s, 139.23
GB/s, 163.12 GB/s, 186.65 GB/s]
triad: [24.72 GB/s, 49.11 GB/s, 72.42 GB/s, 95.36 GB/s, 119.46 GB/s, 144.60
GB/s, 168.66 GB/s, 189.45 GB/s]
update: [31.39 GB/s, 62.11 GB/s, 91.95 GB/s, 122.24 GB/s, 151.40 GB/s, 182.28
GB/s, 216.07 GB/s, 239.92 GB/s]
size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
MB, 1.65 MB]
size per thread: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
MB, 1.65 MB]
stats:
copy:
- [21.64 GB/s, 20.85 GB/s, 20.56 GB/s, 21.69 GB/s, 21.06 GB/s, 21.46 GB/s,
21.93 GB/s, 21.73 GB/s, 21.83 GB/s, 21.69 GB/s]
- [42.86 GB/s, 42.70 GB/s, 42.72 GB/s, 38.47 GB/s, 42.82 GB/s, 43.10 GB/s,
42.66 GB/s, 42.44 GB/s, 42.61 GB/s, 42.48 GB/s]
- [64.95 GB/s, 64.34 GB/s, 63.93 GB/s, 65.38 GB/s, 64.36 GB/s, 63.60 GB/s,
62.65 GB/s, 63.66 GB/s, 63.51 GB/s, 63.75 GB/s]
- [84.07 GB/s, 83.97 GB/s, 83.34 GB/s, 83.91 GB/s, 81.16 GB/s, 85.69 GB/s,
85.40 GB/s, 85.37 GB/s, 85.42 GB/s, 84.48 GB/s]
- [102.83 GB/s, 104.24 GB/s, 105.42 GB/s, 103.68 GB/s, 105.22 GB/s, 105.64
GB/s, 103.15 GB/s, 102.02 GB/s, 100.60 GB/s, 105.09 GB/s]
- [125.46 GB/s, 122.23 GB/s, 123.56 GB/s, 124.59 GB/s, 127.03 GB/s, 125.39
GB/s, 124.50 GB/s, 127.02 GB/s, 126.95 GB/s, 127.34 GB/s]
- [147.99 GB/s, 146.65 GB/s, 139.23 GB/s, 147.69 GB/s, 146.42 GB/s, 145.65
GB/s, 148.22 GB/s, 143.77 GB/s, 147.96 GB/s, 147.70 GB/s]
- [168.36 GB/s, 168.24 GB/s, 164.99 GB/s, 165.32 GB/s, 167.08 GB/s, 165.98
GB/s, 165.39 GB/s, 165.84 GB/s, 166.15 GB/s, 171.52 GB/s]
daxpy:
- [30.92 GB/s, 30.74 GB/s, 30.87 GB/s, 30.98 GB/s, 30.45 GB/s, 29.62 GB/s,
29.54 GB/s, 30.04 GB/s, 30.94 GB/s, 30.93 GB/s]
- [61.96 GB/s, 61.38 GB/s, 61.27 GB/s, 62.27 GB/s, 61.36 GB/s, 61.27 GB/s,
62.06 GB/s, 60.01 GB/s, 61.49 GB/s, 62.16 GB/s]
- [92.26 GB/s, 93.06 GB/s, 88.45 GB/s, 92.18 GB/s, 93.13 GB/s, 92.11 GB/s,
92.28 GB/s, 92.28 GB/s, 93.03 GB/s, 92.78 GB/s]
- [123.22 GB/s, 123.06 GB/s, 123.27 GB/s, 119.42 GB/s, 122.94 GB/s, 122.54
GB/s, 123.24 GB/s, 115.90 GB/s, 121.65 GB/s, 122.47 GB/s]
- [151.70 GB/s, 145.65 GB/s, 149.53 GB/s, 152.52 GB/s, 153.64 GB/s, 152.93
GB/s, 152.81 GB/s, 153.01 GB/s, 153.04 GB/s, 152.06 GB/s]
- [184.04 GB/s, 171.51 GB/s, 184.83 GB/s, 184.09 GB/s, 185.97 GB/s, 183.75
GB/s, 184.66 GB/s, 182.54 GB/s, 184.39 GB/s, 184.40 GB/s]
- [198.70 GB/s, 216.51 GB/s, 216.17 GB/s, 203.10 GB/s, 211.40 GB/s, 215.04
GB/s, 215.48 GB/s, 216.03 GB/s, 216.24 GB/s, 216.67 GB/s]
- [246.02 GB/s, 247.35 GB/s, 245.00 GB/s, 244.65 GB/s, 229.12 GB/s, 243.37
GB/s, 247.22 GB/s, 247.41 GB/s, 246.03 GB/s, 244.83 GB/s]
load:
- [23.08 GB/s, 23.38 GB/s, 22.88 GB/s, 23.43 GB/s, 23.05 GB/s, 23.23 GB/s,
22.97 GB/s, 22.39 GB/s, 23.47 GB/s, 23.33 GB/s]
- [46.39 GB/s, 46.40 GB/s, 46.45 GB/s, 46.36 GB/s, 46.69 GB/s, 46.62 GB/s,
46.84 GB/s, 45.98 GB/s, 46.73 GB/s, 46.80 GB/s]
- [69.18 GB/s, 68.61 GB/s, 69.74 GB/s, 69.34 GB/s, 68.39 GB/s, 69.73 GB/s,
67.76 GB/s, 69.65 GB/s, 69.70 GB/s, 69.16 GB/s]
- [92.29 GB/s, 91.67 GB/s, 92.76 GB/s, 90.78 GB/s, 92.76 GB/s, 90.76 GB/s,
91.58 GB/s, 91.60 GB/s, 91.03 GB/s, 92.72 GB/s]
- [114.04 GB/s, 113.82 GB/s, 112.26 GB/s, 112.65 GB/s, 114.09 GB/s, 113.81
GB/s, 113.72 GB/s, 114.70 GB/s, 115.37 GB/s, 112.57 GB/s]
- [136.42 GB/s, 135.83 GB/s, 134.93 GB/s, 135.43 GB/s, 135.94 GB/s, 139.23
GB/s, 137.52 GB/s, 137.59 GB/s, 135.97 GB/s, 136.96 GB/s]
- [157.88 GB/s, 163.12 GB/s, 159.53 GB/s, 160.16 GB/s, 162.18 GB/s, 159.58
GB/s, 161.55 GB/s, 159.81 GB/s, 162.97 GB/s, 163.10 GB/s]
- [183.41 GB/s, 181.86 GB/s, 183.55 GB/s, 183.38 GB/s, 181.66 GB/s, 186.65
GB/s, 179.62 GB/s, 174.70 GB/s, 180.10 GB/s, 181.49 GB/s]
triad:
- [24.72 GB/s, 23.66 GB/s, 23.58 GB/s, 23.75 GB/s, 23.62 GB/s, 24.37 GB/s,
24.44 GB/s, 23.57 GB/s, 23.30 GB/s, 23.57 GB/s]
- [49.11 GB/s, 46.87 GB/s, 47.13 GB/s, 46.83 GB/s, 46.58 GB/s, 46.73 GB/s,
46.32 GB/s, 47.22 GB/s, 46.79 GB/s, 48.73 GB/s]
- [72.29 GB/s, 69.87 GB/s, 70.57 GB/s, 68.89 GB/s, 68.56 GB/s, 69.02 GB/s,
72.42 GB/s, 69.37 GB/s, 72.34 GB/s, 69.44 GB/s]
- [94.95 GB/s, 94.67 GB/s, 91.05 GB/s, 90.46 GB/s, 95.36 GB/s, 91.63 GB/s,
94.06 GB/s, 95.30 GB/s, 93.99 GB/s, 94.71 GB/s]
- [119.32 GB/s, 117.99 GB/s, 119.46 GB/s, 117.28 GB/s, 118.97 GB/s, 115.67
GB/s, 116.64 GB/s, 117.99 GB/s, 119.02 GB/s, 117.75 GB/s]
- [138.63 GB/s, 144.53 GB/s, 144.60 GB/s, 135.72 GB/s, 141.86 GB/s, 139.64
GB/s, 142.95 GB/s, 140.89 GB/s, 142.10 GB/s, 143.97 GB/s]
- [168.66 GB/s, 166.77 GB/s, 157.10 GB/s, 164.75 GB/s, 164.00 GB/s, 164.38
GB/s, 163.94 GB/s, 158.58 GB/s, 165.60 GB/s, 164.39 GB/s]
- [184.53 GB/s, 187.00 GB/s, 186.87 GB/s, 179.43 GB/s, 185.70 GB/s, 187.49
GB/s, 189.45 GB/s, 186.82 GB/s, 188.50 GB/s, 185.96 GB/s]
update:
- [30.60 GB/s, 31.20 GB/s, 30.65 GB/s, 31.39 GB/s, 30.89 GB/s, 30.75 GB/s,
30.58 GB/s, 30.99 GB/s, 30.69 GB/s, 31.34 GB/s]
- [60.99 GB/s, 62.11 GB/s, 61.42 GB/s, 61.55 GB/s, 61.79 GB/s, 61.24 GB/s,
61.37 GB/s, 61.74 GB/s, 61.45 GB/s, 61.58 GB/s]
- [91.11 GB/s, 91.21 GB/s, 91.95 GB/s, 91.19 GB/s, 91.14 GB/s, 91.36 GB/s,
91.30 GB/s, 91.70 GB/s, 90.84 GB/s, 91.09 GB/s]
- [120.90 GB/s, 120.49 GB/s, 121.35 GB/s, 122.24 GB/s, 120.37 GB/s, 119.83
GB/s, 119.32 GB/s, 119.48 GB/s, 119.11 GB/s, 119.76 GB/s]
- [146.72 GB/s, 147.18 GB/s, 147.81 GB/s, 151.40 GB/s, 147.81 GB/s, 146.84
GB/s, 147.51 GB/s, 148.15 GB/s, 146.89 GB/s, 148.41 GB/s]
- [179.93 GB/s, 179.68 GB/s, 182.28 GB/s, 179.65 GB/s, 179.06 GB/s, 182.25
GB/s, 182.03 GB/s, 179.10 GB/s, 178.82 GB/s, 177.84 GB/s]
- [208.84 GB/s, 210.17 GB/s, 210.20 GB/s, 210.81 GB/s, 209.88 GB/s, 211.16
GB/s, 216.07 GB/s, 211.77 GB/s, 208.89 GB/s, 210.47 GB/s]
- [236.56 GB/s, 239.05 GB/s, 237.81 GB/s, 237.20 GB/s, 238.68 GB/s, 237.69
GB/s, 239.05 GB/s, 239.38 GB/s, 239.92 GB/s, 238.63 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
MB, 13.20 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [23.35 GB/s, 45.63 GB/s, 68.10 GB/s, 89.46 GB/s, 111.10 GB/s, 134.20
GB/s, 154.44 GB/s, 174.89 GB/s]
daxpy: [32.32 GB/s, 64.16 GB/s, 96.12 GB/s, 126.75 GB/s, 156.91 GB/s, 188.57
GB/s, 221.57 GB/s, 251.65 GB/s]
load: [25.14 GB/s, 50.38 GB/s, 75.49 GB/s, 101.06 GB/s, 126.04 GB/s, 151.12
GB/s, 172.57 GB/s, 196.91 GB/s]
triad: [25.15 GB/s, 50.37 GB/s, 75.31 GB/s, 99.12 GB/s, 123.25 GB/s, 150.29
GB/s, 171.60 GB/s, 197.81 GB/s]
update: [32.98 GB/s, 65.60 GB/s, 97.60 GB/s, 130.34 GB/s, 162.76 GB/s, 194.12
GB/s, 229.02 GB/s, 260.35 GB/s]
size per core: [13.20 MB, 6.60 MB, 4.40 MB, 3.30 MB, 2.64 MB, 2.20 MB, 1.89
MB, 1.65 MB]
size per thread: [6.60 MB, 3.30 MB, 2.20 MB, 1.65 MB, 1.32 MB, 1.10 MB, 0.94
MB, 825.00 kB]
stats:
copy:
- [22.79 GB/s, 22.55 GB/s, 22.86 GB/s, 22.74 GB/s, 23.09 GB/s, 22.51 GB/s,
23.35 GB/s, 23.32 GB/s, 23.02 GB/s, 22.75 GB/s]
- [45.32 GB/s, 45.15 GB/s, 45.63 GB/s, 44.84 GB/s, 44.54 GB/s, 44.33 GB/s,
44.68 GB/s, 44.98 GB/s, 44.64 GB/s, 44.75 GB/s]
- [68.10 GB/s, 67.88 GB/s, 67.98 GB/s, 67.32 GB/s, 67.02 GB/s, 67.14 GB/s,
67.71 GB/s, 67.19 GB/s, 63.08 GB/s, 68.04 GB/s]
- [89.46 GB/s, 88.53 GB/s, 88.51 GB/s, 89.13 GB/s, 89.32 GB/s, 84.53 GB/s,
87.51 GB/s, 88.95 GB/s, 88.91 GB/s, 87.62 GB/s]
- [108.72 GB/s, 110.42 GB/s, 106.02 GB/s, 111.08 GB/s, 110.70 GB/s, 111.10
GB/s, 110.24 GB/s, 109.68 GB/s, 109.55 GB/s, 108.86 GB/s]
- [133.21 GB/s, 127.37 GB/s, 132.83 GB/s, 132.67 GB/s, 133.02 GB/s, 132.65
GB/s, 134.20 GB/s, 132.96 GB/s, 118.86 GB/s, 131.20 GB/s]
- [152.95 GB/s, 153.90 GB/s, 153.80 GB/s, 153.22 GB/s, 153.32 GB/s, 142.75
GB/s, 152.99 GB/s, 154.44 GB/s, 154.43 GB/s, 152.24 GB/s]
- [174.89 GB/s, 171.49 GB/s, 157.46 GB/s, 172.90 GB/s, 173.42 GB/s, 171.07
GB/s, 171.82 GB/s, 170.68 GB/s, 172.19 GB/s, 161.38 GB/s]
daxpy:
- [31.88 GB/s, 32.27 GB/s, 31.11 GB/s, 32.20 GB/s, 32.17 GB/s, 32.32 GB/s,
32.20 GB/s, 32.32 GB/s, 30.76 GB/s, 32.03 GB/s]
- [64.16 GB/s, 63.70 GB/s, 64.04 GB/s, 63.55 GB/s, 60.64 GB/s, 64.05 GB/s,
63.56 GB/s, 63.36 GB/s, 63.94 GB/s, 63.86 GB/s]
- [96.12 GB/s, 95.66 GB/s, 95.93 GB/s, 95.93 GB/s, 96.10 GB/s, 95.94 GB/s,
95.78 GB/s, 95.79 GB/s, 95.17 GB/s, 89.44 GB/s]
- [126.04 GB/s, 126.43 GB/s, 126.09 GB/s, 124.90 GB/s, 125.07 GB/s, 125.74
GB/s, 118.86 GB/s, 125.80 GB/s, 125.10 GB/s, 126.75 GB/s]
- [155.92 GB/s, 155.99 GB/s, 156.32 GB/s, 151.54 GB/s, 156.49 GB/s, 156.91
GB/s, 154.92 GB/s, 155.92 GB/s, 156.20 GB/s, 154.49 GB/s]
- [185.57 GB/s, 180.38 GB/s, 187.51 GB/s, 187.10 GB/s, 186.44 GB/s, 187.13
GB/s, 187.31 GB/s, 188.10 GB/s, 187.91 GB/s, 188.57 GB/s]
- [207.55 GB/s, 219.63 GB/s, 219.38 GB/s, 219.81 GB/s, 220.29 GB/s, 219.72
GB/s, 221.05 GB/s, 216.76 GB/s, 221.57 GB/s, 220.75 GB/s]
- [250.81 GB/s, 250.78 GB/s, 251.19 GB/s, 251.28 GB/s, 249.10 GB/s, 250.42
GB/s, 251.65 GB/s, 244.31 GB/s, 250.40 GB/s, 250.19 GB/s]
load:
- [24.84 GB/s, 24.86 GB/s, 25.09 GB/s, 25.04 GB/s, 24.74 GB/s, 24.87 GB/s,
25.01 GB/s, 25.08 GB/s, 25.14 GB/s, 25.00 GB/s]
- [50.03 GB/s, 49.40 GB/s, 50.28 GB/s, 50.08 GB/s, 50.37 GB/s, 49.75 GB/s,
50.01 GB/s, 50.38 GB/s, 49.89 GB/s, 50.24 GB/s]
- [74.37 GB/s, 74.65 GB/s, 74.40 GB/s, 73.45 GB/s, 73.31 GB/s, 73.00 GB/s,
75.49 GB/s, 73.94 GB/s, 74.42 GB/s, 74.80 GB/s]
- [99.51 GB/s, 99.43 GB/s, 98.90 GB/s, 99.83 GB/s, 98.74 GB/s, 100.75 GB/s,
99.33 GB/s, 99.81 GB/s, 100.00 GB/s, 101.06 GB/s]
- [126.04 GB/s, 126.03 GB/s, 124.70 GB/s, 124.86 GB/s, 125.31 GB/s, 124.78
GB/s, 125.99 GB/s, 123.52 GB/s, 124.45 GB/s, 123.01 GB/s]
- [146.95 GB/s, 150.27 GB/s, 151.12 GB/s, 150.93 GB/s, 150.68 GB/s, 149.75
GB/s, 150.67 GB/s, 146.01 GB/s, 148.34 GB/s, 149.15 GB/s]
- [169.40 GB/s, 172.12 GB/s, 172.40 GB/s, 171.99 GB/s, 172.57 GB/s, 171.95
GB/s, 167.06 GB/s, 169.66 GB/s, 168.34 GB/s, 169.45 GB/s]
- [192.68 GB/s, 191.98 GB/s, 192.82 GB/s, 191.84 GB/s, 191.97 GB/s, 196.91
GB/s, 193.36 GB/s, 190.12 GB/s, 192.04 GB/s, 193.93 GB/s]
triad:
- [24.78 GB/s, 25.03 GB/s, 25.07 GB/s, 24.81 GB/s, 24.65 GB/s, 24.80 GB/s,
24.71 GB/s, 25.15 GB/s, 24.70 GB/s, 24.25 GB/s]
- [49.63 GB/s, 48.68 GB/s, 49.73 GB/s, 49.97 GB/s, 50.37 GB/s, 49.89 GB/s,
49.59 GB/s, 49.00 GB/s, 49.96 GB/s, 49.61 GB/s]
- [74.88 GB/s, 74.99 GB/s, 75.31 GB/s, 73.20 GB/s, 74.50 GB/s, 72.88 GB/s,
73.43 GB/s, 73.74 GB/s, 74.59 GB/s, 74.60 GB/s]
- [95.80 GB/s, 97.67 GB/s, 98.93 GB/s, 97.79 GB/s, 98.74 GB/s, 97.74 GB/s,
98.87 GB/s, 99.12 GB/s, 97.90 GB/s, 97.96 GB/s]
- [121.15 GB/s, 120.28 GB/s, 120.66 GB/s, 121.19 GB/s, 121.09 GB/s, 121.68
GB/s, 121.30 GB/s, 123.22 GB/s, 122.51 GB/s, 123.25 GB/s]
- [146.72 GB/s, 146.38 GB/s, 146.25 GB/s, 146.49 GB/s, 146.29 GB/s, 144.30
GB/s, 142.89 GB/s, 150.29 GB/s, 146.37 GB/s, 146.30 GB/s]
- [166.36 GB/s, 168.18 GB/s, 168.79 GB/s, 170.27 GB/s, 169.26 GB/s, 170.98
GB/s, 170.77 GB/s, 171.43 GB/s, 169.53 GB/s, 171.60 GB/s]
- [190.83 GB/s, 197.81 GB/s, 196.29 GB/s, 197.12 GB/s, 196.21 GB/s, 188.40
GB/s, 191.07 GB/s, 195.14 GB/s, 192.48 GB/s, 194.23 GB/s]
update:
- [32.74 GB/s, 32.98 GB/s, 32.73 GB/s, 32.57 GB/s, 32.63 GB/s, 32.41 GB/s,
32.61 GB/s, 32.24 GB/s, 32.52 GB/s, 32.49 GB/s]
- [65.22 GB/s, 65.07 GB/s, 64.65 GB/s, 65.26 GB/s, 63.70 GB/s, 64.19 GB/s,
64.35 GB/s, 64.83 GB/s, 65.60 GB/s, 63.99 GB/s]
- [97.60 GB/s, 96.65 GB/s, 97.50 GB/s, 96.07 GB/s, 97.12 GB/s, 96.41 GB/s,
96.85 GB/s, 96.80 GB/s, 97.10 GB/s, 97.10 GB/s]
- [129.18 GB/s, 127.79 GB/s, 129.50 GB/s, 129.46 GB/s, 128.85 GB/s, 128.69
GB/s, 129.02 GB/s, 130.34 GB/s, 129.92 GB/s, 129.11 GB/s]
- [160.00 GB/s, 161.81 GB/s, 160.37 GB/s, 159.56 GB/s, 160.38 GB/s, 161.91
GB/s, 160.54 GB/s, 161.43 GB/s, 160.59 GB/s, 162.76 GB/s]
- [192.24 GB/s, 193.69 GB/s, 191.11 GB/s, 190.65 GB/s, 193.10 GB/s, 191.30
GB/s, 192.50 GB/s, 193.37 GB/s, 191.98 GB/s, 194.12 GB/s]
- [221.45 GB/s, 229.02 GB/s, 226.33 GB/s, 224.81 GB/s, 225.62 GB/s, 224.79
GB/s, 226.03 GB/s, 227.09 GB/s, 226.46 GB/s, 225.88 GB/s]
- [255.45 GB/s, 256.52 GB/s, 254.06 GB/s, 257.76 GB/s, 256.85 GB/s, 256.27
GB/s, 260.35 GB/s, 259.96 GB/s, 258.40 GB/s, 255.79 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20 MB, 13.20
MB, 13.20 MB]
MEM:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [11.12 GB/s, 20.53 GB/s, 24.86 GB/s, 26.20 GB/s, 26.47 GB/s, 26.35
GB/s, 26.24 GB/s, 26.17 GB/s]
daxpy: [16.10 GB/s, 30.00 GB/s, 36.88 GB/s, 38.86 GB/s, 39.36 GB/s, 39.19
GB/s, 39.02 GB/s, 38.88 GB/s]
load: [12.30 GB/s, 23.50 GB/s, 33.04 GB/s, 40.59 GB/s, 44.03 GB/s, 44.56
GB/s, 44.26 GB/s, 43.77 GB/s]
triad: [12.41 GB/s, 24.13 GB/s, 29.24 GB/s, 30.73 GB/s, 30.68 GB/s, 30.58
GB/s, 30.54 GB/s, 30.63 GB/s]
update: [17.40 GB/s, 31.16 GB/s, 36.80 GB/s, 39.06 GB/s, 39.80 GB/s, 39.77
GB/s, 39.50 GB/s, 39.24 GB/s]
size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
MB, 42.86 MB, 37.50 MB]
size per thread: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
MB, 42.86 MB, 37.50 MB]
stats:
copy:
- [10.83 GB/s, 10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 10.82 GB/s, 10.82 GB/s,
10.83 GB/s, 10.81 GB/s, 10.82 GB/s, 11.12 GB/s]
- [20.34 GB/s, 20.38 GB/s, 20.37 GB/s, 20.34 GB/s, 20.41 GB/s, 20.39 GB/s,
20.39 GB/s, 20.39 GB/s, 20.53 GB/s, 20.35 GB/s]
- [24.70 GB/s, 24.76 GB/s, 24.80 GB/s, 24.86 GB/s, 24.75 GB/s, 24.80 GB/s,
24.77 GB/s, 24.82 GB/s, 24.81 GB/s, 24.73 GB/s]
- [26.10 GB/s, 26.16 GB/s, 26.14 GB/s, 26.16 GB/s, 26.10 GB/s, 26.15 GB/s,
26.10 GB/s, 26.15 GB/s, 26.11 GB/s, 26.20 GB/s]
- [26.45 GB/s, 26.44 GB/s, 26.41 GB/s, 26.43 GB/s, 26.45 GB/s, 26.44 GB/s,
26.46 GB/s, 26.47 GB/s, 26.45 GB/s, 26.44 GB/s]
- [26.34 GB/s, 26.30 GB/s, 26.31 GB/s, 26.33 GB/s, 26.26 GB/s, 26.35 GB/s,
26.30 GB/s, 26.30 GB/s, 26.30 GB/s, 26.34 GB/s]
- [26.20 GB/s, 26.24 GB/s, 26.21 GB/s, 26.22 GB/s, 26.22 GB/s, 26.20 GB/s,
26.20 GB/s, 26.23 GB/s, 26.22 GB/s, 26.23 GB/s]
- [26.15 GB/s, 26.17 GB/s, 26.12 GB/s, 26.15 GB/s, 26.15 GB/s, 26.15 GB/s,
26.12 GB/s, 26.14 GB/s, 26.14 GB/s, 26.17 GB/s]
daxpy:
- [15.77 GB/s, 15.77 GB/s, 16.04 GB/s, 15.68 GB/s, 15.72 GB/s, 15.76 GB/s,
15.91 GB/s, 15.77 GB/s, 16.10 GB/s, 16.04 GB/s]
- [29.88 GB/s, 29.80 GB/s, 30.00 GB/s, 29.87 GB/s, 29.87 GB/s, 30.00 GB/s,
29.79 GB/s, 29.80 GB/s, 29.80 GB/s, 29.82 GB/s]
- [36.63 GB/s, 36.73 GB/s, 36.64 GB/s, 36.64 GB/s, 36.81 GB/s, 36.88 GB/s,
36.62 GB/s, 36.65 GB/s, 36.74 GB/s, 36.71 GB/s]
- [38.82 GB/s, 38.83 GB/s, 38.86 GB/s, 38.81 GB/s, 38.81 GB/s, 38.82 GB/s,
38.85 GB/s, 38.80 GB/s, 38.84 GB/s, 38.73 GB/s]
- [39.32 GB/s, 39.30 GB/s, 39.34 GB/s, 39.36 GB/s, 39.28 GB/s, 39.33 GB/s,
39.31 GB/s, 39.25 GB/s, 39.32 GB/s, 39.33 GB/s]
- [39.10 GB/s, 39.12 GB/s, 39.14 GB/s, 39.16 GB/s, 39.17 GB/s, 39.17 GB/s,
39.13 GB/s, 39.15 GB/s, 39.14 GB/s, 39.19 GB/s]
- [39.01 GB/s, 39.01 GB/s, 39.02 GB/s, 39.02 GB/s, 39.00 GB/s, 39.00 GB/s,
38.97 GB/s, 39.02 GB/s, 38.98 GB/s, 39.01 GB/s]
- [38.76 GB/s, 38.86 GB/s, 38.83 GB/s, 38.82 GB/s, 38.87 GB/s, 38.88 GB/s,
38.81 GB/s, 38.83 GB/s, 38.88 GB/s, 38.88 GB/s]
load:
- [11.97 GB/s, 11.96 GB/s, 11.98 GB/s, 11.97 GB/s, 11.96 GB/s, 12.05 GB/s,
12.30 GB/s, 12.18 GB/s, 11.97 GB/s, 11.96 GB/s]
- [22.85 GB/s, 22.85 GB/s, 22.87 GB/s, 22.94 GB/s, 23.50 GB/s, 22.86 GB/s,
22.86 GB/s, 23.25 GB/s, 22.85 GB/s, 22.86 GB/s]
- [33.04 GB/s, 32.43 GB/s, 32.51 GB/s, 32.52 GB/s, 32.52 GB/s, 32.81 GB/s,
32.77 GB/s, 32.54 GB/s, 32.53 GB/s, 32.53 GB/s]
- [39.95 GB/s, 39.94 GB/s, 39.93 GB/s, 40.15 GB/s, 40.59 GB/s, 40.36 GB/s,
40.28 GB/s, 39.93 GB/s, 39.94 GB/s, 39.98 GB/s]
- [43.98 GB/s, 43.86 GB/s, 43.90 GB/s, 43.80 GB/s, 43.83 GB/s, 43.86 GB/s,
44.03 GB/s, 43.94 GB/s, 43.83 GB/s, 43.92 GB/s]
- [44.46 GB/s, 44.34 GB/s, 44.56 GB/s, 44.51 GB/s, 44.32 GB/s, 44.32 GB/s,
44.51 GB/s, 44.48 GB/s, 44.32 GB/s, 44.34 GB/s]
- [44.03 GB/s, 44.26 GB/s, 44.08 GB/s, 44.18 GB/s, 44.10 GB/s, 43.99 GB/s,
44.07 GB/s, 44.06 GB/s, 43.94 GB/s, 43.97 GB/s]
- [43.48 GB/s, 43.77 GB/s, 43.51 GB/s, 43.49 GB/s, 43.47 GB/s, 43.73 GB/s,
43.55 GB/s, 43.68 GB/s, 43.49 GB/s, 43.50 GB/s]
triad:
- [12.11 GB/s, 12.02 GB/s, 12.03 GB/s, 12.10 GB/s, 12.03 GB/s, 12.04 GB/s,
12.05 GB/s, 12.17 GB/s, 12.02 GB/s, 12.41 GB/s]
- [23.43 GB/s, 23.25 GB/s, 23.25 GB/s, 23.36 GB/s, 23.28 GB/s, 23.24 GB/s,
23.61 GB/s, 23.29 GB/s, 23.31 GB/s, 24.13 GB/s]
- [28.92 GB/s, 29.10 GB/s, 29.17 GB/s, 29.04 GB/s, 28.91 GB/s, 29.16 GB/s,
28.82 GB/s, 29.01 GB/s, 29.24 GB/s, 28.88 GB/s]
- [30.65 GB/s, 30.62 GB/s, 30.73 GB/s, 30.59 GB/s, 30.69 GB/s, 30.68 GB/s,
30.59 GB/s, 30.59 GB/s, 30.57 GB/s, 30.67 GB/s]
- [30.53 GB/s, 30.67 GB/s, 30.65 GB/s, 30.53 GB/s, 30.63 GB/s, 30.68 GB/s,
30.50 GB/s, 30.67 GB/s, 30.64 GB/s, 30.67 GB/s]
- [30.45 GB/s, 30.58 GB/s, 30.51 GB/s, 30.49 GB/s, 30.52 GB/s, 30.49 GB/s,
30.56 GB/s, 30.55 GB/s, 30.47 GB/s, 30.47 GB/s]
- [30.51 GB/s, 30.47 GB/s, 30.50 GB/s, 30.47 GB/s, 30.52 GB/s, 30.54 GB/s,
30.54 GB/s, 30.50 GB/s, 30.49 GB/s, 30.50 GB/s]
- [30.58 GB/s, 30.34 GB/s, 30.56 GB/s, 30.54 GB/s, 30.63 GB/s, 30.53 GB/s,
30.59 GB/s, 30.50 GB/s, 30.54 GB/s, 30.47 GB/s]
update:
- [17.33 GB/s, 17.32 GB/s, 17.34 GB/s, 17.35 GB/s, 17.40 GB/s, 17.35 GB/s,
17.36 GB/s, 17.39 GB/s, 17.35 GB/s, 17.35 GB/s]
- [31.12 GB/s, 31.15 GB/s, 31.10 GB/s, 31.16 GB/s, 31.07 GB/s, 31.08 GB/s,
31.09 GB/s, 31.12 GB/s, 31.12 GB/s, 31.08 GB/s]
- [36.80 GB/s, 36.42 GB/s, 35.92 GB/s, 36.39 GB/s, 35.99 GB/s, 35.98 GB/s,
36.37 GB/s, 36.39 GB/s, 36.38 GB/s, 36.44 GB/s]
- [39.03 GB/s, 39.05 GB/s, 39.02 GB/s, 39.06 GB/s, 39.01 GB/s, 39.02 GB/s,
39.02 GB/s, 39.00 GB/s, 39.00 GB/s, 39.00 GB/s]
- [39.76 GB/s, 39.80 GB/s, 39.80 GB/s, 39.78 GB/s, 39.76 GB/s, 39.79 GB/s,
39.79 GB/s, 39.77 GB/s, 39.77 GB/s, 39.71 GB/s]
- [39.71 GB/s, 39.72 GB/s, 39.72 GB/s, 39.66 GB/s, 39.74 GB/s, 39.70 GB/s,
39.76 GB/s, 39.74 GB/s, 39.77 GB/s, 39.74 GB/s]
- [39.50 GB/s, 39.47 GB/s, 39.45 GB/s, 39.43 GB/s, 39.46 GB/s, 39.45 GB/s,
39.45 GB/s, 39.40 GB/s, 39.43 GB/s, 39.47 GB/s]
- [39.21 GB/s, 39.18 GB/s, 39.19 GB/s, 39.19 GB/s, 39.21 GB/s, 39.19 GB/s,
39.18 GB/s, 39.21 GB/s, 39.20 GB/s, 39.24 GB/s]
threads: [1, 2, 3, 4, 5, 6, 7, 8]
threads per core: 1
total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
MB, 300.00 MB, 300.00 MB]
2:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [10.79 GB/s, 20.46 GB/s, 24.69 GB/s, 25.42 GB/s, 25.63 GB/s, 25.45
GB/s, 25.32 GB/s, 25.06 GB/s]
daxpy: [15.97 GB/s, 29.70 GB/s, 35.95 GB/s, 37.55 GB/s, 37.81 GB/s, 37.78
GB/s, 37.64 GB/s, 37.33 GB/s]
load: [13.46 GB/s, 25.84 GB/s, 35.75 GB/s, 40.54 GB/s, 42.38 GB/s, 42.30
GB/s, 41.85 GB/s, 41.19 GB/s]
triad: [12.05 GB/s, 22.53 GB/s, 27.53 GB/s, 29.10 GB/s, 29.68 GB/s, 29.79
GB/s, 29.85 GB/s, 29.64 GB/s]
update: [19.12 GB/s, 33.86 GB/s, 38.51 GB/s, 39.38 GB/s, 39.20 GB/s, 38.80
GB/s, 38.39 GB/s, 38.02 GB/s]
size per core: [300.00 MB, 150.00 MB, 100.00 MB, 75.00 MB, 60.00 MB, 50.00
MB, 42.86 MB, 37.50 MB]
size per thread: [150.00 MB, 75.00 MB, 50.00 MB, 37.50 MB, 30.00 MB, 25.00
MB, 21.43 MB, 18.75 MB]
stats:
copy:
- [10.71 GB/s, 10.69 GB/s, 10.71 GB/s, 10.70 GB/s, 10.79 GB/s, 10.58 GB/s,
10.70 GB/s, 10.69 GB/s, 10.69 GB/s, 10.70 GB/s]
- [20.27 GB/s, 20.31 GB/s, 20.27 GB/s, 20.26 GB/s, 20.31 GB/s, 20.26 GB/s,
20.24 GB/s, 20.26 GB/s, 20.26 GB/s, 20.46 GB/s]
- [24.69 GB/s, 24.66 GB/s, 24.64 GB/s, 24.63 GB/s, 24.67 GB/s, 24.64 GB/s,
24.64 GB/s, 24.68 GB/s, 24.61 GB/s, 24.63 GB/s]
- [25.42 GB/s, 25.41 GB/s, 25.40 GB/s, 25.36 GB/s, 25.40 GB/s, 25.39 GB/s,
25.40 GB/s, 25.38 GB/s, 25.41 GB/s, 25.39 GB/s]
- [25.55 GB/s, 25.57 GB/s, 25.58 GB/s, 25.63 GB/s, 25.57 GB/s, 25.57 GB/s,
25.58 GB/s, 25.55 GB/s, 25.57 GB/s, 25.49 GB/s]
- [25.42 GB/s, 25.42 GB/s, 25.41 GB/s, 25.39 GB/s, 25.40 GB/s, 25.43 GB/s,
25.45 GB/s, 25.44 GB/s, 25.43 GB/s, 25.43 GB/s]
- [25.27 GB/s, 25.31 GB/s, 25.28 GB/s, 25.31 GB/s, 25.32 GB/s, 25.31 GB/s,
25.29 GB/s, 25.30 GB/s, 25.25 GB/s, 25.28 GB/s]
- [25.03 GB/s, 25.01 GB/s, 25.01 GB/s, 25.04 GB/s, 25.00 GB/s, 25.03 GB/s,
25.06 GB/s, 25.04 GB/s, 25.04 GB/s, 25.04 GB/s]
daxpy:
- [15.81 GB/s, 15.81 GB/s, 15.97 GB/s, 15.62 GB/s, 15.64 GB/s, 15.83 GB/s,
15.63 GB/s, 15.82 GB/s, 15.81 GB/s, 15.63 GB/s]
- [29.62 GB/s, 29.56 GB/s, 29.61 GB/s, 29.59 GB/s, 29.70 GB/s, 29.61 GB/s,
29.65 GB/s, 29.65 GB/s, 29.58 GB/s, 29.59 GB/s]
- [35.95 GB/s, 35.89 GB/s, 35.92 GB/s, 35.92 GB/s, 35.95 GB/s, 35.90 GB/s,
35.87 GB/s, 35.90 GB/s, 35.92 GB/s, 35.82 GB/s]
- [37.55 GB/s, 37.46 GB/s, 37.52 GB/s, 37.51 GB/s, 37.55 GB/s, 37.51 GB/s,
37.44 GB/s, 37.41 GB/s, 37.50 GB/s, 37.40 GB/s]
- [37.79 GB/s, 37.76 GB/s, 37.80 GB/s, 37.77 GB/s, 37.76 GB/s, 37.81 GB/s,
37.78 GB/s, 37.81 GB/s, 37.79 GB/s, 37.78 GB/s]
- [37.71 GB/s, 37.68 GB/s, 37.68 GB/s, 37.73 GB/s, 37.74 GB/s, 37.66 GB/s,
37.78 GB/s, 37.74 GB/s, 37.71 GB/s, 37.70 GB/s]
- [37.61 GB/s, 37.60 GB/s, 37.61 GB/s, 37.62 GB/s, 37.64 GB/s, 37.61 GB/s,
37.60 GB/s, 37.59 GB/s, 37.63 GB/s, 37.60 GB/s]
- [37.23 GB/s, 37.21 GB/s, 37.26 GB/s, 37.27 GB/s, 37.28 GB/s, 37.33 GB/s,
37.29 GB/s, 37.31 GB/s, 37.26 GB/s, 37.29 GB/s]
load:
- [13.34 GB/s, 13.36 GB/s, 13.35 GB/s, 13.34 GB/s, 13.35 GB/s, 13.38 GB/s,
13.46 GB/s, 13.35 GB/s, 13.35 GB/s, 13.35 GB/s]
- [25.63 GB/s, 25.64 GB/s, 25.84 GB/s, 25.64 GB/s, 25.74 GB/s, 25.63 GB/s,
25.64 GB/s, 25.63 GB/s, 25.64 GB/s, 25.68 GB/s]
- [35.38 GB/s, 35.56 GB/s, 35.50 GB/s, 35.75 GB/s, 35.50 GB/s, 35.39 GB/s,
35.46 GB/s, 35.39 GB/s, 35.75 GB/s, 35.40 GB/s]
- [40.37 GB/s, 40.37 GB/s, 40.49 GB/s, 40.49 GB/s, 40.42 GB/s, 40.37 GB/s,
40.54 GB/s, 40.39 GB/s, 40.37 GB/s, 40.51 GB/s]
- [42.34 GB/s, 42.14 GB/s, 42.26 GB/s, 42.17 GB/s, 42.10 GB/s, 42.13 GB/s,
42.38 GB/s, 42.13 GB/s, 42.21 GB/s, 42.15 GB/s]
- [42.30 GB/s, 42.13 GB/s, 42.20 GB/s, 42.11 GB/s, 42.12 GB/s, 42.12 GB/s,
42.18 GB/s, 42.25 GB/s, 42.19 GB/s, 42.21 GB/s]
- [41.70 GB/s, 41.76 GB/s, 41.85 GB/s, 41.80 GB/s, 41.71 GB/s, 41.71 GB/s,
41.80 GB/s, 41.70 GB/s, 41.76 GB/s, 41.75 GB/s]
- [41.02 GB/s, 41.01 GB/s, 41.17 GB/s, 41.12 GB/s, 41.13 GB/s, 41.15 GB/s,
41.19 GB/s, 41.01 GB/s, 41.10 GB/s, 41.06 GB/s]
triad:
- [11.87 GB/s, 11.89 GB/s, 11.91 GB/s, 11.81 GB/s, 11.83 GB/s, 11.85 GB/s,
11.90 GB/s, 11.80 GB/s, 11.85 GB/s, 12.05 GB/s]
- [22.53 GB/s, 22.47 GB/s, 22.44 GB/s, 22.46 GB/s, 22.43 GB/s, 22.52 GB/s,
22.41 GB/s, 22.52 GB/s, 22.48 GB/s, 22.41 GB/s]
- [27.43 GB/s, 27.42 GB/s, 27.47 GB/s, 27.47 GB/s, 27.52 GB/s, 27.49 GB/s,
27.41 GB/s, 27.42 GB/s, 27.51 GB/s, 27.53 GB/s]
- [29.02 GB/s, 29.03 GB/s, 29.03 GB/s, 29.04 GB/s, 28.89 GB/s, 29.10 GB/s,
29.02 GB/s, 29.05 GB/s, 28.93 GB/s, 29.01 GB/s]
- [29.66 GB/s, 29.68 GB/s, 29.60 GB/s, 29.62 GB/s, 29.60 GB/s, 29.67 GB/s,
29.66 GB/s, 29.62 GB/s, 29.62 GB/s, 29.62 GB/s]
- [29.78 GB/s, 29.76 GB/s, 29.77 GB/s, 29.77 GB/s, 29.75 GB/s, 29.79 GB/s,
29.75 GB/s, 29.77 GB/s, 29.76 GB/s, 29.78 GB/s]
- [29.82 GB/s, 29.85 GB/s, 29.85 GB/s, 29.83 GB/s, 29.82 GB/s, 29.83 GB/s,
29.83 GB/s, 29.81 GB/s, 29.81 GB/s, 29.80 GB/s]
- [29.54 GB/s, 29.63 GB/s, 29.57 GB/s, 29.56 GB/s, 29.55 GB/s, 29.64 GB/s,
29.60 GB/s, 29.53 GB/s, 29.54 GB/s, 29.57 GB/s]
update:
- [18.66 GB/s, 18.67 GB/s, 18.66 GB/s, 19.12 GB/s, 18.67 GB/s, 18.67 GB/s,
18.67 GB/s, 18.67 GB/s, 18.70 GB/s, 18.67 GB/s]
- [33.61 GB/s, 33.34 GB/s, 33.71 GB/s, 33.31 GB/s, 33.34 GB/s, 33.86 GB/s,
33.62 GB/s, 33.35 GB/s, 33.54 GB/s, 33.34 GB/s]
- [38.51 GB/s, 38.46 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s, 38.46 GB/s,
38.41 GB/s, 38.42 GB/s, 38.43 GB/s, 38.41 GB/s]
- [39.37 GB/s, 39.34 GB/s, 39.36 GB/s, 39.35 GB/s, 39.37 GB/s, 39.38 GB/s,
39.36 GB/s, 39.35 GB/s, 39.31 GB/s, 39.32 GB/s]
- [39.17 GB/s, 39.17 GB/s, 39.16 GB/s, 39.20 GB/s, 39.18 GB/s, 39.17 GB/s,
39.18 GB/s, 39.15 GB/s, 39.20 GB/s, 39.17 GB/s]
- [38.79 GB/s, 38.79 GB/s, 38.80 GB/s, 38.78 GB/s, 38.78 GB/s, 38.75 GB/s,
38.80 GB/s, 38.77 GB/s, 38.78 GB/s, 38.78 GB/s]
- [38.36 GB/s, 38.37 GB/s, 38.37 GB/s, 38.39 GB/s, 38.36 GB/s, 38.37 GB/s,
38.38 GB/s, 38.37 GB/s, 38.35 GB/s, 38.39 GB/s]
- [37.98 GB/s, 37.99 GB/s, 38.02 GB/s, 38.01 GB/s, 38.01 GB/s, 38.00 GB/s,
38.02 GB/s, 38.00 GB/s, 38.02 GB/s, 38.02 GB/s]
threads: [2, 4, 6, 8, 10, 12, 14, 16]
threads per core: 2
total size: [300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00 MB, 300.00
MB, 300.00 MB, 300.00 MB]
import sympy
import numpy
import pystencils
from pystencils.datahandling import create_data_handling
def test_max():
    """Check kernels generated from ``sympy.Max`` with one, two and many arguments.

    Three scalar arrays are created on a periodic 10x10 domain and filled
    (ghost layers included) with x = 0.0, y = 1.0 and z = 2.0.  Every kernel
    writes its result into ``x``; after each run the whole ``x`` array must
    equal the analytically known maximum.
    """
    dh = create_data_handling(domain_size=(10, 10), periodicity=True)

    # Register each array and fill it right away, in the order x, y, z.
    arrays = {}
    for name, fill_value in (('x', 0.0), ('y', 1.0), ('z', 2.0)):
        arrays[name] = dh.add_array(name, values_per_cell=1)
        dh.fill(name, fill_value, ghost_layers=True)

    target = arrays['x'].center
    y_c = arrays['y'].center
    z_c = arrays['z'].center

    # (right-hand side, value expected in every cell of x after the run)
    cases = [
        # single argument: Max(1.0 + 3.3)
        (sympy.Max(y_c + 3.3), 4.3),
        # two arguments: Max(0.5, 1.0 - 1.5)
        (sympy.Max(0.5, y_c - 1.5), 0.5),
        # many arguments: Max(2.0, 4.5, -0.5, 3.0)
        (sympy.Max(z_c, 4.5, y_c - 1.5, y_c + z_c), 4.5),
    ]

    # All kernels are compiled up front, before any of them is executed.
    kernels = [
        pystencils.create_kernel(pystencils.Assignment(target, rhs)).compile()
        for rhs, _ in cases
    ]

    for kernel, (_, expected) in zip(kernels, cases):
        dh.run_kernel(kernel)
        assert numpy.all(dh.cpu_arrays["x"] == expected)
def test_min():
    """Check kernels generated from ``sympy.Min`` with one, two and many arguments.

    Three scalar arrays are created on a periodic 10x10 domain and filled
    (ghost layers included) with x = 0.0, y = 1.0 and z = 2.0.  Every kernel
    writes its result into ``x``; after each run the whole ``x`` array must
    equal the analytically known minimum.
    """
    dh = create_data_handling(domain_size=(10, 10), periodicity=True)

    # Register each array and fill it right away, in the order x, y, z.
    arrays = {}
    for name, fill_value in (('x', 0.0), ('y', 1.0), ('z', 2.0)):
        arrays[name] = dh.add_array(name, values_per_cell=1)
        dh.fill(name, fill_value, ghost_layers=True)

    target = arrays['x'].center
    y_c = arrays['y'].center
    z_c = arrays['z'].center

    # (right-hand side, value expected in every cell of x after the run)
    cases = [
        # single argument: Min(1.0 + 3.3)
        (sympy.Min(y_c + 3.3), 4.3),
        # two arguments: Min(0.5, 1.0 - 1.5)
        (sympy.Min(0.5, y_c - 1.5), -0.5),
        # many arguments: Min(2.0, 4.5, -0.5, 3.0)
        (sympy.Min(z_c, 4.5, y_c - 1.5, y_c + z_c), -0.5),
    ]

    # All kernels are compiled up front, before any of them is executed.
    kernels = [
        pystencils.create_kernel(pystencils.Assignment(target, rhs)).compile()
        for rhs, _ in cases
    ]

    for kernel, (_, expected) in zip(kernels, cases):
        dh.run_kernel(kernel)
        assert numpy.all(dh.cpu_arrays["x"] == expected)
import sympy
import pystencils
from pystencils.data_types import cast_func, create_type
def test_abs():
    """Generate GPU code for ``Abs`` of an int64-cast field value.

    The test passes when the generated source does not contain ``fabs(``.
    """
    x, y, _ = pystencils.fields('x, y, z: float64[2d]')
    int_type = create_type('int64')

    # Abs is applied to a value that has been explicitly cast to int64.
    abs_of_int = sympy.Abs(cast_func(y[0, 0], int_type))
    assignments = pystencils.AssignmentCollection({x[0, 0]: abs_of_int})

    code = pystencils.get_code_str(
        pystencils.create_kernel(assignments, target="gpu")
    )
    print(code)

    assert 'fabs(' not in code
"""
Test of pystencils.data_types.address_of
"""
import sympy as sp
import pystencils
from pystencils.data_types import PointerType, address_of, cast_func, create_type
from pystencils.simp.simplifications import sympy_cse
def test_address_of():
    """Exercise ``address_of`` on field accesses and plain symbols.

    Checks ``canonical()`` and the resulting pointer dtype, then builds and
    prints two kernels: one storing the address in a typed pointer symbol
    first, one casting the address directly.
    """
    x, y = pystencils.fields('x,y: int64[2d]')
    int64 = create_type('int64')
    s = pystencils.TypedSymbol('s', PointerType(int64))
    x_center = x[0, 0]
    y_center = y[0, 0]

    # The address of a field access resolves back to that access and is a
    # restrict pointer to the field's element type.
    assert address_of(x_center).canonical() == x_center
    assert address_of(x_center).dtype == PointerType(x_center.dtype, restrict=True)
    # An untyped symbol yields a restrict void pointer.
    assert address_of(sp.Symbol("a")).dtype == PointerType('void', restrict=True)

    via_pointer_symbol = {
        s: address_of(x_center),
        y_center: cast_func(s, int64),
    }
    direct_cast = {
        y_center: cast_func(address_of(x_center), int64),
    }

    for main_assignments in (via_pointer_symbol, direct_cast):
        collection = pystencils.AssignmentCollection(main_assignments, {})
        pystencils.show_code(pystencils.create_kernel(collection))
def test_address_of_with_cse():
    """Build kernels from ``address_of`` expressions before and after ``sympy_cse``.

    Both assignments share the sub-expression "address of x cast to int64";
    a kernel is created and printed for the plain collection and again for
    the collection returned by ``sympy_cse``.
    """
    x, y = pystencils.fields('x,y: int64[2d]')
    int64 = create_type('int64')
    s = pystencils.TypedSymbol('s', PointerType(int64))

    # Sub-expression that occurs in both right-hand sides.
    address_as_int = cast_func(address_of(x[0, 0]), int64)

    plain = pystencils.AssignmentCollection({
        y[0, 0]: address_as_int + s,
        x[0, 0]: address_as_int + 1,
    }, {})
    pystencils.show_code(pystencils.create_kernel(plain))

    with_cse = sympy_cse(plain)
    pystencils.show_code(pystencils.create_kernel(with_cse))
%% Cell type:markdown id: tags:
# pystencils - LLVM generation
The generation of LLVM code is similar, but not identical, to that of the C++ version. For the generation itself, the Python module ``llvmlite`` is used. This module provides the necessary support and bindings for LLVM. In order to generate LLVM code from the AST, the AST needs to be transformed to support type conversions. This is the biggest difference to the C++ version. C++ doesn't need that since the language itself handles the casts.
In this example a simple weighted Jacobi kernel is generated, so the focus remains on the part of LLVM generation.
%% Cell type:code id: tags:
``` python
import pytest
pytest.importorskip('llvmlite')
```
%% Cell type:code id: tags:
``` python
import sympy as sp
import numpy as np
import ctypes
from pystencils import Field, Assignment
from pystencils import create_kernel
from pystencils.display_utils import to_dot
sp.init_printing()
```
%% Cell type:markdown id: tags:
The numpy arrays (with initial values) are used to create the *Field*s for the update rule. Later those arrays are used for the computation itself.
%% Cell type:code id: tags:
``` python
src_arr = np.zeros((30, 20))
src_arr[0,:] = 1.0
src_arr[:,0] = 1.0
dst_arr = src_arr.copy()
src_field = Field.create_from_numpy_array('src', src_arr)
dst_field = Field.create_from_numpy_array('dst', dst_arr)
```
%% Cell type:markdown id: tags:
Using the *Field* objects and the additional *Symbol* $\omega$ for the weight the update rule is specified as a *sympy* equation.
%% Cell type:code id: tags:
``` python
omega = sp.symbols("omega")
update_rule = Assignment(dst_field[0,0], omega * (src_field[0,1] + src_field[0,-1] + src_field[1,0] + src_field[-1,0]) / 4
+ (1.-omega)*src_field[0,0])
update_rule
```
%% Output
$\displaystyle {{dst}_{(0,0)}} \leftarrow {{src}_{(0,0)}} \left(1.0 - \omega\right) + \frac{\omega \left({{src}_{(1,0)}} + {{src}_{(0,1)}} + {{src}_{(0,-1)}} + {{src}_{(-1,0)}}\right)}{4}$
ω⋅(src_E + src_N + src_S + src_W)
dst_C := src_C⋅(1.0 - ω) + ─────────────────────────────────
4
%% Cell type:markdown id: tags:
With this update rule an abstract syntax tree (AST) can be created. This AST can be used to print the LLVM code. The creation follows the same routines as the C++ version does. However at the end there are two more steps. In order to generate LLVM, type casting and pointer arithmetic had to be introduced (which C++ does for you).
%% Cell type:code id: tags:
``` python
ast = create_kernel([update_rule], target='llvm')
print(str(ast))
```
%% Output
KernelFunction kernel([_data_dst, _data_src, omega])
Block for(ctr_0=1; ctr_0<29; ctr_0+=1)
Block _data_dst_00 ← pointer_arithmetic_func(_data_dst, 20*ctr_0)
_data_src_00 ← pointer_arithmetic_func(_data_src, 20*ctr_0)
_data_src_01 ← pointer_arithmetic_func(_data_src, 20*ctr_0 + 20)
_data_src_0m1 ← pointer_arithmetic_func(_data_src, 20*ctr_0 - 20)
for(ctr_1=1; ctr_1<19; ctr_1+=1)
Block _data_dst_00[ctr_1] ← omega*(_data_src_00[ctr_1 + 1] + _data_src_00[ctr_1 - 1] + _data_src_01[ctr_1] + _data_src_0m1[ctr_1])*cast_func(1/4, double) + (omega*cast_func(-1, double) + cast_func(1.0, double))*_data_src_00[ctr_1]
%% Cell type:markdown id: tags:
It is possible to examine the AST further.
%% Cell type:code id: tags:
``` python
to_dot(ast)
```
%% Output
<graphviz.files.Source at 0x7f23dbdc5c10>
%% Cell type:markdown id: tags:
With the transformed AST it is now possible to generate and compile the AST into LLVM. Notice that, unlike in the C++ version, no files are written to the hard drive (although it is possible).
There are multiple ways to generate and compile the AST. The simplest one is similar to the C++ version: using the ``compile()`` function of the generated AST.
You can also manually create a python function with ``make_python_function``.
Another option is obtaining the jit itself with ``generate_and_jit``.
The function ``generate_and_jit`` first generates and then compiles the AST.
If even more control is needed, it is possible to use the functions ``generateLLVM`` and ``compileLLVM`` to achieve the same. For further control, instead of calling ``compileLLVM``, the jit object itself can be created and its necessary functions for the compilation have to be run manually (``parse``, (``optimize``,) ``compile``).
%% Cell type:code id: tags:
``` python
kernel = ast.compile()
#kernel = make_python_function(ast)
# Or alternatively
#jit = generate_and_jit(ast)
# Call: jit('kernel', src_arr, dst_arr)
```
%% Cell type:markdown id: tags:
The compiled function(s) can be used now. Either call the function (with arguments, if not given before) or call the jit object with the function's name and its arguments. Here, numpy arrays are automatically adjusted with ctypes.
The functions and arguments can be read as well.
**All of the information the jit object has comes from the module which was parsed. If you parse a second module and don't run the compilation step, the module and the compiled code are not the same anymore, thus leading to false information**
%% Cell type:code id: tags:
``` python
#jit.print_functions()
```
%% Cell type:code id: tags:
``` python
# Apply the compiled kernel 100 times; after every sweep the two arrays trade roles.
for _sweep in range(100):
    kernel(src=src_arr, dst=dst_arr, omega=2/3)
    dst_arr, src_arr = src_arr, dst_arr
```
%% Cell type:markdown id: tags:
The output is drawn with matplotlib.
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
from matplotlib import cm
# Show dst_arr as an image on a single subplot using the 'jet' colour map.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(dst_arr, cmap=cm.jet)
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
```
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import itertools
import numpy as np
import pytest
import sympy
from sympy.functions import im, re
import pystencils
from pystencils import AssignmentCollection
from pystencils.data_types import TypedSymbol, create_type
# complex64 fields, plus float32 fields that receive real / imaginary parts below.
X, Y = pystencils.fields('x, y: complex64[2d]')
A, B = pystencils.fields('a, b: float32[2d]')
# Plain (untyped) sympy symbols used as intermediate left-hand sides.
S1, S2, T = sympy.symbols('S1, S2, T')
# Assignment collections that test_complex_numbers is parametrized over.
TEST_ASSIGNMENTS = [
    # Store a purely imaginary constant.
    AssignmentCollection({X[0, 0]: 1j}),
    # Split Y into real/imaginary parts and recombine them into X.
    AssignmentCollection({
        S1: re(Y.center),
        S2: im(Y.center),
        X[0, 0]: 2j * S1 + S2
    }),
    # Write real and imaginary part into two real-valued fields.
    AssignmentCollection({
        A.center: re(Y.center),
        B.center: im(Y.center),
    }),
    # Mix a real part, a complex field value and a complex constant.
    AssignmentCollection({
        Y.center: re(Y.center) + X.center + 2j,
    }),
    # Divide by a complex-valued temporary.
    AssignmentCollection({
        T: 2 + 4j,
        Y.center: X.center / T,
    })
]
# NOTE(review): not referenced by the parametrization of test_complex_numbers, which uses (np.float32,).
SCALAR_DTYPES = ['float32', 'float64']
@pytest.mark.parametrize("assignment, scalar_dtypes",
                         itertools.product(TEST_ASSIGNMENTS, (np.float32,)))
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
def test_complex_numbers(assignment, scalar_dtypes, target):
    """Generate code for a complex-valued assignment and make sure it compiles.

    The generated source must not contain the marker text "Not supported".
    For the GPU target the test is skipped when pycuda cannot be imported;
    this check happens after code generation but before compilation.
    """
    kernel_ast = pystencils.create_kernel(assignment,
                                          target=target,
                                          data_type=scalar_dtypes)
    generated = pystencils.get_code_str(kernel_ast)
    print(generated)
    assert "Not supported" not in generated

    if target == 'gpu':
        pytest.importorskip('pycuda')

    assert kernel_ast.compile() is not None
# Double-precision counterparts. These rebind the module-level names used above;
# decorators that were already evaluated keep the objects they captured.
X, Y = pystencils.fields('x, y: complex128[2d]')
A, B = pystencils.fields('a, b: float64[2d]')
S1, S2 = sympy.symbols('S1, S2')
# Explicitly complex128-typed temporary (takes the place of the untyped T above).
T128 = TypedSymbol('ts', create_type('complex128'))
# Assignment collections that test_complex_numbers_64 is parametrized over.
TEST_ASSIGNMENTS = [
    # Store a purely imaginary constant.
    AssignmentCollection({X[0, 0]: 1j}),
    # Split Y into real/imaginary parts and recombine them into X.
    AssignmentCollection({
        S1: re(Y.center),
        S2: im(Y.center),
        X[0, 0]: 2j * S1 + S2
    }),
    # Write real and imaginary part into two real-valued fields.
    AssignmentCollection({
        A.center: re(Y.center),
        B.center: im(Y.center),
    }),
    # Mix a real part, a complex field value and a complex constant.
    AssignmentCollection({
        Y.center: re(Y.center) + X.center + 2j,
    }),
    # Divide by the typed complex temporary.
    AssignmentCollection({
        T128: 2 + 4j,
        Y.center: X.center / T128,
    })
]
# NOTE(review): not referenced by the tests visible in this file.
SCALAR_DTYPES = ['float64']
@pytest.mark.parametrize("assignment", TEST_ASSIGNMENTS)
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
def test_complex_numbers_64(assignment, target):
    """Same check as the single-precision test, with data_type='double'.

    Code is generated, printed and inspected for "Not supported"; the kernel
    is then compiled (GPU runs are skipped if pycuda is unavailable).
    """
    kernel_ast = pystencils.create_kernel(assignment,
                                          target=target,
                                          data_type='double')
    generated = pystencils.get_code_str(kernel_ast)
    print(generated)
    assert "Not supported" not in generated

    if target == 'gpu':
        pytest.importorskip('pycuda')

    assert kernel_ast.compile() is not None
@pytest.mark.parametrize('dtype', (np.float32, np.float64))
@pytest.mark.parametrize('target', ('cpu', 'gpu'))
@pytest.mark.parametrize('with_complex_argument', ('with_complex_argument', False))
def test_complex_execution(dtype, target, with_complex_argument):
    """Run ``y = x + a`` on zero-filled complex arrays and expect ``2j+1`` everywhere.

    ``a`` is either a typed complex kernel argument (passed at call time) or
    the literal ``2j+1`` baked into the assignment. On the GPU target the
    arrays are pycuda arrays and the result is copied back before comparing.
    """
    complex_dtype = f'complex{64 if dtype ==np.float32 else 128}'
    x, y = pystencils.fields(f'x, y: {complex_dtype}[2d]')

    shape = (20, 30)
    offset = 2j+1
    x_arr = np.zeros(shape, complex_dtype)
    y_arr = np.zeros(shape, complex_dtype)

    a = pystencils.TypedSymbol('a', create_type(complex_dtype)) if with_complex_argument else offset

    assignments = AssignmentCollection({
        y.center: x.center + a
    })

    on_gpu = target == 'gpu'
    if on_gpu:
        pytest.importorskip('pycuda')
        from pycuda.gpuarray import zeros
        x_arr = zeros(shape, complex_dtype)
        y_arr = zeros(shape, complex_dtype)

    kernel = pystencils.create_kernel(assignments, target=target, data_type=dtype).compile()

    call_args = {'x': x_arr, 'y': y_arr}
    if with_complex_argument:
        call_args['a'] = offset
    kernel(**call_args)

    if on_gpu:
        y_arr = y_arr.get()
    assert np.allclose(y_arr, offset)
import sympy
import pytest
import pystencils
from pystencils.astnodes import get_dummy_symbol
from pystencils.backends.cuda_backend import CudaSympyPrinter
from pystencils.data_types import address_of
def test_cuda_known_functions():
    """Build and compile a GPU kernel that calls ``atomicAdd`` and ``rsqrtf``.

    The printer's table of known functions is printed first. The test is
    skipped if pycuda is unavailable (checked after the AST has been created).
    """
    print(CudaSympyPrinter().known_functions)

    x, y = pystencils.fields('x,y: float32 [2d]')
    atomic_add = sympy.Function('atomicAdd')
    rsqrt = sympy.Function('rsqrtf')
    assignments = pystencils.AssignmentCollection({
        get_dummy_symbol(): atomic_add(address_of(y.center()), 2),
        y.center(): rsqrt(x[0, 0])
    })

    gpu_ast = pystencils.create_kernel(assignments, 'gpu')
    pytest.importorskip('pycuda')
    pystencils.show_code(gpu_ast)
    assert gpu_ast.compile() is not None
def test_cuda_but_not_c():
    """Create and display a CPU kernel from assignments that use ``atomicAdd`` and ``rsqrtf``.

    There are no assertions; the test only requires kernel creation and code
    display to complete without raising.
    """
    x, y = pystencils.fields('x,y: float32 [2d]')
    atomic_add = sympy.Function('atomicAdd')
    rsqrt = sympy.Function('rsqrtf')
    assignments = pystencils.AssignmentCollection({
        get_dummy_symbol(): atomic_add(address_of(y.center()), 2),
        y.center(): rsqrt(x[0, 0])
    })

    cpu_ast = pystencils.create_kernel(assignments, 'cpu')
    pystencils.show_code(cpu_ast)
def test_cuda_unknown():
    """Create and display a GPU kernel that calls a function named ``wtf``.

    There are no assertions; the test only requires kernel creation and code
    display to complete without raising.
    """
    x, y = pystencils.fields('x,y: float32 [2d]')
    unknown_function = sympy.Function('wtf')
    assignments = pystencils.AssignmentCollection({
        get_dummy_symbol(): unknown_function(address_of(y.center()), 2),
    })

    gpu_ast = pystencils.create_kernel(assignments, 'gpu')
    pystencils.show_code(gpu_ast)
%% Cell type:code id: tags:
``` python
import pytest
pytest.importorskip('graphviz')
```
%% Cell type:code id: tags:
``` python
from pystencils.session import *
from pystencils.astnodes import Block, Conditional
```
%% Cell type:code id: tags:
``` python
# Two 2D double fields with C layout.
src, dst = ps.fields("src, dst: double[2D]", layout='c')
# Each branch is a one-assignment block: copy from the neighbour at offset -1 or +1 in the first coordinate.
true_block = Block([ps.Assignment(dst[0, 0], src[-1, 0])])
false_block = Block([ps.Assignment(dst[0, 0], src[1, 0])])
# The update rule is true_block followed by a Conditional that selects between both blocks.
ur = [true_block, Conditional(dst.center() > 0.0, true_block, false_block)]
ast = ps.create_kernel(ur)
```
%% Cell type:code id: tags:
``` python
# Render the kernel AST as a graphviz graph; the 'size' entry is passed through as a graph style option.
ps.to_dot(ast, graph_style={'size': "9.5,12.5"})
```
%% Output
<graphviz.files.Source at 0x7f62452c4110>
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
from os.path import dirname, join
import numpy as np
import sympy
import pystencils
from pystencils.interpolation_astnodes import LinearInterpolator
# pyconrad is only used for displaying images; if importing it fails for any
# reason, substitute a MagicMock so the imshow calls below become no-ops.
try:
    import pyconrad.autoinit
except Exception:
    import unittest.mock
    pyconrad = unittest.mock.MagicMock()
LENNA_FILE = join(dirname(__file__), 'test_data', 'lenna.png')
# Load the test image as float32 grayscale; fall back to a random 20x30 array
# if scikit-image or the file cannot be used.
try:
    import skimage.io
    lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float32)
except Exception:
    lenna = np.random.rand(20, 30).astype(np.float32)
def test_rotate_center():
    """Resample the test image through a rotation expressed in physical coordinates.

    The kernel is rebuilt three times: with the initial coordinate transforms,
    after halving the input field's transform, and after additionally
    dividing the output field's transform by three. Results are only
    displayed via pyconrad; nothing is asserted.
    """
    x, y = pystencils.fields('x, y: float32[2d]')

    # Rotate around the center by setting the coordinate origins to the field centers
    x.set_coordinate_origin_to_field_center()
    y.set_coordinate_origin_to_field_center()

    rotation_angle = sympy.pi / 5
    transform_matrix = sympy.rot_axis3(rotation_angle)[:2, :2]

    # Generic matrix transform works like that (for rotation it would be more clever to use transform_matrix.T)
    inverse_matrix = transform_matrix.inv()

    def resample():
        """Build, compile and run the interpolation kernel for the current coordinate transforms."""
        input_coordinate = x.physical_to_index(inverse_matrix @ y.physical_coordinates)
        assignments = pystencils.AssignmentCollection({
            y.center(): LinearInterpolator(x).at(input_coordinate)
        })
        kernel = pystencils.create_kernel(assignments).compile()
        result = np.zeros_like(lenna)
        kernel(x=lenna, y=result)
        return result

    rotated = resample()
    pyconrad.imshow(lenna, "lenna")
    pyconrad.imshow(rotated, "rotated")

    # If distance in input field is twice as close we will see a smaller image
    x.coordinate_transform /= 2
    pyconrad.imshow(resample(), "rotated smaller")

    # Conversely, if output field has samples 3 times closer we will see a bigger image
    y.coordinate_transform /= 3
    pyconrad.imshow(resample(), "rotated bigger")

    # coordinate_transform can be any matrix, also with symbols as entries
def main():
    """Run test_rotate_center directly, without going through pytest."""
    test_rotate_center()


# Allow executing this file as a script.
if __name__ == '__main__':
    main()
import numpy as np
from pystencils import Assignment, Field
from pystencils.cpu import create_indexed_kernel, make_python_function
def test_indexed_kernel():
    """Scatter values from an index list into a 3x4 array with an indexed CPU kernel.

    Every record of the index array carries a cell coordinate ('x', 'y') and
    a 'value'; after the kernel ran, each addressed cell must hold its value.
    """
    arr = np.zeros((3, 4))
    record_dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
    index_arr = np.array([(0, 2, 3.0), (1, 3, 42.0), (2, 1, 5.0)], dtype=record_dtype)

    indexed_field = Field.create_from_numpy_array('index', index_arr)
    normal_field = Field.create_from_numpy_array('f', arr)
    update_rule = Assignment(normal_field[0, 0], indexed_field('value'))

    kernel = make_python_function(create_indexed_kernel([update_rule], [indexed_field]))
    kernel(f=arr, index=index_arr)

    for entry in index_arr:
        np.testing.assert_allclose(arr[entry['x'], entry['y']], entry['value'], atol=1e-13)
def test_indexed_cuda_kernel():
    """Scatter values from an index list into a 3x4 array with an indexed CUDA kernel.

    The test is reported as skipped when pycuda cannot be imported, instead of
    printing a message and passing silently. Otherwise the arrays are uploaded
    to the GPU, the kernel is run, the result is copied back into ``arr`` and
    every addressed cell is compared with its expected value.
    """
    # Imported locally because this module does not import pytest at file level.
    import pytest
    pytest.importorskip('pycuda')

    # These local imports deliberately shadow the CPU make_python_function of this module.
    from pystencils.gpucuda import make_python_function
    import pycuda.gpuarray as gpuarray
    from pystencils.gpucuda.kernelcreation import created_indexed_cuda_kernel

    arr = np.zeros((3, 4))
    dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
    index_arr = np.zeros((3,), dtype=dtype)
    index_arr[0] = (0, 2, 3.0)
    index_arr[1] = (1, 3, 42.0)
    index_arr[2] = (2, 1, 5.0)

    indexed_field = Field.create_from_numpy_array('index', index_arr)
    normal_field = Field.create_from_numpy_array('f', arr)
    update_rule = Assignment(normal_field[0, 0], indexed_field('value'))
    ast = created_indexed_cuda_kernel([update_rule], [indexed_field])
    kernel = make_python_function(ast)

    gpu_arr = gpuarray.to_gpu(arr)
    gpu_index_arr = gpuarray.to_gpu(index_arr)
    kernel(f=gpu_arr, index=gpu_index_arr)
    # Copy the device result back into the host array.
    gpu_arr.get(arr)

    for entry in index_arr:
        np.testing.assert_allclose(arr[entry['x'], entry['y']], entry['value'], atol=1e-13)
# -*- coding: utf-8 -*-
#
# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
#
# Distributed under terms of the GPLv3 license.
"""
"""
import itertools
from os.path import dirname, join
import numpy as np
import pytest
import sympy
import pystencils
from pystencils.interpolation_astnodes import LinearInterpolator
from pystencils.spatial_coordinates import x_, y_
# Maps the numpy scalar types used in the parametrized tests to the dtype
# names that appear in the pystencils field descriptions.
type_map = {
    np.float32: 'float32',
    np.float64: 'float64',
    np.int32: 'int32',
}
# pyconrad is only used for displaying images; if importing it fails for any
# reason, substitute a MagicMock so the imshow calls become no-ops.
try:
    import pyconrad.autoinit
except Exception:
    import unittest.mock
    pyconrad = unittest.mock.MagicMock()
LENNA_FILE = join(dirname(__file__), 'test_data', 'lenna.png')
# Load the test image as float64 grayscale; fall back to a random 20x30 array
# if anything in the try body fails.
try:
    import skimage.io
    lenna = skimage.io.imread(LENNA_FILE, as_gray=True).astype(np.float64)
    pyconrad.imshow(lenna)
except Exception:
    lenna = np.random.rand(20, 30)
def test_interpolation():
    """Sample the input field at a constant sub-pixel shift with a linear interpolator.

    Assignments, AST and generated code are printed, the kernel is run on the
    test image and the result is displayed; nothing is asserted.
    """
    x_f, y_f = pystencils.fields('x,y: float64 [2d]')
    shifted_position = [x_ + 2.7, y_ + 7.2]

    assignments = pystencils.AssignmentCollection({
        y_f.center(): LinearInterpolator(x_f).at(shifted_position)
    })
    print(assignments)

    kernel_ast = pystencils.create_kernel(assignments)
    print(kernel_ast)
    print(pystencils.show_code(kernel_ast))
    kernel = kernel_ast.compile()

    pyconrad.imshow(lenna)

    out = np.zeros_like(lenna)
    kernel(x=lenna, y=out)
    pyconrad.imshow(out, "out")
def test_scale_interpolation():
    """Run a scaled and shifted linear interpolation once for each address mode.

    For 'border', 'wrap', 'clamp' and 'mirror' a kernel is created, printed,
    compiled, executed on the test image and the output displayed.
    """
    x_f, y_f = pystencils.fields('x,y: float64 [2d]')

    for address_mode in ('border', 'wrap', 'clamp', 'mirror'):
        interpolator = LinearInterpolator(x_f, address_mode=address_mode)
        assignments = pystencils.AssignmentCollection({
            y_f.center(): interpolator.at([0.5 * x_ + 2.7, 0.25 * y_ + 7.2])
        })
        print(assignments)

        kernel_ast = pystencils.create_kernel(assignments)
        print(kernel_ast)
        print(pystencils.show_code(kernel_ast))
        kernel = kernel_ast.compile()

        out = np.zeros_like(lenna)
        kernel(x=lenna, y=out)
        pyconrad.imshow(out, "out " + address_mode)
@pytest.mark.parametrize('address_mode', ['border', 'clamp'])
def test_rotate_interpolation(address_mode):
    """Sample the input field at coordinates rotated by pi/5 on the CPU.

    Only 'border' and 'clamp' are parametrized:
    'wrap', 'mirror' currently fails on new sympy due to conjugate()
    """
    x_f, y_f = pystencils.fields('x,y: float64 [2d]')

    rotation = sympy.rot_axis3(sympy.pi / 5)[:2, :2]
    transformed = rotation * sympy.Matrix((x_, y_))

    assignments = pystencils.AssignmentCollection({
        y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
    })
    print(assignments)

    kernel_ast = pystencils.create_kernel(assignments)
    print(kernel_ast)
    print(pystencils.show_code(kernel_ast))
    kernel = kernel_ast.compile()

    out = np.zeros_like(lenna)
    kernel(x=lenna, y=out)
    pyconrad.imshow(out, "out " + address_mode)
@pytest.mark.parametrize('dtype', (np.int32, np.float32, np.float64))
@pytest.mark.parametrize('address_mode', ('border', 'wrap', 'clamp', 'mirror'))
@pytest.mark.parametrize('use_textures', ('use_textures', False))
def test_rotate_interpolation_gpu(dtype, address_mode, use_textures):
    """Rotated and shifted linear interpolation on the GPU, with and without textures.

    Skipped when pycuda is unavailable. For int32 the image is multiplied by
    255 before the conversion. The output is only displayed, not asserted.
    """
    pytest.importorskip('pycuda')
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit  # noqa

    rotation_angle = sympy.pi / 5
    scale = 1

    host_image = lenna * 255 if dtype == np.int32 else lenna
    lenna_gpu = gpuarray.to_gpu(np.ascontiguousarray(host_image, dtype))

    x_f, y_f = pystencils.fields(f'x,y: {type_map[dtype]} [2d]', ghost_layers=0)

    rotation = sympy.rot_axis3(rotation_angle)[:2, :2]
    transformed = scale * rotation * sympy.Matrix((x_, y_)) - sympy.Matrix([2, 2])

    assignments = pystencils.AssignmentCollection({
        y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
    })
    print(assignments)

    kernel_ast = pystencils.create_kernel(assignments, target='gpu', use_textures_for_interpolation=use_textures)
    print(kernel_ast)
    print(pystencils.show_code(kernel_ast))
    kernel = kernel_ast.compile()

    out = gpuarray.zeros_like(lenna_gpu)
    kernel(x=lenna_gpu, y=out)
    pyconrad.imshow(out,
                    f"out {address_mode} texture:{use_textures} {type_map[dtype]}")
@pytest.mark.parametrize('address_mode', ['border', 'wrap', 'mirror'])
@pytest.mark.parametrize('dtype', [np.float64, np.float32, np.int32])
@pytest.mark.parametrize('use_textures', ('use_textures', False,))
def test_shift_interpolation_gpu(address_mode, dtype, use_textures):
    """Linear interpolation on the GPU at unrotated, unshifted coordinates.

    'mirror' is skipped on sympy 1.0/1.1 ("% printed as fmod on old sympy").
    The whole test is skipped when pycuda is unavailable. For int32 the image
    is multiplied by 255 before the conversion. The output is only displayed,
    not asserted.
    """
    # Only parse the sympy version when it matters, i.e. for 'mirror'.
    if address_mode == 'mirror':
        sver = sympy.__version__.split(".")
        if int(sver[0]) == 1 and int(sver[1]) < 2:
            pytest.skip("% printed as fmod on old sympy")

    pytest.importorskip('pycuda')
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit  # noqa

    rotation_angle = 0
    scale = 1
    shift = sympy.Matrix((0.0, 0.0))

    if dtype == np.int32:
        lenna_gpu = gpuarray.to_gpu(
            np.ascontiguousarray(lenna * 255, dtype))
    else:
        lenna_gpu = gpuarray.to_gpu(
            np.ascontiguousarray(lenna, dtype))

    x_f, y_f = pystencils.fields(f'x,y: {type_map[dtype]} [2d]', ghost_layers=0)

    # The texture and non-texture paths use the same sampling coordinates
    # (the original if/else on use_textures had two identical branches).
    transformed = scale * sympy.rot_axis3(rotation_angle)[:2, :2] * sympy.Matrix((x_, y_)) + shift

    assignments = pystencils.AssignmentCollection({
        y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
    })

    ast = pystencils.create_kernel(assignments, target='gpu', use_textures_for_interpolation=use_textures)
    print(pystencils.show_code(ast))
    kernel = ast.compile()

    out = gpuarray.zeros_like(lenna_gpu)
    kernel(x=lenna_gpu, y=out)
    pyconrad.imshow(out,
                    f"out {address_mode} texture:{use_textures} {type_map[dtype]}")
@pytest.mark.parametrize('address_mode', ['border', 'clamp'])
def test_rotate_interpolation_size_change(address_mode):
    """Rotated interpolation into a 100x150 output array instead of an image-sized one.

    Only 'border' and 'clamp' are parametrized:
    'wrap', 'mirror' currently fails on new sympy due to conjugate()
    """
    x_f, y_f = pystencils.fields('x,y: float64 [2d]')

    rotation = sympy.rot_axis3(sympy.pi / 5)[:2, :2]
    transformed = rotation * sympy.Matrix((x_, y_))

    assignments = pystencils.AssignmentCollection({
        y_f.center(): LinearInterpolator(x_f, address_mode=address_mode).at(transformed)
    })
    print(assignments)

    kernel_ast = pystencils.create_kernel(assignments)
    print(kernel_ast)
    print(pystencils.show_code(kernel_ast))
    kernel = kernel_ast.compile()

    out = np.zeros((100, 150), np.float64)
    kernel(x=lenna, y=out)
    pyconrad.imshow(out, "small out " + address_mode)
@pytest.mark.parametrize('address_mode, target',
                         itertools.product(['border', 'wrap', 'clamp', 'mirror'], ['cpu']))
def test_field_interpolated(address_mode, target):
    """Use ``Field.interpolated_access`` at scaled and shifted coordinates.

    The kernel is created for the given target, printed, compiled, run on the
    test image and the output displayed; nothing is asserted.
    """
    x_f, y_f = pystencils.fields('x,y: float64 [2d]')
    sample_position = [0.5 * x_ + 2.7, 0.25 * y_ + 7.2]

    assignments = pystencils.AssignmentCollection({
        y_f.center(): x_f.interpolated_access(sample_position, address_mode=address_mode)
    })
    print(assignments)

    kernel_ast = pystencils.create_kernel(assignments, target=target)
    print(kernel_ast)
    print(pystencils.show_code(kernel_ast))
    kernel = kernel_ast.compile()

    out = np.zeros_like(lenna)
    kernel(x=lenna, y=out)
    pyconrad.imshow(out, "out " + address_mode)
def test_spatial_derivative():
    """Differentiate an interpolated access with respect to one of its coordinate fields and print it."""
    x, y = pystencils.fields('x, y: float32[2d]')
    tx, ty = pystencils.fields('t_x, t_y: float32[2d]')

    access = x.interpolated_access((tx.center, ty.center))
    diff = sympy.diff(access, tx.center)
    print("diff: " + str(diff))
%% Cell type:code id: tags:
``` python
from pystencils.session import *
```
%% Cell type:code id: tags:
``` python
# Periodic 256x256 data handling with one array 'c', filled with 0.0 including its ghost layers.
dh = ps.create_data_handling(domain_size=(256, 256), periodicity=True)
c_field = dh.add_array('c')
dh.fill("c", 0.0, ghost_layers=True)
```
%% Cell type:code id: tags:
``` python
# Set rows 0..128 and columns 0..257 of the CPU array to 1.0 with a single
# NumPy slice assignment instead of a Python loop over every element.
# NOTE(review): assumes dh.cpu_arrays['c'] is a NumPy array of at least 129x258 entries - confirm.
dh.cpu_arrays['c'][:129, :258] = 1.0
```
%% Cell type:code id: tags:
``` python
plt.scalar_field(dh.cpu_arrays["c"])
```
%% Output
<matplotlib.image.AxesImage at 0x7fcb7d253710>
%% Cell type:code id: tags:
``` python
# Each cell takes the value of its neighbour at offset +1 in the first coordinate.
ur = ps.Assignment(c_field[0, 0], c_field[1, 0])
# Create the kernel for the data handling's default target with OpenMP enabled for CPU, then compile it.
ast = ps.create_kernel(ur, target=dh.default_target, cpu_openmp=True)
kernel = ast.compile()
```
%% Cell type:code id: tags:
``` python
c_sync = dh.synchronization_function_cpu(['c'])
```
%% Cell type:code id: tags:
``` python
def timeloop(steps=10):
    """Run ``steps`` iterations of synchronisation followed by the kernel, then return the gathered 'c' array."""
    for _step in range(steps):
        c_sync()
        dh.run_kernel(kernel)
    gathered = dh.gather_array('c')
    return gathered
```
%% Cell type:code id: tags:
``` python
ps.jupyter.set_display_mode('video')
```
%% Cell type:code id: tags:
``` python
ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
ps.jupyter.display_animation(ani)
```
%% Output
<IPython.core.display.HTML object>
%% Cell type:code id: tags:
``` python
ps.jupyter.set_display_mode('image_update')
```
%% Cell type:code id: tags:
``` python
ani = ps.plot.scalar_field_animation(timeloop, rescale=True, frames=12)
ps.jupyter.display_animation(ani)
```
%% Output
%% Cell type:code id: tags:
``` python
def grid_update_function(image):
    """Advance the simulation by 40 synchronise-and-run iterations and return the gathered 'c' array.

    The ``image`` argument is accepted but not used.
    """
    for _step in range(40):
        c_sync()
        dh.run_kernel(kernel)
    gathered = dh.gather_array('c')
    return gathered
```
%% Cell type:code id: tags:
``` python
animation = ps.jupyter.make_imshow_animation(dh.cpu_arrays["c"], grid_update_function, frames=300)
```
%% Output
%% Cell type:code id: tags:
``` python
# Call set_display_mode once for each of the three modes (the last call, "image_update", is the one left active),
# then call activate_ipython.
ps.jupyter.set_display_mode("video")
ps.jupyter.set_display_mode("window")
ps.jupyter.set_display_mode("image_update")
ps.jupyter.activate_ipython()
```
import numpy as np
import pytest
import sympy as sp
from pathlib import Path
from kerncraft.kernel import KernelCode
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, ECMData, Benchmark
import pystencils as ps
from pystencils import Assignment, Field
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.cpu import create_kernel
from pystencils.datahandling import create_data_handling
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
from pystencils.timeloop import TimeLoop
SCRIPT_FOLDER = Path(__file__).parent
INPUT_FOLDER = SCRIPT_FOLDER / "kerncraft_inputs"
@pytest.mark.kerncraft
def test_compilation():
    """Exercise kerncraft's code getters on the reference kernel and generate a benchmark for a pystencils kernel.

    The generated benchmark source is only printed; nothing is asserted.
    """
    machine = MachineModel(path_to_yaml=INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml")

    kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
    with open(kernel_file_path) as kernel_file:
        kernel_source = kernel_file.read()
    reference_kernel = KernelCode(kernel_source, machine=machine, filename=kernel_file_path)
    reference_kernel.get_kernel_header(name='test_kernel')
    reference_kernel.get_kernel_code(name='test_kernel')
    reference_kernel.get_main_code(kernel_function_name='test_kernel')

    arr = np.zeros([30, 50, 3])
    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
    s = sp.Symbol("s")
    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]

    kernel_ast = create_kernel([Assignment(b[0, 0], s * rhs)])
    print(generate_benchmark(kernel_ast, likwid=False))
@pytest.mark.kerncraft
def analysis(kernel, machine, model='ecmdata'):
    """Construct the requested kerncraft model for ``kernel``, run ``analyze()`` and return it.

    ``model`` selects 'ecmdata' (ECMData), 'ecm' (ECM) or 'benchmark'
    (Benchmark); any other value falls back to ECM.
    """
    model_classes = {
        'ecmdata': ECMData,
        'ecm': ECM,
        'benchmark': Benchmark,
    }
    model_class = model_classes.get(model, ECM)
    model = model_class(kernel, machine, KerncraftParameters())
    model.analyze()
    return model
@pytest.mark.kerncraft
def test_3d_7pt_osaca():
    """Compare the flop count of the pystencils 3D 7-point kernel with the reference C kernel.

    Both kernels are analysed with the 'ecm' model. The generated kernel code
    must contain an OpenMP parallel pragma and the generated main code a
    likwid marker initialisation.
    """
    size = [20, 200, 200]
    machine_model = MachineModel(path_to_yaml=INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml")

    kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
    with open(kernel_file_path) as kernel_file:
        kernel_source = kernel_file.read()
    reference_kernel = KernelCode(kernel_source, machine=machine_model, filename=kernel_file_path)
    reference_kernel.set_constant('M', size[0])
    reference_kernel.set_constant('N', size[1])
    # Only one constant is set for the last two extents, so they have to agree.
    assert size[1] == size[2]
    analysis(reference_kernel, machine_model, model='ecm')

    arr = np.zeros(size)
    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
    s = sp.Symbol("s")
    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
    kernel_ast = create_kernel([Assignment(b[0, 0, 0], s * rhs)])

    k = PyStencilsKerncraftKernel(kernel_ast, machine=machine_model, debug_print=True)
    analysis(k, machine_model, model='ecm')
    assert reference_kernel._flops == k._flops

    # The second tuple element stays bound to a name exactly as in the original unpacking.
    code_path, lock_handle = k.get_kernel_code(openmp=True)
    with open(code_path) as code_file:
        assert "#pragma omp parallel" in code_file.read()

    code_path, lock_handle = k.get_main_code()
    with open(code_path) as code_file:
        assert "likwid_markerInit();" in code_file.read()
@pytest.mark.kerncraft
def test_2d_5pt():
    """The default analysis of the pystencils 2D 5-point kernel must report the same cycles as the reference C kernel."""
    machine = MachineModel(path_to_yaml=INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml")

    kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
    with open(kernel_file_path) as kernel_file:
        kernel_source = kernel_file.read()
    reference_kernel = KernelCode(kernel_source, machine=machine,
                                  filename=kernel_file_path)
    reference = analysis(reference_kernel, machine)

    arr = np.zeros([30, 50, 3])
    a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
    b = Field.create_from_numpy_array('b', arr, index_dimensions=1)
    s = sp.Symbol("s")
    rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
    kernel_ast = create_kernel([Assignment(b[0, 0], s * rhs)])

    result = analysis(PyStencilsKerncraftKernel(kernel_ast, machine), machine)

    cycle_pairs = zip(reference.results['cycles'], result.results['cycles'])
    for expected_cycles, actual_cycles in cycle_pairs:
        assert expected_cycles == actual_cycles
@pytest.mark.kerncraft
def test_3d_7pt():
    """The default analysis of the pystencils 3D 7-point kernel must report the same cycles as the reference C kernel."""
    size = [30, 50, 50]
    machine = MachineModel(path_to_yaml=INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml")

    kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
    with open(kernel_file_path) as kernel_file:
        kernel_source = kernel_file.read()
    reference_kernel = KernelCode(kernel_source, machine=machine,
                                  filename=kernel_file_path)
    reference_kernel.set_constant('M', size[0])
    reference_kernel.set_constant('N', size[1])
    # Only one constant is set for the last two extents, so they have to agree.
    assert size[1] == size[2]
    reference = analysis(reference_kernel, machine)

    arr = np.zeros(size)
    a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
    b = Field.create_from_numpy_array('b', arr, index_dimensions=0)
    s = sp.Symbol("s")
    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
    kernel_ast = create_kernel([Assignment(b[0, 0, 0], s * rhs)])

    result = analysis(PyStencilsKerncraftKernel(kernel_ast, machine), machine)

    cycle_pairs = zip(reference.results['cycles'], result.results['cycles'])
    for expected_cycles, actual_cycles in cycle_pairs:
        assert expected_cycles == actual_cycles
@pytest.mark.kerncraft
def test_benchmark():
    """The result of ``run_c_benchmark`` must agree with the TimeLoop benchmark result to four decimals."""
    size = [30, 50, 50]
    template = np.zeros(size)
    a = Field.create_from_numpy_array('a', template, index_dimensions=0)
    b = Field.create_from_numpy_array('b', template, index_dimensions=0)
    s = sp.Symbol("s")
    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
    kernel_ast = create_kernel([Assignment(b[0, 0, 0], s * rhs)])

    c_benchmark_run = run_c_benchmark(kernel_ast, inner_iterations=1000, outer_iterations=1)

    kernel = kernel_ast.compile()
    kernel_arguments = {
        'a': np.full(size, fill_value=0.23),
        'b': np.full(size, fill_value=0.23),
        's': 0.23,
    }

    timeloop = TimeLoop(steps=1)
    timeloop.add_call(kernel, kernel_arguments)
    timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)

    np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)
@pytest.mark.kerncraft
def test_benchmark_vectorized():
    """Run the C benchmark for a vectorized 7-point kernel once per supported instruction set.

    Skipped when no instruction set can be detected. Nothing is asserted
    about the benchmark results.
    """
    instruction_sets = get_supported_instruction_sets()
    if not instruction_sets:
        pytest.skip("cannot detect CPU instruction set")

    for vec in instruction_sets:
        dh = create_data_handling((20, 20, 20), periodicity=True)

        # Array alignment is the instruction set's 'width' entry times 8.
        alignment = get_vector_instruction_set(instruction_set=vec)['width'] * 8
        a = dh.add_array("a", values_per_cell=1, alignment=alignment)
        b = dh.add_array("b", values_per_cell=1, alignment=alignment)

        rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
        update_rule = Assignment(b[0, 0, 0], rhs)

        vectorize_info = {
            'instruction_set': vec,
            'assume_aligned': True,
            'nontemporal': True,
            'assume_inner_stride_one': True,
        }
        kernel_ast = ps.create_kernel(update_rule, cpu_vectorize_info=vectorize_info)
        run_c_benchmark(kernel_ast, 5)
import pytest
# Import everything the LLVM tests need. If any of these modules is missing,
# let pytest skip the whole module when llvmlite is the one that is unavailable.
# NOTE(review): if a different module is missing while llvmlite is importable,
# the names from the try body stay undefined - confirm this cannot happen.
try:
    from pystencils.llvm.llvmjit import generate_and_jit
    from pystencils.llvm import create_kernel, make_python_function
    from pystencils.cpu.cpujit import get_llc_command
    from pystencils import Assignment, Field, show_code
    import numpy as np
    import sympy as sp
except ModuleNotFoundError:
    pytest.importorskip("llvmlite")
def test_jacobi_fixed_field_size():
    """Compare one LLVM-jitted 2D Jacobi sweep on fixed-size fields with a NumPy reference."""
    size = (30, 20)

    src_field_llvm = np.random.rand(*size)
    src_field_py = np.copy(src_field_llvm)
    dst_field_llvm = np.zeros(size)
    dst_field_py = np.zeros(size)

    f = Field.create_from_numpy_array("f", src_field_llvm)
    d = Field.create_from_numpy_array("d", dst_field_llvm)

    jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
    kernel_ast = create_kernel([jacobi])

    # Reference on interior cells; neighbours are summed in the order
    # (x-1, x+1, y-1, y+1), matching an element-by-element loop.
    dst_field_py[1:-1, 1:-1] = 0.25 * (src_field_py[:-2, 1:-1] + src_field_py[2:, 1:-1] +
                                       src_field_py[1:-1, :-2] + src_field_py[1:-1, 2:])

    jit = generate_and_jit(kernel_ast)
    jit('kernel', dst_field_llvm, src_field_llvm)

    error = np.sum(np.abs(dst_field_py - dst_field_llvm))
    np.testing.assert_almost_equal(error, 0.0)
@pytest.mark.skipif(not get_llc_command(), reason="Tests requires llc in $PATH")
def test_jacobi_fixed_field_size_gpu():
    """Compare one LLVM-jitted 2D Jacobi sweep for the GPU target with a NumPy reference.

    Skipped when pycuda is unavailable (and, via the decorator, when
    ``get_llc_command()`` returns a falsy value).
    """
    pytest.importorskip("pycuda")
    size = (30, 20)

    import pycuda.autoinit  # noqa
    from pycuda.gpuarray import to_gpu

    src_field_llvm = np.random.rand(*size)
    src_field_py = np.copy(src_field_llvm)
    dst_field_llvm = np.zeros(size)
    dst_field_py = np.zeros(size)

    f = Field.create_from_numpy_array("f", src_field_py)
    d = Field.create_from_numpy_array("d", dst_field_py)

    # From here on the *_llvm names refer to device arrays.
    src_field_llvm = to_gpu(src_field_llvm)
    dst_field_llvm = to_gpu(dst_field_llvm)

    jacobi = Assignment(d[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
    kernel_ast = create_kernel([jacobi], target='gpu')
    show_code(kernel_ast)

    # Reference on interior cells; neighbours are summed in the order
    # (x-1, x+1, y-1, y+1), matching an element-by-element loop.
    dst_field_py[1:-1, 1:-1] = 0.25 * (src_field_py[:-2, 1:-1] + src_field_py[2:, 1:-1] +
                                       src_field_py[1:-1, :-2] + src_field_py[1:-1, 2:])

    jit = generate_and_jit(kernel_ast)
    jit('kernel', dst_field_llvm, src_field_llvm)

    error = np.sum(np.abs(dst_field_py - dst_field_llvm.get()))
    np.testing.assert_almost_equal(error, 0.0)
def test_jacobi_variable_field_size():
    """Compare an LLVM kernel built from generic 3D fields with a NumPy reference.

    The stencil only uses neighbours along the first two axes.
    """
    size = (3, 3, 3)
    f = Field.create_generic("f", 3)
    d = Field.create_generic("d", 3)
    jacobi = Assignment(d[0, 0, 0], (f[1, 0, 0] + f[-1, 0, 0] + f[0, 1, 0] + f[0, -1, 0]) / 4)
    kernel_ast = create_kernel([jacobi])

    src_field_llvm = np.random.rand(*size)
    src_field_py = np.copy(src_field_llvm)
    dst_field_llvm = np.zeros(size)
    dst_field_py = np.zeros(size)

    # Reference on interior cells; neighbours are summed in the order
    # (x-1, x+1, y-1, y+1), matching an element-by-element loop.
    dst_field_py[1:-1, 1:-1, 1:-1] = 0.25 * (
        src_field_py[:-2, 1:-1, 1:-1] + src_field_py[2:, 1:-1, 1:-1] +
        src_field_py[1:-1, :-2, 1:-1] + src_field_py[1:-1, 2:, 1:-1])

    kernel = make_python_function(kernel_ast, {'f': src_field_llvm, 'd': dst_field_llvm})
    kernel()

    error = np.sum(np.abs(dst_field_py - dst_field_llvm))
    np.testing.assert_almost_equal(error, 0.0)
def test_pow_llvm():
    """Check ``Pow`` with several float exponents on a field filled with 4.

    The same source and destination arrays are reused for every exponent.
    """
    size = (30, 20)
    src_field_llvm = 4 * np.ones(size)
    dst_field_llvm = np.zeros(size)

    f = Field.create_from_numpy_array("f", src_field_llvm)
    d = Field.create_from_numpy_array("d", dst_field_llvm)

    # (exponent, expected value of 4 ** exponent)
    expectations = (
        (-1.0, 0.25),
        (0.5, 2.0),
        (2.0, 16.0),
        (3.0, 64.0),
        (4.0, 256.0),
    )
    for exponent, expected in expectations:
        ur = Assignment(d[0, 0], sp.Pow(f[0, 0], exponent))
        jit = generate_and_jit(create_kernel([ur]))
        jit('kernel', dst_field_llvm, src_field_llvm)
        assert np.all(expected == dst_field_llvm)
def test_piecewise_llvm():
    """Check ``Piecewise`` with each comparison operator in the LLVM backend.

    The source holds 10.0 in rows 0..14 and 0.0 elsewhere. Every case writes
    1.0 where its condition holds and 0.0 otherwise, into freshly zeroed
    arrays, and a region of the destination is compared with the expected
    constant.
    """
    size = (30, 20)

    def fresh_arrays():
        """Return a new (source, destination) pair in the initial state."""
        source = np.zeros(size)
        destination = np.zeros(size)
        source[0:15, :] = 10.0
        return source, destination

    src_field_llvm, dst_field_llvm = fresh_arrays()
    f = Field.create_from_numpy_array("f", src_field_llvm)
    d = Field.create_from_numpy_array("d", dst_field_llvm)

    everywhere = slice(None)
    upper_rows = slice(0, 15)
    lower_rows = slice(15, None)

    # (condition, rows of the destination to check, expected value)
    cases = [
        (f[0, 0] > 10, everywhere, 0.0),
        (f[0, 0] >= 10, upper_rows, 1.0),
        (f[0, 0] < 5, lower_rows, 1.0),
        (f[0, 0] <= 10, everywhere, 1.0),
        (sp.Equality(f[0, 0], 10.0), upper_rows, 1.0),
        (sp.Unequality(f[0, 0], 10.0), lower_rows, 1.0),
    ]

    for condition, rows, expected in cases:
        src_field_llvm, dst_field_llvm = fresh_arrays()
        piecewise_rule = Assignment(d[0, 0], sp.Piecewise((1.0, condition), (0.0, True)))
        jit = generate_and_jit(create_kernel([piecewise_rule]))
        jit('kernel', dst_field_llvm, src_field_llvm)
        assert (np.all(dst_field_llvm[rows, :] == expected))
def test_piecewise_or_llvm():
    """Check a ``Piecewise`` whose condition is an ``Or`` of two inequalities on the LLVM JIT.

    The source array holds 10.5 in rows 0..14 and 0.0 elsewhere. The kernel
    writes 1.0 where ``f > 11 or f < 10`` and 0.0 otherwise, so rows 0..14
    must end up 0.0 and the remaining rows 1.0.
    """
    size = (30, 20)
    src_field_llvm = np.zeros(size)
    dst_field_llvm = np.zeros(size)
    src_field_llvm[0:15, :] = 10.5

    f = Field.create_from_numpy_array("f", src_field_llvm)
    d = Field.create_from_numpy_array("d", dst_field_llvm)

    picewise_test_or = Assignment(d[0, 0], sp.Piecewise((1.0, sp.Or(f[0, 0] > 11, f[0, 0] < 10)), (0.0, True)))
    ast = create_kernel([picewise_test_or])
    jit = generate_and_jit(ast)
    jit('kernel', dst_field_llvm, src_field_llvm)

    # 10.5 is neither > 11 nor < 10, so the fallback branch applies.
    assert np.all(dst_field_llvm[0:15, :] == 0.0)
    # 0.0 < 10 makes the Or true. The destination starts as zeros, so this is
    # the only check that fails if the kernel never writes anything.
    assert np.all(dst_field_llvm[15:, :] == 1.0)
def test_print_function_llvm():
    """Compile ``d = sin(f)`` with the LLVM JIT and run it on an all-zero source array.

    Since sin(0) == 0, the destination array is expected to stay zero everywhere.
    """
    shape = (30, 20)
    source = np.zeros(shape)
    target = np.zeros(shape)
    source[0:15, :] = 0.0

    src_access = Field.create_from_numpy_array("f", source)[0, 0]
    dst_access = Field.create_from_numpy_array("d", target)[0, 0]

    kernel_ast = create_kernel([Assignment(dst_access, sp.sin(src_access))])
    run = generate_and_jit(kernel_ast)
    run('kernel', target, source)

    assert np.all(target[:, :] == 0.0)
# Script entry point: running this file directly executes one test function.
# NOTE(review): test_jacobi_fixed_field_size_gpu is not defined in the visible
# part of this file — confirm it exists, otherwise this raises NameError.
if __name__ == "__main__":
    test_jacobi_fixed_field_size_gpu()
import numpy as np
import pytest
import sympy as sp
import pystencils
from pystencils.backends.cuda_backend import CudaBackend
from pystencils.backends.opencl_backend import OpenClBackend
from pystencils.opencl.opencljit import get_global_cl_queue, make_python_function
# Probe for OpenCL support. HAS_OPENCL is consumed by the
# pytest.mark.skipif decorators on the tests below.
try:
    import pyopencl as cl
    HAS_OPENCL = True
    # NOTE(review): presumably imported for its side effect of setting up the
    # global OpenCL context/queue — confirm against pystencils.opencl.autoinit.
    import pystencils.opencl.autoinit
except Exception:
    # Any failure during the imports above, not only ImportError, marks
    # OpenCL as unavailable.
    HAS_OPENCL = False
def test_print_opencl():
    """Generate OpenCL source for a simple 2D kernel and check for OpenCL-specific constructs."""
    z, y, x = pystencils.fields("z, y, x: [2d]")

    update_rule = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
    })
    print(update_rule)

    kernel_ast = pystencils.create_kernel(update_rule, target='gpu')
    print(kernel_ast)

    pystencils.show_code(kernel_ast, custom_backend=CudaBackend())

    opencl_code = pystencils.get_code_str(kernel_ast, custom_backend=OpenClBackend())
    print(opencl_code)

    generated = str(opencl_code)
    expected_fragments = (
        "__global double * RESTRICT const _data_x",
        "__global double * RESTRICT",
        "get_local_id(0)",
    )
    for fragment in expected_fragments:
        assert fragment in generated
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_opencl_jit_fixed_size():
    """Run one kernel on fixed-size (20x30) fields through both the CUDA and the OpenCL JIT.

    The test is skipped when pycuda cannot be imported. Both backends get the
    same random input and their results must agree within ``np.allclose``.
    """
    pytest.importorskip('pycuda')

    z, y, x = pystencils.fields("z, y, x: [20,30]")
    update_rule = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
    })
    print(update_rule)

    ast = pystencils.create_kernel(update_rule, target='gpu')
    print(ast)

    print(pystencils.show_code(ast, custom_backend=CudaBackend()))
    print(pystencils.show_code(ast, custom_backend=OpenClBackend()))

    cuda_kernel = ast.compile()
    assert cuda_kernel is not None

    import pycuda.gpuarray as gpuarray

    # Host data is created in the order x, y, z.
    host_data = {name: np.random.rand(20, 30) for name in ('x', 'y', 'z')}

    cuda_arrays = {name: gpuarray.to_gpu(values) for name, values in host_data.items()}
    cuda_kernel(**cuda_arrays)
    result_cuda = cuda_arrays['z'].get()

    import pyopencl.array as array

    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)
    cl_arrays = {name: array.to_device(queue, values) for name, values in host_data.items()}

    opencl_kernel = make_python_function(ast, queue, ctx)
    assert opencl_kernel is not None
    opencl_kernel(**cl_arrays)
    result_opencl = cl_arrays['z'].get(queue)

    assert np.allclose(result_cuda, result_opencl)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_opencl_jit():
    """Run one kernel on generic 2D fields through both the CUDA and the OpenCL JIT.

    The test is skipped when pycuda cannot be imported. Both backends get the
    same random 20x30 input and their results must agree within ``np.allclose``.
    """
    pytest.importorskip('pycuda')

    z, y, x = pystencils.fields("z, y, x: [2d]")
    update_rule = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
    })
    print(update_rule)

    ast = pystencils.create_kernel(update_rule, target='gpu')
    print(ast)

    for backend in (CudaBackend(), OpenClBackend()):
        pystencils.show_code(ast, custom_backend=backend)

    cuda_kernel = ast.compile()
    assert cuda_kernel is not None

    import pycuda.gpuarray as gpuarray

    # Host data is created in the order x, y, z.
    host_data = {name: np.random.rand(20, 30) for name in ('x', 'y', 'z')}

    cuda_arrays = {name: gpuarray.to_gpu(values) for name, values in host_data.items()}
    cuda_kernel(**cuda_arrays)
    result_cuda = cuda_arrays['z'].get()

    import pyopencl.array as array

    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)
    cl_arrays = {name: array.to_device(queue, values) for name, values in host_data.items()}

    opencl_kernel = make_python_function(ast, queue, ctx)
    assert opencl_kernel is not None
    opencl_kernel(**cl_arrays)
    result_opencl = cl_arrays['z'].get(queue)

    assert np.allclose(result_cuda, result_opencl)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_opencl_jit_with_parameter():
    """Compare CUDA and OpenCL results for a kernel that takes an extra scalar parameter ``a``.

    The test is skipped when pycuda cannot be imported. Both kernels are
    called with ``a=5.`` on the same random 20x30 input.
    """
    pytest.importorskip('pycuda')

    z, y, x = pystencils.fields("z, y, x: [2d]")
    a = sp.Symbol('a')
    update_rule = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + a
    })
    print(update_rule)

    ast = pystencils.create_kernel(update_rule, target='gpu')
    print(ast)

    print(pystencils.show_code(ast, custom_backend=CudaBackend()))
    print(pystencils.show_code(ast, custom_backend=OpenClBackend()))

    cuda_kernel = ast.compile()
    assert cuda_kernel is not None

    import pycuda.gpuarray as gpuarray

    # Host data is created in the order x, y, z.
    host_data = {name: np.random.rand(20, 30) for name in ('x', 'y', 'z')}

    cuda_arrays = {name: gpuarray.to_gpu(values) for name, values in host_data.items()}
    cuda_kernel(a=5., **cuda_arrays)
    result_cuda = cuda_arrays['z'].get()

    import pyopencl.array as array

    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)
    cl_arrays = {name: array.to_device(queue, values) for name, values in host_data.items()}

    opencl_kernel = make_python_function(ast, queue, ctx)
    assert opencl_kernel is not None
    opencl_kernel(a=5., **cl_arrays)
    result_opencl = cl_arrays['z'].get(queue)

    assert np.allclose(result_cuda, result_opencl)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_without_cuda():
    """Build and launch the OpenCL kernel without touching pycuda.

    Only checks that a callable kernel is produced and that it runs; the
    numerical result is not inspected.
    """
    z, y, x = pystencils.fields("z, y, x: [20,30]")
    update_rule = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
    })
    print(update_rule)

    ast = pystencils.create_kernel(update_rule, target='gpu')
    print(ast)

    print(pystencils.show_code(ast, custom_backend=OpenClBackend()))

    # Host data is created in the order x, y, z.
    host_data = {name: np.random.rand(20, 30) for name in ('x', 'y', 'z')}

    import pyopencl.array as array

    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)
    cl_arrays = {name: array.to_device(queue, values) for name, values in host_data.items()}

    opencl_kernel = make_python_function(ast, queue, ctx)
    assert opencl_kernel is not None
    opencl_kernel(**cl_arrays)
@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
def test_kernel_creation():
    """Create a kernel with ``target='opencl'`` and run it on the global OpenCL queue."""
    # The function-level ``import pystencils...`` below would otherwise turn
    # ``pystencils`` into a local name and break the uses that come before it.
    global pystencils

    z, y, x = pystencils.fields("z, y, x: [20,30]")
    update_rule = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
    })
    print(update_rule)

    import pystencils.opencl.autoinit

    ast = pystencils.create_kernel(update_rule, target='opencl')
    print(ast.backend)

    code = pystencils.get_code_str(ast)
    print(code)
    assert 'get_local_size' in code

    opencl_kernel = ast.compile()

    # Host data is created in the order x, y, z.
    host_data = {name: np.random.rand(20, 30) for name in ('x', 'y', 'z')}

    import pyopencl.array as array

    queue = get_global_cl_queue()
    assert queue
    cl_arrays = {name: array.to_device(queue, values) for name, values in host_data.items()}

    assert opencl_kernel is not None
    opencl_kernel(**cl_arrays)
%% Cell type:code id: tags:
``` python
import pytest
pytest.importorskip('pycuda')
```
%% Cell type:code id: tags:
``` python
# NOTE(review): the star import presumably supplies the `ps`, `sp`, `np` and
# `plt` names used throughout this notebook — confirm against pystencils.session.
from pystencils.session import *
sp.init_printing()
# Shorthand for exact rational coefficients.
frac = sp.Rational
```
%% Cell type:markdown id: tags:
# Phase-field simulation of dendritic solidification in 3D
This notebook tests the model presented in the dendritic growth tutorial in 3D.
%% Cell type:code id: tags:
``` python
target = 'gpu'
gpu = target == 'gpu'
# Small 25^3 domain when `is_test_run` is defined in the notebook globals,
# full 300^3 domain otherwise.
domain_size = (25, 25, 25) if 'is_test_run' in globals() else (300, 300, 300)

dh = ps.create_data_handling(domain_size=domain_size, periodicity=True, default_target=target)
# Three arrays: 'phi', 'phidelta' and 'T'. The time-stepping cells below
# refer to them by these names.
φ_field = dh.add_array('phi', latex_name='φ')
φ_delta_field = dh.add_array('phidelta', latex_name='φ_D')
t_field = dh.add_array('T')
```
%% Cell type:code id: tags:
``` python
# Model symbols. ε and m are later replaced by expressions (see `substitutions`);
# the others receive numeric values through `parameters`.
ε, m, δ, j, θzero, α, γ, Teq, κ, τ = sp.symbols("ε m δ j θ_0 α γ T_eq κ τ")
εb = sp.Symbol("\\bar{\\epsilon}")

# Discretization helper configured with dx=0.03 and dt=1e-5.
discretize = ps.fd.Discretization2ndOrder(dx=0.03, dt=1e-5)

# Center accesses of the phase field and temperature arrays.
φ = φ_field.center
T = t_field.center
# Shorthand: d(expr, i) builds a ps.fd.Diff of expr along coordinate i.
d = ps.fd.Diff
def f(φ, m):
    """Return the quartic polynomial in φ whose cubic and quadratic coefficients depend on m."""
    cubic_coefficient = frac(1, 2) - m / 3
    quadratic_coefficient = frac(1, 4) - m / 2
    return φ**4 / 4 - cubic_coefficient * φ**3 + quadratic_coefficient * φ**2
# Bulk part: the polynomial f evaluated at the phase field, with m still symbolic.
bulk_free_energy_density = f(φ, m)
# Interface part: ε²/2 times the sum of squared first derivatives of φ
# along the three coordinates.
interface_free_energy_density = ε ** 2 / 2 * (d(φ, 0) ** 2 + d(φ, 1) ** 2 + d(φ, 2) ** 2)
```
%% Cell type:markdown id: tags:
Here comes the major change that has to be made for the 3D model: $\epsilon$ depends on the interface normal, which cannot be computed simply via atan() as in the 2D case.
%% Cell type:code id: tags:
``` python
# Vector of first derivatives of φ along the three coordinates.
n = sp.Matrix([d(φ, i) for i in range(3)])
# Its Euclidean length, used again further down to guard against a vanishing gradient.
nLen = sp.sqrt(sum(n_i**2 for n_i in n))
# Normalize to unit length.
n = n / nLen
# Sum of the fourth powers of the normalized components.
nVal = sum(n_i**4 for n_i in n)
σ = δ * nVal
# Direction-dependent ε: base value εb scaled by (1 + σ).
εVal = εb * (1 + σ)
# Bare expression so the notebook displays it.
εVal
```
%% Output
$\displaystyle \bar{\epsilon} \left(δ \left(\frac{{\partial_{0} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{1} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}} + \frac{{\partial_{2} {{φ}_{(0,0,0)}}}^{4}}{\left({\partial_{0} {{φ}_{(0,0,0)}}}^{2} + {\partial_{1} {{φ}_{(0,0,0)}}}^{2} + {\partial_{2} {{φ}_{(0,0,0)}}}^{2}\right)^{2}}\right) + 1\right)$
⎛ ⎛ 4
⎜ ⎜ D(φ[0,0,0])
\bar{\epsilon}⋅⎜δ⋅⎜───────────────────────────────────────────── + ───────────
⎜ ⎜ 2
⎜ ⎜⎛ 2 2 2⎞ ⎛
⎝ ⎝⎝D(φ[0,0,0]) + D(φ[0,0,0]) + D(φ[0,0,0]) ⎠ ⎝D(φ[0,0,0]
4 4
D(φ[0,0,0]) D(φ[0,0,0])
────────────────────────────────── + ─────────────────────────────────────────
2
2 2 2⎞ ⎛ 2 2
) + D(φ[0,0,0]) + D(φ[0,0,0]) ⎠ ⎝D(φ[0,0,0]) + D(φ[0,0,0]) + D(φ[0,0,0]
⎞ ⎞
⎟ ⎟
────⎟ + 1⎟
2⎟ ⎟
2⎞ ⎟ ⎟
) ⎠ ⎠ ⎠
%% Cell type:code id: tags:
``` python
def m_func(temperature):
    """Return (α/π) · atan(γ · (T_eq − temperature))."""
    prefactor = α / sp.pi
    temperature_gap = Teq - temperature
    return prefactor * sp.atan(γ * temperature_gap)
```
%% Cell type:code id: tags:
``` python
# Replace the placeholder symbols m and ε with their full expressions.
substitutions = {m: m_func(T),
                 ε: εVal}

fe_i = interface_free_energy_density.subs(substitutions)
fe_b = bulk_free_energy_density.subs(substitutions)

# Functional derivatives of both energy parts with respect to φ, with the
# derivative operators expanded, treating φ as the only function.
μ_if = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_i, φ), functions=[φ])
μ_b = ps.fd.expand_diff_full(ps.fd.functional_derivative(fe_b, φ), functions=[φ])
```
%% Cell type:code id: tags:
``` python
# Sum of the bulk and interface functional derivatives. The interface part is
# only included where the squared gradient length exceeds 1e-10 — presumably
# because the normal is obtained by dividing by nLen, which breaks down for a
# vanishing gradient.
dF_dφ = μ_b + sp.Piecewise((μ_if, nLen**2 > 1e-10), (0, True))
```
%% Cell type:code id: tags:
``` python
# Numeric values substituted into the symbolic equations before discretization.
parameters = {
    τ: 0.0003,
    κ: 1.8,
    εb: 0.01,
    δ: 0.3,
    γ: 10,
    j: 6,
    α: 0.9,
    Teq: 1.0,
    θzero: 0.2,
    # Replace the symbolic constant π by its floating-point value.
    sp.pi: sp.pi.evalf()
}
# Bare expression so the notebook displays the dictionary.
parameters
```
%% Output
$\displaystyle \left\{ \pi : 3.14159265358979, \ T_{eq} : 1.0, \ \bar{\epsilon} : 0.01, \ j : 6, \ α : 0.9, \ γ : 10, \ δ : 0.3, \ θ_{0} : 0.2, \ κ : 1.8, \ τ : 0.0003\right\}$
{π: 3.14159265358979, T_eq: 1.0, \bar{\epsilon}: 0.01, j: 6, α: 0.9, γ: 10, δ:
0.3, θ₀: 0.2, κ: 1.8, τ: 0.0003}
%% Cell type:code id: tags:
``` python
# Rate of change of φ: negative functional derivative divided by τ.
dφ_dt = - dF_dφ / τ
# First assignment: store the discretized rate of change in the 'phidelta' array.
assignments = [
    ps.Assignment(φ_delta_field.center, discretize(dφ_dt.subs(parameters))),
]
φEqs = ps.simp.sympy_cse_on_assignment_list(assignments)
# Second assignment: update φ itself from the stored 'phidelta' value.
φEqs.append(ps.Assignment(φ, discretize(ps.fd.transient(φ) - φ_delta_field.center)))

# Temperature equation: transient term, diffusion term with coefficient 1,
# and a source term κ · phidelta.
temperatureEvolution = -ps.fd.transient(T) + ps.fd.diffusion(T, 1) + κ * φ_delta_field.center
temperatureEqs = [
    ps.Assignment(T, discretize(temperatureEvolution.subs(parameters)))
]
```
%% Cell type:code id: tags:
``` python
# Bare expression so the notebook displays the discretized temperature update.
temperatureEqs
```
%% Output
$\displaystyle \left[ {{T}_{(0,0,0)}} \leftarrow 0.0111111111111111 {{T}_{(-1,0,0)}} + 0.0111111111111111 {{T}_{(0,-1,0)}} + 0.0111111111111111 {{T}_{(0,0,-1)}} + 0.933333333333333 {{T}_{(0,0,0)}} + 0.0111111111111111 {{T}_{(0,0,1)}} + 0.0111111111111111 {{T}_{(0,1,0)}} + 0.0111111111111111 {{T}_{(1,0,0)}} + 1.8 \cdot 10^{-5} {{φ_D}_{(0,0,0)}}\right]$
[T_C := 0.0111111111111111⋅T_W + 0.0111111111111111⋅T_S + 0.0111111111111111⋅T
_B + 0.933333333333333⋅T_C + 0.0111111111111111⋅T_T + 0.0111111111111111⋅T_N +
0.0111111111111111⋅T_E + 1.8e-5⋅phidelta_C]
%% Cell type:code id: tags:
``` python
# Compile both update rules for the selected target.
# NOTE(review): cpu_openmp=4 is passed although target is 'gpu' — presumably
# only relevant when target is switched to 'cpu'; confirm.
φ_kernel = ps.create_kernel(φEqs, cpu_openmp=4, target=target).compile()
temperatureKernel = ps.create_kernel(temperatureEqs, cpu_openmp=4, target=target).compile()
```
%% Cell type:code id: tags:
``` python
def time_loop(steps):
    """Advance the simulation by `steps` time steps.

    All arrays are transferred with `dh.all_to_gpu()` before the loop and with
    `dh.all_to_cpu()` after it. Each step synchronizes 'phi' and runs the φ
    kernel, then synchronizes 'T' and runs the temperature kernel.
    """
    sync_phi = dh.synchronization_function(['phi'], target=target)
    sync_temperature = dh.synchronization_function(['T'], target=target)

    # (synchronization, kernel) pairs in the order they run within one step.
    stages = (
        (sync_phi, φ_kernel),
        (sync_temperature, temperatureKernel),
    )

    dh.all_to_gpu()
    for _ in range(steps):
        for synchronize, kernel in stages:
            synchronize()
            dh.run_kernel(kernel)
    dh.all_to_cpu()
def init(nucleus_size=np.sqrt(5)):
    """Reset the fields: 'phi' is 1 inside a centered sphere of radius `nucleus_size`, 0 elsewhere; 'T' is 0."""
    for block in dh.iterate():
        ix, iy, iz = block.cell_index_arrays
        # Shift the cell indices so that the origin sits at the block center.
        cx = ix - block.shape[0] // 2
        cy = iy - block.shape[1] // 2
        cz = iz - block.shape[2] // 2
        inside_nucleus = (cx ** 2 + cy ** 2 + cz ** 2) < nucleus_size ** 2

        phi = block['phi']
        phi.fill(0)
        phi[inside_nucleus] = 1.0

        block['T'].fill(0.0)
def plot(slice_obj=ps.make_slice[:, :, 0.5]):
    """Show 'phi', 'T' and 'phidelta' on the given slice as three side-by-side panels with colorbars."""
    panels = (
        ("φ", 'phi'),
        ("T", 'T'),
        ("∂φ", 'phidelta'),
    )
    for column, (panel_title, array_name) in enumerate(panels, start=1):
        plt.subplot(1, 3, column)
        plt.title(panel_title)
        plt.scalar_field(dh.gather_array(array_name, slice_obj).squeeze())
        plt.colorbar()
```
%% Cell type:code id: tags:
``` python
# Set up the initial nucleus, plot the initial state, and print the
# data-handling summary of all arrays.
init()
plot()
print(dh)
```
%% Output
Name| Inner (min/max)| WithGl (min/max)
----------------------------------------------------
T| ( 0, 0)| ( 0, 0)
phi| ( 0, 1)| ( 0, 1)
phidelta| ( 0, 0)| ( 0, 0)
%% Cell type:code id: tags:
``` python
# Full run (300 x 100 steps with VTK output and timing) unless `is_test_run`
# is defined; in that case do two steps and check that all fields are finite.
if 'is_test_run' not in globals():
    from time import perf_counter

    vtk_writer = dh.create_vtk_writer('dentritic_growth_large', ['phi'])
    last = perf_counter()
    for i in range(300):
        time_loop(100)
        vtk_writer(i)
        print("Step ", i, perf_counter() - last, dh.max('phi'))
        last = perf_counter()
else:
    time_loop(2)
    for array_name in ('phi', 'T', 'phidelta'):
        assert np.isfinite(dh.max(array_name))
```