Commit 2d758462 authored by Jan Hönig's avatar Jan Hönig
Browse files

Merge branch 'Enable_OSACA_usage' into 'master'

Enable osaca usage

See merge request pycodegen/pystencils!165
parents cfd770cf 0391c91d
Pipeline #25747 passed with stages
in 8 minutes and 13 seconds
import os
import subprocess
import warnings
import tempfile
from pathlib import Path
from jinja2 import Template
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
......@@ -10,116 +12,6 @@ from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
......@@ -157,7 +49,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
args = {
jinja_context = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
......@@ -168,16 +60,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
'includes': includes,
'timing': timing,
}
return benchmark_template.render(**args)
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
return env.get_template('benchmark.c').render(**jinja_context)
def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
"""Runs the given kernel with outer loop in C
Args:
ast:
ast: pystencils ast which is used to compile the benchmark file
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
path: path where the benchmark file is stored. If None a tmp folder is created
Returns:
list of times per iterations for each outer iteration
......@@ -185,26 +81,40 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
with open('bench.c', 'w') as f:
if path is None:
path = tempfile.mkdtemp()
if isinstance(path, str):
path = Path(path)
with open(path / 'bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = os.path.dirname(kerncraft.__file__)
kerncraft_path = Path(kerncraft.__file__).parent
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + os.path.join(kerncraft_path, 'headers')]
'-I' + str(kerncraft_path / 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c',
'-o', 'bench',
kerncraft_path / 'headers' / 'timing.c',
kerncraft_path / 'headers' / 'dummy.c',
path / 'bench.c',
'-o', path / 'bench',
]
run_compile_step(compile_cmd)
time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
benchmark_time_limit = 20
if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
f"{benchmark_time_limit} seconds for this kernel")
warnings.warn(warn)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
import warnings
import fcntl
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
import kerncraft
from jinja2 import Environment, PackageLoader, StrictUndefined
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.field import get_layout_from_strides
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.kernelcreation import add_openmp
class PyStencilsKerncraftKernel(KernelCode):
......@@ -34,8 +36,10 @@ class PyStencilsKerncraftKernel(KernelCode):
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structures of array (SoA) or
array of structures (AoS) layout is assumed
debug_print: print debug information
filename: used for caching
"""
kerncraft.kernel.Kernel.__init__(self, machine)
super(KernelCode, self).__init__(machine=machine)
# Initialize state
self.asm_block = None
......@@ -96,7 +100,7 @@ class PyStencilsKerncraftKernel(KernelCode):
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
......@@ -129,24 +133,64 @@ class PyStencilsKerncraftKernel(KernelCode):
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def as_code(self, type_='iaca', openmp=False, as_filename=False):
def get_kernel_header(self, name='pystencils_kernel'):
file_name = "pystencils_kernel.h"
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
jinja_context = {
'function_signature': function_signature,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.h').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
"""
Generate and return compilable source code.
Args:
type_: can be iaca or likwid.
openmp: if true, openmp code will be generated
as_filename:
name: kernel name
"""
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename:
fp, already_available = self._get_intermediate_file(f'kernel_{type_}.c',
machine_and_compiler_dependent=False)
if not already_available:
fp.write(code)
return fp.name
else:
return code
filename = 'pystencils_kernl'
if openmp:
filename += '-omp'
filename += '.c'
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
if openmp:
add_openmp(self.kernel_ast)
kernel_code = generate_c(self.kernel_ast, dialect='c')
jinja_context = {
'includes': includes,
'kernel_code': kernel_code,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
return file_path, lock_fp
class KerncraftParameters(DotDict):
......@@ -161,6 +205,7 @@ class KerncraftParameters(DotDict):
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
self['incore_model'] = 'OSACA'
# ------------------------------------------- Helper functions ---------------------------------------------------------
......
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
\ No newline at end of file
#define FUNC_PREFIX
{{function_signature}}
\ No newline at end of file
kerncraft version: 0.8.3.dev0
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
model type: Intel Xeon SandyBridge EN/EP processor
clock: 2.7 GHz
sockets: 2
cores per socket: 8
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 8
in-core model: !!omap
- IACA: SNB
- OSACA: SNB
- LLVM-MCA: -mcpu=sandybridge
isa: x86
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
compiler: !!omap
- icc: -O3 -xAVX -fno-alias -qopenmp
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm
non-overlapping model:
ports: [2D, 3D]
performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
overlapping model:
ports: ['0', 0DV, '1', '2', '3', '4', '5']
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
cacheline size: 64 B
memory hierarchy:
- level: L1
cache per group: {sets: 64, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true, load_from: L2, store_to: L2}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3] + MEM_UOPS_RETIRED_STORES:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
upstream throughput: [architecture code analyzer, [2D, 3D]]
transfers overlap: false
- level: L2
cache per group: {sets: 512, ways: 8, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true, load_from: L3, store_to: L3}
cores per group: 1
threads per group: 2
groups: 16
upstream throughput: [32 B/cy, half-duplex]
transfers overlap: false
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3] + L1D_M_EVICT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {sets: 20480, ways: 16, cl_size: 64, replacement_policy: LRU, write_allocate: true,
write_back: true}
cores per group: 8
threads per group: 16
groups: 2
upstream throughput: [32 B/cy, half-duplex]
transfers overlap: false
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3] + L2_TRANS_L2_WB:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] + CAS_COUNT_RD:MBOX2C[01]
+ CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] + CAS_COUNT_WR:MBOX2C[01]
+ CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
upstream throughput: [full socket memory bandwidth, half-duplex]
transfers overlap: false
size per group:
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
fastest bench kernel: copy_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
fastest bench kernel: daxpy_avx
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
fastest bench kernel: load_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
fastest bench kernel: triad_avx
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
fastest bench kernel: update_avx
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [83.27 GB/s, 166.52 GB/s, 249.78 GB/s, 333.02 GB/s, 416.34 GB/s, 495.96