Commit 2d758462 authored by Jan Hönig's avatar Jan Hönig
Browse files

Merge branch 'Enable_OSACA_usage' into 'master'

Enable osaca usage

See merge request pycodegen/pystencils!165
parents cfd770cf 0391c91d
import os
import subprocess
import warnings
import tempfile
from pathlib import Path
from jinja2 import Template
from jinja2 import Environment, PackageLoader, StrictUndefined
from pystencils.astnodes import PragmaBlock
from pystencils.backends.cbackend import generate_c, get_headers
......@@ -10,116 +12,6 @@ from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
from pystencils.sympyextensions import prod
benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
"""Return C code of a benchmark program for the given kernel.
......@@ -157,7 +49,7 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
ast.body.args[0].pragma_line = ''
args = {
jinja_context = {
'likwid': likwid,
'openmp': openmp,
'kernel_code': generate_c(ast, dialect='c'),
......@@ -168,16 +60,20 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
'includes': includes,
'timing': timing,
}
return benchmark_template.render(**args)
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
return env.get_template('benchmark.c').render(**jinja_context)
def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
"""Runs the given kernel with outer loop in C
Args:
ast:
ast: pystencils ast which is used to compile the benchmark file
inner_iterations: timings are recorded around this many iterations
outer_iterations: number of timings recorded
path: path where the benchmark file is stored. If None a tmp folder is created
Returns:
list of times per iterations for each outer iteration
......@@ -185,26 +81,40 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3):
import kerncraft
benchmark_code = generate_benchmark(ast, timing=True)
with open('bench.c', 'w') as f:
if path is None:
path = tempfile.mkdtemp()
if isinstance(path, str):
path = Path(path)
with open(path / 'bench.c', 'w') as f:
f.write(benchmark_code)
kerncraft_path = os.path.dirname(kerncraft.__file__)
kerncraft_path = Path(kerncraft.__file__).parent
extra_flags = ['-I' + get_pystencils_include_path(),
'-I' + os.path.join(kerncraft_path, 'headers')]
'-I' + str(kerncraft_path / 'headers')]
compiler_config = get_compiler_config()
compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
compile_cmd += [*extra_flags,
os.path.join(kerncraft_path, 'headers', 'timing.c'),
os.path.join(kerncraft_path, 'headers', 'dummy.c'),
'bench.c',
'-o', 'bench',
kerncraft_path / 'headers' / 'timing.c',
kerncraft_path / 'headers' / 'dummy.c',
path / 'bench.c',
'-o', path / 'bench',
]
run_compile_step(compile_cmd)
time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
benchmark_time_limit = 20
if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
f"{benchmark_time_limit} seconds for this kernel")
warnings.warn(warn)
results = []
for _ in range(outer_iterations):
benchmark_time = float(subprocess.check_output(['./bench', str(inner_iterations)]))
benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
results.append(benchmark_time)
return results
import warnings
import fcntl
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
import kerncraft
from jinja2 import Environment, PackageLoader, StrictUndefined
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.field import get_layout_from_strides
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.kernelcreation import add_openmp
class PyStencilsKerncraftKernel(KernelCode):
......@@ -34,8 +36,10 @@ class PyStencilsKerncraftKernel(KernelCode):
assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
coordinates is not known. In this case either a structures of array (SoA) or
array of structures (AoS) layout is assumed
debug_print: print debug information
filename: used for caching
"""
kerncraft.kernel.Kernel.__init__(self, machine)
super(KernelCode, self).__init__(machine=machine)
# Initialize state
self.asm_block = None
......@@ -96,7 +100,7 @@ class PyStencilsKerncraftKernel(KernelCode):
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
......@@ -129,24 +133,64 @@ class PyStencilsKerncraftKernel(KernelCode):
print("----------------------------- FLOPS -------------------------------")
pprint(self._flops)
def as_code(self, type_='iaca', openmp=False, as_filename=False):
def get_kernel_header(self, name='pystencils_kernel'):
file_name = "pystencils_kernel.h"
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
jinja_context = {
'function_signature': function_signature,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.h').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
"""
Generate and return compilable source code.
Args:
type_: can be iaca or likwid.
openmp: if true, openmp code will be generated
as_filename:
name: kernel name
"""
code = generate_benchmark(self.kernel_ast, likwid=type_ == 'likwid', openmp=openmp)
if as_filename:
fp, already_available = self._get_intermediate_file(f'kernel_{type_}.c',
machine_and_compiler_dependent=False)
if not already_available:
fp.write(code)
return fp.name
else:
return code
filename = 'pystencils_kernl'
if openmp:
filename += '-omp'
filename += '.c'
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
if openmp:
add_openmp(self.kernel_ast)
kernel_code = generate_c(self.kernel_ast, dialect='c')
jinja_context = {
'includes': includes,
'kernel_code': kernel_code,
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
return file_path, lock_fp
class KerncraftParameters(DotDict):
......@@ -161,6 +205,7 @@ class KerncraftParameters(DotDict):
self['iterations'] = 10
self['unit'] = 'cy/CL'
self['ignore_warnings'] = True
self['incore_model'] = 'OSACA'
# ------------------------------------------- Helper functions ---------------------------------------------------------
......
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
{%- if likwid and openmp %}
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
{%- elif likwid %}
likwid_markerRegisterRegion("loop");
{%- endif %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
}
{%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime);
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy((void*){{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy((void*)&{{constantName}});
{%- endfor %}
}
{%- if timing %}
timing(&wcEndTime, &cpuEndTime);
if( warmup == 0)
printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
{%- endif %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- if openmp %}
}
{%- endif %}
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include <stdio.h>
{{ includes }}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
void timing(double* wcTime, double* cpuTime);
extern int var_false;
{{kernel_code}}
\ No newline at end of file
#define FUNC_PREFIX
{{function_signature}}
\ No newline at end of file
kerncraft version: 0.7.3
clock: 2.7 GHz
cores per socket: 8
cores per NUMA domain: 8
NUMA domains per socket: 1
model type: Intel Core SandyBridge EP processor
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
sockets: 2
threads per core: 2
cacheline size: 64 B
compiler:
!!omap
- icc: -O3 -xAVX -fno-alias -qopenmp
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200112L -fopenmp
micro-architecture: SNB
FLOPs per cycle:
SP: {total: 16, ADD: 8, MUL: 8}
DP: {total: 8, ADD: 4, MUL: 4}
overlapping model:
ports: ["0", "0DV", "1", "2", "3", "4", "5"]
performance counter metric:
Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports: ["2D", "3D"]
performance counter metric: T_OL + T_L1L2 + T_L2L3 + T_L3MEM
write-allocate: True
memory hierarchy:
- level: L1
cache per group: {
'sets': 64, 'ways': 8, 'cl_size': 64, # 32 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L2', 'store_to': 'L2'}
cores per group: 1
threads per group: 2
groups: 16
performance counter metrics:
accesses: MEM_UOPS_RETIRED_LOADS:PMC[0-3]
misses: L1D_REPLACEMENT:PMC[0-3]
evicts: L1D_M_EVICT:PMC[0-3]
- level: L2
cache per group: {
'sets': 512, 'ways': 8, 'cl_size': 64, # 256 kB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True,
'load_from': 'L3', 'store_to': 'L3'}
cores per group: 1
threads per group: 2
groups: 16
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L1D_REPLACEMENT:PMC[0-3]
misses: L2_LINES_IN_ALL:PMC[0-3]
evicts: L2_TRANS_L2_WB:PMC[0-3]
- level: L3
cache per group: {
'sets': 20480, 'ways': 16, 'cl_size': 64, # 20 MB
'replacement_policy': 'LRU',
'write_allocate': True, 'write_back': True}
cores per group: 8
threads per group: 16
groups: 2
non-overlap upstream throughput: [32 B/cy, 'half-duplex']
performance counter metrics:
accesses: L2_LINES_IN_ALL:PMC[0-3]
misses: (CAS_COUNT_RD:MBOX0C[01] + CAS_COUNT_RD:MBOX1C[01] +
CAS_COUNT_RD:MBOX2C[01] + CAS_COUNT_RD:MBOX3C[01])
evicts: (CAS_COUNT_WR:MBOX0C[01] + CAS_COUNT_WR:MBOX1C[01] +
CAS_COUNT_WR:MBOX2C[01] + CAS_COUNT_WR:MBOX3C[01])
- level: MEM
cores per group: 8
non-overlap upstream throughput: ['full socket memory bandwidth', 'half-duplex']
size per group: null
threads per group: 16
benchmarks:
kernels:
copy:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
daxpy:
FLOPs per iteration: 2
read streams: {bytes: 16.00 B, streams: 2}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
load:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 0.00 B, streams: 0}
triad:
FLOPs per iteration: 2
read streams: {bytes: 24.00 B, streams: 3}
read+write streams: {bytes: 0.00 B, streams: 0}
write streams: {bytes: 8.00 B, streams: 1}
update:
FLOPs per iteration: 0
read streams: {bytes: 8.00 B, streams: 1}
read+write streams: {bytes: 8.00 B, streams: 1}
write streams: {bytes: 8.00 B, streams: 1}
measurements:
L1:
1:
cores: [1, 2, 3, 4, 5, 6, 7, 8]
results:
copy: [81.98 GB/s, 163.75 GB/s, 245.62 GB/s, 327.69 GB/s, 409.41 GB/s, 489.83
GB/s, 571.67 GB/s, 653.50 GB/s]
daxpy: [71.55 GB/s, 143.01 GB/s, 214.86 GB/s, 286.26 GB/s, 355.60 GB/s,
426.71 GB/s, 497.45 GB/s, 568.97 GB/s]
load: [61.92 GB/s, 122.79 GB/s, 183.01 GB/s, 244.30 GB/s, 306.76 GB/s, 368.46
GB/s, 427.41 GB/s, 490.88 GB/s]
triad: [81.61 GB/s, 163.25 GB/s, 244.92 GB/s, 326.65 GB/s, 406.69 GB/s,
487.76 GB/s, 569.10 GB/s, 650.39 GB/s]
update: [84.03 GB/s, 168.02 GB/s, 252.10 GB/s, 335.94 GB/s, 419.90 G