Commit e91f23f7 authored by Julian Hammer's avatar Julian Hammer Committed by Markus Holzer
Browse files

improved kc coupling

parent f3e81539
......@@ -2,21 +2,23 @@ import warnings
import fcntl
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import Optional
from jinja2 import Environment, PackageLoader, StrictUndefined
import textwrap
from jinja2 import Environment, PackageLoader, StrictUndefined, Template
import sympy as sp
from kerncraft.kerncraft import KernelCode
from kerncraft.machinemodel import MachineModel
from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
from pystencils.astnodes import \
KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.transformations import filtered_tree_iteration
from pystencils.utils import DotDict
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.kernelcreation import add_openmp
from pystencils.data_types import get_base_type
from pystencils.sympyextensions import prod
class PyStencilsKerncraftKernel(KernelCode):
......@@ -26,7 +28,7 @@ class PyStencilsKerncraftKernel(KernelCode):
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
def __init__(self, ast: KernelFunction, machine: MachineModel,
assumed_layout='SoA', debug_print=False, filename=None):
"""Create a kerncraft kernel using a pystencils AST
......@@ -44,6 +46,7 @@ class PyStencilsKerncraftKernel(KernelCode):
# Initialize state
self.asm_block = None
self._filename = filename
self._keep_intermediates = False
self.kernel_ast = ast
self.temporary_dir = TemporaryDirectory()
......@@ -96,14 +99,14 @@ class PyStencilsKerncraftKernel(KernelCode):
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
fields_accessed = self.kernel_ast.fields_accessed
for field in fields_accessed:
layout = get_layout_tuple(field)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))
self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
# Scalars may be safely ignored
# for param in ast.get_parameters():
# for param in self.kernel_ast.get_parameters():
# if not param.is_field_parameter:
# # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
# self.sources[param.symbol.name] = [None]
......@@ -138,7 +141,10 @@ class PyStencilsKerncraftKernel(KernelCode):
file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
if lock_mode == fcntl.LOCK_SH:
# use cache
pass
else: # lock_mode == fcntl.LOCK_EX:
function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
jinja_context = {
......@@ -150,13 +156,12 @@ class PyStencilsKerncraftKernel(KernelCode):
with open(file_path, 'w') as f:
f.write(file_header)
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
"""
Generate and return compilable source code.
Generate and return compilable source code from AST.
Args:
openmp: if true, openmp code will be generated
......@@ -169,7 +174,11 @@ class PyStencilsKerncraftKernel(KernelCode):
file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_EX:
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX:
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
......@@ -184,11 +193,136 @@ class PyStencilsKerncraftKernel(KernelCode):
}
env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
file_header = env.get_template('kernel.c').render(**jinja_context)
code = env.get_template('kernel.c').render(**jinja_context)
with open(file_path, 'w') as f:
f.write(file_header)
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
CODE_TEMPLATE = Template(textwrap.dedent("""
#include <likwid.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
#include "kerncraft.h"
#include "kernel.h"
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(void *);
extern int var_false;
int main(int argc, char **argv) {
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
{%- endfor %}
// Declaring arrays
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
// TODO initialize in parallel context in same order as they are touched
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
{%- endfor %}
likwid_markerInit();
#pragma omp parallel
{
likwid_markerRegisterRegion("loop");
#pragma omp barrier
// Initializing arrays in same order as touched in kernel loop nest
//INIT_ARRAYS;
// Dummy call
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
for(int warmup = 1; warmup >= 0; --warmup) {
int repeat = 2;
if(warmup == 0) {
repeat = atoi(argv[1]);
likwid_markerStartRegion("loop");
}
for(; repeat > 0; --repeat) {
{{kernelName}}({{call_argument_list}});
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
}
likwid_markerStopRegion("loop");
}
likwid_markerClose();
return 0;
}
"""))
fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock
def get_main_code(self, kernel_function_name='kernel'):
"""
Generate and return compilable source code from AST.
:return: tuple of filename and shared lock file pointer
"""
# TODO produce nicer code, including help text and other "comfort features".
assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
"based on a kernel description rather than code."
file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
lock_mode, lock_fp = self.lock_intermediate(file_path)
if lock_mode == fcntl.LOCK_SH:
# use cache
with open(file_path) as f:
code = f.read()
else: # lock_mode == fcntl.LOCK_EX
# needs update
accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in self.kernel_ast.get_parameters():
if not p.is_field_parameter:
constants.append((p.symbol.name, str(p.symbol.dtype)))
call_parameters.append(p.symbol.name)
else:
assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.symbol.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
header_list = get_headers(self.kernel_ast)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
# Generate code
code = self.CODE_TEMPLATE.render(
kernelName=self.kernel_ast.function_name,
fields=fields,
constants=constants,
call_agument_list=','.join(call_parameters),
includes=includes)
# Store to file
with open(file_path, 'w') as f:
f.write(code)
self.release_exclusive_lock(lock_fp) # degrade to shared lock
return file_path, lock_fp
......
kerncraft version: 0.8.3.dev0
kerncraft version: 0.8.6.dev0
model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
model type: Intel Xeon SandyBridge EN/EP processor
clock: 2.7 GHz
......@@ -8,6 +8,7 @@ cores per socket: 8
threads per core: 2
NUMA domains per socket: 1
cores per NUMA domain: 8
transparent hugepage: always
in-core model: !!omap
- IACA: SNB
......@@ -20,17 +21,22 @@ FLOPs per cycle:
DP: {total: 8, ADD: 4, MUL: 4}
compiler: !!omap
- icc: -O3 -xAVX -fno-alias -qopenmp
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm
- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
overlapping model:
ports:
IACA: ['0', 0DV, '1', '2', '3', '4', '5']
OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
non-overlapping model:
ports: [2D, 3D]
ports:
IACA: [2D, 3D]
OSACA: [2D, 3D]
LLVM-MCA: [SBPort23]
performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
overlapping model:
ports: ['0', 0DV, '1', '2', '3', '4', '5']
performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
cacheline size: 64 B
memory hierarchy:
......
......@@ -42,9 +42,7 @@ def test_compilation():
@pytest.mark.kerncraft
def analysis(kernel, model='ecmdata'):
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
def analysis(kernel, machine, model='ecmdata'):
if model == 'ecmdata':
model = ECMData(kernel, machine, KerncraftParameters())
elif model == 'ecm':
......@@ -71,7 +69,7 @@ def test_3d_7pt_osaca():
reference_kernel.set_constant('M', size[0])
reference_kernel.set_constant('N', size[1])
assert size[1] == size[2]
analysis(reference_kernel, model='ecm')
analysis(reference_kernel, machine_model, model='ecm')
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
......@@ -82,18 +80,22 @@ def test_3d_7pt_osaca():
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast, machine=machine_model)
analysis(k, model='ecm')
analysis(k, machine_model, model='ecm')
assert reference_kernel._flops == k._flops
# assert reference.results['cl throughput'] == analysis.results['cl throughput']
@pytest.mark.kerncraft
def test_2d_5pt():
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
size = [30, 50, 3]
kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
reference = analysis(reference_kernel)
reference_kernel = KernelCode(kernel_file.read(), machine=machine,
filename=kernel_file_path)
reference = analysis(reference_kernel, machine)
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
......@@ -102,8 +104,8 @@ def test_2d_5pt():
rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
update_rule = Assignment(b[0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast)
result = analysis(k)
k = PyStencilsKerncraftKernel(ast, machine)
result = analysis(k, machine)
for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
assert e1 == e2
......@@ -111,14 +113,18 @@ def test_2d_5pt():
@pytest.mark.kerncraft
def test_3d_7pt():
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
size = [30, 50, 50]
kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
with open(kernel_file_path) as kernel_file:
reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
reference_kernel = KernelCode(kernel_file.read(), machine=machine,
filename=kernel_file_path)
reference_kernel.set_constant('M', size[0])
reference_kernel.set_constant('N', size[1])
assert size[1] == size[2]
reference = analysis(reference_kernel)
reference = analysis(reference_kernel, machine)
arr = np.zeros(size)
a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
......@@ -128,8 +134,8 @@ def test_3d_7pt():
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast)
result = analysis(k)
k = PyStencilsKerncraftKernel(ast, machine)
result = analysis(k, machine)
for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
assert e1 == e2
......@@ -163,6 +169,9 @@ def test_benchmark():
@pytest.mark.kerncraft
def test_kerncraft_generic_field():
machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
machine = MachineModel(path_to_yaml=machine_file_path)
a = fields('a: double[3D]')
b = fields('b: double[3D]')
s = sp.Symbol("s")
......@@ -170,4 +179,4 @@ def test_kerncraft_generic_field():
update_rule = Assignment(b[0, 0, 0], s * rhs)
ast = create_kernel([update_rule])
k = PyStencilsKerncraftKernel(ast, debug_print=True)
k = PyStencilsKerncraftKernel(ast, machine, debug_print=True)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment