From e91f23f70419db9385dea3513621ebb6d0d29efb Mon Sep 17 00:00:00 2001 From: Julian Hammer <julian.hammer@fau.de> Date: Wed, 11 Nov 2020 14:32:42 +0100 Subject: [PATCH] improved kc coupling --- .../kerncraft_coupling/kerncraft_interface.py | 168 ++++++++++++++++-- .../Example_SandyBridgeEP_E5-2680.yml | 24 ++- pystencils_tests/test_kerncraft_coupling.py | 37 ++-- 3 files changed, 189 insertions(+), 40 deletions(-) diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py index 7564245c0..8e8bcc618 100644 --- a/pystencils/kerncraft_coupling/kerncraft_interface.py +++ b/pystencils/kerncraft_coupling/kerncraft_interface.py @@ -2,21 +2,23 @@ import warnings import fcntl from collections import defaultdict from tempfile import TemporaryDirectory -from typing import Optional - -from jinja2 import Environment, PackageLoader, StrictUndefined +import textwrap +from jinja2 import Environment, PackageLoader, StrictUndefined, Template import sympy as sp from kerncraft.kerncraft import KernelCode from kerncraft.machinemodel import MachineModel -from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment) +from pystencils.astnodes import \ + KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment +from pystencils.backends.cbackend import generate_c, get_headers from pystencils.field import get_layout_from_strides from pystencils.sympyextensions import count_operations_in_ast from pystencils.transformations import filtered_tree_iteration from pystencils.utils import DotDict -from pystencils.backends.cbackend import generate_c, get_headers from pystencils.cpu.kernelcreation import add_openmp +from pystencils.data_types import get_base_type +from pystencils.sympyextensions import prod class PyStencilsKerncraftKernel(KernelCode): @@ -26,7 +28,7 @@ class PyStencilsKerncraftKernel(KernelCode): """ LIKWID_BASE = '/usr/local/likwid' - def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None, + def __init__(self, ast: KernelFunction, machine: MachineModel, assumed_layout='SoA', debug_print=False, filename=None): """Create a kerncraft kernel using a pystencils AST @@ -44,6 +46,7 @@ class PyStencilsKerncraftKernel(KernelCode): # Initialize state self.asm_block = None self._filename = filename + self._keep_intermediates = False self.kernel_ast = ast self.temporary_dir = TemporaryDirectory() @@ -96,14 +99,14 @@ class PyStencilsKerncraftKernel(KernelCode): target_dict[fa.field.name].append(permuted_coord) # Variables (arrays) - fields_accessed = ast.fields_accessed + fields_accessed = self.kernel_ast.fields_accessed for field in fields_accessed: layout = get_layout_tuple(field) permuted_shape = list(field.shape[i] for i in layout) - self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape)) + self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape)) # Scalars may be safely ignored - # for param in ast.get_parameters(): + # for param in self.kernel_ast.get_parameters(): # if not param.is_field_parameter: # # self.set_variable(param.symbol.name, str(param.symbol.dtype), None) # self.sources[param.symbol.name] = [None] @@ -138,7 +141,10 @@ class PyStencilsKerncraftKernel(KernelCode): file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False) lock_mode, lock_fp = self.lock_intermediate(file_path) - if lock_mode == fcntl.LOCK_EX: + if lock_mode == fcntl.LOCK_SH: + # use cache + pass + else: # lock_mode == fcntl.LOCK_EX: function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True) jinja_context = { @@ -150,13 +156,12 @@ class PyStencilsKerncraftKernel(KernelCode): with open(file_path, 'w') as f: f.write(file_header) - fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock - + self.release_exclusive_lock(lock_fp) # degrade to shared lock return file_path, lock_fp def get_kernel_code(self, openmp=False, name='pystencils_kernl'): """ - Generate and return compilable source code. + Generate and return compilable source code from AST. Args: openmp: if true, openmp code will be generated @@ -169,7 +174,11 @@ class PyStencilsKerncraftKernel(KernelCode): file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False) lock_mode, lock_fp = self.lock_intermediate(file_path) - if lock_mode == fcntl.LOCK_EX: + if lock_mode == fcntl.LOCK_SH: + # use cache + with open(file_path) as f: + code = f.read() + else: # lock_mode == fcntl.LOCK_EX: header_list = get_headers(self.kernel_ast) includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) @@ -184,11 +193,136 @@ class PyStencilsKerncraftKernel(KernelCode): } env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined) - file_header = env.get_template('kernel.c').render(**jinja_context) + code = env.get_template('kernel.c').render(**jinja_context) with open(file_path, 'w') as f: - f.write(file_header) + f.write(code) + + self.release_exclusive_lock(lock_fp) # degrade to shared lock + return file_path, lock_fp + + CODE_TEMPLATE = Template(textwrap.dedent(""" + #include <likwid.h> + #include <stdlib.h> + #include <stdint.h> + #include <stdbool.h> + #include <math.h> + #include "kerncraft.h" + #include "kernel.h" + + #define RESTRICT __restrict__ + #define FUNC_PREFIX + void dummy(void *); + extern int var_false; + + int main(int argc, char **argv) { + {%- for constantName, dataType in constants %} + // Constant {{constantName}} + {{dataType}} {{constantName}}; + {{constantName}} = 0.23; + {%- endfor %} + + // Declaring arrays + {%- for field_name, dataType, size in fields %} + + // Initialization {{field_name}} + double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); + // TODO initialize in parallel context in same order as they are touched + for (unsigned long long i = 0; i < {{size}}; ++i) + {{field_name}}[i] = 0.23; + {%- endfor %} + + likwid_markerInit(); + #pragma omp parallel + { + likwid_markerRegisterRegion("loop"); + #pragma omp barrier + + // Initializing arrays in same order as touched in kernel loop nest + //INIT_ARRAYS; + + // Dummy call + {%- for field_name, dataType, size in fields %} + if(var_false) dummy({{field_name}}); + {%- endfor %} + {%- for constantName, dataType in constants %} + if(var_false) dummy(&{{constantName}}); + {%- endfor %} + + for(int warmup = 1; warmup >= 0; --warmup) { + int repeat = 2; + if(warmup == 0) { + repeat = atoi(argv[1]); + likwid_markerStartRegion("loop"); + } + + for(; repeat > 0; --repeat) { + {{kernelName}}({{call_argument_list}}); + + {%- for field_name, dataType, size in fields %} + if(var_false) dummy({{field_name}}); + {%- endfor %} + {%- for constantName, dataType in constants %} + if(var_false) dummy(&{{constantName}}); + {%- endfor %} + } + + } + likwid_markerStopRegion("loop"); + } + likwid_markerClose(); + return 0; + } + """)) - fcntl.flock(lock_fp, fcntl.LOCK_SH) # degrade to shared lock + def get_main_code(self, kernel_function_name='kernel'): + """ + Generate and return compilable source code from AST. + + :return: tuple of filename and shared lock file pointer + """ + # TODO produce nicer code, including help text and other "comfort features". + assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \ + "based on a kernel description rather than code." + + file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False) + lock_mode, lock_fp = self.lock_intermediate(file_path) + + if lock_mode == fcntl.LOCK_SH: + # use cache + with open(file_path) as f: + code = f.read() + else: # lock_mode == fcntl.LOCK_EX + # needs update + accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed} + constants = [] + fields = [] + call_parameters = [] + for p in self.kernel_ast.get_parameters(): + if not p.is_field_parameter: + constants.append((p.symbol.name, str(p.symbol.dtype))) + call_parameters.append(p.symbol.name) + else: + assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" + field = accessed_fields[p.field_name] + dtype = str(get_base_type(p.symbol.dtype)) + fields.append((p.field_name, dtype, prod(field.shape))) + call_parameters.append(p.field_name) + + header_list = get_headers(self.kernel_ast) + includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) + + # Generate code + code = self.CODE_TEMPLATE.render( + kernelName=self.kernel_ast.function_name, + fields=fields, + constants=constants, + call_agument_list=','.join(call_parameters), + includes=includes) + + # Store to file + with open(file_path, 'w') as f: + f.write(code) + self.release_exclusive_lock(lock_fp) # degrade to shared lock return file_path, lock_fp diff --git a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml index 37889b8fe..890e2e895 100644 --- a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml +++ b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml @@ -1,4 +1,4 @@ -kerncraft version: 0.8.3.dev0 +kerncraft version: 0.8.6.dev0 model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz model type: Intel Xeon SandyBridge EN/EP processor clock: 2.7 GHz @@ -8,6 +8,7 @@ cores per socket: 8 threads per core: 2 NUMA domains per socket: 1 cores per NUMA domain: 8 +transparent hugepage: always in-core model: !!omap - IACA: SNB @@ -20,17 +21,22 @@ FLOPs per cycle: DP: {total: 8, ADD: 4, MUL: 4} compiler: !!omap -- icc: -O3 -xAVX -fno-alias -qopenmp -- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm +- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline +- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding +- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding +overlapping model: + ports: + IACA: ['0', 0DV, '1', '2', '3', '4', '5'] + OSACA: ['0', 0DV, '1', '2', '3', '4', '5'] + LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5] + performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3]) non-overlapping model: - ports: [2D, 3D] + ports: + IACA: [2D, 3D] + OSACA: [2D, 3D] + LLVM-MCA: [SBPort23] performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM -overlapping model: - ports: ['0', 0DV, '1', '2', '3', '4', '5'] - performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], - UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3]) cacheline size: 64 B memory hierarchy: diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py index aeb4b7acb..754604f1e 100644 --- a/pystencils_tests/test_kerncraft_coupling.py +++ b/pystencils_tests/test_kerncraft_coupling.py @@ -42,9 +42,7 @@ def test_compilation(): @pytest.mark.kerncraft -def analysis(kernel, model='ecmdata'): - machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" - machine = MachineModel(path_to_yaml=machine_file_path) +def analysis(kernel, machine, model='ecmdata'): if model == 'ecmdata': model = ECMData(kernel, machine, KerncraftParameters()) elif model == 'ecm': @@ -71,7 +69,7 @@ def test_3d_7pt_osaca(): reference_kernel.set_constant('M', size[0]) reference_kernel.set_constant('N', size[1]) assert size[1] == size[2] - analysis(reference_kernel, model='ecm') + analysis(reference_kernel, machine_model, model='ecm') arr = np.zeros(size) a = Field.create_from_numpy_array('a', arr, index_dimensions=0) @@ -82,18 +80,22 @@ def test_3d_7pt_osaca(): update_rule = Assignment(b[0, 0, 0], s * rhs) ast = create_kernel([update_rule]) k = PyStencilsKerncraftKernel(ast, machine=machine_model) - analysis(k, model='ecm') + analysis(k, machine_model, model='ecm') assert reference_kernel._flops == k._flops # assert reference.results['cl throughput'] == analysis.results['cl throughput'] @pytest.mark.kerncraft def test_2d_5pt(): + machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" + machine = MachineModel(path_to_yaml=machine_file_path) + size = [30, 50, 3] kernel_file_path = INPUT_FOLDER / "2d-5pt.c" with open(kernel_file_path) as kernel_file: - reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) - reference = analysis(reference_kernel) + reference_kernel = KernelCode(kernel_file.read(), machine=machine, + filename=kernel_file_path) + reference = analysis(reference_kernel, machine) arr = np.zeros(size) a = Field.create_from_numpy_array('a', arr, index_dimensions=1) @@ -102,8 +104,8 @@ def test_2d_5pt(): rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0] update_rule = Assignment(b[0, 0], s * rhs) ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast) - result = analysis(k) + k = PyStencilsKerncraftKernel(ast, machine) + result = analysis(k, machine) for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): assert e1 == e2 @@ -111,14 +113,18 @@ def test_2d_5pt(): @pytest.mark.kerncraft def test_3d_7pt(): + machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" + machine = MachineModel(path_to_yaml=machine_file_path) + size = [30, 50, 50] kernel_file_path = INPUT_FOLDER / "3d-7pt.c" with open(kernel_file_path) as kernel_file: - reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path) + reference_kernel = KernelCode(kernel_file.read(), machine=machine, + filename=kernel_file_path) reference_kernel.set_constant('M', size[0]) reference_kernel.set_constant('N', size[1]) assert size[1] == size[2] - reference = analysis(reference_kernel) + reference = analysis(reference_kernel, machine) arr = np.zeros(size) a = Field.create_from_numpy_array('a', arr, index_dimensions=0) @@ -128,8 +134,8 @@ def test_3d_7pt(): update_rule = Assignment(b[0, 0, 0], s * rhs) ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast) - result = analysis(k) + k = PyStencilsKerncraftKernel(ast, machine) + result = analysis(k, machine) for e1, e2 in zip(reference.results['cycles'], result.results['cycles']): assert e1 == e2 @@ -163,6 +169,9 @@ def test_benchmark(): @pytest.mark.kerncraft def test_kerncraft_generic_field(): + machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml" + machine = MachineModel(path_to_yaml=machine_file_path) + a = fields('a: double[3D]') b = fields('b: double[3D]') s = sp.Symbol("s") @@ -170,4 +179,4 @@ def test_kerncraft_generic_field(): update_rule = Assignment(b[0, 0, 0], s * rhs) ast = create_kernel([update_rule]) - k = PyStencilsKerncraftKernel(ast, debug_print=True) + k = PyStencilsKerncraftKernel(ast, machine, debug_print=True) -- GitLab