diff --git a/pystencils/kerncraft_coupling/kerncraft_interface.py b/pystencils/kerncraft_coupling/kerncraft_interface.py
index 7564245c049c58288387a4a23918622d974aaaf3..8e8bcc618a53e8ceddd9671e4e9826da66fd71e6 100644
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
@@ -2,21 +2,23 @@ import warnings
 import fcntl
 from collections import defaultdict
 from tempfile import TemporaryDirectory
-from typing import Optional
-
-from jinja2 import Environment, PackageLoader, StrictUndefined
+import textwrap
 
+from jinja2 import Environment, PackageLoader, StrictUndefined, Template
 import sympy as sp
 from kerncraft.kerncraft import KernelCode
 from kerncraft.machinemodel import MachineModel
 
-from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
+from pystencils.astnodes import \
+    KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment
+from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.field import get_layout_from_strides
 from pystencils.sympyextensions import count_operations_in_ast
 from pystencils.transformations import filtered_tree_iteration
 from pystencils.utils import DotDict
-from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.cpu.kernelcreation import add_openmp
+from pystencils.data_types import get_base_type
+from pystencils.sympyextensions import prod
 
 
 class PyStencilsKerncraftKernel(KernelCode):
@@ -26,7 +28,7 @@ class PyStencilsKerncraftKernel(KernelCode):
     """
     LIKWID_BASE = '/usr/local/likwid'
 
-    def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
+    def __init__(self, ast: KernelFunction, machine: MachineModel,
                  assumed_layout='SoA', debug_print=False, filename=None):
         """Create a kerncraft kernel using a pystencils AST
 
@@ -44,6 +46,7 @@ class PyStencilsKerncraftKernel(KernelCode):
         # Initialize state
         self.asm_block = None
         self._filename = filename
+        self._keep_intermediates = False
 
         self.kernel_ast = ast
         self.temporary_dir = TemporaryDirectory()
@@ -96,14 +99,14 @@ class PyStencilsKerncraftKernel(KernelCode):
                 target_dict[fa.field.name].append(permuted_coord)
 
         # Variables (arrays)
-        fields_accessed = ast.fields_accessed
+        fields_accessed = self.kernel_ast.fields_accessed
         for field in fields_accessed:
             layout = get_layout_tuple(field)
             permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))
+            self.set_variable(field.name, (str(field.dtype),), tuple(permuted_shape))
 
         # Scalars may be safely ignored
-        # for param in ast.get_parameters():
+        # for param in self.kernel_ast.get_parameters():
         #     if not param.is_field_parameter:
         #         # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
         #         self.sources[param.symbol.name] = [None]
@@ -138,7 +141,10 @@ class PyStencilsKerncraftKernel(KernelCode):
         file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
         lock_mode, lock_fp = self.lock_intermediate(file_path)
 
-        if lock_mode == fcntl.LOCK_EX:
+        if lock_mode == fcntl.LOCK_SH:
+            # use cache
+            pass
+        else:  # lock_mode == fcntl.LOCK_EX:
             function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
 
             jinja_context = {
@@ -150,13 +156,12 @@ class PyStencilsKerncraftKernel(KernelCode):
             with open(file_path, 'w') as f:
                 f.write(file_header)
 
-            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
-
+            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
         return file_path, lock_fp
 
     def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
         """
-        Generate and return compilable source code.
+        Generate and return compilable source code from AST.
 
         Args:
             openmp: if true, openmp code will be generated
@@ -169,7 +174,11 @@ class PyStencilsKerncraftKernel(KernelCode):
         file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
         lock_mode, lock_fp = self.lock_intermediate(file_path)
 
-        if lock_mode == fcntl.LOCK_EX:
+        if lock_mode == fcntl.LOCK_SH:
+            # use cache
+            with open(file_path) as f:
+                code = f.read()
+        else:  # lock_mode == fcntl.LOCK_EX:
             header_list = get_headers(self.kernel_ast)
             includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
 
@@ -184,11 +193,136 @@ class PyStencilsKerncraftKernel(KernelCode):
             }
 
             env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-            file_header = env.get_template('kernel.c').render(**jinja_context)
+            code = env.get_template('kernel.c').render(**jinja_context)
             with open(file_path, 'w') as f:
-                f.write(file_header)
+                f.write(code)
+
+            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
+        return file_path, lock_fp
+
+    CODE_TEMPLATE = Template(textwrap.dedent("""
+        #include <likwid.h>
+        #include <stdlib.h>
+        #include <stdint.h>
+        #include <stdbool.h>
+        #include <math.h>
+        #include "kerncraft.h"
+        #include "kernel.h"
+
+        #define RESTRICT __restrict__
+        #define FUNC_PREFIX
+        void dummy(void *);
+        extern int var_false;
+
+        int main(int argc, char **argv) {
+          {%- for constantName, dataType in constants %}
+          // Constant {{constantName}}
+          {{dataType}} {{constantName}};
+          {{constantName}} = 0.23;
+          {%- endfor %}
+
+          // Declaring arrays
+          {%- for field_name, dataType, size in fields %}
+
+          // Initialization {{field_name}}
+          double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
+          // TODO initialize in parallel context in same order as they are touched
+          for (unsigned long long i = 0; i < {{size}}; ++i)
+            {{field_name}}[i] = 0.23;
+          {%- endfor %}
+
+          likwid_markerInit();
+          #pragma omp parallel
+          {
+            likwid_markerRegisterRegion("loop");
+            #pragma omp barrier
+
+            // Initializing arrays in same order as touched in kernel loop nest
+            //INIT_ARRAYS;
+
+            // Dummy call
+            {%- for field_name, dataType, size in fields %}
+            if(var_false) dummy({{field_name}});
+            {%- endfor %}
+            {%- for constantName, dataType in constants %}
+            if(var_false) dummy(&{{constantName}});
+            {%- endfor %}
+
+            for(int warmup = 1; warmup >= 0; --warmup) {
+              int repeat = 2;
+              if(warmup == 0) {
+                repeat = atoi(argv[1]);
+                likwid_markerStartRegion("loop");
+              }
+
+              for(; repeat > 0; --repeat) {
+                {{kernelName}}({{call_argument_list}});
+
+                {%- for field_name, dataType, size in fields %}
+                if(var_false) dummy({{field_name}});
+                {%- endfor %}
+                {%- for constantName, dataType in constants %}
+                if(var_false) dummy(&{{constantName}});
+                {%- endfor %}
+              }
+
+            }
+            likwid_markerStopRegion("loop");
+          }
+          likwid_markerClose();
+          return 0;
+        }
+        """))
 
-            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
+    def get_main_code(self, kernel_function_name='kernel'):
+        """
+        Generate and return compilable source code from AST.
+
+        :return: tuple of filename and shared lock file pointer
+        """
+        # TODO produce nicer code, including help text and other "comfort features".
+        assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
+                                            "based on a kernel description rather than code."
+
+        file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
+        lock_mode, lock_fp = self.lock_intermediate(file_path)
+
+        if lock_mode == fcntl.LOCK_SH:
+            # use cache
+            with open(file_path) as f:
+                code = f.read()
+        else:  # lock_mode == fcntl.LOCK_EX
+            # needs update
+            accessed_fields = {f.name: f for f in self.kernel_ast.fields_accessed}
+            constants = []
+            fields = []
+            call_parameters = []
+            for p in self.kernel_ast.get_parameters():
+                if not p.is_field_parameter:
+                    constants.append((p.symbol.name, str(p.symbol.dtype)))
+                    call_parameters.append(p.symbol.name)
+                else:
+                    assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
+                    field = accessed_fields[p.field_name]
+                    dtype = str(get_base_type(p.symbol.dtype))
+                    fields.append((p.field_name, dtype, prod(field.shape)))
+                    call_parameters.append(p.field_name)
+
+            header_list = get_headers(self.kernel_ast)
+            includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
+
+            # Generate code
+            code = self.CODE_TEMPLATE.render(
+                kernelName=self.kernel_ast.function_name,
+                fields=fields,
+                constants=constants,
+                call_agument_list=','.join(call_parameters),
+                includes=includes)
+
+            # Store to file
+            with open(file_path, 'w') as f:
+                f.write(code)
+            self.release_exclusive_lock(lock_fp)  # degrade to shared lock
 
         return file_path, lock_fp
 
diff --git a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
index 37889b8fee94242855e51d5d31a3118dc367bfd6..890e2e8952bdf9fcdda92565027e29700226ddbe 100644
--- a/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
+++ b/pystencils_tests/kerncraft_inputs/Example_SandyBridgeEP_E5-2680.yml
@@ -1,4 +1,4 @@
-kerncraft version: 0.8.3.dev0
+kerncraft version: 0.8.6.dev0
 model name: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
 model type: Intel Xeon SandyBridge EN/EP processor
 clock: 2.7 GHz
@@ -8,6 +8,7 @@ cores per socket: 8
 threads per core: 2
 NUMA domains per socket: 1
 cores per NUMA domain: 8
+transparent hugepage: always
 
 in-core model: !!omap
   - IACA: SNB
@@ -20,17 +21,22 @@ FLOPs per cycle:
   DP: {total: 8, ADD: 4, MUL: 4}
 
 compiler: !!omap
-- icc: -O3 -xAVX -fno-alias -qopenmp
-- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp
-- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm
+- icc: -O3 -xAVX -fno-alias -qopenmp -ffreestanding -nolib-inline
+- clang: -O3 -march=corei7-avx -mtune=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -ffreestanding
+- gcc: -O3 -march=corei7-avx -D_POSIX_C_SOURCE=200809L -fopenmp -lm -ffreestanding
 
+overlapping model:
+  ports: 
+    IACA: ['0', 0DV, '1', '2', '3', '4', '5']
+    OSACA: ['0', 0DV, '1', '2', '3', '4', '5']
+    LLVM-MCA: [SBDivider, SBFPDivider, SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]
+  performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
 non-overlapping model:
-  ports: [2D, 3D]
+  ports: 
+    IACA: [2D, 3D]
+    OSACA: [2D, 3D]
+    LLVM-MCA: [SBPort23]
   performance counter metric: T_nOL + T_L1L2 + T_L2L3 + T_L3MEM
-overlapping model:
-  ports: ['0', 0DV, '1', '2', '3', '4', '5']
-  performance counter metric: Max(UOPS_DISPATCHED_PORT_PORT_0:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_1:PMC[0-3],
-    UOPS_DISPATCHED_PORT_PORT_4:PMC[0-3], UOPS_DISPATCHED_PORT_PORT_5:PMC[0-3])
 
 cacheline size: 64 B
 memory hierarchy:
diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py
index aeb4b7acb6f62730f0896b10d6b8fa6d655c871d..754604f1ed3aa06fb4138fb46fc918801c51049b 100644
--- a/pystencils_tests/test_kerncraft_coupling.py
+++ b/pystencils_tests/test_kerncraft_coupling.py
@@ -42,9 +42,7 @@ def test_compilation():
 
 
 @pytest.mark.kerncraft
-def analysis(kernel, model='ecmdata'):
-    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
-    machine = MachineModel(path_to_yaml=machine_file_path)
+def analysis(kernel, machine, model='ecmdata'):
     if model == 'ecmdata':
         model = ECMData(kernel, machine, KerncraftParameters())
     elif model == 'ecm':
@@ -71,7 +69,7 @@ def test_3d_7pt_osaca():
     reference_kernel.set_constant('M', size[0])
     reference_kernel.set_constant('N', size[1])
     assert size[1] == size[2]
-    analysis(reference_kernel, model='ecm')
+    analysis(reference_kernel, machine_model, model='ecm')
 
     arr = np.zeros(size)
     a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
@@ -82,18 +80,22 @@ def test_3d_7pt_osaca():
     update_rule = Assignment(b[0, 0, 0], s * rhs)
     ast = create_kernel([update_rule])
     k = PyStencilsKerncraftKernel(ast, machine=machine_model)
-    analysis(k, model='ecm')
+    analysis(k, machine_model, model='ecm')
     assert reference_kernel._flops == k._flops
     # assert reference.results['cl throughput'] == analysis.results['cl throughput']
 
 
 @pytest.mark.kerncraft
 def test_2d_5pt():
+    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
+    machine = MachineModel(path_to_yaml=machine_file_path)
+
     size = [30, 50, 3]
     kernel_file_path = INPUT_FOLDER / "2d-5pt.c"
     with open(kernel_file_path) as kernel_file:
-        reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
-    reference = analysis(reference_kernel)
+        reference_kernel = KernelCode(kernel_file.read(), machine=machine, 
+                                      filename=kernel_file_path)
+    reference = analysis(reference_kernel, machine)
 
     arr = np.zeros(size)
     a = Field.create_from_numpy_array('a', arr, index_dimensions=1)
@@ -102,8 +104,8 @@ def test_2d_5pt():
     rhs = a[0, -1](0) + a[0, 1] + a[-1, 0] + a[1, 0]
     update_rule = Assignment(b[0, 0], s * rhs)
     ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast)
-    result = analysis(k)
+    k = PyStencilsKerncraftKernel(ast, machine)
+    result = analysis(k, machine)
 
     for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
         assert e1 == e2
@@ -111,14 +113,18 @@ def test_2d_5pt():
 
 @pytest.mark.kerncraft
 def test_3d_7pt():
+    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
+    machine = MachineModel(path_to_yaml=machine_file_path)
+
     size = [30, 50, 50]
     kernel_file_path = INPUT_FOLDER / "3d-7pt.c"
     with open(kernel_file_path) as kernel_file:
-        reference_kernel = KernelCode(kernel_file.read(), machine=None, filename=kernel_file_path)
+        reference_kernel = KernelCode(kernel_file.read(), machine=machine,
+                                      filename=kernel_file_path)
     reference_kernel.set_constant('M', size[0])
     reference_kernel.set_constant('N', size[1])
     assert size[1] == size[2]
-    reference = analysis(reference_kernel)
+    reference = analysis(reference_kernel, machine)
 
     arr = np.zeros(size)
     a = Field.create_from_numpy_array('a', arr, index_dimensions=0)
@@ -128,8 +134,8 @@ def test_3d_7pt():
 
     update_rule = Assignment(b[0, 0, 0], s * rhs)
     ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast)
-    result = analysis(k)
+    k = PyStencilsKerncraftKernel(ast, machine)
+    result = analysis(k, machine)
 
     for e1, e2 in zip(reference.results['cycles'], result.results['cycles']):
         assert e1 == e2
@@ -163,6 +169,9 @@ def test_benchmark():
 
 @pytest.mark.kerncraft
 def test_kerncraft_generic_field():
+    machine_file_path = INPUT_FOLDER / "Example_SandyBridgeEP_E5-2680.yml"
+    machine = MachineModel(path_to_yaml=machine_file_path)
+
     a = fields('a: double[3D]')
     b = fields('b: double[3D]')
     s = sp.Symbol("s")
@@ -170,4 +179,4 @@ def test_kerncraft_generic_field():
 
     update_rule = Assignment(b[0, 0, 0], s * rhs)
     ast = create_kernel([update_rule])
-    k = PyStencilsKerncraftKernel(ast, debug_print=True)
+    k = PyStencilsKerncraftKernel(ast, machine, debug_print=True)