diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py
index 9a012d6c2a75c98faae05e6815dd3883c7d4d2e4..6e69ef6bc42e775fc3dda60154061fc465fbc83f 100644
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
@@ -10,8 +10,11 @@ from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
 from pystencils.data_types import get_base_type
 from pystencils.include import get_pystencils_include_path
+from pystencils.integer_functions import modulo_ceil
 from pystencils.sympyextensions import prod
 
+import numpy as np
+
 
 def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
     """Return C code of a benchmark program for the given kernel.
@@ -37,8 +40,30 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
             assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
             field = accessed_fields[p.field_name]
             dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
-            call_parameters.append(p.field_name)
+            np_dtype = get_base_type(p.symbol.dtype).numpy_dtype
+            size_data_type = np_dtype.itemsize
+
+            dim0_size = field.shape[-1]
+            dim1_size = np.prod(field.shape[:-1])
+            elements = prod(field.shape)
+
+            if ast.instruction_set:
+                align = ast.instruction_set['width'] * size_data_type
+                padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size
+                padding_bytes = padding_elements * size_data_type
+                ghost_layers = max(max(ast.ghost_layers))
+
+                size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type
+
+                assert align % np_dtype.itemsize == 0
+                offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type
+
+                fields.append((p.field_name, dtype, elements, size, offset, align))
+                call_parameters.append(p.field_name)
+            else:
+                size = elements * size_data_type
+                fields.append((p.field_name, dtype, elements, size, 0, 0))
+                call_parameters.append(p.field_name)
 
     header_list = get_headers(ast)
     includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
@@ -99,10 +124,10 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
     compiler_config = get_compiler_config()
     compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
     compile_cmd += [*extra_flags,
-                    kerncraft_path / 'headers' / 'timing.c',
-                    kerncraft_path / 'headers' / 'dummy.c',
-                    path / 'bench.c',
-                    '-o', path / 'bench',
+                    str(kerncraft_path / 'headers' / 'timing.c'),
+                    str(kerncraft_path / 'headers' / 'dummy.c'),
+                    str(path / 'bench.c'),
+                    '-o', str(path / 'bench'),
                     ]
     run_compile_step(compile_cmd)
 
diff --git a/pystencils/kerncraft_coupling/templates/benchmark.c b/pystencils/kerncraft_coupling/templates/benchmark.c
index ae70ddd6775a45c0709e95d57cef061da2a4b6b0..79daaffd9320f9d87e4ffaad8333e1e11a232ff5 100644
--- a/pystencils/kerncraft_coupling/templates/benchmark.c
+++ b/pystencils/kerncraft_coupling/templates/benchmark.c
@@ -1,10 +1,10 @@
-
 #include "kerncraft.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <math.h>
 #include <stdio.h>
+#include <assert.h>
 
 {{ includes }}
 
@@ -18,6 +18,43 @@ void dummy(void *);
 void timing(double* wcTime, double* cpuTime);
 extern int var_false;
 
+/* Allocates size bytes so that (returned pointer + offset) is a multiple of alignment; release with aligned_free(). See waLBerla src/field/allocation/AlignedMalloc */
+void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
+{
+    // Preconditions are checked with assert only. With 0 alignment this
+    // function makes no sense, use normal malloc instead
+    assert( alignment > 0 );
+    // Tests if alignment is power of two (assuming alignment>0)
+    assert( !(alignment & (alignment - 1)) );
+    assert( offset < alignment );  // offset is a byte count smaller than the alignment
+
+    void *pa;  // pointer to allocated memory
+    void *ptr; // pointer to usable aligned memory
+
+    pa=std::malloc( (size+2*alignment-1 )+sizeof(void *));  // slack for round-up and shift below, plus one stored pointer
+    if(!pa)
+        return nullptr;  // allocation failed; NOTE(review): std::/nullptr need a C++ compiler although the template is named benchmark.c - confirm
+
+    // Round pa+sizeof(void*) up to the next multiple of alignment, then shift so that ptr+offset is aligned
+    ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
+    ptr=(void*) ( (char*)(ptr) + alignment - offset);
+
+    // Store pointer to real allocated chunk just before usable chunk (aligned_free reads it back)
+    *((void **)ptr-1)=pa;  // NOTE(review): may be an unaligned store if offset is not a multiple of sizeof(void*) - confirm this is acceptable
+
+    assert( ((size_t)ptr+offset) % alignment == 0 );
+
+    return ptr;
+}
+
+void aligned_free( void *ptr )
+{
+    // Releases memory obtained from aligned_malloc_with_offset: the pointer to the
+    // real allocated chunk is stored just before the chunk that was given to the user
+    if(ptr)  // a null pointer is ignored
+        std::free(*((void **)ptr-1));
+}
+
 
 {{kernel_code}}
 
@@ -28,11 +65,14 @@ int main(int argc, char **argv)
   likwid_markerInit();
   {%- endif %}
 
-  {%- for field_name, dataType, size in fields %}
-
+  {%- for field_name, dataType, elements, size, offset, alignment in fields %}
   // Initialization {{field_name}}
-  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
-  for (unsigned long long i = 0; i < {{size}}; ++i)
+  {%- if alignment > 0 %}
+  {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
+  {%- else %}
+  {{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
+  {%- endif %}
+  for (unsigned long long i = 0; i < {{elements}}; ++i)
     {{field_name}}[i] = 0.23;
 
   if(var_false)
@@ -69,18 +109,18 @@ int main(int argc, char **argv)
       likwid_markerStartRegion("loop");
       {%- endif %}
     }
-    
+
     {%- if timing %}
     double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
     timing(&wcStartTime, &cpuStartTime);
     {%- endif %}
-    
+
     for (; repeat > 0; --repeat)
     {
       {{kernelName}}({{call_argument_list}});
 
       // Dummy calls
-      {%- for field_name, dataType, size in fields %}
+      {%- for field_name, dataType, elements, size, offset, alignment in fields %}
       if(var_false) dummy((void*){{field_name}});
       {%- endfor %}
       {%- for constantName, dataType in constants %}
@@ -105,4 +145,13 @@ int main(int argc, char **argv)
   {%- if likwid %}
   likwid_markerClose();
   {%- endif %}
+
+  {%- for field_name, dataType, elements, size, offset, alignment in fields %}
+  {%- if alignment > 0 %}
+  aligned_free({{field_name}});
+  {%- else %}
+  delete[] {{field_name}};
+  {%- endif %}
+
+  {%- endfor %}
 }
diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py
index 0f282fd11108bc2b82cfa568524e7d9b05f5530b..5faa292bea644731c3d23b0dc436f6b1ea75564b 100644
--- a/pystencils_tests/test_kerncraft_coupling.py
+++ b/pystencils_tests/test_kerncraft_coupling.py
@@ -7,8 +7,11 @@ from kerncraft.kernel import KernelCode
 from kerncraft.machinemodel import MachineModel
 from kerncraft.models import ECM, ECMData, Benchmark
 
+import pystencils as ps
 from pystencils import Assignment, Field
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.cpu import create_kernel
+from pystencils.datahandling import create_data_handling
 from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
 from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
 from pystencils.timeloop import TimeLoop
@@ -170,3 +173,26 @@ def test_benchmark():
     timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)
 
     np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)
+
+
+@pytest.mark.kerncraft
+def test_benchmark_vectorized():  # builds a vectorized kernel per instruction set and hands it to run_c_benchmark
+    instruction_sets = get_supported_instruction_sets()
+    if not instruction_sets:
+        pytest.skip("cannot detect CPU instruction set")
+
+    for vec in instruction_sets:
+        dh = create_data_handling((20, 20, 20), periodicity=True)
+
+        width = get_vector_instruction_set(instruction_set=vec)['width'] * 8  # presumably bytes (8 per double) - TODO confirm
+
+        a = dh.add_array("a", values_per_cell=1, alignment=width)
+        b = dh.add_array("b", values_per_cell=1, alignment=width)
+
+        rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
+        update_rule = Assignment(b[0, 0, 0], rhs)  # b = sum of the six axis-neighbours of a
+
+        opt = {'instruction_set': vec, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True}
+        ast = ps.create_kernel(update_rule, cpu_vectorize_info=opt)
+
+        run_c_benchmark(ast, 5)  # 5 is inner_iterations; the result is not asserted