Commit 30641109 authored by Michael Kuron

Merge branch 'FixBenchmarkGeneration' into 'master'

Fix benchmark generation

See merge request !223
parents 92806ae7 788d8507
Pipeline #30585 passed in 10 minutes and 56 seconds
@@ -10,8 +10,11 @@ from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
+from pystencils.integer_functions import modulo_ceil
from pystencils.sympyextensions import prod
+import numpy as np


def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    """Return C code of a benchmark program for the given kernel.
@@ -37,8 +40,30 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
            field = accessed_fields[p.field_name]
            dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
-            call_parameters.append(p.field_name)
+            np_dtype = get_base_type(p.symbol.dtype).numpy_dtype
+            size_data_type = np_dtype.itemsize
+
+            dim0_size = field.shape[-1]
+            dim1_size = np.prod(field.shape[:-1])
+            elements = prod(field.shape)
+
+            if ast.instruction_set:
+                align = ast.instruction_set['width'] * size_data_type
+                padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size
+                padding_bytes = padding_elements * size_data_type
+                ghost_layers = max(max(ast.ghost_layers))
+
+                size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type
+
+                assert align % np_dtype.itemsize == 0
+                offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type
+
+                fields.append((p.field_name, dtype, elements, size, offset, align))
+                call_parameters.append(p.field_name)
+            else:
+                size = elements * size_data_type
+                fields.append((p.field_name, dtype, elements, size, 0, 0))
+                call_parameters.append(p.field_name)

    header_list = get_headers(ast)
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
@@ -99,10 +124,10 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
    compiler_config = get_compiler_config()
    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
    compile_cmd += [*extra_flags,
-                    kerncraft_path / 'headers' / 'timing.c',
-                    kerncraft_path / 'headers' / 'dummy.c',
-                    path / 'bench.c',
-                    '-o', path / 'bench',
+                    str(kerncraft_path / 'headers' / 'timing.c'),
+                    str(kerncraft_path / 'headers' / 'dummy.c'),
+                    str(path / 'bench.c'),
+                    '-o', str(path / 'bench'),
                    ]
    run_compile_step(compile_cmd)
...
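For orientation, the size/offset arithmetic that the benchmark generator (the Python code above) now performs for vectorized kernels can be reproduced in isolation. The sketch below is purely illustrative: the field shape, SIMD width, and ghost-layer count are hypothetical example values, and modulo_ceil is re-implemented locally instead of being imported from pystencils.

import numpy as np

def modulo_ceil(x, divisor):
    # round x up to the next multiple of divisor (positive integers),
    # mirroring pystencils.integer_functions.modulo_ceil
    return x if x % divisor == 0 else x + divisor - x % divisor

# hypothetical example values, not taken from the commit
shape = (20, 22, 22)                      # field shape, innermost dimension last
itemsize = np.dtype('float64').itemsize   # 8 bytes per element
simd_width = 4                            # e.g. AVX: 4 doubles per vector register
ghost_layers = 1

dim0_size = shape[-1]                     # length of the contiguous innermost rows
dim1_size = int(np.prod(shape[:-1]))      # number of such rows
elements = int(np.prod(shape))

# pad every innermost row to a multiple of the SIMD width
padding_elements = modulo_ceil(dim0_size, simd_width) - dim0_size
padding_bytes = padding_elements * itemsize

align = simd_width * itemsize             # allocation alignment in bytes
size = dim1_size * padding_bytes + elements * itemsize   # bytes to allocate
offset = ((dim0_size + padding_elements + ghost_layers) % simd_width) * itemsize

print(padding_elements, align, size, offset)   # -> 2 32 84480 8

These four values correspond to the (elements, size, offset, align) entries that the generator appends to each field tuple and that the C template consumes below.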
#include "kerncraft.h" #include "kerncraft.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include <math.h> #include <math.h>
#include <stdio.h> #include <stdio.h>
#include <assert.h>
{{ includes }} {{ includes }}
...@@ -18,6 +18,43 @@ void dummy(void *); ...@@ -18,6 +18,43 @@ void dummy(void *);
void timing(double* wcTime, double* cpuTime); void timing(double* wcTime, double* cpuTime);
extern int var_false; extern int var_false;
/* see waLBerla src/field/allocation/AlignedMalloc */
void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
{
// With 0 alignment this function makes no sense
// use normal malloc instead
assert( alignment > 0 );
// Tests if alignment is power of two (assuming alignment>0)
assert( !(alignment & (alignment - 1)) );
assert( offset < alignment );
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
pa=std::malloc( (size+2*alignment-1 )+sizeof(void *));
if(!pa)
return nullptr;
// Find next aligned position, starting at pa+sizeof(void*)-1
ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
ptr=(void*) ( (char*)(ptr) + alignment - offset);
// Store pointer to real allocated chunk just before usable chunk
*((void **)ptr-1)=pa;
assert( ((size_t)ptr+offset) % alignment == 0 );
return ptr;
}
void aligned_free( void *ptr )
{
// assume that pointer to real allocated chunk is stored just before
// chunk that was given to user
if(ptr)
std::free(*((void **)ptr-1));
}
{{kernel_code}} {{kernel_code}}
...@@ -28,11 +65,14 @@ int main(int argc, char **argv) ...@@ -28,11 +65,14 @@ int main(int argc, char **argv)
likwid_markerInit(); likwid_markerInit();
{%- endif %} {%- endif %}
{%- for field_name, dataType, size in fields %} {%- for field_name, dataType, elements, size, offset, alignment in fields %}
// Initialization {{field_name}} // Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); {%- if alignment > 0 %}
for (unsigned long long i = 0; i < {{size}}; ++i) {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
{%- else %}
{{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
{%- endif %}
for (unsigned long long i = 0; i < {{elements}}; ++i)
{{field_name}}[i] = 0.23; {{field_name}}[i] = 0.23;
if(var_false) if(var_false)
...@@ -69,18 +109,18 @@ int main(int argc, char **argv) ...@@ -69,18 +109,18 @@ int main(int argc, char **argv)
likwid_markerStartRegion("loop"); likwid_markerStartRegion("loop");
{%- endif %} {%- endif %}
} }
{%- if timing %} {%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime); timing(&wcStartTime, &cpuStartTime);
{%- endif %} {%- endif %}
for (; repeat > 0; --repeat) for (; repeat > 0; --repeat)
{ {
{{kernelName}}({{call_argument_list}}); {{kernelName}}({{call_argument_list}});
// Dummy calls // Dummy calls
{%- for field_name, dataType, size in fields %} {%- for field_name, dataType, elements, size, offset, alignment in fields %}
if(var_false) dummy((void*){{field_name}}); if(var_false) dummy((void*){{field_name}});
{%- endfor %} {%- endfor %}
{%- for constantName, dataType in constants %} {%- for constantName, dataType in constants %}
...@@ -105,4 +145,13 @@ int main(int argc, char **argv) ...@@ -105,4 +145,13 @@ int main(int argc, char **argv)
{%- if likwid %} {%- if likwid %}
likwid_markerClose(); likwid_markerClose();
{%- endif %} {%- endif %}
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
{%- if alignment > 0 %}
aligned_free({{field_name}});
{%- else %}
delete[] {{field_name}};
{%- endif %}
{%- endfor %}
} }
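The pointer arithmetic of aligned_malloc_with_offset in the template above can be checked with plain integers: the function over-allocates by 2*alignment - 1 plus sizeof(void *) bytes, stores the raw malloc pointer directly in front of the returned address (which aligned_free later reads back), and shifts the returned address so that ptr + offset lands on an alignment boundary. The following sketch is illustrative only and mimics just the address computation with a made-up base address.

POINTER_SIZE = 8   # sizeof(void *) on a 64-bit machine

def aligned_address_with_offset(base_address, alignment, offset):
    # base_address plays the role of the pointer returned by malloc
    assert alignment > 0 and (alignment & (alignment - 1)) == 0   # power of two
    assert offset < alignment
    # next aligned position after base_address + sizeof(void *)
    ptr = (base_address + POINTER_SIZE + alignment - 1) & ~(alignment - 1)
    # shift so that (ptr + offset) sits on the alignment boundary
    ptr += alignment - offset
    assert (ptr + offset) % alignment == 0
    return ptr

# made-up malloc result, 32-byte alignment, 8-byte offset
ptr = aligned_address_with_offset(0x7f31a2c01237, 32, 8)
print(hex(ptr), (ptr + 8) % 32)   # -> 0x7f31a2c01258 0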
...
@@ -7,8 +7,11 @@ from kerncraft.kernel import KernelCode
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, ECMData, Benchmark

+import pystencils as ps
from pystencils import Assignment, Field
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.cpu import create_kernel
+from pystencils.datahandling import create_data_handling
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
from pystencils.timeloop import TimeLoop
@@ -170,3 +173,26 @@ def test_benchmark():
    timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)

    np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)
+
+
+@pytest.mark.kerncraft
+def test_benchmark_vectorized():
+    instruction_sets = get_supported_instruction_sets()
+    if not instruction_sets:
+        pytest.skip("cannot detect CPU instruction set")
+    for vec in instruction_sets:
+        dh = create_data_handling((20, 20, 20), periodicity=True)
+
+        width = get_vector_instruction_set(instruction_set=vec)['width'] * 8
+        a = dh.add_array("a", values_per_cell=1, alignment=width)
+        b = dh.add_array("b", values_per_cell=1, alignment=width)
+
+        rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
+        update_rule = Assignment(b[0, 0, 0], rhs)
+
+        opt = {'instruction_set': vec, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True}
+        ast = ps.create_kernel(update_rule, cpu_vectorize_info=opt)
+
+        run_c_benchmark(ast, 5)
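The new test drives exactly this path: aligned arrays plus a vectorized kernel make generate_benchmark emit the six-element field tuples consumed by the C template. As a rough illustration of what the allocation block of the template then expands to, the sketch below renders a trimmed-down copy of that block with two hypothetical field tuples (the numbers are invented, not produced by the test).

from jinja2 import Template

# trimmed-down copy of the allocation block from the benchmark template above
allocation_block = Template("""
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
// Initialization {{field_name}}
{%- if alignment > 0 %}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
{%- else %}
{{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
{%- endif %}
{%- endfor %}
""")

# hypothetical (field_name, dtype, elements, size, offset, alignment) tuples
fields = [
    ("a", "double", 9680, 84480, 8, 32),   # vectorized kernel: aligned allocation
    ("b", "double", 9680, 77440, 0, 0),    # scalar kernel: plain new[]
]
print(allocation_block.render(fields=fields))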