Commit 30641109 authored by Michael Kuron

Merge branch 'FixBenchmarkGeneration' into 'master'

Fix benchmark generation

See merge request !223
parents 92806ae7 788d8507
Pipeline #30585 passed in 10 minutes and 56 seconds
@@ -10,8 +10,11 @@ from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
from pystencils.data_types import get_base_type
from pystencils.include import get_pystencils_include_path
+from pystencils.integer_functions import modulo_ceil
from pystencils.sympyextensions import prod
+import numpy as np


def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
    """Return C code of a benchmark program for the given kernel.
@@ -37,8 +40,30 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
            field = accessed_fields[p.field_name]
            dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
-            call_parameters.append(p.field_name)
+            np_dtype = get_base_type(p.symbol.dtype).numpy_dtype
+            size_data_type = np_dtype.itemsize
+
+            dim0_size = field.shape[-1]
+            dim1_size = np.prod(field.shape[:-1])
+            elements = prod(field.shape)
+
+            if ast.instruction_set:
+                align = ast.instruction_set['width'] * size_data_type
+                padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size
+                padding_bytes = padding_elements * size_data_type
+                ghost_layers = max(max(ast.ghost_layers))
+
+                size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type
+
+                assert align % np_dtype.itemsize == 0
+                offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type
+
+                fields.append((p.field_name, dtype, elements, size, offset, align))
+                call_parameters.append(p.field_name)
+            else:
+                size = elements * size_data_type
+                fields.append((p.field_name, dtype, elements, size, 0, 0))
+                call_parameters.append(p.field_name)

    header_list = get_headers(ast)
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
@@ -99,10 +124,10 @@ def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
    compiler_config = get_compiler_config()
    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
    compile_cmd += [*extra_flags,
-                    kerncraft_path / 'headers' / 'timing.c',
-                    kerncraft_path / 'headers' / 'dummy.c',
-                    path / 'bench.c',
-                    '-o', path / 'bench',
+                    str(kerncraft_path / 'headers' / 'timing.c'),
+                    str(kerncraft_path / 'headers' / 'dummy.c'),
+                    str(path / 'bench.c'),
+                    '-o', str(path / 'bench'),
                    ]
    run_compile_step(compile_cmd)
...
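For orientation, the size/offset arithmetic that the benchmark generator (the Python code above) now performs for vectorized kernels can be reproduced in isolation. The sketch below is purely illustrative: the field shape, SIMD width, and ghost-layer count are hypothetical example values, and modulo_ceil is re-implemented locally instead of being imported from pystencils.

import numpy as np

def modulo_ceil(x, divisor):
    # round x up to the next multiple of divisor (positive integers),
    # mirroring pystencils.integer_functions.modulo_ceil
    return x if x % divisor == 0 else x + divisor - x % divisor

# hypothetical example values, not taken from the commit
shape = (20, 22, 22)                      # field shape, innermost dimension last
itemsize = np.dtype('float64').itemsize   # 8 bytes per element
simd_width = 4                            # e.g. AVX: 4 doubles per vector register
ghost_layers = 1

dim0_size = shape[-1]                     # length of the contiguous innermost rows
dim1_size = int(np.prod(shape[:-1]))      # number of such rows
elements = int(np.prod(shape))

# pad every innermost row to a multiple of the SIMD width
padding_elements = modulo_ceil(dim0_size, simd_width) - dim0_size
padding_bytes = padding_elements * itemsize

align = simd_width * itemsize             # allocation alignment in bytes
size = dim1_size * padding_bytes + elements * itemsize   # bytes to allocate
offset = ((dim0_size + padding_elements + ghost_layers) % simd_width) * itemsize

print(padding_elements, align, size, offset)   # -> 2 32 84480 8

These four values correspond to the (elements, size, offset, align) entries that the generator appends to each field tuple and that the C template consumes below.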
#include "kerncraft.h" #include "kerncraft.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include <math.h> #include <math.h>
#include <stdio.h> #include <stdio.h>
#include <assert.h>
{{ includes }} {{ includes }}
...@@ -18,6 +18,43 @@ void dummy(void *); ...@@ -18,6 +18,43 @@ void dummy(void *);
void timing(double* wcTime, double* cpuTime); void timing(double* wcTime, double* cpuTime);
extern int var_false; extern int var_false;
/* see waLBerla src/field/allocation/AlignedMalloc */
void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
{
// With 0 alignment this function makes no sense
// use normal malloc instead
assert( alignment > 0 );
// Tests if alignment is power of two (assuming alignment>0)
assert( !(alignment & (alignment - 1)) );
assert( offset < alignment );
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
pa=std::malloc( (size+2*alignment-1 )+sizeof(void *));
if(!pa)
return nullptr;
// Find next aligned position, starting at pa+sizeof(void*)-1
ptr=(void*)( ((size_t)pa+sizeof(void *)+alignment-1) & ~(alignment-1));
ptr=(void*) ( (char*)(ptr) + alignment - offset);
// Store pointer to real allocated chunk just before usable chunk
*((void **)ptr-1)=pa;
assert( ((size_t)ptr+offset) % alignment == 0 );
return ptr;
}
void aligned_free( void *ptr )
{
// assume that pointer to real allocated chunk is stored just before
// chunk that was given to user
if(ptr)
std::free(*((void **)ptr-1));
}
{{kernel_code}} {{kernel_code}}
...@@ -28,11 +65,14 @@ int main(int argc, char **argv) ...@@ -28,11 +65,14 @@ int main(int argc, char **argv)
likwid_markerInit(); likwid_markerInit();
{%- endif %} {%- endif %}
{%- for field_name, dataType, size in fields %} {%- for field_name, dataType, elements, size, offset, alignment in fields %}
// Initialization {{field_name}} // Initialization {{field_name}}
double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64); {%- if alignment > 0 %}
for (unsigned long long i = 0; i < {{size}}; ++i) {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
{%- else %}
{{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
{%- endif %}
for (unsigned long long i = 0; i < {{elements}}; ++i)
{{field_name}}[i] = 0.23; {{field_name}}[i] = 0.23;
if(var_false) if(var_false)
...@@ -69,18 +109,18 @@ int main(int argc, char **argv) ...@@ -69,18 +109,18 @@ int main(int argc, char **argv)
likwid_markerStartRegion("loop"); likwid_markerStartRegion("loop");
{%- endif %} {%- endif %}
} }
{%- if timing %} {%- if timing %}
double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime; double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
timing(&wcStartTime, &cpuStartTime); timing(&wcStartTime, &cpuStartTime);
{%- endif %} {%- endif %}
for (; repeat > 0; --repeat) for (; repeat > 0; --repeat)
{ {
{{kernelName}}({{call_argument_list}}); {{kernelName}}({{call_argument_list}});
// Dummy calls // Dummy calls
{%- for field_name, dataType, size in fields %} {%- for field_name, dataType, elements, size, offset, alignment in fields %}
if(var_false) dummy((void*){{field_name}}); if(var_false) dummy((void*){{field_name}});
{%- endfor %} {%- endfor %}
{%- for constantName, dataType in constants %} {%- for constantName, dataType in constants %}
...@@ -105,4 +145,13 @@ int main(int argc, char **argv) ...@@ -105,4 +145,13 @@ int main(int argc, char **argv)
{%- if likwid %} {%- if likwid %}
likwid_markerClose(); likwid_markerClose();
{%- endif %} {%- endif %}
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
{%- if alignment > 0 %}
aligned_free({{field_name}});
{%- else %}
delete[] {{field_name}};
{%- endif %}
{%- endfor %}
} }
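The pointer arithmetic of aligned_malloc_with_offset in the template above can be checked with plain integers: the function over-allocates by 2*alignment - 1 plus sizeof(void *) bytes, stores the raw malloc pointer directly in front of the returned address (which aligned_free later reads back), and shifts the returned address so that ptr + offset lands on an alignment boundary. The following sketch is illustrative only and mimics just the address computation with a made-up base address.

POINTER_SIZE = 8   # sizeof(void *) on a 64-bit machine

def aligned_address_with_offset(base_address, alignment, offset):
    # base_address plays the role of the pointer returned by malloc
    assert alignment > 0 and (alignment & (alignment - 1)) == 0   # power of two
    assert offset < alignment
    # next aligned position after base_address + sizeof(void *)
    ptr = (base_address + POINTER_SIZE + alignment - 1) & ~(alignment - 1)
    # shift so that (ptr + offset) sits on the alignment boundary
    ptr += alignment - offset
    assert (ptr + offset) % alignment == 0
    return ptr

# made-up malloc result, 32-byte alignment, 8-byte offset
ptr = aligned_address_with_offset(0x7f31a2c01237, 32, 8)
print(hex(ptr), (ptr + 8) % 32)   # -> 0x7f31a2c01258 0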
...
@@ -7,8 +7,11 @@ from kerncraft.kernel import KernelCode
from kerncraft.machinemodel import MachineModel
from kerncraft.models import ECM, ECMData, Benchmark

+import pystencils as ps
from pystencils import Assignment, Field
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.cpu import create_kernel
+from pystencils.datahandling import create_data_handling
from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
from pystencils.timeloop import TimeLoop
@@ -170,3 +173,26 @@ def test_benchmark():
    timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)

    np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)
+
+
+@pytest.mark.kerncraft
+def test_benchmark_vectorized():
+    instruction_sets = get_supported_instruction_sets()
+    if not instruction_sets:
+        pytest.skip("cannot detect CPU instruction set")
+    for vec in instruction_sets:
+        dh = create_data_handling((20, 20, 20), periodicity=True)
+
+        width = get_vector_instruction_set(instruction_set=vec)['width'] * 8
+        a = dh.add_array("a", values_per_cell=1, alignment=width)
+        b = dh.add_array("b", values_per_cell=1, alignment=width)
+
+        rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
+        update_rule = Assignment(b[0, 0, 0], rhs)
+
+        opt = {'instruction_set': vec, 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True}
+        ast = ps.create_kernel(update_rule, cpu_vectorize_info=opt)
+
+        run_c_benchmark(ast, 5)
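The new test drives exactly this path: aligned arrays plus a vectorized kernel make generate_benchmark emit the six-element field tuples consumed by the C template. As a rough illustration of what the allocation block of the template then expands to, the sketch below renders a trimmed-down copy of that block with two hypothetical field tuples (the numbers are invented, not produced by the test).

from jinja2 import Template

# trimmed-down copy of the allocation block from the benchmark template above
allocation_block = Template("""
{%- for field_name, dataType, elements, size, offset, alignment in fields %}
// Initialization {{field_name}}
{%- if alignment > 0 %}
{{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc_with_offset({{size}}, {{alignment}}, {{offset}});
{%- else %}
{{dataType}} * {{field_name}} = new {{dataType}}[{{elements}}];
{%- endif %}
{%- endfor %}
""")

# hypothetical (field_name, dtype, elements, size, offset, alignment) tuples
fields = [
    ("a", "double", 9680, 84480, 8, 32),   # vectorized kernel: aligned allocation
    ("b", "double", 9680, 77440, 0, 0),    # scalar kernel: plain new[]
]
print(allocation_block.render(fields=fields))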