diff --git a/pystencils/kerncraft_coupling/generate_benchmark.py b/pystencils/kerncraft_coupling/generate_benchmark.py index 9a012d6c2a75c98faae05e6815dd3883c7d4d2e4..6e69ef6bc42e775fc3dda60154061fc465fbc83f 100644 --- a/pystencils/kerncraft_coupling/generate_benchmark.py +++ b/pystencils/kerncraft_coupling/generate_benchmark.py @@ -10,8 +10,11 @@ from pystencils.backends.cbackend import generate_c, get_headers from pystencils.cpu.cpujit import get_compiler_config, run_compile_step from pystencils.data_types import get_base_type from pystencils.include import get_pystencils_include_path +from pystencils.integer_functions import modulo_ceil from pystencils.sympyextensions import prod +import numpy as np + def generate_benchmark(ast, likwid=False, openmp=False, timing=False): """Return C code of a benchmark program for the given kernel. @@ -37,8 +40,30 @@ def generate_benchmark(ast, likwid=False, openmp=False, timing=False): assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size" field = accessed_fields[p.field_name] dtype = str(get_base_type(p.symbol.dtype)) - fields.append((p.field_name, dtype, prod(field.shape))) - call_parameters.append(p.field_name) + np_dtype = get_base_type(p.symbol.dtype).numpy_dtype + size_data_type = np_dtype.itemsize + + dim0_size = field.shape[-1] + dim1_size = np.prod(field.shape[:-1]) + elements = prod(field.shape) + + if ast.instruction_set: + align = ast.instruction_set['width'] * size_data_type + padding_elements = modulo_ceil(dim0_size, ast.instruction_set['width']) - dim0_size + padding_bytes = padding_elements * size_data_type + ghost_layers = max(max(ast.ghost_layers)) + + size = dim1_size * padding_bytes + np.prod(field.shape) * size_data_type + + assert align % np_dtype.itemsize == 0 + offset = ((dim0_size + padding_elements + ghost_layers) % ast.instruction_set['width']) * size_data_type + + fields.append((p.field_name, dtype, elements, size, offset, align)) + 
/*
 * Allocate `size` usable bytes such that (returned_pointer + offset) is a
 * multiple of `alignment`.
 *
 * Adapted from waLBerla src/field/allocation/AlignedMalloc.
 *
 * Parameters:
 *   size      - number of usable bytes requested
 *   alignment - required alignment in bytes; must be a power of two and > 0
 *   offset    - byte offset (strictly less than alignment) at which the
 *               aligned address must occur, counted from the returned pointer
 *
 * Returns the usable pointer, or NULL if the underlying malloc fails.
 * The result must be released with aligned_free(), never with free().
 *
 * Only plain C library calls are used (malloc/NULL/free) so that the
 * generated benchmark builds with a C compiler as well as a C++ compiler.
 */
void *aligned_malloc_with_offset( uint64_t size, uint64_t alignment, uint64_t offset )
{
    // With 0 alignment this function makes no sense
    // use normal malloc instead
    assert( alignment > 0 );
    // Tests if alignment is power of two (assuming alignment>0)
    assert( !(alignment & (alignment - 1)) );
    assert( offset < alignment );

    void *pa;  // pointer to allocated memory
    void *ptr; // pointer to usable aligned memory

    // Over-allocate: room for the hidden back-pointer, for rounding up to the
    // next aligned address, and for shifting by (alignment - offset).
    pa = malloc( (size + 2 * alignment - 1) + sizeof(void *) );
    if( !pa )
        return NULL;

    // Find next aligned position, starting at pa+sizeof(void*)-1
    ptr = (void *)( ((uintptr_t)pa + sizeof(void *) + alignment - 1) & ~(uintptr_t)(alignment - 1) );
    // Shift so that ptr + offset (not ptr itself) lands on an aligned address
    ptr = (void *)( (char *)ptr + alignment - offset );

    // Store pointer to real allocated chunk just before usable chunk.
    // NOTE(review): if offset is not a multiple of sizeof(void *), this slot is
    // not pointer-aligned; that is tolerated on x86 but is not strictly portable.
    *((void **)ptr - 1) = pa;

    assert( ((uintptr_t)ptr + offset) % alignment == 0 );

    return ptr;
}

/*
 * Release memory obtained from aligned_malloc_with_offset().
 * Passing NULL is a no-op.
 */
void aligned_free( void *ptr )
{
    // assume that pointer to real allocated chunk is stored just before
    // chunk that was given to user
    if( ptr )
        free( *((void **)ptr - 1) );
}
@pytest.mark.kerncraft
def test_benchmark_vectorized():
    """Build and run the generated C benchmark once per supported SIMD instruction set.

    The test is skipped when no instruction set can be detected. For every
    detected set, a six-point neighbour-sum kernel on a periodic 20x20x20
    domain is vectorized and handed to ``run_c_benchmark``; the test passes
    as long as that call does not raise.
    """
    supported_sets = get_supported_instruction_sets()
    if not supported_sets:
        pytest.skip("cannot detect CPU instruction set")

    for instruction_set in supported_sets:
        data_handling = create_data_handling((20, 20, 20), periodicity=True)

        # vector width (in elements) times 8 -- presumably bytes per double; confirm
        vector_info = get_vector_instruction_set(instruction_set=instruction_set)
        alignment = vector_info['width'] * 8

        a = data_handling.add_array("a", values_per_cell=1, alignment=alignment)
        b = data_handling.add_array("b", values_per_cell=1, alignment=alignment)

        neighbour_sum = (
            a[0, -1, 0] + a[0, 1, 0]
            + a[-1, 0, 0] + a[1, 0, 0]
            + a[0, 0, -1] + a[0, 0, 1]
        )
        stencil_update = Assignment(b[0, 0, 0], neighbour_sum)

        vectorize_options = {
            'instruction_set': instruction_set,
            'assume_aligned': True,
            'nontemporal': True,
            'assume_inner_stride_one': True,
        }
        kernel_ast = ps.create_kernel(stencil_update, cpu_vectorize_info=vectorize_options)

        run_c_benchmark(kernel_ast, 5)