f3e81539 · f3e81539 · f3e81539 · f3e81539 · f3e81539 · f3e81539
--- a/pystencils/include/PyStencilsField.h
+++ b/pystencils/include/PyStencilsField.h
-#pragma once
-
-extern "C++" {
-#ifdef __CUDA_ARCH__
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  DTYPE_T shape[DIMENSION];
-  DTYPE_T stride[DIMENSION];
-};
-#else
-#include <array>
-
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  std::array<DTYPE_T, DIMENSION> shape;
-  std::array<DTYPE_T, DIMENSION> stride;
-};
-#endif
-}
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
-#if !defined(__AES__) || !defined(__SSE4_1__)
-#error AES-NI and SSE4.1 need to be enabled
-#endif
-
-#include <emmintrin.h> // SSE2
-#include <wmmintrin.h> // AES
-#ifdef __AVX512VL__
-#include <immintrin.h> // AVX*
-#else
-#include <smmintrin.h>  // SSE4
-#ifdef __FMA__
-#include <immintrin.h> // FMA
-#endif
-#endif
-#include <cstdint>
-
-#define QUALIFIERS inline
-#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
-#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
-
-typedef std::uint32_t uint32;
-typedef std::uint64_t uint64;
-
-QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
-    __m128i x = _mm_xor_si128(k, in);
-    x = _mm_aesenc_si128(x, k);     // 1
-    x = _mm_aesenc_si128(x, k);     // 2
-    x = _mm_aesenc_si128(x, k);     // 3
-    x = _mm_aesenc_si128(x, k);     // 4
-    x = _mm_aesenc_si128(x, k);     // 5
-    x = _mm_aesenc_si128(x, k);     // 6
-    x = _mm_aesenc_si128(x, k);     // 7
-    x = _mm_aesenc_si128(x, k);     // 8
-    x = _mm_aesenc_si128(x, k);     // 9
-    x = _mm_aesenclast_si128(x, k); // 10
-    return x;
-}
-
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    __m128i xH = _mm_srli_epi64(x, 32);
-    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
-    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
-    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
-    return _mm_add_pd(f, _mm_castsi128_pd(xL));
-#endif
-}
-
-
-QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
-                              double & rnd1, double & rnd2)
-{
-    // pack input and call AES
-    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
-    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
-    c128 = aesni1xm128i(c128, k128);
-
-    // convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
-    __m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
-    __m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
-    y = _mm_srli_si128(y, 4);
-
-    // calculate z = x ^ y << (53 - 32))
-    __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
-    z = _mm_xor_si128(x, z);
-
-    // convert uint64 to double
-    __m128d rs = _my_cvtepu64_pd(z);
-    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
-#ifdef __FMA__
-    rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
-#else
-    rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
-    rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
-#endif
-
-    // store result
-    alignas(16) double rr[2];
-    _mm_store_pd(rr, rs);
-    rnd1 = rr[0];
-    rnd2 = rr[1];
-}
-
-
-QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
-                             float & rnd1, float & rnd2, float & rnd3, float & rnd4)
-{
-    // pack input and call AES
-    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
-    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
-    c128 = aesni1xm128i(c128, k128);
-
-    // convert uint32 to float
-    __m128 rs = _my_cvtepu32_ps(c128);
-    // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
-#ifdef __FMA__
-    rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
-#else
-    rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
-    rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
-#endif
-
-    // store result
-    alignas(16) float r[4];
-    _mm_store_ps(r, rs);
-    rnd1 = r[0];
-    rnd2 = r[1];
-    rnd3 = r[2];
-    rnd4 = r[3];
-}
-
--- a/pystencils/include/cuda_complex.hpp
+++ b/pystencils/include/cuda_complex.hpp
--- a/pystencils/include/opencl_stdint.h
+++ b/pystencils/include/opencl_stdint.h
-#ifndef OPENCL_STDINT
-#define OPENCL_STDINT
-
-typedef unsigned int      uint_t;
-
-typedef signed char       int8_t;
-typedef signed short      int16_t;
-typedef signed int        int32_t;
-typedef signed long int   int64_t;
-typedef unsigned char     uint8_t;
-typedef unsigned short    uint16_t;
-typedef unsigned int      uint32_t;
-typedef unsigned long int uint64_t;
-
-#endif
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
-#include <cstdint>
-
-#ifndef __CUDA_ARCH__
-#define QUALIFIERS inline
-#else
-#define QUALIFIERS static __forceinline__ __device__
-#endif
-
-#define PHILOX_W32_0   (0x9E3779B9)
-#define PHILOX_W32_1   (0xBB67AE85)
-#define PHILOX_M4x32_0 (0xD2511F53)
-#define PHILOX_M4x32_1 (0xCD9E8D57)
-#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
-#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
-
-typedef std::uint32_t uint32;
-typedef std::uint64_t uint64;
-
-
-QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
-{
-#ifndef __CUDA_ARCH__
-    // host code
-    uint64 product = ((uint64)a) * ((uint64)b);
-    *hip = product >> 32;
-    return (uint32)product;
-#else
-    // device code
-    *hip = __umulhi(a,b);
-    return a*b;
-#endif
-}
-
-QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
-{
-    uint32 hi0;
-    uint32 hi1;
-    uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
-    uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
-
-    ctr[0] = hi1^ctr[1]^key[0];
-    ctr[1] = lo1;
-    ctr[2] = hi0^ctr[3]^key[1];
-    ctr[3] = lo0;
-}
-
-QUALIFIERS void _philox4x32bumpkey(uint32* key)
-{
-    key[0] += PHILOX_W32_0;
-    key[1] += PHILOX_W32_1;
-}
-
-QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
-{
-    uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
-    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
-}
-
-
-QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1, double & rnd1, double & rnd2)
-{
-    uint32 key[2] = {key0, key1};
-    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _philox4x32round(ctr, key);                           // 1
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
-
-    rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
-    rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
-}
-
-
-
-QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
-                              float & rnd1, float & rnd2, float & rnd3, float & rnd4)
-{
-    uint32 key[2] = {key0, key1};
-    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _philox4x32round(ctr, key);                           // 1
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
-
-    rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-}
\ No newline at end of file
--- a/pystencils/interpolation_astnodes.py
+++ b/pystencils/interpolation_astnodes.py
--- a/pystencils/kerncraft_coupling/__init__.py
+++ b/pystencils/kerncraft_coupling/__init__.py
-from .generate_benchmark import generate_benchmark, run_c_benchmark
-from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
-
-__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
-import subprocess
-import warnings
-import tempfile
-from pathlib import Path
-
-from jinja2 import Environment, PackageLoader, StrictUndefined
-
-from pystencils.astnodes import PragmaBlock
-from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.cpu.cpujit import get_compiler_config, run_compile_step
-from pystencils.data_types import get_base_type
-from pystencils.include import get_pystencils_include_path
-from pystencils.sympyextensions import prod
-
-
-def generate_benchmark(ast, likwid=False, openmp=False, timing=False):
-    """Return C code of a benchmark program for the given kernel.
-
-    Args:
-        ast: the pystencils AST object as returned by create_kernel
-        likwid: if True likwid markers are added to the code
-        openmp: relevant only if likwid=True, to generated correct likwid initialization code
-        timing: add timing output to the code, prints time per iteration to stdout
-
-    Returns:
-        C code as string
-    """
-    accessed_fields = {f.name: f for f in ast.fields_accessed}
-    constants = []
-    fields = []
-    call_parameters = []
-    for p in ast.get_parameters():
-        if not p.is_field_parameter:
-            constants.append((p.symbol.name, str(p.symbol.dtype)))
-            call_parameters.append(p.symbol.name)
-        else:
-            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
-            field = accessed_fields[p.field_name]
-            dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
-            call_parameters.append(p.field_name)
-
-    header_list = get_headers(ast)
-    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-    # Strip "#pragma omp parallel" from within kernel, because main function takes care of that
-    # when likwid and openmp are enabled
-    if likwid and openmp:
-        if len(ast.body.args) > 0 and isinstance(ast.body.args[0], PragmaBlock):
-            ast.body.args[0].pragma_line = ''
-
-    jinja_context = {
-        'likwid': likwid,
-        'openmp': openmp,
-        'kernel_code': generate_c(ast, dialect='c'),
-        'kernelName': ast.function_name,
-        'fields': fields,
-        'constants': constants,
-        'call_argument_list': ",".join(call_parameters),
-        'includes': includes,
-        'timing': timing,
-    }
-
-    env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-
-    return env.get_template('benchmark.c').render(**jinja_context)
-
-
-def run_c_benchmark(ast, inner_iterations, outer_iterations=3, path=None):
-    """Runs the given kernel with outer loop in C
-
-    Args:
-        ast: pystencils ast which is used to compile the benchmark file
-        inner_iterations: timings are recorded around this many iterations
-        outer_iterations: number of timings recorded
-        path: path where the benchmark file is stored. If None a tmp folder is created
-
-    Returns:
-        list of times per iterations for each outer iteration
-    """
-    import kerncraft
-
-    benchmark_code = generate_benchmark(ast, timing=True)
-
-    if path is None:
-        path = tempfile.mkdtemp()
-
-    if isinstance(path, str):
-        path = Path(path)
-
-    with open(path / 'bench.c', 'w') as f:
-        f.write(benchmark_code)
-
-    kerncraft_path = Path(kerncraft.__file__).parent
-
-    extra_flags = ['-I' + get_pystencils_include_path(),
-                   '-I' + str(kerncraft_path / 'headers')]
-
-    compiler_config = get_compiler_config()
-    compile_cmd = [compiler_config['command']] + compiler_config['flags'].split()
-    compile_cmd += [*extra_flags,
-                    kerncraft_path / 'headers' / 'timing.c',
-                    kerncraft_path / 'headers' / 'dummy.c',
-                    path / 'bench.c',
-                    '-o', path / 'bench',
-                    ]
-    run_compile_step(compile_cmd)
-
-    time_pre_estimation_per_iteration = float(subprocess.check_output(['./' / path / 'bench', str(10)]))
-    benchmark_time_limit = 20
-    if benchmark_time_limit / time_pre_estimation_per_iteration < inner_iterations:
-        warn = (f"A benchmark run with {inner_iterations} inner_iterations will probably take longer than "
-                f"{benchmark_time_limit} seconds for this kernel")
-        warnings.warn(warn)
-
-    results = []
-    for _ in range(outer_iterations):
-        benchmark_time = float(subprocess.check_output(['./' / path / 'bench', str(inner_iterations)]))
-        results.append(benchmark_time)
-    return results
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
-import warnings
-import fcntl
-from collections import defaultdict
-from tempfile import TemporaryDirectory
-from typing import Optional
-
-from jinja2 import Environment, PackageLoader, StrictUndefined
-
-import sympy as sp
-from kerncraft.kerncraft import KernelCode
-from kerncraft.machinemodel import MachineModel
-
-from pystencils.astnodes import (KernelFunction, LoopOverCoordinate, ResolvedFieldAccess, SympyAssignment)
-from pystencils.field import get_layout_from_strides
-from pystencils.sympyextensions import count_operations_in_ast
-from pystencils.transformations import filtered_tree_iteration
-from pystencils.utils import DotDict
-from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.cpu.kernelcreation import add_openmp
-
-
-class PyStencilsKerncraftKernel(KernelCode):
-    """
-    Implementation of kerncraft's kernel interface for pystencils CPU kernels.
-    Analyses a list of equations assuming they will be executed on a CPU
-    """
-    LIKWID_BASE = '/usr/local/likwid'
-
-    def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
-                 assumed_layout='SoA', debug_print=False, filename=None):
-        """Create a kerncraft kernel using a pystencils AST
-
-        Args:
-            ast: pystencils ast
-            machine: kerncraft machine model - specify this if kernel needs to be compiled
-            assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
-                    coordinates is not known. In this case either a structures of array (SoA) or
-                    array of structures (AoS) layout is assumed
-            debug_print: print debug information
-            filename: used for caching
-        """
-        super(KernelCode, self).__init__(machine=machine)
-
-        # Initialize state
-        self.asm_block = None
-        self._filename = filename
-
-        self.kernel_ast = ast
-        self.temporary_dir = TemporaryDirectory()
-        self._keep_intermediates = debug_print
-
-        # Loops
-        inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
-                       if l.is_innermost_loop]
-        if len(inner_loops) == 0:
-            raise ValueError("No loop found in pystencils AST")
-        else:
-            if len(inner_loops) > 1:
-                warnings.warn("pystencils AST contains multiple inner loops. "
-                              "Only one can be analyzed - choosing first one")
-            inner_loop = inner_loops[0]
-
-        self._loop_stack = []
-        cur_node = inner_loop
-        while cur_node is not None:
-            if isinstance(cur_node, LoopOverCoordinate):
-                loop_counter_sym = cur_node.loop_counter_symbol
-                loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
-                # If the correct step were to be provided, all access within that step length will
-                # also need to be passed to kerncraft: cur_node.step)
-                self._loop_stack.append(loop_info)
-            cur_node = cur_node.parent
-        self._loop_stack = list(reversed(self._loop_stack))
-
-        # Data sources & destinations
-        self.sources = defaultdict(list)
-        self.destinations = defaultdict(list)
-
-        def get_layout_tuple(f):
-            if f.has_fixed_shape:
-                return get_layout_from_strides(f.strides)
-            else:
-                layout_list = list(f.layout)
-                for _ in range(f.index_dimensions):
-                    layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
-                return layout_list
-
-        reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
-        for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
-            for fa in accesses:
-                coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
-                         for i, off in enumerate(fa.offsets)]
-                coord += list(fa.idx_coordinate_values)
-                layout = get_layout_tuple(fa.field)
-                permuted_coord = [sp.sympify(coord[i]) for i in layout]
-                target_dict[fa.field.name].append(permuted_coord)
-
-        # Variables (arrays)
-        fields_accessed = ast.fields_accessed
-        for field in fields_accessed:
-            layout = get_layout_tuple(field)
-            permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, tuple([str(field.dtype)]), tuple(permuted_shape))
-
-        # Scalars may be safely ignored
-        # for param in ast.get_parameters():
-        #     if not param.is_field_parameter:
-        #         # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
-        #         self.sources[param.symbol.name] = [None]
-
-        # data type
-        self.datatype = list(self.variables.values())[0][0]
-
-        # flops
-        operation_count = count_operations_in_ast(inner_loop)
-        self._flops = {
-            '+': operation_count['adds'],
-            '*': operation_count['muls'],
-            '/': operation_count['divs'],
-        }
-        for k in [k for k, v in self._flops.items() if v == 0]:
-            del self._flops[k]
-        self.check()
-
-        if debug_print:
-            from pprint import pprint
-            print("-----------------------------  Loop Stack --------------------------")
-            pprint(self._loop_stack)
-            print("-----------------------------  Sources -----------------------------")
-            pprint(self.sources)
-            print("-----------------------------  Destinations ------------------------")
-            pprint(self.destinations)
-            print("-----------------------------  FLOPS -------------------------------")
-            pprint(self._flops)
-
-    def get_kernel_header(self, name='pystencils_kernel'):
-        file_name = "pystencils_kernel.h"
-        file_path = self.get_intermediate_location(file_name, machine_and_compiler_dependent=False)
-        lock_mode, lock_fp = self.lock_intermediate(file_path)
-
-        if lock_mode == fcntl.LOCK_EX:
-            function_signature = generate_c(self.kernel_ast, dialect='c', signature_only=True)
-
-            jinja_context = {
-                'function_signature': function_signature,
-            }
-
-            env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-            file_header = env.get_template('kernel.h').render(**jinja_context)
-            with open(file_path, 'w') as f:
-                f.write(file_header)
-
-            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
-
-        return file_path, lock_fp
-
-    def get_kernel_code(self, openmp=False, name='pystencils_kernl'):
-        """
-        Generate and return compilable source code.
-
-        Args:
-            openmp: if true, openmp code will be generated
-            name: kernel name
-        """
-        filename = 'pystencils_kernl'
-        if openmp:
-            filename += '-omp'
-        filename += '.c'
-        file_path = self.get_intermediate_location(filename, machine_and_compiler_dependent=False)
-        lock_mode, lock_fp = self.lock_intermediate(file_path)
-
-        if lock_mode == fcntl.LOCK_EX:
-            header_list = get_headers(self.kernel_ast)
-            includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
-
-            if openmp:
-                add_openmp(self.kernel_ast)
-
-            kernel_code = generate_c(self.kernel_ast, dialect='c')
-
-            jinja_context = {
-                'includes': includes,
-                'kernel_code': kernel_code,
-            }
-
-            env = Environment(loader=PackageLoader('pystencils.kerncraft_coupling'), undefined=StrictUndefined)
-            file_header = env.get_template('kernel.c').render(**jinja_context)
-            with open(file_path, 'w') as f:
-                f.write(file_header)
-
-            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
-
-        return file_path, lock_fp
-
-
-class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
-        super(KerncraftParameters, self).__init__(**kwargs)
-        self['asm_block'] = 'auto'
-        self['asm_increment'] = 0
-        self['cores'] = 1
-        self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
-        self['pointer_increment'] = 'auto'
-        self['iterations'] = 10
-        self['unit'] = 'cy/CL'
-        self['ignore_warnings'] = True
-        self['incore_model'] = 'OSACA'
-
-
-# ------------------------------------------- Helper functions ---------------------------------------------------------
-
-
-def search_resolved_field_accesses_in_ast(ast):
-    def visit(node, reads, writes):
-        if not isinstance(node, SympyAssignment):
-            for a in node.args:
-                visit(a, reads, writes)
-            return
-
-        for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
-            accesses.update(expr.atoms(ResolvedFieldAccess))
-
-    read_accesses = set()
-    write_accesses = set()
-    visit(ast, read_accesses, write_accesses)
-    return read_accesses, write_accesses
--- a/pystencils/kerncraft_coupling/templates/benchmark.c
+++ b/pystencils/kerncraft_coupling/templates/benchmark.c
-
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-
-{{ includes }}
-
-{%- if likwid %}
-#include <likwid.h>
-{%- endif %}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-
-
-{{kernel_code}}
-
-
-int main(int argc, char **argv)
-{
-  {%- if likwid %}
-  likwid_markerInit();
-  {%- endif %}
-
-  {%- for field_name, dataType, size in fields %}
-
-  // Initialization {{field_name}}
-  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 64);
-  for (unsigned long long i = 0; i < {{size}}; ++i)
-    {{field_name}}[i] = 0.23;
-
-  if(var_false)
-    dummy({{field_name}});
-
-  {%- endfor %}
-
-
-
-  {%- for constantName, dataType in constants %}
-
-  // Constant {{constantName}}
-  {{dataType}} {{constantName}};
-  {{constantName}} = 0.23;
-  if(var_false)
-      dummy(& {{constantName}});
-
-  {%- endfor %}
-
-  {%- if likwid and openmp %}
-  #pragma omp parallel
-  {
-  likwid_markerRegisterRegion("loop");
-  #pragma omp barrier
-  {%- elif likwid %}
-  likwid_markerRegisterRegion("loop");
-  {%- endif %}
-
-  for(int warmup = 1; warmup >= 0; --warmup) {
-    int repeat = 2;
-    if(warmup == 0) {
-      repeat = atoi(argv[1]);
-      {%- if likwid %}
-      likwid_markerStartRegion("loop");
-      {%- endif %}
-    }
-    
-    {%- if timing %}
-    double wcStartTime, cpuStartTime, wcEndTime, cpuEndTime;
-    timing(&wcStartTime, &cpuStartTime);
-    {%- endif %}
-    
-    for (; repeat > 0; --repeat)
-    {
-      {{kernelName}}({{call_argument_list}});
-
-      // Dummy calls
-      {%- for field_name, dataType, size in fields %}
-      if(var_false) dummy((void*){{field_name}});
-      {%- endfor %}
-      {%- for constantName, dataType in constants %}
-      if(var_false) dummy((void*)&{{constantName}});
-      {%- endfor %}
-    }
-    {%- if timing %}
-    timing(&wcEndTime, &cpuEndTime);
-    if( warmup == 0)
-        printf("%e\n", (wcEndTime - wcStartTime) / atoi(argv[1]) );
-    {%- endif %}
-
-  }
-
-  {%- if likwid %}
-  likwid_markerStopRegion("loop");
-  {%- if openmp %}
-  }
-  {%- endif %}
-  {%- endif %}
-
-  {%- if likwid %}
-  likwid_markerClose();
-  {%- endif %}
-}
--- a/pystencils/kerncraft_coupling/templates/kernel.c
+++ b/pystencils/kerncraft_coupling/templates/kernel.c
-
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-#include <stdio.h>
-
-{{ includes }}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(void *);
-void timing(double* wcTime, double* cpuTime);
-extern int var_false;
-
-
-{{kernel_code}}
\ No newline at end of file
--- a/pystencils/kerncraft_coupling/templates/kernel.h
+++ b/pystencils/kerncraft_coupling/templates/kernel.h
--- a/pystencils/llvm/__init__.py
+++ b/pystencils/llvm/__init__.py
--- a/pystencils/llvm/control_flow.py
+++ b/pystencils/llvm/control_flow.py
--- a/pystencils/llvm/kernelcreation.py
+++ b/pystencils/llvm/kernelcreation.py
--- a/pystencils/llvm/llvm.py
+++ b/pystencils/llvm/llvm.py
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
--- a/pystencils/math_optimizations.py
+++ b/pystencils/math_optimizations.py
--- a/pystencils/opencl/__init__.py
+++ b/pystencils/opencl/__init__.py
--- a/pystencils/opencl/autoinit.py
+++ b/pystencils/opencl/autoinit.py
No results found