# noinspection SpellCheckingInspection
def get_vector_instruction_set(data_type='double', instruction_set='avx'):
    """Returns a dict that maps abstract vector operations to x86 intrinsic format strings
    for the given data type ('double' or 'float') and instruction set ('sse', 'avx' or 'avx512')."""
    comparisons = {
        '==': '_CMP_EQ_UQ',
        '!=': '_CMP_NEQ_UQ',
        '>=': '_CMP_GE_OQ',
        '<=': '_CMP_LE_OQ',
        '<': '_CMP_NGE_UQ',
        '>': '_CMP_NLE_UQ',
    }
    base_names = {
        '+': 'add[0, 1]',
        '-': 'sub[0, 1]',
        '*': 'mul[0, 1]',
        '/': 'div[0, 1]',
        '&': 'and[0, 1]',
        '|': 'or[0, 1]',
        'blendv': 'blendv[0, 1, 2]',

        'sqrt': 'sqrt[0]',

        'makeVec': 'set[]',
        'makeZero': 'setzero[]',

        'loadU': 'loadu[0]',
        'loadA': 'load[0]',
        'storeU': 'storeu[0,1]',
        'storeA': 'store[0,1]',
        'stream': 'stream[0,1]',
    }

    for comparison_op, constant in comparisons.items():
        base_names[comparison_op] = 'cmp[0, 1, %s]' % (constant,)

    headers = {
        'avx512': ['<immintrin.h>'],
        'avx': ['<immintrin.h>'],
        'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
                '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
    }

    suffix = {
        'double': 'pd',
        'float': 'ps',
    }
    prefix = {
        'sse': '_mm',
        'avx': '_mm256',
        'avx512': '_mm512',
    }

    width = {
        ("double", "sse"): 2,
        ("float", "sse"): 4,
        ("double", "avx"): 4,
        ("float", "avx"): 8,
        ("double", "avx512"): 8,
        ("float", "avx512"): 16,
    }

    result = {
        'width': width[(data_type, instruction_set)],
    }
    pre = prefix[instruction_set]
    suf = suffix[data_type]
    for intrinsic_id, function_shortcut in base_names.items():
        function_shortcut = function_shortcut.strip()
        name = function_shortcut[:function_shortcut.index('[')]

        if intrinsic_id == 'makeVec':
            arg_string = "({})".format(",".join(["{0}"] * result['width']))
        else:
            args = function_shortcut[function_shortcut.index('[') + 1: -1]
            arg_string = "("
            for arg in args.split(","):
                arg = arg.strip()
                if not arg:
                    continue
                if arg in ('0', '1', '2', '3', '4', '5'):
                    arg_string += "{" + arg + "},"
                else:
                    arg_string += arg + ","
            arg_string = arg_string[:-1] + ")"
        mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else ''
        result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string

    result['dataTypePrefix'] = {
        'double': "_" + pre + 'd',
        'float': "_" + pre,
    }

    result['rsqrt'] = None

    bit_width = result['width'] * (64 if data_type == 'double' else 32)
    result['double'] = "__m%dd" % (bit_width,)
    result['float'] = "__m%d" % (bit_width,)
    result['int'] = "__m%di" % (bit_width,)
    result['bool'] = "__m%dd" % (bit_width,)

    result['headers'] = headers[instruction_set]

    if instruction_set == 'avx512':
        size = 8 if data_type == 'double' else 16
        result['&'] = '_kand_mask%d({0}, {1})' % (size,)
        result['|'] = '_kor_mask%d({0}, {1})' % (size,)
        result['blendv'] = '%s_mask_blend_%s({2}, {0}, {1})' % (pre, suf)
        result['rsqrt'] = "_mm512_rsqrt14_%s({0})" % (suf,)
        result['bool'] = "__mmask%d" % (size,)

    if instruction_set == 'avx' and data_type == 'float':
        result['rsqrt'] = "_mm256_rsqrt_ps({0})"

    return result
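
# Worked example (illustrative, not part of the original module): for
# data_type='double', instruction_set='avx' the returned format strings expand to
# concrete intrinsics, e.g.
#
#     iset = get_vector_instruction_set('double', 'avx')
#     iset['width']                 # 4 doubles per __m256d register
#     iset['+'].format('a', 'b')    # '_mm256_add_pd(a,b)'
#     iset['makeVec'].format('x')   # broadcast: '_mm256_set_pd(x,x,x,x)'
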
def get_supported_instruction_sets():
    """List of supported instruction sets on current hardware, or None if query failed."""
    try:
        from cpuinfo import get_cpu_info
    except ImportError:
        return None

    result = []
    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
    required_avx_flags = {'avx'}
    required_avx512_flags = {'avx512f'}
    flags = set(get_cpu_info()['flags'])
    if flags.issuperset(required_sse_flags):
        result.append("sse")
    if flags.issuperset(required_avx_flags):
        result.append("avx")
    if flags.issuperset(required_avx512_flags):
        result.append("avx512")
    return result
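
# Usage sketch (illustrative, not part of the original module): query the host CPU
# and pick the widest supported instruction set; py-cpuinfo may be missing.
if __name__ == '__main__':
    supported = get_supported_instruction_sets()
    if supported:
        print("widest supported instruction set:", supported[-1])
    else:
        print("py-cpuinfo not installed or query failed - no vectorization info")
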
import os

try:
    from functools import lru_cache as memorycache
except ImportError:
    from backports.functools_lru_cache import lru_cache as memorycache

try:
    from joblib import Memory
    from appdirs import user_cache_dir

    if 'PYSTENCILS_CACHE_DIR' in os.environ:
        cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
    else:
        cache_dir = user_cache_dir('pystencils')
    disk_cache = Memory(cachedir=cache_dir, verbose=False).cache
    disk_cache_no_fallback = disk_cache
except ImportError:
    # fall back to in-memory caching if joblib is not available
    disk_cache = memorycache(maxsize=64)

    def disk_cache_no_fallback(o):
        return o


# To disable caching entirely:
# disk_cache = lambda o: o
# disk_cache_no_fallback = lambda o: o
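
# Usage sketch (illustrative, not part of the original module): decorate an
# expensive, deterministic function so repeated calls with the same arguments are
# served from the joblib cache directory (or from the lru_cache fallback).
if __name__ == '__main__':
    @disk_cache
    def _expensive_square(x):
        return x * x

    print(_expensive_square(3), _expensive_square(3))  # second call hits the cache
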
import abc
from functools import partial
from typing import Tuple  # noqa

import sympy as sp

from pystencils.astnodes import Conditional, Block
from pystencils.integer_functions import div_ceil
from pystencils.slicing import normalize_slice
from pystencils.data_types import TypedSymbol, create_type

AUTO_BLOCK_SIZE_LIMITING = False

BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]


class AbstractIndexing(abc.ABC):
    """Abstract base class for all Indexing classes.

    An Indexing class defines how a multidimensional field is mapped to CUDA's block and grid
    system. It calculates indices based on CUDA's thread and block indices and computes the number
    of blocks and threads a kernel is started with. The Indexing class is created with a pystencils
    field, a slice to iterate over, and further optional parameters that must have default values.
    """

    @property
    @abc.abstractmethod
    def coordinates(self):
        """Returns a sequence of coordinate expressions for (x, y, z) depending on symbolic CUDA block
        and thread indices. These symbolic indices can be obtained with the method `index_variables`."""

    @property
    def index_variables(self):
        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions."""
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM

    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determine grid and block size for the kernel call.

        Args:
            arr_shape: the numeric (not symbolic) shape of the array

        Returns:
            dict with keys 'block' and 'grid', holding (x, y, z) tuples for the number of threads
            per block and the number of blocks the kernel should be started with
        """

    @abc.abstractmethod
    def guard(self, kernel_content, arr_shape):
        """In some indexing schemes not all threads of a block execute the kernel content.

        This function can return a Conditional ast node, defining this execution guard.

        Args:
            kernel_content: the actual kernel contents, which can e.g. be put into the Conditional node as true block
            arr_shape: the numeric or symbolic shape of the field

        Returns:
            ast node which is put inside the kernel function
        """
# -------------------------------------------- Implementations ---------------------------------------------------------


class BlockIndexing(AbstractIndexing):
    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.

    Args:
        field: pystencils field (common to all Indexing classes)
        iteration_slice: slice that defines the rectangular subarea which is iterated over
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
        compile_time_block_size: compile in concrete block size, otherwise the CUDA variable 'blockDim' is used
    """

    def __init__(self, field, iteration_slice=None,
                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
        if field.spatial_dimensions > 3:
            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")

        if permute_block_size_dependent_on_layout:
            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)

        if AUTO_BLOCK_SIZE_LIMITING:
            block_size = self.limit_block_size_to_device_maximum(block_size)

        self._block_size = block_size
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._dim = field.spatial_dimensions
        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size

    @property
    def coordinates(self):
        offsets = _get_start_from_slice(self._iterationSlice)
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
        coordinates = [block_index * bs + thread_idx + off
                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
        return coordinates[:self._dim]

    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}

        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)

        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
            block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs
        grid = tuple(div_ceil(length, block_size)
                     for length, block_size in zip(widths, block_size))
        extend_gr = (1,) * (3 - len(grid))

        return {'block': block_size,
                'grid': grid + extend_gr}

    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
        conditions = [c < end
                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])
    @staticmethod
    def limit_block_size_to_device_maximum(block_size):
        """Changes the block size such that it fits the limits of the current device.

        * if the total number of threads is too large for the current device, the largest coordinate is divided by 2.
        * next, if one component is still too large, that component is divided by 2 and the smallest
          component is multiplied by 2, such that the total number of threads stays the same

        Returns:
            the altered block_size
        """
        # Get device limits
        import pycuda.driver as cuda
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA

        da = cuda.device_attribute
        device = cuda.Context.get_device()

        block_size = list(block_size)
        max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
        max_block_size = [device.get_attribute(a)
                          for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]

        def prod(seq):
            result = 1
            for e in seq:
                result *= e
            return result

        def get_index_of_too_big_element():
            for i, bs in enumerate(block_size):
                if bs > max_block_size[i]:
                    return i
            return None

        def get_index_of_too_small_element():
            for i, bs in enumerate(block_size):
                if bs // 2 <= max_block_size[i]:
                    return i
            return None

        # Reduce the total number of threads if necessary
        while prod(block_size) > max_threads:
            item_to_reduce = block_size.index(max(block_size))
            for j, block_size_entry in enumerate(block_size):
                if block_size_entry > max_block_size[j]:
                    item_to_reduce = j
            block_size[item_to_reduce] //= 2

        # Cap individual elements
        too_big_element_index = get_index_of_too_big_element()
        while too_big_element_index is not None:
            too_small_element_index = get_index_of_too_small_element()
            block_size[too_small_element_index] *= 2
            block_size[too_big_element_index] //= 2
            too_big_element_index = get_index_of_too_big_element()

        return tuple(block_size)
    @staticmethod
    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
        """Shrinks the block_size if too many registers are used per multiprocessor.

        This is not done automatically, since the required_registers_per_thread are not known before compilation.
        They can be obtained via ``func.num_regs`` from a pycuda function.

        Returns:
            smaller block_size if too many registers are used, otherwise the unchanged block_size
        """
        import pycuda.driver as cuda
        # noinspection PyUnresolvedReferences
        import pycuda.autoinit  # NOQA

        da = cuda.device_attribute
        if device is None:
            device = cuda.Context.get_device()
        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)

        block = list(block_size)  # copy as list, so entries can be halved in place

        while True:
            num_threads = 1
            for t in block:
                num_threads *= t
            required_registers_per_mp = num_threads * required_registers_per_thread

            if required_registers_per_mp <= available_registers_per_mp:
                return tuple(block)
            else:
                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
                assert block[largest_grid_entry_idx] >= 2
                block[largest_grid_entry_idx] //= 2
    @staticmethod
    def permute_block_size_according_to_layout(block_size, layout):
        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
        sorted_block_size = list(sorted(block_size, reverse=True))
        while len(sorted_block_size) > len(layout):
            sorted_block_size[0] *= sorted_block_size[-1]
            sorted_block_size = sorted_block_size[:-1]

        result = list(block_size)
        for l, bs in zip(reversed(layout), sorted_block_size):
            result[l] = bs
        return tuple(result[:len(layout)])
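
# Worked example (illustrative, not part of the original module): for a C-layout 3D
# field the layout tuple is (0, 1, 2), i.e. coordinate 2 varies fastest in memory,
# so that coordinate is assigned the largest block dimension:
#
#     BlockIndexing.permute_block_size_according_to_layout((16, 16, 2), (0, 1, 2))
#     # -> (2, 16, 16)
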
class LineIndexing(AbstractIndexing):
    """Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a 1D CUDA block.

    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}.
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger than
    the maximum number of threads allowed in a CUDA block (which depends on the device).
    """

    def __init__(self, field, iteration_slice=None):
        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
        if field.spatial_dimensions > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")

        coordinates = available_indices[:field.spatial_dimensions]

        fastest_coordinate = field.layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]

        self._coordinates = coordinates
        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]

    @property
    def coordinates(self):
        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]

    def call_parameters(self, arr_shape):
        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}

        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
        widths = sp.Matrix(widths).subs(substitution_dict)

        def get_shape_of_cuda_idx(cuda_idx):
            if cuda_idx not in self._coordinates:
                return 1
            else:
                idx = self._coordinates.index(cuda_idx)
                return widths[idx]

        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}

    def guard(self, kernel_content, arr_shape):
        return kernel_content
# -------------------------------------- Helper functions --------------------------------------------------------------


def _get_start_from_slice(iteration_slice):
    res = []
    for slice_component in iteration_slice:
        if type(slice_component) is slice:
            res.append(slice_component.start if slice_component.start is not None else 0)
        else:
            assert isinstance(slice_component, int)
            res.append(slice_component)
    return res


def _get_end_from_slice(iteration_slice, arr_shape):
    iter_slice = normalize_slice(iteration_slice, arr_shape)
    res = []
    for slice_component in iter_slice:
        if type(slice_component) is slice:
            res.append(slice_component.stop)
        else:
            assert isinstance(slice_component, int)
            res.append(slice_component + 1)
    return res


def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
    if isinstance(gpu_indexing, str):
        if gpu_indexing == 'block':
            indexing_creator = BlockIndexing
        elif gpu_indexing == 'line':
            indexing_creator = LineIndexing
        else:
            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
        if gpu_indexing_params:
            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
        return indexing_creator
    else:
        return gpu_indexing
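
# Usage sketch (illustrative, not part of the original module): pick an indexing
# scheme by name and inspect the launch configuration it derives for a concrete
# field. Field and make_slice are the usual pystencils helpers; the shape and
# block size below are made up. No GPU is needed, since AUTO_BLOCK_SIZE_LIMITING
# is disabled.
if __name__ == '__main__':
    import numpy as np
    from pystencils import Field
    from pystencils.slicing import make_slice

    f = Field.create_from_numpy_array('f', np.zeros((64, 64, 64)))
    creator = indexing_creator_from_params('block', {'block_size': (32, 8, 2)})
    indexing = creator(field=f, iteration_slice=make_slice[:, :, :])
    print(indexing.call_parameters((64, 64, 64)))  # dict with 'block' and 'grid' tuples
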
#include <cstdint>

#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#else
#define QUALIFIERS static __forceinline__ __device__
#endif

#define PHILOX_W32_0   (0x9E3779B9)
#define PHILOX_W32_1   (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)

typedef std::uint32_t uint32;
typedef std::uint64_t uint64;


QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
    // host code
    uint64 product = ((uint64)a) * ((uint64)b);
    *hip = product >> 32;
    return (uint32)product;
#else
    // device code
    *hip = __umulhi(a,b);
    return a*b;
#endif
}

QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
    uint32 hi0;
    uint32 hi1;
    uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
    uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);

    ctr[0] = hi1^ctr[1]^key[0];
    ctr[1] = lo1;
    ctr[2] = hi0^ctr[3]^key[1];
    ctr[3] = lo0;
}

QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
    key[0] += PHILOX_W32_0;
    key[1] += PHILOX_W32_1;
}

QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
    unsigned long long z = (unsigned long long)x ^
                           ((unsigned long long)y << (53 - 32));
    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}

QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1, double & rnd1, double & rnd2)
{
    uint32 key[2] = {key0, key1};
    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
    _philox4x32round(ctr, key);                           // 1
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10

    rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
    rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
}

QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
                              uint32 key0, uint32 key1,
                              float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
    uint32 key[2] = {key0, key1};
    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
    _philox4x32round(ctr, key);                           // 1
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10

    rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
    rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
    rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
    rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
}
from .kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters']
from jinja2 import Template

from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.sympyextensions import prod
from pystencils.data_types import get_base_type

benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
{{ includes }}

{%- if likwid %}
#include <likwid.h>
{%- endif %}

#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(double *);
extern int var_false;

{{kernel_code}}

int main(int argc, char **argv)
{
  {%- if likwid %}
  likwid_markerInit();
  likwid_markerThreadInit();
  {%- endif %}

  {%- for field_name, dataType, size in fields %}

  // Initialization {{field_name}}
  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 32);
  for (unsigned long long i = 0; i < {{size}}; ++i)
    {{field_name}}[i] = 0.23;

  if(var_false)
    dummy({{field_name}});

  {%- endfor %}

  {%- for constantName, dataType in constants %}

  // Constant {{constantName}}
  {{dataType}} {{constantName}};
  {{constantName}} = 0.23;
  if(var_false)
    dummy(& {{constantName}});

  {%- endfor %}

  int repeat = atoi(argv[1]);
  {%- if likwid %}
  likwid_markerStartRegion("loop");
  {%- endif %}

  for (; repeat > 0; --repeat)
  {
    {{kernelName}}({{call_argument_list}});

    // Dummy calls
    {%- for field_name, dataType, size in fields %}
    if(var_false) dummy({{field_name}});
    {%- endfor %}
    {%- for constantName, dataType in constants %}
    if(var_false) dummy(&{{constantName}});
    {%- endfor %}
  }

  {%- if likwid %}
  likwid_markerStopRegion("loop");
  {%- endif %}

  {%- if likwid %}
  likwid_markerClose();
  {%- endif %}
}
""")


def generate_benchmark(ast, likwid=False):
    accessed_fields = {f.name: f for f in ast.fields_accessed}
    constants = []
    fields = []
    call_parameters = []
    for p in ast.get_parameters():
        if not p.is_field_parameter:
            constants.append((p.symbol.name, str(p.symbol.dtype)))
            call_parameters.append(p.symbol.name)
        else:
            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
            field = accessed_fields[p.field_name]
            dtype = str(get_base_type(p.symbol.dtype))
            fields.append((p.field_name, dtype, prod(field.shape)))
            call_parameters.append(p.field_name)

    header_list = get_headers(ast)
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    args = {
        'likwid': likwid,
        'kernel_code': generate_c(ast, dialect='c'),
        'kernelName': ast.function_name,
        'fields': fields,
        'constants': constants,
        'call_argument_list': ",".join(call_parameters),
        'includes': includes,
    }
    return benchmark_template.render(**args)
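
# Usage sketch (illustrative, not part of the original module): build a fixed-size
# copy kernel and print the generated benchmark main(). Field.create_from_numpy_array,
# Assignment and create_kernel are the usual pystencils entry points; fixed-shape
# fields are required because the benchmark only supports kernels with fixed loop size.
if __name__ == '__main__':
    import numpy as np
    from pystencils import Assignment, Field, create_kernel

    arr = np.zeros((32, 32))
    src = Field.create_from_numpy_array('src', arr)
    dst = Field.create_from_numpy_array('dst', arr)
    kernel_ast = create_kernel([Assignment(dst[0, 0], src[0, 0])])
    print(generate_benchmark(kernel_ast, likwid=False))
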
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']