96a28eb8 · 96a28eb8 · 96a28eb8 · 96a28eb8 · 96a28eb8 · 96a28eb8
--- a/pystencils/cache.py
+++ b/pystencils/cache.py
-import os
-from collections.abc import Hashable
-from functools import partial
-from itertools import chain
-
-try:
-    from functools import lru_cache as memorycache
-except ImportError:
-    from backports.functools_lru_cache import lru_cache as memorycache
-
-
-try:
-    from joblib import Memory
-    from appdirs import user_cache_dir
-    if 'PYSTENCILS_CACHE_DIR' in os.environ:
-        cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
-    else:
-        cache_dir = user_cache_dir('pystencils')
-    disk_cache = Memory(cache_dir, verbose=False).cache
-    disk_cache_no_fallback = disk_cache
-except ImportError:
-    # fallback to in-memory caching if joblib is not available
-    disk_cache = memorycache(maxsize=64)
-
-    def disk_cache_no_fallback(o):
-        return o
-
-
-def _wrapper(wrapped_func, cached_func, *args, **kwargs):
-    if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
-        return cached_func(*args, **kwargs)
-    else:
-        return wrapped_func(*args, **kwargs)
-
-
-def memorycache_if_hashable(maxsize=128, typed=False):
-
-    def wrapper(func):
-        return partial(_wrapper, func, memorycache(maxsize, typed)(func))
-
-    return wrapper
-
-# Disable memory cache:
-# disk_cache = lambda o: o
-# disk_cache_no_fallback = lambda o: o
--- a/pystencils/data_types.py
+++ b/pystencils/data_types.py
--- a/pystencils/gpucuda/texture_utils.py
+++ b/pystencils/gpucuda/texture_utils.py
-# -*- coding: utf-8 -*-
-#
-# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
-#
-# Distributed under terms of the GPLv3 license.
-
-"""
-
-"""
-
-from os.path import dirname, isdir, join
-
-import numpy as np
-
-try:
-    import pycuda.driver as cuda
-    from pycuda import gpuarray
-except Exception:
-    pass
-
-
-def pow_two_divider(n):
-    if n == 0:
-        return 0
-    divider = 1
-    while (n & divider) == 0:
-        divider <<= 1
-    return divider
-
-
-def ndarray_to_tex(tex_ref,
-                   ndarray,
-                   address_mode=None,
-                   filter_mode=None,
-                   use_normalized_coordinates=False,
-                   read_as_integer=False):
-
-    if address_mode is None:
-        address_mode = cuda.address_mode.BORDER
-    if filter_mode is None:
-        filter_mode = cuda.filter_mode.LINEAR
-
-    if isinstance(ndarray, np.ndarray):
-        cu_array = cuda.np_to_array(ndarray, 'C')
-    elif isinstance(ndarray, gpuarray.GPUArray):
-        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
-    else:
-        raise TypeError(
-            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')
-
-    cuda.TextureReference.set_array(tex_ref, cu_array)
-
-    tex_ref.set_address_mode(0, address_mode)
-    if ndarray.ndim >= 2:
-        tex_ref.set_address_mode(1, address_mode)
-    if ndarray.ndim >= 3:
-        tex_ref.set_address_mode(2, address_mode)
-    tex_ref.set_filter_mode(filter_mode)
-
-    if not use_normalized_coordinates:
-        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_NORMALIZED_COORDINATES)
-
-    if not read_as_integer:
-        tex_ref.set_flags(tex_ref.get_flags() & ~cuda.TRSF_READ_AS_INTEGER)
-
-
-def prefilter_for_cubic_bspline(gpuarray):
-    import pycuda.autoinit  # NOQA
-    from pycuda.compiler import SourceModule
-
-    ndim = gpuarray.ndim
-    assert ndim == 2 or ndim == 3, "Only 2d or 3d supported"
-    assert isdir(join(dirname(__file__), "CubicInterpolationCUDA", "code")), \
-        "Submodule CubicInterpolationCUDA does not exist"
-    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
-    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
-    nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
-
-    file_name = join(dirname(__file__), "CubicInterpolationCUDA", "code", "cubicPrefilter%iD.cu" % ndim)
-    with open(file_name) as file:
-        code = file.read()
-
-    mod = SourceModule(code, options=nvcc_options)
-
-    if ndim == 2:
-        height, width = gpuarray.shape
-        block = min(pow_two_divider(height), 64)
-        grid = height // block
-        func = mod.get_function('SamplesToCoefficients2DXf')
-        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
-                                                          for r in reversed(gpuarray.shape)),
-             block=(block, 1, 1),
-             grid=(grid, 1, 1))
-
-        block = min(pow_two_divider(width), 64)
-        grid = width // block
-        func = mod.get_function('SamplesToCoefficients2DYf')
-        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
-                                                          for r in reversed(gpuarray.shape)),
-             block=(block, 1, 1),
-             grid=(grid, 1, 1))
-    elif ndim == 3:
-        depth, height, width = gpuarray.shape
-        dimX = min(min(pow_two_divider(width), pow_two_divider(height)), 64)
-        dimY = min(min(pow_two_divider(depth), pow_two_divider(height)), 512 / dimX)
-        block = (dimX, dimY, 1)
-
-        dimGridX = (height // block[0], depth // block[1], 1)
-        dimGridY = (width // block[0], depth // block[1], 1)
-        dimGridZ = (width // block[0], height // block[1], 1)
-
-        func = mod.get_function("SamplesToCoefficients3DXf")
-        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
-                                                          for r in reversed(gpuarray.shape)),
-             block=block,
-             grid=dimGridX)
-        func = mod.get_function("SamplesToCoefficients3DYf")
-        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
-                                                          for r in reversed(gpuarray.shape)),
-             block=block,
-             grid=dimGridY)
-        func = mod.get_function("SamplesToCoefficients3DZf")
-        func(gpuarray, np.uint32(gpuarray.strides[-2]), *(np.uint32(r)
-                                                          for r in reversed(gpuarray.shape)),
-             block=block,
-             grid=dimGridZ)
--- a/pystencils/include/PyStencilsField.h
+++ b/pystencils/include/PyStencilsField.h
-#pragma once
-
-extern "C++" {
-#ifdef __CUDA_ARCH__
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  DTYPE_T shape[DIMENSION];
-  DTYPE_T stride[DIMENSION];
-};
-#else
-#include <array>
-
-template <typename DTYPE_T, std::size_t DIMENSION> struct PyStencilsField {
-  DTYPE_T *data;
-  std::array<DTYPE_T, DIMENSION> shape;
-  std::array<DTYPE_T, DIMENSION> stride;
-};
-#endif
-}
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
-#if !defined(__AES__) || !defined(__SSE4_1__)
-#error AES-NI and SSE4.1 need to be enabled
-#endif
-
-#include <emmintrin.h> // SSE2
-#include <wmmintrin.h> // AES
-#ifdef __AVX512VL__
-#include <immintrin.h> // AVX*
-#else
-#include <smmintrin.h>  // SSE4
-#ifdef __FMA__
-#include <immintrin.h> // FMA
-#endif
-#endif
-#include <cstdint>
-
-#define QUALIFIERS inline
-#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
-#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
-
-typedef std::uint32_t uint32;
-typedef std::uint64_t uint64;
-
-QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k) {
-    __m128i x = _mm_xor_si128(k, in);
-    x = _mm_aesenc_si128(x, k);     // 1
-    x = _mm_aesenc_si128(x, k);     // 2
-    x = _mm_aesenc_si128(x, k);     // 3
-    x = _mm_aesenc_si128(x, k);     // 4
-    x = _mm_aesenc_si128(x, k);     // 5
-    x = _mm_aesenc_si128(x, k);     // 6
-    x = _mm_aesenc_si128(x, k);     // 7
-    x = _mm_aesenc_si128(x, k);     // 8
-    x = _mm_aesenc_si128(x, k);     // 9
-    x = _mm_aesenclast_si128(x, k); // 10
-    return x;
-}
-
-QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu32_ps(v);
-#else
-    __m128i v2 = _mm_srli_epi32(v, 1);
-    __m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
-    __m128 v2f = _mm_cvtepi32_ps(v2);
-    __m128 v1f = _mm_cvtepi32_ps(v1);
-    return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
-#endif
-}
-
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5
-__attribute__((optimize("no-associative-math")))
-#endif
-QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
-{
-#ifdef __AVX512VL__
-    return _mm_cvtepu64_pd(x);
-#else
-    __m128i xH = _mm_srli_epi64(x, 32);
-    xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));          //  2^84
-    __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc);   //  2^52
-    __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));     //  2^84 + 2^52
-    return _mm_add_pd(f, _mm_castsi128_pd(xL));
-#endif
-}
-
-
-QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
-                              double & rnd1, double & rnd2)
-{
-    // pack input and call AES
-    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
-    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
-    c128 = aesni1xm128i(c128, k128);
-
-    // convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
-    __m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
-    __m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
-    y = _mm_srli_si128(y, 4);
-
-    // calculate z = x ^ y << (53 - 32))
-    __m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
-    z = _mm_xor_si128(x, z);
-
-    // convert uint64 to double
-    __m128d rs = _my_cvtepu64_pd(z);
-    // calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
-#ifdef __FMA__
-    rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
-#else
-    rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
-    rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
-#endif
-
-    // store result
-    alignas(16) double rr[2];
-    _mm_store_pd(rr, rs);
-    rnd1 = rr[0];
-    rnd2 = rr[1];
-}
-
-
-QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
-                             float & rnd1, float & rnd2, float & rnd3, float & rnd4)
-{
-    // pack input and call AES
-    __m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
-    __m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
-    c128 = aesni1xm128i(c128, k128);
-
-    // convert uint32 to float
-    __m128 rs = _my_cvtepu32_ps(c128);
-    // calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
-#ifdef __FMA__
-    rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
-#else
-    rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
-    rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
-#endif
-
-    // store result
-    alignas(16) float r[4];
-    _mm_store_ps(r, rs);
-    rnd1 = r[0];
-    rnd2 = r[1];
-    rnd3 = r[2];
-    rnd4 = r[3];
-}
-
--- a/pystencils/include/opencl_stdint.h
+++ b/pystencils/include/opencl_stdint.h
-#ifndef OPENCL_STDINT
-#define OPENCL_STDINT
-
-typedef unsigned int      uint;
-typedef unsigned int      uint_t;
-
-typedef signed char       int8_t;
-typedef signed short      int16_t;
-typedef signed int        int32_t;
-typedef signed long int   int64_t;
-typedef unsigned char     uint8_t;
-typedef unsigned short    uint16_t;
-typedef unsigned int      uint32_t;
-typedef unsigned long int uint64_t;
-
-#endif
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
-#include <cstdint>
-
-#ifndef __CUDA_ARCH__
-#define QUALIFIERS inline
-#else
-#define QUALIFIERS static __forceinline__ __device__
-#endif
-
-#define PHILOX_W32_0   (0x9E3779B9)
-#define PHILOX_W32_1   (0xBB67AE85)
-#define PHILOX_M4x32_0 (0xD2511F53)
-#define PHILOX_M4x32_1 (0xCD9E8D57)
-#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
-#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
-
-typedef std::uint32_t uint32;
-typedef std::uint64_t uint64;
-
-
-QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
-{
-#ifndef __CUDA_ARCH__
-    // host code
-    uint64 product = ((uint64)a) * ((uint64)b);
-    *hip = product >> 32;
-    return (uint32)product;
-#else
-    // device code
-    *hip = __umulhi(a,b);
-    return a*b;
-#endif
-}
-
-QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
-{
-    uint32 hi0;
-    uint32 hi1;
-    uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
-    uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
-
-    ctr[0] = hi1^ctr[1]^key[0];
-    ctr[1] = lo1;
-    ctr[2] = hi0^ctr[3]^key[1];
-    ctr[3] = lo0;
-}
-
-QUALIFIERS void _philox4x32bumpkey(uint32* key)
-{
-    key[0] += PHILOX_W32_0;
-    key[1] += PHILOX_W32_1;
-}
-
-QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
-{
-    uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
-    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
-}
-
-
-QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1, double & rnd1, double & rnd2)
-{
-    uint32 key[2] = {key0, key1};
-    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _philox4x32round(ctr, key);                           // 1
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
-
-    rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
-    rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
-}
-
-
-
-QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
-                              float & rnd1, float & rnd2, float & rnd3, float & rnd4)
-{
-    uint32 key[2] = {key0, key1};
-    uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
-    _philox4x32round(ctr, key);                           // 1
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 2
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 3
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 4
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 5
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 6
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 7
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 8
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 9
-    _philox4x32bumpkey(key); _philox4x32round(ctr, key);  // 10
-
-    rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-    rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
-}
\ No newline at end of file
--- a/pystencils/interpolation_astnodes.py
+++ b/pystencils/interpolation_astnodes.py
--- a/pystencils/kerncraft_coupling/__init__.py
+++ b/pystencils/kerncraft_coupling/__init__.py
-from .generate_benchmark import generate_benchmark, run_c_benchmark
-from .kerncraft_interface import KerncraftParameters, PyStencilsKerncraftKernel
-
-__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters', 'generate_benchmark', 'run_c_benchmark']
--- a/pystencils/kerncraft_coupling/generate_benchmark.py
+++ b/pystencils/kerncraft_coupling/generate_benchmark.py
--- a/pystencils/kerncraft_coupling/kerncraft_interface.py
+++ b/pystencils/kerncraft_coupling/kerncraft_interface.py
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
--- a/pystencils/llvm/__init__.py
+++ b/pystencils/llvm/__init__.py
-from .kernelcreation import create_kernel
-from .llvmjit import make_python_function
-
-__all__ = ['create_kernel', 'make_python_function']
--- a/pystencils/llvm/control_flow.py
+++ b/pystencils/llvm/control_flow.py
--- a/pystencils/llvm/kernelcreation.py
+++ b/pystencils/llvm/kernelcreation.py
--- a/pystencils/llvm/llvm.py
+++ b/pystencils/llvm/llvm.py
--- a/pystencils/llvm/llvmjit.py
+++ b/pystencils/llvm/llvmjit.py
--- a/pystencils/math_optimizations.py
+++ b/pystencils/math_optimizations.py
-"""
-Default Sympy optimizations applied in pystencils kernels using :func:`sympy.codegen.rewriting.optimize`.
-
-See :func:`sympy.codegen.rewriting.optimize`.
-"""
-
-
-import itertools
-
-from pystencils import Assignment
-from pystencils.astnodes import SympyAssignment
-
-try:
-    from sympy.codegen.rewriting import optims_c99, optimize
-    from sympy.codegen.rewriting import ReplaceOptim
-    HAS_REWRITING = True
-
-    # Evaluates all constant terms
-    evaluate_constant_terms = ReplaceOptim(
-        lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
-        lambda p: p.evalf()
-    )
-
-    optims_pystencils_cpu = [evaluate_constant_terms] + list(optims_c99)
-    optims_pystencils_gpu = [evaluate_constant_terms] + list(optims_c99)
-except ImportError:
-    from warnings import warn
-    warn("Could not import ReplaceOptim, optims_c99, optimize from sympy.codegen.rewriting."
-         "Please update your sympy installation!")
-    optims_c99 = []
-    optims_pystencils_cpu = []
-    optims_pystencils_gpu = []
-    HAS_REWRITING = False
-
-
-def optimize_assignments(assignments, optimizations):
-
-    if HAS_REWRITING:
-        assignments = [Assignment(a.lhs, optimize(a.rhs, optimizations))
-                       if hasattr(a, 'lhs')
-                       else a for a in assignments]
-        assignments_nodes = [a.atoms(SympyAssignment) for a in assignments]
-        for a in itertools.chain.from_iterable(assignments_nodes):
-            a.optimize(optimizations)
-
-    return assignments
--- a/pystencils/opencl/opencljit.py
+++ b/pystencils/opencl/opencljit.py
--- a/pystencils/rng.py
+++ b/pystencils/rng.py
No results found