# simd_instruction_sets.py
import math
2
import os
Michael Kuron's avatar
Michael Kuron committed
3
import platform
Michael Kuron's avatar
Michael Kuron committed
4
from ctypes import CDLL
Michael Kuron's avatar
Michael Kuron committed
5

Markus Holzer's avatar
Markus Holzer committed
6
7
from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
Michael Kuron's avatar
Michael Kuron committed
8
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
9
10


Michael Kuron's avatar
Michael Kuron committed
11
def get_vector_instruction_set(data_type='double', instruction_set='avx'):
    """Return the vector instruction set description for the given instruction set name.

    The call is dispatched purely on ``instruction_set``:

    * ``'neon'`` or any name starting with ``'sve'`` -> ARM backend
    * ``'vsx'`` -> PowerPC backend
    * anything else -> x86 backend

    Args:
        data_type: forwarded unchanged to the selected backend function.
        instruction_set: name of the instruction set (must be a string, since
            ``startswith`` is called on it); also forwarded unchanged.

    Returns:
        Whatever the selected backend function returns.
    """
    if instruction_set == 'neon' or instruction_set.startswith('sve'):
        return get_vector_instruction_set_arm(data_type, instruction_set)
    if instruction_set == 'vsx':
        return get_vector_instruction_set_ppc(data_type, instruction_set)
    # Every name not claimed above is handed to the x86 backend.
    return get_vector_instruction_set_x86(data_type, instruction_set)
18
19


Michael Kuron's avatar
Michael Kuron committed
20
# Memo slot written by get_supported_instruction_sets(); None means nothing has been stored yet.
_cache = None
# Memo slot written by get_cacheline_size(); None means no value has been measured yet.
_cachelinesize = None
Michael Kuron's avatar
Michael Kuron committed
22
23


24
25
def get_supported_instruction_sets():
    """List of supported instruction sets on current hardware, or None if query failed.

    Lookup order:

    1. A previously stored result in the module-level ``_cache`` (a copy is returned).
    2. The ``PYSTENCILS_SIMD`` environment variable, split at commas.
    3. macOS on arm64: ``['neon']``.
    4. Machines whose name starts with ``ppc64``: the configured compiler is asked for
       its predefined macros; ``['vsx']`` is reported if both ``__VSX__`` and
       ``__ALTIVEC__`` are defined, otherwise ``[]``. Only this branch stores its
       result in ``_cache``.
    5. Otherwise the CPU flags reported by ``cpuinfo`` are inspected.

    Returns:
        A list of instruction set names, or None if ``cpuinfo`` cannot be imported.

    Raises:
        OSError: if the SVE vector length query fails on Linux.
        subprocess.CalledProcessError: if the compiler invocation on ppc64 fails.
    """
    global _cache
    if _cache is not None:
        return _cache.copy()
    if 'PYSTENCILS_SIMD' in os.environ:
        return os.environ['PYSTENCILS_SIMD'].split(',')
    if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
        return ['neon']
    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
        import subprocess
        import tempfile
        from pystencils.cpu.cpujit import get_compiler_config
        # Preprocess an empty source file and dump the predefined macros (-dM -E).
        # The context manager guarantees the temporary file is closed and removed.
        with tempfile.NamedTemporaryFile(suffix='.cpp') as f:
            command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
            macros = subprocess.check_output(command, input='', text=True)
        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
            _cache = ['vsx']
        else:
            _cache = []
        return _cache.copy()
    try:
        from cpuinfo import get_cpu_info
    except ImportError:
        return None

    result = []
    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
    required_avx_flags = {'avx', 'avx2'}
    required_avx512_flags = {'avx512f'}
    required_neon_flags = {'neon'}
    required_sve_flags = {'sve'}
    flags = set(get_cpu_info()['flags'])
    if flags.issuperset(required_sse_flags):
        result.append("sse")
    if flags.issuperset(required_avx_flags):
        result.append("avx")
    if flags.issuperset(required_avx512_flags):
        result.append("avx512")
    if flags.issuperset(required_neon_flags):
        result.append("neon")
    if flags.issuperset(required_sve_flags):
        if platform.system() == 'Linux':
            libc = CDLL('libc.so.6')
            prctl_result = libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
            if prctl_result < 0:
                raise OSError("SVE length query failed")
            # The low 16 bits (PR_SVE_VL_LEN_MASK) hold the vector length in bytes;
            # higher bits carry flags such as PR_SVE_VL_INHERIT and must be dropped.
            native_length = 8 * (prctl_result & 0xffff)
            pwr2_length = int(2**math.floor(math.log2(native_length)))
            # Besides the native length, also offer the largest power-of-two length
            # that fits and, if that is a multiple of 256 bits, half of it.
            if pwr2_length % 256 == 0:
                result.append(f"sve{pwr2_length//2}")
            if native_length != pwr2_length:
                result.append(f"sve{pwr2_length}")
            result.append(f"sve{native_length}")
        else:
            result.append("sve")
    return result
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94


def get_cacheline_size(instruction_set):
    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
       Usually, this is identical to the cache line size.

    The value is obtained by compiling and running a small kernel that writes
    ``CachelineSize.symbol`` into a 1x1 array. The result is stored in the
    module-level ``_cachelinesize`` and reused by later calls; note that this
    memo is not keyed by ``instruction_set``.

    Args:
        instruction_set: name of the vector instruction set to query.

    Returns:
        The size as an int, or None if the instruction set description obtained
        from ``get_vector_instruction_set`` has no ``'cachelineSize'`` entry.
    """
    global _cachelinesize

    instruction_sets = get_vector_instruction_set('double', instruction_set)
    if 'cachelineSize' not in instruction_sets:
        return None
    if _cachelinesize is not None:
        return _cachelinesize

    # Imported here rather than at module level; presumably to avoid a circular
    # import with the pystencils package -- TODO confirm.
    import pystencils as ps
    import numpy as np
    from pystencils.cpu.vectorization import CachelineSize

    arr = np.zeros((1, 1), dtype=np.float32)
    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
    assignments = [CachelineSize(), ps.Assignment(f.center, CachelineSize.symbol)]
    ast = ps.create_kernel(assignments, cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    # The kernel stores the cache line size into the single array element.
    kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
    _cachelinesize = int(arr[0, 0])
    return _cachelinesize