Commit b075b723 authored by Markus Holzer's avatar Markus Holzer
Browse files

Merge branch 'ppc' into 'master'

Vectorization improvements

See merge request !228
parents 24dde405 8440bc6e
Pipeline #31302 passed with stage
in 17 minutes and 34 seconds
......@@ -13,7 +13,10 @@ def get_argument_string(function_shortcut):
return arg_string
def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q_registers=True):
def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
if instruction_set != 'neon':
raise NotImplementedError(instruction_set)
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
......@@ -39,16 +42,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
'float': 32,
'int': 32}
if q_registers is True:
q_reg = 'q'
width = 128 // bits[data_type]
intwidth = 128 // bits['int']
suffix = f'q_f{bits[data_type]}'
else:
q_reg = ''
width = 64 // bits[data_type]
intwidth = 64 // bits['int']
suffix = f'_f{bits[data_type]}'
width = 128 // bits[data_type]
intwidth = 128 // bits['int']
suffix = f'q_f{bits[data_type]}'
result = dict()
......@@ -60,10 +56,10 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
result[intrinsic_id] = 'v' + name + suffix + arg_string
result['makeVecConst'] = f'vdup{q_reg}_n_f{bits[data_type]}' + '({0})'
result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in range(width)]) + \
')'
result['makeVecConstInt'] = f'vdup{q_reg}_n_s{bits["int"]}' + '({0})'
result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
......@@ -77,10 +73,12 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon', q
result['bool'] = f'uint{bits[data_type]}x{width}_t'
result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
result['!='] = f'vmvn{q_reg}_u{bits[data_type]}({result["=="]})'
result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
result['&'] = f'vand{q_reg}_u{bits[data_type]}' + '({0}, {1})'
result['|'] = f'vorr{q_reg}_u{bits[data_type]}' + '({0}, {1})'
result['blendv'] = f'vbsl{q_reg}_f{bits[data_type]}' + '({2}, {1}, {0})'
result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
return result
......@@ -588,18 +588,17 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
return self.instruction_set['rsqrt'].format(self._print(expr.args[0]))
else:
return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
elif isinstance(expr, vec_any):
expr_type = get_type_of_expression(expr.args[0])
if type(expr_type) is not VectorType:
return self._print(expr.args[0])
else:
return self.instruction_set['any'].format(self._print(expr.args[0]))
elif isinstance(expr, vec_all):
elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
instr = 'any' if isinstance(expr, vec_any) else 'all'
expr_type = get_type_of_expression(expr.args[0])
if type(expr_type) is not VectorType:
return self._print(expr.args[0])
else:
return self.instruction_set['all'].format(self._print(expr.args[0]))
if isinstance(expr.args[0], sp.Rel):
op = expr.args[0].rel_op
if (instr, op) in self.instruction_set:
return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args])
return self.instruction_set[instr].format(self._print(expr.args[0]))
return super(VectorizedCustomSympyPrinter, self)._print_Function(expr)
......
def get_argument_string(function_shortcut):
args = function_shortcut[function_shortcut.index('[') + 1: -1]
arg_string = "("
for arg in args.split(","):
arg = arg.strip()
if not arg:
continue
if arg in ('0', '1', '2', '3', '4', '5'):
arg_string += "{" + arg + "},"
else:
arg_string += arg + ","
arg_string = arg_string[:-1] + ")"
return arg_string
def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
if instruction_set != 'vsx':
raise NotImplementedError(instruction_set)
base_names = {
'+': 'add[0, 1]',
'-': 'sub[0, 1]',
'*': 'mul[0, 1]',
'/': 'div[0, 1]',
'sqrt': 'sqrt[0]',
'rsqrt': 'rsqrte[0]', # rsqrt is available too, but not on Clang
'loadU': 'xl[0x0, 0]',
'loadA': 'ld[0x0, 0]',
'storeU': 'xst[1, 0x0, 0]',
'storeA': 'st[1, 0x0, 0]',
'stream': 'stl[1, 0x0, 0]',
'abs': 'abs[0]',
'==': 'cmpeq[0, 1]',
'!=': 'cmpne[0, 1]',
'<=': 'cmple[0, 1]',
'<': 'cmplt[0, 1]',
'>=': 'cmpge[0, 1]',
'>': 'cmpgt[0, 1]',
'&': 'and[0, 1]',
'|': 'or[0, 1]',
'blendv': 'sel[0, 1, 2]',
('any', '=='): 'any_eq[0, 1]',
('any', '!='): 'any_ne[0, 1]',
('any', '<='): 'any_le[0, 1]',
('any', '<'): 'any_lt[0, 1]',
('any', '>='): 'any_ge[0, 1]',
('any', '>'): 'any_gt[0, 1]',
('all', '=='): 'all_eq[0, 1]',
('all', '!='): 'all_ne[0, 1]',
('all', '<='): 'all_le[0, 1]',
('all', '<'): 'all_lt[0, 1]',
('all', '>='): 'all_ge[0, 1]',
('all', '>'): 'all_gt[0, 1]',
}
bits = {'double': 64,
'float': 32,
'int': 32}
width = 128 // bits[data_type]
intwidth = 128 // bits['int']
result = dict()
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
name = function_shortcut[:function_shortcut.index('[')]
arg_string = get_argument_string(function_shortcut)
result[intrinsic_id] = 'vec_' + name + arg_string
if data_type == 'double':
# Clang and XL C++ are missing these for doubles
result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
result['+int'] = "vec_add({0}, {1})"
result['width'] = width
result['intwidth'] = intwidth
result[data_type] = f'__vector {data_type}'
result['int'] = '__vector int'
result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
result['makeVecConst'] = '((' + result[data_type] + '){{' + \
", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
result['makeVec'] = '((' + result[data_type] + '){{' + \
", ".join(['{' + data_type + '} {' + str(i) + '}' for i in range(width)]) + '}})'
result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
return result
......@@ -2,19 +2,40 @@ import platform
from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
def get_vector_instruction_set(data_type='double', instruction_set='avx', q_registers=True):
def get_vector_instruction_set(data_type='double', instruction_set='avx'):
if instruction_set in ['neon', 'sve']:
return get_vector_instruction_set_arm(data_type, instruction_set, q_registers)
return get_vector_instruction_set_arm(data_type, instruction_set)
elif instruction_set in ['vsx']:
return get_vector_instruction_set_ppc(data_type, instruction_set)
else:
return get_vector_instruction_set_x86(data_type, instruction_set)
_cache = None
def get_supported_instruction_sets():
"""List of supported instruction sets on current hardware, or None if query failed."""
global _cache
if _cache is not None:
return _cache.copy()
if platform.system() == 'Darwin' and platform.machine() == 'arm64': # not supported by cpuinfo
return ['neon']
elif platform.machine().startswith('ppc64'): # no flags reported by cpuinfo
import subprocess
import tempfile
from pystencils.cpu.cpujit import get_compiler_config
f = tempfile.NamedTemporaryFile(suffix='.cpp')
command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
macros = subprocess.check_output(command, input='', text=True)
if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
_cache = ['vsx']
else:
_cache = []
return _cache.copy()
try:
from cpuinfo import get_cpu_info
except ImportError:
......
......@@ -137,11 +137,11 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result['double'] = f"__m{bit_width}d"
result['float'] = f"__m{bit_width}"
result['int'] = f"__m{bit_width}i"
result['bool'] = f"__m{bit_width}d"
result['bool'] = result[data_type]
result['headers'] = headers[instruction_set]
result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
result['all'] = f"{pre}_movemask_{suf}({{0}}) == 0xF"
result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
if instruction_set == 'avx512':
size = 8 if data_type == 'double' else 16
......
......@@ -155,6 +155,9 @@ def read_config():
('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
('restrict_qualifier', '__restrict__')
])
if platform.machine().startswith('ppc64'):
default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
'-mcpu=native')
elif platform.system().lower() == 'windows':
default_compiler_config = OrderedDict([
('os', 'windows'),
......
......@@ -21,6 +21,36 @@
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
template <typename T, std::size_t Alignment>
class AlignedAllocator
{
public:
typedef T value_type;
template <typename U>
struct rebind {
typedef AlignedAllocator<U, Alignment> other;
};
T * allocate(const std::size_t n) const {
if (n == 0) {
return nullptr;
}
void * const p = _mm_malloc(n*sizeof(T), Alignment);
if (p == nullptr) {
throw std::bad_alloc();
}
return static_cast<T *>(p);
}
void deallocate(T * const p, const std::size_t n) const {
_mm_free(p);
}
};
template <typename Key, typename T>
using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>;
#if defined(__AES__) || defined(_MSC_VER)
QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
__m128i temp3;
......@@ -85,10 +115,10 @@ QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
}
QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) {
std::array<uint32,4> a;
_mm_storeu_si128((__m128i*) a.data(), k128);
alignas(16) std::array<uint32,4> a;
_mm_store_si128((__m128i*) a.data(), k128);
static std::map<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk = aesni_keygen(k128);
......@@ -308,10 +338,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr
#ifdef __AVX2__
QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) {
std::array<uint32,8> a;
_mm256_storeu_si256((__m256i*) a.data(), k256);
alignas(32) std::array<uint32,8> a;
_mm256_store_si256((__m256i*) a.data(), k256);
static std::map<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm256_extractf128_si256(k256, 0));
......@@ -523,10 +553,10 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr
#ifdef __AVX512F__
QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
std::array<uint32,16> a;
_mm512_storeu_si512((__m512i*) a.data(), k512);
alignas(64) std::array<uint32,16> a;
_mm512_store_si512((__m512i*) a.data(), k512);
static std::map<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0));
......@@ -553,7 +583,7 @@ QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
x = _mm512_aesenc_epi128(x, k[7]);
x = _mm512_aesenc_epi128(x, k[8]);
x = _mm512_aesenc_epi128(x, k[9]);
x = _mm512_aesenclast_epi128(x[10], k);
x = _mm512_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0));
__m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1));
......
......@@ -16,6 +16,17 @@
#include <arm_neon.h>
#endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
#include <ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__
#include <altivec.h>
#undef bool
#ifndef _ARCH_PWR8
#include <pveclib/vec_int64_ppc.h>
#endif
#endif
#ifndef __CUDA_ARCH__
#define QUALIFIERS inline
#include "myintrin.h"
......@@ -38,9 +49,14 @@ QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#ifndef __CUDA_ARCH__
// host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b);
return a*b;
#else
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#endif
#else
// device code
*hip = __umulhi(a,b);
......@@ -281,6 +297,209 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ct
}
#endif
#ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
{
#ifndef _ARCH_PWR8
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
__vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
__vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
__vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif
ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key)
{
key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
}
#ifdef __VSX__
template<bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y)
{
// convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
if (high)
{
x = vec_mergel(x, vec_splats(0U));
y = vec_mergel(y, vec_splats(0U));
}
else
{
x = vec_mergeh(x, vec_splats(0U));
y = vec_mergeh(y, vec_splats(0U));
}
#else
if (high)
{
x = vec_mergel(vec_splats(0U), x);
y = vec_mergel(vec_splats(0U), y);
}
else
{
x = vec_mergeh(vec_splats(0U), x);
y = vec_mergeh(vec_splats(0U), y);
}
#endif
// calculate z = x ^ y << (53 - 32))
#ifdef _ARCH_PWR8
__vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#else
__vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#endif
z = vec_xor((__vector unsigned long long) x, z);
// convert uint64 to double
#ifdef __xlC__
__vector double rs = vec_ctd(z, 0);
#else
__vector double rs = vec_ctf(z, 0);
#endif
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
#endif
QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = vec_ctf(ctr[0], 0);
rnd2 = vec_ctf(ctr[1], 0);
rnd3 = vec_ctf(ctr[2], 0);
rnd4 = vec_ctf(ctr[3], 0);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
}
#ifdef __VSX__
QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
#endif
QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
__vector double ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);