Commit 147f6901 authored by Markus Holzer

Merge branch 'nontemporal' into 'master'

Improve non-temporal stores

Closes #25

See merge request !230
parents b075b723 1d39f5f5
Pipeline #31493 passed with stages in 11 minutes and 4 seconds
......@@ -10,20 +10,28 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
shape: size of the array
byte_alignment: alignment in bytes; the start address a of the array satisfies (a % byte_alignment) == 0.
By default, the maximum alignment required by the CPU is used (or 512 bits if it cannot be detected).
When 'cacheline' is specified, the size of a cache line is used.
dtype: numpy data type
byte_offset: offset in bytes of the position that should be aligned, i.e. (a + byte_offset) % byte_alignment == 0;
typically used to align the first inner cell instead of the ghost layer
order: storage linearization order
align_inner_coordinate: if True, the start of each innermost coordinate line is aligned as well
"""
if byte_alignment is True:
from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets,
if byte_alignment is True or byte_alignment == 'cacheline':
from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
get_vector_instruction_set)
type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
instruction_sets = get_supported_instruction_sets()
if instruction_sets is None:
byte_alignment = 64
elif byte_alignment == 'cacheline':
cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
if all([s is None for s in cacheline_sizes]):
byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
for is_name in instruction_sets])
else:
byte_alignment = max([s for s in cacheline_sizes if s is not None])
else:
byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
for is_name in instruction_sets])
......
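A minimal usage sketch of the new 'cacheline' option, assuming aligned_empty is importable from pystencils.alignedarray (the module path is an assumption, not part of this diff):
# Illustrative only: request alignment to the cache line size instead of the SIMD width.
import numpy as np
from pystencils.alignedarray import aligned_empty  # assumed import path
arr = aligned_empty((32, 32), byte_alignment='cacheline', dtype=np.float64)
# The start address should now be a multiple of the detected cache line size;
# 64 bytes is only a typical value and may differ on your CPU.
assert arr.ctypes.data % 64 == 0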
......@@ -313,7 +313,9 @@ class Block(Node):
self._nodes = [fast_subs(a, subs_dict, skip) for a in self._nodes]
return self
def insert_front(self, node):
def insert_front(self, node, if_not_exists=False):
if if_not_exists and len(self._nodes) > 0 and self._nodes[0] == node:
return
if isinstance(node, collections.abc.Iterable):
node = list(node)
for n in node:
......@@ -324,7 +326,7 @@ class Block(Node):
node.parent = self
self._nodes.insert(0, node)
def insert_before(self, new_node, insert_before):
def insert_before(self, new_node, insert_before, if_not_exists=False):
new_node.parent = self
assert self._nodes.count(insert_before) == 1
idx = self._nodes.index(insert_before)
......@@ -337,7 +339,25 @@ class Block(Node):
idx -= 1
else:
break
self._nodes.insert(idx, new_node)
if not if_not_exists or self._nodes[idx] != new_node:
self._nodes.insert(idx, new_node)
def insert_after(self, new_node, insert_after, if_not_exists=False):
new_node.parent = self
assert self._nodes.count(insert_after) == 1
idx = self._nodes.index(insert_after) + 1
# move all assignments (definitions) to the top
if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
while idx > 0:
pn = self._nodes[idx - 1]
if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
idx -= 1
else:
break
if not if_not_exists or not (self._nodes[idx - 1] == new_node
or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
self._nodes.insert(idx, new_node)
def append(self, node):
if isinstance(node, list) or isinstance(node, tuple):
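The if_not_exists flag makes these insert methods idempotent, so a transformation pass that runs more than once does not stack duplicate fence or cache-line-size nodes. A standalone sketch of the same idea on a plain Python list (not the pystencils API):
def insert_front_if_not_exists(nodes, node):
    # Mirrors Block.insert_front(..., if_not_exists=True): skip if already at the front.
    if nodes and nodes[0] == node:
        return
    nodes.insert(0, node)

body = ['loop', 'fence']
insert_front_if_not_exists(body, 'cacheline_size')
insert_front_if_not_exists(body, 'cacheline_size')  # second call is a no-op
assert body == ['cacheline_size', 'loop', 'fence']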
......@@ -816,3 +836,47 @@ class ConditionalFieldAccess(sp.Function):
def __getnewargs__(self):
return self.access, self.outofbounds_condition, self.outofbounds_value
class NontemporalFence(Node):
def __init__(self):
super(NontemporalFence, self).__init__(parent=None)
@property
def symbols_defined(self):
return set()
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, NontemporalFence)
class CachelineSize(Node):
symbol = sp.Symbol("_clsize")
mask_symbol = sp.Symbol("_clsize_mask")
last_symbol = sp.Symbol("_cl_lastvec")
def __init__(self):
super(CachelineSize, self).__init__(parent=None)
@property
def symbols_defined(self):
return set([self.symbol, self.mask_symbol, self.last_symbol])
@property
def undefined_symbols(self):
return set()
@property
def args(self):
return []
def __eq__(self, other):
return isinstance(other, CachelineSize)
......@@ -28,7 +28,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
'loadA': 'ld1[0]',
'storeU': 'st1[0, 1]',
'storeA': 'st1[0, 1]',
'stream': 'st1[0, 1]',
'abs': 'abs[0]',
'==': 'ceq[0, 1]',
......@@ -47,6 +46,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
suffix = f'q_f{bits[data_type]}'
result = dict()
result['bytes'] = 16
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
......@@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
import re
from collections import namedtuple
import hashlib
from typing import Set
import numpy as np
......@@ -7,7 +8,7 @@ import sympy as sp
from sympy.core import S
from sympy.logic.boolalg import BooleanFalse, BooleanTrue
from pystencils.astnodes import KernelFunction, Node
from pystencils.astnodes import KernelFunction, Node, CachelineSize
from pystencils.cpu.vectorization import vec_all, vec_any
from pystencils.data_types import (
PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
......@@ -258,7 +259,7 @@ class CBackend:
arg, data_type, aligned, nontemporal, mask = node.lhs.args
instr = 'storeU'
if aligned:
instr = 'stream' if nontemporal else 'storeA'
instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
if mask != True: # NOQA
instr = 'maskStore' if aligned else 'maskStoreU'
printed_mask = self.sympy_printer.doprint(mask)
......@@ -271,21 +272,60 @@ class CBackend:
else:
rhs = node.rhs
return self._vector_instruction_set[instr].format("&" + self.sympy_printer.doprint(node.lhs.args[0]),
self.sympy_printer.doprint(rhs),
ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
pre_code = ''
if nontemporal and 'cachelineZero' in self._vector_instruction_set:
pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "{\n\t" + \
self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n}\n'
code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
printed_mask) + ';'
flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}"
if nontemporal and 'flushCacheline' in self._vector_instruction_set:
code2 = self._vector_instruction_set['flushCacheline'].format(
ptr, self.sympy_printer.doprint(rhs)) + ';'
code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
+ self.sympy_printer.doprint(rhs) + ';'
code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask) + ';'
code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask) \
+ ';'
code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
return pre_code + code
else:
return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
def _print_NontemporalFence(self, _):
if 'streamFence' in self._vector_instruction_set:
return self._vector_instruction_set['streamFence'] + ';'
else:
return ''
def _print_CachelineSize(self, node):
if 'cachelineSize' in self._vector_instruction_set:
code = f'const size_t {node.symbol} = {self._vector_instruction_set["cachelineSize"]};\n'
code += f'const size_t {node.mask_symbol} = {node.symbol} - 1;\n'
vectorsize = self._vector_instruction_set['bytes']
code += f'const size_t {node.last_symbol} = {node.symbol} - {vectorsize};\n'
return code
else:
return ''
def _print_TemporaryMemoryAllocation(self, node):
align = 64
if self._vector_instruction_set:
align = self._vector_instruction_set['bytes']
else:
align = node.symbol.dtype.base_type.numpy_dtype.itemsize
np_dtype = node.symbol.dtype.base_type.numpy_dtype
required_size = np_dtype.itemsize * node.size + align
size = modulo_ceil(required_size, align)
code = "#if __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n"
code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n"
code += "#elif defined(_MSC_VER)\n"
code = "#if defined(_MSC_VER)\n"
code += "{dtype} {name}=({dtype})_aligned_malloc({size}, {align}) + {offset};\n"
code += "#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L\n"
code += "{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};\n"
code += "#else\n"
code += "{dtype} {name};\n"
code += "posix_memalign((void**) &{name}, {align}, {size});\n"
......@@ -298,7 +338,11 @@ class CBackend:
align=align)
def _print_TemporaryMemoryFree(self, node):
align = 64
if self._vector_instruction_set:
align = self._vector_instruction_set['bytes']
else:
align = node.symbol.dtype.base_type.numpy_dtype.itemsize
code = "#if defined(_MSC_VER)\n"
code += "_aligned_free(%s - %d);\n" % (self.sympy_printer.doprint(node.symbol.name), node.offset(align))
code += "#else\n"
......
......@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
'loadA': 'ld[0x0, 0]',
'storeU': 'xst[1, 0x0, 0]',
'storeA': 'st[1, 0x0, 0]',
'stream': 'stl[1, 0x0, 0]',
'storeAAndFlushCacheline': 'stl[1, 0x0, 0]',
'abs': 'abs[0]',
'==': 'cmpeq[0, 1]',
......@@ -64,6 +64,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
intwidth = 128 // bits['int']
result = dict()
result['bytes'] = 16
for intrinsic_id, function_shortcut in base_names.items():
function_shortcut = function_shortcut.strip()
......@@ -78,6 +79,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
'(__vector float) {1}')
result['+int'] = "vec_add({0}, {1})"
......@@ -98,4 +101,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
result['cachelineSize'] = 'cachelineSize()'
result['cachelineZero'] = 'cachelineZero((void*) {0})'
return result
......@@ -15,6 +15,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
_cache = None
_cachelinesize = None
def get_supported_instruction_sets():
......@@ -56,3 +57,27 @@ def get_supported_instruction_sets():
if flags.issuperset(required_neon_flags):
result.append("neon")
return result
def get_cacheline_size(instruction_set):
"""Get the size (in bytes) of a cache block that can be zeroed without memory access.
Usually, this is identical to the cache line size."""
global _cachelinesize
instruction_sets = get_vector_instruction_set('double', instruction_set)
if 'cachelineSize' not in instruction_sets:
return None
if _cachelinesize is not None:
return _cachelinesize
import pystencils as ps
import numpy as np
arr = np.zeros((1, 1), dtype=np.float32)
f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)]
ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile()
kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0})
_cachelinesize = int(arr[0, 0])
return _cachelinesize
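For illustration, how the helper might be called; the value is machine dependent, and None is returned for instruction sets that provide no 'cachelineSize' intrinsic (e.g. the x86 sets in this merge request):
from pystencils.backends.simd_instruction_sets import (get_cacheline_size,
                                                       get_supported_instruction_sets)

for is_name in get_supported_instruction_sets() or []:
    # None for instruction sets without 'cachelineSize'; a byte count such as 64 otherwise.
    print(is_name, get_cacheline_size(is_name))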
......@@ -110,7 +110,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result = {
'width': width[(data_type, instruction_set)],
'intwidth': width[('int', instruction_set)]
'intwidth': width[('int', instruction_set)],
'bytes': 4 * width[("float", instruction_set)]
}
pre = prefix[instruction_set]
for intrinsic_id, function_shortcut in base_names.items():
......@@ -164,4 +165,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
result['streamFence'] = '_mm_mfence()'
return result
......@@ -58,8 +58,9 @@ import numpy as np
from appdirs import user_cache_dir, user_config_dir
from pystencils import FieldType
from pystencils.astnodes import LoopOverCoordinate
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.data_types import cast_func, VectorType
from pystencils.data_types import cast_func, VectorType, vector_memory_access
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update
......@@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
if ast_node.instruction_set and aligned:
byte_width = ast_node.instruction_set['width'] * item_size
if 'cachelineZero' in ast_node.instruction_set:
has_openmp, has_nontemporal = False, False
for loop in ast_node.atoms(LoopOverCoordinate):
has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
loop.atoms(vector_memory_access)])
if has_openmp and has_nontemporal:
byte_width = ast_node.instruction_set['cachelineSize']
offset = max(max(ast_node.ghost_layers)) * item_size
offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % {byte_width} == 0"
......@@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
"the kernel creation is not specified it will choose a suitable value " \
"automatically. This value might not " \
"be compatible with the allocated arrays."
if type(byte_width) is not int:
message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\
"cacheline size is required."
pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
expected=message)
......
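Putting the pieces together: when OpenMP and non-temporal stores are combined, the arrays have to be aligned to the cache line size rather than just the vector width. A hedged sketch of (essentially) the check that the generated boilerplate performs; the aligned_empty import path is an assumption, get_cacheline_size comes from the diff above:
import numpy as np
from pystencils.alignedarray import aligned_empty  # assumed import path
from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets,
                                                       get_cacheline_size)

instruction_sets = get_supported_instruction_sets() or []
if instruction_sets:
    # Cache line size of the best available instruction set, with a common fallback.
    cl_size = get_cacheline_size(instruction_sets[-1]) or 64
    arr = aligned_empty((16, 16), byte_alignment='cacheline', dtype=np.float64)
    # Analogous to the alignment check in the generated boilerplate
    # (which additionally accounts for the ghost layer offset).
    assert arr.ctypes.data % cl_size == 0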
......@@ -148,6 +148,14 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
if hasattr(indexed, 'field'):
nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True)
if nontemporal:
# insert NontemporalFence after the outermost loop
parent = loop_node.parent
while type(parent.parent.parent) is not ast.KernelFunction:
parent = parent.parent
parent.parent.insert_after(ast.NontemporalFence(), parent, if_not_exists=True)
# insert CachelineSize at the beginning of the kernel
parent.parent.insert_front(ast.CachelineSize(), if_not_exists=True)
if not successful:
warnings.warn("Could not vectorize loop because of non-consecutive memory access")
continue
......
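For a non-temporal store, the pass above inserts a CachelineSize node at the top of the kernel and a NontemporalFence after the outermost loop. A condensed usage sketch, essentially what the test further below exercises; 'avx' is only an example instruction set:
import pystencils as ps

f, g = ps.fields("f, g: double[2D]")
update = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
opt = {'instruction_set': 'avx', 'assume_aligned': True,
       'nontemporal': True, 'assume_inner_stride_one': True}
ast = ps.create_kernel(update, cpu_vectorize_info=opt)
# On x86 the generated code should contain streaming stores and the _mm_mfence() fence.
print(ps.get_code_str(ast))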
......@@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
alignas(16) int data[4] = {a, b, c, d};
return vld1q_s32(data);
}
inline void cachelineZero(void * p) {
__asm__ volatile("dc zva, %0"::"r"(p));
}
inline size_t _cachelineSize() {
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & (1 << 4)) != 0) {
return SIZE_MAX;
}
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
#include <altivec.h>
#undef vector
#undef bool
inline void cachelineZero(void * p) {
#ifdef __xlC__
__dcbz(p);
#else
__asm__ volatile("dcbz 0, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
......@@ -33,25 +33,39 @@ def test_vector_type_propagation():
np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
def test_aligned_and_nt_stores():
def test_aligned_and_nt_stores(openmp=False):
domain_size = (24, 24)
# create a datahandling object
dh = ps.create_data_handling(domain_size, periodicity=(True, True), parallel=False, default_target='cpu')
# fields
g = dh.add_array("g", values_per_cell=1, alignment=True)
alignment = 'cacheline' if openmp else True
g = dh.add_array("g", values_per_cell=1, alignment=alignment)
dh.fill("g", 1.0, ghost_layers=True)
f = dh.add_array("f", values_per_cell=1, alignment=True)
f = dh.add_array("f", values_per_cell=1, alignment=alignment)
dh.fill("f", 0.0, ghost_layers=True)
opt = {'instruction_set': instruction_set, 'assume_aligned': True, 'nontemporal': True,
'assume_inner_stride_one': True}
update_rule = [ps.Assignment(f.center(), 0.25 * (g[-1, 0] + g[1, 0] + g[0, -1] + g[0, 1]))]
ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt)
ast = ps.create_kernel(update_rule, target=dh.default_target, cpu_vectorize_info=opt, cpu_openmp=openmp)
if instruction_set in ['sse'] or instruction_set.startswith('avx'):
assert 'stream' in ast.instruction_set
assert 'streamFence' in ast.instruction_set
if instruction_set in ['neon', 'vsx'] or instruction_set.startswith('sve'):
assert 'cachelineZero' in ast.instruction_set
if instruction_set in ['vsx']:
assert 'storeAAndFlushCacheline' in ast.instruction_set
for instruction in ['stream', 'streamFence', 'cachelineZero', 'storeAAndFlushCacheline', 'flushCacheline']:
if instruction in ast.instruction_set:
assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
kernel = ast.compile()
dh.run_kernel(kernel)
np.testing.assert_equal(np.sum(dh.cpu_arrays['f']), np.prod(domain_size))
def test_aligned_and_nt_stores_openmp():
test_aligned_and_nt_stores(True)
def test_inplace_update():
shape = (9, 9, 3)
......
......@@ -4,7 +4,8 @@ import numpy as np
import sympy as sp
import pystencils as ps
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.backends.simd_instruction_sets import (get_cacheline_size, get_supported_instruction_sets,
get_vector_instruction_set)
from pystencils.data_types import cast_func, VectorType
supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else []
......@@ -76,4 +77,16 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
with pytest.raises(ValueError):
dh.run_kernel(kernel)
else:
dh.run_kernel(kernel)
\ No newline at end of file
dh.run_kernel(kernel)
@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
def test_cacheline_size(instruction_set):
cacheline_size = get_cacheline_size(instruction_set)
if cacheline_size is None:
pytest.skip()
instruction_set = get_vector_instruction_set('double', instruction_set)
vector_size = instruction_set['bytes']
assert cacheline_size > 8 and cacheline_size < 0x100000, "Cache line size is implausible"
assert cacheline_size % vector_size == 0, "Cache line size should be multiple of vector size"
assert cacheline_size & (cacheline_size - 1) == 0, "Cache line size is not a power of 2"