Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
pycodegen
pystencils
Commits
147f6901
Commit
147f6901
authored
Apr 12, 2021
by
Markus Holzer
Browse files
Merge branch 'nontemporal' into 'master'
Improve non-temporal stores Closes
#25
See merge request
!230
parents
b075b723
1d39f5f5
Pipeline
#31493
passed with stages
in 11 minutes and 4 seconds
Changes
13
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
pystencils/alignedarray.py
View file @
147f6901
...
...
@@ -10,20 +10,28 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
shape: size of the array
byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
By default, use the maximum required by the CPU (or 512 bits if this cannot be detected).
When 'cacheline' is specified, the size of a cache line is used.
dtype: numpy data type
byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
typically used to align first inner cell instead of ghost layer
order: storage linearization order
align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
"""
if
byte_alignment
is
True
:
from
pystencils.backends.simd_instruction_sets
import
(
get_supported_instruction_sets
,
if
byte_alignment
is
True
or
byte_alignment
==
'cacheline'
:
from
pystencils.backends.simd_instruction_sets
import
(
get_supported_instruction_sets
,
get_cacheline_size
,
get_vector_instruction_set
)
type_name
=
BasicType
.
numpy_name_to_c
(
np
.
dtype
(
dtype
).
name
)
instruction_sets
=
get_supported_instruction_sets
()
if
instruction_sets
is
None
:
byte_alignment
=
64
elif
byte_alignment
==
'cacheline'
:
cacheline_sizes
=
[
get_cacheline_size
(
is_name
)
for
is_name
in
instruction_sets
]
if
all
([
s
is
None
for
s
in
cacheline_sizes
]):
byte_alignment
=
max
([
get_vector_instruction_set
(
type_name
,
is_name
)[
'width'
]
*
np
.
dtype
(
dtype
).
itemsize
for
is_name
in
instruction_sets
])
else
:
byte_alignment
=
max
([
s
for
s
in
cacheline_sizes
if
s
is
not
None
])
else
:
byte_alignment
=
max
([
get_vector_instruction_set
(
type_name
,
is_name
)[
'width'
]
*
np
.
dtype
(
dtype
).
itemsize
for
is_name
in
instruction_sets
])
...
...
pystencils/astnodes.py
View file @
147f6901
...
...
@@ -313,7 +313,9 @@ class Block(Node):
self
.
_nodes
=
[
fast_subs
(
a
,
subs_dict
,
skip
)
for
a
in
self
.
_nodes
]
return
self
def
insert_front
(
self
,
node
):
def
insert_front
(
self
,
node
,
if_not_exists
=
False
):
if
if_not_exists
and
len
(
self
.
_nodes
)
>
0
and
self
.
_nodes
[
0
]
==
node
:
return
if
isinstance
(
node
,
collections
.
abc
.
Iterable
):
node
=
list
(
node
)
for
n
in
node
:
...
...
@@ -324,7 +326,7 @@ class Block(Node):
node
.
parent
=
self
self
.
_nodes
.
insert
(
0
,
node
)
def
insert_before
(
self
,
new_node
,
insert_before
):
def
insert_before
(
self
,
new_node
,
insert_before
,
if_not_exists
=
False
):
new_node
.
parent
=
self
assert
self
.
_nodes
.
count
(
insert_before
)
==
1
idx
=
self
.
_nodes
.
index
(
insert_before
)
...
...
@@ -337,7 +339,25 @@ class Block(Node):
idx
-=
1
else
:
break
self
.
_nodes
.
insert
(
idx
,
new_node
)
if
not
if_not_exists
or
self
.
_nodes
[
idx
]
!=
new_node
:
self
.
_nodes
.
insert
(
idx
,
new_node
)
def insert_after(self, new_node, insert_after, if_not_exists=False):
    """Insert ``new_node`` directly after the node ``insert_after``.

    If ``new_node`` is a declaration (``SympyAssignment`` with
    ``is_declaration``), the insertion index is moved before any directly
    preceding ``LoopOverCoordinate``/``Conditional`` nodes, so declarations
    end up above the loops that may use them.

    Args:
        new_node: node to insert; its ``parent`` is set to this block
        insert_after: existing child node after which to insert; must occur
            exactly once in this block
        if_not_exists: if True, skip the insertion when an equal node is
            already adjacent to the computed insertion point
    """
    new_node.parent = self
    # the anchor must be unique, otherwise the insertion point is ambiguous
    assert self._nodes.count(insert_after) == 1
    idx = self._nodes.index(insert_after) + 1

    # move all assignment (definitions to the top)
    if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
        while idx > 0:
            pn = self._nodes[idx - 1]
            if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
                idx -= 1
            else:
                break

    # with if_not_exists, avoid duplicating a node that is already present
    # immediately before or at the insertion index
    if not if_not_exists or not (self._nodes[idx - 1] == new_node
                                 or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
        self._nodes.insert(idx, new_node)
def
append
(
self
,
node
):
if
isinstance
(
node
,
list
)
or
isinstance
(
node
,
tuple
):
...
...
@@ -816,3 +836,47 @@ class ConditionalFieldAccess(sp.Function):
def
__getnewargs__
(
self
):
return
self
.
access
,
self
.
outofbounds_condition
,
self
.
outofbounds_value
class NontemporalFence(Node):
    """AST marker node that makes the backend emit a store fence after
    non-temporal (streaming) stores.

    The node defines no symbols and has no children; the C backend prints it
    as the instruction set's ``streamFence`` intrinsic (if available).
    """

    def __init__(self):
        super(NontemporalFence, self).__init__(parent=None)

    @property
    def symbols_defined(self):
        return set()

    @property
    def undefined_symbols(self):
        return set()

    @property
    def args(self):
        return []

    def __eq__(self, other):
        # all NontemporalFence nodes are interchangeable markers
        return isinstance(other, NontemporalFence)

    def __hash__(self):
        # defining __eq__ alone implicitly sets __hash__ = None, making
        # instances unhashable; restore a hash consistent with equality
        # (all instances compare equal, so they share one hash value)
        return hash(NontemporalFence)
class CachelineSize(Node):
    """AST node that introduces symbols holding the cache line size at runtime.

    The C backend prints it as three ``const size_t`` declarations: the cache
    line size itself (``symbol``), a bit mask ``size - 1`` (``mask_symbol``)
    and the offset of the last vector within a cache line (``last_symbol``).
    """
    # class-level symbols so other code can reference them without an instance
    symbol = sp.Symbol("_clsize")
    mask_symbol = sp.Symbol("_clsize_mask")
    last_symbol = sp.Symbol("_cl_lastvec")

    def __init__(self):
        super(CachelineSize, self).__init__(parent=None)

    @property
    def symbols_defined(self):
        return set([self.symbol, self.mask_symbol, self.last_symbol])

    @property
    def undefined_symbols(self):
        return set()

    @property
    def args(self):
        return []

    def __eq__(self, other):
        # all CachelineSize nodes are interchangeable markers
        return isinstance(other, CachelineSize)

    def __hash__(self):
        # defining __eq__ alone implicitly sets __hash__ = None, making
        # instances unhashable; restore a hash consistent with equality
        return hash(CachelineSize)
pystencils/backends/arm_instruction_sets.py
View file @
147f6901
...
...
@@ -28,7 +28,6 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
'loadA'
:
'ld1[0]'
,
'storeU'
:
'st1[0, 1]'
,
'storeA'
:
'st1[0, 1]'
,
'stream'
:
'st1[0, 1]'
,
'abs'
:
'abs[0]'
,
'=='
:
'ceq[0, 1]'
,
...
...
@@ -47,6 +46,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
suffix
=
f
'q_f
{
bits
[
data_type
]
}
'
result
=
dict
()
result
[
'bytes'
]
=
16
for
intrinsic_id
,
function_shortcut
in
base_names
.
items
():
function_shortcut
=
function_shortcut
.
strip
()
...
...
@@ -81,4 +81,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
result
[
'any'
]
=
f
'vaddlvq_u8(vreinterpretq_u8_u
{
bits
[
data_type
]
}
({{0}})) > 0'
result
[
'all'
]
=
f
'vaddlvq_u8(vreinterpretq_u8_u
{
bits
[
data_type
]
}
({{0}})) == 16*0xff'
result
[
'cachelineSize'
]
=
'cachelineSize()'
result
[
'cachelineZero'
]
=
'cachelineZero((void*) {0})'
return
result
pystencils/backends/cbackend.py
View file @
147f6901
import
re
from
collections
import
namedtuple
import
hashlib
from
typing
import
Set
import
numpy
as
np
...
...
@@ -7,7 +8,7 @@ import sympy as sp
from
sympy.core
import
S
from
sympy.logic.boolalg
import
BooleanFalse
,
BooleanTrue
from
pystencils.astnodes
import
KernelFunction
,
Node
from
pystencils.astnodes
import
KernelFunction
,
Node
,
CachelineSize
from
pystencils.cpu.vectorization
import
vec_all
,
vec_any
from
pystencils.data_types
import
(
PointerType
,
VectorType
,
address_of
,
cast_func
,
create_type
,
get_type_of_expression
,
...
...
@@ -258,7 +259,7 @@ class CBackend:
arg
,
data_type
,
aligned
,
nontemporal
,
mask
=
node
.
lhs
.
args
instr
=
'storeU'
if
aligned
:
instr
=
'stream'
if
nontemporal
else
'storeA'
instr
=
'stream'
if
nontemporal
and
'stream'
in
self
.
_vector_instruction_set
else
'storeA'
if
mask
!=
True
:
# NOQA
instr
=
'maskStore'
if
aligned
else
'maskStoreU'
printed_mask
=
self
.
sympy_printer
.
doprint
(
mask
)
...
...
@@ -271,21 +272,60 @@ class CBackend:
else
:
rhs
=
node
.
rhs
return
self
.
_vector_instruction_set
[
instr
].
format
(
"&"
+
self
.
sympy_printer
.
doprint
(
node
.
lhs
.
args
[
0
]),
self
.
sympy_printer
.
doprint
(
rhs
),
ptr
=
"&"
+
self
.
sympy_printer
.
doprint
(
node
.
lhs
.
args
[
0
])
pre_code
=
''
if
nontemporal
and
'cachelineZero'
in
self
.
_vector_instruction_set
:
pre_code
=
f
"if (((uintptr_t)
{
ptr
}
&
{
CachelineSize
.
mask_symbol
}
) == 0) "
+
"{
\n\t
"
+
\
self
.
_vector_instruction_set
[
'cachelineZero'
].
format
(
ptr
)
+
';
\n
}
\n
'
code
=
self
.
_vector_instruction_set
[
instr
].
format
(
ptr
,
self
.
sympy_printer
.
doprint
(
rhs
),
printed_mask
)
+
';'
flushcond
=
f
"((uintptr_t)
{
ptr
}
&
{
CachelineSize
.
mask_symbol
}
) ==
{
CachelineSize
.
last_symbol
}
"
if
nontemporal
and
'flushCacheline'
in
self
.
_vector_instruction_set
:
code2
=
self
.
_vector_instruction_set
[
'flushCacheline'
].
format
(
ptr
,
self
.
sympy_printer
.
doprint
(
rhs
))
+
';'
code
=
f
"
{
code
}
\n
if (
{
flushcond
}
) {{
\n\t
{
code2
}
\n
}}"
elif
nontemporal
and
'storeAAndFlushCacheline'
in
self
.
_vector_instruction_set
:
tmpvar
=
'_tmp_'
+
hashlib
.
sha1
(
self
.
sympy_printer
.
doprint
(
rhs
).
encode
(
'ascii'
)).
hexdigest
()[:
8
]
code
=
'const '
+
self
.
_print
(
node
.
lhs
.
dtype
).
replace
(
' const'
,
''
)
+
' '
+
tmpvar
+
' = '
\
+
self
.
sympy_printer
.
doprint
(
rhs
)
+
';'
code1
=
self
.
_vector_instruction_set
[
instr
].
format
(
ptr
,
tmpvar
,
printed_mask
)
+
';'
code2
=
self
.
_vector_instruction_set
[
'storeAAndFlushCacheline'
].
format
(
ptr
,
tmpvar
,
printed_mask
)
\
+
';'
code
+=
f
"
\n
if (
{
flushcond
}
) {{
\n\t
{
code2
}
\n
}} else {{
\n\t
{
code1
}
\n
}}"
return
pre_code
+
code
else
:
return
f
"
{
self
.
sympy_printer
.
doprint
(
node
.
lhs
)
}
=
{
self
.
sympy_printer
.
doprint
(
node
.
rhs
)
}
;"
def
_print_NontemporalFence
(
self
,
_
):
if
'streamFence'
in
self
.
_vector_instruction_set
:
return
self
.
_vector_instruction_set
[
'streamFence'
]
+
';'
else
:
return
''
def
_print_CachelineSize
(
self
,
node
):
if
'cachelineSize'
in
self
.
_vector_instruction_set
:
code
=
f
'const size_t
{
node
.
symbol
}
=
{
self
.
_vector_instruction_set
[
"cachelineSize"
]
}
;
\n
'
code
+=
f
'const size_t
{
node
.
mask_symbol
}
=
{
node
.
symbol
}
- 1;
\n
'
vectorsize
=
self
.
_vector_instruction_set
[
'bytes'
]
code
+=
f
'const size_t
{
node
.
last_symbol
}
=
{
node
.
symbol
}
-
{
vectorsize
}
;
\n
'
return
code
else
:
return
''
def
_print_TemporaryMemoryAllocation
(
self
,
node
):
align
=
64
if
self
.
_vector_instruction_set
:
align
=
self
.
_vector_instruction_set
[
'bytes'
]
else
:
align
=
node
.
symbol
.
dtype
.
base_type
.
numpy_dtype
.
itemsize
np_dtype
=
node
.
symbol
.
dtype
.
base_type
.
numpy_dtype
required_size
=
np_dtype
.
itemsize
*
node
.
size
+
align
size
=
modulo_ceil
(
required_size
,
align
)
code
=
"#if __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L
\n
"
code
+=
"{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};
\n
"
code
+=
"#elif defined(_MSC_VER)
\n
"
code
=
"#if defined(_MSC_VER)
\n
"
code
+=
"{dtype} {name}=({dtype})_aligned_malloc({size}, {align}) + {offset};
\n
"
code
+=
"#elif __cplusplus >= 201703L || __STDC_VERSION__ >= 201112L
\n
"
code
+=
"{dtype} {name}=({dtype})aligned_alloc({align}, {size}) + {offset};
\n
"
code
+=
"#else
\n
"
code
+=
"{dtype} {name};
\n
"
code
+=
"posix_memalign((void**) &{name}, {align}, {size});
\n
"
...
...
@@ -298,7 +338,11 @@ class CBackend:
align
=
align
)
def
_print_TemporaryMemoryFree
(
self
,
node
):
align
=
64
if
self
.
_vector_instruction_set
:
align
=
self
.
_vector_instruction_set
[
'bytes'
]
else
:
align
=
node
.
symbol
.
dtype
.
base_type
.
numpy_dtype
.
itemsize
code
=
"#if defined(_MSC_VER)
\n
"
code
+=
"_aligned_free(%s - %d);
\n
"
%
(
self
.
sympy_printer
.
doprint
(
node
.
symbol
.
name
),
node
.
offset
(
align
))
code
+=
"#else
\n
"
...
...
pystencils/backends/ppc_instruction_sets.py
View file @
147f6901
...
...
@@ -29,7 +29,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
'loadA'
:
'ld[0x0, 0]'
,
'storeU'
:
'xst[1, 0x0, 0]'
,
'storeA'
:
'st[1, 0x0, 0]'
,
'stre
am
'
:
'stl[1, 0x0, 0]'
,
'st
o
re
AAndFlushCacheline
'
:
'stl[1, 0x0, 0]'
,
'abs'
:
'abs[0]'
,
'=='
:
'cmpeq[0, 1]'
,
...
...
@@ -64,6 +64,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
intwidth
=
128
//
bits
[
'int'
]
result
=
dict
()
result
[
'bytes'
]
=
16
for
intrinsic_id
,
function_shortcut
in
base_names
.
items
():
function_shortcut
=
function_shortcut
.
strip
()
...
...
@@ -78,6 +79,8 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result
[
'loadA'
]
=
'(__vector double)'
+
result
[
'loadA'
].
format
(
'(float*) {0}'
)
result
[
'storeA'
]
=
result
[
'storeA'
].
format
(
'(float*) {0}'
,
'(__vector float) {1}'
)
result
[
'stream'
]
=
result
[
'stream'
].
format
(
'(float*) {0}'
,
'(__vector float) {1}'
)
result
[
'storeAAndFlushCacheline'
]
=
result
[
'storeAAndFlushCacheline'
].
format
(
'(float*) {0}'
,
'(__vector float) {1}'
)
result
[
'+int'
]
=
"vec_add({0}, {1})"
...
...
@@ -98,4 +101,7 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
result
[
'any'
]
=
'vec_any_ne({0}, (('
+
result
[
'bool'
]
+
') {{'
+
", "
.
join
([
'0'
]
*
width
)
+
'}}))'
result
[
'all'
]
=
'vec_all_ne({0}, (('
+
result
[
'bool'
]
+
') {{'
+
", "
.
join
([
'0'
]
*
width
)
+
'}}))'
result
[
'cachelineSize'
]
=
'cachelineSize()'
result
[
'cachelineZero'
]
=
'cachelineZero((void*) {0})'
return
result
pystencils/backends/simd_instruction_sets.py
View file @
147f6901
...
...
@@ -15,6 +15,7 @@ def get_vector_instruction_set(data_type='double', instruction_set='avx'):
_cache
=
None
_cachelinesize
=
None
def
get_supported_instruction_sets
():
...
...
@@ -56,3 +57,27 @@ def get_supported_instruction_sets():
if
flags
.
issuperset
(
required_neon_flags
):
result
.
append
(
"neon"
)
return
result
def get_cacheline_size(instruction_set):
    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
    Usually, this is identical to the cache line size.

    Returns None if the instruction set provides no ``cachelineSize``
    intrinsic.  The measured value is cached per instruction set: the
    previous scalar cache returned the first measured value for *every*
    subsequently queried instruction set.
    """
    global _cachelinesize

    instruction_sets = get_vector_instruction_set('double', instruction_set)
    if 'cachelineSize' not in instruction_sets:
        return None
    if _cachelinesize is None:
        # lazily turn the module-level sentinel into a per-ISA cache dict
        _cachelinesize = {}
    if instruction_set in _cachelinesize:
        return _cachelinesize[instruction_set]

    import pystencils as ps
    import numpy as np

    # compile and run a minimal kernel that stores the runtime cache line
    # size (CachelineSize.symbol) into a 1x1 field, then read it back
    arr = np.zeros((1, 1), dtype=np.float32)
    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
    ass = [ps.astnodes.CachelineSize(), ps.Assignment(f.center, ps.astnodes.CachelineSize.symbol)]
    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(**{f.name: arr, ps.astnodes.CachelineSize.symbol.name: 0})
    _cachelinesize[instruction_set] = int(arr[0, 0])
    return _cachelinesize[instruction_set]
pystencils/backends/x86_instruction_sets.py
View file @
147f6901
...
...
@@ -110,7 +110,8 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result
=
{
'width'
:
width
[(
data_type
,
instruction_set
)],
'intwidth'
:
width
[(
'int'
,
instruction_set
)]
'intwidth'
:
width
[(
'int'
,
instruction_set
)],
'bytes'
:
4
*
width
[(
"float"
,
instruction_set
)]
}
pre
=
prefix
[
instruction_set
]
for
intrinsic_id
,
function_shortcut
in
base_names
.
items
():
...
...
@@ -164,4 +165,6 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
result
[
'+int'
]
=
f
"
{
pre
}
_add_
{
suffix
[
'int'
]
}
({{0}}, {{1}})"
result
[
'streamFence'
]
=
'_mm_mfence()'
return
result
pystencils/cpu/cpujit.py
View file @
147f6901
...
...
@@ -58,8 +58,9 @@ import numpy as np
from
appdirs
import
user_cache_dir
,
user_config_dir
from
pystencils
import
FieldType
from
pystencils.astnodes
import
LoopOverCoordinate
from
pystencils.backends.cbackend
import
generate_c
,
get_headers
from
pystencils.data_types
import
cast_func
,
VectorType
from
pystencils.data_types
import
cast_func
,
VectorType
,
vector_memory_access
from
pystencils.include
import
get_pystencils_include_path
from
pystencils.kernel_wrapper
import
KernelWrapper
from
pystencils.utils
import
atomic_file_write
,
file_handle_for_atomic_write
,
recursive_dict_update
...
...
@@ -386,6 +387,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
if
ast_node
.
instruction_set
and
aligned
:
byte_width
=
ast_node
.
instruction_set
[
'width'
]
*
item_size
if
'cachelineZero'
in
ast_node
.
instruction_set
:
has_openmp
,
has_nontemporal
=
False
,
False
for
loop
in
ast_node
.
atoms
(
LoopOverCoordinate
):
has_openmp
=
has_openmp
or
any
([
'#pragma omp'
in
p
for
p
in
loop
.
prefix_lines
])
has_nontemporal
=
has_nontemporal
or
any
([
a
.
args
[
0
].
field
==
field
and
a
.
args
[
3
]
for
a
in
loop
.
atoms
(
vector_memory_access
)])
if
has_openmp
and
has_nontemporal
:
byte_width
=
ast_node
.
instruction_set
[
'cachelineSize'
]
offset
=
max
(
max
(
ast_node
.
ghost_layers
))
*
item_size
offset_cond
=
f
"(((uintptr_t) buffer_
{
field
.
name
}
.buf) +
{
offset
}
) %
{
byte_width
}
== 0"
...
...
@@ -394,6 +403,9 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
"the kernel creation is not specified it will choose a suitable value "
\
"automatically. This value might not "
\
"be compatible with the allocated arrays."
if
type
(
byte_width
)
is
not
int
:
message
+=
" Note that when both OpenMP and non-temporal stores are enabled, alignment to the "
\
"cacheline size is required."
pre_call_code
+=
template_check_array
.
format
(
cond
=
offset_cond
,
what
=
"offset"
,
name
=
field
.
name
,
expected
=
message
)
...
...
pystencils/cpu/vectorization.py
View file @
147f6901
...
...
@@ -148,6 +148,14 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
if
hasattr
(
indexed
,
'field'
):
nontemporal
=
(
indexed
.
field
in
nontemporal_fields
)
or
(
indexed
.
field
.
name
in
nontemporal_fields
)
substitutions
[
indexed
]
=
vector_memory_access
(
indexed
,
vec_type
,
use_aligned_access
,
nontemporal
,
True
)
if
nontemporal
:
# insert NontemporalFence after the outermost loop
parent
=
loop_node
.
parent
while
type
(
parent
.
parent
.
parent
)
is
not
ast
.
KernelFunction
:
parent
=
parent
.
parent
parent
.
parent
.
insert_after
(
ast
.
NontemporalFence
(),
parent
,
if_not_exists
=
True
)
# insert CachelineSize at the beginning of the kernel
parent
.
parent
.
insert_front
(
ast
.
CachelineSize
(),
if_not_exists
=
True
)
if
not
successful
:
warnings
.
warn
(
"Could not vectorize loop because of non-consecutive memory access"
)
continue
...
...
pystencils/include/arm_neon_helpers.h
View file @
147f6901
...
...
@@ -17,3 +17,54 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
alignas
(
16
)
int
data
[
4
]
=
{
a
,
b
,
c
,
d
};
return
vld1q_s32
(
data
);
}
inline
void
cachelineZero
(
void
*
p
)
{
__asm__
volatile
(
"dc zva, %0"
::
"r"
(
p
));
}
/* Measure the size of the block zeroed by "dc zva": fill a buffer with 0xff,
 * zero one aligned block inside it, then scan for the extent of the zeroed
 * region.  Returns SIZE_MAX when "dc zva" is not permitted or the result is
 * implausible. */
inline size_t _cachelineSize() {
	// check that dc zva is permitted
	uint64_t dczid;
	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
	if ((dczid & (1 << 4)) != 0) {
		return SIZE_MAX;
	}

	// allocate and fill with ones
	const size_t max_size = 0x100000;
	uint8_t data[2*max_size];
	for (size_t i = 0; i < 2*max_size; ++i) {
		data[i] = 0xff;
	}

	// find alignment offset
	size_t offset = max_size - ((uintptr_t) data) % max_size;

	// zero a cacheline
	cachelineZero((void*) (data + offset));

	// make sure that at least one byte was zeroed
	if (data[offset] != 0) {
		return SIZE_MAX;
	}

	// make sure that nothing was zeroed before the pointer
	if (data[offset-1] == 0) {
		return SIZE_MAX;
	}

	// find the last byte that was zeroed
	for (size_t size = 1; size < max_size; ++size) {
		if (data[offset + size] != 0) {
			return size;
		}
	}

	// too much was zeroed
	return SIZE_MAX;
}
/* Cached wrapper around _cachelineSize(); the measurement runs only once
 * thanks to the static local initializer. */
inline size_t cachelineSize() {
	static size_t size = _cachelineSize();
	return size;
}
pystencils/include/ppc_altivec_helpers.h
View file @
147f6901
#include <altivec.h>
#undef vector
#undef bool
inline
void
cachelineZero
(
void
*
p
)
{
#ifdef __xlC__
__dcbz
(
p
);
#else
__asm__
volatile
(
"dcbz 0, %0"
::
"r"
(
p
)
:
"memory"
);
#endif
}
/* Measure the size of the block zeroed by "dcbz": fill a buffer with 0xff,
 * zero one aligned block inside it, then scan for the extent of the zeroed
 * region.  Returns SIZE_MAX when the result is implausible. */
inline size_t _cachelineSize() {
	// allocate and fill with ones
	const size_t max_size = 0x100000;
	uint8_t data[2*max_size];
	for (size_t i = 0; i < 2*max_size; ++i) {
		data[i] = 0xff;
	}

	// find alignment offset
	size_t offset = max_size - ((uintptr_t) data) % max_size;

	// zero a cacheline
	cachelineZero((void*) (data + offset));

	// make sure that at least one byte was zeroed
	if (data[offset] != 0) {
		return SIZE_MAX;
	}

	// make sure that nothing was zeroed before the pointer
	if (data[offset-1] == 0) {
		return SIZE_MAX;
	}

	// find the last byte that was zeroed
	for (size_t size = 1; size < max_size; ++size) {
		if (data[offset + size] != 0) {
			return size;
		}
	}

	// too much was zeroed
	return SIZE_MAX;
}
/* Cached wrapper around _cachelineSize(); the measurement runs only once
 * thanks to the static local initializer. */
inline size_t cachelineSize() {
	static size_t size = _cachelineSize();
	return size;
}
pystencils_tests/test_vectorization.py
View file @
147f6901
...
...
@@ -33,25 +33,39 @@ def test_vector_type_propagation():
np
.
testing
.
assert_equal
(
dst
[
1
:
-
1
,
1
:
-
1
],
2
*
10.0
+
3
)
def
test_aligned_and_nt_stores
():
def
test_aligned_and_nt_stores
(
openmp
=
False
):
domain_size
=
(
24
,
24
)
# create a datahandling object
dh
=
ps
.
create_data_handling
(
domain_size
,
periodicity
=
(
True
,
True
),
parallel
=
False
,
default_target
=
'cpu'
)
# fields
g
=
dh
.
add_array
(
"g"
,
values_per_cell
=
1
,
alignment
=
True
)
alignment
=
'cacheline'
if
openmp
else
True
g
=
dh
.
add_array
(
"g"
,
values_per_cell
=
1
,
alignment
=
alignment
)
dh
.
fill
(
"g"
,
1.0
,
ghost_layers
=
True
)
f
=
dh
.
add_array
(
"f"
,
values_per_cell
=
1
,
alignment
=
True
)
f
=
dh
.
add_array
(
"f"
,
values_per_cell
=
1
,
alignment
=
alignment
)
dh
.
fill
(
"f"
,
0.0
,
ghost_layers
=
True
)
opt
=
{
'instruction_set'
:
instruction_set
,
'assume_aligned'
:
True
,
'nontemporal'
:
True
,
'assume_inner_stride_one'
:
True
}
update_rule
=
[
ps
.
Assignment
(
f
.
center
(),
0.25
*
(
g
[
-
1
,
0
]
+
g
[
1
,
0
]
+
g
[
0
,
-
1
]
+
g
[
0
,
1
]))]
ast
=
ps
.
create_kernel
(
update_rule
,
target
=
dh
.
default_target
,
cpu_vectorize_info
=
opt
)
ast
=
ps
.
create_kernel
(
update_rule
,
target
=
dh
.
default_target
,
cpu_vectorize_info
=
opt
,
cpu_openmp
=
openmp
)
if
instruction_set
in
[
'sse'
]
or
instruction_set
.
startswith
(
'avx'
):
assert
'stream'
in
ast
.
instruction_set
assert
'streamFence'
in
ast
.
instruction_set
if
instruction_set
in
[
'neon'
,
'vsx'
]
or
instruction_set
.
startswith
(
'sve'
):
assert
'cachelineZero'
in
ast
.
instruction_set
if
instruction_set
in
[
'vsx'
]:
assert
'storeAAndFlushCacheline'
in
ast
.
instruction_set
for
instruction
in
[
'stream'
,
'streamFence'
,
'cachelineZero'
,
'storeAAndFlushCacheline'
,
'flushCacheline'
]:
if
instruction
in
ast
.
instruction_set
:
assert
ast
.
instruction_set
[
instruction
].
split
(
'{'
)[
0
]
in
ps
.
get_code_str
(
ast
)
kernel
=
ast
.
compile
()
dh
.
run_kernel
(
kernel
)
np
.
testing
.
assert_equal
(
np
.
sum
(
dh
.
cpu_arrays
[
'f'
]),
np
.
prod
(
domain_size
))
def test_aligned_and_nt_stores_openmp():
    """Re-run the aligned/non-temporal stores test with OpenMP enabled."""
    test_aligned_and_nt_stores(openmp=True)
def
test_inplace_update
():
shape
=
(
9
,
9
,
3
)
...
...
pystencils_tests/test_vectorization_specific.py
View file @
147f6901
...
...
@@ -4,7 +4,8 @@ import numpy as np
import
sympy
as
sp
import
pystencils
as
ps
from
pystencils.backends.simd_instruction_sets
import
get_supported_instruction_sets
,
get_vector_instruction_set
from
pystencils.backends.simd_instruction_sets
import
(
get_cacheline_size
,
get_supported_instruction_sets
,
get_vector_instruction_set
)
from
pystencils.data_types
import
cast_func
,
VectorType
supported_instruction_sets
=
get_supported_instruction_sets
()
if
get_supported_instruction_sets
()
else
[]
...
...
@@ -76,4 +77,16 @@ def test_alignment_and_correct_ghost_layers(gl_field, gl_kernel, instruction_set
with
pytest
.
raises
(
ValueError
):
dh
.
run_kernel
(
kernel
)
else
:
dh
.
run_kernel
(
kernel
)
\ No newline at end of file
dh
.
run_kernel
(
kernel
)
@
pytest
.
mark
.
parametrize
(
'instruction_set'
,
supported_instruction_sets
)
def
test_cacheline_size
(
instruction_set
):
cacheline_size
=
get_cacheline_size
(
instruction_set
)
if
cacheline_size
is
None
:
pytest
.
skip
()
instruction_set
=
get_vector_instruction_set
(
'double'
,
instruction_set
)
vector_size
=
instruction_set
[
'bytes'
]
assert
cacheline_size
>
8
and
cacheline_size
<
0x100000
,
"Cache line size is implausible"
assert
cacheline_size
%
vector_size
==
0
,
"Cache line size should be multiple of vector size"
assert
cacheline_size
&
(
cacheline_size
-
1
)
==
0
,
"Cache line size is not a power of 2"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment