Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sebastian Bindgen
pystencils
Commits
729989d5
Commit
729989d5
authored
Mar 14, 2019
by
Martin Bauer
Browse files
Cache Blocking Transformation
parent
a274d662
Changes
4
Hide whitespace changes
Inline
Side-by-side
astnodes.py
View file @
729989d5
...
...
@@ -184,6 +184,11 @@ class KernelFunction(Node):
def
body
(
self
):
return
self
.
_body
@
body
.
setter
def
body
(
self
,
value
):
self
.
_body
=
value
self
.
_body
.
parent
=
self
@
property
def
args
(
self
):
return
[
self
.
_body
]
...
...
@@ -325,8 +330,9 @@ class PragmaBlock(Block):
class
LoopOverCoordinate
(
Node
):
LOOP_COUNTER_NAME_PREFIX
=
"ctr"
BlOCK_LOOP_COUNTER_NAME_PREFIX
=
"_blockctr"
def
__init__
(
self
,
body
,
coordinate_to_loop_over
,
start
,
stop
,
step
=
1
):
def
__init__
(
self
,
body
,
coordinate_to_loop_over
,
start
,
stop
,
step
=
1
,
is_block_loop
=
False
):
super
(
LoopOverCoordinate
,
self
).
__init__
(
parent
=
None
)
self
.
body
=
body
body
.
parent
=
self
...
...
@@ -336,9 +342,11 @@ class LoopOverCoordinate(Node):
self
.
step
=
step
self
.
body
.
parent
=
self
self
.
prefix_lines
=
[]
self
.
is_block_loop
=
is_block_loop
def
new_loop_with_different_body
(
self
,
new_body
):
result
=
LoopOverCoordinate
(
new_body
,
self
.
coordinate_to_loop_over
,
self
.
start
,
self
.
stop
,
self
.
step
)
result
=
LoopOverCoordinate
(
new_body
,
self
.
coordinate_to_loop_over
,
self
.
start
,
self
.
stop
,
self
.
step
,
self
.
is_block_loop
)
result
.
prefix_lines
=
[
l
for
l
in
self
.
prefix_lines
]
return
result
...
...
@@ -385,9 +393,16 @@ class LoopOverCoordinate(Node):
def
get_loop_counter_name
(
coordinate_to_loop_over
):
return
"%s_%s"
%
(
LoopOverCoordinate
.
LOOP_COUNTER_NAME_PREFIX
,
coordinate_to_loop_over
)
@
staticmethod
def
get_block_loop_counter_name
(
coordinate_to_loop_over
):
return
"%s_%s"
%
(
LoopOverCoordinate
.
BlOCK_LOOP_COUNTER_NAME_PREFIX
,
coordinate_to_loop_over
)
@
property
def
loop_counter_name
(
self
):
return
LoopOverCoordinate
.
get_loop_counter_name
(
self
.
coordinate_to_loop_over
)
if
self
.
is_block_loop
:
return
LoopOverCoordinate
.
get_block_loop_counter_name
(
self
.
coordinate_to_loop_over
)
else
:
return
LoopOverCoordinate
.
get_loop_counter_name
(
self
.
coordinate_to_loop_over
)
@
staticmethod
def
is_loop_counter_symbol
(
symbol
):
...
...
@@ -403,9 +418,16 @@ class LoopOverCoordinate(Node):
def
get_loop_counter_symbol
(
coordinate_to_loop_over
):
return
TypedSymbol
(
LoopOverCoordinate
.
get_loop_counter_name
(
coordinate_to_loop_over
),
'int'
)
@
staticmethod
def
get_block_loop_counter_symbol
(
coordinate_to_loop_over
):
return
TypedSymbol
(
LoopOverCoordinate
.
get_block_loop_counter_name
(
coordinate_to_loop_over
),
'int'
)
@
property
def
loop_counter_symbol
(
self
):
return
LoopOverCoordinate
.
get_loop_counter_symbol
(
self
.
coordinate_to_loop_over
)
if
self
.
is_block_loop
:
return
self
.
get_block_loop_counter_symbol
(
self
.
coordinate_to_loop_over
)
else
:
return
self
.
get_loop_counter_symbol
(
self
.
coordinate_to_loop_over
)
@
property
def
is_outermost_loop
(
self
):
...
...
cpu/kernelcreation.py
View file @
729989d5
...
...
@@ -152,13 +152,14 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
return
ast_node
def
add_openmp
(
ast_node
,
schedule
=
"static"
,
num_threads
=
True
):
def
add_openmp
(
ast_node
,
schedule
=
"static"
,
num_threads
=
True
,
collapse
=
None
):
"""Parallelize the outer loop with OpenMP.
Args:
ast_node: abstract syntax tree created e.g. by :func:`create_kernel`
schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
num_threads: explicitly specify number of threads
collapse: number of nested loops to include in parallel region (see OpenMP collapse)
"""
if
not
num_threads
:
return
...
...
@@ -182,7 +183,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True):
import
multiprocessing
num_threads
=
multiprocessing
.
cpu_count
()
if
loop_range
is
not
None
and
loop_range
<
num_threads
:
if
loop_range
is
not
None
and
loop_range
<
num_threads
and
not
collapse
:
contained_loops
=
[
l
for
l
in
loop_to_parallelize
.
body
.
args
if
isinstance
(
l
,
LoopOverCoordinate
)]
if
len
(
contained_loops
)
==
1
:
contained_loop
=
contained_loops
[
0
]
...
...
@@ -193,4 +194,7 @@ def add_openmp(ast_node, schedule="static", num_threads=True):
except
TypeError
:
pass
loop_to_parallelize
.
prefix_lines
.
append
(
"#pragma omp for schedule(%s)"
%
(
schedule
,))
prefix
=
"#pragma omp for schedule(%s)"
%
(
schedule
,)
if
collapse
:
prefix
+=
" collapse(%d)"
%
(
collapse
,
)
loop_to_parallelize
.
prefix_lines
.
append
(
prefix
)
kernelcreation.py
View file @
729989d5
...
...
@@ -6,12 +6,12 @@ from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAss
from
pystencils.cpu.vectorization
import
vectorize
from
pystencils.simp.assignment_collection
import
AssignmentCollection
from
pystencils.gpucuda.indexing
import
indexing_creator_from_params
from
pystencils.transformations
import
remove_conditionals_in_staggered_kernel
from
pystencils.transformations
import
remove_conditionals_in_staggered_kernel
,
loop_blocking
def
create_kernel
(
assignments
,
target
=
'cpu'
,
data_type
=
"double"
,
iteration_slice
=
None
,
ghost_layers
=
None
,
skip_independence_check
=
False
,
cpu_openmp
=
False
,
cpu_vectorize_info
=
None
,
cpu_openmp
=
False
,
cpu_vectorize_info
=
None
,
cpu_blocking
=
None
,
gpu_indexing
=
'block'
,
gpu_indexing_params
=
MappingProxyType
({})):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
...
...
@@ -32,6 +32,7 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
for documentation of these parameters see vectorize function. Example:
'{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
cpu_blocking: a tuple of block sizes or None if no blocking should be applied
gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'
...
...
@@ -72,8 +73,11 @@ def create_kernel(assignments, target='cpu', data_type="double", iteration_slice
ast
=
create_kernel
(
assignments
,
type_info
=
data_type
,
split_groups
=
split_groups
,
iteration_slice
=
iteration_slice
,
ghost_layers
=
ghost_layers
,
skip_independence_check
=
skip_independence_check
)
omp_collapse
=
None
if
cpu_blocking
:
omp_collapse
=
loop_blocking
(
ast
,
cpu_blocking
)
if
cpu_openmp
:
add_openmp
(
ast
,
num_threads
=
cpu_openmp
)
add_openmp
(
ast
,
num_threads
=
cpu_openmp
,
collapse
=
omp_collapse
)
if
cpu_vectorize_info
:
if
cpu_vectorize_info
is
True
:
vectorize
(
ast
)
...
...
@@ -232,6 +236,10 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
ghost_layers
=
[(
1
,
0
)]
*
dim
blocking
=
kwargs
.
get
(
'cpu_blocking'
,
None
)
if
blocking
:
del
kwargs
[
'cpu_blocking'
]
cpu_vectorize_info
=
kwargs
.
get
(
'cpu_vectorize_info'
,
None
)
if
cpu_vectorize_info
:
del
kwargs
[
'cpu_vectorize_info'
]
...
...
@@ -239,6 +247,8 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
if
target
==
'cpu'
:
remove_conditionals_in_staggered_kernel
(
ast
)
if
blocking
:
loop_blocking
(
ast
,
blocking
)
if
cpu_vectorize_info
is
True
:
vectorize
(
ast
)
elif
isinstance
(
cpu_vectorize_info
,
dict
):
...
...
transformations.py
View file @
729989d5
...
...
@@ -1008,3 +1008,52 @@ def replace_inner_stride_with_one(ast_node: ast.KernelFunction) -> None:
subs_dict
=
{
stride_param
:
1
for
stride_param
in
stride_params
}
if
subs_dict
:
ast_node
.
subs
(
subs_dict
)
def
loop_blocking
(
ast_node
:
ast
.
KernelFunction
,
block_size
)
->
int
:
"""Blocking of loops to enhance cache locality. Modifies the ast node in-place.
Args:
ast_node: kernel function node before vectorization transformation has been applied
block_size: sequence defining block size in x, y, (z) direction
Returns:
number of dimensions blocked
"""
loops
=
[
l
for
l
in
filtered_tree_iteration
(
ast_node
,
ast
.
LoopOverCoordinate
,
stop_type
=
ast
.
SympyAssignment
)]
body
=
ast_node
.
body
coordinates
=
[]
loop_starts
=
{}
loop_stops
=
{}
for
loop
in
loops
:
coord
=
loop
.
coordinate_to_loop_over
if
coord
not
in
coordinates
:
coordinates
.
append
(
coord
)
loop_starts
[
coord
]
=
loop
.
start
loop_stops
[
coord
]
=
loop
.
stop
else
:
assert
loop
.
start
==
loop_starts
[
coord
]
and
loop
.
stop
==
loop_stops
[
coord
],
\
"Multiple loops over coordinate {} with different loop bounds"
.
format
(
coord
)
# Create the outer loops that iterate over the blocks
outer_loop
=
None
for
coord
in
reversed
(
coordinates
):
body
=
ast
.
Block
([
outer_loop
])
if
outer_loop
else
body
outer_loop
=
ast
.
LoopOverCoordinate
(
body
,
coord
,
loop_starts
[
coord
],
loop_stops
[
coord
],
step
=
block_size
[
coord
],
is_block_loop
=
True
)
ast_node
.
body
=
ast
.
Block
([
outer_loop
])
# modify the existing loops to only iterate within one block
for
inner_loop
in
loops
:
coord
=
inner_loop
.
coordinate_to_loop_over
block_ctr
=
ast
.
LoopOverCoordinate
.
get_block_loop_counter_symbol
(
coord
)
loop_range
=
inner_loop
.
stop
-
inner_loop
.
start
if
sp
.
sympify
(
loop_range
).
is_number
and
loop_range
%
block_size
[
coord
]
==
0
:
stop
=
block_ctr
+
block_size
[
coord
]
else
:
stop
=
sp
.
Min
(
inner_loop
.
stop
,
block_ctr
+
block_size
[
coord
])
inner_loop
.
start
=
block_ctr
inner_loop
.
stop
=
stop
return
len
(
coordinates
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment