Commit d7332d59 authored by Martin Bauer's avatar Martin Bauer
Browse files

OpenMP support for staggered kernels

parent 5c9ab9a8
......@@ -3,7 +3,7 @@ from functools import partial
from pystencils.astnodes import SympyAssignment, Block, LoopOverCoordinate, KernelFunction
from pystencils.transformations import resolve_buffer_accesses, resolve_field_accesses, make_loop_over_domain, \
add_types, get_optimal_loop_ordering, parse_base_pointer_info, move_constants_before_loop, \
split_inner_loop, get_base_buffer_index
split_inner_loop, get_base_buffer_index, filtered_tree_iteration
from pystencils.data_types import TypedSymbol, BasicType, StructType, create_type
from pystencils.field import Field, FieldType
import pystencils.astnodes as ast
......@@ -152,7 +152,7 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
return ast_node
def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, assume_single_outer_loop=True):
"""Parallelize the outer loop with OpenMP.
Args:
......@@ -160,6 +160,8 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
num_threads: explicitly specify number of threads
collapse: number of nested loops to include in parallel region (see OpenMP collapse)
assume_single_outer_loop: if True an exception is raised if multiple outer loops are detected for all but
optimized staggered kernels the single-outer-loop assumption should be true
"""
if not num_threads:
return
......@@ -170,31 +172,34 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
wrapper_block = ast.PragmaBlock('#pragma omp parallel' + threads_clause, body.take_child_nodes())
body.append(wrapper_block)
outer_loops = [l for l in body.atoms(ast.LoopOverCoordinate) if l.is_outermost_loop]
outer_loops = [l for l in filtered_tree_iteration(body, LoopOverCoordinate, stop_type=SympyAssignment)
if l.is_outermost_loop]
assert outer_loops, "No outer loop found"
assert len(outer_loops) <= 1, "More than one outer loop found. Not clear where to put OpenMP pragma."
loop_to_parallelize = outer_loops[0]
try:
loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start)
except TypeError:
loop_range = None
if num_threads is None:
import multiprocessing
num_threads = multiprocessing.cpu_count()
if loop_range is not None and loop_range < num_threads and not collapse:
contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
if len(contained_loops) == 1:
contained_loop = contained_loops[0]
try:
contained_loop_range = int(contained_loop.stop - contained_loop.start)
if contained_loop_range > loop_range:
loop_to_parallelize = contained_loop
except TypeError:
pass
prefix = "#pragma omp for schedule(%s)" % (schedule,)
if collapse:
prefix += " collapse(%d)" % (collapse, )
loop_to_parallelize.prefix_lines.append(prefix)
if assume_single_outer_loop and len(outer_loops) > 1:
raise ValueError("More than one outer loop found, only one outer loop expected")
for loop_to_parallelize in outer_loops:
try:
loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start)
except TypeError:
loop_range = None
if num_threads is None:
import multiprocessing
num_threads = multiprocessing.cpu_count()
if loop_range is not None and loop_range < num_threads and not collapse:
contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
if len(contained_loops) == 1:
contained_loop = contained_loops[0]
try:
contained_loop_range = int(contained_loop.stop - contained_loop.start)
if contained_loop_range > loop_range:
loop_to_parallelize = contained_loop
except TypeError:
pass
prefix = "#pragma omp for schedule(%s)" % (schedule,)
if collapse:
prefix += " collapse(%d)" % (collapse, )
loop_to_parallelize.prefix_lines.append(prefix)
......@@ -245,13 +245,21 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info:
del kwargs['cpu_vectorize_info']
openmp = kwargs.get('cpu_openmp', None)
if openmp:
del kwargs['cpu_openmp']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast)
move_constants_before_loop(ast)
omp_collapse = None
if blocking:
loop_blocking(ast, blocking)
omp_collapse = loop_blocking(ast, blocking)
if openmp:
from pystencils.cpu import add_openmp
add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment