diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py index a2dd55352548a4a28399920599394d53b5205465..aaca6d3124c4303ae289e831fdb7b7680369c408 100644 --- a/pystencils/kernelcreation.py +++ b/pystencils/kernelcreation.py @@ -24,6 +24,7 @@ def create_kernel(assignments, cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None, + omp_single_loop=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({}), use_textures_for_interpolation=True, @@ -47,6 +48,7 @@ def create_kernel(assignments, skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for periodicity kernel, that access the field outside the iteration bounds. Use with care! cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP + omp_single_loop: if OpenMP is active: whether multiple outer loops are permitted cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal' for documentation of these parameters see vectorize function. Example: '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}' @@ -99,7 +101,7 @@ def create_kernel(assignments, if cpu_blocking: omp_collapse = loop_blocking(ast, cpu_blocking) if cpu_openmp: - add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse) + add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse, assume_single_outer_loop=omp_single_loop) if cpu_vectorize_info: if cpu_vectorize_info is True: vectorize(ast) @@ -237,7 +239,7 @@ def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions= Returns: AST, see `create_kernel` """ - assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs + assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs and 'omp_single_loop' not in kwargs if isinstance(assignments, AssignmentCollection): subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments @@ -325,7 +327,7 @@ def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions= if target == 'cpu': from pystencils.cpu import create_kernel as create_kernel_cpu - ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, **kwargs) + ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs) else: ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs) return ast @@ -341,6 +343,6 @@ def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions= remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers]) prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional), move_constants_before_loop] - ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, + ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, omp_single_loop=False, cpu_prepend_optimizations=prepend_optimizations, **kwargs) return ast diff --git a/pystencils_tests/test_staggered_kernel.py b/pystencils_tests/test_staggered_kernel.py index a64ff2f00d0ee7abc34e56f7110adb213dcb9992..310220d21442dc1d1a24b0e74e50f3e068374b67 100644 --- a/pystencils_tests/test_staggered_kernel.py +++ b/pystencils_tests/test_staggered_kernel.py @@ -5,7 +5,7 @@ import pystencils as ps class TestStaggeredDiffusion: - def _run(self, num_neighbors, target='cpu'): + def _run(self, num_neighbors, target='cpu', openmp=False): L = (40, 40) D = 0.066 dt = 1 @@ -33,8 +33,8 @@ class TestStaggeredDiffusion: flux += [ps.Assignment(j.staggered_access("SW"), xy_staggered), ps.Assignment(j.staggered_access("NW"), xY_staggered)] - staggered_kernel = ps.create_staggered_kernel(flux, target=dh.default_target).compile() - div_kernel = ps.create_kernel(update, target=dh.default_target).compile() + staggered_kernel = ps.create_staggered_kernel(flux, target=dh.default_target, cpu_openmp=openmp).compile() + div_kernel = ps.create_kernel(update, target=dh.default_target, cpu_openmp=openmp).compile() def time_loop(steps): sync = dh.synchronization_function([c.name]) @@ -74,6 +74,9 @@ class TestStaggeredDiffusion: import pystencils.opencl.autoinit self._run(4, 'opencl') + def test_diffusion_openmp(self): + self._run(4, openmp=True) + def test_staggered_subexpressions(): dh = ps.create_data_handling((10, 10), periodicity=True, default_target='cpu')