diff --git a/gpucuda/cudajit.py b/gpucuda/cudajit.py index 048557597c73c9773b44d8876b6e510c8a22d91d..3d55bce1b68ae2564886b84950e18d8a7626eaf8 100644 --- a/gpucuda/cudajit.py +++ b/gpucuda/cudajit.py @@ -67,6 +67,7 @@ def make_python_function(kernel_function_node, argument_dict=None): cache[key] = (args, block_and_thread_numbers) cache_values.append(kwargs) # keep objects alive such that ids remain unique func(*args, **block_and_thread_numbers) + # import pycuda.driver as cuda # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called wrapper.ast = kernel_function_node wrapper.parameters = kernel_function_node.get_parameters() diff --git a/kernelcreation.py b/kernelcreation.py index 8cc012777c1c260e3a42a14eb1b98d6fc31c7940..81a32b3a263f0d0a6d012d868c4de858ac642950 100644 --- a/kernelcreation.py +++ b/kernelcreation.py @@ -1,5 +1,6 @@ from types import MappingProxyType import sympy as sp +import itertools from pystencils.assignment import Assignment from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAssignment from pystencils.cpu.vectorization import vectorize @@ -158,7 +159,8 @@ def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="do raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,)) -def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu', **kwargs): +def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu', + gpu_exclusive_conditions=False, **kwargs): """Kernel that updates a staggered field. .. image:: /img/staggered_grid.svg @@ -173,6 +175,7 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar should be updated. subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions target: 'cpu' or 'gpu' + gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed Returns: @@ -191,18 +194,41 @@ def create_staggered_kernel(staggered_field, expressions, subexpressions=(), tar "same length." final_assignments = [] - for d in range(dim): - cond = sp.And(*[conditions[i] for i in range(dim) if d != i]) + last_conditional = None + + def add(condition, dimensions, as_else_block=False): + nonlocal last_conditional if staggered_field.index_dimensions == 1: - assignments = [Assignment(staggered_field(d), expressions[d])] - a_coll = AssignmentCollection(assignments, list(subexpressions)).new_filtered([staggered_field(d)]) + assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions] + a_coll = AssignmentCollection(assignments, list(subexpressions)) + a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions]) elif staggered_field.index_dimensions == 2: assert staggered_field.has_fixed_index_shape - assignments = [Assignment(staggered_field(d, i), expr) for i, expr in enumerate(expressions[d])] + assignments = [Assignment(staggered_field(d, i), expr) + for d in dimensions + for i, expr in enumerate(expressions[d])] a_coll = AssignmentCollection(assignments, list(subexpressions)) - a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])]) + a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1]) + for d in dimensions]) sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments] - final_assignments.append(Conditional(cond, Block(sp_assignments))) + if as_else_block and last_conditional: + last_conditional.false_block = Conditional(condition, Block(sp_assignments)) + last_conditional = last_conditional.false_block + else: + last_conditional = Conditional(condition, Block(sp_assignments)) + final_assignments.append(last_conditional) + + if target == 'cpu' or not gpu_exclusive_conditions: + for d in range(dim): + cond = sp.And(*[conditions[i] for i in range(dim) if d != i]) + add(cond, [d]) + elif target == 'gpu': + full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)] + for include in itertools.product(*[[1, 0]] * dim): + case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)]) + dimensions_to_include = [i for i in range(dim) if include[i]] + if dimensions_to_include: + add(case_conditions, dimensions_to_include, True) ghost_layers = [(1, 0)] * dim