diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index 92a6080c73389a815163157efefac2186aeee09e..4e503c4acad55919f353a0cdb2d30f44f4f4742b 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -164,6 +164,13 @@ class CBackend: return "%s%s\n%s" % (prefix, loop_str, self._print(node.body)) def _print_SympyAssignment(self, node): + if self._dialect == 'cuda' and isinstance(node.lhs, sp.Symbol) and node.lhs.name.startswith("shmemslot"): + result = "__shared__ volatile double %s[512]; %s[threadIdx.z * " \ + "blockDim.x*blockDim.y + threadIdx.y * " \ + "blockDim.x + threadIdx.x] = %s;" % \ + (node.lhs.name, node.lhs.name, self.sympy_printer.doprint(node.rhs)) + return result + if node.is_declaration: data_type = "const " + str(node.lhs.dtype) + " " if node.is_const else str(node.lhs.dtype) + " " return "%s%s = %s;" % (data_type, self.sympy_printer.doprint(node.lhs), @@ -254,6 +261,12 @@ class CustomSympyPrinter(CCodePrinter): res = str(expr.evalf().num) return res + def _print_Symbol(self, expr): + if self._dialect == 'cuda' and expr.name.startswith("shmemslot"): + return expr.name + "[threadIdx.z * blockDim.x*blockDim.y + threadIdx.y * blockDim.x + threadIdx.x]" + else: + return super(CustomSympyPrinter, self)._print_Symbol(expr) + def _print_Equality(self, expr): """Equality operator is not printable in default printer""" return '((' + self._print(expr.lhs) + ") == (" + self._print(expr.rhs) + '))' diff --git a/pystencils/simp/liveness_opts.py b/pystencils/simp/liveness_opts.py index 3bee292def73128da0c4db1124087877d69d91c3..370887dadd1c1f34a182ed814b4a796eb22e7b2b 100644 --- a/pystencils/simp/liveness_opts.py +++ b/pystencils/simp/liveness_opts.py @@ -1,69 +1,51 @@ -from sympy import Symbol, Dummy - -from pystencils import Field, Assignment - +import sympy as sp import random import copy +from typing import List +from pystencils import Field, Assignment - -def get_usage(atoms): - reg_usage = {} - for atom 
in atoms: - reg_usage[atom.lhs] = 0 - for atom in atoms: - for arg in atom.rhs.atoms(): - if isinstance(arg, Symbol) and not isinstance(arg, Field.Access): - if arg in reg_usage: - reg_usage[arg] += 1 - else: - print(str(arg) + " is unsatisfied") - return reg_usage - - -def get_definitions(eqs): - definitions = {} - for eq in eqs: - definitions[eq.lhs] = eq - return definitions +fa_symbol_iter = sp.numbered_symbols("fa_") -def get_roots(eqs): - roots = [] - for eq in eqs: - if isinstance(eq.lhs, Field.Access): - roots.append(eq.lhs) - if not roots: - roots.append(eqs[-1].lhs) - return roots - - -def merge_field_accesses(eqs): +def merge_field_accesses(assignments): + """Transformation that introduces symbols for all read field accesses + for multiple read accesses only one symbol is introduced""" field_accesses = {} - for eq in eqs: - for arg in eq.rhs.atoms(): + new_eqs = copy.copy(assignments) + for assignment in new_eqs: + for arg in assignment.rhs.atoms(): if isinstance(arg, Field.Access) and arg not in field_accesses: - field_accesses[arg] = Dummy() + field_accesses[arg] = next(fa_symbol_iter) - for i in range(0, len(eqs)): + for i in range(0, len(new_eqs)): for f, s in field_accesses.items(): - if f in eqs[i].atoms(): - eqs[i] = eqs[i].subs(f, s) + if f in new_eqs[i].atoms(): + new_eqs[i] = new_eqs[i].subs(f, s) for f, s in field_accesses.items(): - eqs.insert(0, Assignment(s, f)) + new_eqs.insert(0, Assignment(s, f)) + + return new_eqs - return eqs +def fuse_eqs(input_eqs, max_depth=1, max_usage=1): + """Inserts subexpressions that are used not more than `max_usage` -def refuse_eqs(input_eqs, max_depth=0, max_usage=1): + Args: + max_depth: complexity metric for the subexpression to insert + if max_depth is larger than the expression tree of the subexpression + the subexpressions is not inserted + + Somewhat the inverse of common subexpression elimination. 
+ """ eqs = copy.copy(input_eqs) usages = get_usage(eqs) definitions = get_definitions(eqs) def inline_trivially_schedulable(sym, depth): - if sym not in usages or usages[sym] > max_usage or depth > max_depth: + if sym not in definitions or sym not in usages or usages[sym] > max_usage or depth > max_depth: return sym rhs = definitions[sym].rhs @@ -74,13 +56,13 @@ def refuse_eqs(input_eqs, max_depth=0, max_usage=1): for idx, eq in enumerate(eqs): if usages[eq.lhs] > 1 or isinstance(eq.lhs, Field.Access): - if not isinstance(eq.rhs, Symbol): - - eqs[idx] = Assignment(eq.lhs, - eq.rhs.func(*[inline_trivially_schedulable(arg, 0) for arg in eq.rhs.args])) + if not isinstance(eq.rhs, sp.Symbol): + eqs[idx] = Assignment( + eq.lhs, + eq.rhs.func(*[inline_trivially_schedulable(arg, 0) for arg in eq.rhs.args])) count = 0 - while (len(eqs) != count): + while len(eqs) != count: count = len(eqs) usages = get_usage(eqs) eqs = [eq for eq in eqs if usages[eq.lhs] > 0 or isinstance(eq.lhs, Field.Access)] @@ -88,16 +70,26 @@ def refuse_eqs(input_eqs, max_depth=0, max_usage=1): return eqs -def schedule_eqs(eqs, candidate_count=20): +def schedule_eqs(assignments: List[Assignment], candidate_count=20): + """Changes order of assignments to save registers. 
+ + Args: + assignments: + candidate_count: tuning parameter, small means fast, but bad scheduling quality + 1 corresponds to full greedy search + + Returns: + list of re-ordered assignments + """ if candidate_count == 0: - return eqs + return assignments - definitions = get_definitions(eqs) + definitions = get_definitions(assignments) definition_atoms = {} for sym, definition in definitions.items(): - definition_atoms[sym] = list(definition.rhs.atoms(Symbol)) - roots = get_roots(eqs) - initial_usages = get_usage(eqs) + definition_atoms[sym] = list(definition.rhs.atoms(sp.Symbol)) + roots = get_roots(assignments) + initial_usages = get_usage(assignments) level = 0 current_level_set = set([frozenset(roots)]) @@ -111,12 +103,18 @@ def schedule_eqs(eqs, candidate_count=20): min_regs = min([len(current_usages[dec_set]) for dec_set in current_level_set]) max_regs = max(max_regs, min_regs) - candidates = [(dec_set, len(current_usages[dec_set])) for dec_set in current_level_set] + + def score_dec_set(dec_set): + score = len(current_usages[dec_set]) # current_schedules[dec_set][0] + return dec_set, score + + candidates = [score_dec_set(dec_set) for dec_set in current_level_set] random.shuffle(candidates) candidates.sort(key=lambda d: d[1]) for dec_set, regs in candidates[:candidate_count]: + for dec in dec_set: new_dec_set = set(dec_set) new_dec_set.remove(dec) @@ -126,7 +124,7 @@ def schedule_eqs(eqs, candidate_count=20): for arg in atoms: if not isinstance(arg, Field.Access): argu = usage.get(arg, initial_usages[arg]) - 1 - if argu == 0: + if argu == 0 and arg in definitions: new_dec_set.add(arg) usage[arg] = argu frozen_new_dec_set = frozenset(new_dec_set) @@ -134,7 +132,6 @@ def schedule_eqs(eqs, candidate_count=20): max_reg_count = max(len(usage), schedule[0]) if frozen_new_dec_set not in new_schedules or max_reg_count < new_schedules[frozen_new_dec_set][0]: - new_schedule = list(schedule[1]) new_schedule.append(definitions[dec]) new_schedules[frozen_new_dec_set] = 
(max_reg_count, new_schedule) @@ -150,9 +147,77 @@ def schedule_eqs(eqs, candidate_count=20): level += 1 schedule = current_schedules[frozenset()] + schedule[1].reverse() - return (schedule[1]) + return schedule[1] def liveness_opt_transformation(eqs): - return refuse_eqs(merge_field_accesses(schedule_eqs(eqs, 3)), 1, 3) + return fuse_eqs(merge_field_accesses(schedule_eqs(eqs, 30)), 1, 3) + + +# ---------- Utilities ----------------------------------------------------------------------------------------- + + +def get_usage(assignments: List[Assignment]): + """Count number of reads for all symbols in list of assignments + + Returns: + dictionary mapping symbol to number of its reads + """ + reg_usage = {} + for assignment in assignments: + for arg in assignment.rhs.atoms(): + if isinstance(arg, sp.Symbol) and not isinstance(arg, Field.Access): + if arg in reg_usage: + reg_usage[arg] += 1 + else: + reg_usage[arg] = 1 + return reg_usage + + +def get_definitions(assignments: List[Assignment]): + """Returns dictionary mapping symbol to its defining assignment""" + definitions = {} + for assignment in assignments: + definitions[assignment.lhs] = assignment + return definitions + + +def get_roots(eqs): + """Returns all field accesses that are used as lhs in assignment (stores) + In case there are no independent assignments, the last one is returned (TODO try if necessary) + """ + roots = [] + for eq in eqs: + if isinstance(eq.lhs, Field.Access): + roots.append(eq.lhs) + if not roots: + roots.append(eqs[-1].lhs) + return roots + + +# ---------- Staggered kernels ----------------------------------------------------------------------------------------- + +def unpack_staggered_eqs(field, expressions, subexpressions): + eqs = copy.deepcopy(subexpressions) + for dim in range(0, len(expressions)): + for vec in range(0, len(expressions[dim])): + eqs.append(Assignment(Field.Access(field, (0, 0, 0, dim, vec)), expressions[dim][vec])) + return eqs + + +def pack_staggered_eqs(eqs, 
field, expressions, subexpressions): + new_matrix_list = [0] * (field.shape[-1] * field.shape[-2]) + + for eq in eqs: + if isinstance(eq.lhs, Field.Access): + new_matrix_list[eq.lhs.offsets[-2] * field.shape[-1] + eq.lhs.offsets[-1]] = eq.rhs + + subexpressions = [eq for eq in eqs if not isinstance(eq.lhs, Field.Access)] + + return (field, [ + sp.Matrix(field.shape[-1], 1, + new_matrix_list[dim * field.shape[-1]:(dim + 1) * field.shape[-1]]) + for dim in range(field.shape[-2]) + ], subexpressions) diff --git a/pystencils/simp/liveness_opts_exp.py b/pystencils/simp/liveness_opts_exp.py new file mode 100644 index 0000000000000000000000000000000000000000..735ab1f5f8f5d9ef006fb7286ee800894f14143d --- /dev/null +++ b/pystencils/simp/liveness_opts_exp.py @@ -0,0 +1,972 @@ +import sympy +import itertools +from sympy import Symbol, Piecewise, Number, postorder_traversal, numbered_symbols +from pystencils.simp.liveness_opts import * + +atom_symbol_iter = numbered_symbols("atom_") + + +def three_operand_form(assignments): + """Transforms list of assignments in three operand form""" + + def atomize(expr, atoms): + if len(expr.args) == 0: + return expr + + atom = next(atom_symbol_iter) + if len(expr.args) == 1: + atoms.append(Assignment(atom, expr.func(atomize(expr.args[0], atoms)))) + return atom + + if isinstance(expr, Piecewise): + atoms.append( + Assignment( + atom, + Piecewise(*[(atomize(expr.expr, atoms), expr.cond) for expr in expr.args]))) + return atom + + atoms.append(Assignment(atom, expr.func(atomize(expr.args[0], atoms), atomize(expr.args[1], atoms)))) + + current_atom = atom + for i in range(2, len(expr.args)): + atom = next(atom_symbol_iter) + atoms.append(Assignment(atom, expr.func(atomize(expr.args[i], atoms), current_atom))) + current_atom = atom + + return current_atom + + atoms = [] + for eq in assignments: + new_atoms = [] + atomize(eq.rhs, new_atoms) + if len(new_atoms) > 0: + new_atoms[-1] = Assignment(eq.lhs, new_atoms[-1].rhs) + else: + 
new_atoms.append(eq) + atoms.extend(new_atoms) + + return atoms + + +def var_to_shmem(eqs, var_count=8): + if var_count > 8: + return eqs + if var_count == 0: + return copy.copy(eqs) + for eq in eqs: + if eq.lhs.name.startswith("shmemslot"): + return eqs + + usage = get_usage(eqs) + usage_list = [(s, usage[s]) for s in usage] + + usage_list.sort(key=lambda s: -s[1]) + + vars = [Symbol("shmemslot" + str(i)) for i in range(0, var_count)] + shmem_eqs = [] + for idx, eq in enumerate(eqs): + shmem_eqs.append(eq.subs([(usage_list[i][0], vars[i]) for i in range(0, var_count)])) + return shmem_eqs + + +def shift_fa_eqs(eqs, direction=1): + def shift_fa(expr, direction): + if isinstance(expr, Field.Access): + return expr.neighbor(0, direction) + if len(expr.args) == 0: + return expr + else: + return expr.func(*[shift_fa(arg, direction) for arg in expr.args]) + + new_eqs = [] + for eq in eqs: + new_eqs.append(shift_fa(eq, direction)) + return new_eqs + + +def get_steal_list(eqs, shifted_eqs): + def is_equal_arg(left_arg, right_arg, steal_list, left_def, right_def, verbose=False): + + if verbose: print("is_equal_arg: IN left_arg " + str(left_arg)) + if verbose: print("is_equal_arg: IN right_arg " + str(right_arg)) + + if verbose: print("is_equal_arg: SUB left_arg " + str(left_arg)) + if verbose: print("is_equal_arg: SUB left_arg " + str(right_arg)) + + if isinstance(left_arg, Number): return left_arg == right_arg + if isinstance(left_arg, Field.Access): return left_arg == right_arg + + if left_arg not in steal_list: return False + + if verbose: print("is_equal_arg: stolen" + str(steal_list[left_arg])) + + return steal_list[left_arg] == right_arg + + def is_equal_expr(left_expr, right_expr, steal_list, left_def, right_def, verbose=False): + + # print(str(left_expr) + " =?= " + str(right_expr)) + + if type(left_expr) != type(right_expr): return False + + if left_expr.func != right_expr.func or len(left_expr.args) != len(right_expr.args): + return False + + if 
len(left_expr.args) == 0: + return is_equal_arg(left_expr, right_expr, steal_list, left_def, right_def, verbose) + + for left_arg_perm in itertools.permutations(left_expr.args): + equal_args = True + for idx, left_arg in enumerate(left_arg_perm): + if not is_equal_arg(left_arg, right_expr.args[idx], steal_list, left_def, right_def, + verbose): + equal_args = False + break + if equal_args: return True + + return False + + steal_from_e = {} + left_def = get_definitions(eqs) + right_def = get_definitions(shifted_eqs) + for lidx, asgn_left in enumerate(eqs): + verbose = False + + for left_subexpr in sympy.postorder_traversal(asgn_left.rhs): + if isinstance(left_subexpr, sympy.Number) or isinstance( + left_subexpr, Field.Access) or isinstance(left_subexpr, Assignment): + continue + for ridx, asgn_right in enumerate(shifted_eqs): + + for right_subexpr in sympy.postorder_traversal(asgn_right.rhs): + left_arg = left_subexpr + right_arg = right_subexpr + if isinstance(left_subexpr, + Symbol) and not isinstance(left_subexpr, Field.Access): + left_arg = left_def[left_subexpr].rhs + if isinstance(right_subexpr, + Symbol) and not isinstance(right_subexpr, Field.Access): + right_arg = right_def[right_subexpr].rhs + + if is_equal_expr(left_arg, right_arg, steal_from_e, left_def, right_def, + verbose): + steal_from_e[left_subexpr] = right_subexpr + # if verbose: + print(str(left_subexpr) + " == " + str(right_subexpr)) + + return steal_from_e + + +def find_symbol(eqs, name): + for eq in eqs: + if eq.lhs.name == name: return eq.lhs + + +def find_expr(eqs, expr): + for idx, eq in enumerate(eqs): + for sub_expr in postorder_traversal(eq): + if sub_expr == expr: + return (idx, sub_expr, eq) + + +def left_steal(eqs, steal_count=2): + shifted_eqs = shift_fa_eqs(eqs) + steal_from_e = get_steal_list(eqs, shifted_eqs) + + usage = get_usage(eqs) + definitions = get_definitions(eqs) + + def count_nodes_up(node): + if isinstance(node, Field.Access): + return 1 + + if node in definitions: + 
node = definitions[node].rhs + + node_count = 0 + for arg in node.args: + if not (arg in usage and usage[arg] > 1): + node_count += count_nodes_up(arg) + return node_count + 1 + + new_eqs = copy.copy(eqs) + for i in range(0, steal_count): + + scores = [(s, count_nodes_up(s)) for s in steal_from_e if isinstance(s, Symbol)] + scores.sort(key=lambda s: s[1], reverse=True) + + print(scores[0:10]) + + sym_xi = scores[0][0] + print(sym_xi) + print(steal_from_e[sym_xi]) + steal_src = find_expr(new_eqs, shift_fa_eqs([steal_from_e[sym_xi]], -1)[0]) + shmem_var = Symbol("shmemslot" + str(i)) + + new_eqs.insert(steal_src[0] + 1, Assignment(shmem_var, steal_src[1])) + + steal_dst = find_expr(new_eqs, sym_xi) + + print(steal_dst) + print(steal_src) + print() + + for idx, eq in enumerate(new_eqs): + if steal_dst[1] in eq.atoms(): + new_eqs[idx] = Assignment(new_eqs[idx].lhs, new_eqs[idx].rhs.subs( + steal_dst[1], shmem_var)) + + new_eqs.pop(steal_dst[0]) + + # Ancestors of donated value cannot be stolen, therefore remove from steal list + def get_ancestor_nodes(node, definitions): + ancestors = [node] + if isinstance(node, sympy.Number): + return [] + if node in definitions: + ancestors.extend(get_ancestor_nodes(definitions[node].rhs, definitions)) + for arg in node.args: + ancestors.extend(get_ancestor_nodes(arg, definitions)) + return ancestors + + for a in get_ancestor_nodes(steal_from_e[sym_xi], definitions): + if a in steal_from_e: + steal_from_e.pop(a) + + # Remove value just stolen from steal list + steal_from_e.pop(sym_xi) + + return new_eqs + + # eqs = atomize_eqs(eqs) + + +def move_forward(atoms): + reg_usage = get_usage(atoms) + i = 0 + while i < len(atoms): + atom = atoms[i] + killed_regs = 0 + for arg in atom.rhs.atoms(): + if isinstance(arg, Field.Access) or not isinstance(arg, Symbol): + continue + reg_usage[arg] -= 1 + if reg_usage[arg] == 0: + killed_regs += 1 + if killed_regs == 0: + first_usage = i + for n in range(i, len(atoms)) or len( + [x for x in 
atoms[n].rhs.atoms() if x in atoms[i].rhs.atoms()]) != 0: + usage = atoms[n].rhs + if atom.lhs in usage.atoms(): + first_usage = n + break + if first_usage - i > 5: + atoms.insert(first_usage - 1, atoms.pop(i)) + for arg in atom.rhs.atoms(): + if isinstance(arg, Field.Access) or not isinstance(arg, Symbol): + continue + reg_usage[arg] += 1 + # print("_move " + str(i) + " " + str(first_usage) + " " + + # str(atom)) + i -= 1 + i += 1 + return atoms + + +def move_backward(atoms): + reg_usage = get_usage(atoms) + i = 0 + while i < len(atoms): + atom = atoms[i] + killed_regs = 0 + for arg in atom.rhs.atoms(): + if isinstance(arg, Field.Access) or not isinstance(arg, Symbol): + continue + reg_usage[arg] -= 1 + if reg_usage[arg] == 0: + killed_regs += 1 + if killed_regs > 1: + last_defined = 0 + for n in range(i - 1, 0, -1): + if len([x for x in atoms[n].rhs.atoms() if x in atoms[i].rhs.atoms() + ]) != 0 or atoms[n].lhs in atom.rhs.atoms(): + last_defined = n + break + if i - last_defined > 5: + atoms.insert(last_defined + 1, atoms.pop(i)) + # print("_move " + str(i) + " " + str(last_defined) + " " + + # str(atom) + " " + str(atoms[last_defined])) + i += 1 + + return atoms + + +def liveness_analysis(atoms): + max_alive_regs = 0 + reg_usage = get_usage(atoms) + alive_atoms = [] + alive_at_peak = [] + for atom in atoms: + + if not isinstance(atom.lhs, Field.Access): + alive_atoms.append(atom.lhs) + for arg in atom.rhs.atoms(): + if isinstance(arg, Field.Access) or not isinstance(arg, Symbol): + continue + if arg not in alive_atoms: + print("_referenced Symbol " + str(arg) + " is not alive") + else: + reg_usage[arg] -= 1 + if reg_usage[arg] == 0: + alive_atoms.remove(arg) + if max_alive_regs < len(alive_atoms): + max_alive_regs = len(alive_atoms) + alive_at_peak = list(alive_atoms) + + # print("_max alive _registers " + str(max_alive_regs)) + return (alive_at_peak, max_alive_regs) + + +def schedule_eqs1(eqs): + definitions = get_definitions(eqs) + roots = get_roots(eqs) + + 
def label_eqs(sym, labels): + if sym not in definitions: + return (0, 0) + if sym in labels: + return labels[sym] + if isinstance(definitions[sym].rhs, Field.Access): + labels[sym] = (1, 1) + return labels[sym] + reg_counts = [] + for arg in definitions[sym].rhs.atoms(): + reg_counts.append(label_eqs(arg, labels)) + if len(reg_counts) == 1: + labels[sym] = reg_counts[0] + return labels[sym] + print(reg_counts) + reg_counts.sort(key=lambda x: x[0]) + label = 0 + if reg_counts[-1] == reg_counts[-2]: + label = reg_counts[-1][0] + 1 + else: + label = reg_counts[-1][0] + labels[sym] = (label, 1) + return labels[sym] + + labels = {} + for root in roots: + label_eqs(root.lhs, labels) + print(labels) + + def schedule_sub_tree(sym, eqs, labels): + expr = definitions[sym] + if expr in eqs: + return + args = [] + for arg in expr.rhs.atoms(): + if isinstance(arg, Symbol) and not isinstance(arg, Field.Access): + args.append(arg) + args.sort(key=lambda arg: -labels[arg][0]) + for arg in args: + schedule_sub_tree(arg, eqs, labels) + eqs.append(expr) + + rescheduled_eqs = [] + for root in roots: + schedule_sub_tree(root.lhs, rescheduled_eqs, labels) + + return rescheduled_eqs + + +def schedule_eqs2(eqs, target=168, branches=2): + definitions = get_definitions(eqs) + roots = get_roots(eqs) + + def recursive_schedule(definitions, needed_syms, usages, target, depth): + if len(needed_syms) > target: + return _none + if len(needed_syms) == 0: + return [] + sym_list = needed_syms.items() + + score_list = [] + for sym, u in sym_list: + if u != 0: + continue + score = 0 + for arg in definitions[sym].rhs.atoms(): + if arg not in needed_syms and arg in usages: + score += 1 + + score_list.append((sym, score)) + score_list.sort(key=lambda x: x[1]) + + for sym, score in score_list[0:branches]: + needed_syms.pop(sym) + for arg in definitions[sym].rhs.atoms(): + if isinstance(arg, Symbol) and not isinstance(arg, Field.Access): + if not arg in needed_syms: + needed_syms[arg] = usages[arg] + 
needed_syms[arg] -= 1 + + instrs = recursive_schedule(definitions, needed_syms, usages, target, depth + 1) + if not instrs is _none: + instrs.append(definitions[sym]) + return instrs + else: + for arg in definitions[sym].rhs.atoms(): + if isinstance(arg, Symbol) and not isinstance(arg, Field.Access): + needed_syms[arg] += 1 + if needed_syms[arg] == usages[arg]: + needed_syms.pop(arg) + needed_syms[sym] = 0 + return _none + + usages = get_usage(eqs) + needed_syms = {u: 0 for u in roots} + instrs = recursive_schedule(definitions, needed_syms, usages, target, 0) + return instrs + + +def schedule_eqs3(eqs, peak_alive=[]): + peak_alive_set = set(peak_alive) + # random.shuffle(eqs) + definitions = get_definitions(eqs) + definition_atoms = {} + for sym, definition in definitions.items(): + definition_atoms[sym] = list(definition.rhs.atoms(Symbol)) + roots = get_roots(eqs) + initial_usages = get_usage(eqs) + + # levels = [set([frozenset(roots)])] + level = 0 + current_level_set = set([frozenset(roots)]) + current_usages = {frozenset(roots): {u: 0 for u in roots}} + current_schedules = {frozenset(roots): (0, [])} + max_regs = 0 + while len(current_level_set) > 0: + new_usages = dict() + new_schedules = dict() + new_level_set = set() + min_regs = min([len(current_usages[dec_set]) for dec_set in current_level_set]) + max_regs = max(max_regs, min_regs) + candidates = [(dec_set, len(current_usages[dec_set]) + + len(peak_alive_set.union(set(current_usages[dec_set].keys()))) * 0.1) + for dec_set in current_level_set] + candidates.sort(key=lambda d: d[1]) + + for dec_set, regs in candidates[:40]: + for dec in dec_set: + new_dec_set = set(dec_set) + new_dec_set.remove(dec) + usage = dict(current_usages[dec_set]) + usage.pop(dec) + atoms = definition_atoms[dec] + for arg in atoms: + if not isinstance(arg, Field.Access): + argu = usage.get(arg, initial_usages[arg]) - 1 + if argu == 0: + new_dec_set.add(arg) + usage[arg] = argu + frozen_new_dec_set = frozenset(new_dec_set) + schedule 
= current_schedules[dec_set] + max_reg_count = max(len(usage), schedule[0]) + if frozen_new_dec_set not in new_schedules or max_reg_count < new_schedules[ + frozen_new_dec_set][0]: + new_schedule = list(schedule[1]) + new_schedule.append(definitions[dec]) + new_schedules[frozen_new_dec_set] = (max_reg_count, new_schedule) + + if len(frozen_new_dec_set) > 0: + new_level_set.add(frozen_new_dec_set) + new_usages[frozen_new_dec_set] = usage + + current_schedules = new_schedules + current_usages = new_usages + current_level_set = new_level_set + # print(len(current_level_set)) + level += 1 + + schedule = current_schedules[frozenset()] + schedule[1].reverse() + return (schedule[1]) + + +split_symbol_iter = numbered_symbols("split_") + + +def split_live_range(eqs, splits=2): + for i in range(0, splits): + max_alive_regs = 0 + reg_usage = get_usage(eqs) + definitions = get_definitions(eqs) + alive_atoms = [] + alive_at_peak = [] + usage_at_peak = [] + + for atom in eqs: + if not isinstance(atom.lhs, Field.Access): + alive_atoms.append(atom.lhs) + for arg in atom.rhs.atoms(): + if isinstance( + arg, Field.Access) or not isinstance(arg, Symbol) or arg not in alive_atoms: + continue + + else: + reg_usage[arg] -= 1 + if reg_usage[arg] == 0: + alive_atoms.remove(arg) + if max_alive_regs < len(alive_atoms): + max_alive_regs = len(alive_atoms) + alive_at_peak = list(alive_atoms) + usage_at_peak = {u: reg_usage[u] for u in alive_at_peak} + peak_eq = atom + peak_idx = eqs.index(peak_eq) + for sym in alive_at_peak: + dependent = False + for arg in definitions[sym].rhs.atoms(): + if arg in definitions: + dependent = True + if dependent: + continue + next_occurence = 0 + for i in range(peak_idx, len(eqs)): + if sym == eqs[i].lhs or sym in eqs[i].rhs.atoms(): + next_occurence = i + break + new_sym = next(split_symbol_iter) + if next_occurence == 0: + continue + eqs.insert(next_occurence, Assignment(new_sym, definitions[sym].rhs)) + + for i in range(peak_idx, len(eqs)): + if sym in 
eqs[i].rhs.atoms(): + eqs[i] = eqs[i].subs(sym, new_sym) + + break + return eqs + + +def duplicate_trivial_ops(eqs, nonTrivialLength=3, trivialSymbolLength=1): + definitions = get_definitions(eqs) + eq_list = list(eqs) + idx = 0 + while (idx < len(eq_list)): + eq = eq_list[idx] + if isinstance(eq.lhs, Field.Access): + idx += 1 + continue + + trivial = True + if len(eq.rhs.atoms()) >= nonTrivialLength: + trivial = False + + for arg in eq.rhs.atoms(): + if arg in definitions: + trivial = False + break + + if len(eq.rhs.atoms(Symbol)) == trivialSymbolLength: + trivial = True + + if trivial: + for i in range(idx + 1, len(eq_list)): + if eq.lhs in eq_list[i].rhs.atoms(): + eq_list[i] = Assignment(eq_list[i].lhs, eq_list[i].rhs.subs({eq.lhs: eq.rhs})) + + eq_list.remove(eq) + else: + idx += 1 + return eq_list + + +def scramble_eqs(eqs, attempts=1000): + max_alive_regs = 0 + reg_usage = get_usage(eqs) + alive_atoms = [] + alive_at_eq = {} + for atom in eqs: + alive_at_eq[atom] = {u: reg_usage[u] for u in alive_atoms} + + if not isinstance(atom.lhs, Field.Access): + alive_atoms.append(atom.lhs) + for arg in atom.rhs.atoms(): + if isinstance(arg, Field.Access) or not isinstance(arg, Symbol): + continue + if arg not in alive_atoms: + print("_referenced Symbol " + str(arg) + " is not alive") + else: + reg_usage[arg] -= 1 + if reg_usage[arg] == 0: + alive_atoms.remove(arg) + if max_alive_regs < len(alive_atoms): + max_alive_regs = len(alive_atoms) + + orig_usage = get_usage(eqs) + for i in range(0, attempts): + a = random.randint(0, len(eqs) - 10) + if a + 2 >= len(eqs) - 10: continue + b = random.randint(a + 2, min(len(eqs) - 10, a + 20)) + eqa = eqs[a] + eqb = eqs[b] + if max_alive_regs - len(alive_at_eq[eqs[a]]) < 15: + continue + + if alive_at_eq[eqs[a + 1]][eqa.lhs] == alive_at_eq[eqb].get(eqa.lhs, 0): + + # print() + # print(eqa) + # print(eqb) + + usage = alive_at_eq[eqs[a - 1]] + eqs.insert(b - 1, eqs.pop(a)) + for n in range(a - 1, b + 1): + atom = eqs[n] + # 
print(str(n) + " " + str(atom)) + alive_at_eq[atom] = dict(usage) + if not isinstance(atom.lhs, Field.Access): + usage[atom.lhs] = orig_usage[atom.lhs] + for arg in atom.rhs.atoms(): + if isinstance(arg, Field.Access) or not isinstance(arg, Symbol): + continue + if arg not in usage: + pass + else: + usage[arg] -= 1 + if usage[arg] == 0: + usage.pop(arg) + return eqs + + +def scheduling_iteration(eqs): + atomized_eqs = three_operand_form(eqs) + eqs = duplicate_trivial_ops(eqs) + rescheduled_eqs = schedule_eqs3(eqs) + alive_at_peak = liveness_analysis(rescheduled_eqs) + + for n in range(0, 5): + split_live_range(atomized_eqs) + rescheduled_eqs = schedule_eqs3(rescheduled_eqs, alive_at_peak) + for i in range(0, 10): + scramble_eqs(atomized_eqs) + return atomized_eqs + + +def fuse_subs(eqs): + new_eqs = copy.copy(eqs) + for eq in eqs: + if isinstance(eq.rhs, sympy.Mul) and len(eq.rhs.args) == 2: + if eq.rhs.args[0] == -1 or eq.rhs.args[1] == -1: + for i, new_eq in enumerate(new_eqs): + if eq.lhs in new_eqs[i].atoms(): + new_eqs[i] = Assignment(new_eqs[i].lhs, new_eq.rhs.subs(eq.lhs, eq.rhs)) + new_eqs.remove(eq) + return new_eqs + + +def fuse_f_m_as(eqs, max_usage=1): + usage = get_usage(eqs) + new_eqs = copy.copy(eqs) + for eq in eqs: + if isinstance(eq.rhs, sympy.Mul) and len(eq.rhs.args) == 2 and usage[eq.lhs] <= max_usage: + for index, new_eq in enumerate(new_eqs): + if isinstance(new_eq.rhs, sympy.Add) and eq.lhs in new_eq.rhs.atoms(): + no_mul_in_args = True + for arg in new_eq.rhs.args: + if isinstance(arg, sympy.Mul): + no_mul_in_args = False + if no_mul_in_args: + new_eqs[index] = new_eqs[index].subs(eq.lhs, eq.rhs) + usage[eq.lhs] -= 1 + if usage[eq.lhs] == 0: + new_eqs.remove(eq) + break + + return new_eqs + + +def remove_sqrt(input_eqs): + def remove_in_expr(expr): + if len(expr.args) > 0: + if expr.func == sympy._pow and (expr.args[0] == 0.5 or expr.args[1] == 0.5): + return sympy.Mul(*[remove_in_expr(a) for a in expr.args]) + else: + return 
expr.func(*[remove_in_expr(a) for a in expr.args]) + else: + return expr + + eqs = [] + for eq in input_eqs: + eqs.append(remove_in_expr(eq)) + return eqs + + +def remove_div(input_eqs): + def remove_in_expr(expr): + if len(expr.args) > 0: + if expr.func == sympy._pow and (expr.args[0] == -1 or expr.args[1] == -1): + return sympy.Mul(*[remove_in_expr(a) for a in expr.args]) + else: + return expr.func(*[remove_in_expr(a) for a in expr.args]) + else: + return expr + + eqs = [] + for eq in input_eqs: + eqs.append(remove_in_expr(eq)) + return eqs + + +def remove_piecewise(input_eqs): + def remove_in_expr(expr): + if len(expr.args) > 0: + if expr.func == sympy.Piecewise: + cond = 1.0 + summands = [] + for a in expr.args: + # print(remove_in_expr(a[0])) + # print(remove_in_expr(a[1])) + # print( sympy.Mul(cond, remove_in_expr(a[0]), remove_in_expr(a[1]))) + summands.append(sympy.Mul(cond, remove_in_expr(a[0]), remove_in_expr(a[1]))) + cond = cond * (1 - remove_in_expr(a[1])) + # print() + + # print(sympy.Add(*summands)) + # print("----") + return sympy.Add(*summands) + else: + return expr.func(*[remove_in_expr(a) for a in expr.args]) + elif expr == sympy.true: + return 1.0 + elif expr == sympy.false: + return 0.0 + else: + return expr + + eqs = [] + for eq in input_eqs: + eqs.append(remove_in_expr(eq)) + return eqs + + +def option_none(eqs): + return eqs + + +def option_atomize(eqs): + return three_operand_form(eqs) + + +def option_reschedule(eqs): + return schedule_eqs(eqs) + + +def option_reschedule_shmem(eqs): + return schedule_eqs(var_to_shmem(eqs, 8)) + + +def option_liveness_opt_transformation(eqs): + return liveness_opt_transformation(eqs) + + +def liveness_opt_transformation_shmem(eqs): + return var_to_shmem(duplicate_trivial_ops(schedule_eqs(eqs, 20)), 6) + + +def liveness_opt_transformation_shmem2(eqs): + return scramble_eqs(schedule_eqs(duplicate_trivial_ops(var_to_shmem(eqs, 6)), 40)) + + +def option_liveness_opt_transformation_shmem(eqs): + return 
liveness_opt_transformation_shmem(eqs) + + +def option_liveness_opt_transformation_shmem2(eqs): + return liveness_opt_transformation_shmem2(eqs) + + +def option_atomize_reschedule_no_sqrt(eqs): + cse_atomized = remove_sqrt(three_operand_form(eqs)) + return schedule_eqs(cse_atomized) + + +def option_atomize_reschedule_no_div(eqs): + cse_atomized = remove_div(three_operand_form(eqs)) + return schedule_eqs(cse_atomized) + + +def option_atomize_reschedule_no_piecewise(eqs): + cse_atomized = remove_piecewise(three_operand_form(eqs)) + return schedule_eqs(cse_atomized) + + +def option_atomize_reschedule_no_sqrt_div(eqs): + cse_atomized = remove_div(remove_sqrt(three_operand_form(eqs))) + return schedule_eqs(cse_atomized) + + +def option_atomize_reschedule_no_all(eqs): + cse_atomized = remove_piecewise(remove_div(remove_sqrt(three_operand_form(eqs)))) + return schedule_eqs(cse_atomized) + + +def option_atomize_reschedule(eqs): + return schedule_eqs(three_operand_form(eqs)) + + +def option_reschedule_atomize(eqs): + return three_operand_form(schedule_eqs(eqs)) + + +def option_reschedule_atomize_scramble(eqs): + eqs = three_operand_form(schedule_eqs(eqs)) + for i in range(0, 10): + scramble_eqs(eqs) + return eqs + + +def option_dupl_reschedule(eqs): + eqs = duplicate_trivial_ops(eqs) + return schedule_eqs(eqs) + + +def option_dupl_atomize_reschedule(eqs): + eqs = duplicate_trivial_ops(eqs) + return schedule_eqs(three_operand_form(eqs)) + + +def option_dupl_atomize_refuse_reschedule(eqs): + eqs = duplicate_trivial_ops(eqs) + eqs = fuse_eqs(three_operand_form(eqs)) + return schedule_eqs(eqs) + + +def option_dupl_reschedule_atomize(eqs): + eqs = duplicate_trivial_ops(eqs) + return three_operand_form(schedule_eqs(eqs)) + + +def option_dupl_reschedule_atomize_scramble(eqs): + eqs = duplicate_trivial_ops(eqs) + eqs = three_operand_form(schedule_eqs(eqs)) + for i in range(0, 10): + scramble_eqs(eqs) + return eqs + + +def option_sched_iteration(eqs): + return 
scheduling_iteration(eqs) + + +def option_fuse_subs(eqs): + atomized_eqs = three_operand_form(eqs) + fused_eqs = fuse_subs(eqs) + return schedule_eqs(fused_eqs) + + +def option_fuse_f_m_as(eqs): + atomized_eqs = three_operand_form(eqs) + fused_eqs = fuse_f_m_as(eqs) + return schedule_eqs(fused_eqs) + + +def option_fuse_both(eqs): + atomized_eqs = three_operand_form(eqs) + fused_eqs = fuse_f_m_as(fuse_subs(eqs)) + return schedule_eqs(fused_eqs) + + +all_sched_options = [] +# all_sched_options.append(option_none) +# all_sched_options.append(option_atomize) +# all_sched_options.append(option_reschedule) +# all_sched_options.append(option_atomize_reschedule) +# all_sched_options.append(option_fuse_subs) +# all_sched_options.append(option_fuse_f_m_as) +# all_sched_options.append(option_fuse_both) +# all_sched_options.append(option_reschedule_atomize) +# all_sched_options.append(option_reschedule_atomize_scramble) +# all_sched_options.append(option_dupl_atomize_reschedule) +# all_sched_options.append(option_dupl_atomize_refuse_reschedule) +# all_sched_options.append(option_dupl_reschedule_atomize) +# all_sched_options.append(option_dupl_reschedule_atomize_scramble) +# all_sched_options.append(option_sched_iteration) +# all_sched_options.append(option_atomize_reschedule_no_piecewise) +# all_sched_options.append(option_atomize_reschedule_no_sqrt) +# all_sched_options.append(option_atomize_reschedule_no_div) +# all_sched_options.append(option_atomize_reschedule_no_sqrt_div) +# all_sched_options.append(option_atomize_reschedule_no_all) +# all_sched_options.append(option_liveness_opt_transformation) +all_sched_options.append(option_reschedule_shmem) +all_sched_options.append(option_liveness_opt_transformation_shmem) +all_sched_options.append(option_liveness_opt_transformation_shmem2) + + +def replace_accesses(eqs): + access_vars = {} + for eq in eqs: + for atom in eq.rhs.atoms(): + if isinstance(atom, Field.Access) and not atom in access_vars: + access_vars[atom] = Dummy() + 
+ new_eqs = [] + for access in access_vars: + new_eqs.append(Assignment(access_vars[access], access)) + + def replace_accesses_in_expr(expr): + if len(expr.args) == 0: + if isinstance(expr, Field.Access): + return access_vars[expr] + else: + return expr + else: + return expr.func(*[replace_accesses_in_expr(a) for a in expr.args]) + + for eq in eqs: + new_eqs.append(Assignment(eq.lhs, replace_accesses_in_expr(eq.rhs))) + + return new_eqs + + +def shifted_equations(): + pass + + +def get_used_nodes(eqs): + used_nodes = {} + for eq in eqs: + used_nodes[eq.lhs] = [] + for atom in eq.rhs.atoms(Symbol): + used_nodes[eq.lhs].append(atom) + return used_nodes + + +def get_used_by(eqs): + used_by = {} + for eq in eqs: + for arg in eq.rhs.atoms(): + used_by.setdefault(arg, []).append(eq.lhs) + return used_by + +# shifted_equations = copy.deepcopy(equations) + +# def shift_accesses(expr): +# new_args = list(expr.args) +# print(expr.args) +# for i in range(0, len(new_args)): +# if isinstance(new_args[i], Field.Access): +# new_args[i] = new_args[i].get_shifted(2, 0, 0) +# if isinstance(new_args[i], sp._expr): +# shift_accesses(new_args[i]) +# expr._args = new_args +# print(expr.args) + +# for eq in shifted_equations: +# shift_accesses(eq.rhs) + +# for lhs in equations: +# for rhs in shifted_equations: +# if lhs == rhs: +# print(str(lhs.lhs) + " " + str(rhs.lhs)) diff --git a/pystencils/simp/liveness_permutations.py b/pystencils/simp/liveness_permutations.py new file mode 100644 index 0000000000000000000000000000000000000000..b16c8f56cb96966b922a1618107703c86dd6efff --- /dev/null +++ b/pystencils/simp/liveness_permutations.py @@ -0,0 +1,190 @@ +from pygrandchem.grandchem import StaggeredKernelParams + +from pystencils.simp.liveness_opts import * +from pystencils.simp.liveness_opts_exp import * +import random +import pycuda.driver as drv + +import pystencils as ps + +from pystencils import show_code +from timeit import default_timer as timer + +import copy + +optSequenceCache = {} 

# Pool of [transformation, default arguments] templates used by
# mutateOptSequence.
# NOTE(review): this patch renames ``refuse_eqs`` to ``fuse_eqs`` in
# liveness_opts.py; verify that ``refuse_eqs`` and ``atomize_eqs`` are still
# provided by one of the star-imported modules.
all_opts = [[atomize_eqs, []], [schedule_eqs, [2]], [duplicate_trivial_ops, [3, 1]],
            [merge_field_accesses, []], [refuse_eqs, [1, 1]], [var_to_shmem, [4]],
            [var_to_shmem_lt, [4]]]


def mutateOptSequence(seq):
    """Return a randomly mutated deep copy of ``seq``; ``seq`` is left untouched.

    One of five mutations is drawn until one of them applies: append a
    template from ``all_opts``, swap two entries, drop one entry, rescale one
    integer argument of an entry, or double/halve one block-size dimension.

    Args:
        seq: object with an ``opts`` list of ``[function, [args]]`` entries
            and a three-element ``blockSize``.

    Returns:
        The mutated copy.
    """
    new_seq = copy.deepcopy(seq)
    changed = False
    while not changed:
        opts = new_seq.opts
        choice = random.randint(0, 4)
        if choice == 0:
            # copy the template so sequences never alias the module-level
            # argument lists in all_opts
            opts.append(copy.deepcopy(random.choice(all_opts)))
            changed = True
        elif choice == 1:
            if len(opts) > 1:
                a = random.randint(0, len(opts) - 1)
                b = random.randint(0, len(opts) - 1)
                opts[a], opts[b] = opts[b], opts[a]
                changed = True
        elif choice == 2:
            if opts:
                # remove by position: list.remove() would drop the first
                # *equal* entry, not necessarily the one that was drawn
                opts.pop(random.randint(0, len(opts) - 1))
                changed = True
        elif choice == 3:
            if opts:
                opt = random.choice(opts)
                change = random.choice([-1, 1])
                if change < 0:
                    factor = random.uniform(0.3, 1.0)
                else:
                    factor = random.uniform(1.0, 3.0)

                if opt[1]:
                    arg = random.randint(0, len(opt[1]) - 1)
                    opt[1][arg] = int(max(0, opt[1][arg] * factor + change))
                    changed = True
        else:
            dim = random.randint(0, 2)
            grow = random.randint(0, 1) == 0
            old_block = new_seq.blockSize
            new_block = list(old_block)

            if grow:
                new_block[dim] = min(512, new_block[dim] * 2)
            else:
                new_block[dim] = max(1, new_block[dim] // 2)

            if new_block[0] * new_block[1] * new_block[2] <= 512 and (
                    new_block[0] >= 32 or new_block[0] >= old_block[0]):
                # The original assigned to ``seq.blockSize``: that changed the
                # caller's sequence (possibly a key of optSequenceCache) and
                # left the returned copy without the mutation.
                new_seq.blockSize = tuple(new_block)
                changed = True

    return new_seq


def evolvePopulation(pop, eqs_set, dhs, staggered_params=None):
    """Run one generation: mutate, rate, print a score table and select survivors.

    A fresh ``livenessOptSequence()`` is appended to ``pop`` (in place), the
    leading entries of ``pop`` are mutated one to three times, all distinct
    candidates are rated with ``rateSequence`` and the best cached sequence
    that is not a current candidate is added to the ranking. Candidates are
    then visited by ascending score sum and kept while the size limits allow.

    Args:
        pop: list of hashable optimisation sequences; modified in place.
        eqs_set: equation sets, forwarded to ``rateSequence``.
        dhs: forwarded to ``rateSequence`` as its ``dh`` argument.
        staggered_params: forwarded to ``rateSequence``.

    Returns:
        The list of surviving sequences.
    """
    pop.append(livenessOptSequence())
    once_mutated = [mutateOptSequence(seq) for seq in pop[0:6]]
    twice_mutated = [mutateOptSequence(mutateOptSequence(seq)) for seq in pop[0:4]]
    thrice_mutated = [
        mutateOptSequence(mutateOptSequence(mutateOptSequence(seq))) for seq in pop[0:3]
    ]

    candidates = set(pop + once_mutated + twice_mutated + thrice_mutated)

    scores = [(seq, *rateSequence(seq, eqs_set, dhs, staggered_params)) for seq in candidates]

    # Best previously rated sequence that is not a candidate this round.
    old_scores = [(cached, optSequenceCache[cached][0], [0, 0])
                  for cached in optSequenceCache if cached not in candidates]
    old_scores.sort(key=lambda entry: sum(entry[1]))
    if old_scores:
        scores.append(old_scores[0])

    print()
    scores.sort(key=lambda entry: sum(entry[1]))
    new_pop = []
    count_old_seqs = 0

    for score in scores:
        if score[0] not in optSequenceCache:
            # Diagnostic for a sequence whose hash no longer matches its cache
            # key; the lookups below raise KeyError in that case.
            print("Everything in scores: ")
            for entry in scores:
                print(entry[0])

            print("Not in optSequenceCache: ")
            print(score[0])
            print(hash(score[0]))

        survive = False
        if (len(new_pop) < 4 or count_old_seqs < 3) and len(new_pop) < 10:
            if optSequenceCache[score[0]][1] > 3:
                count_old_seqs += 1
            new_pop.append(score[0])
            survive = True

        print("".join(["{:6.2f} ".format(sc) for sc in score[1]]) + "(" +
              "".join(["{:3d} ".format(sc) for sc in score[2]]) + "): " +
              "{:2d}".format(optSequenceCache[score[0]][1]) + (" * " if survive else " ") +
              str(score[0]))
    print()

    return new_pop


def rateSequence(seq, eqs_set, dh, staggered_params=None):
    """Benchmark ``seq`` on every equation set and fold the result into the cache.

    The score list holds the kernel times followed by the same times scaled
    by ``max(0.0, (len(seq.opts) - 3) * 0.1)``. ``optSequenceCache[seq]``
    stores the running mean of these scores and the number of ratings; once a
    sequence has been rated more than ten times the cached scores are
    returned without benchmarking.

    Returns:
        ``(scores, registers)``; ``registers`` is ``[0, 0]`` when the cached
        value is returned.
    """
    if seq not in optSequenceCache:
        optSequenceCache[seq] = [[], 0]
    cache_entry = optSequenceCache[seq]

    if cache_entry[1] > 10:
        return (cache_entry[0], [0, 0])

    print(cache_entry[1], end=" ")
    print(seq)

    transformed_eqs_set = [seq.applyOpts(eqs) for eqs in eqs_set]

    kernel_results = [
        bench_kernel(eqs, dh, seq.blockSize, staggered_params) for eqs in transformed_eqs_set
    ]
    kernel_registers = [k[1] for k in kernel_results]
    length_factor = max(0.0, (len(seq.opts) - 3) * 0.1)
    result = [k[0] for k in kernel_results] + [k[0] * length_factor for k in kernel_results]

    if cache_entry[1] == 0:
        cache_entry[0] = result
    else:
        # running mean over all ratings so far
        for i, value in enumerate(result):
            cache_entry[0][i] = (cache_entry[0][i] * cache_entry[1] + value) / (
                cache_entry[1] + 1)

    cache_entry[1] += 1

    return cache_entry[0], kernel_registers


def bench_kernel(eqs, dh, blockSize=(64, 2, 1), staggered_params=None):
    """Compile ``eqs`` as a GPU kernel and time two consecutive runs.

    Args:
        eqs: equations to compile.
        dh: data handling object providing ``run_kernel``.
        blockSize: value passed as ``block_size`` in ``gpu_indexing_params``.
        staggered_params: if given, ``eqs`` are packed with
            ``pack_staggered_eqs`` and a staggered kernel is created.

    Returns:
        ``(mean time per run as reported by the CUDA events, kernel.num_regs)``.
    """
    indexing = {"block_size": blockSize}
    if staggered_params is None:
        kernel = ps.create_kernel(eqs, target="gpu", gpu_indexing_params=indexing).compile()
    else:
        kernel = ps.create_staggered_kernel(
            *pack_staggered_eqs(eqs, *staggered_params),
            target="gpu",
            gpu_indexing_params=indexing).compile()

    start = drv.Event()
    end = drv.Event()

    start.record()
    dh.run_kernel(kernel, timestep=1)
    dh.run_kernel(kernel, timestep=1)
    end.record()
    end.synchronize()
    msec = start.time_till(end) / 2
    return msec, kernel.num_regs

# ---- next file in this patch (header kept verbatim) ----
# diff --git a/pystencils_tests/liveness_opts/compare_seqs.py b/pystencils_tests/liveness_opts/compare_seqs.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0604cabf82bfcd93fc69006b77cdbf87fe10a4b0
# --- /dev/null
# +++ b/pystencils_tests/liveness_opts/compare_seqs.py
# @@ -0,0 +1,181 @@
# coding: utf-8

# In[32]:

import pickle
import warnings
import pystencils as ps
from pygrandchem.grandchem import GrandChemGenerator
from pygrandchem.scenarios import system_4_2, system_3_1
from pygrandchem.initialization import init_boxes, smooth_fields
from pygrandchem.scenarios import benchmark_configs

from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational
from pystencils.simp import sympy_cse_on_assignment_list
from pystencils.simp.liveness_opts import *
from pystencils.simp.liveness_opts_exp import *

from pystencils.simp.liveness_permutations import *

import pycuda

import sys
from subprocess import run, PIPE

from pystencils import show_code
import pycuda.driver as drv

import importlib

configs = benchmark_configs()


def get_config(name):
    """Return the benchmark configuration registered under ``name``."""
    return configs[name]


domain_size = (512, 512, 128)
periodicity = (True, True, False)

optimization = {'gpu_indexing_params': {"block_size": (32, 4, 2)}}
#bestSeqs = pickle.load(open('best_seq.pickle', 'rb'))

scenarios = ["42_varT_freeEnergy", "31_varT_aniso_rot"]
kernel_types = ["phi_full", "phi_partial1", "phi_partial2", "mu_full", "mu_partial1", "mu_partial2"]

# Per-scenario table; indexed below as [scenario][(kernel_type, liveness_trans)].
liveness_trans_seqs = importlib.import_module(
    "gpu_liveness_trans_sequences").gpu_liveness_trans_sequences

for scenario in scenarios:

    config = get_config(scenario)

    phases, components = config['Parameters']['phases'], config['Parameters']['components']

    # Adding fields
    dh = ps.create_data_handling(domain_size, periodicity=periodicity, default_target='gpu')
    phi_src = dh.add_array(
        'phi_src',
        values_per_cell=phases,
        layout='fzyx',
        latex_name='phi_s')
    mu_src = dh.add_array(
        'mu_src',
        values_per_cell=components,
        layout='fzyx',
        latex_name="mu_s")
    mu_stag = dh.add_array(
        'mu_stag', values_per_cell=(dh.dim, components), layout='f')
    phi_stag = dh.add_array('phi_stag', values_per_cell=(dh.dim, phases), layout='f')

    phi_dst = dh.add_array_like('phi_dst', 'phi_src')
    mu_dst = dh.add_array_like('mu_dst', 'mu_src')

    def make_generator(**extra):
        """Build a GrandChemGenerator on this scenario's fields; ``extra`` adds keyword options."""
        return GrandChemGenerator(
            phi_src,
            phi_dst,
            mu_src,
            mu_dst,
            config['FreeEnergy'],
            config['Parameters'],
            #conc=c,
            mu_staggered=mu_stag,
            phi_staggered=phi_stag,
            use_block_offsets=False,
            compile_kernel=False,
            **extra)

    gc = make_generator()

    mu_full_eqs = gc.mu_full()
    phi_full_eqs = gc.phi_full()

    phi_kernel = ps.create_kernel(phi_full_eqs, target='gpu', **optimization).compile()
    mu_kernel = ps.create_kernel(mu_full_eqs, target='gpu', **optimization).compile()

    c = dh.add_array(
        'c', values_per_cell=components, layout='fzyx', gpu=False)

    init_boxes(dh)
    #initialize_concentration_field(dh, free_energy, config['Parameters']['initial_concentration'])
    smooth_fields(dh, sigma=0.4, iterations=5, dim=dh.dim)
    dh.synchronization_function(['phi_src', 'phi_dst', 'mu_src', 'mu_dst'])()

    def bench_kernels(mu_kernel, phi_kernel):
        """Per kernel: one untimed run, then print registers and the mean of two timed runs."""
        start = drv.Event()
        end = drv.Event()

        for label, kernel in (("mu_kernel", mu_kernel), ("phi_kernel", phi_kernel)):
            dh.run_kernel(kernel, timestep=1)
            start.record()
            dh.run_kernel(kernel, timestep=1)
            dh.run_kernel(kernel, timestep=1)
            end.record()
            end.synchronize()
            msec = start.time_till(end) / 2
            print("{}: {} {:5.3f} ms".format(label, kernel.num_regs, msec))

    print("warmup")
    bench_kernels(mu_kernel, phi_kernel)
    dh.swap('mu_src', 'mu_dst')
    dh.swap('phi_src', 'phi_dst')
    print()

    for kernel_type in kernel_types:
        print(scenario + " " + kernel_type)
        for div_sqrt_approx in [True, False]:
            print("Approximations for div/sqrt: " + str(div_sqrt_approx))
            for liveness_trans in [True, False]:

                gc = make_generator(
                    fast_divisions=div_sqrt_approx,
                    fast_sqrts=div_sqrt_approx,
                    gpu_liveness_trans_sequences=(liveness_trans_seqs[scenario]
                                                  if liveness_trans else None))

                # Reset for every run. The original set this once per
                # scenario, so after the first "*_partial1" kernel all later
                # kernel types were benchmarked with the stale staggered
                # equations instead of their own.
                staggered_params = None

                if kernel_type == "phi_full":
                    eqs = gc.phi_full()
                elif kernel_type == "mu_full":
                    eqs = gc.mu_full()
                elif kernel_type == "mu_partial1":
                    staggered_params = gc.mu_partial1()
                elif kernel_type == "mu_partial2":
                    eqs = gc.mu_partial2()
                elif kernel_type == "phi_partial1":
                    staggered_params = gc.phi_partial1()
                elif kernel_type == "phi_partial2":
                    eqs = gc.phi_partial2()
                else:
                    print("Specified kernel does not exist")
                    # explicit non-zero exit instead of the site-provided exit()
                    raise SystemExit(1)

                if staggered_params is not None:
                    eqs = unpack_staggered_eqs(*staggered_params)

                print(
                    bench_kernel(
                        eqs, dh, liveness_trans_seqs[scenario][(kernel_type,
                                                                liveness_trans)].blockSize,
                        staggered_params))
        # NOTE(review): the nesting level of this blank separator line was not
        # recoverable from the collapsed text; placed once per kernel type.
        print()

# ---- next file in this patch (header kept verbatim) ----
# diff --git a/pystencils_tests/liveness_opts/count_ops.ipynb b/pystencils_tests/liveness_opts/count_ops.ipynb
# new file mode 100644
# index 0000000000000000000000000000000000000000..b48a14d07cd1392701777c22badeaf5e1d6e494b
# --- /dev/null
# +++ b/pystencils_tests/liveness_opts/count_ops.ipynb
# @@ -0,0 +1,2150 @@
# Start of the notebook JSON, kept verbatim; it continues on the following line of the patch:
# +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "\n", + "\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport pystencils.simp.liveness_opts\n", + "%aimport pystencils.simp.liveness_opts_exp\n", + "%aimport pystencils.shmemvar\n", + "%aimport pystencils.backends.cbackend\n", + "%aimport pystencils.transformations\n", + "%aimport pygrandchem.grandchem_generation\n", + "\n", + "\n", + "%load_ext line_profiler\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import free_energy_from_config_object, FreeEnergy\n", + "from pygrandchem.initialization import *\n", + "from pygrandchem_tests.config_anisotropic import get_system\n", + "from pystencils.boundaries import *\n", + "\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "\n", + "from pystencils.shmemvar import *\n", + "import graphviz\n", + "\n", + "\n", + "import pycuda.compiler\n", + "from pycuda.compiler import SourceModule\n", + "\n", +
"import sys\n", + "from subprocess import run, PIPE\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compiling and simplifying φ update equations - this may take a while\n", + "Compiling and simplifying μ update equations - this may take a while\n", + "Compiling and simplifying μ update equations - this may take a while\n" + ] + } + ], + "source": [ + "\n", + "domain_size = (128, 128, 128)\n", + "periodicity = (True, True, False)\n", + "fast_simplex_projection = True\n", + "optimization = {'gpu_indexing_params': {\"block_size\": (32, 4, 4)}}\n", + "config = get_system(dim=len(domain_size))\n", + "\n", + "phases = config['Parameters']['phases']\n", + "components = config['Parameters']['components']\n", + "diffusion_matrices = config['Parameters']['diffusion']\n", + "free_energy = config['FreeEnergy']\n", + "\n", + "# Adding fields\n", + "dh = create_data_handling(domain_size, periodicity=periodicity, default_target='gpu')\n", + "f = dh.fields\n", + "phi_src = dh.add_array('phi_src', values_per_cell=phases, layout='fzyx', latex_name='phi_s')\n", + "mu_src = dh.add_array('mu_src', values_per_cell=components, layout='fzyx', latex_name=\"mu_s\")\n", + "mu_stag = dh.add_array('mu_stag', values_per_cell=(dh.dim, components), layout='f')\n", + "phi_dst = dh.add_array_like('phi_dst', 'phi_src')\n", + "mu_dst = dh.add_array_like('mu_dst', 'mu_src')\n", + "\n", + "c = dh.add_array('c', values_per_cell=components, layout='fzyx', gpu=False)\n", + "f = dh.fields\n", + "\n", + "\n", + "phi_update_eqs = create_phi_update_equations(\n", + " phi_src, phi_dst, mu_src, free_energy, config['Parameters'], simplex_projection=fast_simplex_projection)\n", + "\n", + "mu_update_eqs = create_mu_update_equations(phi_src, phi_dst, mu_src, mu_dst, free_energy, diffusion_matrices,\n", + " config['Parameters'])\n", + "\n", + "mu_stag_update_eqs = create_mu_update_equations(\n", + " 
phi_src,\n", + " phi_dst,\n", + " mu_src,\n", + " mu_dst,\n", + " free_energy,\n", + " diffusion_matrices,\n", + " config['Parameters'],\n", + " mu_staggered_field=mu_stag)\n", + "\n", + "\n", + "mu_stag_precomp_eqs = create_mu_update_staggered_eqs(\n", + " phi_src,\n", + " phi_dst,\n", + " mu_src,\n", + " mu_stag,\n", + " free_energy,\n", + " diffusion_matrices,\n", + " config['Parameters'])\n", + "\n", + " \n", + "mu_stag_precomp_kernel = create_mu_update_staggered_ast(\n", + " phi_src,\n", + " phi_dst,\n", + " mu_src,\n", + " mu_stag,\n", + " free_energy,\n", + " diffusion_matrices,\n", + " config['Parameters'],\n", + " target='gpu')\n", + "\n", + "\n", + "\n", + "#phi_eqs = create_phi_update_equations(\n", + "# f['phi_src'], f['phi_dst'], f['mu_src'], free_energy, config['Parameters'], simplex_projection=True)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "243\n", + "xi_0 ↠phi_src_C^0**2\n", + "xi_1 ↠phi_src_C^1**2\n", + "xi_2 ↠phi_src_C^2**2\n", + "xi_3 ↠xi_0 + xi_1 + xi_2\n", + "xi_4 ↠phi_dst_C^1**2\n", + "xi_5 ↠phi_dst_C^0**2\n", + "xi_6 ↠phi_dst_C^2**2\n", + "xi_7 ↠32.0/(xi_4 + xi_5 + xi_6)\n", + "xi_8 ↠32.0/xi_3\n", + "xi_9 ↠2.0*mu_src_C\n", + "xi_10 ↠phi_src_C^0/2\n", + "xi_11 ↠phi_src_W^0/2 + xi_10\n", + "xi_12 ↠xi_11**2\n", + "xi_13 ↠phi_src_C^1/2\n", + "xi_14 ↠phi_src_W^1/2 + xi_13\n", + "xi_15 ↠xi_14**2\n", + "xi_16 ↠phi_src_C^2/2\n", + "xi_17 ↠phi_src_W^2/2 + xi_16\n", + "xi_18 ↠xi_17**2\n", + "xi_19 ↠1/(xi_12 + xi_15 + xi_18)\n", + "xi_20 ↠xi_12*xi_19\n", + "xi_21 ↠xi_15*xi_19\n", + "xi_22 ↠xi_18*xi_19\n", + "xi_23 ↠2.0*phi_src_C^2\n", + "xi_24 ↠-2.0*phi_src_W^2 + xi_23\n", + "xi_25 ↠sqrt(xi_11*xi_17)\n", + "xi_26 ↠2.0*phi_src_C^0\n", + "xi_27 ↠-2.0*phi_src_W^0 + xi_26\n", + "xi_28 ↠0.5*phi_src_NW^0\n", + "xi_29 ↠-0.5*phi_src_SW^0\n", + "xi_30 ↠-0.5*phi_src_S^0 + 0.5*phi_src_N^0\n", + "xi_31 ↠xi_28 + xi_29 + xi_30\n", + "xi_32 
↠0.5*phi_src_TW^0\n", + "xi_33 ↠-0.5*phi_src_BW^0\n", + "xi_34 ↠-0.5*phi_src_B^0 + 0.5*phi_src_T^0\n", + "xi_35 ↠xi_32 + xi_33 + xi_34\n", + "xi_36 ↠sqrt(xi_27**2 + xi_31**2 + xi_35**2)\n", + "xi_37 ↠0.5*phi_src_NW^2\n", + "xi_38 ↠-0.5*phi_src_SW^2\n", + "xi_39 ↠-0.5*phi_src_S^2 + 0.5*phi_src_N^2\n", + "xi_40 ↠xi_37 + xi_38 + xi_39\n", + "xi_41 ↠0.5*phi_src_TW^2\n", + "xi_42 ↠-0.5*phi_src_BW^2\n", + "xi_43 ↠-0.5*phi_src_B^2 + 0.5*phi_src_T^2\n", + "xi_44 ↠xi_41 + xi_42 + xi_43\n", + "xi_45 ↠xi_24**2 + xi_40**2 + xi_44**2\n", + "xi_46 ↠sqrt(xi_45)\n", + "xi_47 ↠0.0726237719428938*mu_src_C + 7.27037126746791\n", + "xi_48 ↠xi_22*(0.0726237719428938*mu_src_W + xi_47)\n", + "xi_49 ↠0.0138651175277251*mu_src_C + 1.37922908109611\n", + "xi_50 ↠16.0*phi_dst_C^0 - 16.0*phi_src_C^0\n", + "xi_51 ↠xi_22/xi_45\n", + "xi_52 ↠sqrt(xi_14*xi_17)\n", + "xi_53 ↠2.0*phi_src_C^1\n", + "xi_54 ↠-2.0*phi_src_W^1 + xi_53\n", + "xi_55 ↠0.5*phi_src_NW^1\n", + "xi_56 ↠-0.5*phi_src_SW^1\n", + "xi_57 ↠-0.5*phi_src_S^1 + 0.5*phi_src_N^1\n", + "xi_58 ↠xi_55 + xi_56 + xi_57\n", + "xi_59 ↠0.5*phi_src_TW^1\n", + "xi_60 ↠-0.5*phi_src_BW^1\n", + "xi_61 ↠-0.5*phi_src_B^1 + 0.5*phi_src_T^1\n", + "xi_62 ↠xi_59 + xi_60 + xi_61\n", + "xi_63 ↠sqrt(xi_54**2 + xi_58**2 + xi_62**2)\n", + "xi_64 ↠0.00672506491627283*mu_src_C + 0.974209571226215\n", + "xi_65 ↠16.0*phi_dst_C^1 - 16.0*phi_src_C^1\n", + "xi_66 ↠-xi_9\n", + "xi_67 ↠phi_src_E^0/2 + xi_10\n", + "xi_68 ↠xi_67**2\n", + "xi_69 ↠phi_src_E^1/2 + xi_13\n", + "xi_70 ↠xi_69**2\n", + "xi_71 ↠phi_src_E^2/2 + xi_16\n", + "xi_72 ↠xi_71**2\n", + "xi_73 ↠1/(xi_68 + xi_70 + xi_72)\n", + "xi_74 ↠xi_68*xi_73\n", + "xi_75 ↠xi_70*xi_73\n", + "xi_76 ↠xi_72*xi_73\n", + "xi_77 ↠-xi_23\n", + "xi_78 ↠2.0*phi_src_E^2 + xi_77\n", + "xi_79 ↠sqrt(xi_67*xi_71)\n", + "xi_80 ↠-xi_26\n", + "xi_81 ↠2.0*phi_src_E^0 + xi_80\n", + "xi_82 ↠0.5*phi_src_NE^0\n", + "xi_83 ↠0.5*phi_src_SE^0\n", + "xi_84 ↠xi_30 + xi_82 - xi_83\n", + "xi_85 ↠0.5*phi_src_TE^0\n", + "xi_86 ↠0.5*phi_src_BE^0\n", 
+ "xi_87 ↠xi_34 + xi_85 - xi_86\n", + "xi_88 ↠sqrt(xi_81**2 + xi_84**2 + xi_87**2)\n", + "xi_89 ↠0.5*phi_src_NE^2\n", + "xi_90 ↠0.5*phi_src_SE^2\n", + "xi_91 ↠xi_39 + xi_89 - xi_90\n", + "xi_92 ↠0.5*phi_src_TE^2\n", + "xi_93 ↠0.5*phi_src_BE^2\n", + "xi_94 ↠xi_43 + xi_92 - xi_93\n", + "xi_95 ↠xi_78**2 + xi_91**2 + xi_94**2\n", + "xi_96 ↠sqrt(xi_95)\n", + "xi_97 ↠xi_76*(0.0726237719428938*mu_src_E + xi_47)\n", + "xi_98 ↠xi_76/xi_95\n", + "xi_99 ↠sqrt(xi_69*xi_71)\n", + "xi_100 ↠-xi_53\n", + "xi_101 ↠2.0*phi_src_E^1 + xi_100\n", + "xi_102 ↠0.5*phi_src_NE^1\n", + "xi_103 ↠0.5*phi_src_SE^1\n", + "xi_104 ↠xi_102 - xi_103 + xi_57\n", + "xi_105 ↠0.5*phi_src_TE^1\n", + "xi_106 ↠0.5*phi_src_BE^1\n", + "xi_107 ↠xi_105 - xi_106 + xi_61\n", + "xi_108 ↠sqrt(xi_101**2 + xi_104**2 + xi_107**2)\n", + "xi_109 ↠phi_src_S^0/2 + xi_10\n", + "xi_110 ↠xi_109**2\n", + "xi_111 ↠phi_src_S^1/2 + xi_13\n", + "xi_112 ↠xi_111**2\n", + "xi_113 ↠phi_src_S^2/2 + xi_16\n", + "xi_114 ↠xi_113**2\n", + "xi_115 ↠1/(xi_110 + xi_112 + xi_114)\n", + "xi_116 ↠xi_110*xi_115\n", + "xi_117 ↠xi_112*xi_115\n", + "xi_118 ↠xi_114*xi_115\n", + "xi_119 ↠-2.0*phi_src_S^2 + xi_23\n", + "xi_120 ↠sqrt(xi_109*xi_113)\n", + "xi_121 ↠-2.0*phi_src_S^0 + xi_26\n", + "xi_122 ↠-0.5*phi_src_W^0 + 0.5*phi_src_E^0\n", + "xi_123 ↠xi_122 + xi_29 + xi_83\n", + "xi_124 ↠0.5*phi_src_TS^0\n", + "xi_125 ↠-0.5*phi_src_BS^0\n", + "xi_126 ↠xi_124 + xi_125 + xi_34\n", + "xi_127 ↠sqrt(xi_121**2 + xi_123**2 + xi_126**2)\n", + "xi_128 ↠-0.5*phi_src_W^2 + 0.5*phi_src_E^2\n", + "xi_129 ↠xi_128 + xi_38 + xi_90\n", + "xi_130 ↠0.5*phi_src_TS^2\n", + "xi_131 ↠-0.5*phi_src_BS^2\n", + "xi_132 ↠xi_130 + xi_131 + xi_43\n", + "xi_133 ↠xi_119**2 + xi_129**2 + xi_132**2\n", + "xi_134 ↠sqrt(xi_133)\n", + "xi_135 ↠xi_118*(0.0726237719428938*mu_src_S + xi_47)\n", + "xi_136 ↠xi_118/xi_133\n", + "xi_137 ↠sqrt(xi_111*xi_113)\n", + "xi_138 ↠-2.0*phi_src_S^1 + xi_53\n", + "xi_139 ↠-0.5*phi_src_W^1 + 0.5*phi_src_E^1\n", + "xi_140 ↠xi_103 + xi_139 + xi_56\n", + 
"xi_141 ↠0.5*phi_src_TS^1\n", + "xi_142 ↠-0.5*phi_src_BS^1\n", + "xi_143 ↠xi_141 + xi_142 + xi_61\n", + "xi_144 ↠sqrt(xi_138**2 + xi_140**2 + xi_143**2)\n", + "xi_145 ↠phi_src_N^0/2 + xi_10\n", + "xi_146 ↠xi_145**2\n", + "xi_147 ↠phi_src_N^1/2 + xi_13\n", + "xi_148 ↠xi_147**2\n", + "xi_149 ↠phi_src_N^2/2 + xi_16\n", + "xi_150 ↠xi_149**2\n", + "xi_151 ↠1/(xi_146 + xi_148 + xi_150)\n", + "xi_152 ↠xi_146*xi_151\n", + "xi_153 ↠xi_148*xi_151\n", + "xi_154 ↠xi_150*xi_151\n", + "xi_155 ↠2.0*phi_src_N^2 + xi_77\n", + "xi_156 ↠sqrt(xi_145*xi_149)\n", + "xi_157 ↠2.0*phi_src_N^0 + xi_80\n", + "xi_158 ↠xi_122 - xi_28 + xi_82\n", + "xi_159 ↠0.5*phi_src_TN^0\n", + "xi_160 ↠0.5*phi_src_BN^0\n", + "xi_161 ↠xi_159 - xi_160 + xi_34\n", + "xi_162 ↠sqrt(xi_157**2 + xi_158**2 + xi_161**2)\n", + "xi_163 ↠xi_128 - xi_37 + xi_89\n", + "xi_164 ↠0.5*phi_src_TN^2\n", + "xi_165 ↠0.5*phi_src_BN^2\n", + "xi_166 ↠xi_164 - xi_165 + xi_43\n", + "xi_167 ↠xi_155**2 + xi_163**2 + xi_166**2\n", + "xi_168 ↠sqrt(xi_167)\n", + "xi_169 ↠xi_154*(0.0726237719428938*mu_src_N + xi_47)\n", + "xi_170 ↠xi_154/xi_167\n", + "xi_171 ↠sqrt(xi_147*xi_149)\n", + "xi_172 ↠2.0*phi_src_N^1 + xi_100\n", + "xi_173 ↠xi_102 + xi_139 - xi_55\n", + "xi_174 ↠0.5*phi_src_TN^1\n", + "xi_175 ↠0.5*phi_src_BN^1\n", + "xi_176 ↠xi_174 - xi_175 + xi_61\n", + "xi_177 ↠sqrt(xi_172**2 + xi_173**2 + xi_176**2)\n", + "xi_178 ↠phi_src_B^0/2 + xi_10\n", + "xi_179 ↠xi_178**2\n", + "xi_180 ↠phi_src_B^1/2 + xi_13\n", + "xi_181 ↠xi_180**2\n", + "xi_182 ↠phi_src_B^2/2 + xi_16\n", + "xi_183 ↠xi_182**2\n", + "xi_184 ↠1/(xi_179 + xi_181 + xi_183)\n", + "xi_185 ↠xi_179*xi_184\n", + "xi_186 ↠xi_181*xi_184\n", + "xi_187 ↠xi_183*xi_184\n", + "xi_188 ↠-2.0*phi_src_B^2 + xi_23\n", + "xi_189 ↠sqrt(xi_178*xi_182)\n", + "xi_190 ↠-2.0*phi_src_B^0 + xi_26\n", + "xi_191 ↠xi_122 + xi_33 + xi_86\n", + "xi_192 ↠xi_125 + xi_160 + xi_30\n", + "xi_193 ↠sqrt(xi_190**2 + xi_191**2 + xi_192**2)\n", + "xi_194 ↠xi_128 + xi_42 + xi_93\n", + "xi_195 ↠xi_131 + xi_165 + 
xi_39\n", + "xi_196 ↠xi_188**2 + xi_194**2 + xi_195**2\n", + "xi_197 ↠sqrt(xi_196)\n", + "xi_198 ↠xi_187*(0.0726237719428938*mu_src_B + xi_47)\n", + "xi_199 ↠xi_187/xi_196\n", + "xi_200 ↠sqrt(xi_180*xi_182)\n", + "xi_201 ↠-2.0*phi_src_B^1 + xi_53\n", + "xi_202 ↠xi_106 + xi_139 + xi_60\n", + "xi_203 ↠xi_142 + xi_175 + xi_57\n", + "xi_204 ↠sqrt(xi_201**2 + xi_202**2 + xi_203**2)\n", + "xi_205 ↠phi_src_T^0/2 + xi_10\n", + "xi_206 ↠xi_205**2\n", + "xi_207 ↠phi_src_T^1/2 + xi_13\n", + "xi_208 ↠xi_207**2\n", + "xi_209 ↠phi_src_T^2/2 + xi_16\n", + "xi_210 ↠xi_209**2\n", + "xi_211 ↠1/(xi_206 + xi_208 + xi_210)\n", + "xi_212 ↠xi_206*xi_211\n", + "xi_213 ↠xi_208*xi_211\n", + "xi_214 ↠xi_210*xi_211\n", + "xi_215 ↠2.0*phi_src_T^2 + xi_77\n", + "xi_216 ↠sqrt(xi_205*xi_209)\n", + "xi_217 ↠2.0*phi_src_T^0 + xi_80\n", + "xi_218 ↠xi_122 - xi_32 + xi_85\n", + "xi_219 ↠-xi_124 + xi_159 + xi_30\n", + "xi_220 ↠sqrt(xi_217**2 + xi_218**2 + xi_219**2)\n", + "xi_221 ↠xi_128 - xi_41 + xi_92\n", + "xi_222 ↠-xi_130 + xi_164 + xi_39\n", + "xi_223 ↠xi_215**2 + xi_221**2 + xi_222**2\n", + "xi_224 ↠sqrt(xi_223)\n", + "xi_225 ↠xi_214*(0.0726237719428938*mu_src_T + xi_47)\n", + "xi_226 ↠xi_214/xi_223\n", + "xi_227 ↠sqrt(xi_207*xi_209)\n", + "xi_228 ↠2.0*phi_src_T^1 + xi_100\n", + "xi_229 ↠xi_105 + xi_139 - xi_59\n", + "xi_230 ↠-xi_141 + xi_174 + xi_57\n", + "xi_231 ↠sqrt(xi_228**2 + xi_229**2 + xi_230**2)\n", + "dc_dmu_0_0 ↠xi_3/(0.0277302350554502*xi_0 + 0.0134501298325457*xi_1 + 0.145247543885788*xi_2)\n", + "dc_dphi_dt_0 ↠(0.0134501298325457*mu_src_C + 0.974209571226215)*(-xi_1*xi_8 + xi_4*xi_7) + (0.0277302350554502*mu_src_C + 1.37922908109611)*(-xi_0*xi_8 + xi_5*xi_7) + (0.145247543885788*mu_src_C + 7.27037126746791)*(-xi_2*xi_8 + xi_6*xi_7)\n", + "dc_dT_dt_0 ↠0\n", + "staggered_down_0_0 ↠-xi_24*(3.92699081698724*Piecewise((0, (xi_25 < 1.0e-9) | (xi_36*xi_46 < 1.0e-9)), (xi_11*xi_51*(-xi_20*(0.0138651175277251*mu_src_W + xi_49) + xi_48)*(16.0*phi_dst_W^0 - 16.0*phi_src_W^0 + 
xi_50)*(xi_24*xi_27 + xi_31*xi_40 + xi_35*xi_44)/(xi_25*xi_36), True)) + 3.92699081698724*Piecewise((0, (xi_52 < 1.0e-9) | (xi_46*xi_63 < 1.0e-9)), (xi_14*xi_51*(-xi_21*(0.00672506491627283*mu_src_W + xi_64) + xi_48)*(16.0*phi_dst_W^1 - 16.0*phi_src_W^1 + xi_65)*(xi_24*xi_54 + xi_40*xi_58 + xi_44*xi_62)/(xi_52*xi_63), True))) + (-2.0*mu_src_W + xi_9)*(2.77302350554502e-5*xi_20 + 1.34501298325457e-5*xi_21 + 0.145247543885788*xi_22)\n", + "staggered_up_0_0 ↠-xi_78*(3.92699081698724*Piecewise((0, (xi_79 < 1.0e-9) | (xi_88*xi_96 < 1.0e-9)), (xi_67*xi_98*(-xi_74*(0.0138651175277251*mu_src_E + xi_49) + xi_97)*(16.0*phi_dst_E^0 - 16.0*phi_src_E^0 + xi_50)*(xi_78*xi_81 + xi_84*xi_91 + xi_87*xi_94)/(xi_79*xi_88), True)) + 3.92699081698724*Piecewise((0, (xi_99 < 1.0e-9) | (xi_108*xi_96 < 1.0e-9)), (xi_69*xi_98*(-xi_75*(0.00672506491627283*mu_src_E + xi_64) + xi_97)*(16.0*phi_dst_E^1 - 16.0*phi_src_E^1 + xi_65)*(xi_101*xi_78 + xi_104*xi_91 + xi_107*xi_94)/(xi_108*xi_99), True))) + (2.0*mu_src_E + xi_66)*(2.77302350554502e-5*xi_74 + 1.34501298325457e-5*xi_75 + 0.145247543885788*xi_76)\n", + "staggered_down_1_0 ↠-xi_119*(3.92699081698724*Piecewise((0, (xi_120 < 1.0e-9) | (xi_127*xi_134 < 1.0e-9)), (xi_109*xi_136*(-xi_116*(0.0138651175277251*mu_src_S + xi_49) + xi_135)*(16.0*phi_dst_S^0 - 16.0*phi_src_S^0 + xi_50)*(xi_119*xi_121 + xi_123*xi_129 + xi_126*xi_132)/(xi_120*xi_127), True)) + 3.92699081698724*Piecewise((0, (xi_137 < 1.0e-9) | (xi_134*xi_144 < 1.0e-9)), (xi_111*xi_136*(-xi_117*(0.00672506491627283*mu_src_S + xi_64) + xi_135)*(16.0*phi_dst_S^1 - 16.0*phi_src_S^1 + xi_65)*(xi_119*xi_138 + xi_129*xi_140 + xi_132*xi_143)/(xi_137*xi_144), True))) + (-2.0*mu_src_S + xi_9)*(2.77302350554502e-5*xi_116 + 1.34501298325457e-5*xi_117 + 0.145247543885788*xi_118)\n", + "staggered_up_1_0 ↠-xi_155*(3.92699081698724*Piecewise((0, (xi_156 < 1.0e-9) | (xi_162*xi_168 < 1.0e-9)), (xi_145*xi_170*(-xi_152*(0.0138651175277251*mu_src_N + xi_49) + xi_169)*(16.0*phi_dst_N^0 - 16.0*phi_src_N^0 + 
xi_50)*(xi_155*xi_157 + xi_158*xi_163 + xi_161*xi_166)/(xi_156*xi_162), True)) + 3.92699081698724*Piecewise((0, (xi_171 < 1.0e-9) | (xi_168*xi_177 < 1.0e-9)), (xi_147*xi_170*(-xi_153*(0.00672506491627283*mu_src_N + xi_64) + xi_169)*(16.0*phi_dst_N^1 - 16.0*phi_src_N^1 + xi_65)*(xi_155*xi_172 + xi_163*xi_173 + xi_166*xi_176)/(xi_171*xi_177), True))) + (2.0*mu_src_N + xi_66)*(2.77302350554502e-5*xi_152 + 1.34501298325457e-5*xi_153 + 0.145247543885788*xi_154)\n", + "staggered_down_2_0 ↠-xi_188*(3.92699081698724*Piecewise((0, (xi_189 < 1.0e-9) | (xi_193*xi_197 < 1.0e-9)), (xi_178*xi_199*(-xi_185*(0.0138651175277251*mu_src_B + xi_49) + xi_198)*(16.0*phi_dst_B^0 - 16.0*phi_src_B^0 + xi_50)*(xi_188*xi_190 + xi_191*xi_194 + xi_192*xi_195)/(xi_189*xi_193), True)) + 3.92699081698724*Piecewise((0, (xi_200 < 1.0e-9) | (xi_197*xi_204 < 1.0e-9)), (xi_180*xi_199*(-xi_186*(0.00672506491627283*mu_src_B + xi_64) + xi_198)*(16.0*phi_dst_B^1 - 16.0*phi_src_B^1 + xi_65)*(xi_188*xi_201 + xi_194*xi_202 + xi_195*xi_203)/(xi_200*xi_204), True))) + (-2.0*mu_src_B + xi_9)*(2.77302350554502e-5*xi_185 + 1.34501298325457e-5*xi_186 + 0.145247543885788*xi_187)\n", + "staggered_up_2_0 ↠-xi_215*(3.92699081698724*Piecewise((0, (xi_216 < 1.0e-9) | (xi_220*xi_224 < 1.0e-9)), (xi_205*xi_226*(-xi_212*(0.0138651175277251*mu_src_T + xi_49) + xi_225)*(16.0*phi_dst_T^0 - 16.0*phi_src_T^0 + xi_50)*(xi_215*xi_217 + xi_218*xi_221 + xi_219*xi_222)/(xi_216*xi_220), True)) + 3.92699081698724*Piecewise((0, (xi_227 < 1.0e-9) | (xi_224*xi_231 < 1.0e-9)), (xi_207*xi_226*(-xi_213*(0.00672506491627283*mu_src_T + xi_64) + xi_225)*(16.0*phi_dst_T^1 - 16.0*phi_src_T^1 + xi_65)*(xi_215*xi_228 + xi_221*xi_229 + xi_222*xi_230)/(xi_227*xi_231), True))) + (2.0*mu_src_T + xi_66)*(2.77302350554502e-5*xi_212 + 1.34501298325457e-5*xi_213 + 0.145247543885788*xi_214)\n", + "divMgradmu_0 ↠-2.0*staggered_down_0_0 - 2.0*staggered_down_1_0 - 2.0*staggered_down_2_0 + 2.0*staggered_up_0_0 + 2.0*staggered_up_1_0 + 2.0*staggered_up_2_0\n", 
+ "mu_dst[0,0,0] ↠mu_src_C + 0.03125*dc_dmu_0_0*(-dc_dT_dt_0 - dc_dphi_dt_0 + divMgradmu_0)\n" + ] + } + ], + "source": [ + "stag_eqs = []\n", + "\n", + "\n", + "for block in mu_stag_precomp_eqs:\n", + " for eq in block.true_block._nodes:\n", + " stag_eqs.append(eq)\n", + "\n", + "print(len(mu_update_eqs))\n", + "for eq in mu_update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "163\n", + "201\n", + "435\n", + "69\n" + ] + } + ], + "source": [ + "fas = 0\n", + "adds = 0\n", + "muls = 0\n", + "mufus = 0\n", + "\n", + "def count_ops(expr):\n", + " global fas\n", + " global adds\n", + " global muls\n", + " global mufus\n", + " for arg in expr.args:\n", + " count_ops(arg)\n", + " \n", + " if isinstance(expr, Field.Access):\n", + " fas += 1\n", + " elif isinstance(expr, sympy.Add):\n", + " adds += 1\n", + " elif isinstance(expr, sympy.Mul):\n", + " muls += 1\n", + " elif isinstance(expr, sympy.Pow) and expr.exp == -1 :\n", + " mufus += 1\n", + " elif isinstance(expr, sympy.Pow) and expr.exp == 0.5:\n", + " mufus += 1\n", + " elif isinstance(expr, sympy.Pow):\n", + " muls += 1\n", + " \n", + "for eq in mu_update_eqs:\n", + " count_ops(eq)\n", + " \n", + "print(fas)\n", + "print(adds)\n", + "print(muls)\n", + "print(mufus)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FUNC_PREFIX void kernel(double * const _data_mu_src, double * _data_mu_stag, double * const _data_phi_dst, double * const _data_phi_src)\n", + "{\n", + " if (blockDim.x*blockIdx.x + threadIdx.x + 1 < 130 && blockDim.y*blockIdx.y + threadIdx.y + 1 < 130 && blockDim.z*blockIdx.z + threadIdx.z + 1 < 130)\n", + " {\n", + " int64_t ctr_0 = blockDim.x*blockIdx.x + threadIdx.x + 1;\n", + " int64_t ctr_1 = blockDim.y*blockIdx.y + threadIdx.y + 1;\n", + " 
int64_t ctr_2 = blockDim.z*blockIdx.z + threadIdx.z + 1;\n", + " if (ctr_1 < 129 && ctr_2 < 129)\n", + " {\n", + " double * const _data_mu_src_10_20 = _data_mu_src + 130*ctr_1 + 16900*ctr_2;\n", + " double x0 = 2.0*_data_mu_src_10_20[ctr_0];\n", + " double * const _data_phi_src_10_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2;\n", + " double x1 = 0.5*_data_phi_src_10_20_30[ctr_0];\n", + " double x2 = x1 + 0.5*_data_phi_src_10_20_30[ctr_0 - 1];\n", + " double x3 = (x2*x2);\n", + " double * const _data_phi_src_10_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2197000;\n", + " double x4 = 0.5*_data_phi_src_10_20_31[ctr_0];\n", + " double x5 = x4 + 0.5*_data_phi_src_10_20_31[ctr_0 - 1];\n", + " double x6 = (x5*x5);\n", + " double * const _data_phi_src_10_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4394000;\n", + " double x7 = 0.5*_data_phi_src_10_20_32[ctr_0];\n", + " double x8 = x7 + 0.5*_data_phi_src_10_20_32[ctr_0 - 1];\n", + " double x9 = (x8*x8);\n", + " double x10 = (float) __frcp_rn( (float) x3 + x6 + x9);\n", + " double x11 = x10*x3;\n", + " double x12 = x10*x6;\n", + " double x13 = x10*x9;\n", + " double x14 = 2.0*_data_phi_src_10_20_32[ctr_0];\n", + " double x15 = x14 - 2.0*_data_phi_src_10_20_32[ctr_0 - 1];\n", + " double x16 = (float) sqrtf((float) x2*x8);\n", + " double x17 = 2.0*_data_phi_src_10_20_30[ctr_0];\n", + " double x18 = x17 - 2.0*_data_phi_src_10_20_30[ctr_0 - 1];\n", + " double * const _data_phi_src_1m1_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 130;\n", + " double x19 = -0.5*_data_phi_src_1m1_20_30[ctr_0 - 1];\n", + " double * const _data_phi_src_11_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 130;\n", + " double x20 = -0.5*_data_phi_src_1m1_20_30[ctr_0] + 0.5*_data_phi_src_11_20_30[ctr_0];\n", + " double x21 = x19 + x20 + 0.5*_data_phi_src_11_20_30[ctr_0 - 1];\n", + " double * const _data_phi_src_10_2m1_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 16900;\n", + " double x22 = -0.5*_data_phi_src_10_2m1_30[ctr_0 - 
1];\n", + " double * const _data_phi_src_10_21_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 16900;\n", + " double x23 = -0.5*_data_phi_src_10_2m1_30[ctr_0] + 0.5*_data_phi_src_10_21_30[ctr_0];\n", + " double x24 = x22 + x23 + 0.5*_data_phi_src_10_21_30[ctr_0 - 1];\n", + " double x25 = (float) sqrtf((float) (x18*x18) + (x21*x21) + (x24*x24));\n", + " double * const _data_phi_src_1m1_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4393870;\n", + " double x26 = -0.5*_data_phi_src_1m1_20_32[ctr_0 - 1];\n", + " double * const _data_phi_src_11_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4394130;\n", + " double x27 = -0.5*_data_phi_src_1m1_20_32[ctr_0] + 0.5*_data_phi_src_11_20_32[ctr_0];\n", + " double x28 = x26 + x27 + 0.5*_data_phi_src_11_20_32[ctr_0 - 1];\n", + " double * const _data_phi_src_10_2m1_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4377100;\n", + " double x29 = -0.5*_data_phi_src_10_2m1_32[ctr_0 - 1];\n", + " double * const _data_phi_src_10_21_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4410900;\n", + " double x30 = -0.5*_data_phi_src_10_2m1_32[ctr_0] + 0.5*_data_phi_src_10_21_32[ctr_0];\n", + " double x31 = x29 + x30 + 0.5*_data_phi_src_10_21_32[ctr_0 - 1];\n", + " double x32 = (x15*x15) + (x28*x28) + (x31*x31);\n", + " double x33 = (float) sqrtf((float) x32);\n", + " double x34 = 0.07262377194289385*_data_mu_src_10_20[ctr_0] + 7.2703712674679055;\n", + " double x35 = x13*x34 + 0.07262377194289385*_data_mu_src_10_20[ctr_0 - 1];\n", + " double x36 = 0.013865117527725106*_data_mu_src_10_20[ctr_0] + 1.3792290810961054;\n", + " double * const _data_phi_dst_10_20_30 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2;\n", + " double x37 = -16.0*_data_phi_src_10_20_30[ctr_0] + 16.0*_data_phi_dst_10_20_30[ctr_0];\n", + " double x38 = fdividef(x13, x32);\n", + " double x39 = (float) sqrtf((float) x5*x8);\n", + " double x40 = 2.0*_data_phi_src_10_20_31[ctr_0];\n", + " double x41 = x40 - 2.0*_data_phi_src_10_20_31[ctr_0 - 1];\n", + " double * const 
_data_phi_src_1m1_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2196870;\n", + " double x42 = -0.5*_data_phi_src_1m1_20_31[ctr_0 - 1];\n", + " double * const _data_phi_src_11_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2197130;\n", + " double x43 = -0.5*_data_phi_src_1m1_20_31[ctr_0] + 0.5*_data_phi_src_11_20_31[ctr_0];\n", + " double x44 = x42 + x43 + 0.5*_data_phi_src_11_20_31[ctr_0 - 1];\n", + " double * const _data_phi_src_10_2m1_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2180100;\n", + " double x45 = -0.5*_data_phi_src_10_2m1_31[ctr_0 - 1];\n", + " double * const _data_phi_src_10_21_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2213900;\n", + " double x46 = -0.5*_data_phi_src_10_2m1_31[ctr_0] + 0.5*_data_phi_src_10_21_31[ctr_0];\n", + " double x47 = x45 + x46 + 0.5*_data_phi_src_10_21_31[ctr_0 - 1];\n", + " double x48 = (float) sqrtf((float) (x41*x41) + (x44*x44) + (x47*x47));\n", + " double x49 = 0.0067250649162728321*_data_mu_src_10_20[ctr_0] + 0.97420957122621499;\n", + " double * const _data_phi_dst_10_20_31 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 + 2197000;\n", + " double x50 = -16.0*_data_phi_src_10_20_31[ctr_0] + 16.0*_data_phi_dst_10_20_31[ctr_0];\n", + " double * _data_mu_stag_10_20_30_40 = _data_mu_stag + 130*ctr_1 + 16900*ctr_2;\n", + " _data_mu_stag_10_20_30_40[ctr_0] = -x15*((x16 < 1.0000000000000001e-9 || x25*x33 < 1.0000000000000001e-9) ? (0): (fdividef(x2*x38*-x11*x36 + 0.013865117527725106*_data_mu_src_10_20[ctr_0 - 1] + x35*x37 - 16.0*_data_phi_src_10_20_30[ctr_0 - 1] + 16.0*_data_phi_dst_10_20_30[ctr_0 - 1]*x15*x18 + x21*x28 + x24*x31, x16*x25)))*3.9269908169872414 + ((x39 < 1.0000000000000001e-9 || x33*x48 < 1.0000000000000001e-9) ? 
(0): (fdividef(x38*x5*-x12*x49 + 0.0067250649162728321*_data_mu_src_10_20[ctr_0 - 1] + x35*x50 - 16.0*_data_phi_src_10_20_31[ctr_0 - 1] + 16.0*_data_phi_dst_10_20_31[ctr_0 - 1]*x15*x41 + x28*x44 + x31*x47, x39*x48)))*3.9269908169872414 + x0 - 2.0*_data_mu_src_10_20[ctr_0 - 1]*x11*2.7730235055450212e-5 + x12*1.3450129832545665e-5 + x13*0.1452475438857877;\n", + " } \n", + " if (ctr_0 < 129 && ctr_2 < 129)\n", + " {\n", + " double * const _data_mu_src_10_20 = _data_mu_src + 130*ctr_1 + 16900*ctr_2;\n", + " double x0 = 2.0*_data_mu_src_10_20[ctr_0];\n", + " double * const _data_phi_src_10_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2;\n", + " double x1 = 0.5*_data_phi_src_10_20_30[ctr_0];\n", + " double * const _data_phi_src_10_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2197000;\n", + " double x4 = 0.5*_data_phi_src_10_20_31[ctr_0];\n", + " double * const _data_phi_src_10_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4394000;\n", + " double x7 = 0.5*_data_phi_src_10_20_32[ctr_0];\n", + " double x14 = 2.0*_data_phi_src_10_20_32[ctr_0];\n", + " double x17 = 2.0*_data_phi_src_10_20_30[ctr_0];\n", + " double * const _data_phi_src_1m1_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 130;\n", + " double x19 = -0.5*_data_phi_src_1m1_20_30[ctr_0 - 1];\n", + " double * const _data_phi_src_10_2m1_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 16900;\n", + " double * const _data_phi_src_10_21_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 16900;\n", + " double x23 = -0.5*_data_phi_src_10_2m1_30[ctr_0] + 0.5*_data_phi_src_10_21_30[ctr_0];\n", + " double * const _data_phi_src_1m1_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4393870;\n", + " double x26 = -0.5*_data_phi_src_1m1_20_32[ctr_0 - 1];\n", + " double * const _data_phi_src_10_2m1_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4377100;\n", + " double * const _data_phi_src_10_21_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4410900;\n", + " double x30 = -0.5*_data_phi_src_10_2m1_32[ctr_0] + 
0.5*_data_phi_src_10_21_32[ctr_0];\n", + " double x34 = 0.07262377194289385*_data_mu_src_10_20[ctr_0] + 7.2703712674679055;\n", + " double x36 = 0.013865117527725106*_data_mu_src_10_20[ctr_0] + 1.3792290810961054;\n", + " double * const _data_phi_dst_10_20_30 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2;\n", + " double x37 = -16.0*_data_phi_src_10_20_30[ctr_0] + 16.0*_data_phi_dst_10_20_30[ctr_0];\n", + " double x40 = 2.0*_data_phi_src_10_20_31[ctr_0];\n", + " double * const _data_phi_src_1m1_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2196870;\n", + " double x42 = -0.5*_data_phi_src_1m1_20_31[ctr_0 - 1];\n", + " double * const _data_phi_src_10_2m1_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2180100;\n", + " double * const _data_phi_src_10_21_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2213900;\n", + " double x46 = -0.5*_data_phi_src_10_2m1_31[ctr_0] + 0.5*_data_phi_src_10_21_31[ctr_0];\n", + " double x49 = 0.0067250649162728321*_data_mu_src_10_20[ctr_0] + 0.97420957122621499;\n", + " double * const _data_phi_dst_10_20_31 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 + 2197000;\n", + " double x50 = -16.0*_data_phi_src_10_20_31[ctr_0] + 16.0*_data_phi_dst_10_20_31[ctr_0];\n", + " double x51 = x1 + 0.5*_data_phi_src_1m1_20_30[ctr_0];\n", + " double x52 = (x51*x51);\n", + " double x53 = x4 + 0.5*_data_phi_src_1m1_20_31[ctr_0];\n", + " double x54 = (x53*x53);\n", + " double x55 = x7 + 0.5*_data_phi_src_1m1_20_32[ctr_0];\n", + " double x56 = (x55*x55);\n", + " double x57 = (float) __frcp_rn( (float) x52 + x54 + x56);\n", + " double x58 = x52*x57;\n", + " double x59 = x54*x57;\n", + " double x60 = x56*x57;\n", + " double x61 = x14 - 2.0*_data_phi_src_1m1_20_32[ctr_0];\n", + " double x62 = (float) sqrtf((float) x51*x55);\n", + " double x63 = x17 - 2.0*_data_phi_src_1m1_20_30[ctr_0];\n", + " double x64 = -0.5*_data_phi_src_10_20_30[ctr_0 - 1] + 0.5*_data_phi_src_10_20_30[ctr_0 + 1];\n", + " double x65 = x19 + x64 + 0.5*_data_phi_src_1m1_20_30[ctr_0 + 1];\n", + " 
double * const _data_phi_src_1m1_2m1_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 17030;\n", + " double x66 = -0.5*_data_phi_src_1m1_2m1_30[ctr_0];\n", + " double * const _data_phi_src_1m1_21_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 16770;\n", + " double x67 = x23 + x66 + 0.5*_data_phi_src_1m1_21_30[ctr_0];\n", + " double x68 = (float) sqrtf((float) (x63*x63) + (x65*x65) + (x67*x67));\n", + " double x69 = -0.5*_data_phi_src_10_20_32[ctr_0 - 1] + 0.5*_data_phi_src_10_20_32[ctr_0 + 1];\n", + " double x70 = x26 + x69 + 0.5*_data_phi_src_1m1_20_32[ctr_0 + 1];\n", + " double * const _data_phi_src_1m1_2m1_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4376970;\n", + " double x71 = -0.5*_data_phi_src_1m1_2m1_32[ctr_0];\n", + " double * const _data_phi_src_1m1_21_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4410770;\n", + " double x72 = x30 + x71 + 0.5*_data_phi_src_1m1_21_32[ctr_0];\n", + " double x73 = (x61*x61) + (x70*x70) + (x72*x72);\n", + " double x74 = (float) sqrtf((float) x73);\n", + " double * const _data_mu_src_1m1_20 = _data_mu_src + 130*ctr_1 + 16900*ctr_2 - 130;\n", + " double x75 = x60*x34 + 0.07262377194289385*_data_mu_src_1m1_20[ctr_0];\n", + " double x76 = fdividef(x60, x73);\n", + " double x77 = (float) sqrtf((float) x53*x55);\n", + " double x78 = x40 - 2.0*_data_phi_src_1m1_20_31[ctr_0];\n", + " double x79 = -0.5*_data_phi_src_10_20_31[ctr_0 - 1] + 0.5*_data_phi_src_10_20_31[ctr_0 + 1];\n", + " double x80 = x42 + x79 + 0.5*_data_phi_src_1m1_20_31[ctr_0 + 1];\n", + " double * const _data_phi_src_1m1_2m1_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2179970;\n", + " double x81 = -0.5*_data_phi_src_1m1_2m1_31[ctr_0];\n", + " double * const _data_phi_src_1m1_21_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2213770;\n", + " double x82 = x46 + x81 + 0.5*_data_phi_src_1m1_21_31[ctr_0];\n", + " double x83 = (float) sqrtf((float) (x78*x78) + (x80*x80) + (x82*x82));\n", + " double * _data_mu_stag_10_20_31_40 = _data_mu_stag + 130*ctr_1 + 16900*ctr_2 + 
2197000;\n", + " double * const _data_phi_dst_1m1_20_30 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 - 130;\n", + " double * const _data_phi_dst_1m1_20_31 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 + 2196870;\n", + " _data_mu_stag_10_20_31_40[ctr_0] = -x61*((x62 < 1.0000000000000001e-9 || x68*x74 < 1.0000000000000001e-9) ? (0): (fdividef(x51*x76*-x58*x36 + 0.013865117527725106*_data_mu_src_1m1_20[ctr_0] + x75*x37 - 16.0*_data_phi_src_1m1_20_30[ctr_0] + 16.0*_data_phi_dst_1m1_20_30[ctr_0]*x61*x63 + x65*x70 + x67*x72, x62*x68)))*3.9269908169872414 + ((x77 < 1.0000000000000001e-9 || x74*x83 < 1.0000000000000001e-9) ? (0): (fdividef(x53*x76*-x59*x49 + 0.0067250649162728321*_data_mu_src_1m1_20[ctr_0] + x75*x50 - 16.0*_data_phi_src_1m1_20_31[ctr_0] + 16.0*_data_phi_dst_1m1_20_31[ctr_0]*x61*x78 + x70*x80 + x72*x82, x77*x83)))*3.9269908169872414 + x0 - 2.0*_data_mu_src_1m1_20[ctr_0]*x58*2.7730235055450212e-5 + x59*1.3450129832545665e-5 + x60*0.1452475438857877;\n", + " } \n", + " if (ctr_0 < 129 && ctr_1 < 129)\n", + " {\n", + " double * const _data_mu_src_10_20 = _data_mu_src + 130*ctr_1 + 16900*ctr_2;\n", + " double x0 = 2.0*_data_mu_src_10_20[ctr_0];\n", + " double * const _data_phi_src_10_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2;\n", + " double x1 = 0.5*_data_phi_src_10_20_30[ctr_0];\n", + " double * const _data_phi_src_10_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2197000;\n", + " double x4 = 0.5*_data_phi_src_10_20_31[ctr_0];\n", + " double * const _data_phi_src_10_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4394000;\n", + " double x7 = 0.5*_data_phi_src_10_20_32[ctr_0];\n", + " double x14 = 2.0*_data_phi_src_10_20_32[ctr_0];\n", + " double x17 = 2.0*_data_phi_src_10_20_30[ctr_0];\n", + " double * const _data_phi_src_11_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 130;\n", + " double * const _data_phi_src_1m1_20_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 130;\n", + " double x20 = -0.5*_data_phi_src_1m1_20_30[ctr_0] + 
0.5*_data_phi_src_11_20_30[ctr_0];\n", + " double * const _data_phi_src_10_2m1_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 16900;\n", + " double x22 = -0.5*_data_phi_src_10_2m1_30[ctr_0 - 1];\n", + " double * const _data_phi_src_11_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4394130;\n", + " double * const _data_phi_src_1m1_20_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4393870;\n", + " double x27 = -0.5*_data_phi_src_1m1_20_32[ctr_0] + 0.5*_data_phi_src_11_20_32[ctr_0];\n", + " double * const _data_phi_src_10_2m1_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4377100;\n", + " double x29 = -0.5*_data_phi_src_10_2m1_32[ctr_0 - 1];\n", + " double x34 = 0.07262377194289385*_data_mu_src_10_20[ctr_0] + 7.2703712674679055;\n", + " double x36 = 0.013865117527725106*_data_mu_src_10_20[ctr_0] + 1.3792290810961054;\n", + " double * const _data_phi_dst_10_20_30 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2;\n", + " double x37 = -16.0*_data_phi_src_10_20_30[ctr_0] + 16.0*_data_phi_dst_10_20_30[ctr_0];\n", + " double x40 = 2.0*_data_phi_src_10_20_31[ctr_0];\n", + " double * const _data_phi_src_11_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2197130;\n", + " double * const _data_phi_src_1m1_20_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2196870;\n", + " double x43 = -0.5*_data_phi_src_1m1_20_31[ctr_0] + 0.5*_data_phi_src_11_20_31[ctr_0];\n", + " double * const _data_phi_src_10_2m1_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2180100;\n", + " double x45 = -0.5*_data_phi_src_10_2m1_31[ctr_0 - 1];\n", + " double x49 = 0.0067250649162728321*_data_mu_src_10_20[ctr_0] + 0.97420957122621499;\n", + " double * const _data_phi_dst_10_20_31 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 + 2197000;\n", + " double x50 = -16.0*_data_phi_src_10_20_31[ctr_0] + 16.0*_data_phi_dst_10_20_31[ctr_0];\n", + " double x64 = -0.5*_data_phi_src_10_20_30[ctr_0 - 1] + 0.5*_data_phi_src_10_20_30[ctr_0 + 1];\n", + " double * const _data_phi_src_1m1_2m1_30 = _data_phi_src + 130*ctr_1 + 
16900*ctr_2 - 17030;\n", + " double x66 = -0.5*_data_phi_src_1m1_2m1_30[ctr_0];\n", + " double x69 = -0.5*_data_phi_src_10_20_32[ctr_0 - 1] + 0.5*_data_phi_src_10_20_32[ctr_0 + 1];\n", + " double * const _data_phi_src_1m1_2m1_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4376970;\n", + " double x71 = -0.5*_data_phi_src_1m1_2m1_32[ctr_0];\n", + " double x79 = -0.5*_data_phi_src_10_20_31[ctr_0 - 1] + 0.5*_data_phi_src_10_20_31[ctr_0 + 1];\n", + " double * const _data_phi_src_1m1_2m1_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2179970;\n", + " double x81 = -0.5*_data_phi_src_1m1_2m1_31[ctr_0];\n", + " double x84 = x1 + 0.5*_data_phi_src_10_2m1_30[ctr_0];\n", + " double x85 = (x84*x84);\n", + " double x86 = x4 + 0.5*_data_phi_src_10_2m1_31[ctr_0];\n", + " double x87 = (x86*x86);\n", + " double x88 = x7 + 0.5*_data_phi_src_10_2m1_32[ctr_0];\n", + " double x89 = (x88*x88);\n", + " double x90 = (float) __frcp_rn( (float) x85 + x87 + x89);\n", + " double x91 = x85*x90;\n", + " double x92 = x87*x90;\n", + " double x93 = x89*x90;\n", + " double x94 = x14 - 2.0*_data_phi_src_10_2m1_32[ctr_0];\n", + " double x95 = (float) sqrtf((float) x84*x88);\n", + " double x96 = x17 - 2.0*_data_phi_src_10_2m1_30[ctr_0];\n", + " double x97 = x22 + x64 + 0.5*_data_phi_src_10_2m1_30[ctr_0 + 1];\n", + " double * const _data_phi_src_11_2m1_30 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 - 16770;\n", + " double x98 = x20 + x66 + 0.5*_data_phi_src_11_2m1_30[ctr_0];\n", + " double x99 = (float) sqrtf((float) (x96*x96) + (x97*x97) + (x98*x98));\n", + " double x100 = x29 + x69 + 0.5*_data_phi_src_10_2m1_32[ctr_0 + 1];\n", + " double * const _data_phi_src_11_2m1_32 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 4377230;\n", + " double x101 = x27 + x71 + 0.5*_data_phi_src_11_2m1_32[ctr_0];\n", + " double x102 = (x100*x100) + (x101*x101) + (x94*x94);\n", + " double x103 = (float) sqrtf((float) x102);\n", + " double * const _data_mu_src_10_2m1 = _data_mu_src + 130*ctr_1 + 16900*ctr_2 - 16900;\n", + " 
double x104 = x93*x34 + 0.07262377194289385*_data_mu_src_10_2m1[ctr_0];\n", + " double x105 = fdividef(x93, x102);\n", + " double x106 = (float) sqrtf((float) x86*x88);\n", + " double x107 = x40 - 2.0*_data_phi_src_10_2m1_31[ctr_0];\n", + " double x108 = x45 + x79 + 0.5*_data_phi_src_10_2m1_31[ctr_0 + 1];\n", + " double * const _data_phi_src_11_2m1_31 = _data_phi_src + 130*ctr_1 + 16900*ctr_2 + 2180230;\n", + " double x109 = x43 + x81 + 0.5*_data_phi_src_11_2m1_31[ctr_0];\n", + " double x110 = (float) sqrtf((float) (x107*x107) + (x108*x108) + (x109*x109));\n", + " double * _data_mu_stag_10_20_32_40 = _data_mu_stag + 130*ctr_1 + 16900*ctr_2 + 4394000;\n", + " double * const _data_phi_dst_10_2m1_30 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 - 16900;\n", + " double * const _data_phi_dst_10_2m1_31 = _data_phi_dst + 130*ctr_1 + 16900*ctr_2 + 2180100;\n", + " _data_mu_stag_10_20_32_40[ctr_0] = -x94*((x106 < 1.0000000000000001e-9 || x103*x110 < 1.0000000000000001e-9) ? (0): (fdividef(x105*x86*x104 - x92*x49 + 0.0067250649162728321*_data_mu_src_10_2m1[ctr_0]*x50 - 16.0*_data_phi_src_10_2m1_31[ctr_0] + 16.0*_data_phi_dst_10_2m1_31[ctr_0]*x100*x108 + x101*x109 + x107*x94, x106*x110)))*3.9269908169872414 + ((x95 < 1.0000000000000001e-9 || x103*x99 < 1.0000000000000001e-9) ? 
(0): (fdividef(x105*x84*x104 - x91*x36 + 0.013865117527725106*_data_mu_src_10_2m1[ctr_0]*x37 - 16.0*_data_phi_src_10_2m1_30[ctr_0] + 16.0*_data_phi_dst_10_2m1_30[ctr_0]*x100*x97 + x101*x98 + x94*x96, x95*x99)))*3.9269908169872414 + x0 - 2.0*_data_mu_src_10_2m1[ctr_0]*x91*2.7730235055450212e-5 + x92*1.3450129832545665e-5 + x93*0.1452475438857877;\n", + " } \n", + " } \n", + "}\n" + ] + } + ], + "source": [ + "print(show_code(mu_stag_precomp_kernel))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "rescheduled_eqs = schedule_eqs(atomize_eqs(mu_stag_update_eqs))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "63\n", + "xi_0 ↠phi_src_C^0**2\n", + "xi_1 ↠phi_src_C^1**2\n", + "xi_2 ↠phi_src_C^2**2\n", + "xi_3 ↠xi_0 + xi_1 + xi_2\n", + "xi_4 ↠phi_dst_C^1**2\n", + "xi_5 ↠phi_dst_C^0**2\n", + "xi_6 ↠phi_dst_C^2**2\n", + "xi_7 ↠32.0/(xi_4 + xi_5 + xi_6)\n", + "xi_8 ↠32.0/xi_3\n", + "xi_9 ↠2.0*mu_src_C\n", + "xi_10 ↠phi_src_C^0/2\n", + "xi_11 ↠phi_src_W^0/2 + xi_10\n", + "xi_12 ↠xi_11**2\n", + "xi_13 ↠phi_src_C^1/2\n", + "xi_14 ↠phi_src_W^1/2 + xi_13\n", + "xi_15 ↠xi_14**2\n", + "xi_16 ↠phi_src_C^2/2\n", + "xi_17 ↠phi_src_W^2/2 + xi_16\n", + "xi_18 ↠xi_17**2\n", + "xi_19 ↠1/(xi_12 + xi_15 + xi_18)\n", + "xi_20 ↠xi_12*xi_19\n", + "xi_21 ↠xi_15*xi_19\n", + "xi_22 ↠xi_18*xi_19\n", + "xi_23 ↠2.0*phi_src_C^2\n", + "xi_24 ↠-2.0*phi_src_W^2 + xi_23\n", + "xi_25 ↠sqrt(xi_11*xi_17)\n", + "xi_26 ↠2.0*phi_src_C^0\n", + "xi_27 ↠-2.0*phi_src_W^0 + xi_26\n", + "xi_28 ↠0.5*phi_src_NW^0\n", + "xi_29 ↠-0.5*phi_src_SW^0\n", + "xi_30 ↠-0.5*phi_src_S^0 + 0.5*phi_src_N^0\n", + "xi_31 ↠xi_28 + xi_29 + xi_30\n", + "xi_32 ↠0.5*phi_src_TW^0\n", + "xi_33 ↠-0.5*phi_src_BW^0\n", + "xi_34 ↠-0.5*phi_src_B^0 + 0.5*phi_src_T^0\n", + "xi_35 ↠xi_32 + xi_33 + xi_34\n", + "xi_36 ↠sqrt(xi_27**2 + xi_31**2 + xi_35**2)\n", + "xi_37 
↠0.5*phi_src_NW^2\n", + "xi_38 ↠-0.5*phi_src_SW^2\n", + "xi_39 ↠-0.5*phi_src_S^2 + 0.5*phi_src_N^2\n", + "xi_40 ↠xi_37 + xi_38 + xi_39\n", + "xi_41 ↠0.5*phi_src_TW^2\n", + "xi_42 ↠-0.5*phi_src_BW^2\n", + "xi_43 ↠-0.5*phi_src_B^2 + 0.5*phi_src_T^2\n", + "xi_44 ↠xi_41 + xi_42 + xi_43\n", + "xi_45 ↠xi_24**2 + xi_40**2 + xi_44**2\n", + "xi_46 ↠sqrt(xi_45)\n", + "xi_47 ↠0.0726237719428938*mu_src_C + 7.27037126746791\n", + "xi_48 ↠xi_22*(0.0726237719428938*mu_src_W + xi_47)\n", + "xi_49 ↠0.0138651175277251*mu_src_C + 1.37922908109611\n", + "xi_50 ↠16.0*phi_dst_C^0 - 16.0*phi_src_C^0\n", + "xi_51 ↠xi_22/xi_45\n", + "xi_52 ↠sqrt(xi_14*xi_17)\n", + "xi_53 ↠2.0*phi_src_C^1\n", + "xi_54 ↠-2.0*phi_src_W^1 + xi_53\n", + "xi_55 ↠0.5*phi_src_NW^1\n", + "xi_56 ↠-0.5*phi_src_SW^1\n", + "xi_57 ↠-0.5*phi_src_S^1 + 0.5*phi_src_N^1\n", + "xi_58 ↠xi_55 + xi_56 + xi_57\n", + "xi_59 ↠0.5*phi_src_TW^1\n", + "xi_60 ↠-0.5*phi_src_BW^1\n", + "xi_61 ↠-0.5*phi_src_B^1 + 0.5*phi_src_T^1\n", + "xi_62 ↠xi_59 + xi_60 + xi_61\n", + "xi_63 ↠sqrt(xi_54**2 + xi_58**2 + xi_62**2)\n", + "xi_64 ↠0.00672506491627283*mu_src_C + 0.974209571226215\n", + "xi_65 ↠16.0*phi_dst_C^1 - 16.0*phi_src_C^1\n", + "xi_66 ↠-xi_9\n", + "xi_67 ↠phi_src_E^0/2 + xi_10\n", + "xi_68 ↠xi_67**2\n", + "xi_69 ↠phi_src_E^1/2 + xi_13\n", + "xi_70 ↠xi_69**2\n", + "xi_71 ↠phi_src_E^2/2 + xi_16\n", + "xi_72 ↠xi_71**2\n", + "xi_73 ↠1/(xi_68 + xi_70 + xi_72)\n", + "xi_74 ↠xi_68*xi_73\n", + "xi_75 ↠xi_70*xi_73\n", + "xi_76 ↠xi_72*xi_73\n", + "xi_77 ↠-xi_23\n", + "xi_78 ↠2.0*phi_src_E^2 + xi_77\n", + "xi_79 ↠sqrt(xi_67*xi_71)\n", + "xi_80 ↠-xi_26\n", + "xi_81 ↠2.0*phi_src_E^0 + xi_80\n", + "xi_82 ↠0.5*phi_src_NE^0\n", + "xi_83 ↠0.5*phi_src_SE^0\n", + "xi_84 ↠xi_30 + xi_82 - xi_83\n", + "xi_85 ↠0.5*phi_src_TE^0\n", + "xi_86 ↠0.5*phi_src_BE^0\n", + "xi_87 ↠xi_34 + xi_85 - xi_86\n", + "xi_88 ↠sqrt(xi_81**2 + xi_84**2 + xi_87**2)\n", + "xi_89 ↠0.5*phi_src_NE^2\n", + "xi_90 ↠0.5*phi_src_SE^2\n", + "xi_91 ↠xi_39 + xi_89 - xi_90\n", + "xi_92 
↠0.5*phi_src_TE^2\n", + "xi_93 ↠0.5*phi_src_BE^2\n", + "xi_94 ↠xi_43 + xi_92 - xi_93\n", + "xi_95 ↠xi_78**2 + xi_91**2 + xi_94**2\n", + "xi_96 ↠sqrt(xi_95)\n", + "xi_97 ↠xi_76*(0.0726237719428938*mu_src_E + xi_47)\n", + "xi_98 ↠xi_76/xi_95\n", + "xi_99 ↠sqrt(xi_69*xi_71)\n", + "xi_100 ↠-xi_53\n", + "xi_101 ↠2.0*phi_src_E^1 + xi_100\n", + "xi_102 ↠0.5*phi_src_NE^1\n", + "xi_103 ↠0.5*phi_src_SE^1\n", + "xi_104 ↠xi_102 - xi_103 + xi_57\n", + "xi_105 ↠0.5*phi_src_TE^1\n", + "xi_106 ↠0.5*phi_src_BE^1\n", + "xi_107 ↠xi_105 - xi_106 + xi_61\n", + "xi_108 ↠sqrt(xi_101**2 + xi_104**2 + xi_107**2)\n", + "xi_109 ↠phi_src_S^0/2 + xi_10\n", + "xi_110 ↠xi_109**2\n", + "xi_111 ↠phi_src_S^1/2 + xi_13\n", + "xi_112 ↠xi_111**2\n", + "xi_113 ↠phi_src_S^2/2 + xi_16\n", + "xi_114 ↠xi_113**2\n", + "xi_115 ↠1/(xi_110 + xi_112 + xi_114)\n", + "xi_116 ↠xi_110*xi_115\n", + "xi_117 ↠xi_112*xi_115\n", + "xi_118 ↠xi_114*xi_115\n", + "xi_119 ↠-2.0*phi_src_S^2 + xi_23\n", + "xi_120 ↠sqrt(xi_109*xi_113)\n", + "xi_121 ↠-2.0*phi_src_S^0 + xi_26\n", + "xi_122 ↠-0.5*phi_src_W^0 + 0.5*phi_src_E^0\n", + "xi_123 ↠xi_122 + xi_29 + xi_83\n", + "xi_124 ↠0.5*phi_src_TS^0\n", + "xi_125 ↠-0.5*phi_src_BS^0\n", + "xi_126 ↠xi_124 + xi_125 + xi_34\n", + "xi_127 ↠sqrt(xi_121**2 + xi_123**2 + xi_126**2)\n", + "xi_128 ↠-0.5*phi_src_W^2 + 0.5*phi_src_E^2\n", + "xi_129 ↠xi_128 + xi_38 + xi_90\n", + "xi_130 ↠0.5*phi_src_TS^2\n", + "xi_131 ↠-0.5*phi_src_BS^2\n", + "xi_132 ↠xi_130 + xi_131 + xi_43\n", + "xi_133 ↠xi_119**2 + xi_129**2 + xi_132**2\n", + "xi_134 ↠sqrt(xi_133)\n", + "xi_135 ↠xi_118*(0.0726237719428938*mu_src_S + xi_47)\n", + "xi_136 ↠xi_118/xi_133\n", + "xi_137 ↠sqrt(xi_111*xi_113)\n", + "xi_138 ↠-2.0*phi_src_S^1 + xi_53\n", + "xi_139 ↠-0.5*phi_src_W^1 + 0.5*phi_src_E^1\n", + "xi_140 ↠xi_103 + xi_139 + xi_56\n", + "xi_141 ↠0.5*phi_src_TS^1\n", + "xi_142 ↠-0.5*phi_src_BS^1\n", + "xi_143 ↠xi_141 + xi_142 + xi_61\n", + "xi_144 ↠sqrt(xi_138**2 + xi_140**2 + xi_143**2)\n", + "xi_145 ↠phi_src_N^0/2 + xi_10\n", + 
"xi_146 ↠xi_145**2\n", + "xi_147 ↠phi_src_N^1/2 + xi_13\n", + "xi_148 ↠xi_147**2\n", + "xi_149 ↠phi_src_N^2/2 + xi_16\n", + "xi_150 ↠xi_149**2\n", + "xi_151 ↠1/(xi_146 + xi_148 + xi_150)\n", + "xi_152 ↠xi_146*xi_151\n", + "xi_153 ↠xi_148*xi_151\n", + "xi_154 ↠xi_150*xi_151\n", + "xi_155 ↠2.0*phi_src_N^2 + xi_77\n", + "xi_156 ↠sqrt(xi_145*xi_149)\n", + "xi_157 ↠2.0*phi_src_N^0 + xi_80\n", + "xi_158 ↠xi_122 - xi_28 + xi_82\n", + "xi_159 ↠0.5*phi_src_TN^0\n", + "xi_160 ↠0.5*phi_src_BN^0\n", + "xi_161 ↠xi_159 - xi_160 + xi_34\n", + "xi_162 ↠sqrt(xi_157**2 + xi_158**2 + xi_161**2)\n", + "xi_163 ↠xi_128 - xi_37 + xi_89\n", + "xi_164 ↠0.5*phi_src_TN^2\n", + "xi_165 ↠0.5*phi_src_BN^2\n", + "xi_166 ↠xi_164 - xi_165 + xi_43\n", + "xi_167 ↠xi_155**2 + xi_163**2 + xi_166**2\n", + "xi_168 ↠sqrt(xi_167)\n", + "xi_169 ↠xi_154*(0.0726237719428938*mu_src_N + xi_47)\n", + "xi_170 ↠xi_154/xi_167\n", + "xi_171 ↠sqrt(xi_147*xi_149)\n", + "xi_172 ↠2.0*phi_src_N^1 + xi_100\n", + "xi_173 ↠xi_102 + xi_139 - xi_55\n", + "xi_174 ↠0.5*phi_src_TN^1\n", + "xi_175 ↠0.5*phi_src_BN^1\n", + "xi_176 ↠xi_174 - xi_175 + xi_61\n", + "xi_177 ↠sqrt(xi_172**2 + xi_173**2 + xi_176**2)\n", + "xi_178 ↠phi_src_B^0/2 + xi_10\n", + "xi_179 ↠xi_178**2\n", + "xi_180 ↠phi_src_B^1/2 + xi_13\n", + "xi_181 ↠xi_180**2\n", + "xi_182 ↠phi_src_B^2/2 + xi_16\n", + "xi_183 ↠xi_182**2\n", + "xi_184 ↠1/(xi_179 + xi_181 + xi_183)\n", + "xi_185 ↠xi_179*xi_184\n", + "xi_186 ↠xi_181*xi_184\n", + "xi_187 ↠xi_183*xi_184\n", + "xi_188 ↠-2.0*phi_src_B^2 + xi_23\n", + "xi_189 ↠sqrt(xi_178*xi_182)\n", + "xi_190 ↠-2.0*phi_src_B^0 + xi_26\n", + "xi_191 ↠xi_122 + xi_33 + xi_86\n", + "xi_192 ↠xi_125 + xi_160 + xi_30\n", + "xi_193 ↠sqrt(xi_190**2 + xi_191**2 + xi_192**2)\n", + "xi_194 ↠xi_128 + xi_42 + xi_93\n", + "xi_195 ↠xi_131 + xi_165 + xi_39\n", + "xi_196 ↠xi_188**2 + xi_194**2 + xi_195**2\n", + "xi_197 ↠sqrt(xi_196)\n", + "xi_198 ↠xi_187*(0.0726237719428938*mu_src_B + xi_47)\n", + "xi_199 ↠xi_187/xi_196\n", + "xi_200 
↠sqrt(xi_180*xi_182)\n", + "xi_201 ↠-2.0*phi_src_B^1 + xi_53\n", + "xi_202 ↠xi_106 + xi_139 + xi_60\n", + "xi_203 ↠xi_142 + xi_175 + xi_57\n", + "xi_204 ↠sqrt(xi_201**2 + xi_202**2 + xi_203**2)\n", + "xi_205 ↠phi_src_T^0/2 + xi_10\n", + "xi_206 ↠xi_205**2\n", + "xi_207 ↠phi_src_T^1/2 + xi_13\n", + "xi_208 ↠xi_207**2\n", + "xi_209 ↠phi_src_T^2/2 + xi_16\n", + "xi_210 ↠xi_209**2\n", + "xi_211 ↠1/(xi_206 + xi_208 + xi_210)\n", + "xi_212 ↠xi_206*xi_211\n", + "xi_213 ↠xi_208*xi_211\n", + "xi_214 ↠xi_210*xi_211\n", + "xi_215 ↠2.0*phi_src_T^2 + xi_77\n", + "xi_216 ↠sqrt(xi_205*xi_209)\n", + "xi_217 ↠2.0*phi_src_T^0 + xi_80\n", + "xi_218 ↠xi_122 - xi_32 + xi_85\n", + "xi_219 ↠-xi_124 + xi_159 + xi_30\n", + "xi_220 ↠sqrt(xi_217**2 + xi_218**2 + xi_219**2)\n", + "xi_221 ↠xi_128 - xi_41 + xi_92\n", + "xi_222 ↠-xi_130 + xi_164 + xi_39\n", + "xi_223 ↠xi_215**2 + xi_221**2 + xi_222**2\n", + "xi_224 ↠sqrt(xi_223)\n", + "xi_225 ↠xi_214*(0.0726237719428938*mu_src_T + xi_47)\n", + "xi_226 ↠xi_214/xi_223\n", + "xi_227 ↠sqrt(xi_207*xi_209)\n", + "xi_228 ↠2.0*phi_src_T^1 + xi_100\n", + "xi_229 ↠xi_105 + xi_139 - xi_59\n", + "xi_230 ↠-xi_141 + xi_174 + xi_57\n", + "xi_231 ↠sqrt(xi_228**2 + xi_229**2 + xi_230**2)\n", + "dc_dmu_0_0 ↠xi_3/(0.0277302350554502*xi_0 + 0.0134501298325457*xi_1 + 0.145247543885788*xi_2)\n", + "dc_dphi_dt_0 ↠(0.0134501298325457*mu_src_C + 0.974209571226215)*(-xi_1*xi_8 + xi_4*xi_7) + (0.0277302350554502*mu_src_C + 1.37922908109611)*(-xi_0*xi_8 + xi_5*xi_7) + (0.145247543885788*mu_src_C + 7.27037126746791)*(-xi_2*xi_8 + xi_6*xi_7)\n", + "dc_dT_dt_0 ↠0\n", + "staggered_down_0_0 ↠-xi_24*(3.92699081698724*Piecewise((0, (xi_25 < 1.0e-9) | (xi_36*xi_46 < 1.0e-9)), (xi_11*xi_51*(-xi_20*(0.0138651175277251*mu_src_W + xi_49) + xi_48)*(16.0*phi_dst_W^0 - 16.0*phi_src_W^0 + xi_50)*(xi_24*xi_27 + xi_31*xi_40 + xi_35*xi_44)/(xi_25*xi_36), True)) + 3.92699081698724*Piecewise((0, (xi_52 < 1.0e-9) | (xi_46*xi_63 < 1.0e-9)), (xi_14*xi_51*(-xi_21*(0.00672506491627283*mu_src_W + 
xi_64) + xi_48)*(16.0*phi_dst_W^1 - 16.0*phi_src_W^1 + xi_65)*(xi_24*xi_54 + xi_40*xi_58 + xi_44*xi_62)/(xi_52*xi_63), True))) + (-2.0*mu_src_W + xi_9)*(2.77302350554502e-5*xi_20 + 1.34501298325457e-5*xi_21 + 0.145247543885788*xi_22)\n", + "staggered_up_0_0 ↠-xi_78*(3.92699081698724*Piecewise((0, (xi_79 < 1.0e-9) | (xi_88*xi_96 < 1.0e-9)), (xi_67*xi_98*(-xi_74*(0.0138651175277251*mu_src_E + xi_49) + xi_97)*(16.0*phi_dst_E^0 - 16.0*phi_src_E^0 + xi_50)*(xi_78*xi_81 + xi_84*xi_91 + xi_87*xi_94)/(xi_79*xi_88), True)) + 3.92699081698724*Piecewise((0, (xi_99 < 1.0e-9) | (xi_108*xi_96 < 1.0e-9)), (xi_69*xi_98*(-xi_75*(0.00672506491627283*mu_src_E + xi_64) + xi_97)*(16.0*phi_dst_E^1 - 16.0*phi_src_E^1 + xi_65)*(xi_101*xi_78 + xi_104*xi_91 + xi_107*xi_94)/(xi_108*xi_99), True))) + (2.0*mu_src_E + xi_66)*(2.77302350554502e-5*xi_74 + 1.34501298325457e-5*xi_75 + 0.145247543885788*xi_76)\n", + "staggered_down_1_0 ↠-xi_119*(3.92699081698724*Piecewise((0, (xi_120 < 1.0e-9) | (xi_127*xi_134 < 1.0e-9)), (xi_109*xi_136*(-xi_116*(0.0138651175277251*mu_src_S + xi_49) + xi_135)*(16.0*phi_dst_S^0 - 16.0*phi_src_S^0 + xi_50)*(xi_119*xi_121 + xi_123*xi_129 + xi_126*xi_132)/(xi_120*xi_127), True)) + 3.92699081698724*Piecewise((0, (xi_137 < 1.0e-9) | (xi_134*xi_144 < 1.0e-9)), (xi_111*xi_136*(-xi_117*(0.00672506491627283*mu_src_S + xi_64) + xi_135)*(16.0*phi_dst_S^1 - 16.0*phi_src_S^1 + xi_65)*(xi_119*xi_138 + xi_129*xi_140 + xi_132*xi_143)/(xi_137*xi_144), True))) + (-2.0*mu_src_S + xi_9)*(2.77302350554502e-5*xi_116 + 1.34501298325457e-5*xi_117 + 0.145247543885788*xi_118)\n", + "staggered_up_1_0 ↠-xi_155*(3.92699081698724*Piecewise((0, (xi_156 < 1.0e-9) | (xi_162*xi_168 < 1.0e-9)), (xi_145*xi_170*(-xi_152*(0.0138651175277251*mu_src_N + xi_49) + xi_169)*(16.0*phi_dst_N^0 - 16.0*phi_src_N^0 + xi_50)*(xi_155*xi_157 + xi_158*xi_163 + xi_161*xi_166)/(xi_156*xi_162), True)) + 3.92699081698724*Piecewise((0, (xi_171 < 1.0e-9) | (xi_168*xi_177 < 1.0e-9)), 
(xi_147*xi_170*(-xi_153*(0.00672506491627283*mu_src_N + xi_64) + xi_169)*(16.0*phi_dst_N^1 - 16.0*phi_src_N^1 + xi_65)*(xi_155*xi_172 + xi_163*xi_173 + xi_166*xi_176)/(xi_171*xi_177), True))) + (2.0*mu_src_N + xi_66)*(2.77302350554502e-5*xi_152 + 1.34501298325457e-5*xi_153 + 0.145247543885788*xi_154)\n", + "staggered_down_2_0 ↠-xi_188*(3.92699081698724*Piecewise((0, (xi_189 < 1.0e-9) | (xi_193*xi_197 < 1.0e-9)), (xi_178*xi_199*(-xi_185*(0.0138651175277251*mu_src_B + xi_49) + xi_198)*(16.0*phi_dst_B^0 - 16.0*phi_src_B^0 + xi_50)*(xi_188*xi_190 + xi_191*xi_194 + xi_192*xi_195)/(xi_189*xi_193), True)) + 3.92699081698724*Piecewise((0, (xi_200 < 1.0e-9) | (xi_197*xi_204 < 1.0e-9)), (xi_180*xi_199*(-xi_186*(0.00672506491627283*mu_src_B + xi_64) + xi_198)*(16.0*phi_dst_B^1 - 16.0*phi_src_B^1 + xi_65)*(xi_188*xi_201 + xi_194*xi_202 + xi_195*xi_203)/(xi_200*xi_204), True))) + (-2.0*mu_src_B + xi_9)*(2.77302350554502e-5*xi_185 + 1.34501298325457e-5*xi_186 + 0.145247543885788*xi_187)\n", + "staggered_up_2_0 ↠-xi_215*(3.92699081698724*Piecewise((0, (xi_216 < 1.0e-9) | (xi_220*xi_224 < 1.0e-9)), (xi_205*xi_226*(-xi_212*(0.0138651175277251*mu_src_T + xi_49) + xi_225)*(16.0*phi_dst_T^0 - 16.0*phi_src_T^0 + xi_50)*(xi_215*xi_217 + xi_218*xi_221 + xi_219*xi_222)/(xi_216*xi_220), True)) + 3.92699081698724*Piecewise((0, (xi_227 < 1.0e-9) | (xi_224*xi_231 < 1.0e-9)), (xi_207*xi_226*(-xi_213*(0.00672506491627283*mu_src_T + xi_64) + xi_225)*(16.0*phi_dst_T^1 - 16.0*phi_src_T^1 + xi_65)*(xi_215*xi_228 + xi_221*xi_229 + xi_222*xi_230)/(xi_227*xi_231), True))) + (2.0*mu_src_T + xi_66)*(2.77302350554502e-5*xi_212 + 1.34501298325457e-5*xi_213 + 0.145247543885788*xi_214)\n", + "divMgradmu_0 ↠-2.0*staggered_down_0_0 - 2.0*staggered_down_1_0 - 2.0*staggered_down_2_0 + 2.0*staggered_up_0_0 + 2.0*staggered_up_1_0 + 2.0*staggered_up_2_0\n", + "mu_dst[0,0,0] ↠mu_src_C + 0.03125*dc_dmu_0_0*(-dc_dT_dt_0 - dc_dphi_dt_0 + divMgradmu_0)\n" + ] + } + ], + "source": [ + "print(len(rescheduled_eqs))\n", + 
"for eq in mu_update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "128\n", + "1018\n", + "\t.headerflags\t@\"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM60 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM60)\"\n", + "\t.elftype\t@\"ET_EXEC\"\n", + "\n", + "\n", + "//--------------------- .text.kernel --------------------------\n", + "\t.section\t.text.kernel,\"ax\",@progbits\n", + "\t.sectioninfo\t@\"SHI_REGISTERS=72\"\n", + "\t.align\t32\n", + " .global kernel\n", + " .type kernel,@function\n", + " .size kernel,(.L_48 - kernel)\n", + " .other kernel,@\"STO_CUDA_ENTRY STV_DEFAULT\"\n", + "kernel:\n", + ".text.kernel:\n", + " /*0008*/ MOV R1, c[0x0][0x20] ;\n", + " /*0010*/ S2R R0, SR_CTAID.Y ;\n", + " /*0018*/ S2R R2, SR_TID.Y ;\n", + " /*0028*/ S2R R50, SR_CTAID.X ;\n", + " /*0030*/ S2R R3, SR_TID.X ;\n", + " /*0038*/ S2R R4, SR_CTAID.Z ;\n", + " /*0048*/ { XMAD R5, R0.reuse, c[0x0] [0xc], R2 SLOT 0;\n", + " /*0050*/ S2R R2, SR_TID.Z SLOT 1 }\n", + " /*0058*/ XMAD.MRG R6, R0, c[0x0] [0xc].H1, RZ ;\n", + " /*0068*/ XMAD.PSL.CBCC R0, R0.H1, R6.H1, R5 ;\n", + " /*0070*/ XMAD R3, R50.reuse, c[0x0] [0x8], R3 ;\n", + " /*0078*/ XMAD.MRG R5, R50.reuse, c[0x0] [0x8].H1, RZ ;\n", + " /*0088*/ IADD32I R0, R0, 0x1 ;\n", + " /*0090*/ XMAD.PSL.CBCC R50, R50.H1, R5.H1, R3 ;\n", + " /*0098*/ ISETP.GE.U32.AND P0, PT, R0, 0x82, PT ;\n", + " /*00a8*/ IADD32I R50, R50, 0x1 ;\n", + " /*00b0*/ XMAD R2, R4.reuse, c[0x0] [0x10], R2 ;\n", + " /*00b8*/ XMAD.MRG R3, R4.reuse, c[0x0] [0x10].H1, RZ ;\n", + " /*00c8*/ XMAD.PSL.CBCC R2, R4.H1, R3.H1, R2 ;\n", + " /*00d0*/ ISETP.LT.U32.AND P0, PT, R50, 0x82, !P0 ;\n", + " /*00d8*/ IADD32I R2, R2, 0x1 ;\n", + " /*00e8*/ ISETP.LT.U32.AND P0, PT, R2, 0x82, P0 ;\n", + " /*00f0*/ @!P0 EXIT ;\n", + " /*00f8*/ { MOV32I R6, 0x82 ;\n", + " /*0108*/ SSY `(.L_1) }\n", + " /*0110*/ MOV32I R9, 0x82 ;\n", + 
" /*0118*/ MOV32I R11, 0x4204 ;\n", + " /*0128*/ XMAD R8, R2, 0x4204, RZ ;\n", + " /*0130*/ XMAD R3, R0.reuse, 0x82, RZ ;\n", + " /*0138*/ XMAD R5, R0, 0x82, RZ ;\n", + " /*0148*/ XMAD R7, R2.reuse, 0x4204, RZ ;\n", + " /*0150*/ MOV32I R12, 0x4204 ;\n", + " /*0158*/ XMAD R4, R0.reuse, R6.H1, RZ ;\n", + " /*0168*/ XMAD R6, R0.H1, R9.H1, RZ ;\n", + " /*0170*/ XMAD R9, R2.reuse, R11.H1, RZ ;\n", + " /*0178*/ XMAD.CHI R11, R2.H1, 0x4204, R8 ;\n", + " /*0188*/ XMAD.PSL R3, R0.H1.reuse, 0x82, R3 ;\n", + " /*0190*/ XMAD.CHI R5, R0.H1, 0x82, R5 ;\n", + " /*0198*/ XMAD R10, R2.H1.reuse, R12.H1, RZ ;\n", + " /*01a8*/ XMAD.PSL R8, R2.H1.reuse, 0x4204, R7 ;\n", + " /*01b0*/ ISETP.GE.U32.AND P1, PT, R2, 0x81, PT ;\n", + " /*01b8*/ IADD3.RS R4, R5, R4, R6 ;\n", + " /*01c8*/ IADD3.RS R9, R11, R9, R10 ;\n", + " /*01d0*/ IADD R3.CC, R8, R3 ;\n", + " /*01d8*/ ISETP.GE.U32.AND P0, PT, R0.reuse, 0x81, PT ;\n", + " /*01e8*/ IADD.X R4, R9, R4 ;\n", + " /*01f0*/ IADD R50.CC, R50, R3 ;\n", + " /*01f8*/ ISETP.LT.U32.AND P2, PT, R0, 0x81, !P1 ;\n", + " /*0208*/ IADD.X R4, RZ, R4 ;\n", + " /*0210*/ SHL R3, R50.reuse, 0x3 ;\n", + " /*0218*/ SHF.L.U64 R2, R50, 0x3, R4 ;\n", + " /*0228*/ IADD R62.CC, R3, c[0x0][0x150] ;\n", + " /*0230*/ { IADD.X R63, R2, c[0x0][0x154] ;\n", + " /*0238*/ @!P2 SYNC (*\"TARGET= .L_1 \"*) }\n", + " /*0248*/ { IADD R60.CC, R3, c[0x0][0x158] ;\n", + " /*0250*/ SSY `(.L_2) }\n", + " /*0258*/ IADD.X R61, R2, c[0x0][0x15c] ;\n", + " /*0268*/ LDG.E.64 R4, [R60+-0x8] ;\n", + " /*0270*/ LDG.E.64 R24, [R60] ;\n", + " /*0278*/ IADD32I R16.CC, R60, 0x1000000 ;\n", + " /*0288*/ IADD.X R17, RZ, R61 ;\n", + " /*0290*/ { IADD32I R20.CC, R60, 0x2000000 ;\n", + " /*0298*/ LDG.E.64 R6, [R16+0xc3038] }\n", + " /*02a8*/ { IADD.X R21, RZ, R61 ;\n", + " /*02b0*/ LDG.E.64 R8, [R16+0xc3040] }\n", + " /*02b8*/ { IADD R58.CC, R3, c[0x0][0x140] ;\n", + " /*02c8*/ LDG.E.64 R18, [R20+0x186078] }\n", + " /*02d0*/ { IADD.X R59, R2, c[0x0][0x144] ;\n", + " /*02d8*/ LDG.E.64 R28, [R20+0x186080] 
}\n", + " /*02e8*/ LDG.E.64 R10, [R58] ;\n", + " /*02f0*/ DEPBAR.LE SB5, 0x4 ;\n", + " /*02f8*/ DMUL R14, R4, 0.5 ;\n", + " /*0308*/ DFMA R12, R24, 0.5, R14 ;\n", + " /*0310*/ DMUL R22, R12, R12 ;\n", + " /*0318*/ DSETP.GEU.AND P2, PT, |R22|, c[0x2][0x0], PT ;\n", + " /*0328*/ F2F.F32.F64 R0, R22 ;\n", + " /*0330*/ DEPBAR.LE SB5, 0x2 ;\n", + " /*0338*/ DMUL R26, R6, 0.5 ;\n", + " /*0348*/ DMUL R14, R18, 0.5 ;\n", + " /*0350*/ @!P2 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*0358*/ DFMA R26, R8, 0.5, R26 ;\n", + " /*0368*/ { FMUL.FTZ R0, R0, 1 ;\n", + " /*0370*/ DEPBAR.LE SB5, 0x1 }\n", + " /*0378*/ DFMA R14, R28, 0.5, R14 ;\n", + " /*0388*/ DMUL R16, R26, R26 ;\n", + " /*0390*/ F2F.F64.F32 R30, R0 ;\n", + " /*0398*/ DMUL R20, R14, R14 ;\n", + " /*03a8*/ DADD R30, R16, R30 ;\n", + " /*03b0*/ DADD R30, R20, R30 ;\n", + " /*03b8*/ DSETP.GEU.AND P2, PT, |R30|, c[0x2][0x0], PT ;\n", + " /*03c8*/ F2F.F32.F64 R0, R30 ;\n", + " /*03d0*/ @!P2 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*03d8*/ IADD32I R30, R0, 0x1800000 ;\n", + " /*03e8*/ LOP32I.AND R30, R30, 0x7f800000 ;\n", + " /*03f0*/ ISETP.GT.U32.AND P2, PT, R30, c[0x2][0x8], PT ;\n", + " /*03f8*/ @P2 BRA `(.L_3) ;\n", + " /*0408*/ CAL `($kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath) ;\n", + " /*0410*/ SYNC (*\"TARGET= .L_2 \"*);\n", + ".L_3:\n", + " /*0418*/ MUFU.RCP R30, R0 ;\n", + " /*0428*/ FFMA R31, R0, R30, c[0x2][0xc] ;\n", + " /*0430*/ FADD.FTZ R31, -R31, -RZ ;\n", + " /*0438*/ { FFMA R64, R30, R31, R30 ;\n", + " /*0448*/ SYNC (*\"TARGET= .L_2 \"*) }\n", + ".L_2:\n", + " /*0450*/ { IADD32I R68.CC, R60.reuse, 0x2000000 ;\n", + " /*0458*/ LDG.E.64 R38, [R60+0x410] }\n", + " /*0468*/ { IADD.X R69, RZ, R61.reuse ;\n", + " /*0470*/ LDG.E.64 R34, [R60+-0x410] }\n", + " /*0478*/ { DADD R56, R28, R28 ;\n", + " /*0488*/ LDG.E.64 R44, [R60+-0x418] }\n", + " /*0490*/ { DADD R18, R18, R18 ;\n", + " /*0498*/ LDG.E.64 R52, [R68+0x186490] }\n", + " /*04a8*/ { IADD32I R42.CC, R60, 0x1000000 ;\n", + " /*04b0*/ LDG.E.64 
R36, [R68+0x1a70a0] }\n", + " /*04b8*/ { DADD R18, R56, -R18 ;\n", + " /*04c8*/ LDG.E.64 R50, [R68+0x185c70] }\n", + " /*04d0*/ { IADD.X R43, RZ, R61 ;\n", + " /*04d8*/ LDG.E.64 R46, [R68+0x165060] }\n", + " /*04e8*/ { DSETP.GEU.AND P2, PT, |R12|, c[0x2][0x0], PT ;\n", + " /*04f0*/ LDG.E.64 R30, [R68+0x185c68] }\n", + " /*04f8*/ { DSETP.GEU.AND P3, PT, |R26|, c[0x2][0x0], PT ;\n", + " /*0508*/ LDG.E.64 R40, [R68+0x165058] }\n", + " /*0510*/ { F2F.F32.F64 R0, R12 ;\n", + " /*0518*/ LDG.E.64 R48, [R68+0x186488] }\n", + " /*0528*/ { F2F.F32.F64 R26, R26 ;\n", + " /*0530*/ LDG.E.64 R28, [R60+0x21020] }\n", + " /*0538*/ { FMUL.FTZ R64, R64, 1 ;\n", + " /*0548*/ LDG.E.64 R56, [R60+-0x21020] }\n", + " /*0550*/ LDG.E.64 R32, [R68+0x1a7098] ;\n", + " /*0558*/ LDG.E.64 R54, [R42+0xc3450] ;\n", + " /*0568*/ LDG.E.64 R66, [R42+0xc2c30] ;\n", + " /*0570*/ LDG.E.64 R58, [R58+-0x8] ;\n", + " /*0578*/ { @!P2 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*0588*/ SSY `(.L_4) }\n", + " /*0590*/ @!P3 FMUL R26, R26, 1.175494350822287508e-38 ;\n", + " /*0598*/ FMUL.FTZ R0, R0, 1 ;\n", + " /*05a8*/ FMUL.FTZ R26, R26, 1 ;\n", + " /*05b0*/ F2F.F64.F32 R68, R26 ;\n", + " /*05b8*/ DEPBAR.LE SB5, 0x8 ;\n", + " /*05c8*/ DMUL R52, R52, 0.5 ;\n", + " /*05d0*/ DMUL R36, R36, 0.5 ;\n", + " /*05d8*/ { DFMA R52, R50, -0.5, R52 ;\n", + " /*05e8*/ DEPBAR.LE SB5, 0x6 }\n", + " /*05f0*/ { DFMA R70, R46, -0.5, R36 ;\n", + " /*05f8*/ LDG.E.64 R50, [R60+-0x21028] }\n", + " /*0608*/ LDG.E.64 R46, [R42+0xc2c28] ;\n", + " /*0610*/ LDG.E.64 R36, [R42+0xe4060] ;\n", + " /*0618*/ DFMA R30, R30, -0.5, R52 ;\n", + " /*0628*/ { F2F.F64.F32 R52, R0 ;\n", + " /*0630*/ DEPBAR.LE SB5, 0x8 }\n", + " /*0638*/ DFMA R70, R40, -0.5, R70 ;\n", + " /*0648*/ LDG.E.64 R40, [R42+0xa2020] ;\n", + " /*0650*/ DMUL R26, R14, R52 ;\n", + " /*0658*/ { DMUL R52, R38, 0.5 ;\n", + " /*0668*/ DEPBAR.LE SB5, 0x8 }\n", + " /*0670*/ DFMA R48, R48, 0.5, R30 ;\n", + " /*0678*/ LDG.E.64 R30, [R60+0x408] ;\n", + " /*0688*/ LDG.E.64 R60, 
[R60+0x21018] ;\n", + " /*0690*/ DFMA R38, R34, -0.5, R52 ;\n", + " /*0698*/ LDG.E.64 R34, [R42+0xa2018] ;\n", + " /*06a8*/ LDG.E.64 R52, [R42+0xe4058] ;\n", + " /*06b0*/ DFMA R44, R44, -0.5, R38 ;\n", + " /*06b8*/ LDG.E.64 R38, [R42+0xc3448] ;\n", + " /*06c8*/ DEPBAR.LE SB5, 0x9 ;\n", + " /*06d0*/ DMUL R28, R28, 0.5 ;\n", + " /*06d8*/ DFMA R32, R32, 0.5, R70 ;\n", + " /*06e8*/ DFMA R28, R56, -0.5, R28 ;\n", + " /*06f0*/ { IADD32I R42.CC, R62, 0x1000000 ;\n", + " /*06f8*/ LDG.E.64 R56, [R62] }\n", + " /*0708*/ F2F.F64.F32 R70, R64 ;\n", + " /*0710*/ DEPBAR.LE SB5, 0x8 ;\n", + " /*0718*/ DMUL R64, R54, 0.5 ;\n", + " /*0728*/ IADD.X R43, RZ, R63 ;\n", + " /*0730*/ LDG.E.64 R54, [R42+0xc3040] ;\n", + " /*0738*/ DFMA R64, R66, -0.5, R64 ;\n", + " /*0748*/ DSETP.GEU.AND P2, PT, |R26|, c[0x2][0x0], PT ;\n", + " /*0750*/ F2F.F32.F64 R0, R26 ;\n", + " /*0758*/ DMUL R20, R20, R70 ;\n", + " /*0768*/ @!P2 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*0770*/ MUFU.SQRT R0, R0 ;\n", + " /*0778*/ DMUL R14, R14, R68 ;\n", + " /*0788*/ DSETP.GEU.AND P5, PT, |R20|, c[0x2][0x0], PT ;\n", + " /*0790*/ DMUL R22, R22, R70.reuse ;\n", + " /*0798*/ DMUL R16, R16, R70 ;\n", + " /*07a8*/ DEPBAR.LE SB5, 0x6 ;\n", + " /*07b0*/ DFMA R50, R50, -0.5, R28 ;\n", + " /*07b8*/ DADD R28, R8, R8 ;\n", + " /*07c8*/ DFMA R64, R46, -0.5, R64 ;\n", + " /*07d0*/ DADD R46, R6, R6 ;\n", + " /*07d8*/ DMUL R66, R36, 0.5 ;\n", + " /*07e8*/ DADD R36, R24, R24 ;\n", + " /*07f0*/ DADD R28, R28, -R46 ;\n", + " /*07f8*/ { DADD R46, R4, R4 ;\n", + " /*0808*/ DEPBAR.LE SB5, 0x3 }\n", + " /*0810*/ DFMA R40, R40, -0.5, R66 ;\n", + " /*0818*/ DFMA R30, R30, 0.5, R44 ;\n", + " /*0828*/ DADD R36, R36, -R46 ;\n", + " /*0830*/ DMUL R44, R28, R28 ;\n", + " /*0838*/ DFMA R34, R34, -0.5, R40 ;\n", + " /*0848*/ DMUL R40, R36, R36 ;\n", + " /*0850*/ DSETP.GEU.AND P4, PT, |R44|, c[0x2][0x0], PT ;\n", + " /*0858*/ F2F.F32.F64 R47, R44 ;\n", + " /*0868*/ DSETP.GEU.AND P3, PT, |R40|, c[0x2][0x0], PT ;\n", + " /*0870*/ DMUL R44, 
R48, R48 ;\n", + " /*0878*/ F2F.F32.F64 R46, R40 ;\n", + " /*0888*/ @!P4 FMUL R47, R47, 1.175494350822287508e-38 ;\n", + " /*0890*/ DFMA R26, R18, R18, R44 ;\n", + " /*0898*/ FMUL.FTZ R40, R47, 1 ;\n", + " /*08a8*/ { @!P3 FMUL R46, R46, 1.175494350822287508e-38 ;\n", + " /*08b0*/ DEPBAR.LE SB5, 0x2 }\n", + " /*08b8*/ DFMA R38, R38, 0.5, R64 ;\n", + " /*08c8*/ F2F.F64.F32 R40, R40 ;\n", + " /*08d0*/ FMUL.FTZ R44, R46, 1 ;\n", + " /*08d8*/ DFMA R26, R32, R32, R26 ;\n", + " /*08e8*/ DFMA R34, R52, 0.5, R34 ;\n", + " /*08f0*/ F2F.F64.F32 R44, R44 ;\n", + " /*08f8*/ DFMA R52, R38, R38, R40 ;\n", + " /*0908*/ DSETP.GEU.AND P2, PT, |R26|, c[0x2][0x0], PT ;\n", + " /*0910*/ FMUL.FTZ R46, R0, 1 ;\n", + " /*0918*/ DFMA R50, R60, 0.5, R50 ;\n", + " /*0928*/ F2F.F32.F64 R40, R26 ;\n", + " /*0930*/ DFMA R60, R30, R30, R44 ;\n", + " /*0938*/ DFMA R44, R34, R34, R52 ;\n", + " /*0948*/ F2F.F64.F32 R26, R46 ;\n", + " /*0950*/ MOV32I R64, 0x349e35fd ;\n", + " /*0958*/ F2F.F32.F64 R46, R20 ;\n", + " /*0968*/ MOV32I R65, 0x401d14dc ;\n", + " /*0970*/ DSETP.GEU.AND P4, PT, |R14|, c[0x2][0x0], PT ;\n", + " /*0978*/ DFMA R52, R50, R50, R60 ;\n", + " /*0988*/ DSETP.GEU.AND P3, PT, |R44|, c[0x2][0x0], PT ;\n", + " /*0990*/ @!P2 FMUL R40, R40, 1.175494350822287508e-38 ;\n", + " /*0998*/ { DSETP.GEU.AND P2, PT, R26, c[0x2][0x18], PT ;\n", + " /*09a8*/ MUFU.RCP R47, R40 }\n", + " /*09b0*/ { DFMA R66, R10, c[0x2][0x10], R64 ;\n", + " /*09b8*/ MUFU.SQRT R41, R40 }\n", + " /*09c8*/ F2F.F32.F64 R60, R14 ;\n", + " /*09d0*/ F2F.F32.F64 R61, R44 ;\n", + " /*09d8*/ DMUL R26, R56, 16 ;\n", + " /*09e8*/ @!P5 FMUL R46, R46, 1.175494350822287508e-38 ;\n", + " /*09f0*/ DSETP.GEU.AND P5, PT, |R52|, c[0x2][0x0], PT ;\n", + " /*09f8*/ F2F.F32.F64 R52, R52 ;\n", + " /*0a08*/ DMUL R14, R54, 16 ;\n", + " /*0a10*/ DFMA R26, R24, -16, R26 ;\n", + " /*0a18*/ @!P4 FMUL R60, R60, 1.175494350822287508e-38 ;\n", + " /*0a28*/ { DMUL R24, R20, R66 ;\n", + " /*0a30*/ MUFU.SQRT R53, R60 }\n", + " /*0a38*/ @!P3 FMUL R61, 
R61, 1.175494350822287508e-38 ;\n", + " /*0a48*/ { FMUL.FTZ R46, R46, R47 ;\n", + " /*0a50*/ MUFU.SQRT R54, R61 }\n", + " /*0a58*/ FMUL.FTZ R44, R41, 1 ;\n", + " /*0a68*/ DFMA R14, R8, -16, R14 ;\n", + " /*0a70*/ MOV R40, RZ ;\n", + " /*0a78*/ DFMA R24, R58, c[0x2][0x10], R24 ;\n", + " /*0a88*/ F2F.F64.F32 R46, R46 ;\n", + " /*0a90*/ F2F.F64.F32 R44, R44 ;\n", + " /*0a98*/ MOV R41, RZ ;\n", + " /*0aa8*/ MOV R56, RZ ;\n", + " /*0ab0*/ MOV R57, RZ ;\n", + " /*0ab8*/ { @!P5 FMUL R52, R52, 1.175494350822287508e-38 ;\n", + " /*0ac8*/ @!P2 SYNC (*\"TARGET= .L_4 \"*) }\n", + " /*0ad0*/ MUFU.SQRT R55, R52 ;\n", + " /*0ad8*/ FMUL.FTZ R66, R55, 1 ;\n", + " /*0ae8*/ F2F.F64.F32 R66, R66 ;\n", + " /*0af0*/ DMUL R66, R44, R66 ;\n", + " /*0af8*/ DSETP.GEU.AND P2, PT, R66, c[0x2][0x18], PT ;\n", + " /*0b08*/ @!P2 SYNC (*\"TARGET= .L_4 \"*);\n", + " /*0b10*/ { DMUL R12, R12, R46 ;\n", + " /*0b18*/ LDG.E.64 R56, [R62+-0x8] }\n", + " /*0b28*/ MOV32I R60, 0x834fff9c ;\n", + " /*0b30*/ MOV32I R61, 0x3ff61152 ;\n", + " /*0b38*/ FMUL.FTZ R0, R0, R55 ;\n", + " /*0b48*/ MUFU.RCP R0, R0 ;\n", + " /*0b50*/ DMUL R66, R22, R12 ;\n", + " /*0b58*/ DFMA R12, R10, c[0x2][0x20], R60 ;\n", + " /*0b68*/ DMUL R12, R12, R66 ;\n", + " /*0b70*/ DFMA R12, R58, c[0x2][0x20], -R12 ;\n", + " /*0b78*/ DFMA R26, R24, R26, R12 ;\n", + " /*0b88*/ DFMA R26, R4, 16.NEG, R26 ;\n", + " /*0b90*/ DMUL R56, R56, 16 ;\n", + " /*0b98*/ DMUL R56, R18, R56 ;\n", + " /*0ba8*/ DFMA R26, R36, R56, R26 ;\n", + " /*0bb0*/ DFMA R26, R30, R48, R26 ;\n", + " /*0bb8*/ DFMA R26, R50, R32, R26 ;\n", + " /*0bc8*/ DSETP.GEU.AND P2, PT, |R26|, c[0x2][0x0], PT ;\n", + " /*0bd0*/ F2F.F32.F64 R26, R26 ;\n", + " /*0bd8*/ @!P2 FMUL R26, R26, 1.175494350822287508e-38 ;\n", + " /*0be8*/ FMUL.FTZ R26, R26, R0 ;\n", + " /*0bf0*/ { F2F.F64.F32 R56, R26 ;\n", + " /*0bf8*/ SYNC (*\"TARGET= .L_4 \"*) }\n", + ".L_4:\n", + " /*0c08*/ { FMUL.FTZ R26, R54, 1 ;\n", + " /*0c10*/ SSY `(.L_5) }\n", + " /*0c18*/ FMUL.FTZ R4, R53, 1 ;\n", + " /*0c28*/ 
F2F.F64.F32 R26, R26 ;\n", + " /*0c30*/ F2F.F64.F32 R4, R4 ;\n", + " /*0c38*/ DMUL R26, R44, R26 ;\n", + " /*0c48*/ DSETP.GEU.AND P2, PT, R26, c[0x2][0x18], PT ;\n", + " /*0c50*/ DSETP.LT.OR P2, PT, R4, c[0x2][0x18], !P2 ;\n", + " /*0c58*/ @P2 SYNC (*\"TARGET= .L_5 \"*);\n", + " /*0c68*/ { DMUL R26, R6, 0.5 ;\n", + " /*0c70*/ LDG.E.64 R42, [R42+0xc3038] }\n", + " /*0c78*/ MOV32I R4, 0x8cfbbca1 ;\n", + " /*0c88*/ MOV32I R5, 0x3fef2cb9 ;\n", + " /*0c90*/ FMUL.FTZ R53, R53, R54 ;\n", + " /*0c98*/ MUFU.RCP R53, R53 ;\n", + " /*0ca8*/ DFMA R26, R8, 0.5, R26 ;\n", + " /*0cb0*/ DFMA R4, R10, c[0x2][0x28], R4 ;\n", + " /*0cb8*/ DMUL R26, R26, R46 ;\n", + " /*0cc8*/ DMUL R26, R16, R26 ;\n", + " /*0cd0*/ DMUL R4, R4, R26 ;\n", + " /*0cd8*/ DFMA R4, R58, c[0x2][0x28], -R4 ;\n", + " /*0ce8*/ DFMA R4, R24, R14, R4 ;\n", + " /*0cf0*/ DFMA R6, R6, 16.NEG, R4 ;\n", + " /*0cf8*/ DMUL R8, R42, 16 ;\n", + " /*0d08*/ DMUL R8, R18, R8 ;\n", + " /*0d10*/ DFMA R6, R28, R8, R6 ;\n", + " /*0d18*/ DFMA R6, R48, R38, R6 ;\n", + " /*0d28*/ DFMA R6, R32, R34, R6 ;\n", + " /*0d30*/ DSETP.GEU.AND P2, PT, |R6|, c[0x2][0x0], PT ;\n", + " /*0d38*/ F2F.F32.F64 R6, R6 ;\n", + " /*0d48*/ @!P2 FMUL R6, R6, 1.175494350822287508e-38 ;\n", + " /*0d50*/ FMUL.FTZ R6, R6, R53 ;\n", + " /*0d58*/ F2F.F64.F32 R40, R6 ;\n", + " /*0d68*/ { DMUL R40, R40, c[0x2][0x30] ;\n", + " /*0d70*/ SYNC (*\"TARGET= .L_5 \"*) }\n", + ".L_5:\n", + " /*0d78*/ DMUL R18, R18, R56 ;\n", + " /*0d88*/ DADD R4, R58, R58 ;\n", + " /*0d90*/ DFMA R40, R18, c[0x2][0x38], R40 ;\n", + " /*0d98*/ DMUL R4, R22, R4 ;\n", + " /*0da8*/ DFMA R10, R10, 2, R40 ;\n", + " /*0db0*/ DFMA R10, R4, c[0x2][0x40], R10 ;\n", + " /*0db8*/ IADD R4.CC, R3, c[0x0][0x148] ;\n", + " /*0dc8*/ DFMA R10, R16, c[0x2][0x48], R10 ;\n", + " /*0dd0*/ IADD.X R5, R2, c[0x0][0x14c] ;\n", + " /*0dd8*/ DFMA R10, R20, c[0x2][0x50], R10 ;\n", + " /*0de8*/ STG.E.64 [R4], R10 ;\n", + " /*0df0*/ SYNC (*\"TARGET= .L_1 \"*);\n", + ".L_1:\n", + " /*0df8*/ S2R R0, SR_TID.X ;\n", + " 
/*0e08*/ SSY `(.L_6) ;\n", + " /*0e10*/ S2R R5, SR_CTAID.X ;\n", + " /*0e18*/ XMAD R0, R5.reuse, c[0x0] [0x8], R0 ;\n", + " /*0e28*/ XMAD.MRG R6, R5.reuse, c[0x0] [0x8].H1, RZ ;\n", + " /*0e30*/ XMAD.PSL.CBCC R0, R5.H1, R6.H1, R0 ;\n", + " /*0e38*/ IADD32I R50, R0, 0x1 ;\n", + " /*0e48*/ ISETP.LT.U32.AND P1, PT, R50, 0x81, !P1 ;\n", + " /*0e50*/ @!P1 SYNC (*\"TARGET= .L_6 \"*);\n", + " /*0e58*/ { IADD R60.CC, R3.reuse, c[0x0][0x158] ;\n", + " /*0e68*/ LDG.E.64 R42, [R62] }\n", + " /*0e70*/ { IADD.X R61, R2, c[0x0][0x15c] ;\n", + " /*0e78*/ SSY `(.L_7) }\n", + " /*0e88*/ { IADD32I R28.CC, R60.reuse, 0x1000000 ;\n", + " /*0e90*/ LDG.E.64 R4, [R60+-0x410] }\n", + " /*0e98*/ { IADD.X R29, RZ, R61.reuse ;\n", + " /*0ea8*/ LDG.E.64 R40, [R60] }\n", + " /*0eb0*/ { IADD32I R30.CC, R60, 0x2000000 ;\n", + " /*0eb8*/ LDG.E.64 R52, [R60+0x21020] }\n", + " /*0ec8*/ { IADD.X R31, RZ, R61 ;\n", + " /*0ed0*/ LDG.E.64 R6, [R28+0xc2c30] }\n", + " /*0ed8*/ { IADD32I R38.CC, R62, 0x1000000 ;\n", + " /*0ee8*/ LDG.E.64 R10, [R28+0xc3040] }\n", + " /*0ef0*/ { IADD.X R39, RZ, R63 ;\n", + " /*0ef8*/ LDG.E.64 R8, [R30+0x185c70] }\n", + " /*0f08*/ { IADD R58.CC, R3, c[0x0][0x140] ;\n", + " /*0f10*/ LDG.E.64 R44, [R30+0x186080] }\n", + " /*0f18*/ { IADD.X R59, R2, c[0x0][0x144] ;\n", + " /*0f28*/ LDG.E.64 R48, [R30+0x1a70a0] }\n", + " /*0f30*/ LDG.E.64 R46, [R28+0xe4060] ;\n", + " /*0f38*/ LDG.E.64 R34, [R30+0x165060] ;\n", + " /*0f48*/ LDG.E.64 R38, [R38+0xc3040] ;\n", + " /*0f50*/ LDG.E.64 R32, [R60+-0x21020] ;\n", + " /*0f58*/ LDG.E.64 R36, [R28+0xa2020] ;\n", + " /*0f68*/ LDG.E.64 R12, [R58] ;\n", + " /*0f70*/ LDG.E.64 R14, [R60+-0x418] ;\n", + " /*0f78*/ LDG.E.64 R18, [R28+0xc2c28] ;\n", + " /*0f88*/ LDG.E.64 R16, [R30+0x185c68] ;\n", + " /*0f90*/ DEPBAR.LE SB5, 0xc ;\n", + " /*0f98*/ DMUL R22, R4, 0.5 ;\n", + " /*0fa8*/ DFMA R20, R40, 0.5, R22 ;\n", + " /*0fb0*/ DMUL R22, R20, R20 ;\n", + " /*0fb8*/ DSETP.GEU.AND P1, PT, |R22|, c[0x2][0x0], PT ;\n", + " /*0fc8*/ F2F.F32.F64 R0, R22 
;\n", + " /*0fd0*/ DEPBAR.LE SB5, 0xa ;\n", + " /*0fd8*/ DMUL R24, R6, 0.5 ;\n", + " /*0fe8*/ DMUL R26, R8, 0.5 ;\n", + " /*0ff0*/ @!P1 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*0ff8*/ DFMA R24, R10, 0.5, R24 ;\n", + " /*1008*/ { FMUL.FTZ R0, R0, 1 ;\n", + " /*1010*/ DEPBAR.LE SB5, 0x9 }\n", + " /*1018*/ DFMA R26, R44, 0.5, R26 ;\n", + " /*1028*/ DMUL R28, R24, R24 ;\n", + " /*1030*/ F2F.F64.F32 R54, R0 ;\n", + " /*1038*/ DMUL R30, R26, R26 ;\n", + " /*1048*/ DADD R56, R28, R54 ;\n", + " /*1050*/ DADD R56, R30, R56 ;\n", + " /*1058*/ DSETP.GEU.AND P1, PT, |R56|, c[0x2][0x0], PT ;\n", + " /*1068*/ F2F.F32.F64 R0, R56 ;\n", + " /*1070*/ @!P1 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*1078*/ IADD32I R51, R0, 0x1800000 ;\n", + " /*1088*/ { DMUL R54, R52, 0.5 ;\n", + " /*1090*/ DEPBAR.LE SB5, 0x7 }\n", + " /*1098*/ DMUL R48, R48, 0.5 ;\n", + " /*10a8*/ LOP32I.AND R51, R51, 0x7f800000 ;\n", + " /*10b0*/ DMUL R46, R46, 0.5 ;\n", + " /*10b8*/ { DMUL R42, R42, 16 ;\n", + " /*10c8*/ DEPBAR.LE SB5, 0x4 }\n", + " /*10d0*/ ISETP.GT.U32.AND P1, PT, R51, c[0x2][0x8], PT ;\n", + " /*10d8*/ DFMA R34, R34, -0.5, R48 ;\n", + " /*10e8*/ DFMA R32, R32, -0.5, R54 ;\n", + " /*10f0*/ DMUL R48, R38, 16 ;\n", + " /*10f8*/ MOV32I R54, 0x349e35fd ;\n", + " /*1108*/ MOV32I R55, 0x401d14dc ;\n", + " /*1110*/ DFMA R36, R36, -0.5, R46 ;\n", + " /*1118*/ DFMA R38, R40.reuse, -16, R42 ;\n", + " /*1128*/ DADD R44, R44, R44 ;\n", + " /*1130*/ DADD R40, R40, R40 ;\n", + " /*1138*/ DADD R46, R10.reuse, R10 ;\n", + " /*1148*/ DFMA R48, R10, -16, R48 ;\n", + " /*1150*/ { DFMA R42, R12, c[0x2][0x10], R54 ;\n", + " /*1158*/ @P1 BRA `(.L_8) }\n", + " /*1168*/ CAL `($kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath) ;\n", + " /*1170*/ SYNC (*\"TARGET= .L_7 \"*);\n", + ".L_8:\n", + " /*1178*/ MUFU.RCP R51, R0 ;\n", + " /*1188*/ FFMA R52, R0, R51, c[0x2][0xc] ;\n", + " /*1190*/ FADD.FTZ R52, -R52, -RZ ;\n", + " /*1198*/ { FFMA R64, R51, R52, R51 ;\n", + " /*11a8*/ SYNC (*\"TARGET= .L_7 \"*) }\n", + 
".L_7:\n", + " /*11b0*/ { IADD32I R52.CC, R60, 0x2000000 ;\n", + " /*11b8*/ LDG.E.64 R58, [R58+-0x410] }\n", + " /*11c8*/ { IADD.X R53, RZ, R61 ;\n", + " /*11d0*/ SSY `(.L_9) }\n", + " /*11d8*/ { DADD R70, R8, R8 ;\n", + " /*11e8*/ LDG.E.64 R68, [R52+0x186088] }\n", + " /*11f0*/ { DADD R44, R44, -R70 ;\n", + " /*11f8*/ LDG.E.64 R66, [R52+0x186078] }\n", + " /*1208*/ { DSETP.GEU.AND P1, PT, |R20|, c[0x2][0x0], PT ;\n", + " /*1210*/ LDG.E.64 R56, [R52+0x164c50] }\n", + " /*1218*/ { F2F.F32.F64 R0, R20 ;\n", + " /*1228*/ LDG.E.64 R54, [R52+0x185c78] }\n", + " /*1230*/ { DSETP.GEU.AND P3, PT, |R24|, c[0x2][0x0], PT ;\n", + " /*1238*/ LDG.E.64 R8, [R60+0x8] }\n", + " /*1248*/ { FMUL.FTZ R64, R64, 1 ;\n", + " /*1250*/ LDG.E.64 R52, [R52+0x1a6c90] }\n", + " /*1258*/ DEPBAR.LE SB5, 0x2 ;\n", + " /*1268*/ DMUL R68, R68, 0.5 ;\n", + " /*1270*/ DFMA R66, R66, -0.5, R68 ;\n", + " /*1278*/ DFMA R68, R16, -0.5, R66 ;\n", + " /*1288*/ LDG.E.64 R16, [R60+-0x8] ;\n", + " /*1290*/ DEPBAR.LE SB5, 0x2 ;\n", + " /*1298*/ DFMA R66, R56, -0.5, R34 ;\n", + " /*12a8*/ IADD32I R34.CC, R60, 0x1000000 ;\n", + " /*12b0*/ IADD.X R35, RZ, R61 ;\n", + " /*12b8*/ LDG.E.64 R56, [R34+0xc3048] ;\n", + " /*12c8*/ DEPBAR.LE SB5, 0x2 ;\n", + " /*12d0*/ DFMA R52, R52, 0.5, R66 ;\n", + " /*12d8*/ LDG.E.64 R66, [R34+0xc3038] ;\n", + " /*12e8*/ DFMA R54, R54, 0.5, R68 ;\n", + " /*12f0*/ LDG.E.64 R68, [R60+-0x408] ;\n", + " /*12f8*/ DMUL R70, R8, 0.5 ;\n", + " /*1308*/ LDG.E.64 R8, [R34+0xc2c38] ;\n", + " /*1310*/ DEPBAR.LE SB5, 0x4 ;\n", + " /*1318*/ DFMA R16, R16, -0.5, R70 ;\n", + " /*1328*/ DFMA R70, R14, -0.5, R16 ;\n", + " /*1330*/ LDG.E.64 R14, [R60+-0x21430] ;\n", + " /*1338*/ DEPBAR.LE SB5, 0x3 ;\n", + " /*1348*/ DMUL R16, R56, 0.5 ;\n", + " /*1350*/ LDG.E.64 R56, [R34+0xe3c50] ;\n", + " /*1358*/ DFMA R16, R66, -0.5, R16 ;\n", + " /*1368*/ LDG.E.64 R66, [R34+0xa1c10] ;\n", + " /*1370*/ DEPBAR.LE SB5, 0x3 ;\n", + " /*1378*/ DFMA R68, R68, 0.5, R70 ;\n", + " /*1388*/ LDG.E.64 R70, [R60+0x20c10] ;\n", 
+ " /*1390*/ DFMA R18, R18, -0.5, R16 ;\n", + " /*1398*/ DADD R16, R6, R6 ;\n", + " /*13a8*/ @!P1 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*13b0*/ DADD R46, R46, -R16 ;\n", + " /*13b8*/ FMUL.FTZ R0, R0, 1 ;\n", + " /*13c8*/ { DMUL R16, R46, R46 ;\n", + " /*13d0*/ DEPBAR.LE SB5, 0x3 }\n", + " /*13d8*/ DFMA R8, R8, 0.5, R18 ;\n", + " /*13e8*/ DSETP.GEU.AND P1, PT, |R16|, c[0x2][0x0], PT ;\n", + " /*13f0*/ F2F.F32.F64 R51, R16 ;\n", + " /*13f8*/ DMUL R34, R54, R54 ;\n", + " /*1408*/ F2F.F32.F64 R24, R24 ;\n", + " /*1410*/ @!P1 FMUL R51, R51, 1.175494350822287508e-38 ;\n", + " /*1418*/ @!P3 FMUL R24, R24, 1.175494350822287508e-38 ;\n", + " /*1428*/ FMUL.FTZ R24, R24, 1 ;\n", + " /*1430*/ DEPBAR.LE SB5, 0x2 ;\n", + " /*1438*/ DFMA R32, R14, -0.5, R32 ;\n", + " /*1448*/ DADD R14, R4, R4 ;\n", + " /*1450*/ { DADD R40, R40, -R14 ;\n", + " /*1458*/ DEPBAR.LE SB5, 0x1 }\n", + " /*1468*/ DFMA R14, R66, -0.5, R36 ;\n", + " /*1470*/ F2F.F64.F32 R36, R0 ;\n", + " /*1478*/ DMUL R18, R40, R40 ;\n", + " /*1488*/ DMUL R16, R26, R36 ;\n", + " /*1490*/ DSETP.GEU.AND P2, PT, |R18|, c[0x2][0x0], PT ;\n", + " /*1498*/ F2F.F32.F64 R0, R18 ;\n", + " /*14a8*/ DFMA R18, R44, R44, R34 ;\n", + " /*14b0*/ DSETP.GEU.AND P1, PT, |R16|, c[0x2][0x0], PT ;\n", + " /*14b8*/ F2F.F32.F64 R25, R16 ;\n", + " /*14c8*/ FMUL.FTZ R34, R51, 1 ;\n", + " /*14d0*/ F2F.F64.F32 R16, R64 ;\n", + " /*14d8*/ DFMA R18, R52, R52, R18 ;\n", + " /*14e8*/ @!P2 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*14f0*/ F2F.F64.F32 R34, R34 ;\n", + " /*14f8*/ DMUL R36, R30, R16 ;\n", + " /*1508*/ DFMA R14, R56, 0.5, R14 ;\n", + " /*1510*/ FMUL.FTZ R30, R0, 1 ;\n", + " /*1518*/ @!P1 FMUL R25, R25, 1.175494350822287508e-38 ;\n", + " /*1528*/ { F2F.F64.F32 R56, R24 ;\n", + " /*1530*/ MUFU.SQRT R0, R25 }\n", + " /*1538*/ DSETP.GEU.AND P1, PT, |R18|, c[0x2][0x0], PT ;\n", + " /*1548*/ F2F.F32.F64 R24, R18 ;\n", + " /*1550*/ DFMA R60, R8, R8, R34 ;\n", + " /*1558*/ F2F.F64.F32 R18, R30 ;\n", + " /*1568*/ DFMA R70, R70, 0.5, 
R32 ;\n", + " /*1570*/ DSETP.GEU.AND P2, PT, |R36|, c[0x2][0x0], PT ;\n", + " /*1578*/ DMUL R26, R26, R56 ;\n", + " /*1588*/ DFMA R60, R14, R14, R60 ;\n", + " /*1590*/ DFMA R32, R68, R68, R18 ;\n", + " /*1598*/ FMUL.FTZ R18, R0, 1 ;\n", + " /*15a8*/ F2F.F32.F64 R34, R36 ;\n", + " /*15b0*/ DSETP.GEU.AND P3, PT, |R26|, c[0x2][0x0], PT ;\n", + " /*15b8*/ @!P1 FMUL R24, R24, 1.175494350822287508e-38 ;\n", + " /*15c8*/ { DSETP.GEU.AND P4, PT, |R60|, c[0x2][0x0], PT ;\n", + " /*15d0*/ MUFU.RCP R35, R24 }\n", + " /*15d8*/ { F2F.F64.F32 R18, R18 ;\n", + " /*15e8*/ MUFU.SQRT R25, R24 }\n", + " /*15f0*/ DFMA R30, R70, R70, R32 ;\n", + " /*15f8*/ F2F.F32.F64 R27, R26 ;\n", + " /*1608*/ @!P2 FMUL R34, R34, 1.175494350822287508e-38 ;\n", + " /*1610*/ F2F.F32.F64 R26, R60 ;\n", + " /*1618*/ DSETP.GEU.AND P2, PT, R18, c[0x2][0x18], PT ;\n", + " /*1628*/ DSETP.GEU.AND P1, PT, |R30|, c[0x2][0x0], PT ;\n", + " /*1630*/ DMUL R18, R58, c[0x2][0x10] ;\n", + " /*1638*/ @!P3 FMUL R27, R27, 1.175494350822287508e-38 ;\n", + " /*1648*/ { FMUL.FTZ R32, R34, R35 ;\n", + " /*1650*/ MUFU.SQRT R27, R27 }\n", + " /*1658*/ @!P4 FMUL R26, R26, 1.175494350822287508e-38 ;\n", + " /*1668*/ { F2F.F32.F64 R33, R30 ;\n", + " /*1670*/ MUFU.SQRT R26, R26 }\n", + " /*1678*/ FMUL.FTZ R25, R25, 1 ;\n", + " /*1688*/ DFMA R42, R42, R36, R18 ;\n", + " /*1690*/ DMUL R22, R22, R16.reuse ;\n", + " /*1698*/ F2F.F64.F32 R18, R32 ;\n", + " /*16a8*/ DMUL R16, R28, R16 ;\n", + " /*16b0*/ F2F.F64.F32 R24, R25 ;\n", + " /*16b8*/ MOV R28, RZ ;\n", + " /*16c8*/ MOV R29, RZ ;\n", + " /*16d0*/ MOV R30, RZ ;\n", + " /*16d8*/ MOV R31, RZ ;\n", + " /*16e8*/ { @!P1 FMUL R33, R33, 1.175494350822287508e-38 ;\n", + " /*16f0*/ @!P2 SYNC (*\"TARGET= .L_9 \"*) }\n", + " /*16f8*/ MUFU.SQRT R33, R33 ;\n", + " /*1708*/ FMUL.FTZ R34, R33, 1 ;\n", + " /*1710*/ F2F.F64.F32 R34, R34 ;\n", + " /*1718*/ DMUL R34, R24, R34 ;\n", + " /*1728*/ DSETP.GEU.AND P1, PT, R34, c[0x2][0x18], PT ;\n", + " /*1730*/ @!P1 SYNC (*\"TARGET= .L_9 \"*);\n", + " 
/*1738*/ { DMUL R20, R20, R18 ;\n", + " /*1748*/ LDG.E.64 R30, [R62+-0x410] }\n", + " /*1750*/ MOV32I R66, 0x834fff9c ;\n", + " /*1758*/ MOV32I R67, 0x3ff61152 ;\n", + " /*1768*/ FMUL.FTZ R0, R0, R33 ;\n", + " /*1770*/ MUFU.RCP R0, R0 ;\n", + " /*1778*/ DMUL R34, R22, R20 ;\n", + " /*1788*/ DFMA R20, R12, c[0x2][0x20], R66 ;\n", + " /*1790*/ DMUL R20, R20, R34 ;\n", + " /*1798*/ DFMA R20, R58, c[0x2][0x20], -R20 ;\n", + " /*17a8*/ DFMA R38, R38, R42, R20 ;\n", + " /*17b0*/ DFMA R38, R4, 16.NEG, R38 ;\n", + " /*17b8*/ DMUL R30, R30, 16 ;\n", + " /*17c8*/ DMUL R30, R44, R30 ;\n", + " /*17d0*/ DFMA R30, R40, R30, R38 ;\n", + " /*17d8*/ DFMA R30, R68, R54, R30 ;\n", + " /*17e8*/ DFMA R30, R70, R52, R30 ;\n", + " /*17f0*/ DSETP.GEU.AND P1, PT, |R30|, c[0x2][0x0], PT ;\n", + " /*17f8*/ F2F.F32.F64 R30, R30 ;\n", + " /*1808*/ @!P1 FMUL R30, R30, 1.175494350822287508e-38 ;\n", + " /*1810*/ FMUL.FTZ R30, R30, R0 ;\n", + " /*1818*/ { F2F.F64.F32 R30, R30 ;\n", + " /*1828*/ SYNC (*\"TARGET= .L_9 \"*) }\n", + ".L_9:\n", + " /*1830*/ { FMUL.FTZ R34, R26, 1 ;\n", + " /*1838*/ SSY `(.L_10) }\n", + " /*1848*/ FMUL.FTZ R4, R27, 1 ;\n", + " /*1850*/ F2F.F64.F32 R34, R34 ;\n", + " /*1858*/ F2F.F64.F32 R4, R4 ;\n", + " /*1868*/ DMUL R24, R24, R34 ;\n", + " /*1870*/ DSETP.GEU.AND P1, PT, R24, c[0x2][0x18], PT ;\n", + " /*1878*/ DSETP.LT.OR P1, PT, R4, c[0x2][0x18], !P1 ;\n", + " /*1888*/ @P1 SYNC (*\"TARGET= .L_10 \"*);\n", + " /*1890*/ IADD32I R20.CC, R62, 0x1000000 ;\n", + " /*1898*/ IADD.X R21, RZ, R63 ;\n", + " /*18a8*/ LDG.E.64 R20, [R20+0xc2c30] ;\n", + " /*18b0*/ DMUL R24, R6, 0.5 ;\n", + " /*18b8*/ FMUL.FTZ R26, R27, R26 ;\n", + " /*18c8*/ MUFU.RCP R26, R26 ;\n", + " /*18d0*/ DFMA R24, R10, 0.5, R24 ;\n", + " /*18d8*/ MOV32I R10, 0x8cfbbca1 ;\n", + " /*18e8*/ MOV32I R11, 0x3fef2cb9 ;\n", + " /*18f0*/ DMUL R18, R24, R18 ;\n", + " /*18f8*/ DFMA R4, R12, c[0x2][0x28], R10 ;\n", + " /*1908*/ DMUL R18, R16, R18 ;\n", + " /*1910*/ DMUL R4, R4, R18 ;\n", + " /*1918*/ DFMA R4, R58, 
c[0x2][0x28], -R4 ;\n", + " /*1928*/ DFMA R4, R48, R42, R4 ;\n", + " /*1930*/ DFMA R4, R6, 16.NEG, R4 ;\n", + " /*1938*/ DMUL R10, R20, 16 ;\n", + " /*1948*/ DMUL R10, R44, R10 ;\n", + " /*1950*/ DFMA R4, R46, R10, R4 ;\n", + " /*1958*/ DFMA R4, R54, R8, R4 ;\n", + " /*1968*/ DFMA R4, R52, R14, R4 ;\n", + " /*1970*/ DSETP.GEU.AND P1, PT, |R4|, c[0x2][0x0], PT ;\n", + " /*1978*/ F2F.F32.F64 R4, R4 ;\n", + " /*1988*/ @!P1 FMUL R4, R4, 1.175494350822287508e-38 ;\n", + " /*1990*/ FMUL.FTZ R4, R4, R26 ;\n", + " /*1998*/ F2F.F64.F32 R28, R4 ;\n", + " /*19a8*/ { DMUL R28, R28, c[0x2][0x30] ;\n", + " /*19b0*/ SYNC (*\"TARGET= .L_10 \"*) }\n", + ".L_10:\n", + " /*19b8*/ DMUL R30, R44, R30 ;\n", + " /*19c8*/ DADD R4, R58, R58 ;\n", + " /*19d0*/ DFMA R30, R30, c[0x2][0x38], R28 ;\n", + " /*19d8*/ DMUL R22, R22, R4 ;\n", + " /*19e8*/ IADD R4.CC, R3, c[0x0][0x148] ;\n", + " /*19f0*/ DFMA R12, R12, 2, R30 ;\n", + " /*19f8*/ IADD.X R0, R2, c[0x0][0x14c] ;\n", + " /*1a08*/ IADD32I R4.CC, R4, 0x1000000 ;\n", + " /*1a10*/ DFMA R22, R22, c[0x2][0x40], R12 ;\n", + " /*1a18*/ IADD.X R5, RZ, R0 ;\n", + " /*1a28*/ DFMA R22, R16, c[0x2][0x48], R22 ;\n", + " /*1a30*/ DFMA R22, R36, c[0x2][0x50], R22 ;\n", + " /*1a38*/ STG.E.64 [R4+0xc3040], R22 ;\n", + " /*1a48*/ SYNC (*\"TARGET= .L_6 \"*);\n", + ".L_6:\n", + " /*1a50*/ ISETP.LT.U32.AND P0, PT, R50, 0x81, !P0 ;\n", + " /*1a58*/ @!P0 EXIT ;\n", + " /*1a68*/ { IADD R60.CC, R3, c[0x0][0x158] ;\n", + " /*1a70*/ SSY `(.L_11) }\n", + " /*1a78*/ IADD.X R61, R2, c[0x0][0x15c] ;\n", + " /*1a88*/ LDG.E.64 R56, [R60+-0x21020] ;\n", + " /*1a90*/ LDG.E.64 R10, [R60] ;\n", + " /*1a98*/ { IADD32I R68.CC, R60, 0x1000000 ;\n", + " /*1aa8*/ LDG.E.64 R16, [R60+-0x410] }\n", + " /*1ab0*/ IADD.X R69, RZ, R61 ;\n", + " /*1ab8*/ { IADD32I R66.CC, R60, 0x2000000 ;\n", + " /*1ac8*/ LDG.E.64 R54, [R68+0xa2020] }\n", + " /*1ad0*/ { IADD.X R67, RZ, R61 ;\n", + " /*1ad8*/ LDG.E.64 R4, [R68+0xc3040] }\n", + " /*1ae8*/ { IADD32I R20.CC, R62, 0x1000000 ;\n", + " /*1af0*/ 
LDG.E.64 R6, [R68+0xc3450] }\n", + " /*1af8*/ { IADD.X R21, RZ, R63 ;\n", + " /*1b08*/ LDG.E.64 R52, [R66+0x165060] }\n", + " /*1b10*/ { IADD R58.CC, R3, c[0x0][0x140] ;\n", + " /*1b18*/ LDG.E.64 R64, [R66+0x186080] }\n", + " /*1b28*/ { IADD.X R59, R2, c[0x0][0x144] ;\n", + " /*1b30*/ LDG.E.64 R8, [R66+0x186490] }\n", + " /*1b38*/ LDG.E.64 R36, [R68+0xc2c30] ;\n", + " /*1b48*/ LDG.E.64 R40, [R66+0x185c70] ;\n", + " /*1b50*/ LDG.E.64 R22, [R66+0x186088] ;\n", + " /*1b58*/ LDG.E.64 R18, [R68+0xc3048] ;\n", + " /*1b68*/ LDG.E.64 R34, [R66+0x186078] ;\n", + " /*1b70*/ LDG.E.64 R30, [R68+0xc3038] ;\n", + " /*1b78*/ LDG.E.64 R2, [R20+0xc3040] ;\n", + " /*1b88*/ LDG.E.64 R32, [R58] ;\n", + " /*1b90*/ LDG.E.64 R20, [R60+-0x21028] ;\n", + " /*1b98*/ DEPBAR.LE SB5, 0xb ;\n", + " /*1ba8*/ DMUL R12, R56, 0.5 ;\n", + " /*1bb0*/ { DFMA R50, R10, 0.5, R12 ;\n", + " /*1bb8*/ DEPBAR.LE SB5, 0xa }\n", + " /*1bc8*/ { DMUL R46, R54, 0.5 ;\n", + " /*1bd0*/ LDG.E.64 R12, [R60+0x410] }\n", + " /*1bd8*/ DMUL R48, R50, R50 ;\n", + " /*1be8*/ DSETP.GEU.AND P0, PT, |R48|, c[0x2][0x0], PT ;\n", + " /*1bf0*/ F2F.F32.F64 R0, R48 ;\n", + " /*1bf8*/ DEPBAR.LE SB5, 0xa ;\n", + " /*1c08*/ DMUL R26, R52, 0.5 ;\n", + " /*1c10*/ DFMA R46, R4, 0.5, R46 ;\n", + " /*1c18*/ @!P0 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*1c28*/ DMUL R6, R6, 0.5 ;\n", + " /*1c30*/ FMUL.FTZ R0, R0, 1 ;\n", + " /*1c38*/ { DMUL R42, R46, R46 ;\n", + " /*1c48*/ DEPBAR.LE SB5, 0x8 }\n", + " /*1c50*/ DFMA R44, R64, 0.5, R26 ;\n", + " /*1c58*/ { F2F.F64.F32 R24, R0 ;\n", + " /*1c68*/ LDG.E.64 R26, [R68+0xa1c10] }\n", + " /*1c70*/ { DMUL R14, R8, 0.5 ;\n", + " /*1c78*/ DEPBAR.LE SB5, 0x6 }\n", + " /*1c88*/ { DFMA R36, R36, -0.5, R6 ;\n", + " /*1c90*/ LDG.E.64 R8, [R60+0x8] }\n", + " /*1c98*/ { DMUL R38, R44, R44 ;\n", + " /*1ca8*/ LDG.E.64 R6, [R62] }\n", + " /*1cb0*/ DADD R24, R42, R24 ;\n", + " /*1cb8*/ DFMA R40, R40, -0.5, R14 ;\n", + " /*1cc8*/ LDG.E.64 R14, [R60+-0x8] ;\n", + " /*1cd0*/ DEPBAR.LE SB5, 0x6 ;\n", + " 
/*1cd8*/ DMUL R28, R22, 0.5 ;\n", + " /*1ce8*/ { DMUL R18, R18, 0.5 ;\n", + " /*1cf0*/ LDG.E.64 R22, [R66+0x164c50] }\n", + " /*1cf8*/ DADD R24, R38, R24 ;\n", + " /*1d08*/ { DFMA R34, R34, -0.5, R28 ;\n", + " /*1d10*/ DEPBAR.LE SB5, 0x6 }\n", + " /*1d18*/ { DFMA R30, R30, -0.5, R18 ;\n", + " /*1d28*/ LDG.E.64 R28, [R68+0xa2018] }\n", + " /*1d30*/ { DSETP.GEU.AND P0, PT, |R24|, c[0x2][0x0], PT ;\n", + " /*1d38*/ LDG.E.64 R18, [R60+-0x21430] }\n", + " /*1d48*/ F2F.F32.F64 R0, R24 ;\n", + " /*1d50*/ LDG.E.64 R24, [R66+0x165058] ;\n", + " /*1d58*/ @!P0 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*1d68*/ DEPBAR.LE SB5, 0x8 ;\n", + " /*1d70*/ { DMUL R2, R2, 16 ;\n", + " /*1d78*/ DEPBAR.LE SB5, 0x7 }\n", + " /*1d88*/ DMUL R70, R12, 0.5 ;\n", + " /*1d90*/ IADD32I R12, R0, 0x1800000 ;\n", + " /*1d98*/ LOP32I.AND R12, R12, 0x7f800000 ;\n", + " /*1da8*/ ISETP.GT.U32.AND P0, PT, R12, c[0x2][0x8], PT ;\n", + " /*1db0*/ DFMA R16, R16, -0.5, R70 ;\n", + " /*1db8*/ DEPBAR.LE SB5, 0x4 ;\n", + " /*1dc8*/ DMUL R8, R8, 0.5 ;\n", + " /*1dd0*/ DMUL R12, R6, 16 ;\n", + " /*1dd8*/ MOV32I R6, 0x349e35fd ;\n", + " /*1de8*/ MOV32I R7, 0x401d14dc ;\n", + " /*1df0*/ DFMA R14, R14, -0.5, R8 ;\n", + " /*1df8*/ DFMA R12, R10.reuse, -16, R12 ;\n", + " /*1e08*/ DFMA R8, R32, c[0x2][0x10], R6 ;\n", + " /*1e10*/ DADD R10, R10, R10 ;\n", + " /*1e18*/ DFMA R6, R4.reuse, -16, R2 ;\n", + " /*1e28*/ DADD R4, R4, R4 ;\n", + " /*1e30*/ { DADD R2, R64, R64 ;\n", + " /*1e38*/ @P0 BRA `(.L_12) }\n", + " /*1e48*/ CAL `($kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath) ;\n", + " /*1e50*/ SYNC (*\"TARGET= .L_11 \"*);\n", + ".L_12:\n", + " /*1e58*/ MUFU.RCP R65, R0 ;\n", + " /*1e68*/ FFMA R64, R0, R65, c[0x2][0xc] ;\n", + " /*1e70*/ FADD.FTZ R64, -R64, -RZ ;\n", + " /*1e78*/ { FFMA R64, R65, R64, R65 ;\n", + " /*1e88*/ SYNC (*\"TARGET= .L_11 \"*) }\n", + ".L_11:\n", + " /*1e90*/ { IADD32I R70.CC, R60, 0x2000000 ;\n", + " /*1e98*/ LDG.E.64 R58, [R58+-0x21020] }\n", + " /*1ea8*/ { DFMA R14, R20, -0.5, R14 ;\n", + " 
/*1eb0*/ SSY `(.L_13) }\n", + " /*1eb8*/ IADD.X R71, RZ, R61 ;\n", + " /*1ec8*/ LDG.E.64 R66, [R70+0x165068] ;\n", + " /*1ed0*/ LDG.E.64 R20, [R70+0x165470] ;\n", + " /*1ed8*/ DEPBAR.LE SB5, 0x2 ;\n", + " /*1ee8*/ DFMA R18, R18, -0.5, R16 ;\n", + " /*1ef0*/ LDG.E.64 R16, [R60+-0x21018] ;\n", + " /*1ef8*/ DFMA R22, R22, -0.5, R40 ;\n", + " /*1f08*/ { IADD32I R68.CC, R60, 0x1000000 ;\n", + " /*1f10*/ LDG.E.64 R40, [R60+-0x20c10] }\n", + " /*1f18*/ DFMA R24, R24, -0.5, R34 ;\n", + " /*1f28*/ IADD.X R69, RZ, R61 ;\n", + " /*1f30*/ LDG.E.64 R34, [R68+0xa2028] ;\n", + " /*1f38*/ LDG.E.64 R68, [R68+0xa2430] ;\n", + " /*1f48*/ DSETP.GEU.AND P0, PT, |R46|, c[0x2][0x0], PT ;\n", + " /*1f50*/ DFMA R26, R26, -0.5, R36 ;\n", + " /*1f58*/ F2F.F32.F64 R0, R46 ;\n", + " /*1f68*/ DADD R36, R56, R56 ;\n", + " /*1f70*/ DFMA R28, R28, -0.5, R30 ;\n", + " /*1f78*/ DADD R30, R54, R54 ;\n", + " /*1f88*/ DADD R10, R10, -R36 ;\n", + " /*1f90*/ @!P0 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*1f98*/ DADD R4, R4, -R30 ;\n", + " /*1fa8*/ FMUL.FTZ R0, R0, 1 ;\n", + " /*1fb0*/ DSETP.GEU.AND P0, PT, |R50|, c[0x2][0x0], PT ;\n", + " /*1fb8*/ DMUL R30, R4, R4 ;\n", + " /*1fc8*/ DADD R52, R52, R52 ;\n", + " /*1fd0*/ DSETP.GEU.AND P2, PT, |R30|, c[0x2][0x0], PT ;\n", + " /*1fd8*/ DADD R2, R2, -R52 ;\n", + " /*1fe8*/ DMUL R36, R10, R10 ;\n", + " /*1ff0*/ DEPBAR.LE SB5, 0x5 ;\n", + " /*1ff8*/ DFMA R24, R66, 0.5, R24 ;\n", + " /*2008*/ { F2F.F64.F32 R66, R0 ;\n", + " /*2010*/ DEPBAR.LE SB5, 0x4 }\n", + " /*2018*/ DSETP.GEU.AND P1, PT, |R36|, c[0x2][0x0], PT ;\n", + " /*2028*/ DFMA R20, R20, 0.5, R22 ;\n", + " /*2030*/ F2F.F32.F64 R22, R36 ;\n", + " /*2038*/ F2F.F32.F64 R23, R50 ;\n", + " /*2048*/ DMUL R36, R44, R66 ;\n", + " /*2050*/ DMUL R66, R20, R20 ;\n", + " /*2058*/ F2F.F32.F64 R0, R30 ;\n", + " /*2068*/ @!P0 FMUL R23, R23, 1.175494350822287508e-38 ;\n", + " /*2070*/ { @!P1 FMUL R22, R22, 1.175494350822287508e-38 ;\n", + " /*2078*/ DEPBAR.LE SB5, 0x2 }\n", + " /*2088*/ DSETP.GEU.AND P0, PT, 
|R36|, c[0x2][0x0], PT ;\n", + " /*2090*/ F2F.F32.F64 R36, R36 ;\n", + " /*2098*/ DFMA R52, R24, R24, R66 ;\n", + " /*20a8*/ FMUL.FTZ R37, R22, 1 ;\n", + " /*20b0*/ DFMA R14, R16, 0.5, R14 ;\n", + " /*20b8*/ FMUL.FTZ R16, R64, 1 ;\n", + " /*20c8*/ @!P2 FMUL R0, R0, 1.175494350822287508e-38 ;\n", + " /*20d0*/ FMUL.FTZ R30, R23, 1 ;\n", + " /*20d8*/ DFMA R18, R40, 0.5, R18 ;\n", + " /*20e8*/ DFMA R22, R2, R2, R52 ;\n", + " /*20f0*/ F2F.F64.F32 R40, R37 ;\n", + " /*20f8*/ F2F.F64.F32 R16, R16 ;\n", + " /*2108*/ FMUL.FTZ R37, R0, 1 ;\n", + " /*2110*/ @!P0 FMUL R36, R36, 1.175494350822287508e-38 ;\n", + " /*2118*/ MUFU.SQRT R0, R36 ;\n", + " /*2128*/ F2F.F64.F32 R30, R30 ;\n", + " /*2130*/ DSETP.GEU.AND P0, PT, |R22|, c[0x2][0x0], PT ;\n", + " /*2138*/ DFMA R40, R14, R14, R40 ;\n", + " /*2148*/ DMUL R38, R38, R16 ;\n", + " /*2150*/ DFMA R28, R34, 0.5, R28 ;\n", + " /*2158*/ F2F.F64.F32 R34, R37 ;\n", + " /*2168*/ F2F.F32.F64 R22, R22 ;\n", + " /*2170*/ DMUL R30, R44, R30 ;\n", + " /*2178*/ DSETP.GEU.AND P2, PT, |R38|, c[0x2][0x0], PT ;\n", + " /*2188*/ DFMA R40, R18, R18, R40 ;\n", + " /*2190*/ DFMA R26, R68, 0.5, R26 ;\n", + " /*2198*/ DFMA R44, R28, R28, R34 ;\n", + " /*21a8*/ FMUL.FTZ R34, R0, 1 ;\n", + " /*21b0*/ @!P0 FMUL R22, R22, 1.175494350822287508e-38 ;\n", + " /*21b8*/ { F2F.F32.F64 R37, R38 ;\n", + " /*21c8*/ MUFU.RCP R60, R22 }\n", + " /*21d0*/ { DSETP.GEU.AND P0, PT, |R30|, c[0x2][0x0], PT ;\n", + " /*21d8*/ MUFU.SQRT R36, R22 }\n", + " /*21e8*/ DSETP.GEU.AND P1, PT, |R40|, c[0x2][0x0], PT ;\n", + " /*21f0*/ F2F.F64.F32 R34, R34 ;\n", + " /*21f8*/ DFMA R44, R26, R26, R44 ;\n", + " /*2208*/ F2F.F32.F64 R52, R30 ;\n", + " /*2210*/ F2F.F32.F64 R23, R40 ;\n", + " /*2218*/ @!P2 FMUL R37, R37, 1.175494350822287508e-38 ;\n", + " /*2228*/ DSETP.GEU.AND P3, PT, R34, c[0x2][0x18], PT ;\n", + " /*2230*/ DSETP.GEU.AND P2, PT, |R44|, c[0x2][0x0], PT ;\n", + " /*2238*/ DMUL R30, R58, c[0x2][0x10] ;\n", + " /*2248*/ FMUL.FTZ R37, R37, R60 ;\n", + " /*2250*/ @!P0 FMUL 
R52, R52, 1.175494350822287508e-38 ;\n", + " /*2258*/ { @!P1 FMUL R23, R23, 1.175494350822287508e-38 ;\n", + " /*2268*/ MUFU.SQRT R22, R52 }\n", + " /*2270*/ { F2F.F32.F64 R41, R44 ;\n", + " /*2278*/ MUFU.SQRT R23, R23 }\n", + " /*2288*/ FMUL.FTZ R34, R36, 1 ;\n", + " /*2290*/ DMUL R48, R48, R16 ;\n", + " /*2298*/ F2F.F64.F32 R36, R37 ;\n", + " /*22a8*/ DMUL R16, R42, R16 ;\n", + " /*22b0*/ DFMA R8, R8, R38, R30 ;\n", + " /*22b8*/ F2F.F64.F32 R34, R34 ;\n", + " /*22c8*/ MOV R30, RZ ;\n", + " /*22d0*/ MOV R31, RZ ;\n", + " /*22d8*/ MOV R44, RZ ;\n", + " /*22e8*/ MOV R45, RZ ;\n", + " /*22f0*/ { @!P2 FMUL R41, R41, 1.175494350822287508e-38 ;\n", + " /*22f8*/ @!P3 SYNC (*\"TARGET= .L_13 \"*) }\n", + " /*2308*/ MUFU.SQRT R41, R41 ;\n", + " /*2310*/ FMUL.FTZ R52, R41, 1 ;\n", + " /*2318*/ F2F.F64.F32 R52, R52 ;\n", + " /*2328*/ DMUL R52, R34, R52 ;\n", + " /*2330*/ DSETP.GEU.AND P0, PT, R52, c[0x2][0x18], PT ;\n", + " /*2338*/ @!P0 SYNC (*\"TARGET= .L_13 \"*);\n", + " /*2348*/ IADD32I R60.CC, R62, 0x1000000 ;\n", + " /*2350*/ IADD.X R61, RZ, R63 ;\n", + " /*2358*/ LDG.E.64 R42, [R60+0xa2020] ;\n", + " /*2368*/ MOV32I R66, 0x8cfbbca1 ;\n", + " /*2370*/ MOV32I R67, 0x3fef2cb9 ;\n", + " /*2378*/ DMUL R46, R46, R36 ;\n", + " /*2388*/ DMUL R52, R58, c[0x2][0x28] ;\n", + " /*2390*/ FMUL.FTZ R0, R0, R41 ;\n", + " /*2398*/ DFMA R66, R32, c[0x2][0x28], R66 ;\n", + " /*23a8*/ DMUL R66, R16, R66 ;\n", + " /*23b0*/ DFMA R44, R8, R46, -R66 ;\n", + " /*23b8*/ DFMA R44, R6, R52, R44 ;\n", + " /*23c8*/ DFMA R54, R54, 16.NEG, R44 ;\n", + " /*23d0*/ DMUL R42, R42, 16 ;\n", + " /*23d8*/ DMUL R42, R24, R42 ;\n", + " /*23e8*/ DFMA R28, R28, R42, R54 ;\n", + " /*23f0*/ DFMA R26, R20, R26, R28 ;\n", + " /*23f8*/ DFMA R4, R2, R4, R26 ;\n", + " /*2408*/ DSETP.GEU.AND P0, PT, |R4|, c[0x2][0x0], PT ;\n", + " /*2410*/ F2F.F32.F64 R4, R4 ;\n", + " /*2418*/ MUFU.RCP R5, R0 ;\n", + " /*2428*/ @!P0 FMUL R4, R4, 1.175494350822287508e-38 ;\n", + " /*2430*/ FMUL.FTZ R4, R4, R5 ;\n", + " /*2438*/ { 
F2F.F64.F32 R44, R4 ;\n", + " /*2448*/ SYNC (*\"TARGET= .L_13 \"*) }\n", + ".L_13:\n", + " /*2450*/ { FMUL.FTZ R4, R23, 1 ;\n", + " /*2458*/ SSY `(.L_14) }\n", + " /*2468*/ F2F.F64.F32 R4, R4 ;\n", + " /*2470*/ DMUL R4, R34, R4 ;\n", + " /*2478*/ DSETP.GEU.AND P0, PT, R4, c[0x2][0x18], PT ;\n", + " /*2488*/ FMUL.FTZ R4, R22, 1 ;\n", + " /*2490*/ F2F.F64.F32 R4, R4 ;\n", + " /*2498*/ DSETP.LT.OR P0, PT, R4, c[0x2][0x18], !P0 ;\n", + " /*24a8*/ @P0 SYNC (*\"TARGET= .L_14 \"*);\n", + " /*24b0*/ { MOV32I R6, 0x834fff9c ;\n", + " /*24b8*/ LDG.E.64 R62, [R62+-0x21020] }\n", + " /*24c8*/ MOV32I R7, 0x3ff61152 ;\n", + " /*24d0*/ DMUL R50, R50, R36 ;\n", + " /*24d8*/ DMUL R4, R58, c[0x2][0x20] ;\n", + " /*24e8*/ FMUL.FTZ R22, R22, R23 ;\n", + " /*24f0*/ MUFU.RCP R0, R22 ;\n", + " /*24f8*/ DFMA R6, R32, c[0x2][0x20], R6 ;\n", + " /*2508*/ DMUL R6, R48, R6 ;\n", + " /*2510*/ DFMA R6, R8, R50, -R6 ;\n", + " /*2518*/ DFMA R6, R12, R4, R6 ;\n", + " /*2528*/ DFMA R56, R56, 16.NEG, R6 ;\n", + " /*2530*/ DMUL R26, R62, 16 ;\n", + " /*2538*/ DMUL R24, R24, R26 ;\n", + " /*2548*/ DFMA R14, R14, R24, R56 ;\n", + " /*2550*/ DFMA R20, R18, R20, R14 ;\n", + " /*2558*/ DFMA R10, R2, R10, R20 ;\n", + " /*2568*/ DSETP.GEU.AND P0, PT, |R10|, c[0x2][0x0], PT ;\n", + " /*2570*/ F2F.F32.F64 R10, R10 ;\n", + " /*2578*/ @!P0 FMUL R10, R10, 1.175494350822287508e-38 ;\n", + " /*2588*/ FMUL.FTZ R10, R10, R0 ;\n", + " /*2590*/ F2F.F64.F32 R30, R10 ;\n", + " /*2598*/ { DMUL R30, R30, c[0x2][0x30] ;\n", + " /*25a8*/ SYNC (*\"TARGET= .L_14 \"*) }\n", + ".L_14:\n", + " /*25b0*/ { DMUL R44, R2, R44 ;\n", + " /*25b8*/ S2R R4, SR_TID.Y }\n", + " /*25c8*/ { DADD R58, R58, R58 ;\n", + " /*25d0*/ S2R R7, SR_CTAID.Y }\n", + " /*25d8*/ { DFMA R30, R44, c[0x2][0x38], R30 ;\n", + " /*25e8*/ S2R R0, SR_TID.Z }\n", + " /*25f0*/ { DMUL R48, R48, R58 ;\n", + " /*25f8*/ S2R R5, SR_CTAID.Z }\n", + " /*2608*/ { DFMA R30, R32, 2, R30 ;\n", + " /*2610*/ S2R R8, SR_TID.X }\n", + " /*2618*/ { DFMA R30, R48, c[0x2][0x40], R30 
;\n", + " /*2628*/ S2R R10, SR_CTAID.X }\n", + " /*2630*/ XMAD R4, R7, c[0x0] [0xc], R4 ;\n", + " /*2638*/ XMAD.MRG R9, R7.reuse, c[0x0] [0xc].H1, RZ ;\n", + " /*2648*/ DFMA R16, R16, c[0x2][0x48], R30 ;\n", + " /*2650*/ XMAD.PSL.CBCC R4, R7.H1, R9.H1, R4 ;\n", + " /*2658*/ XMAD R0, R5.reuse, c[0x0] [0x10], R0 ;\n", + " /*2668*/ XMAD.MRG R6, R5, c[0x0] [0x10].H1, RZ ;\n", + " /*2670*/ DFMA R16, R38, c[0x2][0x50], R16 ;\n", + " /*2678*/ IADD32I R4, R4, 0x1 ;\n", + " /*2688*/ XMAD.PSL.CBCC R0, R5.H1, R6.H1, R0 ;\n", + " /*2690*/ MOV32I R5, 0x82 ;\n", + " /*2698*/ MOV32I R6, 0x82 ;\n", + " /*26a8*/ XMAD R3, R4, 0x82, RZ ;\n", + " /*26b0*/ XMAD R8, R10, c[0x0] [0x8], R8 ;\n", + " /*26b8*/ XMAD.MRG R11, R10, c[0x0] [0x8].H1, RZ ;\n", + " /*26c8*/ IADD32I R0, R0, 0x1 ;\n", + " /*26d0*/ XMAD R2, R4.reuse, 0x82, RZ ;\n", + " /*26d8*/ XMAD R5, R4.reuse, R5.H1, RZ ;\n", + " /*26e8*/ XMAD R6, R4.H1.reuse, R6.H1, RZ ;\n", + " /*26f0*/ XMAD.CHI R3, R4.H1, 0x82, R3 ;\n", + " /*26f8*/ XMAD.PSL.CBCC R8, R10.H1, R11.H1, R8 ;\n", + " /*2708*/ MOV32I R10, 0x4204 ;\n", + " /*2710*/ MOV32I R11, 0x4204 ;\n", + " /*2718*/ XMAD R9, R0.reuse, 0x4204, RZ ;\n", + " /*2728*/ XMAD R7, R0, 0x4204, RZ ;\n", + " /*2730*/ XMAD.PSL R4, R4.H1, 0x82, R2 ;\n", + " /*2738*/ IADD32I R8, R8, 0x1 ;\n", + " /*2748*/ XMAD R10, R0.reuse, R10.H1, RZ ;\n", + " /*2750*/ XMAD R11, R0.H1.reuse, R11.H1, RZ ;\n", + " /*2758*/ XMAD.CHI R9, R0.H1.reuse, 0x4204, R9 ;\n", + " /*2768*/ XMAD.PSL R2, R0.H1, 0x4204, R7 ;\n", + " /*2770*/ IADD3.RS R0, R3, R5, R6 ;\n", + " /*2778*/ IADD3.RS R9, R9, R10, R11 ;\n", + " /*2788*/ IADD R2.CC, R2, R4 ;\n", + " /*2790*/ IADD.X R0, R9, R0 ;\n", + " /*2798*/ IADD R8.CC, R8, R2 ;\n", + " /*27a8*/ IADD.X R3, RZ, R0 ;\n", + " /*27b0*/ LEA R2.CC, R8.reuse, c[0x0][0x148], 0x3 ;\n", + " /*27b8*/ LEA.HI.X R0, R8, c[0x0][0x14c], R3, 0x3 ;\n", + " /*27c8*/ IADD32I R2.CC, R2, 0x2000000 ;\n", + " /*27d0*/ IADD.X R3, RZ, R0 ;\n", + " /*27d8*/ STG.E.64 [R2+0x186080], R16 ;\n", + " /*27e8*/ EXIT 
;\n", + " .weak $kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath\n", + " .type $kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath,@function\n", + " .size $kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath,(.L_48 - $kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath)\n", + "$kernel$__cuda_sm20_rcp_rn_ftz_f32_slowpath:\n", + " /*27f0*/ { IADD32I R65, R0, 0x1800000 ;\n", + " /*27f8*/ PBK `(.L_15) }\n", + " /*2808*/ LOP32I.AND R65, R65, 0x7f800000 ;\n", + " /*2810*/ ISETP.NE.U32.AND P2, PT, R65, c[0x2][0x58], PT ;\n", + " /*2818*/ @!P2 MUFU.RCP R64, R0 ;\n", + " /*2828*/ @!P2 BRK (*\"TARGET= .L_15 \"*);\n", + " /*2830*/ ISET.EQ.U32.AND R64, R65, c[0x2][0x5c], PT ;\n", + " /*2838*/ ICMP.NE.U32 R65, RZ, 0x1, R65 ;\n", + " /*2848*/ IADD R64, -R64, RZ ;\n", + " /*2850*/ LOP.OR.NZ P2, RZ, R65, R64 ;\n", + " /*2858*/ @!P2 BRA `(.L_16) ;\n", + " /*2868*/ LOP32I.AND R64, R0, 0x7fffff ;\n", + " /*2870*/ ISET.EQ.U32.AND R64, R64, RZ, PT ;\n", + " /*2878*/ IADD R64, -R64, RZ ;\n", + " /*2888*/ LOP.AND.NZ P2, RZ, R65, R64 ;\n", + " /*2890*/ @P2 LOP32I.AND R64, R0.reuse, 0x80000000 ;\n", + " /*2898*/ @P2 LOP32I.OR R64, R64, 0x800000 ;\n", + " /*28a8*/ { @!P2 LOP32I.AND R64, R0, 0x80000000 ;\n", + " /*28b0*/ BRK (*\"TARGET= .L_15 \"*) }\n", + ".L_16:\n", + " /*28b8*/ MUFU.RCP R64, R0 ;\n", + " /*28c8*/ BRK (*\"TARGET= .L_15 \"*);\n", + ".L_15:\n", + " /*28d0*/ RET ;\n", + ".L_17:\n", + " /*28d8*/ BRA `(.L_17) ;\n", + ".L_48:\n", + "\n", + "108\n", + "36\n", + "116\n", + "152\n", + "29\n", + "3\n" + ] + } + ], + "source": [ + "phi_kernel = create_kernel(\n", + " mu_update_eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (32, 4, 1)\n", + " }).compile()\n", + "\n", + "\n", + "\n", + "code = \"#include <cstdint>\\n\"\n", + "code += \"#define FUNC_PREFIX __global__ __launch_bounds__(128)\\n\"\n", + "code += \"#define RESTRICT const __restrict__\\n\\n\"\n", + "\n", + "#code += str(show_code(phi_kernel.ast))\n", + "code += str(show_code(mu_stag_precomp_kernel)) #\n", + "\n", + 
"cubin = pycuda.compiler.compile(code, options=[\"-w\", \"-std=c++11\", \"-use_fast_math\" ], arch=\"sm_60\")\n", + "\n", + "run([ \"echo \\\"\" + code + \"\\\" >> temp.cubin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "newFile = open(\"temp.cusbin\", \"wb\")\n", + "newFile.write(cubin)\n", + "newFile.close()\n", + "\n", + "result = run([ \"nvdisasm -c temp.cusbin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "print(len(result.stdout.decode(\"utf-8\").split(\"\\n\") ) )\n", + "\n", + "print(result.stdout.decode(\"utf-8\"))\n", + "\n", + "newFile = open(\"temp.disasm\", \"wb\")\n", + "newFile.write(result.stdout)\n", + "newFile.close()\n", + "\n", + "print ( result.stdout.decode(\"utf-8\").count(\"LDG\") )\n", + "print ( result.stdout.decode(\"utf-8\").count(\"DADD\") )\n", + "print ( result.stdout.decode(\"utf-8\").count(\"DMUL\") )\n", + "print ( result.stdout.decode(\"utf-8\").count(\"DFMA\") )\n", + "print ( result.stdout.decode(\"utf-8\").count(\"MUFU\") )\n", + "print ( result.stdout.decode(\"utf-8\").count(\"STG\") )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(show_code(phi_kernel.ast))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for eq in rescheduled_eqs:\n", + " print(eq)\n", + " print(eq.rhs.func)\n", + " for arg in eq.rhs.args:\n", + " print(arg)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d = graphviz.Digraph(engine='dot')\n", + "for eq in rescheduled_eqs:\n", + " #d.node(eq.lhs.name)\n", + " for arg in eq.rhs.atoms():\n", + " if isinstance(arg, sympy.Symbol) and not isinstance(arg, Field.Access):\n", + " d.edge(arg.name, eq.lhs.name)\n", + "d\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/cse_reorder_demo.ipynb b/pystencils_tests/liveness_opts/cse_reorder_demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a17cec6b2890fb48791f2b8f5e81c328accf9b6d --- /dev/null +++ b/pystencils_tests/liveness_opts/cse_reorder_demo.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport pystencils.simp.liveness_opts\n", + "%aimport pystencils.simp.liveness_opts_exp\n", + "%aimport pystencils.shmemvar\n", + "%aimport pystencils.backends.cbackend\n", + "%aimport pystencils.transformations\n", + "\n", + "\n", + "%load_ext line_profiler\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem_tests.config2 import get_system\n", + "from pygrandchem_tests.config import get_system as get_system_simple\n", + "from pystencils.datahandling import SerialDataHandling\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import *\n", + "from pystencils import show_code, Field\n", + "from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts 
import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "\n", + "from pystencils.shmemvar import *\n", + "import graphviz\n", + "\n", + "\n", + "import pycuda\n", + "\n", + "import sys\n", + "from subprocess import run, PIPE\n", + "\n", + "sys.setrecursionlimit(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_SSA(eqs):\n", + "\n", + " phi_kernel = create_kernel(\n", + " eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (32, 4, 1)\n", + " }).compile()\n", + "\n", + "\n", + " code = \"#include <cstdint>\\n\"\n", + " code += \"#define FUNC_PREFIX __global__ __launch_bounds__(128)\\n\"\n", + " code += \"#define RESTRICT __restrict__\\n\\n\"\n", + "\n", + " code += str(show_code(phi_kernel.ast))\n", + "\n", + " cubin = pycuda.compiler.compile(code, options=[\"-w\", \"-std=c++11\", \"-use_fast_math\" ], arch=\"sm_60\")\n", + "\n", + " newFile = open(\"temp.cusbin\", \"wb\")\n", + " newFile.write(cubin)\n", + " newFile.close()\n", + "\n", + " result = run([ \"nvdisasm -c temp.cusbin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + " result_str = result.stdout.decode(\"utf-8\")\n", + " print(len(result_str.split('\\n')))\n", + "\n", + " \n", + " print(result_str)\n", + "\n", + " newFile = open(\"temp.disasm\", \"wb\")\n", + " newFile.write(result.stdout)\n", + " newFile.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def make_graph_viz(eqs):\n", + " d = graphviz.Digraph(engine='dot')\n", + " for eq in eqs:\n", + " #d.node(eq.lhs.name)\n", + " for arg in eq.rhs.atoms():\n", + " if isinstance(arg, sympy.Symbol) and not isinstance(arg, Field.Access):\n", + " d.edge(arg.name, eq.lhs.name)\n", + " return d\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + 
"free_energy = FreeEnergy(config['FreeEnergy'], config['Temperature'])\n", + "\n", + "dh = SerialDataHandling((256, 256, 256), periodicity=(True, True, False))\n", + "f = dh.fields\n", + "dh.add_array('phi_src', values_per_cell=4, layout='fzyx')\n", + "dh.add_array('mu_src', values_per_cell=2, layout='fzyx')\n", + "dh.add_array_like('phi_dst', 'phi_src')\n", + "dh.add_array_like('mu_dst', 'mu_src')\n", + "dh.add_array('c', values_per_cell=2, layout='fzyx')\n", + "\n", + "diffusion_matrices = np.zeros([4, 2, 2])\n", + "diffusion_matrices[0] = config['Parameters']['da']\n", + "diffusion_matrices[1] = config['Parameters']['db']\n", + "diffusion_matrices[2] = config['Parameters']['dg']\n", + "diffusion_matrices[3] = config['Parameters']['dl']\n", + "\n", + "f = dh.fields\n", + "\n", + "#update_eqs = create_phi_update_equations(\n", + "# f['phi_src'],\n", + "# f['phi_dst'],\n", + "# f['mu_src'],\n", + "# free_energy,\n", + "# config['Parameters'],\n", + "# simplex_projection=True)\n", + "\n", + "update_eqs = create_mu_update_equations(\n", + " f['phi_src'], f['phi_dst'], f['mu_src'], f['mu_dst'], free_energy,\n", + " diffusion_matrices, config['Parameters'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "make_graph_viz(update_eqs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_SSA(update_eqs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cse_eqs = sympy_cse_on_assignment_list(update_eqs)\n", + "for eq in cse_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "make_graph_viz(cse_eqs)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_SSA(cse_eqs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rescheduled_eqs = schedule_eqs(cse_eqs)\n", + "\n", + "for eq in rescheduled_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "make_graph_viz(rescheduled_eqs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "get_SSA(rescheduled_eqs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/grandchem_test.py b/pystencils_tests/liveness_opts/grandchem_test.py new file mode 100644 index 0000000000000000000000000000000000000000..78c2e7838c7a8844a6076e0bfaff6b4b4ba881c2 --- /dev/null +++ b/pystencils_tests/liveness_opts/grandchem_test.py @@ -0,0 +1,148 @@ +#coding : utf - 8 + +#In[32]: + +from lbmpy.session import * +from scipy.ndimage.filters import gaussian_filter + +from pygrandchem_tests.config_anisotropic import get_system +from pystencils.datahandling import SerialDataHandling +from pygrandchem.grandchem_generation import * +from pystencils import show_code +import pycuda.driver as drv +from pystencils.simp.liveness_opts import * +from pystencils.simp.liveness_opts_exp import * +from pygrandchem.initialization import * +from pygrandchem.chemicalpotential import free_energy_from_config_object, 
FreeEnergy +from pystencils.boundaries import * + +from pystencils import show_code +import pycuda.driver as drv +from pystencils.simp.liveness_opts import * +from pystencils.simp.liveness_opts_exp import * + +domain_size = (512, 512, 128) +periodicity = (True, True, False) +fast_simplex_projection = True +optimization = { 'gpu_indexing_params': {"block_size": (32, 4, 2)}} +config = get_system(dim=len(domain_size)) + +phases = config['Parameters']['phases'] +components = config['Parameters']['components'] +diffusion_matrices = config['Parameters']['diffusion'] +free_energy = config['FreeEnergy'] + +#Adding fields +dh = create_data_handling(domain_size, periodicity=periodicity, default_target=optimization['target']) +f = dh.fields +phi_src = dh.add_array('phi_src', values_per_cell=phases, layout='fzyx', latex_name='phi_s') +mu_src = dh.add_array('mu_src', values_per_cell=components, layout='fzyx', latex_name="mu_s") +mu_stag = dh.add_array('mu_stag', values_per_cell=(dh.dim, components), layout='f') +phi_dst = dh.add_array_like('phi_dst', 'phi_src') +mu_dst = dh.add_array_like('mu_dst', 'mu_src') + +c = dh.add_array('c', values_per_cell=components, layout='fzyx', gpu=False) + +mu_vanilla_eqs = create_mu_update_equations(phi_src, phi_dst, mu_src, mu_dst, free_energy, diffusion_matrices, + config['Parameters']) + +phi_vanilla_eqs = create_phi_update_equations( + phi_src, phi_dst, mu_src, free_energy, config['Parameters'], simplex_projection=fast_simplex_projection) + +init_boxes(dh, height=0.2) +initialize_concentration_field(dh, free_energy, config['Parameters']['initial_concentration']) +smooth_fields(dh, sigma=0.4, iterations=5, dim=dh.dim) +dh.synchronization_function(['phi_src', 'phi_dst', 'mu_src', 'mu_dst'])() +print(dh) + + +def bench_kernels(mu_kernel, phi_kernel): + + start = drv.Event() + end = drv.Event() + + dh.run_kernel(mu_kernel) + start.record() + dh.run_kernel(mu_kernel) + dh.run_kernel(mu_kernel) + end.record() + end.synchronize() + msec = 
start.time_till(end) / 2 + print("mu_kernel: {:5.3f} ms".format(msec)) + + dh.run_kernel(phi_kernel) + start.record() + dh.run_kernel(phi_kernel) + dh.run_kernel(phi_kernel) + end.record() + end.synchronize() + msec = start.time_till(end) / 2 + print("phi_kernel: {:5.3f} ms".format(msec)) + + +sched_options = [] + +sched_options.append(option_none) +sched_options.append(option_none) +sched_options.append(option_none) +#sched_options.append(option_reschedule) +sched_options.append(option_liveness_opt_transformation) +#sched_options.append(option_dupl_reschedule) +sched_options.append(option_liveness_opt_transformation_shmem) +sched_options.append(option_liveness_opt_transformation_shmem2) +#sched_options.append(option_reschedule_shmem) +#sched_options.append(option_liveness_opt_transformation_shmem) +#sched_options.append(optionFuseSubs) +#sched_options.append(optionFuseFMAs) +#sched_options.append(optionFuseBoth) +#sched_options.append(optionRescheduleAtomize) +#sched_options.append(optionRescheduleAtomizeScramble) +#sched_options.append(optionDuplAtomizeReschedule) +#sched_options.append(optionDuplAtomizeRefuseReschedule) +#sched_options.append(optionDuplRescheduleAtomize) +#sched_options.append(optionDuplRescheduleAtomizeScramble) +#sched_options.append(optionSchedIteration) +#sched_options.append(optionAtomizeRescheduleNoSqrt) +#sched_options.append(optionAtomizeRescheduleNoDiv) +#sched_options.append(optionAtomizeRescheduleNoSqrtDiv) +#sched_options.append(optionAtomizeRescheduleNoPiecewise) +#sched_options.append(optionAtomizeRescheduleNoAll) + +mu_rescheduled_eqs = sched_options[0](mu_vanilla_eqs) +phi_rescheduled_eqs = sched_options[0](phi_vanilla_eqs) + +mu_kernel = create_kernel(mu_rescheduled_eqs, target="gpu", gpu_indexing_params={"block_size": (32, 4, 2)}).compile() + +phi_kernel = create_kernel(phi_rescheduled_eqs, target="gpu", gpu_indexing_params={"block_size": (32, 4, 2)}).compile() + +print("mu_kernel: " + str(mu_kernel.num_regs) + " regs") 
+print("phi_kernel: " + str(phi_kernel.num_regs) + " regs") + +bench_kernels(mu_kernel, phi_kernel) + +print() + +dh.swap('mu_src', 'mu_dst') +dh.swap('phi_src', 'phi_dst') + +for sched_option in sched_options: + print(sched_option.__name__) + mu_rescheduled_eqs = sched_option(mu_vanilla_eqs) + phi_rescheduled_eqs = sched_option(phi_vanilla_eqs) + + mu_kernel = create_kernel( + mu_rescheduled_eqs, target="gpu", gpu_indexing_params={ + "block_size": (32, 4, 2) + }).compile() + + phi_kernel = create_kernel( + phi_rescheduled_eqs, target="gpu", gpu_indexing_params={ + "block_size": (32, 4, 2) + }).compile() + + print("mu_kernel: " + str(mu_kernel.num_regs) + " regs") + print("phi_kernel: " + str(phi_kernel.num_regs) + " regs") + + bench_kernels(mu_kernel, phi_kernel) + + print() diff --git a/pystencils_tests/liveness_opts/grandchem_test_staggered.py b/pystencils_tests/liveness_opts/grandchem_test_staggered.py new file mode 100644 index 0000000000000000000000000000000000000000..a85bf8b02614d4457398562e2d4cf43996179206 --- /dev/null +++ b/pystencils_tests/liveness_opts/grandchem_test_staggered.py @@ -0,0 +1,175 @@ +# coding: utf-8 + +# In[32]: + +import warnings +import pystencils as ps +from pygrandchem.grandchem import GrandChemGenerator +from pygrandchem.scenarios import system_4_2, system_3_1 +from pygrandchem.initialization import init_boxes, smooth_fields +from pygrandchem.scenarios import benchmark_configs + +from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational +from pystencils.simp import sympy_cse_on_assignment_list +from pystencils.simp.liveness_opts import * +from pystencils.simp.liveness_opts_exp import * + +import graphviz + +import pycuda + +import sys +from subprocess import run, PIPE + +from pystencils import show_code +import pycuda.driver as drv + +configs = benchmark_configs() + +def get_config(name): + return configs[name] + + +domain_size = (512, 512, 128) +periodicity = (True, True, False) 
+ +optimization = {'gpu_indexing_params': {"block_size": (32, 4, 2)}} +config = get_config('42_fixT') +phases, components = config['Parameters']['phases'], config['Parameters']['components'] +format_args = {'p': phases, 'c': components, 's': ','.join(str(e) for e in domain_size)} + +# Adding fields +dh = ps.create_data_handling(domain_size, periodicity=periodicity, default_target='gpu') +f = dh.fields +phi_src = dh.add_array( + 'phi_src', values_per_cell=config['Parameters']['phases'], layout='fzyx', latex_name='phi_s') +mu_src = dh.add_array( + 'mu_src', values_per_cell=config['Parameters']['components'], layout='fzyx', latex_name="mu_s") +mu_stag = dh.add_array( + 'mu_stag', values_per_cell=(dh.dim, config['Parameters']['components']), layout='f') +phi_stag = dh.add_array('phi_stag', values_per_cell=(dh.dim, phases), layout='f') + +phi_dst = dh.add_array_like('phi_dst', 'phi_src') +mu_dst = dh.add_array_like('mu_dst', 'mu_src') + +gc = GrandChemGenerator( + phi_src, + phi_dst, + mu_src, + mu_dst, + config['FreeEnergy'], + config['Parameters'], + #conc=c, + mu_staggered=mu_stag, + phi_staggered=phi_stag, + use_block_offsets=False, + compile_kernel=False) + +mu_full_eqs = gc.mu_full() +phi_full_eqs = gc.phi_full() + +mu_partial1_eqs = gc.mu_partial1() +mu_partial2_eqs = gc.mu_partial2() + +phi_partial1_eqs = gc.phi_partial1() +phi_partial2_eqs = gc.phi_partial2() + +phi_kernel = ps.create_kernel(phi_full_eqs, target='gpu', **optimization).compile() +mu_kernel = ps.create_kernel(mu_full_eqs, target='gpu', **optimization).compile() + +c = dh.add_array('c', values_per_cell=config['Parameters']['components'], layout='fzyx', gpu=False) + +init_boxes(dh) +#initialize_concentration_field(dh, free_energy, config['Parameters']['initial_concentration']) +smooth_fields(dh, sigma=0.4, iterations=5, dim=dh.dim) +dh.synchronization_function(['phi_src', 'phi_dst', 'mu_src', 'mu_dst'])() +print(dh) + + +def bench_kernels(mu_kernel, phi_kernel): + + start = drv.Event() + end = 
drv.Event() + + dh.run_kernel(mu_kernel) + start.record() + dh.run_kernel(mu_kernel) + dh.run_kernel(mu_kernel) + end.record() + end.synchronize() + msec = start.time_till(end) / 2 + print("mu_kernel: {} {:5.3f} ms".format(mu_kernel.num_regs, msec)) + + dh.run_kernel(phi_kernel) + start.record() + dh.run_kernel(phi_kernel) + dh.run_kernel(phi_kernel) + end.record() + end.synchronize() + msec = start.time_till(end) / 2 + print("phi_kernel: {} {:5.3f} ms".format(phi_kernel.num_regs, msec)) + + +sched_options = [] + +sched_options.append(option_none) +#sched_options.append(option_reschedule) +sched_options.append(option_liveness_opt_transformation) +#sched_options.append(option_dupl_reschedule) +sched_options.append(option_liveness_opt_transformation_shmem) +sched_options.append(option_liveness_opt_transformation_shmem2) +#sched_options.append(option_reschedule_shmem) +#sched_options.append(option_liveness_opt_transformation_shmem) +#sched_options.append(optionFuseSubs) +#sched_options.append(optionFuseFMAs) +#sched_options.append(optionFuseBoth) +#sched_options.append(optionRescheduleAtomize) +#sched_options.append(optionRescheduleAtomizeScramble) +#sched_options.append(optionDuplAtomizeReschedule) +#sched_options.append(optionDuplAtomizeRefuseReschedule) +#sched_options.append(optionDuplRescheduleAtomize) +#sched_options.append(optionDuplRescheduleAtomizeScramble) +#sched_options.append(optionSchedIteration) +#sched_options.append(optionAtomizeRescheduleNoSqrt) +#sched_options.append(optionAtomizeRescheduleNoDiv) +#sched_options.append(optionAtomizeRescheduleNoSqrtDiv) +#sched_options.append(optionAtomizeRescheduleNoPiecewise) +#sched_options.append(optionAtomizeRescheduleNoAll) + +print("warmup") +bench_kernels(mu_kernel, phi_kernel) +dh.swap('mu_src', 'mu_dst') +dh.swap('phi_src', 'phi_dst') + +for sched_option in sched_options: + mu_full_opt_eqs = sched_option(mu_full_eqs) + phi_full_opt_eqs = sched_option(phi_full_eqs) + + + #mu_partial1_opt_eqs = 
sched_option(mu_partial1_eqs) + #mu_partial2_opt_eqs = sched_option(mu_partial2_eqs) + #phi_partial1_opt_eqs = sched_option(phi_partial1_eqs) + #phi_partial2_opt_eqs = sched_option(phi_partial2_eqs) + + mu_full_opt_kernel = ps.create_kernel(mu_full_opt_eqs, target='gpu', **optimization).compile() + phi_full_opt_kernel = ps.create_kernel(phi_full_opt_eqs, target='gpu', **optimization).compile() + + #mu_partial1_opt_kernel = ps.create_staggered_kernel(mu_stag, + # mu_partial1_opt_eqs, target='gpu', **optimization).compile() + #mu_partial2_opt_kernel = ps.create_kernel( + # mu_partial2_opt_eqs, target='gpu', **optimization).compile() + + #phi_partial1_opt_kernel = ps.create_kernel( + # phi_partial1_opt_eqs, target='gpu', **optimization).compile() + #phi_partial2_opt_kernel = ps.create_kernel( + # phi_partial2_opt_eqs, target='gpu', **optimization).compile() + + bench_kernels(mu_full_opt_kernel, phi_kernel) + #bench_kernels(mu_partial1_opt_kernel, phi_kernel) + #bench_kernels(mu_partial2_opt_kernel, phi_kernel) + + bench_kernels(mu_kernel, phi_full_opt_kernel) + #bench_kernels(mu_kernel, phi_partial1_opt_kernel) + #bench_kernels(mu_kernel, phi_partial2_opt_kernel) + +print() diff --git a/pystencils_tests/liveness_opts/kernel_split.ipynb b/pystencils_tests/liveness_opts/kernel_split.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2f811e70ce7ec107c8c4c098b9e5576bfdc9153b --- /dev/null +++ b/pystencils_tests/liveness_opts/kernel_split.ipynb @@ -0,0 +1,900 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport pystencils.simp.liveness_opts\n", + "%aimport pystencils.simp.liveness_opts_exp\n", + "%aimport pystencils.shmemvar\n", + "%aimport pystencils.backends.cbackend\n", + "%aimport pystencils.transformations\n", + "\n", + "\n", + "%load_ext line_profiler\n", + 
"\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"<style>.container { width:100% !important; }</style>\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem_tests.config2 import get_system\n", + "from pygrandchem_tests.config import get_system as get_system_simple\n", + "from pystencils.datahandling import SerialDataHandling\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import *\n", + "from pystencils import show_code, Field\n", + "from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "import random\n", + "from pystencils.shmemvar import *\n", + "import graphviz\n", + "\n", + "\n", + "import pycuda\n", + "\n", + "import sys\n", + "from subprocess import run, PIPE\n", + "\n", + "sys.setrecursionlimit(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + "free_energy = FreeEnergy(config['FreeEnergy'], config['Temperature'])\n", + "\n", + "dh = SerialDataHandling((256, 256, 256), periodicity=(True, True, False))\n", + "f = dh.fields\n", + "dh.add_array('phi_src', values_per_cell=4, layout='fzyx')\n", + "dh.add_array('mu_src', values_per_cell=2, layout='fzyx')\n", + "dh.add_array_like('phi_dst', 'phi_src')\n", + "dh.add_array_like('mu_dst', 'mu_src')\n", + "dh.add_array('c', values_per_cell=2, layout='fzyx')\n", + "\n", + "diffusion_matrices = np.zeros([4, 2, 2])\n", + "diffusion_matrices[0] = config['Parameters']['da']\n", + "diffusion_matrices[1] = 
config['Parameters']['db']\n", + "diffusion_matrices[2] = config['Parameters']['dg']\n", + "diffusion_matrices[3] = config['Parameters']['dl']\n", + "\n", + "f = dh.fields\n", + "\n", + "#update_eqs = create_phi_update_equations(\n", + "# f['phi_src'],\n", + "# f['phi_dst'],\n", + "# f['mu_src'],\n", + "# free_energy,\n", + "# config['Parameters'],\n", + "# simplex_projection=True)\n", + "\n", + "update_eqs = create_mu_update_equations(\n", + " f['phi_src'], f['phi_dst'], f['mu_src'], f['mu_dst'], free_energy,\n", + " diffusion_matrices, config['Parameters'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "update_eqs = sympy_cse_on_assignment_list(update_eqs)\n", + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "update_eqs = merge_field_accesses(update_eqs)\n", + "\n", + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "update_eqs = schedule_eqs(update_eqs)\n", + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = Symbol(\"a\")\n", + "b = Symbol(\"b\")\n", + "c = Symbol(\"c\")\n", + "d = Symbol(\"d\")\n", + "\n", + "\n", + "fake_eqs = [\n", + " Assignment(a, sympy.Add(sympy.Mul(0.1, f['phi_src'][1, 0, 0](0)), sympy.Mul(0.1, f['phi_src'][0, 0, 0](0)))),\n", + " Assignment(b, sympy.Add(sympy.Mul(0.1, f['phi_src'][-1, 0, 0](0)), sympy.Mul(0.1, f['phi_src'][0, 0, 0](0)))),\n", + " Assignment(c, sympy.Add(sympy.Mul(0.1, f['phi_src'][0, 1, 0](0)), sympy.Mul(0.1, f['phi_src'][0, 0, 0](0)))),\n", + " Assignment(d, 
sympy.Add(sympy.Mul(0.1, f['phi_src'][0, -1, 0](0)), sympy.Mul(0.1, f['phi_src'][0, 0, 0](0)))),\n", + " Assignment(f['phi_dst'][0, 0, 0](0), sympy.Add(a, b, c, d))\n", + "]\n", + "\n", + "fake_eqs = schedule_eqs(atomize_eqs(merge_field_accesses(sympy_cse_on_assignment_list(fake_eqs))))\n", + "\n", + "for eq in fake_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def split_eqs(eqs):\n", + " \n", + " top_sym = []\n", + " bottom_sym = [eq.lhs for eq in eqs]\n", + " random.shuffle(bottom_sym)\n", + "\n", + " #\n", + "\n", + "\n", + " for n in range(0,10):\n", + " forces = { sym : (len(bottom_sym)) / (len(bottom_sym) + len(top_sym)) for sym in bottom_sym }\n", + " forces.update ({ sym : -(len(top_sym)) / (len(bottom_sym) + len(top_sym)) for sym in top_sym })\n", + " for i in range(0, 2):\n", + " for sym in bottom_sym + top_sym:\n", + " if sym in bottom_sym:\n", + " new_force = (len(bottom_sym)) / (len(bottom_sym) + len(top_sym))\n", + " if sym in top_sym:\n", + " new_force = -(len(top_sym)) / (len(bottom_sym) + len(top_sym))\n", + " for n in used_nodes[sym]:\n", + " if n in top_sym:\n", + " new_force += forces[n]\n", + " for n in used_by.get(sym, []):\n", + " new_force += forces[n]\n", + " forces[sym] = new_force / 4\n", + " \n", + " strongest_upforce = 0\n", + " strongest_upforced_node = None\n", + " for f in bottom_sym:\n", + " if forces[f] > strongest_upforce and set(used_nodes[f]) <= set(top_sym) and f in bottom_sym:\n", + " strongest_upforce = forces[f]\n", + " strongest_upforced_node = f\n", + " \n", + " \n", + " strongest_downforce = 0\n", + " strongest_downforced_node = None\n", + " for f in top_sym:\n", + " if forces[f] < strongest_downforce and set(used_by[f]) <= set(bottom_sym) and f in top_sym:\n", + " strongest_downforce = forces[f]\n", + " strongest_downforced_node = f \n", + " print(strongest_downforced_node)\n", + " print(strongest_upforced_node)\n", + " 
print()\n", + " \n", + " if (abs(strongest_downforce) < abs(strongest_upforce)):\n", + " top_sym.append(strongest_upforced_node)\n", + " bottom_sym.remove(strongest_upforced_node)\n", + " else:\n", + " bottom_sym.append(strongest_downforced_node)\n", + " top_sym.remove(strongest_downforced_node)\n", + " \n", + " return top_sym, bottom_sym\n", + "\n", + "top_sym, bottom_sym = split_eqs(update_eqs)\n", + "\n", + "print(top_sym)\n", + "print(bottom_sym)\n", + "\n", + "graph = graphviz.Digraph(engine='dot')\n", + "\n", + "with graph.subgraph(name=\"cluster_top\") as c:\n", + " for sym in top_sym:\n", + " c.node(sym.name)\n", + "\n", + "with graph.subgraph(name=\"cluster_bottom\") as c:\n", + " for sym in bottom_sym:\n", + " c.node(sym.name)\n", + " \n", + "\n", + "for eq in update_eqs:\n", + " for arg in eq.rhs.atoms():\n", + " if isinstance(arg, sympy.Symbol) and not isinstance(arg, Field.Access):\n", + " graph.edge(arg.name, eq.lhs.name)\n", + " \n", + "graph\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_ancestors(eqs, eq, used_nodes):\n", + " ancestors = set()\n", + " definitions = get_definitions(eqs)\n", + " def walk_up(eq):\n", + " for atom in used_nodes[eq.lhs]:\n", + " if isinstance(atom, Symbol) and atom not in ancestors:\n", + " ancestors.add(atom)\n", + " if atom in definitions:\n", + " walk_up(definitions[atom])\n", + " \n", + " walk_up(eq)\n", + " return ancestors\n", + "\n", + "def get_leaving_edge_count(eqs, eq):\n", + " used_nodes = get_used_nodes(eqs)\n", + " ancestors = get_ancestors(eqs, eq, used_nodes)\n", + " used_by = get_used_by(eqs)\n", + " leaving_edges = 0\n", + " accounted_edges = set()\n", + " for anc in ancestors:\n", + " for u in used_by[anc]:\n", + " if u not in ancestors and u not in accounted_edges:\n", + " leaving_edges += 1\n", + " accounted_edges.add(u)\n", + " return leaving_edges\n", + "\n", + "for eq in update_eqs:\n", + " leaving_edges_count = 
get_leaving_edge_count(update_eqs, eq)\n", + " if leaving_edges_count <= 2:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "used_by = get_used_by(update_eqs)\n", + "definitions = get_definitions(update_eqs)\n", + "\n", + "def get_all_ancestors(eqs):\n", + " all_ancestors = {}\n", + " for eq in eqs:\n", + " ancestors = set()\n", + " for atom in eq.rhs.atoms():\n", + " if isinstance(atom, Symbol) and atom not in ancestors:\n", + " ancestors.add(atom)\n", + " if atom in all_ancestors:\n", + " ancestors.update(all_ancestors[atom])\n", + " all_ancestors[eq] = ancestors\n", + " return all_ancestors\n", + " \n", + "def get_edge_cut_count(top_set, used_by, definitions):\n", + " out_edge_count = 0\n", + " for eq in top_set:\n", + " for u in used_by[eq.lhs]:\n", + " if definitions[u] not in top_set:\n", + " out_edge_count += 1\n", + " break\n", + " \n", + " return out_edge_count\n", + " \n", + "def get_eligible_eqs(top_set, eqs, all_ancestors):\n", + " eligible_eqs = set()\n", + " for eq in eqs:\n", + " if eq in top_set: continue\n", + " ancestors = all_ancestors[eq]\n", + " eligible = True\n", + " for anc in ancestors:\n", + " if not isinstance(anc, Field.Access) and not definitions[anc] in top_set:\n", + " eligible = False\n", + " break\n", + " if eligible: \n", + " eligible_eqs.add(eq)\n", + " return eligible_eqs\n", + " \n", + " \n", + "\n", + "largest_top_set = dict()\n", + "used_by = get_used_by(update_eqs)\n", + "used_nodes = get_used_nodes(update_eqs)\n", + "\n", + "\n", + "\n", + "def all_top_sets(top_set, eligible_eqs):\n", + " largest_top_set = copy.copy(top_set)\n", + " eq_list = list(eligible_eqs)\n", + " random.shuffle(eq_list)\n", + " for e in list(eligible_eqs)[:50]:\n", + " top_set.add(e)\n", + " new_eligible_eqs = copy.copy(eligible_eqs)\n", + " new_eligible_eqs.remove(e)\n", + " for u in used_by[e.lhs]:\n", + " eligible = True\n", + " for atom in used_nodes[u]:\n", + " if 
not isinstance(atom, Field.Access) and definitions[atom] not in top_set: \n", + " eligible = False\n", + " break\n", + " if eligible: \n", + " new_eligible_eqs.add(definitions[u])\n", + " edge_cuts = get_edge_cut_count(top_set, used_by, definitions)\n", + " if edge_cuts <= 3:\n", + " new_top_set = all_top_sets(top_set, new_eligible_eqs)\n", + " if len(new_top_set) > len(largest_top_set):\n", + " largest_top_set = copy.copy(new_top_set)\n", + "\n", + " top_set.remove(e)\n", + " print(len(largest_top_set))\n", + " return largest_top_set\n", + "\n", + "all_ancestors = get_all_ancestors(update_eqs)\n", + "used_by = get_used_by(update_eqs)\n", + "definitions = get_definitions(update_eqs)\n", + "top_set = set()\n", + "eligible_eqs = get_eligible_eqs(top_set, update_eqs, all_ancestors)\n", + "\n", + "\n", + "all_top_sets(top_set, eligible_eqs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_edge_cut_change(top_set, node):\n", + " edge_cut_change = 1\n", + " for e in used_nodes[node]:\n", + " if isinstance(e, Field.Access): continue\n", + " cut = False\n", + " for e2 in used_by[e]:\n", + " if definitions[e2] not in top_set and e2 != node:\n", + " cut = True\n", + " break\n", + " if not cut: edge_cut_change -= 1\n", + " return edge_cut_change\n", + "\n", + "def get_new_eligible_eqs(old_top_set, old_eligible_eqs, e):\n", + " new_eligible_eqs = copy.copy(old_eligible_eqs)\n", + " new_eligible_eqs.remove(e)\n", + " for u in used_by[e.lhs]:\n", + " if isinstance(u, Field.Access): continue\n", + " eligible = True\n", + " for u2 in used_nodes[u]:\n", + " if definitions[u2] not in old_top_set and not isinstance(u2, Field.Access) and u2 != e.lhs:\n", + " eligible = False\n", + " if eligible:\n", + " new_eligible_eqs.add(definitions[u])\n", + " return new_eligible_eqs\n", + "\n", + " \n", + "\n", + "def largest_top_set(eqs, breadth):\n", + " used_nodes = get_used_nodes(eqs)\n", + " used_by = 
get_used_by(eqs)\n", + " all_ancestors = get_all_ancestors(eqs)\n", + " definitions = get_definitions(eqs)\n", + " \n", + " top_set_trace = []\n", + " \n", + " top_sets = set([(frozenset(), 0)])\n", + " eligible_eqs_dict = { frozenset(): get_eligible_eqs(frozenset(), eqs, all_ancestors) }\n", + " \n", + " for i in range(0, 1200):\n", + " candidates = []\n", + " for top_set in top_sets:\n", + " for e in eligible_eqs_dict[top_set[0]]:\n", + " candidates.append((top_set[0], e, top_set[1] + get_edge_cut_change(top_set[0], e.lhs)))\n", + " random.shuffle(candidates)\n", + " candidates.sort(key=lambda c: c[2])\n", + " \n", + " top_sets = set()\n", + " new_eligible_eqs_dict = {}\n", + " for c in candidates[0:breadth]:\n", + " new_top_set = frozenset(list(c[0]) + [c[1]] )\n", + " top_sets.add( ( new_top_set, c[2]) )\n", + " new_eligible_eqs_dict[new_top_set] = get_new_eligible_eqs(c[0], eligible_eqs_dict[c[0]], c[1])\n", + " \n", + " eligible_eqs_dict = new_eligible_eqs_dict\n", + " \n", + " top_set_trace.append((frozenset(list(c[0]) + [c[1]]), c[2] ))\n", + " return top_set_trace\n", + "\n", + "\n", + "def trim_top_set(top_set):\n", + " trimmed_list = list(top_set)\n", + " trimmed_top_set = top_set\n", + " for e in list(top_set):\n", + " on_edge = True\n", + " for u in used_by[e.lhs]:\n", + " if definitions[u] in trimmed_top_set:\n", + " on_edge = False\n", + " break\n", + " if not on_edge: continue\n", + " delta = -1\n", + " for u in used_nodes[e.lhs]:\n", + " already_cut = False\n", + " for u2 in used_by[u]:\n", + " if definitions[u2] not in trimmed_top_set:\n", + " already_cut = True\n", + " if not already_cut: delta += 1\n", + " if delta <= 0: \n", + " trimmed_list.remove(e)\n", + " trimmed_top_set = frozenset(trimmed_list)\n", + " return trimmed_top_set\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "top_sets = largest_top_set(update_eqs, 1)\n", + "plt.plot([t[1] for t in top_sets])\n", + "top_sets = largest_top_set(update_eqs, 4)\n", + "plt.plot([t[1] for t in 
top_sets])\n", + "top_sets = largest_top_set(update_eqs, 16)\n", + "plt.plot([t[1] for t in top_sets])\n", + "top_sets = largest_top_set(update_eqs, 1024)\n", + "plt.plot([t[1] for t in top_sets])\n", + "\n", + "#trimmed_top_sets = [(trim_top_set(t[0]), 0 ) for t in top_sets]\n", + "#retrimmed_top_sets = [(trim_top_set(t[0]), 0 ) for t in trimmed_top_sets]\n", + "#reretrimmed_top_sets = [(trim_top_set(t[0]), 0 ) for t in retrimmed_top_sets]\n", + "#rereretrimmed_top_sets = [(trim_top_set(t[0]), 0 ) for t in reretrimmed_top_sets]\n", + "#rerereretrimmed_top_sets = [(trim_top_set(t[0]), 0 ) for t in rereretrimmed_top_sets]\n", + "\n", + "\n", + "\n", + "#plt.plot([get_edge_cut_count(t[0], used_by, definitions) for t in trimmed_top_sets])\n", + "#plt.plot([get_edge_cut_count(t[0], used_by, definitions) for t in retrimmed_top_sets])\n", + "#plt.plot([get_edge_cut_count(t[0], used_by, definitions) for t in reretrimmed_top_sets])\n", + "#plt.plot([get_edge_cut_count(t[0], used_by, definitions) for t in rereretrimmed_top_sets])\n", + "\n", + "#plt.plot([len(t[0]) for t in trimmed_top_sets], [get_edge_cut_count(t[0], used_by, definitions) for t in trimmed_top_sets])\n", + "#plt.plot([len(t[0]) for t in retrimmed_top_sets], [get_edge_cut_count(t[0], used_by, definitions) for t in retrimmed_top_sets])\n", + "#plt.plot([len(t[0]) for t in reretrimmed_top_sets], [get_edge_cut_count(t[0], used_by, definitions) for t in reretrimmed_top_sets])\n", + "#plt.plot([len(t[0]) for t in rereretrimmed_top_sets], [get_edge_cut_count(t[0], used_by, definitions) for t in rereretrimmed_top_sets])\n", + "#plt.plot([len(t[0]) for t in rerereretrimmed_top_sets], [get_edge_cut_count(t[0], used_by, definitions) for t in rerereretrimmed_top_sets])\n", + "\n", + "\n", + " \n", + "plt.ylim(bottom=0)\n", + "plt.xlim(left=1)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Vertex:\n", + " def __init__(self, name, 
nodes=[], parts = []):\n", + " if nodes == []:\n", + " self.nodes = [self]\n", + " else:\n", + " self.nodes = nodes\n", + " self.name = name\n", + " self.pred = []\n", + " self.succ = []\n", + " self.top_level = -1\n", + " self.parts = parts\n", + " def __str__(self):\n", + " if self.name != \"\": return self.name\n", + " eqstr = self.nodes[0].name\n", + " ctr = 0\n", + " for n in self.nodes[1:]:\n", + " eqstr += \" + \" + n.name\n", + " ctr += 1\n", + " if ctr % int(math.sqrt(len(self.nodes)/2)+1) == 0:\n", + " eqstr += \"\\n\"\n", + " eqstr += \", \" + str(self.top_level)\n", + " return eqstr\n", + " \n", + " def __repr__(self):\n", + " return str(self)\n", + " \n", + "def comp_top_level(vertices): \n", + " def rec_comp_top_level(v, top_level):\n", + " v.top_level = top_level\n", + " if top_level > 15: print(str(v) + \" \" + str(top_level))\n", + " for s in v.succ:\n", + " if top_level > 25: break\n", + " if top_level >= s.top_level or s.top_level == -1:\n", + " rec_comp_top_level(s, top_level + 1)\n", + "\n", + " for v in vertices:\n", + " if len(v.pred) == 0:\n", + " rec_comp_top_level(v, 0)\n", + " \n", + "def comp_top_level2(vertices): \n", + " pass\n", + " done_vertices = set()\n", + " rem_vertices = set(vertices)\n", + " \n", + " top_level = 0\n", + " \n", + " while(len(rem_vertices) > 0):\n", + " for v in list(rem_vertices):\n", + " ready = True\n", + " for p in v.pred:\n", + " if p not in done_vertices:\n", + " ready = False\n", + " break\n", + " if ready:\n", + " v.top_level = top_level\n", + " top_level += 1\n", + " rem_vertices.remove(v)\n", + " done_vertices.add(v)\n", + " \n", + "def build_vertices(eqs):\n", + " vertices = {}\n", + " for eq in eqs:\n", + " new_vertex = Vertex(str(eq.lhs))\n", + " for atom in eq.rhs.atoms(Symbol):\n", + " if not isinstance(atom, Field.Access): new_vertex.pred.append(vertices[atom])\n", + " vertices[eq.lhs] = new_vertex\n", + " for eq in eqs:\n", + " for atom in eq.rhs.atoms(Symbol):\n", + " if atom in vertices:\n", + 
" vertices[atom].succ.append(vertices[eq.lhs])\n", + " \n", + " comp_top_level(list(vertices.values()))\n", + " return vertices\n", + "\n", + "def copy_vertices(vertices):\n", + " translation_dict = {}\n", + " new_vertices = []\n", + " for v in vertices:\n", + " new_vertex = Vertex(v.name, v.nodes, [v])\n", + " translation_dict[v] = new_vertex\n", + " new_vertices.append(new_vertex)\n", + " for v in vertices:\n", + " translation_dict[v].pred = [translation_dict[p] for p in v.pred]\n", + " translation_dict[v].succ = [translation_dict[s] for s in v.succ]\n", + " return new_vertices, translation_dict\n", + " \n", + " \n", + "\n", + "\n", + "def comp_coarse_graph_matching(vertices):\n", + " match = set()\n", + " mark = { v : False for v in vertices }\n", + " for u in vertices:\n", + " if mark[u]: continue\n", + " \n", + " edges = u.succ + u.pred\n", + " edges.sort(key = lambda v : - len(v.nodes))\n", + " for v in u.succ + u.pred:\n", + " if mark[v]: continue \n", + "\n", + " #print(str(u) + \" \" + str(v))\n", + " if v in u.pred:\n", + " if v.top_level != u.top_level-1 and len(v.succ) != 1 and len(u.pred) != 1:\n", + " continue\n", + " match.add((v,u))\n", + " for w in v.succ:\n", + " if v.top_level == w.top_level -1:\n", + " mark[w] = True\n", + " else:\n", + " if u.top_level != v.top_level-1 and len(v.pred) != 1 and len(u.succ) != 1:\n", + " continue\n", + " match.add((u,v))\n", + " for w in u.succ:\n", + " if u.top_level == w.top_level -1:\n", + " mark[w] = True\n", + " mark[u] = True\n", + " mark[v] = True\n", + " break\n", + " return match\n", + "\n", + "def comp_coarse_vertices(vertices, match):\n", + " coarse_vertices, translation_dict = copy_vertices(vertices)\n", + " \n", + " for m in match:\n", + "\n", + " u = translation_dict[m[0]]\n", + " v = translation_dict[m[1]]\n", + " \n", + "\n", + " coarse_vertices.remove(u)\n", + " coarse_vertices.remove(v)\n", + " cv = Vertex(\"\", u.nodes + v.nodes, [m[0],m[1]] )\n", + " coarse_vertices.append(cv)\n", + " \n", + " 
cv.pred = list(set(u.pred + v.pred))\n", + " cv.succ = list(set(u.succ + v.succ))\n", + " cv.pred.remove(u)\n", + " cv.succ.remove(v)\n", + " \n", + " if u in cv.succ: cv.succ.remove(u)\n", + " if v in cv.pred: cv.pred.remove(v)\n", + " \n", + " preds = u.pred + v.pred\n", + " succs = u.succ + v.succ\n", + " \n", + " for p in cv.pred:\n", + " if u in p.succ: p.succ.remove(u)\n", + " if v in p.succ: p.succ.remove(v)\n", + " if cv not in p.succ: p.succ.append(cv)\n", + " \n", + " for p in cv.succ:\n", + " if u in p.pred: p.pred.remove(u)\n", + " if v in p.pred: p.pred.remove(v)\n", + " if cv not in p.pred: p.pred.append(cv)\n", + " \n", + " comp_top_level(coarse_vertices)\n", + " return coarse_vertices\n", + " \n", + "def compute_edge_cost(vertices):\n", + " edge_costs = {}\n", + " for u in vertices:\n", + " for v in u.succ:\n", + " req_nodes = set()\n", + " for n in v.nodes: req_nodes.update(n.pred)\n", + " av_nodes = req_nodes.intersection(set(u.nodes))\n", + " edge_costs[(u,v)] = len(av_nodes)\n", + " return edge_costs\n", + " \n", + " \n", + " \n", + " \n", + "def make_graphviz(vertices):\n", + " graph = graphviz.Digraph(engine='dot')\n", + " for v in vertices:\n", + " for p in v.pred:\n", + " graph.edge(str(p), str(v), spline=\"none\")\n", + " return graph\n", + "\n", + "def make_dual_graphviz(vertices):\n", + " graph = graphviz.Digraph(engine='dot')\n", + " for v in vertices:\n", + " for p in v.pred:\n", + " graph.edge(str(p), str(v))\n", + " for p in v.succ:\n", + " graph.edge(str(v), str(p), arrowhead=\"odot\")\n", + " return graph\n", + "\n", + "def make_graphviz_topset(vertices, topset):\n", + " graph = graphviz.Digraph(engine='dot')\n", + " \n", + " with graph.subgraph(name=\"cluster_top\") as c:\n", + " for v in topset:\n", + " c.node(str(v), color=\"azure2\", style=\"filled\")\n", + " with graph.subgraph(name=\"cluster_bottom\") as c: \n", + " for v in vertices:\n", + " if v not in topset:\n", + " c.node(str(v))\n", + " \n", + " for v in vertices:\n", + 
" for p in v.pred:\n", + " graph.edge(str(p), str(v), spline=\"none\")\n", + " return graph\n", + "\n", + "\n", + "def make_graphviz_with_edge_cost(vertices, edge_cost):\n", + " graph = graphviz.Digraph(engine='dot')\n", + " for v in vertices:\n", + " for p in v.pred:\n", + " graph.edge(str(p), str(v), taillabel=str(edge_cost[(p,v)]))\n", + " return graph\n", + "\n", + "def visualize_matching(vertices, match):\n", + " graph = graphviz.Digraph(engine='dot')\n", + " for v in vertices:\n", + " for p in v.pred:\n", + " if (p, v) in match:\n", + " graph.edge(str(p), str(v), color=\"red\", weight=\"200\")\n", + " else:\n", + " graph.edge(str(p), str(v), weight = \"1\")\n", + " return graph\n", + "\n", + "def visualize_matching2(vertices, match):\n", + " graph = graphviz.Digraph(engine='dot')\n", + " for m in match:\n", + " v = m[1]\n", + " for p in v.pred:\n", + " if (p, v) in match:\n", + " graph.edge(str(p), str(v), color=\"red\", weight=\"200\")\n", + " else:\n", + " graph.edge(str(p), str(v), weight=\"1\")\n", + " for p in v.succ:\n", + " if (v, p) in match:\n", + " graph.edge(str(v), str(p), color=\"red\", weight=\"200\")\n", + " else:\n", + " graph.edge(str(v), str(p), weight=\"1\")\n", + " return graph\n", + "\n", + "\n", + "vertices = list(build_vertices(update_eqs).values())\n", + "#random.shuffle(vertices)\n", + "\n", + "\n", + "#vlevels = [vertices]\n", + "\n", + "vlevels = []\n", + "vlevels.append(vertices)\n", + "print(len(vlevels[-1]))\n", + "for i in range(1,25):\n", + " vlevels[-1].sort(key = lambda v: len(v.nodes))\n", + " match = comp_coarse_graph_matching(vlevels[-1])\n", + " vlevels.append(comp_coarse_vertices(vlevels[-1], match))\n", + " print(len(vlevels[-1]))\n", + "\n", + "edge_costs = [compute_edge_cost(vlevel) for vlevel in vlevels]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def refine_topset(vertices, topset, target_size):\n", + " while(True):\n", + " topset_symbols = sum( 
[v.nodes for v in topset], [])\n", + " reqs = {}\n", + " for v in vertices:\n", + " if v in topset: continue\n", + " for n in v.nodes:\n", + " for p in n.pred:\n", + " if p not in topset_symbols: continue\n", + " if p not in reqs: reqs[p] = 0\n", + " reqs[p] += 1\n", + " \n", + " edges = 0\n", + " for t in topset:\n", + " for n in t.nodes:\n", + " if n in reqs:\n", + " edges += 1\n", + "\n", + " print(\"edges: \" + str(edges))\n", + " print(\"size: \" + str(len(topset_symbols)))\n", + " \n", + " best_v = None\n", + " best_gain = -1\n", + " best_direction = 0\n", + " random.shuffle(vertices)\n", + " for v in vertices:\n", + " if v in topset:\n", + " free = True\n", + " for s in v.succ:\n", + " if s in topset: \n", + " free = False\n", + " break\n", + " if not free: continue\n", + " edge_inc = len(set.union(*[set(n.pred) for n in v.nodes]) - set(v.nodes))\n", + " edge_dec = sum([n in reqs for n in v.nodes])\n", + " edge_delta = edge_inc - edge_dec\n", + " size_delta = -len(v.nodes)\n", + " direction = -1\n", + " else:\n", + " free = True\n", + " for p in v.pred:\n", + " if p not in topset: \n", + " free = False\n", + " break\n", + " if not free: continue\n", + " edge_inc = len(set.union(*[set(n.pred) for s in v.succ for n in s.nodes]).intersection(set(v.nodes)))\n", + " edge_dec = sum([reqs.get(r, 0) == 1 for r in set.union(*[set(n.pred) for n in v.nodes]) ])\n", + " edge_delta = edge_inc - edge_dec\n", + " size_delta = len(v.nodes)\n", + " direction = 1\n", + " \n", + " topset_size = sum([len(t.nodes) for t in topset])\n", + " size_gain = - min(0, (topset_size - target_size) / target_size) + min(0, (topset_size - target_size + size_delta) / target_size) # ((topset_size - target_size) / target_size)**2 - ((topset_size - target_size + size_delta) / target_size)**2\n", + " if topset_size == 0:\n", + " edge_gain = - edge_delta / size_delta\n", + " elif topset_size + size_delta == 0:\n", + " edge_gain = -10 \n", + " else:\n", + " edge_gain = edges / topset_size - (edges 
+ edge_delta) / (topset_size + size_delta)\n", + " #print(v)\n", + " #print(edge_gain)\n", + " #print(size_gain)\n", + " #print()\n", + " total_gain = edge_gain + size_gain\n", + " if total_gain > best_gain:\n", + " best_gain = total_gain\n", + " best_v = v\n", + " best_direction = direction\n", + " \n", + " print(best_gain)\n", + " print(best_v)\n", + " print(best_direction)\n", + " print()\n", + " \n", + "\n", + " if best_v in topset:\n", + " topset.remove(best_v)\n", + " else:\n", + " topset.append(best_v)\n", + "\n", + " if best_gain < 0:\n", + " break\n", + " \n", + " return topset\n", + "\n", + "\n", + "\n", + "\n", + "topset = refine_topset(vlevels[-1], [], 50)\n", + "\n", + "\n", + "for l in range(-2, -26, -1):\n", + " print(\"New level\")\n", + " topset = sum([t.parts for t in topset], [])\n", + " topset = refine_topset(vlevels[l], topset, 50) \n", + " \n", + "make_graphviz_topset(vlevels[l], topset)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/liveness_evo_opt.ipynb b/pystencils_tests/liveness_opts/liveness_evo_opt.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..afb35f23d6375647352e35da11a8b37b7850f279 --- /dev/null +++ b/pystencils_tests/liveness_opts/liveness_evo_opt.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + 
"\n", + "#%load_ext autoreload\n", + "#%autoreload 1\n", + "#%aimport pystencils.simp.liveness_opts\n", + "#%aimport pystencils.simp.liveness_opts_exp\n", + "#%aimport pystencils.simp.liveness_permutations\n", + "\n", + "\n", + "#%load_ext line_profiler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem_tests.config2 import get_system\n", + "from pygrandchem_tests.config import get_system as get_system_simple\n", + "from pystencils.datahandling import SerialDataHandling\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import *\n", + "from pystencils import show_code, Field\n", + "from sympy import Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "import matplotlib.pyplot as plt\n", + "from pystencils.backends.cbackend import generate_c\n", + "import pycuda.driver as drv\n", + "\n", + "import graphviz\n", + "\n", + "from pystencils.simp.liveness_permutations import *\n", + "\n", + "import pycuda\n", + "import pycuda.autoinit # NOQA\n", + "from pycuda.compiler import SourceModule\n", + "\n", + "import sys\n", + "from subprocess import run, PIPE\n", + "\n", + "sys.setrecursionlimit(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + "free_energy = FreeEnergy(config['FreeEnergy'], config['Temperature'])\n", + "\n", + "dh = SerialDataHandling((512, 512, 128), periodicity=(True, True, False))\n", + "f = dh.fields\n", + "dh.add_array('phi_src', values_per_cell=4, layout='fzyx', gpu=True)\n", + "dh.add_array('mu_src', values_per_cell=2, 
layout='fzyx', gpu=True)\n", + "dh.add_array_like('phi_dst', 'phi_src', gpu=True)\n", + "dh.add_array_like('mu_dst', 'mu_src', gpu=True)\n", + "\n", + "dh.add_array('c', values_per_cell=2, layout='fzyx', gpu=True)\n", + "\n", + "# In[34]:\n", + "\n", + "diffusion_matrices = np.zeros([4, 2, 2])\n", + "diffusion_matrices[0] = config['Parameters']['da']\n", + "diffusion_matrices[1] = config['Parameters']['db']\n", + "diffusion_matrices[2] = config['Parameters']['dg']\n", + "diffusion_matrices[3] = config['Parameters']['dl']\n", + "\n", + "f = dh.fields\n", + "\n", + "mu_eqs = create_mu_update_equations(\n", + " f['phi_src'], f['phi_dst'], f['mu_src'], f['mu_dst'], free_energy,\n", + " diffusion_matrices, config['Parameters'])\n", + "\n", + "phi_eqs = create_phi_update_equations(\n", + " f['phi_src'],\n", + " f['phi_dst'],\n", + " f['mu_src'],\n", + " free_energy,\n", + " config['Parameters'],\n", + " simplex_projection=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample = livenessOptSequence()\n", + "sample.opts = [[scheduleEqs, [6]], [fuseFMAs, [1]], [scrambleEqs, [1000]],\n", + " [moveBackward, []], [refuseEqs, [1,1]],\n", + " [mergeFieldAccesses, []], [refuseEqs, [4,3]]]\n", + "\n", + "sample2 = livenessOptSequence()\n", + "sample2.opts = [[scheduleEqs, [6]], [mergeFieldAccesses, []],\n", + " [moveBackward, []], [refuseEqs, [1, 1]],\n", + " [fuseFMAs, [1]], [scrambleEqs, [968]],\n", + " [refuseEqs, [4, 3]], [moveBackward, []],\n", + " [scrambleEqs, [1114]], [fuseFMAs, [1]], [fuseSubs, []] ]\n", + "\n", + "sample3 = livenessOptSequence()\n", + "sample3.opts = [[scheduleEqs, [1]], [mergeFieldAccesses, []], [refuseEqs, [1, 3]] ]\n", + "\n", + "sample4 = livenessOptSequence()\n", + "sample4.opts = [[scheduleEqs, [1]], [mergeFieldAccesses, []], [scrambleEqs, [1000]], [refuseEqs, [1, 3]] ]\n", + "\n", + "sample5 = livenessOptSequence()\n", + "sample5.opts = [[scheduleEqs, [10]], 
[mergeFieldAccesses, []], [refuseEqs, [2, 3]] ]\n", + "\n", + "\n", + "pop = [sample, sample2, sample3, sample4, sample5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "while True:\n", + " pop = evolvePopulation(pop, [mu_eqs, phi_eqs], dh)\n", + " print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/liveness_evo_opt.py b/pystencils_tests/liveness_opts/liveness_evo_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..c13042672ea1e011a52b49c473b69897e9ff3a89 --- /dev/null +++ b/pystencils_tests/liveness_opts/liveness_evo_opt.py @@ -0,0 +1,195 @@ +# coding: utf-8 + +# In[32]: + + +import pickle +import warnings +import pystencils as ps +from pygrandchem.grandchem import GrandChemGenerator +from pygrandchem.scenarios import system_4_2, system_3_1 +from pygrandchem.initialization import init_boxes, smooth_fields +from pygrandchem.scenarios import benchmark_configs + +from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational +from pystencils.simp import sympy_cse_on_assignment_list +from pystencils.simp.liveness_opts import * +from pystencils.simp.liveness_opts_exp import * + +from 
pystencils.simp.liveness_permutations import * + + +import pycuda + +import sys +from subprocess import run, PIPE + +from pystencils import show_code +import pycuda.driver as drv + + +configs = benchmark_configs() + +def get_config(name): + return configs[name] + + +domain_size = (512, 512, 128) +periodicity = (True, True, False) + +optimization = {'gpu_indexing_params': {"block_size": (32, 4, 2)}} + + + +if len(sys.argv) < 4: + print("Kernel, config and div/sqrt arguments are required") + sys.exit(1) # missing CLI arguments: exit with a failure status + +config = get_config(sys.argv[2]) +print ("optimizing for config " + sys.argv[2]) + +approx = False +if sys.argv[3] == "true": + approx = True + + +phases, components = config['Parameters']['phases'], config['Parameters']['components'] +format_args = {'p': phases, 'c': components, 's': ','.join(str(e) for e in domain_size)} + +# Adding fields +dh = ps.create_data_handling(domain_size, periodicity=periodicity, default_target='gpu') +f = dh.fields +phi_src = dh.add_array( + 'phi_src', values_per_cell=config['Parameters']['phases'], layout='fzyx', latex_name='phi_s') +mu_src = dh.add_array( + 'mu_src', values_per_cell=config['Parameters']['components'], layout='fzyx', latex_name="mu_s") +mu_stag = dh.add_array( + 'mu_stag', values_per_cell=(dh.dim, config['Parameters']['components']), layout='f') +phi_stag = dh.add_array('phi_stag', values_per_cell=(dh.dim, phases), layout='f') + +phi_dst = dh.add_array_like('phi_dst', 'phi_src') +mu_dst = dh.add_array_like('mu_dst', 'mu_src') + +gc = GrandChemGenerator( + phi_src, + phi_dst, + mu_src, + mu_dst, + config['FreeEnergy'], + config['Parameters'], + #conc=c, + mu_staggered=mu_stag, + phi_staggered=phi_stag, + use_block_offsets=False, + compile_kernel=False, + fast_divisions=approx, + fast_sqrts=approx) + +mu_full_eqs = gc.mu_full() +phi_full_eqs = gc.phi_full() + + +staggered_params = None + + +if sys.argv[1] == "phi_full": + eqs = phi_full_eqs + +elif sys.argv[1] == "mu_full": + eqs = mu_full_eqs +elif sys.argv[1] == 
"mu_partial1": + staggered_params = gc.mu_partial1() +elif sys.argv[1] == "mu_partial2": + eqs = gc.mu_partial2() +elif sys.argv[1] == "phi_partial1": + staggered_params = gc.phi_partial1() +elif sys.argv[1] == "phi_partial2": + eqs = gc.phi_partial2() +else: + print("Specified kernel does not exist") + sys.exit(1) # unknown kernel name: exit with a failure status + + +if staggered_params is not None: + eqs = unpack_staggered_eqs(*staggered_params) + + +phi_kernel = ps.create_kernel(phi_full_eqs, target='gpu', **optimization).compile() +mu_kernel = ps.create_kernel(mu_full_eqs, target='gpu', **optimization).compile() + +c = dh.add_array('c', values_per_cell=config['Parameters']['components'], layout='fzyx', gpu=False) + +init_boxes(dh) +#initialize_concentration_field(dh, free_energy, config['Parameters']['initial_concentration']) +smooth_fields(dh, sigma=0.4, iterations=5, dim=dh.dim) +dh.synchronization_function(['phi_src', 'phi_dst', 'mu_src', 'mu_dst'])() +print(dh) + + +def bench_kernels(mu_kernel, phi_kernel): + + start = drv.Event() + end = drv.Event() + + dh.run_kernel(mu_kernel, timestep=1) + start.record() + dh.run_kernel(mu_kernel, timestep=1) + dh.run_kernel(mu_kernel, timestep=1) + end.record() + end.synchronize() + msec = start.time_till(end) / 2 + print("mu_kernel: {} {:5.3f} ms".format(mu_kernel.num_regs, msec)) + + dh.run_kernel(phi_kernel, timestep=1) + start.record() + dh.run_kernel(phi_kernel, timestep=1) + dh.run_kernel(phi_kernel, timestep=1) + end.record() + end.synchronize() + msec = start.time_till(end) / 2 + print("phi_kernel: {} {:5.3f} ms".format(phi_kernel.num_regs, msec)) + + + +print("warmup") +bench_kernels(mu_kernel, phi_kernel) +dh.swap('mu_src', 'mu_dst') +dh.swap('phi_src', 'phi_dst') + + + + + + + +bestSeqs = pickle.load(open('best_seq.pickle', 'rb')) + +#bestSeqs= {} +#bestSeqs[( "mu_full", "42_fixT")] = livenessOptSequence([ [schedule_eqs, [12]], [duplicate_trivial_ops, [1, 1]], [refuse_eqs, [0, 1]], [var_to_shmem_lt, [3]]], (64, 4, 2)) + +#bestSeqs[("phi_full", "42_fixT")] = 
livenessOptSequence([[schedule_eqs, [34]]], (16, 2, 2)) + + + + +if (sys.argv[1], sys.argv[2], sys.argv[3]) in bestSeqs: + pop = [bestSeqs[(sys.argv[1], sys.argv[2], sys.argv[3])]] +elif (sys.argv[1], sys.argv[2]) in bestSeqs: + pop = [bestSeqs[(sys.argv[1], sys.argv[2])]] +else: + pop = [livenessOptSequence()] + +print("Loaded seq: " + str(pop[0]) + "\n") + + + + +while True: + pop = evolvePopulation(pop, [eqs], dh, staggered_params) + bench_kernels(mu_kernel, phi_kernel) + + bestSeqs = pickle.load(open('best_seq.pickle', 'rb')) + bestSeqs[(sys.argv[1], sys.argv[2], sys.argv[3])] = pop[0] + pickle.dump(bestSeqs, open('best_seq.pickle', 'wb')) + print() + diff --git a/pystencils_tests/liveness_opts/reorder_depth.ipynb b/pystencils_tests/liveness_opts/reorder_depth.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e77ffaab3d2ac2404a2ef47101beb67d9fbc821f --- /dev/null +++ b/pystencils_tests/liveness_opts/reorder_depth.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "\n", + "#%load_ext autoreload\n", + "#%autoreload 1\n", + "#%aimport pystencils.simp.liveness_opts\n", + "#%aimport pystencils.simp.liveness_opts_exp\n", + "\n", + "#%load_ext line_profiler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem_tests.config2 import get_system\n", + "from pygrandchem_tests.config import get_system as get_system_simple\n", + "from pystencils.datahandling import SerialDataHandling\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import *\n", + "from pystencils import show_code, Field\n", + "from sympy import Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, 
relational\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "import matplotlib.pyplot as plt\n", + "from pystencils.backends.cbackend import generate_c\n", + "import pycuda.driver as drv\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import graphviz\n", + "\n", + "\n", + "import pycuda\n", + "import pycuda.autoinit # NOQA\n", + "from pycuda.compiler import SourceModule\n", + "\n", + "import sys\n", + "from subprocess import run, PIPE\n", + "\n", + "sys.setrecursionlimit(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def bench_kernel(kernel):\n", + " \n", + " start = drv.Event()\n", + " end = drv.Event()\n", + "\n", + " dh.run_kernel(kernel)\n", + " start.record()\n", + " dh.run_kernel(kernel)\n", + " end.record()\n", + " end.synchronize()\n", + " msec = start.time_till(end)\n", + "\n", + " return msec\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + "free_energy = FreeEnergy(config['FreeEnergy'], config['Temperature'])\n", + "\n", + "dh = SerialDataHandling((128, 128, 128), periodicity=(True, True, False))\n", + "f = dh.fields\n", + "dh.add_array('phi_src', values_per_cell=4, layout='fzyx', gpu=True)\n", + "dh.add_array('mu_src', values_per_cell=2, layout='fzyx', gpu=True)\n", + "dh.add_array_like('phi_dst', 'phi_src', gpu=True)\n", + "dh.add_array_like('mu_dst', 'mu_src', gpu=True)\n", + "\n", + "dh.add_array('c', values_per_cell=2, layout='fzyx', gpu=True)\n", + "\n", + "# In[34]:\n", + "\n", + "diffusion_matrices = np.zeros([4, 2, 2])\n", + "diffusion_matrices[0] = config['Parameters']['da']\n", + 
"diffusion_matrices[1] = config['Parameters']['db']\n", + "diffusion_matrices[2] = config['Parameters']['dg']\n", + "diffusion_matrices[3] = config['Parameters']['dl']\n", + "\n", + "f = dh.fields\n", + "\n", + "eqs = create_mu_update_equations(\n", + " f['phi_src'], f['phi_dst'], f['mu_src'], f['mu_dst'], free_energy,\n", + " diffusion_matrices, config['Parameters'])\n", + "\n", + "#eqs = create_phi_update_equations(\n", + "# f['phi_src'],\n", + "# f['phi_dst'],\n", + "# f['mu_src'],\n", + "# free_energy,\n", + "# config['Parameters'],\n", + "# simplex_projection=True)\n", + "\n", + "update_eqs = sympy_cse_on_assignment_list(eqs)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n = 0\n", + "ana_regs_result = []\n", + "nvcc_regs_result = []\n", + "time_result = []\n", + "while n <= 16:\n", + " ana_regs_result.append([])\n", + " nvcc_regs_result.append([])\n", + " time_result.append([])\n", + " for i in range(0, 5):\n", + " if n != 0:\n", + " rescheduled_eqs = var_to_shmem(duplicate_trivial_ops(schedule_eqs(update_eqs, n)), 1)\n", + " else:\n", + " rescheduled_eqs = var_to_shmem(duplicate_trivial_ops(update_eqs), 1)\n", + " aliveAtPeak, aliveRegs = liveness_analysis(rescheduled_eqs)\n", + " ana_regs_result[-1].append(aliveRegs)\n", + "\n", + " kernel = create_kernel(\n", + " rescheduled_eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (64, 2, 1)\n", + " }).compile()\n", + "\n", + " nvcc_regs_result[-1].append(kernel.num_regs)\n", + " time = bench_kernel(kernel)\n", + " time_result[-1].append(time)\n", + " \n", + " print(str(aliveRegs) + \" \" + str(kernel.num_regs) + \" \" + str(time) )\n", + " print()\n", + " n = max(n*2, n+1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax1 = plt.subplots()\n", + "\n", + "medians = []\n", + "for idx, r in enumerate(ana_regs_result):\n", + " 
r.sort()\n", + " medians.append(r[len(r)//2])\n", + " ax1.plot( [idx] * len(r), r, \"o\", color=\"C0\")\n", + "ax1.plot(medians, color = \"C0\")\n", + "\n", + "ax2 = ax1.twinx()\n", + "\n", + "medians = []\n", + "for idx, r in enumerate(nvcc_regs_result):\n", + " r.sort()\n", + " medians.append(r[len(r)//2])\n", + " ax2.plot( [idx] * len(r), r, \"x\", color=\"C1\")\n", + "ax2.plot(medians, color = \"C1\")\n", + "\n", + "medians = []\n", + "for idx, r in enumerate(time_result):\n", + " r = [r/15 for r in r]\n", + " r.sort()\n", + " medians.append(r[len(r)//2])\n", + " ax1.plot( [idx] * len(r), r, \"+\", color=\"C2\")\n", + "ax1.plot(medians, color = \"C2\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "reorder_depths = [0, 1, 4 , 16, 32, 64]\n", + "shmem_counts = [0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]\n", + "\n", + "nvcc_regs = np.zeros((len(reorder_depths), len(shmem_counts)))\n", + "times = np.zeros((len(reorder_depths), len(shmem_counts)))\n", + "\n", + "for didx, reorder_depth in enumerate(reorder_depths):\n", + " for sidx, shmem_count in enumerate(shmem_counts):\n", + " for i in range(0, 1):\n", + " if reorder_depth != 0: # depth 0 = baseline without rescheduling\n", + " rescheduled_eqs = var_to_shmem(duplicate_trivial_ops(schedule_eqs(update_eqs, reorder_depth)), shmem_count)\n", + " else:\n", + " rescheduled_eqs = var_to_shmem(duplicate_trivial_ops(update_eqs), shmem_count)\n", + " \n", + " aliveAtPeak, aliveRegs = liveness_analysis(rescheduled_eqs)\n", + "\n", + " kernel = create_kernel(\n", + " rescheduled_eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (64, 2, 1)\n", + " }).compile()\n", + "\n", + " time = bench_kernel(kernel)\n", + " nvcc_regs[didx, sidx] = kernel.num_regs\n", + " times[didx, sidx] = time\n", + " \n", + " print(str(aliveRegs) + \" \" + str(kernel.num_regs) + \" \" + str(time) )\n", + " print()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(nvcc_regs)\n", + "plt.colorbar()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(times)\n", + "plt.colorbar()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/sum_tree.ipynb b/pystencils_tests/liveness_opts/sum_tree.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2f9a84465d8a4d14bbc5186d3ad32f45064e3ead --- /dev/null +++ b/pystencils_tests/liveness_opts/sum_tree.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport liveness_opts\n", + "%load_ext line_profiler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lbmpy.session import *\n", + "from pygrandchem_tests.config2 import get_system\n", + "from pygrandchem_tests.config import get_system as get_system_simple\n", + "from pystencils.datahandling import SerialDataHandling\n", + "from pygrandchem.grandchemGeneration import *\n", + "from pygrandchem.chemicalpotential import *\n", + "from pystencils import show_code, Field\n", + "from sympy import Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, sympify\n", + "\n", + "from 
liveness_opts import *\n", + "from subprocess import run, PIPE\n", + "import subprocess\n", + "\n", + "import pycuda\n", + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + "free_energy = FreeEnergy(config['FreeEnergy'], config['Temperature'])\n", + "\n", + "dh = SerialDataHandling((256, ), periodicity=(False))\n", + "f = dh.fields\n", + "dh.add_array('phi_src', values_per_cell=1, layout='f')\n", + "dh.add_array_like('phi_dst', 'phi_src')\n", + "\n", + "f = dh.fields\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eqs = []\n", + "\n", + "num = 512\n", + "\n", + "for i in range(0,num):\n", + " eqs.append(Assignment(Symbol(\"A\" + str(num) + \"_\" + str(i)), f['phi_src'][0] * (0.01 + i / 10.0) + 0.1) )\n", + "\n", + "num = num // 2 \n", + "while num > 0:\n", + " for i in range(0,num):\n", + " eqs.append(Assignment(Symbol(\"A\" + str(num) + \"_\" + str(i)),\n", + " Symbol(\"A\" + str(num*2) + \"_\" + str(i*2)) + Symbol(\"A\" + str(num*2) + \"_\" + str(i*2+1))))\n", + " num = num // 2\n", + "\n", + "eqs.append(Assignment( f['phi_dst'][0], Symbol(\"A1_0\")))\n", + "\n", + "livenessAnalysis(eqs)\n", + "eqs = scheduleEqs3(eqs)\n", + "livenessAnalysis(eqs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kernel = create_kernel(\n", + " eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (256, 1, 1)\n", + " }).compile()\n", + "\n", + "code = \"#include <cstdint>\\n\"\n", + "code += \"#define FUNC_PREFIX __global__\\n\"\n", + "code += \"#define RESTRICT __restrict__\\n\\n\"\n", + "\n", + "code += str(show_code(kernel.ast))\n", + "print(code)\n", + "\n", + "\n", + "cubin = pycuda.compiler.compile(code, options=[\"-w\", \"-std=c++11\"], arch=\"sm_60\")\n", + "\n", + "run([ \"echo \\\"\" + code + 
\"\\\" >> temp.cubin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "newFile = open(\"temp.cusbin\", \"wb\")\n", + "newFile.write(cubin)\n", + "newFile.close()\n", + "\n", + "result = run([ \"nvdisasm -c -plr -lrm narrow temp.cusbin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "print(result.stdout.decode(\"utf-8\"))\n", + "\n", + "newFile = open(\"temp.disasm\", \"wb\")\n", + "newFile.write(result.stdout)\n", + "newFile.close()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/test_grandchem_staggered_gpu.ipynb b/pystencils_tests/liveness_opts/test_grandchem_staggered_gpu.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..496fc07288a8a3383268f1ae52aca6a339b234e2 --- /dev/null +++ b/pystencils_tests/liveness_opts/test_grandchem_staggered_gpu.ipynb @@ -0,0 +1,355 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "\n", + "\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport pystencils.simp.liveness_opts\n", + "%aimport pystencils.simp.liveness_opts_exp\n", + "%aimport pystencils.shmemvar\n", + "%aimport pystencils.backends.cbackend\n", + "%aimport pystencils.transformations\n", + "\n", + "\n", + "%load_ext line_profiler" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import free_energy_from_config_object, FreeEnergy\n", + "from pygrandchem.initialization import *\n", + "from pygrandchem_tests.config_anisotropic import get_system\n", + "from pystencils.boundaries import *\n", + "\n", + "#import pyximport\n", + "#pyximport.install(language_level=3)\n", + "#from lbmpy.phasefield.simplex_projection import simplex_projection_2d, simplex_projection_3d" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "domain_size = (120, 250)\n", + "periodicity=(True, False)\n", + "fast_simplex_projection = True\n", + "optimization = {'cpu_openmp': 4, 'target': 'gpu'}\n", + "config = get_system(dim=len(domain_size))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating data and compute kernels" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "phases = config['Parameters']['phases']\n", + "components = config['Parameters']['components']\n", + "diffusion_matrices = config['Parameters']['diffusion']\n", + "free_energy = config['FreeEnergy']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "256\n", + "256\n" + ] + } + ], + "source": [ + "# Adding fields\n", + "dh = create_data_handling(domain_size, periodicity=periodicity, default_target=optimization['target'])\n", + "f = dh.fields\n", + "phi_src = dh.add_array('phi_src', values_per_cell=phases, layout='fzyx', latex_name='phi_s')\n", + "mu_src = dh.add_array('mu_src', 
values_per_cell=components, layout='fzyx', latex_name=\"mu_s\")\n", + "mu_stag = dh.add_array('mu_stag', values_per_cell=(dh.dim, components), layout='f')\n", + "phi_dst = dh.add_array_like('phi_dst', 'phi_src')\n", + "mu_dst = dh.add_array_like('mu_dst', 'mu_src')\n", + "\n", + "c = dh.add_array('c', values_per_cell=components, layout='fzyx', gpu=False);\n", + "\n", + "# Boundary\n", + "boundary_phi = BoundaryHandling(dh, phi_dst.name, get_stencil('D2Q9'), target=dh.default_target, name='phi_boundary')\n", + "boundary_mu = BoundaryHandling(dh, mu_dst.name, get_stencil('D2Q9'), target=dh.default_target, name='mu_boundary')\n", + "\n", + "neumann_boundaries = ('N', 'S')\n", + "for direction in neumann_boundaries:\n", + " boundary_phi.set_boundary(Neumann(), slice_from_direction(direction, dh.dim))\n", + " boundary_mu.set_boundary(Neumann(), slice_from_direction(direction, dh.dim))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute kernels" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compiling and simplifying φ update equations - this may take a while\n", + "Compiling and simplifying μ update equations - this may take a while\n", + "Compiling and simplifying μ update equations - this may take a while\n", + "256\n", + "256\n", + "256\n", + "256\n" + ] + } + ], + "source": [ + "c_from_mu_eqs = create_concentration_from_mu_kernel(c, phi_src, mu_src, free_energy)\n", + "\n", + "phi_update_eqs = create_phi_update_equations(phi_src, phi_dst, mu_src, free_energy, config['Parameters'], \n", + " simplex_projection=fast_simplex_projection)\n", + "mu_update_eqs = create_mu_update_equations(phi_src, phi_dst, mu_src, mu_dst,\n", + " free_energy, diffusion_matrices,\n", + " config['Parameters'])\n", + "\n", + "mu_stag_update_eqs = create_mu_update_equations(phi_src, phi_dst, mu_src, mu_dst,\n", + " free_energy, diffusion_matrices,\n", 
+ " config['Parameters'], mu_staggered_field=mu_stag)\n", + "\n", + "mu_stag_precomp_eqs = create_mu_update_staggered_equations(phi_src, phi_dst, mu_src, \n", + " mu_stag, free_energy, diffusion_matrices, \n", + " config['Parameters'], target=optimization['target'])\n", + "\n", + "\n", + "\n", + "mu_stag_precomp_kernel = create_mu_update_staggered_ast(phi_src, phi_dst, mu_src, \n", + " mu_stag, free_energy, diffusion_matrices, \n", + " config['Parameters'], target=optimization['target']).compile()\n", + "\n", + "\n", + "phi_kernel = create_kernel(phi_update_eqs, **optimization).compile()\n", + "mu_kernel = create_kernel(mu_update_eqs, **optimization).compile()\n", + "mu_stag_kernel = create_kernel(mu_stag_update_eqs, **optimization).compile()\n", + "\n", + "conc_from_mu = create_kernel(c_from_mu_eqs, target='cpu').compile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Geometry setup" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "256\n", + "256\n", + "256\n", + "256\n", + "256\n", + "256\n", + "256\n", + "256\n", + " Name| Inner (min/max)| WithGl (min/max)\n", + "-------------------------------------------------------------\n", + " c| (0.0254,0.318)| (0.0254,0.318)\n", + " mu_boundaryFlags| ( 1, 1)| ( 1, 2)\n", + " mu_dst| (-48.8,-48.8)| (-48.8,-48.8)\n", + " mu_src| (-48.8,-48.8)| (-48.8,-48.8)\n", + " mu_stag| ( 0, 0)| ( 0, 0)\n", + "phi_boundaryFlags| ( 1, 1)| ( 1, 2)\n", + " phi_dst| ( 0, 1)| ( 0, 1)\n", + " phi_src| ( 0, 1)| ( 0, 1)\n", + "\n" + ] + } + ], + "source": [ + "init_boxes(dh, height=0.2)\n", + "initialize_concentration_field(dh, free_energy, config['Parameters']['initial_concentration'])\n", + "smooth_fields(dh, sigma=0.4, iterations=5, dim=dh.dim)\n", + "dh.synchronization_function(['phi_src','phi_dst','mu_src', 'mu_dst'])()\n", + "print(dh)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "256\n", + "256\n", + "256\n", + "256\n" + ] + } + ], + "source": [ + "import time\n", + "phi_dst_comm = dh.synchronization_function(['phi_dst'])\n", + "mu_dst_comm = dh.synchronization_function(['mu_dst'])\n", + "\n", + "def time_loop(iterations):\n", + " dh.all_to_gpu()\n", + " start = time.perf_counter()\n", + " for t in range(iterations):\n", + " dh.run_kernel(phi_kernel)\n", + " if not fast_simplex_projection:\n", + " for b in dh.iterate():\n", + " simplex_projection_3d(b['phi_dst'])\n", + " \n", + " phi_dst_comm()\n", + " boundary_phi()\n", + " \n", + " #dh.run_kernel(mu_kernel)\n", + " dh.run_kernel(mu_stag_precomp_kernel)\n", + " dh.run_kernel(mu_stag_kernel)\n", + " mu_dst_comm()\n", + " boundary_mu()\n", + " \n", + " dh.swap('mu_src', 'mu_dst')\n", + " dh.swap('phi_src', 'phi_dst')\n", + " duration = (time.perf_counter() - start) / iterations\n", + " dh.all_to_cpu()\n", + " return duration" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAANQAAAAPBAMAAABuCfzHAAAAMFBMVEX///8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAv3aB7AAAAD3RSTlMAiXaZIs1UEN1mu6tEMu+iw/3TAAAACXBIWXMAAA7EAAAOxAGVKw4bAAADNElEQVQ4EZVUS2gTURQ9MdOZ5tcm8QeCGKwLQbDBKGLrp+DGXQui+EGdhWs7omJXbUStooLBDyqIVkQUXRgoiFq0sxFd2dGVG7ELacUibdXEttrG8+5LnGBBMIFz571zzz3vzb0J8K9PDDhcxRtXqhb/9ejBagECydWikugDEgkPdYUdp2Al57rMiKVhlkqlmbIgkFhcZhINlSq+2kg0UZNpdih8Rxjddpq4HR1MLUdZCGxxI1nUnj/mIAg8Il+bR2B3JtOtBcY5PNZMjW2dmKU+hFARHanYR1i9+2hyemIhEMoi0MNKEn3AVoR7ECGBa8ARhuV5VdpIa0FNGvc1cxVYNEvdaFsFPAduU9hIq7wqFM7D/FGJshCQPW21SKxil5kI7mhBO/W04CEeAO2OL9T0axcFfrGEWX+s6tOIFrkh0YeaQe5pq4G7WMeW1sjJFutE7FW0MN8YbV9YqWdOWlO08aqsumg1SZlEH+r3JNfT6kKDh0ipwQaWiZXRoxNReNPgCmMpq5wvrNR7lZNbudrqydINQGscsa+0kuhD107U2Qg7xi++hUILjJxYRVydaBVyGBLGGAc+pHxhud7bPk/16mFOW91Fq1YyWyqMqzwNXWMIDnMb92DOe/gLAYhVaznRKgEbHcWIVdwXyhN1ZhGdbqzf1laciSy64oiqW0n0oT6LEC8EbGq5CHPGuaSt5pcTMQHcdBVzlOccSPnCSj2MOtaqBZVeAcHvqss8gIyFWZSFQG0bQpO4QyuPl2vdn9JWZ8qJOEmrnGJs9qpdjUW1Gu/57mx1UA4qJzDchuC0+vWEZNhVlIUGudUnoN8a43hc7+vrH4nD+sk3IYJ7tHqpmNx9PjlVQkVbJUdb7aIdreqyiIwhNIhgGzck+hBlrwYRBz7jI8u3AHV5IMoraMEAe6WZG8Cav9UY4X+Rc8AxprVVgPIeoBudHn/aEqvgFjptbMacYbxwcIGKeuaaaoCUAOG4NaSZWtv6Mkt9DeYkljvhHPN5KyONlQ5wcOkz4Hg5ykIgkOG2kel1EOvNuGzrvqkUDA64FmBus6cZK7nCm6WOZppcBBLrmf509GwKm5vX4jflHmnZdri02wAAAABJRU5ErkJggg==\n", + "text/latex": [ + "$$0.0006584706800003915$$" + ], + "text/plain": [ + "0.0006584706800003915" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "time_loop(100)\n", + "#assert interface_location() >= 53" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xi_0 ↠phi_src_C^0**2\n", + "xi_1 ↠phi_src_C^1**2\n", + "xi_2 ↠phi_src_C^2**2\n", + "xi_3 ↠xi_0 + xi_1 + xi_2\n", + "xi_4 ↠phi_dst_C^1**2\n", + "xi_5 ↠phi_dst_C^0**2\n", + "xi_6 ↠phi_dst_C^2**2\n", + "xi_7 ↠32.0/(xi_4 + xi_5 + xi_6)\n", + "xi_8 
← 32.0/xi_3\n", + "dc_dmu_0_0 ← xi_3/(0.0277302350554502*xi_0 + 0.0134501298325457*xi_1 + 0.145247543885788*xi_2)\n", + "dc_dphi_dt_0 ← (0.0134501298325457*mu_src_C + 0.974209571226215)*(-xi_1*xi_8 + xi_4*xi_7) + (0.0277302350554502*mu_src_C + 1.37922908109611)*(-xi_0*xi_8 + xi_5*xi_7) + (0.145247543885788*mu_src_C + 7.27037126746791)*(-xi_2*xi_8 + xi_6*xi_7)\n", + "dc_dT_dt_0 ← 0\n", + "divMgradmu_0 ← -2.0*mu_stag_C^0,0 - 2.0*mu_stag_C^1,0 + 2.0*mu_stag_N^1,0 + 2.0*mu_stag_E^0,0\n", + "mu_dst[0,0] ← mu_src_C + 0.03125*dc_dmu_0_0*(-dc_dT_dt_0 - dc_dphi_dt_0 + divMgradmu_0)\n" + ] + } + ], + "source": [ + "for eq in mu_stag_update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/test_liveness_opts.ipynb b/pystencils_tests/liveness_opts/test_liveness_opts.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..67c1a8a549a6e03a1707b9ef00612da44adc12d7 --- /dev/null +++ b/pystencils_tests/liveness_opts/test_liveness_opts.ipynb @@ -0,0 +1,959 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport pystencils.simp.liveness_opts\n", + "%aimport pystencils.simp.liveness_opts_exp\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "import pystencils as ps\n", + "from pygrandchem.grandchem import 
GrandChemGenerator\n", + "from pygrandchem.scenarios import system_4_2, system_3_1\n", + "\n", + "from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "\n", + "import graphviz\n", + "\n", + "#import pycuda\n", + "\n", + "import sys\n", + "from subprocess import run, PIPE\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "rotation = [\n", + " [(None,) * 4, (80, 0.03, 0.5, 0.3), (10, 1, 0, 0)],\n", + " [(None,) * 4, (None,) * 4, (11, 0.7, 0.5, 0.3)],\n", + "]\n", + "\n", + "common = {'noise_amplitude': 0, 'dim': 3}\n", + "configs = {\n", + " '42_fixT': lambda: system_4_2(variable_temperature=False),\n", + " '42_varT': lambda: system_4_2(variable_temperature=True),\n", + " '31_fixT_iso': lambda: system_3_1(variable_temperature=False, fab_value=0, **common),\n", + " '31_varT_iso': lambda: system_3_1(variable_temperature=True, fab_value=0, **common),\n", + " '31_fixT_aniso': lambda: system_3_1(variable_temperature=False, **common),\n", + " '31_varT_aniso': lambda: system_3_1(variable_temperature=True, **common),\n", + " '31_fixT_aniso_rot': lambda: system_3_1(variable_temperature=False, anisotropy_rotation=rotation, **common),\n", + " '31_varT_aniso_rot': lambda: system_3_1(variable_temperature=True, anisotropy_rotation=rotation, **common),\n", + "}\n", + "\n", + "\n", + "def get_config(name):\n", + " return configs[name]()\n", + "\n", + "\n", + "def get_generator(domain_size, config, **kwargs):\n", + " assert len(domain_size) == 3\n", + " assert config['Parameters']['dim'] == 3\n", + "\n", + " phases, components = config['Parameters']['phases'], config['Parameters']['components']\n", + "\n", + " format_args = {'p': phases, 'c': components, 's': ','.join(str(e) for e in 
domain_size)}\n", + " phi_src, phi_dst = ps.fields(\"phi_src({p}), phi_dst({p}) : double[{s}]\".format(**format_args), layout='f')\n", + " mu_src, mu_dst = ps.fields(\"mu_src({c}), mu_dst({c}) : double[{s}]\".format(**format_args), layout='f')\n", + " mu_stag, phi_stag = ps.fields(\"mu_stag(3, {c}), phi_stag(3, {p}) : double[{s}]\".format(**format_args),\n", + " layout='f')\n", + " c = ps.fields(\"c({c}) : double[{s}]\".format(**format_args))\n", + " \n", + " gc = GrandChemGenerator(phi_src, phi_dst, mu_src, mu_dst,\n", + " config['FreeEnergy'], config['Parameters'],\n", + " conc=c, mu_staggered=mu_stag, phi_staggered=phi_stag,\n", + " use_block_offsets=False,\n", + " compile_kernel=False,\n", + " **kwargs)\n", + " return gc" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "gc = get_generator((256, 256, 256), get_config('42_fixT'))\n", + "mu_kernel = gc.mu_full()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xi_0 ← phi_dst_C^0**2\n", + "xi_1 ← phi_dst_C^1**2\n", + "xi_2 ← phi_dst_C^2**2\n", + "xi_3 ← phi_dst_C^3**2\n", + "xi_4 ← phi_src_C^0**2\n", + "xi_5 ← phi_src_C^1**2\n", + "xi_6 ← phi_src_C^2**2\n", + "xi_7 ← phi_src_C^3**2\n", + "xi_8 ← 200.0/((xi_0 + xi_1 + xi_2 + xi_3)*(xi_4 + xi_5 + xi_6 + xi_7))\n", + "xi_9 ← 0.2*xi_0\n", + "xi_10 ← 0.2*xi_1\n", + "xi_11 ← -xi_10*xi_4 + xi_5*xi_9\n", + "xi_12 ← xi_0*xi_7\n", + "xi_13 ← xi_1*xi_7\n", + "xi_14 ← 0.2*xi_2\n", + "xi_15 ← 0.0666666666666667*xi_2*xi_7\n", + "xi_16 ← xi_3*xi_4\n", + "xi_17 ← xi_3*xi_5\n", + "xi_18 ← 0.0666666666666667*xi_3*xi_6\n", + "xi_19 ← 1.0*mu_src_C^0\n", + "xi_20 ← phi_src_C^3/2\n", + "xi_21 ← phi_src_W^3/2 + xi_20\n", + "xi_22 ← xi_21**2\n", + "xi_23 ← phi_src_C^0/2\n", + "xi_24 ← phi_src_W^0/2 + xi_23\n", + "xi_25 ← xi_24**2\n", + "xi_26 ← phi_src_C^1/2\n", + "xi_27 ← phi_src_W^1/2 + xi_26\n", + "xi_28 ← xi_27**2\n", + "xi_29
← phi_src_C^2/2\n", + "xi_30 ← phi_src_W^2/2 + xi_29\n", + "xi_31 ← xi_30**2\n", + "xi_32 ← 1/(xi_22 + xi_25 + xi_28 + xi_31)\n", + "xi_33 ← xi_22*xi_32*(-1.0*mu_src_W^0 + xi_19)\n", + "xi_34 ← 1.0*mu_src_C^1\n", + "xi_35 ← xi_22*xi_32*(-1.0*mu_src_W^1 + xi_34)\n", + "xi_36 ← 1.0*phi_src_C^3\n", + "xi_37 ← -1.0*phi_src_W^3 + xi_36\n", + "xi_38 ← sqrt(xi_21*xi_24)\n", + "xi_39 ← 1.0*phi_src_C^0\n", + "xi_40 ← -1.0*phi_src_W^0 + xi_39\n", + "xi_41 ← -0.25*phi_src_SW^0\n", + "xi_42 ← 0.25*phi_src_NW^0\n", + "xi_43 ← -0.25*phi_src_S^0 + 0.25*phi_src_N^0\n", + "xi_44 ← xi_41 + xi_42 + xi_43\n", + "xi_45 ← -0.25*phi_src_BW^0\n", + "xi_46 ← 0.25*phi_src_TW^0\n", + "xi_47 ← -0.25*phi_src_B^0 + 0.25*phi_src_T^0\n", + "xi_48 ← xi_45 + xi_46 + xi_47\n", + "xi_49 ← sqrt(xi_40**2 + xi_44**2 + xi_48**2)\n", + "xi_50 ← -0.25*phi_src_SW^3\n", + "xi_51 ← 0.25*phi_src_NW^3\n", + "xi_52 ← -0.25*phi_src_S^3 + 0.25*phi_src_N^3\n", + "xi_53 ← xi_50 + xi_51 + xi_52\n", + "xi_54 ← -0.25*phi_src_BW^3\n", + "xi_55 ← 0.25*phi_src_TW^3\n", + "xi_56 ← -0.25*phi_src_B^3 + 0.25*phi_src_T^3\n", + "xi_57 ← xi_54 + xi_55 + xi_56\n", + "xi_58 ← xi_37**2 + xi_53**2 + xi_57**2\n", + "xi_59 ← sqrt(xi_58)\n", + "xi_60 ← (xi_38 < 1.0e-9) | (xi_49*xi_59 < 1.0e-9)\n", + "xi_61 ← 0.166666666666667*mu_src_C^0\n", + "xi_62 ← -0.0833333333333333*mu_src_C^1\n", + "xi_63 ← 0.166666666666667*mu_src_W^0 - 0.0833333333333333*mu_src_W^1 + xi_61 + xi_62\n", + "xi_64 ← xi_22*xi_32\n", + "xi_65 ← xi_64*(xi_63 + 0.333333333333333)\n", + "xi_66 ← xi_25*xi_32\n", + "xi_67 ← 50.0*phi_dst_C^0 - 50.0*phi_src_C^0\n", + "xi_68 ← 1/xi_58\n", + "xi_69 ← xi_22*xi_24*xi_32*xi_68*(50.0*phi_dst_W^0 - 50.0*phi_src_W^0 + xi_67)*(xi_37*xi_40 + xi_44*xi_53 + xi_48*xi_57)/(xi_38*xi_49)\n", + "xi_70 ← sqrt(xi_21*xi_27)\n", + "xi_71 ← 1.0*phi_src_C^1\n", + "xi_72 ← -1.0*phi_src_W^1 + xi_71\n", + "xi_73 ← -0.25*phi_src_SW^1\n", + "xi_74 ← 0.25*phi_src_NW^1\n", + "xi_75 ← -0.25*phi_src_S^1 + 0.25*phi_src_N^1\n", + "xi_76 ← xi_73 + xi_74 + xi_75\n", + "xi_77 ← -0.25*phi_src_BW^1\n", +
"xi_78 ← 0.25*phi_src_TW^1\n", + "xi_79 ← -0.25*phi_src_B^1 + 0.25*phi_src_T^1\n", + "xi_80 ← xi_77 + xi_78 + xi_79\n", + "xi_81 ← sqrt(xi_72**2 + xi_76**2 + xi_80**2)\n", + "xi_82 ← (xi_70 < 1.0e-9) | (xi_59*xi_81 < 1.0e-9)\n", + "xi_83 ← xi_63 + 0.2\n", + "xi_84 ← xi_28*xi_32\n", + "xi_85 ← 50.0*phi_dst_C^1 - 50.0*phi_src_C^1\n", + "xi_86 ← xi_22*xi_27*xi_32*xi_68*(50.0*phi_dst_W^1 - 50.0*phi_src_W^1 + xi_85)*(xi_37*xi_72 + xi_53*xi_76 + xi_57*xi_80)/(xi_70*xi_81)\n", + "xi_87 ← sqrt(xi_21*xi_30)\n", + "xi_88 ← 1.0*phi_src_C^2\n", + "xi_89 ← -1.0*phi_src_W^2 + xi_88\n", + "xi_90 ← -0.25*phi_src_SW^2\n", + "xi_91 ← 0.25*phi_src_NW^2\n", + "xi_92 ← -0.25*phi_src_S^2 + 0.25*phi_src_N^2\n", + "xi_93 ← xi_90 + xi_91 + xi_92\n", + "xi_94 ← -0.25*phi_src_BW^2\n", + "xi_95 ← 0.25*phi_src_TW^2\n", + "xi_96 ← -0.25*phi_src_B^2 + 0.25*phi_src_T^2\n", + "xi_97 ← xi_94 + xi_95 + xi_96\n", + "xi_98 ← sqrt(xi_89**2 + xi_93**2 + xi_97**2)\n", + "xi_99 ← (xi_87 < 1.0e-9) | (xi_59*xi_98 < 1.0e-9)\n", + "xi_100 ← xi_31*xi_32\n", + "xi_101 ← 50.0*phi_dst_C^2 - 50.0*phi_src_C^2\n", + "xi_102 ← xi_22*xi_30*xi_32*xi_68*(50.0*phi_dst_W^2 - 50.0*phi_src_W^2 + xi_101)*(xi_37*xi_89 + xi_53*xi_93 + xi_57*xi_97)/(xi_87*xi_98)\n", + "xi_103 ← -0.0833333333333333*mu_src_C^0\n", + "xi_104 ← 0.166666666666667*mu_src_C^1\n", + "xi_105 ← -0.0833333333333333*mu_src_W^0 + 0.166666666666667*mu_src_W^1 + xi_103 + xi_104\n", + "xi_106 ← xi_64*(xi_105 + 0.333333333333333)\n", + "xi_107 ← xi_105 + 0.2\n", + "xi_108 ← -xi_19\n", + "xi_109 ← phi_src_E^3/2 + xi_20\n", + "xi_110 ← xi_109**2\n", + "xi_111 ← phi_src_E^0/2 + xi_23\n", + "xi_112 ← xi_111**2\n", + "xi_113 ← phi_src_E^1/2 + xi_26\n", + "xi_114 ← xi_113**2\n", + "xi_115 ← phi_src_E^2/2 + xi_29\n", + "xi_116 ← xi_115**2\n", + "xi_117 ← 1/(xi_110 + xi_112 + xi_114 + xi_116)\n", + "xi_118 ← xi_110*xi_117*(1.0*mu_src_E^0 + xi_108)\n", + "xi_119 ← -xi_34\n", + "xi_120 ← xi_110*xi_117*(1.0*mu_src_E^1 + xi_119)\n", + "xi_121 ← -xi_36\n", + "xi_122 ← 1.0*phi_src_E^3 + xi_121\n", + "xi_123
↠sqrt(xi_109*xi_111)\n", + "xi_124 ↠-xi_39\n", + "xi_125 ↠1.0*phi_src_E^0 + xi_124\n", + "xi_126 ↠0.25*phi_src_SE^0\n", + "xi_127 ↠0.25*phi_src_NE^0\n", + "xi_128 ↠-xi_126 + xi_127 + xi_43\n", + "xi_129 ↠0.25*phi_src_BE^0\n", + "xi_130 ↠0.25*phi_src_TE^0\n", + "xi_131 ↠-xi_129 + xi_130 + xi_47\n", + "xi_132 ↠sqrt(xi_125**2 + xi_128**2 + xi_131**2)\n", + "xi_133 ↠0.25*phi_src_SE^3\n", + "xi_134 ↠0.25*phi_src_NE^3\n", + "xi_135 ↠-xi_133 + xi_134 + xi_52\n", + "xi_136 ↠0.25*phi_src_BE^3\n", + "xi_137 ↠0.25*phi_src_TE^3\n", + "xi_138 ↠-xi_136 + xi_137 + xi_56\n", + "xi_139 ↠xi_122**2 + xi_135**2 + xi_138**2\n", + "xi_140 ↠sqrt(xi_139)\n", + "xi_141 ↠(xi_123 < 1.0e-9) | (xi_132*xi_140 < 1.0e-9)\n", + "xi_142 ↠0.166666666666667*mu_src_E^0 - 0.0833333333333333*mu_src_E^1 + xi_61 + xi_62\n", + "xi_143 ↠xi_110*xi_117\n", + "xi_144 ↠xi_143*(xi_142 + 0.333333333333333)\n", + "xi_145 ↠xi_112*xi_117\n", + "xi_146 ↠1/xi_139\n", + "xi_147 ↠xi_110*xi_111*xi_117*xi_146*(50.0*phi_dst_E^0 - 50.0*phi_src_E^0 + xi_67)*(xi_122*xi_125 + xi_128*xi_135 + xi_131*xi_138)/(xi_123*xi_132)\n", + "xi_148 ↠sqrt(xi_109*xi_113)\n", + "xi_149 ↠-xi_71\n", + "xi_150 ↠1.0*phi_src_E^1 + xi_149\n", + "xi_151 ↠0.25*phi_src_SE^1\n", + "xi_152 ↠0.25*phi_src_NE^1\n", + "xi_153 ↠-xi_151 + xi_152 + xi_75\n", + "xi_154 ↠0.25*phi_src_BE^1\n", + "xi_155 ↠0.25*phi_src_TE^1\n", + "xi_156 ↠-xi_154 + xi_155 + xi_79\n", + "xi_157 ↠sqrt(xi_150**2 + xi_153**2 + xi_156**2)\n", + "xi_158 ↠(xi_148 < 1.0e-9) | (xi_140*xi_157 < 1.0e-9)\n", + "xi_159 ↠xi_142 + 0.2\n", + "xi_160 ↠xi_114*xi_117\n", + "xi_161 ↠xi_110*xi_113*xi_117*xi_146*(50.0*phi_dst_E^1 - 50.0*phi_src_E^1 + xi_85)*(xi_122*xi_150 + xi_135*xi_153 + xi_138*xi_156)/(xi_148*xi_157)\n", + "xi_162 ↠sqrt(xi_109*xi_115)\n", + "xi_163 ↠-xi_88\n", + "xi_164 ↠1.0*phi_src_E^2 + xi_163\n", + "xi_165 ↠0.25*phi_src_SE^2\n", + "xi_166 ↠0.25*phi_src_NE^2\n", + "xi_167 ↠-xi_165 + xi_166 + xi_92\n", + "xi_168 ↠0.25*phi_src_BE^2\n", + "xi_169 ↠0.25*phi_src_TE^2\n", + "xi_170 
↠-xi_168 + xi_169 + xi_96\n", + "xi_171 ↠sqrt(xi_164**2 + xi_167**2 + xi_170**2)\n", + "xi_172 ↠(xi_162 < 1.0e-9) | (xi_140*xi_171 < 1.0e-9)\n", + "xi_173 ↠xi_116*xi_117\n", + "xi_174 ↠xi_110*xi_115*xi_117*xi_146*(50.0*phi_dst_E^2 - 50.0*phi_src_E^2 + xi_101)*(xi_122*xi_164 + xi_135*xi_167 + xi_138*xi_170)/(xi_162*xi_171)\n", + "xi_175 ↠-0.0833333333333333*mu_src_E^0 + 0.166666666666667*mu_src_E^1 + xi_103 + xi_104\n", + "xi_176 ↠xi_143*(xi_175 + 0.333333333333333)\n", + "xi_177 ↠xi_175 + 0.2\n", + "xi_178 ↠phi_src_S^3/2 + xi_20\n", + "xi_179 ↠xi_178**2\n", + "xi_180 ↠phi_src_S^0/2 + xi_23\n", + "xi_181 ↠xi_180**2\n", + "xi_182 ↠phi_src_S^1/2 + xi_26\n", + "xi_183 ↠xi_182**2\n", + "xi_184 ↠phi_src_S^2/2 + xi_29\n", + "xi_185 ↠xi_184**2\n", + "xi_186 ↠1/(xi_179 + xi_181 + xi_183 + xi_185)\n", + "xi_187 ↠xi_179*xi_186*(-1.0*mu_src_S^0 + xi_19)\n", + "xi_188 ↠xi_179*xi_186*(-1.0*mu_src_S^1 + xi_34)\n", + "xi_189 ↠-1.0*phi_src_S^3 + xi_36\n", + "xi_190 ↠sqrt(xi_178*xi_180)\n", + "xi_191 ↠-1.0*phi_src_S^0 + xi_39\n", + "xi_192 ↠-0.25*phi_src_W^0 + 0.25*phi_src_E^0\n", + "xi_193 ↠xi_126 + xi_192 + xi_41\n", + "xi_194 ↠-0.25*phi_src_BS^0\n", + "xi_195 ↠0.25*phi_src_TS^0\n", + "xi_196 ↠xi_194 + xi_195 + xi_47\n", + "xi_197 ↠sqrt(xi_191**2 + xi_193**2 + xi_196**2)\n", + "xi_198 ↠-0.25*phi_src_W^3 + 0.25*phi_src_E^3\n", + "xi_199 ↠xi_133 + xi_198 + xi_50\n", + "xi_200 ↠-0.25*phi_src_BS^3\n", + "xi_201 ↠0.25*phi_src_TS^3\n", + "xi_202 ↠xi_200 + xi_201 + xi_56\n", + "xi_203 ↠xi_189**2 + xi_199**2 + xi_202**2\n", + "xi_204 ↠sqrt(xi_203)\n", + "xi_205 ↠(xi_190 < 1.0e-9) | (xi_197*xi_204 < 1.0e-9)\n", + "xi_206 ↠0.166666666666667*mu_src_S^0 - 0.0833333333333333*mu_src_S^1 + xi_61 + xi_62\n", + "xi_207 ↠xi_179*xi_186\n", + "xi_208 ↠xi_207*(xi_206 + 0.333333333333333)\n", + "xi_209 ↠xi_181*xi_186\n", + "xi_210 ↠1/xi_203\n", + "xi_211 ↠xi_179*xi_180*xi_186*xi_210*(50.0*phi_dst_S^0 - 50.0*phi_src_S^0 + xi_67)*(xi_189*xi_191 + xi_193*xi_199 + xi_196*xi_202)/(xi_190*xi_197)\n", + 
"xi_212 ↠sqrt(xi_178*xi_182)\n", + "xi_213 ↠-1.0*phi_src_S^1 + xi_71\n", + "xi_214 ↠-0.25*phi_src_W^1 + 0.25*phi_src_E^1\n", + "xi_215 ↠xi_151 + xi_214 + xi_73\n", + "xi_216 ↠-0.25*phi_src_BS^1\n", + "xi_217 ↠0.25*phi_src_TS^1\n", + "xi_218 ↠xi_216 + xi_217 + xi_79\n", + "xi_219 ↠sqrt(xi_213**2 + xi_215**2 + xi_218**2)\n", + "xi_220 ↠(xi_212 < 1.0e-9) | (xi_204*xi_219 < 1.0e-9)\n", + "xi_221 ↠xi_206 + 0.2\n", + "xi_222 ↠xi_183*xi_186\n", + "xi_223 ↠xi_179*xi_182*xi_186*xi_210*(50.0*phi_dst_S^1 - 50.0*phi_src_S^1 + xi_85)*(xi_189*xi_213 + xi_199*xi_215 + xi_202*xi_218)/(xi_212*xi_219)\n", + "xi_224 ↠sqrt(xi_178*xi_184)\n", + "xi_225 ↠-1.0*phi_src_S^2 + xi_88\n", + "xi_226 ↠-0.25*phi_src_W^2 + 0.25*phi_src_E^2\n", + "xi_227 ↠xi_165 + xi_226 + xi_90\n", + "xi_228 ↠-0.25*phi_src_BS^2\n", + "xi_229 ↠0.25*phi_src_TS^2\n", + "xi_230 ↠xi_228 + xi_229 + xi_96\n", + "xi_231 ↠sqrt(xi_225**2 + xi_227**2 + xi_230**2)\n", + "xi_232 ↠(xi_224 < 1.0e-9) | (xi_204*xi_231 < 1.0e-9)\n", + "xi_233 ↠xi_185*xi_186\n", + "xi_234 ↠xi_179*xi_184*xi_186*xi_210*(50.0*phi_dst_S^2 - 50.0*phi_src_S^2 + xi_101)*(xi_189*xi_225 + xi_199*xi_227 + xi_202*xi_230)/(xi_224*xi_231)\n", + "xi_235 ↠-0.0833333333333333*mu_src_S^0 + 0.166666666666667*mu_src_S^1 + xi_103 + xi_104\n", + "xi_236 ↠xi_207*(xi_235 + 0.333333333333333)\n", + "xi_237 ↠xi_235 + 0.2\n", + "xi_238 ↠phi_src_N^3/2 + xi_20\n", + "xi_239 ↠xi_238**2\n", + "xi_240 ↠phi_src_N^0/2 + xi_23\n", + "xi_241 ↠xi_240**2\n", + "xi_242 ↠phi_src_N^1/2 + xi_26\n", + "xi_243 ↠xi_242**2\n", + "xi_244 ↠phi_src_N^2/2 + xi_29\n", + "xi_245 ↠xi_244**2\n", + "xi_246 ↠1/(xi_239 + xi_241 + xi_243 + xi_245)\n", + "xi_247 ↠xi_239*xi_246*(1.0*mu_src_N^0 + xi_108)\n", + "xi_248 ↠xi_239*xi_246*(1.0*mu_src_N^1 + xi_119)\n", + "xi_249 ↠1.0*phi_src_N^3 + xi_121\n", + "xi_250 ↠sqrt(xi_238*xi_240)\n", + "xi_251 ↠1.0*phi_src_N^0 + xi_124\n", + "xi_252 ↠xi_127 + xi_192 - xi_42\n", + "xi_253 ↠0.25*phi_src_BN^0\n", + "xi_254 ↠0.25*phi_src_TN^0\n", + "xi_255 ↠-xi_253 + xi_254 + 
xi_47\n", + "xi_256 ↠sqrt(xi_251**2 + xi_252**2 + xi_255**2)\n", + "xi_257 ↠xi_134 + xi_198 - xi_51\n", + "xi_258 ↠0.25*phi_src_BN^3\n", + "xi_259 ↠0.25*phi_src_TN^3\n", + "xi_260 ↠-xi_258 + xi_259 + xi_56\n", + "xi_261 ↠xi_249**2 + xi_257**2 + xi_260**2\n", + "xi_262 ↠sqrt(xi_261)\n", + "xi_263 ↠(xi_250 < 1.0e-9) | (xi_256*xi_262 < 1.0e-9)\n", + "xi_264 ↠0.166666666666667*mu_src_N^0 - 0.0833333333333333*mu_src_N^1 + xi_61 + xi_62\n", + "xi_265 ↠xi_239*xi_246\n", + "xi_266 ↠xi_265*(xi_264 + 0.333333333333333)\n", + "xi_267 ↠xi_241*xi_246\n", + "xi_268 ↠1/xi_261\n", + "xi_269 ↠xi_239*xi_240*xi_246*xi_268*(50.0*phi_dst_N^0 - 50.0*phi_src_N^0 + xi_67)*(xi_249*xi_251 + xi_252*xi_257 + xi_255*xi_260)/(xi_250*xi_256)\n", + "xi_270 ↠sqrt(xi_238*xi_242)\n", + "xi_271 ↠1.0*phi_src_N^1 + xi_149\n", + "xi_272 ↠xi_152 + xi_214 - xi_74\n", + "xi_273 ↠0.25*phi_src_BN^1\n", + "xi_274 ↠0.25*phi_src_TN^1\n", + "xi_275 ↠-xi_273 + xi_274 + xi_79\n", + "xi_276 ↠sqrt(xi_271**2 + xi_272**2 + xi_275**2)\n", + "xi_277 ↠(xi_270 < 1.0e-9) | (xi_262*xi_276 < 1.0e-9)\n", + "xi_278 ↠xi_264 + 0.2\n", + "xi_279 ↠xi_243*xi_246\n", + "xi_280 ↠xi_239*xi_242*xi_246*xi_268*(50.0*phi_dst_N^1 - 50.0*phi_src_N^1 + xi_85)*(xi_249*xi_271 + xi_257*xi_272 + xi_260*xi_275)/(xi_270*xi_276)\n", + "xi_281 ↠sqrt(xi_238*xi_244)\n", + "xi_282 ↠1.0*phi_src_N^2 + xi_163\n", + "xi_283 ↠xi_166 + xi_226 - xi_91\n", + "xi_284 ↠0.25*phi_src_BN^2\n", + "xi_285 ↠0.25*phi_src_TN^2\n", + "xi_286 ↠-xi_284 + xi_285 + xi_96\n", + "xi_287 ↠sqrt(xi_282**2 + xi_283**2 + xi_286**2)\n", + "xi_288 ↠(xi_281 < 1.0e-9) | (xi_262*xi_287 < 1.0e-9)\n", + "xi_289 ↠xi_245*xi_246\n", + "xi_290 ↠xi_239*xi_244*xi_246*xi_268*(50.0*phi_dst_N^2 - 50.0*phi_src_N^2 + xi_101)*(xi_249*xi_282 + xi_257*xi_283 + xi_260*xi_286)/(xi_281*xi_287)\n", + "xi_291 ↠-0.0833333333333333*mu_src_N^0 + 0.166666666666667*mu_src_N^1 + xi_103 + xi_104\n", + "xi_292 ↠xi_265*(xi_291 + 0.333333333333333)\n", + "xi_293 ↠xi_291 + 0.2\n", + "xi_294 ↠phi_src_B^3/2 + xi_20\n", 
+ "xi_295 ↠xi_294**2\n", + "xi_296 ↠phi_src_B^0/2 + xi_23\n", + "xi_297 ↠xi_296**2\n", + "xi_298 ↠phi_src_B^1/2 + xi_26\n", + "xi_299 ↠xi_298**2\n", + "xi_300 ↠phi_src_B^2/2 + xi_29\n", + "xi_301 ↠xi_300**2\n", + "xi_302 ↠1/(xi_295 + xi_297 + xi_299 + xi_301)\n", + "xi_303 ↠xi_295*xi_302*(-1.0*mu_src_B^0 + xi_19)\n", + "xi_304 ↠xi_295*xi_302*(-1.0*mu_src_B^1 + xi_34)\n", + "xi_305 ↠-1.0*phi_src_B^3 + xi_36\n", + "xi_306 ↠sqrt(xi_294*xi_296)\n", + "xi_307 ↠-1.0*phi_src_B^0 + xi_39\n", + "xi_308 ↠xi_129 + xi_192 + xi_45\n", + "xi_309 ↠xi_194 + xi_253 + xi_43\n", + "xi_310 ↠sqrt(xi_307**2 + xi_308**2 + xi_309**2)\n", + "xi_311 ↠xi_136 + xi_198 + xi_54\n", + "xi_312 ↠xi_200 + xi_258 + xi_52\n", + "xi_313 ↠xi_305**2 + xi_311**2 + xi_312**2\n", + "xi_314 ↠sqrt(xi_313)\n", + "xi_315 ↠(xi_306 < 1.0e-9) | (xi_310*xi_314 < 1.0e-9)\n", + "xi_316 ↠0.166666666666667*mu_src_B^0 - 0.0833333333333333*mu_src_B^1 + xi_61 + xi_62\n", + "xi_317 ↠xi_295*xi_302\n", + "xi_318 ↠xi_317*(xi_316 + 0.333333333333333)\n", + "xi_319 ↠xi_297*xi_302\n", + "xi_320 ↠1/xi_313\n", + "xi_321 ↠xi_295*xi_296*xi_302*xi_320*(50.0*phi_dst_B^0 - 50.0*phi_src_B^0 + xi_67)*(xi_305*xi_307 + xi_308*xi_311 + xi_309*xi_312)/(xi_306*xi_310)\n", + "xi_322 ↠sqrt(xi_294*xi_298)\n", + "xi_323 ↠-1.0*phi_src_B^1 + xi_71\n", + "xi_324 ↠xi_154 + xi_214 + xi_77\n", + "xi_325 ↠xi_216 + xi_273 + xi_75\n", + "xi_326 ↠sqrt(xi_323**2 + xi_324**2 + xi_325**2)\n", + "xi_327 ↠(xi_322 < 1.0e-9) | (xi_314*xi_326 < 1.0e-9)\n", + "xi_328 ↠xi_316 + 0.2\n", + "xi_329 ↠xi_299*xi_302\n", + "xi_330 ↠xi_295*xi_298*xi_302*xi_320*(50.0*phi_dst_B^1 - 50.0*phi_src_B^1 + xi_85)*(xi_305*xi_323 + xi_311*xi_324 + xi_312*xi_325)/(xi_322*xi_326)\n", + "xi_331 ↠sqrt(xi_294*xi_300)\n", + "xi_332 ↠-1.0*phi_src_B^2 + xi_88\n", + "xi_333 ↠xi_168 + xi_226 + xi_94\n", + "xi_334 ↠xi_228 + xi_284 + xi_92\n", + "xi_335 ↠sqrt(xi_332**2 + xi_333**2 + xi_334**2)\n", + "xi_336 ↠(xi_331 < 1.0e-9) | (xi_314*xi_335 < 1.0e-9)\n", + "xi_337 ↠xi_301*xi_302\n", + "xi_338 
↠xi_295*xi_300*xi_302*xi_320*(50.0*phi_dst_B^2 - 50.0*phi_src_B^2 + xi_101)*(xi_305*xi_332 + xi_311*xi_333 + xi_312*xi_334)/(xi_331*xi_335)\n", + "xi_339 ↠-0.0833333333333333*mu_src_B^0 + 0.166666666666667*mu_src_B^1 + xi_103 + xi_104\n", + "xi_340 ↠xi_317*(xi_339 + 0.333333333333333)\n", + "xi_341 ↠xi_339 + 0.2\n", + "xi_342 ↠phi_src_T^3/2 + xi_20\n", + "xi_343 ↠xi_342**2\n", + "xi_344 ↠phi_src_T^0/2 + xi_23\n", + "xi_345 ↠xi_344**2\n", + "xi_346 ↠phi_src_T^1/2 + xi_26\n", + "xi_347 ↠xi_346**2\n", + "xi_348 ↠phi_src_T^2/2 + xi_29\n", + "xi_349 ↠xi_348**2\n", + "xi_350 ↠1/(xi_343 + xi_345 + xi_347 + xi_349)\n", + "xi_351 ↠xi_343*xi_350*(1.0*mu_src_T^0 + xi_108)\n", + "xi_352 ↠xi_343*xi_350*(1.0*mu_src_T^1 + xi_119)\n", + "xi_353 ↠1.0*phi_src_T^3 + xi_121\n", + "xi_354 ↠sqrt(xi_342*xi_344)\n", + "xi_355 ↠1.0*phi_src_T^0 + xi_124\n", + "xi_356 ↠xi_130 + xi_192 - xi_46\n", + "xi_357 ↠-xi_195 + xi_254 + xi_43\n", + "xi_358 ↠sqrt(xi_355**2 + xi_356**2 + xi_357**2)\n", + "xi_359 ↠xi_137 + xi_198 - xi_55\n", + "xi_360 ↠-xi_201 + xi_259 + xi_52\n", + "xi_361 ↠xi_353**2 + xi_359**2 + xi_360**2\n", + "xi_362 ↠sqrt(xi_361)\n", + "xi_363 ↠(xi_354 < 1.0e-9) | (xi_358*xi_362 < 1.0e-9)\n", + "xi_364 ↠0.166666666666667*mu_src_T^0 - 0.0833333333333333*mu_src_T^1 + xi_61 + xi_62\n", + "xi_365 ↠xi_343*xi_350\n", + "xi_366 ↠xi_365*(xi_364 + 0.333333333333333)\n", + "xi_367 ↠xi_345*xi_350\n", + "xi_368 ↠1/xi_361\n", + "xi_369 ↠xi_343*xi_344*xi_350*xi_368*(50.0*phi_dst_T^0 - 50.0*phi_src_T^0 + xi_67)*(xi_353*xi_355 + xi_356*xi_359 + xi_357*xi_360)/(xi_354*xi_358)\n", + "xi_370 ↠sqrt(xi_342*xi_346)\n", + "xi_371 ↠1.0*phi_src_T^1 + xi_149\n", + "xi_372 ↠xi_155 + xi_214 - xi_78\n", + "xi_373 ↠-xi_217 + xi_274 + xi_75\n", + "xi_374 ↠sqrt(xi_371**2 + xi_372**2 + xi_373**2)\n", + "xi_375 ↠(xi_370 < 1.0e-9) | (xi_362*xi_374 < 1.0e-9)\n", + "xi_376 ↠xi_364 + 0.2\n", + "xi_377 ↠xi_347*xi_350\n", + "xi_378 ↠xi_343*xi_346*xi_350*xi_368*(50.0*phi_dst_T^1 - 50.0*phi_src_T^1 + xi_85)*(xi_353*xi_371 + 
xi_359*xi_372 + xi_360*xi_373)/(xi_370*xi_374)\n", + "xi_379 ↠sqrt(xi_342*xi_348)\n", + "xi_380 ↠1.0*phi_src_T^2 + xi_163\n", + "xi_381 ↠xi_169 + xi_226 - xi_95\n", + "xi_382 ↠-xi_229 + xi_285 + xi_92\n", + "xi_383 ↠sqrt(xi_380**2 + xi_381**2 + xi_382**2)\n", + "xi_384 ↠(xi_379 < 1.0e-9) | (xi_362*xi_383 < 1.0e-9)\n", + "xi_385 ↠xi_349*xi_350\n", + "xi_386 ↠xi_343*xi_348*xi_350*xi_368*(50.0*phi_dst_T^2 - 50.0*phi_src_T^2 + xi_101)*(xi_353*xi_380 + xi_359*xi_381 + xi_360*xi_382)/(xi_379*xi_383)\n", + "xi_387 ↠-0.0833333333333333*mu_src_T^0 + 0.166666666666667*mu_src_T^1 + xi_103 + xi_104\n", + "xi_388 ↠xi_365*(xi_387 + 0.333333333333333)\n", + "xi_389 ↠xi_387 + 0.2\n", + "dc_dmu_0_0 ↠4.00000000000000\n", + "dc_dmu_0_1 ↠2.00000000000000\n", + "dc_dmu_1_0 ↠2.00000000000000\n", + "dc_dmu_1_1 ↠4.00000000000000\n", + "dc_dphi_dt_0 ↠xi_8*(xi_11 + 0.133333333333333*xi_12 - 0.0666666666666667*xi_13 - xi_14*xi_4 - xi_15 - 0.133333333333333*xi_16 + 0.0666666666666667*xi_17 + xi_18 + xi_6*xi_9)\n", + "dc_dphi_dt_1 ↠-xi_8*(-xi_10*xi_6 + xi_11 + 0.0666666666666667*xi_12 - 0.133333333333333*xi_13 + xi_14*xi_5 + xi_15 - 0.0666666666666667*xi_16 + 0.133333333333333*xi_17 - xi_18)\n", + "dc_dT_dt_0 ↠0\n", + "dc_dT_dt_1 ↠0\n", + "staggered_down_0_0 ↠0.333333333333333*xi_33 - 0.166666666666667*xi_35 - xi_37*(3.92699081698724*Piecewise((0, xi_60), (xi_69*(xi_65 - xi_66*(xi_63 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_82), (xi_86*(xi_65 - xi_83*xi_84), True)) + 3.92699081698724*Piecewise((0, xi_99), (xi_102*(-xi_100*xi_83 + xi_65), True)))\n", + "staggered_down_0_1 ↠-0.166666666666667*xi_33 + 0.333333333333333*xi_35 - xi_37*(3.92699081698724*Piecewise((0, xi_60), (xi_69*(xi_106 - xi_107*xi_66), True)) + 3.92699081698724*Piecewise((0, xi_82), (xi_86*(xi_106 - xi_84*(xi_105 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_99), (xi_102*(-xi_100*xi_107 + xi_106), True)))\n", + "staggered_up_0_0 ↠0.333333333333333*xi_118 - 0.166666666666667*xi_120 - 
xi_122*(3.92699081698724*Piecewise((0, xi_141), (xi_147*(xi_144 - xi_145*(xi_142 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_158), (xi_161*(xi_144 - xi_159*xi_160), True)) + 3.92699081698724*Piecewise((0, xi_172), (xi_174*(xi_144 - xi_159*xi_173), True)))\n", + "staggered_up_0_1 ↠-0.166666666666667*xi_118 + 0.333333333333333*xi_120 - xi_122*(3.92699081698724*Piecewise((0, xi_141), (xi_147*(-xi_145*xi_177 + xi_176), True)) + 3.92699081698724*Piecewise((0, xi_158), (xi_161*(-xi_160*(xi_175 + 0.6) + xi_176), True)) + 3.92699081698724*Piecewise((0, xi_172), (xi_174*(-xi_173*xi_177 + xi_176), True)))\n", + "staggered_down_1_0 ↠0.333333333333333*xi_187 - 0.166666666666667*xi_188 - xi_189*(3.92699081698724*Piecewise((0, xi_205), (xi_211*(xi_208 - xi_209*(xi_206 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_220), (xi_223*(xi_208 - xi_221*xi_222), True)) + 3.92699081698724*Piecewise((0, xi_232), (xi_234*(xi_208 - xi_221*xi_233), True)))\n", + "staggered_down_1_1 ↠-0.166666666666667*xi_187 + 0.333333333333333*xi_188 - xi_189*(3.92699081698724*Piecewise((0, xi_205), (xi_211*(-xi_209*xi_237 + xi_236), True)) + 3.92699081698724*Piecewise((0, xi_220), (xi_223*(-xi_222*(xi_235 + 0.6) + xi_236), True)) + 3.92699081698724*Piecewise((0, xi_232), (xi_234*(-xi_233*xi_237 + xi_236), True)))\n", + "staggered_up_1_0 ↠0.333333333333333*xi_247 - 0.166666666666667*xi_248 - xi_249*(3.92699081698724*Piecewise((0, xi_263), (xi_269*(xi_266 - xi_267*(xi_264 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_277), (xi_280*(xi_266 - xi_278*xi_279), True)) + 3.92699081698724*Piecewise((0, xi_288), (xi_290*(xi_266 - xi_278*xi_289), True)))\n", + "staggered_up_1_1 ↠-0.166666666666667*xi_247 + 0.333333333333333*xi_248 - xi_249*(3.92699081698724*Piecewise((0, xi_263), (xi_269*(-xi_267*xi_293 + xi_292), True)) + 3.92699081698724*Piecewise((0, xi_277), (xi_280*(-xi_279*(xi_291 + 0.6) + xi_292), True)) + 3.92699081698724*Piecewise((0, xi_288), (xi_290*(-xi_289*xi_293 + xi_292), True)))\n", 
+ "staggered_down_2_0 ↠0.333333333333333*xi_303 - 0.166666666666667*xi_304 - xi_305*(3.92699081698724*Piecewise((0, xi_315), (xi_321*(xi_318 - xi_319*(xi_316 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_327), (xi_330*(xi_318 - xi_328*xi_329), True)) + 3.92699081698724*Piecewise((0, xi_336), (xi_338*(xi_318 - xi_328*xi_337), True)))\n", + "staggered_down_2_1 ↠-0.166666666666667*xi_303 + 0.333333333333333*xi_304 - xi_305*(3.92699081698724*Piecewise((0, xi_315), (xi_321*(-xi_319*xi_341 + xi_340), True)) + 3.92699081698724*Piecewise((0, xi_327), (xi_330*(-xi_329*(xi_339 + 0.6) + xi_340), True)) + 3.92699081698724*Piecewise((0, xi_336), (xi_338*(-xi_337*xi_341 + xi_340), True)))\n", + "staggered_up_2_0 ↠0.333333333333333*xi_351 - 0.166666666666667*xi_352 - xi_353*(3.92699081698724*Piecewise((0, xi_363), (xi_369*(xi_366 - xi_367*(xi_364 + 0.6)), True)) + 3.92699081698724*Piecewise((0, xi_375), (xi_378*(xi_366 - xi_376*xi_377), True)) + 3.92699081698724*Piecewise((0, xi_384), (xi_386*(xi_366 - xi_376*xi_385), True)))\n", + "staggered_up_2_1 ↠-0.166666666666667*xi_351 + 0.333333333333333*xi_352 - xi_353*(3.92699081698724*Piecewise((0, xi_363), (xi_369*(-xi_367*xi_389 + xi_388), True)) + 3.92699081698724*Piecewise((0, xi_375), (xi_378*(-xi_377*(xi_387 + 0.6) + xi_388), True)) + 3.92699081698724*Piecewise((0, xi_384), (xi_386*(-xi_385*xi_389 + xi_388), True)))\n", + "divMgradmu_0 ↠-1.0*staggered_down_0_0 - 1.0*staggered_down_1_0 - 1.0*staggered_down_2_0 + 1.0*staggered_up_0_0 + 1.0*staggered_up_1_0 + 1.0*staggered_up_2_0\n", + "xi_390 ↠-0.01*dc_dT_dt_0 - 0.01*dc_dphi_dt_0 + 0.01*divMgradmu_0\n", + "divMgradmu_1 ↠-1.0*staggered_down_0_1 - 1.0*staggered_down_1_1 - 1.0*staggered_down_2_1 + 1.0*staggered_up_0_1 + 1.0*staggered_up_1_1 + 1.0*staggered_up_2_1\n", + "xi_391 ↠-0.01*dc_dT_dt_1 - 0.01*dc_dphi_dt_1 + 0.01*divMgradmu_1\n", + "mu_dst[0,0,0] ↠mu_src_C^0 + dc_dmu_0_0*xi_390 + dc_dmu_0_1*xi_391\n", + "mu_dst[0,0,0](1) ↠mu_src_C^1 + dc_dmu_1_0*xi_390 + 
dc_dmu_1_1*xi_391\n" + ] + } + ], + "source": [ + "update_eqs = mu_kernel\n", + "\n", + "for eq in update_eqs:\n", + " print(eq)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "rescheduled_eqs = duplicate_trivial_ops(update_eqs, 3, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "282\n", + "xi_8 ↠200.0/((phi_dst_C^0**2 + phi_dst_C^1**2 + phi_dst_C^2**2 + phi_dst_C^3**2)*(phi_src_C^0**2 + phi_src_C^1**2 + phi_src_C^2**2 + phi_src_C^3**2))\n", + "xi_9 ↠0.2*phi_dst_C^0**2\n", + "xi_10 ↠0.2*phi_dst_C^1**2\n", + "xi_11 ↠-phi_src_C^0**2*xi_10 + phi_src_C^1**2*xi_9\n", + "xi_12 ↠phi_dst_C^0**2*phi_src_C^3**2\n", + "xi_13 ↠phi_dst_C^1**2*phi_src_C^3**2\n", + "xi_14 ↠0.2*phi_dst_C^2**2\n", + "xi_15 ↠0.0666666666666667*phi_dst_C^2**2*phi_src_C^3**2\n", + "xi_16 ↠phi_dst_C^3**2*phi_src_C^0**2\n", + "xi_17 ↠phi_dst_C^3**2*phi_src_C^1**2\n", + "xi_18 ↠0.0666666666666667*phi_dst_C^3**2*phi_src_C^2**2\n", + "xi_21 ↠phi_src_W^3/2 + phi_src_C^3/2\n", + "xi_22 ↠xi_21**2\n", + "xi_24 ↠phi_src_W^0/2 + phi_src_C^0/2\n", + "xi_25 ↠xi_24**2\n", + "xi_27 ↠phi_src_W^1/2 + phi_src_C^1/2\n", + "xi_28 ↠xi_27**2\n", + "xi_30 ↠phi_src_W^2/2 + phi_src_C^2/2\n", + "xi_31 ↠xi_30**2\n", + "xi_32 ↠1/(xi_22 + xi_25 + xi_28 + xi_31)\n", + "xi_33 ↠xi_22*xi_32*(-1.0*mu_src_W^0 + 1.0*mu_src_C^0)\n", + "xi_35 ↠xi_22*xi_32*(-1.0*mu_src_W^1 + 1.0*mu_src_C^1)\n", + "xi_37 ↠-1.0*phi_src_W^3 + 1.0*phi_src_C^3\n", + "xi_38 ↠sqrt(xi_21*xi_24)\n", + "xi_40 ↠-1.0*phi_src_W^0 + 1.0*phi_src_C^0\n", + "xi_41 ↠-0.25*phi_src_SW^0\n", + "xi_43 ↠-0.25*phi_src_S^0 + 0.25*phi_src_N^0\n", + "xi_45 ↠-0.25*phi_src_BW^0\n", + "xi_47 ↠-0.25*phi_src_B^0 + 0.25*phi_src_T^0\n", + "xi_49 ↠sqrt(xi_40**2 + (0.25*phi_src_TW^0 + xi_45 + xi_47)**2 + (0.25*phi_src_NW^0 + xi_41 + xi_43)**2)\n", + "xi_50 ↠-0.25*phi_src_SW^3\n", + "xi_52 ↠-0.25*phi_src_S^3 + 
0.25*phi_src_N^3\n", + "xi_54 ↠-0.25*phi_src_BW^3\n", + "xi_56 ↠-0.25*phi_src_B^3 + 0.25*phi_src_T^3\n", + "xi_58 ↠xi_37**2 + (0.25*phi_src_TW^3 + xi_54 + xi_56)**2 + (0.25*phi_src_NW^3 + xi_50 + xi_52)**2\n", + "xi_59 ↠sqrt(xi_58)\n", + "xi_62 ↠-0.0833333333333333*mu_src_C^1\n", + "xi_63 ↠0.166666666666667*mu_src_W^0 - 0.0833333333333333*mu_src_W^1 + 0.166666666666667*mu_src_C^0 + xi_62\n", + "xi_64 ↠xi_22*xi_32\n", + "xi_65 ↠xi_64*(xi_63 + 0.333333333333333)\n", + "xi_66 ↠xi_25*xi_32\n", + "xi_67 ↠50.0*phi_dst_C^0 - 50.0*phi_src_C^0\n", + "xi_68 ↠1/xi_58\n", + "xi_69 ↠xi_22*xi_24*xi_32*xi_68*(50.0*phi_dst_W^0 - 50.0*phi_src_W^0 + xi_67)*(xi_37*xi_40 + (0.25*phi_src_TW^0 + xi_45 + xi_47)*(0.25*phi_src_TW^3 + xi_54 + xi_56) + (0.25*phi_src_NW^0 + xi_41 + xi_43)*(0.25*phi_src_NW^3 + xi_50 + xi_52))/(xi_38*xi_49)\n", + "xi_70 ↠sqrt(xi_21*xi_27)\n", + "xi_72 ↠-1.0*phi_src_W^1 + 1.0*phi_src_C^1\n", + "xi_73 ↠-0.25*phi_src_SW^1\n", + "xi_75 ↠-0.25*phi_src_S^1 + 0.25*phi_src_N^1\n", + "xi_77 ↠-0.25*phi_src_BW^1\n", + "xi_79 ↠-0.25*phi_src_B^1 + 0.25*phi_src_T^1\n", + "xi_81 ↠sqrt(xi_72**2 + (0.25*phi_src_TW^1 + xi_77 + xi_79)**2 + (0.25*phi_src_NW^1 + xi_73 + xi_75)**2)\n", + "xi_83 ↠xi_63 + 0.2\n", + "xi_84 ↠xi_28*xi_32\n", + "xi_85 ↠50.0*phi_dst_C^1 - 50.0*phi_src_C^1\n", + "xi_86 ↠xi_22*xi_27*xi_32*xi_68*(50.0*phi_dst_W^1 - 50.0*phi_src_W^1 + xi_85)*(xi_37*xi_72 + (0.25*phi_src_TW^1 + xi_77 + xi_79)*(0.25*phi_src_TW^3 + xi_54 + xi_56) + (0.25*phi_src_NW^1 + xi_73 + xi_75)*(0.25*phi_src_NW^3 + xi_50 + xi_52))/(xi_70*xi_81)\n", + "xi_87 ↠sqrt(xi_21*xi_30)\n", + "xi_89 ↠-1.0*phi_src_W^2 + 1.0*phi_src_C^2\n", + "xi_90 ↠-0.25*phi_src_SW^2\n", + "xi_92 ↠-0.25*phi_src_S^2 + 0.25*phi_src_N^2\n", + "xi_94 ↠-0.25*phi_src_BW^2\n", + "xi_96 ↠-0.25*phi_src_B^2 + 0.25*phi_src_T^2\n", + "xi_98 ↠sqrt(xi_89**2 + (0.25*phi_src_TW^2 + xi_94 + xi_96)**2 + (0.25*phi_src_NW^2 + xi_90 + xi_92)**2)\n", + "xi_100 ↠xi_31*xi_32\n", + "xi_101 ↠50.0*phi_dst_C^2 - 50.0*phi_src_C^2\n", + "xi_102 
↠xi_22*xi_30*xi_32*xi_68*(50.0*phi_dst_W^2 - 50.0*phi_src_W^2 + xi_101)*(xi_37*xi_89 + (0.25*phi_src_TW^2 + xi_94 + xi_96)*(0.25*phi_src_TW^3 + xi_54 + xi_56) + (0.25*phi_src_NW^2 + xi_90 + xi_92)*(0.25*phi_src_NW^3 + xi_50 + xi_52))/(xi_87*xi_98)\n", + "xi_103 ↠-0.0833333333333333*mu_src_C^0\n", + "xi_105 ↠-0.0833333333333333*mu_src_W^0 + 0.166666666666667*mu_src_W^1 + 0.166666666666667*mu_src_C^1 + xi_103\n", + "xi_106 ↠xi_64*(xi_105 + 0.333333333333333)\n", + "xi_107 ↠xi_105 + 0.2\n", + "xi_109 ↠phi_src_C^3/2 + phi_src_E^3/2\n", + "xi_110 ↠xi_109**2\n", + "xi_111 ↠phi_src_C^0/2 + phi_src_E^0/2\n", + "xi_112 ↠xi_111**2\n", + "xi_113 ↠phi_src_C^1/2 + phi_src_E^1/2\n", + "xi_114 ↠xi_113**2\n", + "xi_115 ↠phi_src_C^2/2 + phi_src_E^2/2\n", + "xi_116 ↠xi_115**2\n", + "xi_117 ↠1/(xi_110 + xi_112 + xi_114 + xi_116)\n", + "xi_118 ↠xi_110*xi_117*(-1.0*mu_src_C^0 + 1.0*mu_src_E^0)\n", + "xi_120 ↠xi_110*xi_117*(-1.0*mu_src_C^1 + 1.0*mu_src_E^1)\n", + "xi_122 ↠-1.0*phi_src_C^3 + 1.0*phi_src_E^3\n", + "xi_123 ↠sqrt(xi_109*xi_111)\n", + "xi_125 ↠-1.0*phi_src_C^0 + 1.0*phi_src_E^0\n", + "xi_132 ↠sqrt(xi_125**2 + (-0.25*phi_src_SE^0 + 0.25*phi_src_NE^0 + xi_43)**2 + (-0.25*phi_src_BE^0 + 0.25*phi_src_TE^0 + xi_47)**2)\n", + "xi_139 ↠xi_122**2 + (-0.25*phi_src_SE^3 + 0.25*phi_src_NE^3 + xi_52)**2 + (-0.25*phi_src_BE^3 + 0.25*phi_src_TE^3 + xi_56)**2\n", + "xi_140 ↠sqrt(xi_139)\n", + "xi_142 ↠0.166666666666667*mu_src_C^0 + 0.166666666666667*mu_src_E^0 - 0.0833333333333333*mu_src_E^1 + xi_62\n", + "xi_143 ↠xi_110*xi_117\n", + "xi_144 ↠xi_143*(xi_142 + 0.333333333333333)\n", + "xi_145 ↠xi_112*xi_117\n", + "xi_146 ↠1/xi_139\n", + "xi_147 ↠xi_110*xi_111*xi_117*xi_146*(50.0*phi_dst_E^0 - 50.0*phi_src_E^0 + xi_67)*(xi_122*xi_125 + (-0.25*phi_src_SE^0 + 0.25*phi_src_NE^0 + xi_43)*(-0.25*phi_src_SE^3 + 0.25*phi_src_NE^3 + xi_52) + (-0.25*phi_src_BE^0 + 0.25*phi_src_TE^0 + xi_47)*(-0.25*phi_src_BE^3 + 0.25*phi_src_TE^3 + xi_56))/(xi_123*xi_132)\n", + "xi_148 ↠sqrt(xi_109*xi_113)\n", + 
"xi_150 ↠-1.0*phi_src_C^1 + 1.0*phi_src_E^1\n", + "xi_157 ↠sqrt(xi_150**2 + (-0.25*phi_src_SE^1 + 0.25*phi_src_NE^1 + xi_75)**2 + (-0.25*phi_src_BE^1 + 0.25*phi_src_TE^1 + xi_79)**2)\n", + "xi_159 ↠xi_142 + 0.2\n", + "xi_160 ↠xi_114*xi_117\n", + "xi_161 ↠xi_110*xi_113*xi_117*xi_146*(50.0*phi_dst_E^1 - 50.0*phi_src_E^1 + xi_85)*(xi_122*xi_150 + (-0.25*phi_src_SE^1 + 0.25*phi_src_NE^1 + xi_75)*(-0.25*phi_src_SE^3 + 0.25*phi_src_NE^3 + xi_52) + (-0.25*phi_src_BE^1 + 0.25*phi_src_TE^1 + xi_79)*(-0.25*phi_src_BE^3 + 0.25*phi_src_TE^3 + xi_56))/(xi_148*xi_157)\n", + "xi_162 ↠sqrt(xi_109*xi_115)\n", + "xi_164 ↠-1.0*phi_src_C^2 + 1.0*phi_src_E^2\n", + "xi_171 ↠sqrt(xi_164**2 + (-0.25*phi_src_SE^2 + 0.25*phi_src_NE^2 + xi_92)**2 + (-0.25*phi_src_BE^2 + 0.25*phi_src_TE^2 + xi_96)**2)\n", + "xi_173 ↠xi_116*xi_117\n", + "xi_174 ↠xi_110*xi_115*xi_117*xi_146*(50.0*phi_dst_E^2 - 50.0*phi_src_E^2 + xi_101)*(xi_122*xi_164 + (-0.25*phi_src_SE^2 + 0.25*phi_src_NE^2 + xi_92)*(-0.25*phi_src_SE^3 + 0.25*phi_src_NE^3 + xi_52) + (-0.25*phi_src_BE^2 + 0.25*phi_src_TE^2 + xi_96)*(-0.25*phi_src_BE^3 + 0.25*phi_src_TE^3 + xi_56))/(xi_162*xi_171)\n", + "xi_175 ↠0.166666666666667*mu_src_C^1 - 0.0833333333333333*mu_src_E^0 + 0.166666666666667*mu_src_E^1 + xi_103\n", + "xi_176 ↠xi_143*(xi_175 + 0.333333333333333)\n", + "xi_177 ↠xi_175 + 0.2\n", + "xi_178 ↠phi_src_S^3/2 + phi_src_C^3/2\n", + "xi_179 ↠xi_178**2\n", + "xi_180 ↠phi_src_S^0/2 + phi_src_C^0/2\n", + "xi_181 ↠xi_180**2\n", + "xi_182 ↠phi_src_S^1/2 + phi_src_C^1/2\n", + "xi_183 ↠xi_182**2\n", + "xi_184 ↠phi_src_S^2/2 + phi_src_C^2/2\n", + "xi_185 ↠xi_184**2\n", + "xi_186 ↠1/(xi_179 + xi_181 + xi_183 + xi_185)\n", + "xi_187 ↠xi_179*xi_186*(-1.0*mu_src_S^0 + 1.0*mu_src_C^0)\n", + "xi_188 ↠xi_179*xi_186*(-1.0*mu_src_S^1 + 1.0*mu_src_C^1)\n", + "xi_189 ↠-1.0*phi_src_S^3 + 1.0*phi_src_C^3\n", + "xi_190 ↠sqrt(xi_178*xi_180)\n", + "xi_191 ↠-1.0*phi_src_S^0 + 1.0*phi_src_C^0\n", + "xi_192 ↠-0.25*phi_src_W^0 + 0.25*phi_src_E^0\n", + "xi_194 
↠-0.25*phi_src_BS^0\n", + "xi_197 ↠sqrt(xi_191**2 + (0.25*phi_src_TS^0 + xi_194 + xi_47)**2 + (0.25*phi_src_SE^0 + xi_192 + xi_41)**2)\n", + "xi_198 ↠-0.25*phi_src_W^3 + 0.25*phi_src_E^3\n", + "xi_200 ↠-0.25*phi_src_BS^3\n", + "xi_203 ↠xi_189**2 + (0.25*phi_src_TS^3 + xi_200 + xi_56)**2 + (0.25*phi_src_SE^3 + xi_198 + xi_50)**2\n", + "xi_204 ↠sqrt(xi_203)\n", + "xi_206 ↠0.166666666666667*mu_src_S^0 - 0.0833333333333333*mu_src_S^1 + 0.166666666666667*mu_src_C^0 + xi_62\n", + "xi_207 ↠xi_179*xi_186\n", + "xi_208 ↠xi_207*(xi_206 + 0.333333333333333)\n", + "xi_209 ↠xi_181*xi_186\n", + "xi_210 ↠1/xi_203\n", + "xi_211 ↠xi_179*xi_180*xi_186*xi_210*(50.0*phi_dst_S^0 - 50.0*phi_src_S^0 + xi_67)*(xi_189*xi_191 + (0.25*phi_src_TS^0 + xi_194 + xi_47)*(0.25*phi_src_TS^3 + xi_200 + xi_56) + (0.25*phi_src_SE^0 + xi_192 + xi_41)*(0.25*phi_src_SE^3 + xi_198 + xi_50))/(xi_190*xi_197)\n", + "xi_212 ↠sqrt(xi_178*xi_182)\n", + "xi_213 ↠-1.0*phi_src_S^1 + 1.0*phi_src_C^1\n", + "xi_214 ↠-0.25*phi_src_W^1 + 0.25*phi_src_E^1\n", + "xi_216 ↠-0.25*phi_src_BS^1\n", + "xi_219 ↠sqrt(xi_213**2 + (0.25*phi_src_TS^1 + xi_216 + xi_79)**2 + (0.25*phi_src_SE^1 + xi_214 + xi_73)**2)\n", + "xi_221 ↠xi_206 + 0.2\n", + "xi_222 ↠xi_183*xi_186\n", + "xi_223 ↠xi_179*xi_182*xi_186*xi_210*(50.0*phi_dst_S^1 - 50.0*phi_src_S^1 + xi_85)*(xi_189*xi_213 + (0.25*phi_src_TS^1 + xi_216 + xi_79)*(0.25*phi_src_TS^3 + xi_200 + xi_56) + (0.25*phi_src_SE^1 + xi_214 + xi_73)*(0.25*phi_src_SE^3 + xi_198 + xi_50))/(xi_212*xi_219)\n", + "xi_224 ↠sqrt(xi_178*xi_184)\n", + "xi_225 ↠-1.0*phi_src_S^2 + 1.0*phi_src_C^2\n", + "xi_226 ↠-0.25*phi_src_W^2 + 0.25*phi_src_E^2\n", + "xi_228 ↠-0.25*phi_src_BS^2\n", + "xi_231 ↠sqrt(xi_225**2 + (0.25*phi_src_TS^2 + xi_228 + xi_96)**2 + (0.25*phi_src_SE^2 + xi_226 + xi_90)**2)\n", + "xi_233 ↠xi_185*xi_186\n", + "xi_234 ↠xi_179*xi_184*xi_186*xi_210*(50.0*phi_dst_S^2 - 50.0*phi_src_S^2 + xi_101)*(xi_189*xi_225 + (0.25*phi_src_TS^2 + xi_228 + xi_96)*(0.25*phi_src_TS^3 + xi_200 + xi_56) + 
(0.25*phi_src_SE^2 + xi_226 + xi_90)*(0.25*phi_src_SE^3 + xi_198 + xi_50))/(xi_224*xi_231)\n", + "xi_235 ↠-0.0833333333333333*mu_src_S^0 + 0.166666666666667*mu_src_S^1 + 0.166666666666667*mu_src_C^1 + xi_103\n", + "xi_236 ↠xi_207*(xi_235 + 0.333333333333333)\n", + "xi_237 ↠xi_235 + 0.2\n", + "xi_238 ↠phi_src_C^3/2 + phi_src_N^3/2\n", + "xi_239 ↠xi_238**2\n", + "xi_240 ↠phi_src_C^0/2 + phi_src_N^0/2\n", + "xi_241 ↠xi_240**2\n", + "xi_242 ↠phi_src_C^1/2 + phi_src_N^1/2\n", + "xi_243 ↠xi_242**2\n", + "xi_244 ↠phi_src_C^2/2 + phi_src_N^2/2\n", + "xi_245 ↠xi_244**2\n", + "xi_246 ↠1/(xi_239 + xi_241 + xi_243 + xi_245)\n", + "xi_247 ↠xi_239*xi_246*(-1.0*mu_src_C^0 + 1.0*mu_src_N^0)\n", + "xi_248 ↠xi_239*xi_246*(-1.0*mu_src_C^1 + 1.0*mu_src_N^1)\n", + "xi_249 ↠-1.0*phi_src_C^3 + 1.0*phi_src_N^3\n", + "xi_250 ↠sqrt(xi_238*xi_240)\n", + "xi_251 ↠-1.0*phi_src_C^0 + 1.0*phi_src_N^0\n", + "xi_256 ↠sqrt(xi_251**2 + (-0.25*phi_src_NW^0 + 0.25*phi_src_NE^0 + xi_192)**2 + (-0.25*phi_src_BN^0 + 0.25*phi_src_TN^0 + xi_47)**2)\n", + "xi_261 ↠xi_249**2 + (-0.25*phi_src_NW^3 + 0.25*phi_src_NE^3 + xi_198)**2 + (-0.25*phi_src_BN^3 + 0.25*phi_src_TN^3 + xi_56)**2\n", + "xi_262 ↠sqrt(xi_261)\n", + "xi_264 ↠0.166666666666667*mu_src_C^0 + 0.166666666666667*mu_src_N^0 - 0.0833333333333333*mu_src_N^1 + xi_62\n", + "xi_265 ↠xi_239*xi_246\n", + "xi_266 ↠xi_265*(xi_264 + 0.333333333333333)\n", + "xi_267 ↠xi_241*xi_246\n", + "xi_268 ↠1/xi_261\n", + "xi_269 ↠xi_239*xi_240*xi_246*xi_268*(50.0*phi_dst_N^0 - 50.0*phi_src_N^0 + xi_67)*(xi_249*xi_251 + (-0.25*phi_src_NW^0 + 0.25*phi_src_NE^0 + xi_192)*(-0.25*phi_src_NW^3 + 0.25*phi_src_NE^3 + xi_198) + (-0.25*phi_src_BN^0 + 0.25*phi_src_TN^0 + xi_47)*(-0.25*phi_src_BN^3 + 0.25*phi_src_TN^3 + xi_56))/(xi_250*xi_256)\n", + "xi_270 ↠sqrt(xi_238*xi_242)\n", + "xi_271 ↠-1.0*phi_src_C^1 + 1.0*phi_src_N^1\n", + "xi_276 ↠sqrt(xi_271**2 + (-0.25*phi_src_NW^1 + 0.25*phi_src_NE^1 + xi_214)**2 + (-0.25*phi_src_BN^1 + 0.25*phi_src_TN^1 + xi_79)**2)\n", + "xi_278 
↠xi_264 + 0.2\n", + "xi_279 ↠xi_243*xi_246\n", + "xi_280 ↠xi_239*xi_242*xi_246*xi_268*(50.0*phi_dst_N^1 - 50.0*phi_src_N^1 + xi_85)*(xi_249*xi_271 + (-0.25*phi_src_NW^1 + 0.25*phi_src_NE^1 + xi_214)*(-0.25*phi_src_NW^3 + 0.25*phi_src_NE^3 + xi_198) + (-0.25*phi_src_BN^1 + 0.25*phi_src_TN^1 + xi_79)*(-0.25*phi_src_BN^3 + 0.25*phi_src_TN^3 + xi_56))/(xi_270*xi_276)\n", + "xi_281 ↠sqrt(xi_238*xi_244)\n", + "xi_282 ↠-1.0*phi_src_C^2 + 1.0*phi_src_N^2\n", + "xi_287 ↠sqrt(xi_282**2 + (-0.25*phi_src_NW^2 + 0.25*phi_src_NE^2 + xi_226)**2 + (-0.25*phi_src_BN^2 + 0.25*phi_src_TN^2 + xi_96)**2)\n", + "xi_289 ↠xi_245*xi_246\n", + "xi_290 ↠xi_239*xi_244*xi_246*xi_268*(50.0*phi_dst_N^2 - 50.0*phi_src_N^2 + xi_101)*(xi_249*xi_282 + (-0.25*phi_src_NW^2 + 0.25*phi_src_NE^2 + xi_226)*(-0.25*phi_src_NW^3 + 0.25*phi_src_NE^3 + xi_198) + (-0.25*phi_src_BN^2 + 0.25*phi_src_TN^2 + xi_96)*(-0.25*phi_src_BN^3 + 0.25*phi_src_TN^3 + xi_56))/(xi_281*xi_287)\n", + "xi_291 ↠0.166666666666667*mu_src_C^1 - 0.0833333333333333*mu_src_N^0 + 0.166666666666667*mu_src_N^1 + xi_103\n", + "xi_292 ↠xi_265*(xi_291 + 0.333333333333333)\n", + "xi_293 ↠xi_291 + 0.2\n", + "xi_294 ↠phi_src_B^3/2 + phi_src_C^3/2\n", + "xi_295 ↠xi_294**2\n", + "xi_296 ↠phi_src_B^0/2 + phi_src_C^0/2\n", + "xi_297 ↠xi_296**2\n", + "xi_298 ↠phi_src_B^1/2 + phi_src_C^1/2\n", + "xi_299 ↠xi_298**2\n", + "xi_300 ↠phi_src_B^2/2 + phi_src_C^2/2\n", + "xi_301 ↠xi_300**2\n", + "xi_302 ↠1/(xi_295 + xi_297 + xi_299 + xi_301)\n", + "xi_303 ↠xi_295*xi_302*(-1.0*mu_src_B^0 + 1.0*mu_src_C^0)\n", + "xi_304 ↠xi_295*xi_302*(-1.0*mu_src_B^1 + 1.0*mu_src_C^1)\n", + "xi_305 ↠-1.0*phi_src_B^3 + 1.0*phi_src_C^3\n", + "xi_306 ↠sqrt(xi_294*xi_296)\n", + "xi_307 ↠-1.0*phi_src_B^0 + 1.0*phi_src_C^0\n", + "xi_310 ↠sqrt(xi_307**2 + (0.25*phi_src_BN^0 + xi_194 + xi_43)**2 + (0.25*phi_src_BE^0 + xi_192 + xi_45)**2)\n", + "xi_313 ↠xi_305**2 + (0.25*phi_src_BN^3 + xi_200 + xi_52)**2 + (0.25*phi_src_BE^3 + xi_198 + xi_54)**2\n", + "xi_314 ↠sqrt(xi_313)\n", + 
"xi_316 ↠0.166666666666667*mu_src_B^0 - 0.0833333333333333*mu_src_B^1 + 0.166666666666667*mu_src_C^0 + xi_62\n", + "xi_317 ↠xi_295*xi_302\n", + "xi_318 ↠xi_317*(xi_316 + 0.333333333333333)\n", + "xi_319 ↠xi_297*xi_302\n", + "xi_320 ↠1/xi_313\n", + "xi_321 ↠xi_295*xi_296*xi_302*xi_320*(50.0*phi_dst_B^0 - 50.0*phi_src_B^0 + xi_67)*(xi_305*xi_307 + (0.25*phi_src_BN^0 + xi_194 + xi_43)*(0.25*phi_src_BN^3 + xi_200 + xi_52) + (0.25*phi_src_BE^0 + xi_192 + xi_45)*(0.25*phi_src_BE^3 + xi_198 + xi_54))/(xi_306*xi_310)\n", + "xi_322 ↠sqrt(xi_294*xi_298)\n", + "xi_323 ↠-1.0*phi_src_B^1 + 1.0*phi_src_C^1\n", + "xi_326 ↠sqrt(xi_323**2 + (0.25*phi_src_BN^1 + xi_216 + xi_75)**2 + (0.25*phi_src_BE^1 + xi_214 + xi_77)**2)\n", + "xi_328 ↠xi_316 + 0.2\n", + "xi_329 ↠xi_299*xi_302\n", + "xi_330 ↠xi_295*xi_298*xi_302*xi_320*(50.0*phi_dst_B^1 - 50.0*phi_src_B^1 + xi_85)*(xi_305*xi_323 + (0.25*phi_src_BN^1 + xi_216 + xi_75)*(0.25*phi_src_BN^3 + xi_200 + xi_52) + (0.25*phi_src_BE^1 + xi_214 + xi_77)*(0.25*phi_src_BE^3 + xi_198 + xi_54))/(xi_322*xi_326)\n", + "xi_331 ↠sqrt(xi_294*xi_300)\n", + "xi_332 ↠-1.0*phi_src_B^2 + 1.0*phi_src_C^2\n", + "xi_335 ↠sqrt(xi_332**2 + (0.25*phi_src_BN^2 + xi_228 + xi_92)**2 + (0.25*phi_src_BE^2 + xi_226 + xi_94)**2)\n", + "xi_337 ↠xi_301*xi_302\n", + "xi_338 ↠xi_295*xi_300*xi_302*xi_320*(50.0*phi_dst_B^2 - 50.0*phi_src_B^2 + xi_101)*(xi_305*xi_332 + (0.25*phi_src_BN^2 + xi_228 + xi_92)*(0.25*phi_src_BN^3 + xi_200 + xi_52) + (0.25*phi_src_BE^2 + xi_226 + xi_94)*(0.25*phi_src_BE^3 + xi_198 + xi_54))/(xi_331*xi_335)\n", + "xi_339 ↠-0.0833333333333333*mu_src_B^0 + 0.166666666666667*mu_src_B^1 + 0.166666666666667*mu_src_C^1 + xi_103\n", + "xi_340 ↠xi_317*(xi_339 + 0.333333333333333)\n", + "xi_341 ↠xi_339 + 0.2\n", + "xi_342 ↠phi_src_C^3/2 + phi_src_T^3/2\n", + "xi_343 ↠xi_342**2\n", + "xi_344 ↠phi_src_C^0/2 + phi_src_T^0/2\n", + "xi_345 ↠xi_344**2\n", + "xi_346 ↠phi_src_C^1/2 + phi_src_T^1/2\n", + "xi_347 ↠xi_346**2\n", + "xi_348 ↠phi_src_C^2/2 + 
phi_src_T^2/2\n", + "xi_349 ↠xi_348**2\n", + "xi_350 ↠1/(xi_343 + xi_345 + xi_347 + xi_349)\n", + "xi_351 ↠xi_343*xi_350*(-1.0*mu_src_C^0 + 1.0*mu_src_T^0)\n", + "xi_352 ↠xi_343*xi_350*(-1.0*mu_src_C^1 + 1.0*mu_src_T^1)\n", + "xi_353 ↠-1.0*phi_src_C^3 + 1.0*phi_src_T^3\n", + "xi_354 ↠sqrt(xi_342*xi_344)\n", + "xi_355 ↠-1.0*phi_src_C^0 + 1.0*phi_src_T^0\n", + "xi_358 ↠sqrt(xi_355**2 + (-0.25*phi_src_TW^0 + 0.25*phi_src_TE^0 + xi_192)**2 + (-0.25*phi_src_TS^0 + 0.25*phi_src_TN^0 + xi_43)**2)\n", + "xi_361 ↠xi_353**2 + (-0.25*phi_src_TW^3 + 0.25*phi_src_TE^3 + xi_198)**2 + (-0.25*phi_src_TS^3 + 0.25*phi_src_TN^3 + xi_52)**2\n", + "xi_362 ↠sqrt(xi_361)\n", + "xi_364 ↠0.166666666666667*mu_src_C^0 + 0.166666666666667*mu_src_T^0 - 0.0833333333333333*mu_src_T^1 + xi_62\n", + "xi_365 ↠xi_343*xi_350\n", + "xi_366 ↠xi_365*(xi_364 + 0.333333333333333)\n", + "xi_367 ↠xi_345*xi_350\n", + "xi_368 ↠1/xi_361\n", + "xi_369 ↠xi_343*xi_344*xi_350*xi_368*(50.0*phi_dst_T^0 - 50.0*phi_src_T^0 + xi_67)*(xi_353*xi_355 + (-0.25*phi_src_TW^0 + 0.25*phi_src_TE^0 + xi_192)*(-0.25*phi_src_TW^3 + 0.25*phi_src_TE^3 + xi_198) + (-0.25*phi_src_TS^0 + 0.25*phi_src_TN^0 + xi_43)*(-0.25*phi_src_TS^3 + 0.25*phi_src_TN^3 + xi_52))/(xi_354*xi_358)\n", + "xi_370 ↠sqrt(xi_342*xi_346)\n", + "xi_371 ↠-1.0*phi_src_C^1 + 1.0*phi_src_T^1\n", + "xi_374 ↠sqrt(xi_371**2 + (-0.25*phi_src_TW^1 + 0.25*phi_src_TE^1 + xi_214)**2 + (-0.25*phi_src_TS^1 + 0.25*phi_src_TN^1 + xi_75)**2)\n", + "xi_376 ↠xi_364 + 0.2\n", + "xi_377 ↠xi_347*xi_350\n", + "xi_378 ↠xi_343*xi_346*xi_350*xi_368*(50.0*phi_dst_T^1 - 50.0*phi_src_T^1 + xi_85)*(xi_353*xi_371 + (-0.25*phi_src_TW^1 + 0.25*phi_src_TE^1 + xi_214)*(-0.25*phi_src_TW^3 + 0.25*phi_src_TE^3 + xi_198) + (-0.25*phi_src_TS^1 + 0.25*phi_src_TN^1 + xi_75)*(-0.25*phi_src_TS^3 + 0.25*phi_src_TN^3 + xi_52))/(xi_370*xi_374)\n", + "xi_379 ↠sqrt(xi_342*xi_348)\n", + "xi_380 ↠-1.0*phi_src_C^2 + 1.0*phi_src_T^2\n", + "xi_383 ↠sqrt(xi_380**2 + (-0.25*phi_src_TW^2 + 0.25*phi_src_TE^2 + 
xi_226)**2 + (-0.25*phi_src_TS^2 + 0.25*phi_src_TN^2 + xi_92)**2)\n", + "xi_385 ↠xi_349*xi_350\n", + "xi_386 ↠xi_343*xi_348*xi_350*xi_368*(50.0*phi_dst_T^2 - 50.0*phi_src_T^2 + xi_101)*(xi_353*xi_380 + (-0.25*phi_src_TW^2 + 0.25*phi_src_TE^2 + xi_226)*(-0.25*phi_src_TW^3 + 0.25*phi_src_TE^3 + xi_198) + (-0.25*phi_src_TS^2 + 0.25*phi_src_TN^2 + xi_92)*(-0.25*phi_src_TS^3 + 0.25*phi_src_TN^3 + xi_52))/(xi_379*xi_383)\n", + "xi_387 ↠0.166666666666667*mu_src_C^1 - 0.0833333333333333*mu_src_T^0 + 0.166666666666667*mu_src_T^1 + xi_103\n", + "xi_388 ↠xi_365*(xi_387 + 0.333333333333333)\n", + "xi_389 ↠xi_387 + 0.2\n", + "dc_dphi_dt_0 ↠xi_8*(-phi_src_C^0**2*xi_14 + phi_src_C^2**2*xi_9 + xi_11 + 0.133333333333333*xi_12 - 0.0666666666666667*xi_13 - xi_15 - 0.133333333333333*xi_16 + 0.0666666666666667*xi_17 + xi_18)\n", + "dc_dphi_dt_1 ↠-xi_8*(phi_src_C^1**2*xi_14 - phi_src_C^2**2*xi_10 + xi_11 + 0.0666666666666667*xi_12 - 0.133333333333333*xi_13 + xi_15 - 0.0666666666666667*xi_16 + 0.133333333333333*xi_17 - xi_18)\n", + "staggered_down_0_0 ↠0.333333333333333*xi_33 - 0.166666666666667*xi_35 - xi_37*(3.92699081698724*Piecewise((0, (xi_38 < 1.0e-9) | (xi_49*xi_59 < 1.0e-9)), (xi_69*(xi_65 - xi_66*(xi_63 + 0.6)), True)) + 3.92699081698724*Piecewise((0, (xi_70 < 1.0e-9) | (xi_59*xi_81 < 1.0e-9)), (xi_86*(xi_65 - xi_83*xi_84), True)) + 3.92699081698724*Piecewise((0, (xi_87 < 1.0e-9) | (xi_59*xi_98 < 1.0e-9)), (xi_102*(-xi_100*xi_83 + xi_65), True)))\n", + "staggered_down_0_1 ↠-0.166666666666667*xi_33 + 0.333333333333333*xi_35 - xi_37*(3.92699081698724*Piecewise((0, (xi_38 < 1.0e-9) | (xi_49*xi_59 < 1.0e-9)), (xi_69*(xi_106 - xi_107*xi_66), True)) + 3.92699081698724*Piecewise((0, (xi_70 < 1.0e-9) | (xi_59*xi_81 < 1.0e-9)), (xi_86*(xi_106 - xi_84*(xi_105 + 0.6)), True)) + 3.92699081698724*Piecewise((0, (xi_87 < 1.0e-9) | (xi_59*xi_98 < 1.0e-9)), (xi_102*(-xi_100*xi_107 + xi_106), True)))\n", + "staggered_up_0_0 ↠0.333333333333333*xi_118 - 0.166666666666667*xi_120 - 
xi_122*(3.92699081698724*Piecewise((0, (xi_123 < 1.0e-9) | (xi_132*xi_140 < 1.0e-9)), (xi_147*(xi_144 - xi_145*(xi_142 + 0.6)), True)) + 3.92699081698724*Piecewise((0, (xi_148 < 1.0e-9) | (xi_140*xi_157 < 1.0e-9)), (xi_161*(xi_144 - xi_159*xi_160), True)) + 3.92699081698724*Piecewise((0, (xi_162 < 1.0e-9) | (xi_140*xi_171 < 1.0e-9)), (xi_174*(xi_144 - xi_159*xi_173), True)))\n", + "staggered_up_0_1 ↠-0.166666666666667*xi_118 + 0.333333333333333*xi_120 - xi_122*(3.92699081698724*Piecewise((0, (xi_123 < 1.0e-9) | (xi_132*xi_140 < 1.0e-9)), (xi_147*(-xi_145*xi_177 + xi_176), True)) + 3.92699081698724*Piecewise((0, (xi_148 < 1.0e-9) | (xi_140*xi_157 < 1.0e-9)), (xi_161*(-xi_160*(xi_175 + 0.6) + xi_176), True)) + 3.92699081698724*Piecewise((0, (xi_162 < 1.0e-9) | (xi_140*xi_171 < 1.0e-9)), (xi_174*(-xi_173*xi_177 + xi_176), True)))\n", + "staggered_down_1_0 ↠0.333333333333333*xi_187 - 0.166666666666667*xi_188 - xi_189*(3.92699081698724*Piecewise((0, (xi_190 < 1.0e-9) | (xi_197*xi_204 < 1.0e-9)), (xi_211*(xi_208 - xi_209*(xi_206 + 0.6)), True)) + 3.92699081698724*Piecewise((0, (xi_212 < 1.0e-9) | (xi_204*xi_219 < 1.0e-9)), (xi_223*(xi_208 - xi_221*xi_222), True)) + 3.92699081698724*Piecewise((0, (xi_224 < 1.0e-9) | (xi_204*xi_231 < 1.0e-9)), (xi_234*(xi_208 - xi_221*xi_233), True)))\n", + "staggered_down_1_1 ↠-0.166666666666667*xi_187 + 0.333333333333333*xi_188 - xi_189*(3.92699081698724*Piecewise((0, (xi_190 < 1.0e-9) | (xi_197*xi_204 < 1.0e-9)), (xi_211*(-xi_209*xi_237 + xi_236), True)) + 3.92699081698724*Piecewise((0, (xi_212 < 1.0e-9) | (xi_204*xi_219 < 1.0e-9)), (xi_223*(-xi_222*(xi_235 + 0.6) + xi_236), True)) + 3.92699081698724*Piecewise((0, (xi_224 < 1.0e-9) | (xi_204*xi_231 < 1.0e-9)), (xi_234*(-xi_233*xi_237 + xi_236), True)))\n", + "staggered_up_1_0 ↠0.333333333333333*xi_247 - 0.166666666666667*xi_248 - xi_249*(3.92699081698724*Piecewise((0, (xi_250 < 1.0e-9) | (xi_256*xi_262 < 1.0e-9)), (xi_269*(xi_266 - xi_267*(xi_264 + 0.6)), True)) + 
3.92699081698724*Piecewise((0, (xi_270 < 1.0e-9) | (xi_262*xi_276 < 1.0e-9)), (xi_280*(xi_266 - xi_278*xi_279), True)) + 3.92699081698724*Piecewise((0, (xi_281 < 1.0e-9) | (xi_262*xi_287 < 1.0e-9)), (xi_290*(xi_266 - xi_278*xi_289), True)))\n", + "staggered_up_1_1 ↠-0.166666666666667*xi_247 + 0.333333333333333*xi_248 - xi_249*(3.92699081698724*Piecewise((0, (xi_250 < 1.0e-9) | (xi_256*xi_262 < 1.0e-9)), (xi_269*(-xi_267*xi_293 + xi_292), True)) + 3.92699081698724*Piecewise((0, (xi_270 < 1.0e-9) | (xi_262*xi_276 < 1.0e-9)), (xi_280*(-xi_279*(xi_291 + 0.6) + xi_292), True)) + 3.92699081698724*Piecewise((0, (xi_281 < 1.0e-9) | (xi_262*xi_287 < 1.0e-9)), (xi_290*(-xi_289*xi_293 + xi_292), True)))\n", + "staggered_down_2_0 ↠0.333333333333333*xi_303 - 0.166666666666667*xi_304 - xi_305*(3.92699081698724*Piecewise((0, (xi_306 < 1.0e-9) | (xi_310*xi_314 < 1.0e-9)), (xi_321*(xi_318 - xi_319*(xi_316 + 0.6)), True)) + 3.92699081698724*Piecewise((0, (xi_322 < 1.0e-9) | (xi_314*xi_326 < 1.0e-9)), (xi_330*(xi_318 - xi_328*xi_329), True)) + 3.92699081698724*Piecewise((0, (xi_331 < 1.0e-9) | (xi_314*xi_335 < 1.0e-9)), (xi_338*(xi_318 - xi_328*xi_337), True)))\n", + "staggered_down_2_1 ↠-0.166666666666667*xi_303 + 0.333333333333333*xi_304 - xi_305*(3.92699081698724*Piecewise((0, (xi_306 < 1.0e-9) | (xi_310*xi_314 < 1.0e-9)), (xi_321*(-xi_319*xi_341 + xi_340), True)) + 3.92699081698724*Piecewise((0, (xi_322 < 1.0e-9) | (xi_314*xi_326 < 1.0e-9)), (xi_330*(-xi_329*(xi_339 + 0.6) + xi_340), True)) + 3.92699081698724*Piecewise((0, (xi_331 < 1.0e-9) | (xi_314*xi_335 < 1.0e-9)), (xi_338*(-xi_337*xi_341 + xi_340), True)))\n", + "staggered_up_2_0 ↠0.333333333333333*xi_351 - 0.166666666666667*xi_352 - xi_353*(3.92699081698724*Piecewise((0, (xi_354 < 1.0e-9) | (xi_358*xi_362 < 1.0e-9)), (xi_369*(xi_366 - xi_367*(xi_364 + 0.6)), True)) + 3.92699081698724*Piecewise((0, (xi_370 < 1.0e-9) | (xi_362*xi_374 < 1.0e-9)), (xi_378*(xi_366 - xi_376*xi_377), True)) + 3.92699081698724*Piecewise((0, (xi_379 
< 1.0e-9) | (xi_362*xi_383 < 1.0e-9)), (xi_386*(xi_366 - xi_376*xi_385), True)))\n", + "staggered_up_2_1 ↠-0.166666666666667*xi_351 + 0.333333333333333*xi_352 - xi_353*(3.92699081698724*Piecewise((0, (xi_354 < 1.0e-9) | (xi_358*xi_362 < 1.0e-9)), (xi_369*(-xi_367*xi_389 + xi_388), True)) + 3.92699081698724*Piecewise((0, (xi_370 < 1.0e-9) | (xi_362*xi_374 < 1.0e-9)), (xi_378*(-xi_377*(xi_387 + 0.6) + xi_388), True)) + 3.92699081698724*Piecewise((0, (xi_379 < 1.0e-9) | (xi_362*xi_383 < 1.0e-9)), (xi_386*(-xi_385*xi_389 + xi_388), True)))\n", + "divMgradmu_0 ↠-1.0*staggered_down_0_0 - 1.0*staggered_down_1_0 - 1.0*staggered_down_2_0 + 1.0*staggered_up_0_0 + 1.0*staggered_up_1_0 + 1.0*staggered_up_2_0\n", + "xi_390 ↠-0.01*dc_dphi_dt_0 + 0.01*divMgradmu_0\n", + "divMgradmu_1 ↠-1.0*staggered_down_0_1 - 1.0*staggered_down_1_1 - 1.0*staggered_down_2_1 + 1.0*staggered_up_0_1 + 1.0*staggered_up_1_1 + 1.0*staggered_up_2_1\n", + "xi_391 ↠-0.01*dc_dphi_dt_1 + 0.01*divMgradmu_1\n", + "mu_dst[0,0,0] ↠mu_src_C^0 + 4.0*xi_390 + 2.0*xi_391\n", + "mu_dst[0,0,0](1) ↠mu_src_C^1 + 2.0*xi_390 + 4.0*xi_391\n" + ] + } + ], + "source": [ + "print(len(rescheduled_eqs))\n", + "for eq in rescheduled_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phi_kernel = ps.create_kernel(\n", + " rescheduled_eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (32, 4, 1)\n", + " }).compile()\n", + "\n", + "code = \"#include <cstdint>\\n\"\n", + "code += \"#define FUNC_PREFIX __global__ __launch_bounds__(128)\\n\"\n", + "code += \"#define RESTRICT const __restrict__\\n\\n\"\n", + "\n", + "code += str(show_code(phi_kernel.ast))\n", + "\n", + "cubin = pycuda.compiler.compile(code, options=[\"-w\", \"-std=c++11\", \"-use_fast_math\" ], arch=\"sm_60\")\n", + "\n", + "run([ \"echo \\\"\" + code + \"\\\" >> temp.cubin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "newFile = 
open(\"temp.cusbin\", \"wb\")\n", + "newFile.write(cubin)\n", + "newFile.close()\n", + "\n", + "result = run([ \"nvdisasm -c temp.cusbin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "print(len(result.stdout.decode(\"utf-8\").split(\"\\n\") ) )\n", + "\n", + "print(result.stdout.decode(\"utf-8\"))\n", + "\n", + "\n", + "\n", + "newFile = open(\"temp.disasm\", \"wb\")\n", + "newFile.write(result.stdout)\n", + "newFile.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(show_code(phi_kernel.ast))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for eq in rescheduled_eqs:\n", + " print(eq)\n", + " print(eq.rhs.func)\n", + " for arg in eq.rhs.args:\n", + " print(arg)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d = graphviz.Digraph(engine='dot')\n", + "for eq in rescheduled_eqs:\n", + " #d.node(eq.lhs.name)\n", + " for arg in eq.rhs.atoms():\n", + " if isinstance(arg, sympy.Symbol) and not isinstance(arg, Field.Access):\n", + " d.edge(arg.name, eq.lhs.name)\n", + "d\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/test_sched_mu.ipynb b/pystencils_tests/liveness_opts/test_sched_mu.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..a1fa706665672d1cd91441197437f4c9f82b4c0f --- /dev/null +++ b/pystencils_tests/liveness_opts/test_sched_mu.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "sys.path.append('../pygrandchem_tests/')\n", + "\n", + "import numpy as np\n", + "\n", + "from pygrandchem_tests.config import get_system\n", + "from pygrandchem.grandchem_generation import create_mu_update_kernel, create_mu_update_equations_from_config, create_mu_staggered_kernel, create_kernel\n", + "import test_mu_equivalence\n", + "\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + "phi_src = np.zeros([3, 3, 3, 4])\n", + "mu_src = np.zeros([3, 3, 3, 2])\n", + "\n", + "update_eqs = sympy_cse_on_assignment_list(create_mu_update_equations_from_config(config, phi_src, mu_src))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for sched_option in all_sched_options:\n", + " print(sched_option.__name__)\n", + " rescheduled_eqs = sched_option(update_eqs)\n", + " test_mu_equivalence.test_no_staggered_pre_computation(eqs=rescheduled_eqs, target='gpu')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pystencils_tests/liveness_opts/test_steal.ipynb b/pystencils_tests/liveness_opts/test_steal.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f72b7c549e2d4e2267f45f2387f026d9bb958cc3 --- /dev/null +++ b/pystencils_tests/liveness_opts/test_steal.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys \n", + "sys.path.append('..')\n", + "print(sys.path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 1\n", + "%aimport pystencils.simp.liveness_opts\n", + "%aimport pystencils.simp.liveness_opts_exp\n", + "%aimport pystencils.shmemvar\n", + "%aimport pystencils.backends.cbackend\n", + "%aimport pystencils.transformations\n", + "\n", + "\n", + "%load_ext line_profiler\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from lbmpy.session import *\n", + "from scipy.ndimage.filters import gaussian_filter\n", + "from pygrandchem_tests.config2 import get_system\n", + "from pygrandchem_tests.config import get_system as get_system_simple\n", + "from pystencils.datahandling import SerialDataHandling\n", + "from pygrandchem.grandchem_generation import *\n", + "from pygrandchem.chemicalpotential import *\n", + "from pystencils import show_code, Field\n", + "from sympy import Number, Symbol, Expr, preorder_traversal, postorder_traversal, Function, Piecewise, relational\n", + "from pystencils.simp import sympy_cse_on_assignment_list\n", + "from pystencils.simp.liveness_opts import *\n", + "from pystencils.simp.liveness_opts_exp import *\n", + "\n", + "from pystencils.shmemvar import *\n", + "import graphviz\n", + "\n", + "\n", + "import pycuda\n", + "\n", + "import sys\n", + "from subprocess import run, 
PIPE\n", + "\n", + "sys.setrecursionlimit(100000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config = get_system()\n", + "free_energy = FreeEnergy(config['FreeEnergy'], config['Temperature'])\n", + "\n", + "dh = SerialDataHandling((256, 256, 256), periodicity=(True, True, False))\n", + "f = dh.fields\n", + "dh.add_array('phi_src', values_per_cell=4, layout='fzyx')\n", + "dh.add_array('mu_src', values_per_cell=2, layout='fzyx')\n", + "dh.add_array_like('phi_dst', 'phi_src')\n", + "dh.add_array_like('mu_dst', 'mu_src')\n", + "dh.add_array('c', values_per_cell=2, layout='fzyx')\n", + "\n", + "diffusion_matrices = np.zeros([4, 2, 2])\n", + "diffusion_matrices[0] = config['Parameters']['da']\n", + "diffusion_matrices[1] = config['Parameters']['db']\n", + "diffusion_matrices[2] = config['Parameters']['dg']\n", + "diffusion_matrices[3] = config['Parameters']['dl']\n", + "\n", + "f = dh.fields\n", + "\n", + "#update_eqs = create_phi_update_equations(\n", + "# f['phi_src'],\n", + "# f['phi_dst'],\n", + "# f['mu_src'],\n", + "# free_energy,\n", + "# config['Parameters'],\n", + "# simplex_projection=True)\n", + "\n", + "update_eqs = create_mu_update_equations(\n", + " f['phi_src'], f['phi_dst'], f['mu_src'], f['mu_dst'], free_energy,\n", + " diffusion_matrices, config['Parameters'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "update_eqs = sympy_cse_on_assignment_list(update_eqs)\n", + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "update_eqs = merge_field_accesses(update_eqs)\n", + "\n", + "for eq in update_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a = Symbol(\"a\")\n", + "b = Symbol(\"b\")\n", + "c = Symbol(\"c\")\n", + "\n", + "\n", + "fake_eqs = [\n", + " Assignment(a, sympy.Add(sympy.Mul(0.1, f['phi_src'][1, 0, 0](0)), sympy.Mul(0.1, f['phi_src'][0, 0, 0](0)))),\n", + " Assignment(b, sympy.Add(sympy.Mul(0.1, f['phi_src'][-1, 0, 0](0)), sympy.Mul(0.1, f['phi_src'][0, 0, 0](0)))),\n", + " Assignment(f['phi_dst'][0, 0, 0](0), sympy.Add(a, b))\n", + "]\n", + "\n", + "fake_eqs = merge_field_accesses(sympy_cse_on_assignment_list(fake_eqs))\n", + "\n", + "for eq in fake_eqs:\n", + " print(eq)\n", + " \n", + "shifted_fake_eqs = shift_fa_eqs(fake_eqs)\n", + "\n", + "\n", + "print()\n", + "for eq in shifted_fake_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stolen_eqs = left_steal(update_eqs, 8)\n", + "\n", + "for eq in stolen_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "shifted_eqs = shift_fa_eqs(update_eqs)\n", + "steal_from_e = get_steal_list(update_eqs, shifted_eqs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "usage = get_usage(update_eqs)\n", + "definitions = get_definitions(update_eqs)\n", + "\n", + "def count_nodes_up(node):\n", + " if isinstance(node, Field.Access):\n", + " return 1\n", + "\n", + " if node in definitions:\n", + " node = definitions[node].rhs\n", + " \n", + " node_count = 0\n", + " for arg in node.args:\n", + " if not (arg in usage and usage[arg] > 1):\n", + " node_count += count_nodes_up(arg)\n", + " return node_count + 1\n", + "\n", + "scores = [(s, count_nodes_up(s)) for s in steal_from_e if isinstance(s, Symbol)]\n", + "scores.sort(key=lambda s: s[1], reverse=True)\n", + "\n", + "for s in scores:\n", + " print(s)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "definitions = get_definitions(update_eqs) \n", + "usage = get_usage(update_eqs)\n", + "\n", + "d = graphviz.Digraph(engine='dot', strict=True)\n", + "\n", + "\n", + "for expr in steal_from_e:\n", + " d.node(str(expr), color=\"red\")\n", + "\n", + "for expr in steal_from_e:\n", + " definition = expr\n", + " if expr in definitions:\n", + " definition = definitions[expr].rhs\n", + " d.edge(str(definition), str(expr), weight=\"200\")\n", + " for atom in definition.args:\n", + " if not isinstance(atom, Number):\n", + " d.edge(str(atom), str(definition), weight=\"200\")\n", + "\n", + "\n", + "for eq in update_eqs:\n", + " for atom in sympy.postorder_traversal(eq.rhs):\n", + " if atom in steal_from_e:\n", + " d.edge(str(atom), str(eq.lhs), weight=\"200\")\n", + " \n", + " #d.edge(str(steal_from_e[expr]), str(expr), style=\"dashed\", weight=\"1\")\n", + " \n", + " #expr = steal_from_e[expr]\n", + " #definition = expr\n", + " #if expr in definitions:\n", + " # definition = definitions[expr].rhs\n", + " # d.edge(str(definition), str(expr), weight=\"200\")\n", + " #for atom in definition.args:\n", + " # if not isinstance(atom, Number):\n", + " # d.edge(str(atom), str(definition), weight=\"200\")\n", + " \n", + " \n", + " \n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "d = graphviz.Digraph(engine='dot', strict=True)\n", + "\n", + "leaking_nodes = []\n", + "non_leaking_nodes = list(steal_from_e)\n", + "\n", + "\n", + "for eq in update_eqs:\n", + " if eq.lhs not in steal_from_e:\n", + " for atom in sympy.postorder_traversal(eq.rhs):\n", + " if atom in non_leaking_nodes:\n", + " non_leaking_nodes.remove(atom)\n", + " leaking_nodes.append(atom)\n", + "\n", + "for e in steal_from_e:\n", + " if e in leaking_nodes:\n", + " d.node(str(e), color=\"blue\")\n", + " else:\n", + " d.node(str(e), color=\"red\")\n", + " 
\n", + "for expr in steal_from_e:\n", + " definition = expr\n", + " if expr in definitions:\n", + " definition = definitions[expr].rhs\n", + " d.edge(str(definition), str(expr), weight=\"200\")\n", + " for atom in definition.args:\n", + " if not isinstance(atom, Number):\n", + " d.edge(str(atom), str(definition), weight=\"200\")\n", + "\n", + " \n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d = graphviz.Digraph(engine='dot', strict=True)\n", + "\n", + "def walk_up(expr):\n", + " if expr in definitions:\n", + " walk_up(definitions[expr].rhs)\n", + " d.edge(str(definitions[expr].rhs), str(expr))\n", + " for arg in expr.args:\n", + " if isinstance(arg, sympy.Number): continue\n", + " walk_up(arg)\n", + " d.edge(str(arg), str(expr))\n", + "\n", + "for eq in update_eqs:\n", + " if eq.lhs.name == \"xi_137\":\n", + " s_xi = eq.lhs\n", + " \n", + "walk_up(s_xi)\n", + "#walk_up(left_steal[s_xi])\n", + "\n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rescheduled_eqs = schedule_eqs(update_eqs)\n", + "\n", + "for eq in rescheduled_eqs:\n", + " print(eq)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phi_kernel = create_kernel(\n", + " update_eqs,\n", + " target=\"gpu\",\n", + " gpu_indexing_params={\n", + " \"block_size\": (32, 4, 1)\n", + " }).compile()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(show_code(phi_kernel.ast))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "code = \"#include <cstdint>\\n\"\n", + "code += \"#define FUNC_PREFIX __global__ __launch_bounds__(128)\\n\"\n", + "code += \"#define RESTRICT __restrict__\\n\\n\"\n", + "\n", + "code += str(show_code(phi_kernel.ast))\n", + "\n", + "cubin = 
pycuda.compiler.compile(code, options=[\"-w\", \"-std=c++11\", \"-use_fast_math\" ], arch=\"sm_60\")\n", + "\n", + "run([ \"echo \\\"\" + code + \"\\\" >> temp.cubin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "newFile = open(\"temp.cusbin\", \"wb\")\n", + "newFile.write(cubin)\n", + "newFile.close()\n", + "\n", + "result = run([ \"nvdisasm -c temp.cusbin\"],\n", + " stdout=PIPE,\n", + " shell=True)\n", + "\n", + "print(result.stdout.decode(\"utf-8\"))\n", + "\n", + "newFile = open(\"temp.disasm\", \"wb\")\n", + "newFile.write(result.stdout)\n", + "newFile.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d = graphviz.Digraph(engine='dot')\n", + "for eq in rescheduled_eqs:\n", + " #d.node(eq.lhs.name)\n", + " for arg in eq.rhs.atoms():\n", + " if isinstance(arg, sympy.Symbol) and not isinstance(arg, Field.Access):\n", + " d.edge(arg.name, eq.lhs.name)\n", + "d\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for e in steal_from_e:\n", + " print(e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}