diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py index 9acab66da85c8f5477251e66bc7a9ea37ccc2fd7..6a54df4522354c3f2f2ca51da370e7b4eba5229d 100755 --- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -41,7 +41,7 @@ class Scenario: init_shear_flow = False periodic = (0, 0, 0) - self.blocks = (2, 1, 1) # block_decomposition(wlb.mpi.numProcesses()) + self.blocks = block_decomposition(wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -68,7 +68,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, - 'oneBlockPerProcess': False + 'oneBlockPerProcess': True }, 'Parameters': { 'omega': self.omega, diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index 531ab22d54ab261ad8f159c91e85c5bfde03360d..e1972d914a5acc26ab54aaf3cb86c615ac4d3b77 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -66,7 +66,7 @@ class Scenario: init_shear_flow = False periodic = (0, 0, 0) - self.blocks = (2, 1, 1) # block_decomposition(wlb.mpi.numProcesses()) + self.blocks = block_decomposition(wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -96,7 +96,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, - 'oneBlockPerProcess': False + 'oneBlockPerProcess': True }, 'Parameters': { 'omega': self.omega, diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py index 32f8c0ee2de050d7348fc3496ad054354775035a..692d0cf57d3e905537d3cbd1af445c26c64074bc 100644 --- a/python/lbmpy_walberla/additional_data_handler.py +++ b/python/lbmpy_walberla/additional_data_handler.py @@ -1,7 +1,6 @@ from pystencils import Target from pystencils.stencil import inverse_direction - from lbmpy.advanced_streaming import AccessPdfValues, numeric_offsets, numeric_index from lbmpy.advanced_streaming.indexing import MirroredStencilDirections from lbmpy.boundaries.boundaryconditions import LbBoundary @@ -64,18 +63,18 @@ class FreeSlipAdditionalDataHandler(AdditionalDataHandler): init_list += [f"const Cell n = it.cell() + Cell({offset[0]}, {offset[1]}, {offset[2]});", f"int32_t ref_dir = {self._walberla_stencil.index(inv_offset)}; // dir: {direction}", - "element.wnx = 0; // compute discrete normal vector of free slip wall", - "element.wny = 0;", - f"if( flagField->isPartOfMaskSet( n.x() + {inv_offset[0]}, n.y(), n.z(), domainFlag ) )", - "{", - f" element.wnx = {inv_offset[0]};", + "element.wnx = 0; // compute discrete normal vector of free slip wall", + "element.wny = 0;", + f"if( flagField->isPartOfMaskSet( n.x() + {inv_offset[0]}, n.y(), n.z(), domainFlag ) )", + "{", + f" element.wnx = {inv_offset[0]};", " ref_dir = x_axis_mirrored_stencil_dir[ ref_dir ];", - "}", - f"if( flagField->isPartOfMaskSet( n.x(), n.y() + {inv_offset[1]}, n.z(), domainFlag ) )", - "{", - f" element.wny = {inv_offset[1]};", + "}", + f"if( flagField->isPartOfMaskSet( n.x(), n.y() + {inv_offset[1]}, n.z(), domainFlag ) )", + "{", + f" element.wny = {inv_offset[1]};", " ref_dir = y_axis_mirrored_stencil_dir[ ref_dir ];", - "}"] + 
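The two benchmark-configuration hunks above (CPU and GPU) replace a hardcoded `(2, 1, 1)` block grid with `block_decomposition(wlb.mpi.numProcesses())` and flip `oneBlockPerProcess` to `True`, so the domain partitioning scales with the MPI process count and each block is owned by exactly one rank. A minimal sketch of what a decomposition helper along these lines computes — waLBerla ships its own `block_decomposition`, whose actual strategy may differ:

```python
def block_decomposition_sketch(num_processes: int) -> tuple:
    """Split num_processes into three near-balanced factors (bx, by, bz).

    Illustrative only; not waLBerla's actual implementation.
    """
    blocks = [1, 1, 1]
    remaining = num_processes
    factor = 2
    while remaining > 1:
        while remaining % factor:  # next prime factor of what is left
            factor += 1
        # grow the currently smallest axis to keep the grid compact
        blocks[blocks.index(min(blocks))] *= factor
        remaining //= factor
    return tuple(blocks)

assert block_decomposition_sketch(1) == (1, 1, 1)
assert block_decomposition_sketch(8) == (2, 2, 2)
assert block_decomposition_sketch(12) == (2, 2, 3)
```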
"}"] if self._dim == 3: init_list += ["element.wnz = 0;", f"if( flagField->isPartOfMaskSet( n.x(), n.y(), n.z() + {inv_offset[2]}, domainFlag ) )", @@ -157,7 +156,7 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler): self._target = target super(OutflowAdditionalDataHandler, self).__init__(stencil=stencil) - assert sum([a != 0 for a in self._normal_direction]) == 1,\ + assert sum([a != 0 for a in self._normal_direction]) == 1, \ "The outflow boundary is only implemented for straight walls at the moment." @property diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py index 985193f1434dd43d4294067a46ea7ba2ac01dbb3..53e5d877e14f5584c1e2eb7738fe018752998485 100644 --- a/python/lbmpy_walberla/packing_kernels.py +++ b/python/lbmpy_walberla/packing_kernels.py @@ -17,6 +17,7 @@ from lbmpy.advanced_streaming.communication import _extend_dir from lbmpy.enums import Stencil from lbmpy.stencils import LBStencil +from pystencils_walberla.cmake_integration import CodeGenerationContext from pystencils_walberla.kernel_selection import KernelFamily, KernelCallNode, SwitchNode from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env from pystencils_walberla.utility import config_from_context @@ -25,7 +26,8 @@ from lbmpy_walberla.alternating_sweeps import EvenIntegerCondition from lbmpy_walberla.utility import timestep_suffix -def generate_packing_kernels(generation_context, class_name: str, stencil: LBStencil, streaming_pattern: str = 'pull', +def generate_packing_kernels(generation_context: CodeGenerationContext, class_name: str, + stencil: LBStencil, streaming_pattern: str = 'pull', namespace='lbm', nonuniform: bool = False, target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False, **create_kernel_params): @@ -72,7 +74,7 @@ def generate_packing_kernels(generation_context, class_name: str, stencil: LBSte header = env.get_template(f"{template_name}.tmpl.h").render(**jinja_context) source = env.get_template(f"{template_name}.tmpl.cpp").render(**jinja_context) - source_extension = "cpp" if target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/lbmpy_walberla/storage_specification.py b/python/lbmpy_walberla/storage_specification.py index de82603a022bb45c74db8cbadcb35eee724775ff..544867aac60f440b05b89d16cbbebd0fe2cd5f89 100644 --- a/python/lbmpy_walberla/storage_specification.py +++ b/python/lbmpy_walberla/storage_specification.py @@ -10,12 +10,13 @@ from lbmpy import LBMConfig from lbmpy.advanced_streaming import is_inplace from lbmpy.methods import AbstractLbMethod +from pystencils_walberla.cmake_integration import CodeGenerationContext from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env from pystencils_walberla.utility import config_from_context from lbmpy_walberla.packing_kernels import PackingKernelsCodegen -def generate_lbm_storage_specification(generation_context, class_name: str, +def generate_lbm_storage_specification(generation_context: CodeGenerationContext, class_name: str, method: AbstractLbMethod, lbm_config: LBMConfig, nonuniform: bool = False, target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False, **create_kernel_params): @@ -83,6 +84,6 @@ def generate_lbm_storage_specification(generation_context, class_name: str, header = 
env.get_template('LbmStorageSpecification.tmpl.h').render(**jinja_context) source = env.get_template('LbmStorageSpecification.tmpl.cpp').render(**jinja_context) - source_extension = "cpp" if target == Target.CPU else "cu" + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" generation_context.write_file(f"{class_name}.h", header) generation_context.write_file(f"{class_name}.{source_extension}", source) diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py deleted file mode 100644 index 59fd6028fafcd2bab784c1a595f8511464108def..0000000000000000000000000000000000000000 --- a/python/pystencils_walberla/codegen.py +++ /dev/null @@ -1,534 +0,0 @@ -import warnings -from collections import OrderedDict, defaultdict -from dataclasses import replace -from itertools import product -from typing import Dict, Optional, Sequence, Tuple - -from jinja2 import Environment, PackageLoader, StrictUndefined - -from pystencils import Target, CreateKernelConfig -from pystencils import (Assignment, AssignmentCollection, Field, FieldType, create_kernel, create_staggered_kernel) -from pystencils.astnodes import KernelFunction -from pystencils.backends.cbackend import get_headers -from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets -from pystencils.stencil import inverse_direction, offset_to_direction_string - -from pystencils.backends.cuda_backend import CudaSympyPrinter -from pystencils.typing.typed_sympy import SHAPE_DTYPE -from pystencils.typing import TypedSymbol - -from pystencils_walberla.jinja_filters import add_pystencils_filters_to_jinja_env -from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, HighLevelInterfaceSpec - - -__all__ = ['generate_sweep', 'generate_pack_info', 'generate_pack_info_for_field', 'generate_pack_info_from_kernel', - 'generate_mpidtype_info_from_kernel', 'KernelInfo', - 'get_vectorize_instruction_set', 'config_from_context', 'generate_selective_sweep'] - - -def generate_sweep(generation_context, class_name, assignments, - namespace='pystencils', field_swaps=(), staggered=False, varying_parameters=(), - inner_outer_split=False, ghost_layers_to_include=0, - target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None, - **create_kernel_params): - """Generates a waLBerla sweep from a pystencils representation. - - The constructor of the C++ sweep class expects all kernel parameters (fields and parameters) in alphabetical order. - Fields have to passed using BlockDataID's pointing to walberla fields - - Args: - generation_context: build system context filled with information from waLBerla's CMake. The context for example - defines where to write generated files, if OpenMP is available or which SIMD instruction - set should be used. See waLBerla examples on how to get a context. - class_name: name of the generated sweep class - assignments: list of assignments defining the stencil update rule or a :class:`KernelFunction` - namespace: the generated class is accessible as walberla::<namespace>::<class_name> - field_swaps: sequence of field pairs (field, temporary_field). The generated sweep only gets the first field - as argument, creating a temporary field internally which is swapped with the first field after - each iteration. 
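The `field_swaps` behaviour documented just above (and preserved in the relocated sweep.py further down) is the least obvious parameter, so a usage sketch may help: the generated class receives only the primary field's BlockDataID, allocates the temporary itself, and swaps the two after every sweep. Field names and the trivial copy rule are illustrative, and the imports assume the usual package-level re-exports:

```python
import pystencils as ps
from pystencils_walberla import CodeGeneration, generate_sweep

with CodeGeneration() as ctx:
    pdfs, pdfs_tmp = ps.fields("pdfs(9), pdfs_tmp(9): double[2D]")
    # toy update rule writing into the temporary; a real kernel would
    # stream/collide here
    update = [ps.Assignment(pdfs_tmp(i), pdfs(i)) for i in range(9)]
    # the generated class exposes only 'pdfs'; 'pdfs_tmp' is created
    # internally and the two are swapped after each cell loop
    generate_sweep(ctx, 'ExampleSweep', update, field_swaps=[(pdfs, pdfs_tmp)])
```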
- staggered: set to True to create staggered kernels with `pystencils.create_staggered_kernel` - varying_parameters: Depending on the configuration, the generated kernels may receive different arguments for - different setups. To not have to adapt the C++ application when then parameter change, - the varying_parameters sequence can contain parameter names, which are always expected by - the C++ class constructor even if the kernel does not need them. - inner_outer_split: if True generate a sweep that supports separate iteration over inner and outer regions - to allow for communication hiding. - ghost_layers_to_include: determines how many ghost layers should be included for the Sweep. - This is relevant if a setter kernel should also set correct values to the ghost layers. - target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target - data_type: default datatype for the kernel creation. Default is double - cpu_openmp: if loops should use openMP or not. - cpu_vectorize_info: dictionary containing necessary information for the usage of a SIMD instruction set. - max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__` - **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` - """ - if staggered: - assert 'omp_single_loop' not in create_kernel_params - create_kernel_params['omp_single_loop'] = False - config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, - cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) - - if isinstance(assignments, KernelFunction): - ast = assignments - target = ast.target - elif not staggered: - ast = create_kernel(assignments, config=config) - else: - # This should not be necessary but create_staggered_kernel does not take a config at the moment ... - ast = create_staggered_kernel(assignments, **config.__dict__) - - ast.function_name = class_name.lower() - - selection_tree = KernelCallNode(ast) - generate_selective_sweep(generation_context, class_name, selection_tree, target=target, namespace=namespace, - field_swaps=field_swaps, varying_parameters=varying_parameters, - inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, - cpu_vectorize_info=config.cpu_vectorize_info, - cpu_openmp=config.cpu_openmp, max_threads=max_threads) - - -def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None, - namespace='pystencils', field_swaps=(), varying_parameters=(), - inner_outer_split=False, ghost_layers_to_include=0, - cpu_vectorize_info=None, cpu_openmp=False, max_threads=None): - """Generates a selective sweep from a kernel selection tree. A kernel selection tree consolidates multiple - pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`. - - Args: - generation_context: see documentation of `generate_sweep` - class_name: name of the generated sweep class - selection_tree: Instance of `AbstractKernelSelectionNode`, root of the selection tree - interface_mappings: sequence of `AbstractInterfaceArgumentMapping` instances for selection arguments of - the selection tree - target: `None`, `Target.CPU` or `Target.GPU`; inferred from kernels if `None` is given. 
- namespace: see documentation of `generate_sweep` - field_swaps: see documentation of `generate_sweep` - varying_parameters: see documentation of `generate_sweep` - inner_outer_split: see documentation of `generate_sweep` - ghost_layers_to_include: see documentation of `generate_sweep` - cpu_vectorize_info: Dictionary containing information about CPU vectorization applied to the kernels - cpu_openmp: Whether or not CPU kernels use OpenMP parallelization - max_threads: only relevant for GPU kernels. Will be argument of `__launch_bounds__` - """ - def to_name(f): - return f.name if isinstance(f, Field) else f - - field_swaps = tuple((to_name(e[0]), to_name(e[1])) for e in field_swaps) - temporary_fields = tuple(e[1] for e in field_swaps) - - kernel_family = KernelFamily(selection_tree, class_name, - temporary_fields, field_swaps, varying_parameters) - - if target is None: - target = kernel_family.get_ast_attr('target') - elif target != kernel_family.get_ast_attr('target'): - raise ValueError('Mismatch between target parameter and AST targets.') - - if not generation_context.gpu and target == Target.GPU: - return - - representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} - representative_field = sorted(representative_field)[0] - - env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) - add_pystencils_filters_to_jinja_env(env) - - interface_spec = HighLevelInterfaceSpec(kernel_family.kernel_selection_parameters, interface_mappings) - - jinja_context = { - 'kernel': kernel_family, - 'namespace': namespace, - 'class_name': class_name, - 'target': target.name.lower(), - 'field': representative_field, - 'ghost_layers_to_include': ghost_layers_to_include, - 'inner_outer_split': inner_outer_split, - 'interface_spec': interface_spec, - 'generate_functor': True, - 'cpu_vectorize_info': cpu_vectorize_info, - 'cpu_openmp': cpu_openmp, - 'max_threads': max_threads - } - header = env.get_template("Sweep.tmpl.h").render(**jinja_context) - source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) - - source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" - generation_context.write_file(f"{class_name}.h", header) - generation_context.write_file(f"{class_name}.{source_extension}", source) - - -def generate_pack_info_for_field(generation_context, class_name: str, field: Field, - direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None, - operator=None, gl_to_inner=False, - target=Target.CPU, data_type=None, cpu_openmp=False, - **create_kernel_params): - """Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements. - - Args: - generation_context: see documentation of `generate_sweep` - class_name: name of the generated class - field: pystencils field for which to generate pack info - direction_subset: optional sequence of directions for which values should be packed - otherwise a D3Q27 stencil is assumed - operator: optional operator for, e.g., reduction pack infos - gl_to_inner: communicates values from ghost layers of sender to interior of receiver - target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target - data_type: default datatype for the kernel creation. Default is double - cpu_openmp: if loops should use openMP or not. 
- **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` - """ - - if not direction_subset: - direction_subset = tuple((i, j, k) for i, j, k in product(*[(-1, 0, 1)] * 3)) - - all_index_accesses = [field(*ind) for ind in product(*[range(s) for s in field.index_shape])] - return generate_pack_info(generation_context, class_name, {direction_subset: all_index_accesses}, operator=operator, - gl_to_inner=gl_to_inner, target=target, data_type=data_type, cpu_openmp=cpu_openmp, - **create_kernel_params) - - -def generate_pack_info_from_kernel(generation_context, class_name: str, assignments: Sequence[Assignment], - kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False, - **create_kernel_params): - """Generates a waLBerla GPU PackInfo from a (pull) kernel. - - Args: - generation_context: see documentation of `generate_sweep` - class_name: name of the generated class - assignments: list of assignments from the compute kernel - generates PackInfo for "pull" part only - i.e. the kernel is expected to only write to the center - kind: can either be pull or push - operator: optional operator for, e.g., reduction pack infos - target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target - data_type: default datatype for the kernel creation. Default is double - cpu_openmp: if loops should use openMP or not. - **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` - """ - assert kind in ('push', 'pull') - reads = set() - writes = set() - - if isinstance(assignments, AssignmentCollection): - assignments = assignments.all_assignments - - for a in assignments: - if not isinstance(a, Assignment): - continue - reads.update(a.rhs.atoms(Field.Access)) - writes.update(a.lhs.atoms(Field.Access)) - spec = defaultdict(set) - if kind == 'pull': - for fa in reads: - assert all(abs(e) <= 1 for e in fa.offsets) - if all(offset == 0 for offset in fa.offsets): - continue - comm_direction = inverse_direction(fa.offsets) - for comm_dir in comm_directions(comm_direction): - spec[(comm_dir,)].add(fa.field.center(*fa.index)) - elif kind == 'push': - for fa in writes: - assert all(abs(e) <= 1 for e in fa.offsets) - if all(offset == 0 for offset in fa.offsets): - continue - for comm_dir in comm_directions(fa.offsets): - spec[(comm_dir,)].add(fa) - else: - raise ValueError("Invalid 'kind' parameter") - return generate_pack_info(generation_context, class_name, spec, operator=operator, - target=target, data_type=data_type, cpu_openmp=cpu_openmp, **create_kernel_params) - - -def generate_pack_info(generation_context, class_name: str, - directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]], - namespace='pystencils', operator=None, gl_to_inner=False, - target=Target.CPU, data_type=None, cpu_openmp=False, - **create_kernel_params): - """Generates a waLBerla GPU PackInfo - - Args: - generation_context: see documentation of `generate_sweep` - class_name: name of the generated class - directions_to_pack_terms: maps tuples of directions to read field accesses, specifying which values have to be - packed for which direction - namespace: inner namespace of the generated class - operator: optional operator for, e.g., reduction pack infos - gl_to_inner: communicates values from ghost layers of sender to interior of receiver - target: An pystencils Target to define cpu or gpu code generation. See pystencils.Target - data_type: default datatype for the kernel creation. 
Default is double - cpu_openmp: if loops should use openMP or not. - **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` - """ - if cpu_openmp: - raise ValueError("The packing kernels are already called inside an OpenMP parallel region. Thus " - "additionally parallelising each kernel is not supported.") - items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()] - items = sorted(items, key=lambda e: e[0]) - directions_to_pack_terms = OrderedDict(items) - - config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, - **create_kernel_params) - - config_zero_gl = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, - ghost_layers=0, **create_kernel_params) - - # Vectorisation of the pack info is not implemented. - config = replace(config, cpu_vectorize_info=None) - config_zero_gl = replace(config_zero_gl, cpu_vectorize_info=None) - - config = replace(config, allow_double_writes=True) - config_zero_gl = replace(config_zero_gl, allow_double_writes=True) - - template_name = "CpuPackInfo.tmpl" if config.target == Target.CPU else 'GpuPackInfo.tmpl' - - fields_accessed = set() - for terms in directions_to_pack_terms.values(): - for term in terms: - assert isinstance(term, Field.Access) # and all(e == 0 for e in term.offsets) - fields_accessed.add(term) - - field_names = {fa.field.name for fa in fields_accessed} - - data_types = {fa.field.dtype for fa in fields_accessed} - if len(data_types) == 0: - raise ValueError("No fields to pack!") - if len(data_types) != 1: - err_detail = "\n".join(f" - {f.name} [{f.dtype}]" for f in fields_accessed) - raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail) - dtype = data_types.pop() - - pack_kernels = OrderedDict() - unpack_kernels = OrderedDict() - all_accesses = set() - elements_per_cell = OrderedDict() - for direction_set, terms in directions_to_pack_terms.items(): - for d in direction_set: - if not all(abs(i) <= 1 for i in d): - raise NotImplementedError("Only first neighborhood supported") - - buffer = Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER, - dtype=dtype.numpy_dtype, index_shape=(len(terms),)) - - direction_strings = tuple(offset_to_direction_string(d) for d in direction_set) - all_accesses.update(terms) - - pack_assignments = [Assignment(buffer(i), term) for i, term in enumerate(terms)] - pack_ast = create_kernel(pack_assignments, config=config_zero_gl) - pack_ast.function_name = 'pack_{}'.format("_".join(direction_strings)) - if operator is None: - unpack_assignments = [Assignment(term, buffer(i)) for i, term in enumerate(terms)] - else: - unpack_assignments = [Assignment(term, operator(term, buffer(i))) for i, term in enumerate(terms)] - unpack_ast = create_kernel(unpack_assignments, config=config_zero_gl) - unpack_ast.function_name = 'unpack_{}'.format("_".join(direction_strings)) - - pack_kernels[direction_strings] = KernelInfo(pack_ast) - unpack_kernels[direction_strings] = KernelInfo(unpack_ast) - elements_per_cell[direction_strings] = len(terms) - fused_kernel = create_kernel([Assignment(buffer.center, t) for t in all_accesses], config=config) - - jinja_context = { - 'class_name': class_name, - 'pack_kernels': pack_kernels, - 'unpack_kernels': unpack_kernels, - 'fused_kernel': KernelInfo(fused_kernel), - 'elements_per_cell': elements_per_cell, - 'headers': 
get_headers(fused_kernel), - 'target': config.target.name.lower(), - 'dtype': dtype, - 'field_name': field_names.pop(), - 'namespace': namespace, - 'gl_to_inner': gl_to_inner, - } - env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) - add_pystencils_filters_to_jinja_env(env) - header = env.get_template(template_name + ".h").render(**jinja_context) - source = env.get_template(template_name + ".cpp").render(**jinja_context) - - source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" - generation_context.write_file(f"{class_name}.h", header) - generation_context.write_file(f"{class_name}.{source_extension}", source) - - -def generate_mpidtype_info_from_kernel(generation_context, class_name: str, - assignments: Sequence[Assignment], kind='pull', namespace='pystencils'): - assert kind in ('push', 'pull') - reads = set() - writes = set() - - if isinstance(assignments, AssignmentCollection): - assignments = assignments.all_assignments - - for a in assignments: - if not isinstance(a, Assignment): - continue - reads.update(a.rhs.atoms(Field.Access)) - writes.update(a.lhs.atoms(Field.Access)) - - spec = defaultdict(set) - if kind == 'pull': - read_fields = set(fa.field for fa in reads) - assert len(read_fields) == 1, "Only scenarios where one fields neighbors are accessed" - field = read_fields.pop() - for fa in reads: - assert all(abs(e) <= 1 for e in fa.offsets) - if all(offset == 0 for offset in fa.offsets): - continue - comm_direction = inverse_direction(fa.offsets) - for comm_dir in comm_directions(comm_direction): - assert len(fa.index) == 1, "Supports only fields with a single index dimension" - spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0]) - elif kind == 'push': - written_fields = set(fa.field for fa in writes) - assert len(written_fields) == 1, "Only scenarios where one fields neighbors are accessed" - field = written_fields.pop() - - for fa in writes: - assert all(abs(e) <= 1 for e in fa.offsets) - if all(offset == 0 for offset in fa.offsets): - continue - for comm_dir in comm_directions(fa.offsets): - assert len(fa.index) == 1, "Supports only fields with a single index dimension" - spec[(offset_to_direction_string(comm_dir),)].add(fa.index[0]) - else: - raise ValueError("Invalid 'kind' parameter") - - jinja_context = { - 'class_name': class_name, - 'namespace': namespace, - 'kind': kind, - 'field_name': field.name, - 'f_size': field.index_shape[0], - 'spec': spec, - } - env = Environment(loader=PackageLoader('pystencils_walberla'), undefined=StrictUndefined) - header = env.get_template("MpiDtypeInfo.tmpl.h").render(**jinja_context) - generation_context.write_file(f"{class_name}.h", header) - - -# ---------------------------------- Internal -------------------------------------------------------------------------- - - -class KernelInfo: - def __init__(self, ast, temporary_fields=(), field_swaps=(), varying_parameters=()): - self.ast = ast - self.temporary_fields = tuple(temporary_fields) - self.field_swaps = tuple(field_swaps) - self.varying_parameters = tuple(varying_parameters) - self.parameters = ast.get_parameters() # cache parameters here - - @property - def fields_accessed(self): - return self.ast.fields_accessed - - def get_ast_attr(self, name): - """Returns the value of an attribute of the AST managed by this KernelInfo. 
- For compatibility with KernelFamily.""" - return self.ast.__getattribute__(name) - - def generate_kernel_invocation_code(self, **kwargs): - ast = self.ast - ast_params = self.parameters - fnc_name = ast.function_name - is_cpu = self.ast.target == Target.CPU - call_parameters = ", ".join([p.symbol.name for p in ast_params]) - - if not is_cpu: - stream = kwargs.get('stream', '0') - spatial_shape_symbols = kwargs.get('spatial_shape_symbols', ()) - - if not spatial_shape_symbols: - spatial_shape_symbols = [p.symbol for p in ast_params if p.is_field_shape] - spatial_shape_symbols.sort(key=lambda e: e.coordinate) - else: - spatial_shape_symbols = [TypedSymbol(s, SHAPE_DTYPE) for s in spatial_shape_symbols] - - assert spatial_shape_symbols, "No shape parameters in kernel function arguments.\n"\ - "Please only use kernels for generic field sizes!" - - indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols) - sp_printer_c = CudaSympyPrinter() - block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block']) - grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) - - kernel_call_lines = [ - f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));", - f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));", - f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" - ] - - return "\n".join(kernel_call_lines) - else: - return f"internal_{fnc_name}::{fnc_name}({call_parameters});" - - -def get_vectorize_instruction_set(generation_context): - if generation_context.optimize_for_localhost: - supported_instruction_sets = get_supported_instruction_sets() - if supported_instruction_sets: - return supported_instruction_sets[-1] - else: # if cpuinfo package is not installed - warnings.warn("Could not obtain supported vectorization instruction sets - defaulting to sse. " - "This problem can probably be fixed by installing py-cpuinfo. This package can " - "gather the needed hardware information.") - return 'sse' - else: - return None - - -def config_from_context(generation_context, target=Target.CPU, data_type=None, - cpu_openmp=None, cpu_vectorize_info=None, **kwargs): - - if target == Target.GPU and not generation_context.gpu: - raise ValueError("can not generate device code if waLBerla is not build with CUDA or HIP. Please use " - "-DWALBERLA_BUILD_WITH_CUDA=1 or -DWALBERLA_BUILD_WITH_HIP=1 for configuring cmake") - - default_dtype = "float64" if generation_context.double_accuracy else "float32" - if data_type is None: - data_type = default_dtype - - if cpu_openmp and not generation_context.openmp: - warnings.warn("Code is generated with OpenMP pragmas but waLBerla is not build with OpenMP. " - "The compilation might not work due to wrong compiler flags. 
" - "Please use -DWALBERLA_BUILD_WITH_OPENMP=1 for configuring cmake") - - if cpu_openmp is None: - cpu_openmp = generation_context.openmp - - if cpu_vectorize_info is None: - cpu_vectorize_info = {} - - default_vec_is = get_vectorize_instruction_set(generation_context) - - cpu_vectorize_info['instruction_set'] = cpu_vectorize_info.get('instruction_set', default_vec_is) - cpu_vectorize_info['assume_inner_stride_one'] = cpu_vectorize_info.get('assume_inner_stride_one', True) - cpu_vectorize_info['assume_aligned'] = cpu_vectorize_info.get('assume_aligned', False) - cpu_vectorize_info['nontemporal'] = cpu_vectorize_info.get('nontemporal', False) - - config = CreateKernelConfig(target=target, data_type=data_type, default_number_float=data_type, - cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, - **kwargs) - - return config - - -def comm_directions(direction): - if all(e == 0 for e in direction): - yield direction - binary_numbers_list = binary_numbers(len(direction)) - for comm_direction in binary_numbers_list: - for i in range(len(direction)): - if direction[i] == 0: - comm_direction[i] = 0 - if direction[i] == -1 and comm_direction[i] == 1: - comm_direction[i] = -1 - if not all(e == 0 for e in comm_direction): - yield tuple(comm_direction) - - -def binary_numbers(n): - result = list() - for i in range(1 << n): - binary_number = bin(i)[2:] - binary_number = '0' * (n - len(binary_number)) + binary_number - result.append((list(map(int, binary_number)))) - return result diff --git a/python/pystencils_walberla/kernel_info.py b/python/pystencils_walberla/kernel_info.py index 019843f903407717a222c5a44cc945f59a88fcfa..586c05abe21dcc07e5920ba8cc759120089a4e4e 100644 --- a/python/pystencils_walberla/kernel_info.py +++ b/python/pystencils_walberla/kernel_info.py @@ -58,8 +58,8 @@ class KernelInfo: grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) kernel_call_lines = [ - f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));", - f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));", + f"dim3 _block(uint32_c({block[0]}), uint32_c({block[1]}), uint32_c({block[2]}));", + f"dim3 _grid(uint32_c({grid[0]}), uint32_c({grid[1]}), uint32_c({grid[2]}));", f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" ] diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py index 544b86de027af9f810df34f9bf10a21db7444fd6..ad8a99867e0970b102409823f6a17258983bae2b 100644 --- a/python/pystencils_walberla/kernel_selection.py +++ b/python/pystencils_walberla/kernel_selection.py @@ -195,8 +195,8 @@ class KernelCallNode(AbstractKernelSelectionNode): grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid']) kernel_call_lines = [ - f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));", - f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));", + f"dim3 _block(uint32_c({block[0]}), uint32_c({block[1]}), uint32_c({block[2]}));", + f"dim3 _grid(uint32_c({grid[0]}), uint32_c({grid[1]}), uint32_c({grid[2]}));", f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});" ] diff --git a/python/pystencils_walberla/pack_info.py b/python/pystencils_walberla/pack_info.py index 221a946e004143f0f02c3a2663df6726add4027f..df84c71c7614695b459a49da88273761fb8048f0 100644 --- a/python/pystencils_walberla/pack_info.py +++ b/python/pystencils_walberla/pack_info.py @@ -15,7 +15,7 @@ from 
pystencils_walberla.kernel_info import KernelInfo from pystencils_walberla.utility import config_from_context -def generate_pack_info_for_field(ctx: CodeGenerationContext, class_name: str, field: Field, +def generate_pack_info_for_field(generation_context: CodeGenerationContext, class_name: str, field: Field, direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None, operator=None, gl_to_inner=False, target=Target.CPU, data_type=None, cpu_openmp=False, @@ -23,7 +23,7 @@ def generate_pack_info_for_field(ctx: CodeGenerationContext, class_name: str, fi """Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements. Args: - ctx: see documentation of `generate_sweep` + generation_context: see documentation of `generate_sweep` class_name: name of the generated class field: pystencils field for which to generate pack info direction_subset: optional sequence of directions for which values should be packed @@ -40,18 +40,19 @@ def generate_pack_info_for_field(ctx: CodeGenerationContext, class_name: str, fi direction_subset = tuple((i, j, k) for i, j, k in product(*[(-1, 0, 1)] * 3)) all_index_accesses = [field(*ind) for ind in product(*[range(s) for s in field.index_shape])] - return generate_pack_info(ctx, class_name, {direction_subset: all_index_accesses}, operator=operator, + return generate_pack_info(generation_context, class_name, {direction_subset: all_index_accesses}, operator=operator, gl_to_inner=gl_to_inner, target=target, data_type=data_type, cpu_openmp=cpu_openmp, **create_kernel_params) -def generate_pack_info_from_kernel(ctx: CodeGenerationContext, class_name: str, assignments: Sequence[Assignment], - kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False, +def generate_pack_info_from_kernel(generation_context: CodeGenerationContext, class_name: str, + assignments: Sequence[Assignment], kind='pull', operator=None, + target=Target.CPU, data_type=None, cpu_openmp=False, **create_kernel_params): """Generates a waLBerla GPU PackInfo from a (pull) kernel. Args: - ctx: see documentation of `generate_sweep` + generation_context: see documentation of `generate_sweep` class_name: name of the generated class assignments: list of assignments from the compute kernel - generates PackInfo for "pull" part only i.e. 
the kernel is expected to only write to the center @@ -92,11 +93,11 @@ def generate_pack_info_from_kernel(ctx: CodeGenerationContext, class_name: str, spec[(comm_dir,)].add(fa) else: raise ValueError("Invalid 'kind' parameter") - return generate_pack_info(ctx, class_name, spec, operator=operator, + return generate_pack_info(generation_context, class_name, spec, operator=operator, target=target, data_type=data_type, cpu_openmp=cpu_openmp, **create_kernel_params) -def generate_pack_info(ctx: CodeGenerationContext, class_name: str, +def generate_pack_info(generation_context: CodeGenerationContext, class_name: str, directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]], namespace='pystencils', operator=None, gl_to_inner=False, target=Target.CPU, data_type=None, cpu_openmp=False, @@ -104,7 +105,7 @@ def generate_pack_info(ctx: CodeGenerationContext, class_name: str, """Generates a waLBerla GPU PackInfo Args: - ctx: see documentation of `generate_sweep` + generation_context: see documentation of `generate_sweep` class_name: name of the generated class directions_to_pack_terms: maps tuples of directions to read field accesses, specifying which values have to be packed for which direction @@ -123,10 +124,10 @@ def generate_pack_info(ctx: CodeGenerationContext, class_name: str, items = sorted(items, key=lambda e: e[0]) directions_to_pack_terms = OrderedDict(items) - config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, **create_kernel_params) - config_zero_gl = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + config_zero_gl = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, ghost_layers=0, **create_kernel_params) # Vectorisation of the pack info is not implemented. 
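The write-out hunk below switches pack_info.py to the same extension rule the packing-kernel and storage-specification generators adopted above: a `.cu` file is emitted only when the target is the GPU *and* the build is configured for CUDA; CPU targets and, presumably, HIP builds (where device code is compiled from C++ sources) get `.cpp`. Because the expression now recurs across several generators, it could plausibly be factored into a tiny shared helper — a sketch, not an existing API:

```python
from pystencils import Target

def source_file_extension(generation_context, target) -> str:
    """Return 'cu' only for GPU targets in a CUDA build, else 'cpp'."""
    return "cu" if target == Target.GPU and generation_context.cuda else "cpp"

# usage inside any of the generators touched in this patch:
#   ext = source_file_extension(generation_context, target)
#   generation_context.write_file(f"{class_name}.{ext}", source)
```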
@@ -202,9 +203,9 @@ def generate_pack_info(ctx: CodeGenerationContext, class_name: str, header = env.get_template(template_name + ".h").render(**jinja_context) source = env.get_template(template_name + ".cpp").render(**jinja_context) - source_extension = "cpp" if config.target == Target.CPU else "cu" - ctx.write_file(f"{class_name}.h", header) - ctx.write_file(f"{class_name}.{source_extension}", source) + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) def generate_mpidtype_info_from_kernel(ctx: CodeGenerationContext, class_name: str, diff --git a/python/pystencils_walberla/sweep.py b/python/pystencils_walberla/sweep.py index ddf9a2a52b0de504394becdf99127a06f866383d..c404618b21f79b0fcb5efb5a7d93a49ac327f86d 100644 --- a/python/pystencils_walberla/sweep.py +++ b/python/pystencils_walberla/sweep.py @@ -12,7 +12,7 @@ from pystencils_walberla.kernel_selection import KernelCallNode, KernelFamily, H from pystencils_walberla.utility import config_from_context -def generate_sweep(ctx: CodeGenerationContext, class_name: str, assignments: Sequence[Assignment], +def generate_sweep(generation_context: CodeGenerationContext, class_name: str, assignments: Sequence[Assignment], namespace: str = 'pystencils', field_swaps=(), staggered=False, varying_parameters=(), inner_outer_split=False, ghost_layers_to_include=0, target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None, max_threads=None, @@ -23,7 +23,7 @@ def generate_sweep(ctx: CodeGenerationContext, class_name: str, assignments: Seq Fields have to passed using BlockDataID's pointing to walberla fields Args: - ctx: build system context filled with information from waLBerla's CMake. The context for example + generation_context: build system context filled with information from waLBerla's CMake. The context for example defines where to write generated files, if OpenMP is available or which SIMD instruction set should be used. See waLBerla examples on how to get a context. 
class_name: name of the generated sweep class @@ -51,7 +51,7 @@ def generate_sweep(ctx: CodeGenerationContext, class_name: str, assignments: Seq if staggered: assert 'omp_single_loop' not in create_kernel_params create_kernel_params['omp_single_loop'] = False - config = config_from_context(ctx, target=target, data_type=data_type, cpu_openmp=cpu_openmp, + config = config_from_context(generation_context, target=target, data_type=data_type, cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params) if isinstance(assignments, KernelFunction): @@ -66,14 +66,14 @@ def generate_sweep(ctx: CodeGenerationContext, class_name: str, assignments: Seq ast.function_name = class_name.lower() selection_tree = KernelCallNode(ast) - generate_selective_sweep(ctx, class_name, selection_tree, target=target, namespace=namespace, + generate_selective_sweep(generation_context, class_name, selection_tree, target=target, namespace=namespace, field_swaps=field_swaps, varying_parameters=varying_parameters, inner_outer_split=inner_outer_split, ghost_layers_to_include=ghost_layers_to_include, cpu_vectorize_info=config.cpu_vectorize_info, cpu_openmp=config.cpu_openmp, max_threads=max_threads) -def generate_selective_sweep(ctx, class_name, selection_tree, interface_mappings=(), target=None, +def generate_selective_sweep(generation_context, class_name, selection_tree, interface_mappings=(), target=None, namespace='pystencils', field_swaps=(), varying_parameters=(), inner_outer_split=False, ghost_layers_to_include=0, cpu_vectorize_info=None, cpu_openmp=False, max_threads=None): @@ -81,7 +81,7 @@ def generate_selective_sweep(ctx, class_name, selection_tree, interface_mappings pystencils ASTs in a tree-like structure. See also module `pystencils_walberla.kernel_selection`. 
Args: - ctx: see documentation of `generate_sweep` + generation_context: see documentation of `generate_sweep` class_name: name of the generated sweep class selection_tree: Instance of `AbstractKernelSelectionNode`, root of the selection tree interface_mappings: sequence of `AbstractInterfaceArgumentMapping` instances for selection arguments of @@ -110,7 +110,7 @@ def generate_selective_sweep(ctx, class_name, selection_tree, interface_mappings elif target != kernel_family.get_ast_attr('target'): raise ValueError('Mismatch between target parameter and AST targets.') - if not ctx.gpu and target == Target.GPU: + if not generation_context.gpu and target == Target.GPU: return representative_field = {p.field_name for p in kernel_family.parameters if p.is_field_parameter} @@ -138,12 +138,13 @@ def generate_selective_sweep(ctx, class_name, selection_tree, interface_mappings header = env.get_template("Sweep.tmpl.h").render(**jinja_context) source = env.get_template("Sweep.tmpl.cpp").render(**jinja_context) - source_extension = "cpp" if target == Target.CPU else "cu" - ctx.write_file(f"{class_name}.h", header) - ctx.write_file(f"{class_name}.{source_extension}", source) + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) -def generate_sweep_collection(ctx, class_name: str, function_generators: Sequence[Callable], parameter_scaling=None): +def generate_sweep_collection(generation_context: CodeGenerationContext, class_name: str, + function_generators: Sequence[Callable], parameter_scaling=None): """Generates a sweep collection """ @@ -194,6 +195,6 @@ def generate_sweep_collection(ctx, class_name: str, function_generators: Sequenc header = env.get_template("SweepCollection.tmpl.h").render(**jinja_context) source = env.get_template("SweepCollection.tmpl.cpp").render(**jinja_context) - source_extension = "cpp" if target == Target.CPU else "cu" - ctx.write_file(f"{class_name}.h", header) - ctx.write_file(f"{class_name}.{source_extension}", source) + source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp" + generation_context.write_file(f"{class_name}.h", header) + generation_context.write_file(f"{class_name}.{source_extension}", source) diff --git a/src/core/timing/DeviceSynchronizePolicy.h b/src/core/timing/DeviceSynchronizePolicy.h index 7c494e48d22b1fb195d52b90334b9c0bed0c2f65..f046883cf7720d1f93641be2cd135ac6818636dd 100644 --- a/src/core/timing/DeviceSynchronizePolicy.h +++ b/src/core/timing/DeviceSynchronizePolicy.h @@ -23,6 +23,7 @@ #pragma once #include "gpu/DeviceWrapper.h" +#include "gpu/ErrorChecking.h" #include "Time.h" @@ -74,7 +75,7 @@ internally. inline double DeviceSynchronizePolicy::getTimestamp() { // synchronize device before getting timestamp - WALBERLA_DEVICE_SECTION() { gpuDeviceSynchronize(); } + WALBERLA_DEVICE_SECTION() {WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) } return getWcTime(); }
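Taken together, these renames leave the entry-point names and positional signatures intact, so a typical generation script keeps working unchanged. For reference, a hedged sketch of such a script exercising `generate_sweep` and `generate_pack_info_from_kernel` together — the field names and the toy averaging stencil are illustrative, and the functions are assumed to be re-exported at package level as before:

```python
import pystencils as ps
from pystencils_walberla import (CodeGeneration, generate_sweep,
                                 generate_pack_info_from_kernel)

with CodeGeneration() as ctx:
    src, dst = ps.fields("src, dst: double[3D]")
    # pull-type update: reads neighbours, writes only the centre
    update = [ps.Assignment(dst.center,
                            (src[1, 0, 0] + src[-1, 0, 0] + src[0, 1, 0] +
                             src[0, -1, 0] + src[0, 0, 1] + src[0, 0, -1]) / 6)]
    generate_sweep(ctx, 'JacobiSweep', update, field_swaps=[(src, dst)])
    # packs exactly the neighbour values the pull kernel above reads
    generate_pack_info_from_kernel(ctx, 'JacobiPackInfo', update)
```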