diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index 10e855df2fd9a661f053adbdb43dbfb8d969fdf4..d62b4972e2b9fce5f5a70f7aa0cd6b97bc2d52cc 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -164,31 +164,20 @@ class KernelFunction(Node): def field_name(self): return self.fields[0].name - def __init__(self, body, target, backend, compile_function, ghost_layers, function_name="kernel"): + def __init__(self, body, target, compile_function, ghost_layers, function_name="kernel"): super(KernelFunction, self).__init__() self._body = body body.parent = self self.function_name = function_name self._body.parent = self self.ghost_layers = ghost_layers - self._target = target - self._backend = backend + self.target = target # these variables are assumed to be global, so no automatic parameter is generated for them self.global_variables = set() self.instruction_set = None # used in `vectorize` function to tell the backend which i.s. (SSE,AVX) to use # function that compiles the node to a Python callable, is set by the backends self._compile_function = compile_function - @property - def target(self): - """Currently either 'cpu' or 'gpu' """ - return self._target - - @property - def backend(self): - """Backend for generating the code e.g. 'llvm', 'c', 'cuda' """ - return self._backend - @property def symbols_defined(self): return set() diff --git a/pystencils/boundaries/boundaryhandling.py b/pystencils/boundaries/boundaryhandling.py index e19e24ecb1d5cd5e59a53796fc7ae7f0c9aa30fc..42c4b14180655ec3568d45007616a278a359a51e 100644 --- a/pystencils/boundaries/boundaryhandling.py +++ b/pystencils/boundaries/boundaryhandling.py @@ -43,7 +43,7 @@ class FlagInterface: raise ValueError("There is already a boundary handling registered at the data handling." 
"If you want to add multiple handling objects, choose a different name.") - self.flag_field = data_handling.add_array(self.flag_field_name, dtype=self.dtype, cpu=True, gpu=False) + self.flag_field = data_handling.add_array(self.flag_field_name, dtype=self.dtype, cpu=True, acc=False) ff_ghost_layers = data_handling.ghost_layers_of_field(self.flag_field_name) for b in data_handling.iterate(ghost_layers=ff_ghost_layers): b[self.flag_field_name].fill(self.domain_flag) @@ -87,26 +87,23 @@ class BoundaryHandling: fi = flag_interface self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags") - gpu = self._target in self._data_handling._GPU_LIKE_TARGETS - class_ = self.IndexFieldBlockData - if self._target == 'opencl': - def opencl_to_device(gpu_version, cpu_version): - from pyopencl import array - gpu_version = gpu_version.boundary_object_to_index_list - cpu_version = cpu_version.boundary_object_to_index_list - for obj, cpu_arr in cpu_version.items(): - if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape: - from pystencils.opencl.opencljit import get_global_cl_queue - - queue = self._data_handling._opencl_queue or get_global_cl_queue() - gpu_version[obj] = array.to_device(queue, cpu_arr) - else: - gpu_version[obj].set(cpu_arr) - - class_ = type('opencl_class', (self.IndexFieldBlockData,), { - 'to_gpu': opencl_to_device - }) - data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu) + def to_cpu(gpu_version, cpu_version): + gpu_version = gpu_version.boundary_object_to_index_list + cpu_version = cpu_version.boundary_object_to_index_list + for obj, cpu_arr in cpu_version.items(): + gpu_version[obj].get(cpu_arr) + + def to_acc(gpu_version, cpu_version): + gpu_version = gpu_version.boundary_object_to_index_list + cpu_version = cpu_version.boundary_object_to_index_list + for obj, cpu_arr in cpu_version.items(): + if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape: + gpu_version[obj] = self.data_handling.array_handler.to_gpu(cpu_arr) + else: + self.data_handling.array_handler.upload(gpu_version[obj], cpu_arr) + + creation_function = lambda: self.IndexFieldBlockData() + data_handling.add_custom_data(self._index_array_name, creation_function, creation_function, to_acc, to_cpu) @property def data_handling(self): @@ -222,7 +219,7 @@ class BoundaryHandling: if self._dirty: self.prepare() - for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS): + for b in self._data_handling.iterate(acc=self._target in self._data_handling.ACCELERATOR_TARGETS): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): kwargs[self._field_name] = b[self._field_name] kwargs['indexField'] = idx_arr @@ -237,7 +234,7 @@ class BoundaryHandling: if self._dirty: self.prepare() - for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS): + for b in self._data_handling.iterate(acc=self._target in self._data_handling.ACCELERATOR_TARGETS): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): arguments = kwargs.copy() arguments[self._field_name] = b[self._field_name] @@ -320,8 +317,8 @@ class BoundaryHandling: def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs): if boundary_obj.additional_data_init_callback: boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs) - if self._target in self._data_handling._GPU_LIKE_TARGETS: - 
self._data_handling.to_gpu(self._index_array_name) + if self._target in self._data_handling.ACCELERATOR_TARGETS: + self._data_handling.to_acc(self._index_array_name) class BoundaryInfo(object): def __init__(self, boundary_obj, flag, kernel): @@ -330,7 +327,7 @@ class BoundaryHandling: self.kernel = kernel class IndexFieldBlockData: - def __init__(self, *_1, **_2): + def __init__(self): self.boundary_object_to_index_list = {} self.boundary_object_to_data_setter = {} @@ -338,25 +335,6 @@ class BoundaryHandling: self.boundary_object_to_index_list.clear() self.boundary_object_to_data_setter.clear() - @staticmethod - def to_cpu(gpu_version, cpu_version): - gpu_version = gpu_version.boundary_object_to_index_list - cpu_version = cpu_version.boundary_object_to_index_list - for obj, cpu_arr in cpu_version.items(): - gpu_version[obj].get(cpu_arr) - - @staticmethod - def to_gpu(gpu_version, cpu_version): - from pycuda import gpuarray - gpu_version = gpu_version.boundary_object_to_index_list - cpu_version = cpu_version.boundary_object_to_index_list - for obj, cpu_arr in cpu_version.items(): - if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape: - gpu_version[obj] = gpuarray.to_gpu(cpu_arr) - else: - gpu_version[obj].set(cpu_arr) - - class BoundaryDataSetter: def __init__(self, index_array, offset, stencil, ghost_layers, pdf_array): diff --git a/pystencils/cpu/kernelcreation.py b/pystencils/cpu/kernelcreation.py index f351ce5a2bb03d723d22a8e1f772b25a934f7994..3a99220d55008d89a8897bc3280ebce0315fcca6 100644 --- a/pystencils/cpu/kernelcreation.py +++ b/pystencils/cpu/kernelcreation.py @@ -64,7 +64,7 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke loop_order = get_optimal_loop_ordering(fields_without_buffers) loop_node, ghost_layer_info = make_loop_over_domain(body, iteration_slice=iteration_slice, ghost_layers=ghost_layers, loop_order=loop_order) - ast_node = KernelFunction(loop_node, 'cpu', 'c', compile_function=make_python_function, + ast_node = KernelFunction(loop_node, 'cpu', compile_function=make_python_function, ghost_layers=ghost_layer_info, function_name=function_name) implement_interpolations(body) @@ -145,7 +145,7 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu loop_body.append(assignment) function_body = Block([loop_node]) - ast_node = KernelFunction(function_body, "cpu", "c", make_python_function, + ast_node = KernelFunction(function_body, "cpu", make_python_function, ghost_layers=None, function_name=function_name) fixed_coordinate_mapping = {f.name: coordinate_typed_symbols for f in non_index_fields} diff --git a/pystencils/datahandling/__init__.py b/pystencils/datahandling/__init__.py index a4fa55bdc7a52e1b9c2015e2210fcbb48aaeb2e1..d0690cb7ac30131faa91d51a0e76ffbc7e820d5c 100644 --- a/pystencils/datahandling/__init__.py +++ b/pystencils/datahandling/__init__.py @@ -18,10 +18,9 @@ except ImportError: def create_data_handling(domain_size: Tuple[int, ...], periodicity: Union[bool, Tuple[bool, ...]] = False, default_layout: str = 'SoA', - default_target: str = 'cpu', + target: str = 'cpu', parallel: bool = False, - default_ghost_layers: int = 1, - opencl_queue=None) -> DataHandling: + default_ghost_layers: int = 1) -> DataHandling: """Creates a data handling instance. 
Args: @@ -29,12 +28,11 @@ def create_data_handling(domain_size: Tuple[int, ...], periodicity: either True, False for full or no periodicity or a tuple of booleans indicating periodicity for each coordinate default_layout: default array layout, that is used if not explicitly specified in 'add_array' - default_target: either 'cpu' or 'gpu' + target: target where code should be run, e.g. 'cpu' or 'cuda' or 'opencl' parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain default_ghost_layers: default number of ghost layers if not overwritten in 'add_array' """ if parallel: - assert not opencl_queue, "OpenCL is only supported for SerialDataHandling" if wlb is None: raise ValueError("Cannot create parallel data handling because walberla module is not available") @@ -55,15 +53,14 @@ def create_data_handling(domain_size: Tuple[int, ...], # noinspection PyArgumentList block_storage = wlb.createUniformBlockGrid(cells=domain_size, periodic=periodicity) - return ParallelDataHandling(blocks=block_storage, dim=dim, default_target=default_target, + return ParallelDataHandling(blocks=block_storage, dim=dim, target=target, default_layout=default_layout, default_ghost_layers=default_ghost_layers) else: return SerialDataHandling(domain_size, periodicity=periodicity, - default_target=default_target, + target=target, default_layout=default_layout, - default_ghost_layers=default_ghost_layers, - opencl_queue=opencl_queue) + default_ghost_layers=default_ghost_layers) __all__ = ['create_data_handling'] diff --git a/pystencils/datahandling/datahandling_interface.py b/pystencils/datahandling/datahandling_interface.py index af1a6ba1fc9d003042063023aa1ede5fc08665db..dc40b226490432c07a651fc0fdaec41a32955479 100644 --- a/pystencils/datahandling/datahandling_interface.py +++ b/pystencils/datahandling/datahandling_interface.py @@ -16,8 +16,7 @@ class DataHandling(ABC): 'gather' function that has collects (parts of the) distributed data on a single process. """ - _GPU_LIKE_TARGETS = ['gpu', 'opencl'] - _GPU_LIKE_BACKENDS = ['gpucuda', 'opencl'] + ACCELERATOR_TARGETS = ['cuda', 'opencl', 'llvm_gpu'] # ---------------------------- Adding and accessing data ----------------------------------------------------------- @@ -39,7 +38,7 @@ class DataHandling(ABC): @abstractmethod def add_array(self, name: str, values_per_cell, dtype=np.float64, latex_name: Optional[str] = None, ghost_layers: Optional[int] = None, layout: Optional[str] = None, - cpu: bool = True, gpu: Optional[bool] = None, alignment=False, field_type=FieldType.GENERIC) -> Field: + cpu: bool = True, acc: Optional[bool] = None, alignment=False, field_type=FieldType.GENERIC) -> Field: """Adds a (possibly distributed) array to the handling that can be accessed using the given name. For each array a symbolic field is available via the 'fields' dictionary @@ -56,8 +55,11 @@ class DataHandling(ABC): layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'. 
this is only important if values_per_cell > 1 cpu: allocate field on the CPU - gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'gpu' + acc: allocate field on an accelerator, if an accelerator target has been selected + if None, an accelerator field is allocated only if the target selected when creating the data handling + is not 'cpu' alignment: either False for no alignment, or the number of bytes to align to + field_type: change from generic to staggered or absolutely accessed fields, see field documentation Returns: pystencils field, that can be used to formulate symbolic kernels """ @@ -67,7 +69,7 @@ class DataHandling(ABC): """Returns true if a field or custom data element with this name was added.""" @abstractmethod - def add_array_like(self, name, name_of_template_field, latex_name=None, cpu=True, gpu=None): + def add_array_like(self, name, name_of_template_field, latex_name=None, cpu=True, acc=None): """ Adds an array with the same parameters (number of ghost layers, values_per_cell, dtype) as existing array. @@ -76,33 +78,33 @@ class DataHandling(ABC): name_of_template_field: name of array that is used as template latex_name: see 'add' method cpu: see 'add' method - gpu: see 'add' method + acc: see 'add' method """ @abstractmethod def add_custom_data(self, name: str, cpu_creation_function, - gpu_creation_function=None, cpu_to_gpu_transfer_func=None, gpu_to_cpu_transfer_func=None): + acc_creation_function=None, cpu_to_acc_transfer_func=None, acc_to_cpu_transfer_func=None): """Adds custom (non-array) data to domain. Args: name: name to access data cpu_creation_function: function returning a new instance of the data that should be stored - gpu_creation_function: optional, function returning a new instance, stored on GPU - cpu_to_gpu_transfer_func: function that transfers cpu to gpu version, - getting two parameters (gpu_instance, cpu_instance) - gpu_to_cpu_transfer_func: function that transfers gpu to cpu version, getting two parameters - (gpu_instance, cpu_instance) + acc_creation_function: optional, function returning a new instance, stored on the accelerator + cpu_to_acc_transfer_func: function that transfers cpu to accelerator version, + getting two parameters (acc_instance, cpu_instance) + acc_to_cpu_transfer_func: function that transfers accelerator to cpu version, getting two parameters + (acc_instance, cpu_instance) """ - def add_custom_class(self, name: str, class_obj, cpu: bool = True, gpu: bool = False): - """Adds non-array data by passing a class object with optional 'to_gpu' and 'to_cpu' member functions.""" - cpu_to_gpu_transfer_func = class_obj.to_gpu if cpu and gpu and hasattr(class_obj, 'to_gpu') else None - gpu_to_cpu_transfer_func = class_obj.to_cpu if cpu and gpu and hasattr(class_obj, 'to_cpu') else None + def add_custom_class(self, name: str, class_obj, cpu: bool = True, acc: bool = False): + """Adds non-array data by passing a class object with optional 'to_acc' and 'to_cpu' member functions.""" + cpu_to_acc_transfer_func = class_obj.to_acc if cpu and acc and hasattr(class_obj, 'to_acc') else None + acc_to_cpu_transfer_func = class_obj.to_cpu if cpu and acc and hasattr(class_obj, 'to_cpu') else None self.add_custom_data(name, cpu_creation_function=class_obj if cpu else None, - gpu_creation_function=class_obj if gpu else None, - cpu_to_gpu_transfer_func=cpu_to_gpu_transfer_func, - gpu_to_cpu_transfer_func=gpu_to_cpu_transfer_func) + acc_creation_function=class_obj if acc else None, + 
cpu_to_acc_transfer_func=cpu_to_acc_transfer_func, + acc_to_cpu_transfer_func=acc_to_cpu_transfer_func) @property @abstractmethod @@ -128,7 +130,7 @@ class DataHandling(ABC): """Returns values_per_cell of array.""" @abstractmethod - def iterate(self, slice_obj=None, gpu=False, ghost_layers=None, + def iterate(self, slice_obj=None, acc=False, ghost_layers=None, inner_ghost_layers=True) -> Iterable['Block']: """Iterate over local part of potentially distributed data structure.""" @@ -157,32 +159,32 @@ class DataHandling(ABC): """ @abstractmethod - def swap(self, name1, name2, gpu=False): + def swap(self, name1, name2, acc=False): """Swaps data of two arrays""" - # ------------------------------- CPU/GPU transfer ----------------------------------------------------------------- + # ------------------------------- CPU/ACC transfer ----------------------------------------------------------------- @abstractmethod def to_cpu(self, name): - """Copies GPU data of array with specified name to CPU. - Works only if 'cpu=True' and 'gpu=True' has been used in 'add' method.""" + """Copies accelerator data of array with specified name to CPU. + Works only if 'cpu=True' and 'acc=True' has been used in 'add' method.""" @abstractmethod - def to_gpu(self, name): - """Copies GPU data of array with specified name to GPU. - Works only if 'cpu=True' and 'gpu=True' has been used in 'add' method.""" + def to_acc(self, name): + """Copies accelerator data of array with specified name to accelerator. + Works only if 'cpu=True' and 'acc=True' has been used in 'add' method.""" @abstractmethod def all_to_cpu(self): - """Copies data from GPU to CPU for all arrays that have a CPU and a GPU representation.""" + """Copies data from accelerator to CPU for all arrays that have a CPU and an accelerator representation.""" @abstractmethod - def all_to_gpu(self): - """Copies data from CPU to GPU for all arrays that have a CPU and a GPU representation.""" + def all_to_acc(self): + """Copies data from CPU to accelerator for all arrays that have a CPU and a accelerator representation.""" @abstractmethod - def is_on_gpu(self, name): - """Checks if this data was also allocated on the GPU - does not check if this data item is in synced.""" + def is_on_acc(self, name): + """Checks if this data was also allocated on the accelerator - does not check if this data item is in synced.""" @abstractmethod def create_vtk_writer(self, file_name, data_names, ghost_layers=False) -> Callable[[int], None]: @@ -216,7 +218,7 @@ class DataHandling(ABC): # ------------------------------- Communication -------------------------------------------------------------------- @abstractmethod - def synchronization_function(self, names, stencil=None, target=None, **kwargs) -> Callable[[], None]: + def synchronization_function(self, names, stencil=None, acc=None, **kwargs) -> Callable[[], None]: """Synchronizes ghost layers for distributed arrays. For serial scenario this has to be called for correct periodicity handling @@ -225,8 +227,9 @@ class DataHandling(ABC): names: what data to synchronize: name of array or sequence of names stencil: stencil as string defining which neighbors are synchronized e.g. 'D2Q9', 'D3Q19' if None, a full synchronization (i.e. 
D2Q9 or D3Q27) is done - target: either 'cpu' or 'gpu - kwargs: implementation specific, optional optimization parameters for communication + acc: synchronize data on accelerator, if None use accelerator when target at construction is an + accelerator target + kwargs: implementation specific Returns: function object to run the communication diff --git a/pystencils/datahandling/parallel_datahandling.py b/pystencils/datahandling/parallel_datahandling.py index 54f26806be318f6ef91a5ca11a9888a59524fb0c..f46b48ea717d22af2fe940b74cd95440717f193c 100644 --- a/pystencils/datahandling/parallel_datahandling.py +++ b/pystencils/datahandling/parallel_datahandling.py @@ -16,7 +16,7 @@ class ParallelDataHandling(DataHandling): GPU_DATA_PREFIX = "gpu_" VTK_COUNTER = 0 - def __init__(self, blocks, default_ghost_layers=1, default_layout='SoA', dim=3, default_target='cpu'): + def __init__(self, blocks, default_ghost_layers=1, default_layout='SoA', dim=3, target='cpu'): """ Creates data handling based on walberla block storage @@ -27,8 +27,7 @@ class ParallelDataHandling(DataHandling): dim: dimension of scenario, walberla always uses three dimensions, so if dim=2 the extend of the z coordinate of blocks has to be 1 - default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated - if not overwritten in add_array, and synchronization functions are for the GPU by default + target: either 'cpu' or 'cuda', other targets are not supported in parallel setup """ super(ParallelDataHandling, self).__init__() assert dim in (2, 3) @@ -52,7 +51,8 @@ class ParallelDataHandling(DataHandling): if self._dim == 2: assert self.blocks.getDomainCellBB().size[2] == 1 - self.default_target = default_target + assert target in ('cpu', 'cuda'), "ParallelDataHandling only support 'cpu' and 'cuda' target" + self.target = target @property def dim(self): @@ -77,24 +77,24 @@ class ParallelDataHandling(DataHandling): return self._fieldInformation[name]['values_per_cell'] def add_custom_data(self, name, cpu_creation_function, - gpu_creation_function=None, cpu_to_gpu_transfer_func=None, gpu_to_cpu_transfer_func=None): - if cpu_creation_function and gpu_creation_function: - if cpu_to_gpu_transfer_func is None or gpu_to_cpu_transfer_func is None: + acc_creation_function=None, cpu_to_acc_transfer_func=None, acc_to_cpu_transfer_func=None): + if cpu_creation_function and acc_creation_function: + if cpu_to_acc_transfer_func is None or acc_to_cpu_transfer_func is None: raise ValueError("For GPU data, both transfer functions have to be specified") - self._custom_data_transfer_functions[name] = (cpu_to_gpu_transfer_func, gpu_to_cpu_transfer_func) + self._custom_data_transfer_functions[name] = (cpu_to_acc_transfer_func, acc_to_cpu_transfer_func) if cpu_creation_function: self.blocks.addBlockData(name, cpu_creation_function) - if gpu_creation_function: - self.blocks.addBlockData(self.GPU_DATA_PREFIX + name, gpu_creation_function) + if acc_creation_function: + self.blocks.addBlockData(self.GPU_DATA_PREFIX + name, acc_creation_function) self._custom_data_names.append(name) def add_array(self, name, values_per_cell=1, dtype=np.float64, latex_name=None, ghost_layers=None, - layout=None, cpu=True, gpu=None, alignment=False, field_type=FieldType.GENERIC): + layout=None, cpu=True, acc=None, alignment=False, field_type=FieldType.GENERIC): if ghost_layers is None: ghost_layers = self.default_ghost_layers - if gpu is None: - gpu = self.default_target == 'gpu' + if acc is None: + acc = self.target == 'cuda' if layout is 
None: layout = self.default_layout if len(self.blocks) == 0: @@ -122,13 +122,13 @@ class ParallelDataHandling(DataHandling): if cpu: wlb.field.addToStorage(self.blocks, name, dtype, fSize=values_per_cell, layout=layout_map[layout], ghostLayers=ghost_layers, alignment=alignment) - if gpu: + if acc: if alignment != 0: raise ValueError("Alignment for walberla GPU fields not yet supported") wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell, usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout]) - if cpu and gpu: + if cpu and acc: self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name)) block_bb = self.blocks.getBlockCellBB(self.blocks[0]) @@ -144,7 +144,7 @@ class ParallelDataHandling(DataHandling): field_type=field_type) self.fields[name].latex_name = latex_name self._field_name_to_cpu_data_name[name] = name - if gpu: + if acc: self._field_name_to_gpu_data_name[name] = self.GPU_DATA_PREFIX + name return self.fields[name] @@ -159,18 +159,18 @@ class ParallelDataHandling(DataHandling): def custom_data_names(self): return tuple(self._custom_data_names) - def add_array_like(self, name, name_of_template_field, latex_name=None, cpu=True, gpu=None): - return self.add_array(name, latex_name=latex_name, cpu=cpu, gpu=gpu, + def add_array_like(self, name, name_of_template_field, latex_name=None, cpu=True, acc=None): + return self.add_array(name, latex_name=latex_name, cpu=cpu, acc=acc, **self._fieldInformation[name_of_template_field]) - def swap(self, name1, name2, gpu=False): - if gpu: + def swap(self, name1, name2, acc=False): + if acc: name1 = self.GPU_DATA_PREFIX + name1 name2 = self.GPU_DATA_PREFIX + name2 for block in self.blocks: block[name1].swapDataPointers(block[name2]) - def iterate(self, slice_obj=None, gpu=False, ghost_layers=True, inner_ghost_layers=True): + def iterate(self, slice_obj=None, acc=False, ghost_layers=True, inner_ghost_layers=True): if ghost_layers is True: ghost_layers = self.default_ghost_layers elif ghost_layers is False: @@ -185,7 +185,7 @@ class ParallelDataHandling(DataHandling): elif isinstance(ghost_layers, str): ghost_layers = self.ghost_layers_of_field(ghost_layers) - prefix = self.GPU_DATA_PREFIX if gpu else "" + prefix = self.GPU_DATA_PREFIX if acc else "" if slice_obj is not None: yield from sliced_block_iteration(self.blocks, slice_obj, inner_ghost_layers, ghost_layers, self.dim, prefix) @@ -229,7 +229,8 @@ class ParallelDataHandling(DataHandling): kernel_function(**arg_dict) def get_kernel_kwargs(self, kernel_function, **kwargs): - if kernel_function.ast.backend == 'gpucuda': + if kernel_function.ast.target in self.ACCELERATOR_TARGETS: + assert kernel_function.ast.target == 'cuda', 'ParallelDataHandling only supports CUDA and CPU' name_map = self._field_name_to_gpu_data_name to_array = wlb.cuda.toGpuArray else: @@ -258,7 +259,7 @@ class ParallelDataHandling(DataHandling): else: wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name) - def to_gpu(self, name): + def to_acc(self, name): if name in self._custom_data_transfer_functions: transfer_func = self._custom_data_transfer_functions[name][0] for block in self.blocks: @@ -266,7 +267,7 @@ class ParallelDataHandling(DataHandling): else: wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name) - def is_on_gpu(self, name): + def is_on_acc(self, name): return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs def all_to_cpu(self): @@ -275,21 +276,15 @@ class ParallelDataHandling(DataHandling): for name 
in self._custom_data_transfer_functions.keys(): self.to_cpu(name) - def all_to_gpu(self): + def all_to_acc(self): for cpu_name, gpu_name in self._cpu_gpu_pairs: wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name) for name in self._custom_data_transfer_functions.keys(): - self.to_gpu(name) + self.to_acc(name) - def synchronization_function_cpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_): - return self.synchronization_function(names, stencil, 'cpu', buffered, stencil_restricted) - - def synchronization_function_gpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_): - return self.synchronization_function(names, stencil, 'gpu', buffered, stencil_restricted) - - def synchronization_function(self, names, stencil=None, target=None, buffered=True, stencil_restricted=False): - if target is None: - target = self.default_target + def synchronization_function(self, names, stencil=None, acc=None, buffered=True, stencil_restricted=False): + if acc is None: + acc = self.target == 'cuda' if stencil is None: stencil = 'D3Q27' if self.dim == 3 else 'D2Q9' @@ -298,12 +293,12 @@ class ParallelDataHandling(DataHandling): names = [names] create_scheme = wlb.createUniformBufferedScheme if buffered else wlb.createUniformDirectScheme - if target == 'cpu': + if not acc: create_packing = wlb.field.createPackInfo if buffered else wlb.field.createMPIDatatypeInfo if not buffered and stencil_restricted: create_packing = wlb.field.createStencilRestrictedPackInfo else: - assert target == 'gpu' + assert self.target == 'cuda' create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo names = [self.GPU_DATA_PREFIX + name for name in names] diff --git a/pystencils/datahandling/pycuda.py b/pystencils/datahandling/pycuda.py index 30602a30ce6b87d0e25861b43c5291cda77ed570..954d9492afcf6c045697c1fd124ce63c855400c8 100644 --- a/pystencils/datahandling/pycuda.py +++ b/pystencils/datahandling/pycuda.py @@ -25,7 +25,7 @@ class PyCudaArrayHandler: else: return gpuarray.empty(shape, dtype) - def to_gpu(self, array): + def to_acc(self, array): return gpuarray.to_gpu(array) def upload(self, gpuarray, numpy_array): diff --git a/pystencils/datahandling/pyopencl.py b/pystencils/datahandling/pyopencl.py index 7b6f44088f60c47d0c57c5185f1afd16ef16bac7..f3e5428fd1d086edac412b86d6fa0333d1c627c2 100644 --- a/pystencils/datahandling/pyopencl.py +++ b/pystencils/datahandling/pyopencl.py @@ -10,7 +10,7 @@ import pystencils class PyOpenClArrayHandler: - def __init__(self, queue): + def __init__(self, queue=None): if not queue: from pystencils.opencl.opencljit import get_global_cl_queue queue = get_global_cl_queue() @@ -31,7 +31,7 @@ class PyOpenClArrayHandler: else: return gpuarray.empty(self.queue, shape, dtype) - def to_gpu(self, array): + def to_acc(self, array): return gpuarray.to_device(self.queue, array) def upload(self, gpuarray, numpy_array): diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py index ea708ae2bb1759d5a9766b278afa2dd7c235f2da..e13ecc263ab30319a12fad81da3ae37797fc3527 100644 --- a/pystencils/datahandling/serial_datahandling.py +++ b/pystencils/datahandling/serial_datahandling.py @@ -21,9 +21,7 @@ class SerialDataHandling(DataHandling): default_ghost_layers: int = 1, default_layout: str = 'SoA', periodicity: Union[bool, Sequence[bool]] = False, - default_target: str = 'cpu', - opencl_queue=None, - opencl_ctx=None, + target: str = 'cpu', array_handler=None) -> None: """ Creates a data handling for
single node simulations. @@ -32,8 +30,7 @@ class SerialDataHandling(DataHandling): domain_size: size of the spatial domain as tuple default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method default_layout: default layout used, if not overridden in add_array() method - default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated - if not overwritten in add_array, and synchronization functions are for the GPU by default + target: one of the values of DataHandling.ACCELERATOR_TARGETS """ super(SerialDataHandling, self).__init__() self._domainSize = tuple(domain_size) @@ -41,23 +38,20 @@ class SerialDataHandling(DataHandling): self.default_layout = default_layout self._fields = DotDict() self.cpu_arrays = DotDict() - self.gpu_arrays = DotDict() + self.acc_arrays = DotDict() self.custom_data_cpu = DotDict() - self.custom_data_gpu = DotDict() + self.custom_data_acc = DotDict() self._custom_data_transfer_functions = {} - self._opencl_queue = opencl_queue - self._opencl_ctx = opencl_ctx - if not array_handler: - try: - self.array_handler = PyCudaArrayHandler() - except Exception: - self.array_handler = None - - if default_target == 'opencl' or opencl_queue: - self.array_handler = PyOpenClArrayHandler(opencl_queue) - else: - self.array_handler = array_handler + self.array_handler = array_handler + if self.array_handler is None: + if target == 'opencl': + self.array_handler = PyOpenClArrayHandler() + else: + try: + self.array_handler = PyCudaArrayHandler() + except Exception: + pass if periodicity is None or periodicity is False: periodicity = [False] * self.dim @@ -66,8 +60,8 @@ class SerialDataHandling(DataHandling): self._periodicity = periodicity self._field_information = {} - self.default_target = default_target self._start_time = time.perf_counter() + self.target = target @property def dim(self): @@ -91,14 +85,20 @@ class SerialDataHandling(DataHandling): def values_per_cell(self, name): return self._field_information[name]['values_per_cell'] + def _default_acc_value(self, acc): + if acc is None: + return self.target in self.ACCELERATOR_TARGETS + else: + return acc + def add_array(self, name, values_per_cell=1, dtype=np.float64, latex_name=None, ghost_layers=None, layout=None, - cpu=True, gpu=None, alignment=False, field_type=FieldType.GENERIC): + cpu=True, acc=None, alignment=False, field_type=FieldType.GENERIC): + acc = self._default_acc_value(acc) + if ghost_layers is None: ghost_layers = self.default_ghost_layers if layout is None: layout = self.default_layout - if gpu is None: - gpu = self.default_target in self._GPU_LIKE_TARGETS kwargs = { 'shape': tuple(s + 2 * ghost_layers for s in self._domainSize), @@ -131,17 +131,17 @@ class SerialDataHandling(DataHandling): cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment, byte_offset=byte_offset, **kwargs) - if alignment and gpu: - raise NotImplementedError("Alignment for GPU fields not supported") + if alignment and acc: + raise NotImplementedError("Alignment for accelerator fields not supported") if cpu: if name in self.cpu_arrays: raise ValueError("CPU Field with this name already exists") self.cpu_arrays[name] = cpu_arr - if gpu: - if name in self.gpu_arrays: - raise ValueError("GPU Field with this name already exists") - self.gpu_arrays[name] = self.array_handler.to_gpu(cpu_arr) + if acc: + if name in self.acc_arrays: + raise ValueError("Accelerator Field with this name already exists") + self.acc_arrays[name] = 
self.array_handler.to_acc(cpu_arr) assert all(f.name != name for f in self.fields.values()), "Symbolic field with this name already exists" self.fields[name] = Field.create_from_numpy_array(name, cpu_arr, index_dimensions=index_dimensions, @@ -150,30 +150,30 @@ class SerialDataHandling(DataHandling): return self.fields[name] def add_custom_data(self, name, cpu_creation_function, - gpu_creation_function=None, cpu_to_gpu_transfer_func=None, gpu_to_cpu_transfer_func=None): + acc_creation_function=None, cpu_to_acc_transfer_func=None, acc_to_cpu_transfer_func=None): - if cpu_creation_function and gpu_creation_function: - if cpu_to_gpu_transfer_func is None or gpu_to_cpu_transfer_func is None: - raise ValueError("For GPU data, both transfer functions have to be specified") - self._custom_data_transfer_functions[name] = (cpu_to_gpu_transfer_func, gpu_to_cpu_transfer_func) + if cpu_creation_function and acc_creation_function: + if cpu_to_acc_transfer_func is None or acc_to_cpu_transfer_func is None: + raise ValueError("For accelerator data, both transfer functions have to be specified") + self._custom_data_transfer_functions[name] = (cpu_to_acc_transfer_func, acc_to_cpu_transfer_func) assert name not in self.custom_data_cpu if cpu_creation_function: assert name not in self.cpu_arrays self.custom_data_cpu[name] = cpu_creation_function() - if gpu_creation_function: - assert name not in self.gpu_arrays - self.custom_data_gpu[name] = gpu_creation_function() + if acc_creation_function: + assert name not in self.acc_arrays + self.custom_data_acc[name] = acc_creation_function() def has_data(self, name): return name in self.fields - def add_array_like(self, name, name_of_template_field, latex_name=None, cpu=True, gpu=None): - return self.add_array(name, latex_name=latex_name, cpu=cpu, gpu=gpu, + def add_array_like(self, name, name_of_template_field, latex_name=None, cpu=True, acc=None): + return self.add_array(name, latex_name=latex_name, cpu=cpu, acc=acc, **self._field_information[name_of_template_field]) - def iterate(self, slice_obj=None, gpu=False, ghost_layers=True, inner_ghost_layers=True): + def iterate(self, slice_obj=None, acc=False, ghost_layers=True, inner_ghost_layers=True): if ghost_layers is True: ghost_layers = self.default_ghost_layers elif ghost_layers is False: @@ -186,8 +186,8 @@ class SerialDataHandling(DataHandling): slice_obj = normalize_slice(slice_obj, tuple(s + 2 * ghost_layers for s in self._domainSize)) slice_obj = tuple(s if type(s) is slice else slice(s, s + 1, None) for s in slice_obj) - arrays = self.gpu_arrays if gpu else self.cpu_arrays - custom_data_dict = self.custom_data_gpu if gpu else self.custom_data_cpu + arrays = self.acc_arrays if acc else self.cpu_arrays + custom_data_dict = self.custom_data_acc if acc else self.custom_data_cpu iter_dict = custom_data_dict.copy() for name, arr in arrays.items(): field_gls = self._field_information[name]['ghost_layers'] @@ -222,61 +222,49 @@ class SerialDataHandling(DataHandling): arr.flags.writeable = False return arr - def swap(self, name1, name2, gpu=None): - if gpu is None: - gpu = self.default_target == "gpu" - arr = self.gpu_arrays if gpu else self.cpu_arrays + def swap(self, name1, name2, acc=False): + arr = self.acc_arrays if acc else self.cpu_arrays arr[name1], arr[name2] = arr[name2], arr[name1] def all_to_cpu(self): - for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()) | self._custom_data_transfer_functions.keys(): + for name in (self.cpu_arrays.keys() & self.acc_arrays.keys()) | 
self._custom_data_transfer_functions.keys(): self.to_cpu(name) - def all_to_gpu(self): - for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()) | self._custom_data_transfer_functions.keys(): - self.to_gpu(name) + def all_to_acc(self): + for name in (self.cpu_arrays.keys() & self.acc_arrays.keys()) | self._custom_data_transfer_functions.keys(): + self.to_acc(name) def run_kernel(self, kernel_function, **kwargs): - arrays = self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays + arrays = self.acc_arrays if kernel_function.ast.target in self.ACCELERATOR_TARGETS else self.cpu_arrays kernel_function(**arrays, **kwargs) def get_kernel_kwargs(self, kernel_function, **kwargs): result = {} - result.update(self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays) + result.update(self.acc_arrays if kernel_function.ast.target in self.ACCELERATOR_TARGETS else self.cpu_arrays) result.update(kwargs) return [result] def to_cpu(self, name): if name in self._custom_data_transfer_functions: transfer_func = self._custom_data_transfer_functions[name][1] - transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name]) + transfer_func(self.custom_data_acc[name], self.custom_data_cpu[name]) else: - self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name]) + self.array_handler.download(self.acc_arrays[name], self.cpu_arrays[name]) - def to_gpu(self, name): + def to_acc(self, name): if name in self._custom_data_transfer_functions: transfer_func = self._custom_data_transfer_functions[name][0] - transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name]) + transfer_func(self.custom_data_acc[name], self.custom_data_cpu[name]) else: - self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name]) - - def is_on_gpu(self, name): - return name in self.gpu_arrays - - def synchronization_function_cpu(self, names, stencil_name=None, **_): - return self.synchronization_function(names, stencil_name, 'cpu') + self.array_handler.upload(self.acc_arrays[name], self.cpu_arrays[name]) - def synchronization_function_gpu(self, names, stencil_name=None, **_): - return self.synchronization_function(names, stencil_name, 'gpu') + def is_on_acc(self, name): + return name in self.acc_arrays - def synchronization_function(self, names, stencil=None, target=None, **_): - if target is None: - target = self.default_target - if target == 'opencl': - target = 'gpu' - assert target in ('cpu', 'gpu') + def synchronization_function(self, names, stencil=None, acc=None, **kwargs): if not hasattr(names, '__len__') or type(names) is str: names = [names] + acc = self._default_acc_value(acc) filtered_stencil = [] neighbors = [-1, 0, 1] @@ -310,29 +298,27 @@ class SerialDataHandling(DataHandling): raise NotImplementedError("Synchronization of this field is not supported: " + name) if len(filtered_stencil) > 0: - if target == 'cpu': + if not acc: from pystencils.slicing import get_periodic_boundary_functor result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls)) else: from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func - target = 'gpu' if not isinstance(self.array_handler, PyOpenClArrayHandler) else 'opencl' result.append(boundary_func(filtered_stencil, self._domainSize, index_dimensions=self.fields[name].index_dimensions, index_dim_shape=values_per_cell, dtype=self.fields[name].dtype.numpy_dtype, ghost_layers=gls, - target=target, - opencl_queue=self._opencl_queue, - 
opencl_ctx=self._opencl_ctx)) + target=self.target, + **kwargs)) - if target == 'cpu': + if acc: def result_functor(): for arr_name, func in zip(names, result): - func(pdfs=self.cpu_arrays[arr_name]) + func(pdfs=self.acc_arrays[arr_name]) else: def result_functor(): for arr_name, func in zip(names, result): - func(pdfs=self.gpu_arrays[arr_name]) + func(pdfs=self.cpu_arrays[arr_name]) return result_functor diff --git a/pystencils/display_utils.py b/pystencils/display_utils.py index 610c404b709885e7445eba3be412f6811fca0241..bccdd875b9762ee97f82f64b4f8708fc52c3d703 100644 --- a/pystencils/display_utils.py +++ b/pystencils/display_utils.py @@ -35,7 +35,7 @@ def highlight_cpp(code: str): return HTML(highlight(code, CppLexer(), HtmlFormatter())) -def show_code(ast: KernelFunction, custom_backend=None): +def code(ast: KernelFunction, custom_backend=None): """Returns an object to display generated code (C/C++ or CUDA) Can either be displayed as HTML in Jupyter notebooks or printed as normal string. @@ -45,11 +45,8 @@ def show_code(ast: KernelFunction, custom_backend=None): if isinstance(ast, KernelWrapper): ast = ast.ast - if ast.backend == 'gpucuda': - dialect = 'cuda' - elif ast.backend == 'opencl': - dialect = 'opencl' - else: + dialect = ast.target + if dialect == 'cpu': dialect = 'c' class CodeDisplay: @@ -65,3 +62,12 @@ def show_code(ast: KernelFunction, custom_backend=None): def __repr__(self): return generate_c(self.ast, dialect=dialect, custom_backend=custom_backend) return CodeDisplay(ast) + + +def show_code(ast: KernelFunction, custom_backend=None): + code_display = code(ast, custom_backend) + try: + from IPython.display import display + display(code_display) + except ImportError: + print(code_display) diff --git a/pystencils/gpucuda/kernelcreation.py b/pystencils/gpucuda/kernelcreation.py index eecfe278e4378cdbed8738275d2cfe813d350d7b..df9a2644f45d63a217e615294f1dd790afbbf6ac 100644 --- a/pystencils/gpucuda/kernelcreation.py +++ b/pystencils/gpucuda/kernelcreation.py @@ -61,7 +61,7 @@ def create_cuda_kernel(assignments, block = indexing.guard(block, common_shape) unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers) - ast = KernelFunction(block, 'gpu', 'gpucuda', make_python_function, ghost_layers, function_name) + ast = KernelFunction(block, 'cuda', make_python_function, ghost_layers, function_name) ast.global_variables.update(indexing.index_variables) implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) @@ -136,7 +136,7 @@ def created_indexed_cuda_kernel(assignments, function_body = Block(coordinate_symbol_assignments + assignments) function_body = indexing.guard(function_body, get_common_shape(index_fields)) - ast = KernelFunction(function_body, 'gpu', 'gpucuda', make_python_function, None, function_name) + ast = KernelFunction(function_body, 'cuda', make_python_function, None, function_name) ast.global_variables.update(indexing.index_variables) implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) diff --git a/pystencils/gpucuda/periodicity.py b/pystencils/gpucuda/periodicity.py index 080ef44ebd995e1d8dcf4dbe1f0839e0a218fde6..02a5488c1d7af026991f3b06c7d48245c882c226 100644 --- a/pystencils/gpucuda/periodicity.py +++ b/pystencils/gpucuda/periodicity.py @@ -31,14 +31,14 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1, thickness=None, 
dtype=float, target='gpu', opencl_queue=None, opencl_ctx=None): - assert target in ['gpu', 'opencl'] + assert target in ['cuda', 'opencl'] src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness) kernels = [] index_dimensions = index_dimensions for src_slice, dst_slice in src_dst_slice_tuples: ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype) - if target == 'gpu': + if target == 'cuda': kernels.append(pystencils.gpucuda.make_python_function(ast)) else: kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx)) diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py index c4d5273d4f0b303e600a8ead6378878e1fdf403f..cdc7b1ce003f5f44f023f8831e30530bb2f8176d 100644 --- a/pystencils/kernelcreation.py +++ b/pystencils/kernelcreation.py @@ -100,12 +100,12 @@ def create_kernel(assignments, else: raise ValueError("Invalid value for cpu_vectorize_info") return ast - elif target == 'llvm': + elif target == 'llvm_cpu' or target == 'llvm_gpu': from pystencils.llvm import create_kernel ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups, iteration_slice=iteration_slice, ghost_layers=ghost_layers) return ast - elif target == 'gpu' or target == 'opencl': + elif target == 'cuda' or target == 'opencl': from pystencils.gpucuda import create_cuda_kernel ast = create_cuda_kernel(assignments, type_info=data_type, indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params), @@ -114,10 +114,8 @@ def create_kernel(assignments, use_textures_for_interpolation=use_textures_for_interpolation) if target == 'opencl': from pystencils.opencl.opencljit import make_python_function - ast._backend = 'opencl' ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx) - ast._target = 'opencl' - ast._backend = 'opencl' + ast.target = 'opencl' return ast else: raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,)) @@ -181,9 +179,9 @@ def create_indexed_kernel(assignments, if cpu_openmp: add_openmp(ast, num_threads=cpu_openmp) return ast - elif target == 'llvm': + elif target == 'llvm_cpu' or target == 'llvm_gpu': raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend") - elif target == 'gpu' or target == 'opencl': + elif target == 'cuda' or target == 'opencl': from pystencils.gpucuda import created_indexed_cuda_kernel idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params) ast = created_indexed_cuda_kernel(assignments, @@ -194,10 +192,8 @@ def create_indexed_kernel(assignments, use_textures_for_interpolation=use_textures_for_interpolation) if target == 'opencl': from pystencils.opencl.opencljit import make_python_function - ast._backend = 'opencl' ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx) - ast._target = 'opencl' - ast._backend = 'opencl' + ast.target = 'opencl' return ast else: raise ValueError("Unknown target %s. 
Has to be either 'cpu' or 'gpu'" % (target,)) diff --git a/pystencils/llvm/kernelcreation.py b/pystencils/llvm/kernelcreation.py index 57e5b73876b643196f5529d05df41a11a1ef01e5..663a421eff9e092e218456eccf16b527f3d5ae36 100644 --- a/pystencils/llvm/kernelcreation.py +++ b/pystencils/llvm/kernelcreation.py @@ -3,7 +3,7 @@ from pystencils.transformations import insert_casts def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(), - iteration_slice=None, ghost_layers=None, target='cpu'): + iteration_slice=None, ghost_layers=None, target='llvm_cpu'): """ Creates an abstract syntax tree for a kernel function, by taking a list of update rules. @@ -25,21 +25,20 @@ def create_kernel(assignments, function_name="kernel", type_info=None, split_gro :return: :class:`pystencils.ast.KernelFunction` node """ - if target == 'cpu': + if target == 'llvm_cpu': from pystencils.cpu import create_kernel code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers) - code._backend = 'llvm' - elif target == 'gpu': + elif target == 'llvm_gpu': from pystencils.gpucuda.kernelcreation import create_cuda_kernel code = create_cuda_kernel(assignments, function_name, type_info, iteration_slice=iteration_slice, ghost_layers=ghost_layers) - code._backend = 'llvm_gpu' else: NotImplementedError() code.body = insert_casts(code.body) code._compile_function = make_python_function + code.target = target return code diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py index 1af1a68e3bfdb3681e05443c2412d8bd24903b0d..569cd3a42cc79af239b04dbfa4274cf85cb8437d 100644 --- a/pystencils_tests/test_datahandling.py +++ b/pystencils_tests/test_datahandling.py @@ -77,17 +77,10 @@ def access_and_gather(dh, domain_size): assert full_arr[x, y] == x + y -def synchronization(dh, test_gpu=False): +def synchronization(dh): field_name = 'comm_field_test' - if test_gpu: - try: - from pycuda import driver - import pycuda.autoinit - except ImportError: - return - field_name += 'Gpu' - dh.add_array(field_name, ghost_layers=1, dtype=np.int32, cpu=True, gpu=test_gpu) + dh.add_array(field_name, ghost_layers=1, dtype=np.int32) # initialize everything with 1 for b in dh.iterate(ghost_layers=1): @@ -95,23 +88,17 @@ def synchronization(dh, test_gpu=False): for b in dh.iterate(ghost_layers=0): b[field_name].fill(42) - if test_gpu: - dh.to_gpu(field_name) - - dh.synchronization_function(field_name, target='gpu' if test_gpu else 'cpu')() - - if test_gpu: - dh.to_cpu(field_name) + dh.all_to_acc() + dh.synchronization_function(field_name)() + dh.all_to_cpu() for b in dh.iterate(ghost_layers=1): np.testing.assert_equal(42, b[field_name]) -def kernel_execution_jacobi(dh, target): - - test_gpu = target == 'gpu' or target == 'opencl' - dh.add_array('f', gpu=test_gpu) - dh.add_array('tmp', gpu=test_gpu) +def kernel_execution_jacobi(dh): + dh.add_array('f') + dh.add_array('tmp') stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)] stencil_3d = [(1, 0, 0), (-1, 0, 0), (0, 1, 0), (0, -1, 0), (0, 0, 1), (0, 0, -1)] stencil = stencil_2d if dh.dim == 2 else stencil_3d @@ -120,10 +107,12 @@ def kernel_execution_jacobi(dh, target): def jacobi(): dh.fields.tmp.center @= sum(dh.fields.f.neighbors(stencil)) / len(stencil) - kernel = create_kernel(jacobi, target).compile() + kernel = create_kernel(jacobi, dh.target).compile() for b in dh.iterate(ghost_layers=1): b['f'].fill(42) + dh.all_to_acc() dh.run_kernel(kernel) + dh.all_to_cpu() for b in dh.iterate(ghost_layers=0): 
np.testing.assert_equal(b['f'], 42) @@ -190,36 +179,37 @@ def test_access_and_gather(): for domain_shape in [(2, 2, 3), (2, 3)]: dh = create_data_handling(domain_size=domain_shape, periodicity=True) access_and_gather(dh, domain_shape) - synchronization(dh, test_gpu=False) - synchronization(dh, test_gpu=True) + + dh = create_data_handling(domain_size=domain_shape, periodicity=True, target='cpu') + synchronization(dh) + pytest.importorskip('pycuda') + dh = create_data_handling(domain_size=domain_shape, periodicity=True, target='cuda') + synchronization(dh) def test_kernel(): for domain_shape in [(4, 5), (3, 4, 5)]: dh = create_data_handling(domain_size=domain_shape, periodicity=True) - kernel_execution_jacobi(dh, 'cpu') + kernel_execution_jacobi(dh) reduction(dh) - try: - import pycuda - dh = create_data_handling(domain_size=domain_shape, periodicity=True) - kernel_execution_jacobi(dh, 'gpu') - except ImportError: - pass + pytest.importorskip('pycuda') + dh = create_data_handling(domain_size=domain_shape, periodicity=True, target='cuda') + kernel_execution_jacobi(dh) -@pytest.mark.parametrize('target', ('cpu', 'gpu', 'opencl')) +@pytest.mark.parametrize('target', ('cpu', 'cuda', 'opencl')) def test_kernel_param(target): for domain_shape in [(4, 5), (3, 4, 5)]: - if target == 'gpu': + if target == 'cuda': pytest.importorskip('pycuda') if target == 'opencl': pytest.importorskip('pyopencl') from pystencils.opencl.opencljit import init_globally init_globally() - dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target) - kernel_execution_jacobi(dh, target) + dh = create_data_handling(domain_size=domain_shape, periodicity=True, target=target) + kernel_execution_jacobi(dh) reduction(dh) diff --git a/pystencils_tests/test_datahandling_parallel.py b/pystencils_tests/test_datahandling_parallel.py index 0bfbfbd02fc86952e7fba84ce6aa080d2080ec0a..4346ce61d895590d0600d0a13ad8c17dbf21d95f 100644 --- a/pystencils_tests/test_datahandling_parallel.py +++ b/pystencils_tests/test_datahandling_parallel.py @@ -28,7 +28,7 @@ def test_gpu(): num_blocks = (3, 2, 1) blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False) dh = ParallelDataHandling(blocks, default_ghost_layers=2) - dh.add_array('v', values_per_cell=3, dtype=np.int64, ghost_layers=2, gpu=True) + dh.add_array('v', values_per_cell=3, dtype=np.int64, ghost_layers=2, acc=True) for b in dh.iterate(): b['v'].fill(42)
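
Reviewer note, not part of the patch: a minimal usage sketch of the renamed accelerator API with the changes above applied. It assumes a serial setup with pycuda installed; the domain size and the field name 'f' are illustrative only.

import numpy as np
from pystencils.datahandling import create_data_handling

# target='cuda' replaces the old default_target='gpu'
dh = create_data_handling(domain_size=(16, 16), periodicity=True, target='cuda')

# acc defaults to True here because the data handling targets an accelerator
# (the keyword was previously called gpu)
dh.add_array('f', ghost_layers=1, dtype=np.float64)

for block in dh.iterate(ghost_layers=1):
    block['f'].fill(1.0)

dh.all_to_acc()                      # formerly all_to_gpu()
dh.synchronization_function('f')()   # acc is derived from the target when not given
dh.all_to_cpu()                      # copy the synchronized data back to the numpy arrays

assert dh.is_on_acc('f')             # formerly is_on_gpu()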