diff --git a/conftest.py b/conftest.py
index d1da0451aef163be92d96c6133076361d3531b37..c20bde40916e7f7f5163c20e1c48c7b0f3d3b3a2 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,9 +1,14 @@
 import os
-import pytest
-import tempfile
 import runpy
 import sys
 import warnings
+import tempfile
+
+import nbformat
+import pytest
+from nbconvert import PythonExporter
+
+from pystencils.boundaries.createindexlistcython import *  # NOQA
 # Trigger config file reading / creation once - to avoid race conditions when multiple instances are creating it
 # at the same time
 from pystencils.cpu import cpujit
@@ -15,7 +20,6 @@ try:
     pyximport.install(language_level=3)
 except ImportError:
     pass
-from pystencils.boundaries.createindexlistcython import *  # NOQA

 SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__))
@@ -29,7 +33,8 @@ def add_path_to_ignore(path):
        collect_ignore += [os.path.join(SCRIPT_FOLDER, path, f) for f in os.listdir(os.path.join(SCRIPT_FOLDER, path))]


-collect_ignore = [os.path.join(SCRIPT_FOLDER, "doc", "conf.py")]
+collect_ignore = [os.path.join(SCRIPT_FOLDER, "doc", "conf.py"),
+                  os.path.join(SCRIPT_FOLDER, "pystencils", "opencl", "opencl.autoinit")]
 add_path_to_ignore('pystencils_tests/benchmark')
 add_path_to_ignore('_local_tmp')
@@ -78,8 +83,6 @@ for root, sub_dirs, files in os.walk('.'):
            collect_ignore.append(f)


-import nbformat
-from nbconvert import PythonExporter

 class IPythonMockup:
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 559295930911c390f90cddafadb4d0c8da2b4b66..9efca1f53a6870d2b0ce2bdd1da547d16beb6a6c 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -353,6 +353,9 @@ class CustomSympyPrinter(CCodePrinter):
         result = super(CustomSympyPrinter, self)._print_Piecewise(expr)
         return result.replace("\n", "")

+    def _print_Type(self, node):
+        return str(node)
+
     def _print_Function(self, expr):
         infix_functions = {
             bitwise_xor: '^',
@@ -365,7 +368,7 @@ class CustomSympyPrinter(CCodePrinter):
             return expr.to_c(self._print)
         if isinstance(expr, reinterpret_cast_func):
             arg, data_type = expr.args
-            return "*((%s)(& %s))" % (PointerType(data_type, restrict=False), self._print(arg))
+            return "*((%s)(& %s))" % (self._print(PointerType(data_type, restrict=False)), self._print(arg))
         elif isinstance(expr, address_of):
             assert len(expr.args) == 1, "address_of must only have one argument"
             return "&(%s)" % self._print(expr.args[0])
diff --git a/pystencils/backends/opencl_backend.py b/pystencils/backends/opencl_backend.py
index 60ea06be9723005654155b256f35e094db0005f6..3ab7f820ea30adebf584977625b2e559f897ca27 100644
--- a/pystencils/backends/opencl_backend.py
+++ b/pystencils/backends/opencl_backend.py
@@ -68,6 +68,13 @@ class OpenClSympyPrinter(CudaSympyPrinter):
         CustomSympyPrinter.__init__(self)
         self.known_functions = OPENCL_KNOWN_FUNCTIONS

+    def _print_Type(self, node):
+        code = super()._print_Type(node)
+        if isinstance(node, pystencils.data_types.PointerType):
+            return "__global " + code
+        else:
+            return code
+
     def _print_ThreadIndexingSymbol(self, node):
         symbol_name: str = node.name
         function_name, dimension = tuple(symbol_name.split("."))
diff --git a/pystencils/boundaries/boundaryhandling.py b/pystencils/boundaries/boundaryhandling.py
index d258de4b3e5e7d5b85c69d0910ea384b77ecd110..94d12619663190104f80894efebdc0955d5ac592 100644
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
@@ -87,8 +87,25 @@ class BoundaryHandling:
         fi = flag_interface
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
-        gpu = self._target == 'gpu'
-        data_handling.add_custom_class(self._index_array_name, self.IndexFieldBlockData, cpu=True, gpu=gpu)
+        def to_cpu(gpu_version, cpu_version):
+            gpu_version = gpu_version.boundary_object_to_index_list
+            cpu_version = cpu_version.boundary_object_to_index_list
+            for obj, cpu_arr in cpu_version.items():
+                gpu_version[obj].get(cpu_arr)
+
+        def to_gpu(gpu_version, cpu_version):
+            gpu_version = gpu_version.boundary_object_to_index_list
+            cpu_version = cpu_version.boundary_object_to_index_list
+            for obj, cpu_arr in cpu_version.items():
+                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
+                    gpu_version[obj] = self.data_handling.array_handler.to_gpu(cpu_arr)
+                else:
+                    self.data_handling.array_handler.upload(gpu_version[obj], cpu_arr)
+
+        class_ = self.IndexFieldBlockData
+        class_.to_cpu = to_cpu
+        class_.to_gpu = to_gpu
+        data_handling.add_custom_class(self._index_array_name, class_)

     @property
     def data_handling(self):
@@ -204,7 +221,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()

-        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 kwargs[self._field_name] = b[self._field_name]
                 kwargs['indexField'] = idx_arr
@@ -219,7 +236,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()

-        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 arguments = kwargs.copy()
                 arguments[self._field_name] = b[self._field_name]
@@ -302,7 +319,7 @@ class BoundaryHandling:
     def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
         if boundary_obj.additional_data_init_callback:
             boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
-        if self._target == 'gpu':
+        if self._target in self._data_handling._GPU_LIKE_TARGETS:
             self._data_handling.to_gpu(self._index_array_name)

     class BoundaryInfo(object):
@@ -312,7 +329,7 @@ class BoundaryHandling:
             self.kernel = kernel

     class IndexFieldBlockData:
-        def __init__(self, *_1, **_2):
+        def __init__(self):
             self.boundary_object_to_index_list = {}
             self.boundary_object_to_data_setter = {}
@@ -320,24 +337,6 @@ class BoundaryHandling:
             self.boundary_object_to_index_list.clear()
             self.boundary_object_to_data_setter.clear()

-        @staticmethod
-        def to_cpu(gpu_version, cpu_version):
-            gpu_version = gpu_version.boundary_object_to_index_list
-            cpu_version = cpu_version.boundary_object_to_index_list
-            for obj, cpu_arr in cpu_version.items():
-                gpu_version[obj].get(cpu_arr)
-
-        @staticmethod
-        def to_gpu(gpu_version, cpu_version):
-            from pycuda import gpuarray
-            gpu_version = gpu_version.boundary_object_to_index_list
-            cpu_version = cpu_version.boundary_object_to_index_list
-            for obj, cpu_arr in cpu_version.items():
-                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
-                    gpu_version[obj] = gpuarray.to_gpu(cpu_arr)
-                else:
-                    gpu_version[obj].set(cpu_arr)
-

 class BoundaryDataSetter:
diff --git a/pystencils/datahandling/__init__.py b/pystencils/datahandling/__init__.py
index 583f121b09ffbfdc6dc2103ec6e300e639791891..a4fa55bdc7a52e1b9c2015e2210fcbb48aaeb2e1 100644
--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -20,7 +20,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: str = 'cpu',
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         opencl_queue=None) -> DataHandling:
     """Creates a data handling instance.

     Args:
@@ -33,6 +34,7 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
     """
     if parallel:
+        assert not opencl_queue, "OpenCL is only supported for SerialDataHandling"
         if wlb is None:
             raise ValueError("Cannot create parallel data handling because walberla module is not available")
@@ -56,8 +58,12 @@ def create_data_handling(domain_size: Tuple[int, ...],
         return ParallelDataHandling(blocks=block_storage, dim=dim, default_target=default_target,
                                     default_layout=default_layout, default_ghost_layers=default_ghost_layers)
     else:
-        return SerialDataHandling(domain_size, periodicity=periodicity, default_target=default_target,
-                                  default_layout=default_layout, default_ghost_layers=default_ghost_layers)
+        return SerialDataHandling(domain_size,
+                                  periodicity=periodicity,
+                                  default_target=default_target,
+                                  default_layout=default_layout,
+                                  default_ghost_layers=default_ghost_layers,
+                                  opencl_queue=opencl_queue)


 __all__ = ['create_data_handling']
diff --git a/pystencils/datahandling/datahandling_interface.py b/pystencils/datahandling/datahandling_interface.py
index ba960edc17b2e3409dc465c257e23d9e9394dc5c..af1a6ba1fc9d003042063023aa1ede5fc08665db 100644
--- a/pystencils/datahandling/datahandling_interface.py
+++ b/pystencils/datahandling/datahandling_interface.py
@@ -16,6 +16,9 @@ class DataHandling(ABC):
     'gather' function that has collects (parts of the) distributed data on a single process.
     """

+    _GPU_LIKE_TARGETS = ['gpu', 'opencl']
+    _GPU_LIKE_BACKENDS = ['gpucuda', 'opencl']
+
     # ---------------------------- Adding and accessing data -----------------------------------------------------------

     @property
diff --git a/pystencils/datahandling/pycuda.py b/pystencils/datahandling/pycuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..30602a30ce6b87d0e25861b43c5291cda77ed570
--- /dev/null
+++ b/pystencils/datahandling/pycuda.py
@@ -0,0 +1,39 @@
+try:
+    import pycuda.gpuarray as gpuarray
+except ImportError:
+    gpuarray = None
+import numpy as np
+
+import pystencils
+
+
+class PyCudaArrayHandler:
+
+    def __init__(self):
+        import pycuda.autoinit  # NOQA
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        return gpuarray.zeros(shape, dtype, order)
+
+    def ones(self, shape, dtype, order='C'):
+        return gpuarray.ones(shape, dtype, order)
+
+    def empty(self, shape, dtype=np.float64, layout=None):
+        if layout:
+            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
+            return self.from_numpy(cpu_array)
+        else:
+            return gpuarray.empty(shape, dtype)
+
+    def to_gpu(self, array):
+        return gpuarray.to_gpu(array)
+
+    def upload(self, gpuarray, numpy_array):
+        gpuarray.set(numpy_array)
+
+    def download(self, gpuarray, numpy_array):
+        gpuarray.get(numpy_array)
+
+    def randn(self, shape, dtype=np.float64):
+        cpu_array = np.random.randn(*shape).astype(dtype)
+        return self.from_numpy(cpu_array)
diff --git a/pystencils/datahandling/pyopencl.py b/pystencils/datahandling/pyopencl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b6f44088f60c47d0c57c5185f1afd16ef16bac7
--- /dev/null
+++ b/pystencils/datahandling/pyopencl.py
@@ -0,0 +1,45 @@
+try:
+    import pyopencl.array as gpuarray
+except ImportError:
+    gpuarray = None
+
+import numpy as np
+
+import pystencils
+
+
+class PyOpenClArrayHandler:
+
+    def __init__(self, queue):
+        if not queue:
+            from pystencils.opencl.opencljit import get_global_cl_queue
+            queue = get_global_cl_queue()
+        assert queue, "OpenCL queue missing!\n" \
+                      "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
+        self.queue = queue
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        return gpuarray.zeros(shape, dtype, order)
+
+    def ones(self, shape, dtype, order='C'):
+        return gpuarray.ones(self.queue, shape, dtype, order)
+
+    def empty(self, shape, dtype=np.float64, layout=None):
+        if layout:
+            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
+            return self.from_numpy(cpu_array)
+        else:
+            return gpuarray.empty(self.queue, shape, dtype)
+
+    def to_gpu(self, array):
+        return gpuarray.to_device(self.queue, array)
+
+    def upload(self, gpuarray, numpy_array):
+        gpuarray.set(numpy_array, self.queue)
+
+    def download(self, gpuarray, numpy_array):
+        gpuarray.get(self.queue, numpy_array)
+
+    def randn(self, shape, dtype=np.float64):
+        cpu_array = np.random.randn(*shape).astype(dtype)
+        return self.from_numpy(cpu_array)
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index 697b6b6674a83b6786c8a6a3f687df3e49cd8a01..12c9b6f3df2757cbdb1eb03337be8bfe1aaf936d 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -6,22 +6,25 @@ import numpy as np

 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
+from pystencils.datahandling.pycuda import PyCudaArrayHandler
+from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
 from pystencils.field import (
     Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple,
     spatial_layout_string_to_tuple)
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict

-try:
-    import pycuda.gpuarray as gpuarray
-    import pycuda.autoinit  # NOQA
-except ImportError:
-    gpuarray = None
-

 class SerialDataHandling(DataHandling):

-    def __init__(self, domain_size: Sequence[int], default_ghost_layers: int = 1, default_layout: str = 'SoA',
-                 periodicity: Union[bool, Sequence[bool]] = False, default_target: str = 'cpu') -> None:
+    def __init__(self,
+                 domain_size: Sequence[int],
+                 default_ghost_layers: int = 1,
+                 default_layout: str = 'SoA',
+                 periodicity: Union[bool, Sequence[bool]] = False,
+                 default_target: str = 'cpu',
+                 opencl_queue=None,
+                 opencl_ctx=None,
+                 array_handler=None) -> None:
         """
         Creates a data handling for single node simulations.
@@ -42,6 +45,19 @@ class SerialDataHandling(DataHandling):
         self.custom_data_cpu = DotDict()
         self.custom_data_gpu = DotDict()
         self._custom_data_transfer_functions = {}
+        self._opencl_queue = opencl_queue
+        self._opencl_ctx = opencl_ctx
+
+        if not array_handler:
+            try:
+                self.array_handler = PyCudaArrayHandler()
+            except Exception:
+                self.array_handler = None
+
+            if default_target == 'opencl' or opencl_queue:
+                self.array_handler = PyOpenClArrayHandler(opencl_queue)
+        else:
+            self.array_handler = array_handler

         if periodicity is None or periodicity is False:
             periodicity = [False] * self.dim
@@ -82,7 +98,7 @@ class SerialDataHandling(DataHandling):
         if layout is None:
             layout = self.default_layout
         if gpu is None:
-            gpu = self.default_target == 'gpu'
+            gpu = self.default_target in self._GPU_LIKE_TARGETS

         kwargs = {
             'shape': tuple(s + 2 * ghost_layers for s in self._domainSize),
@@ -126,7 +142,7 @@ class SerialDataHandling(DataHandling):
         if gpu:
             if name in self.gpu_arrays:
                 raise ValueError("GPU Field with this name already exists")
-            self.gpu_arrays[name] = gpuarray.to_gpu(cpu_arr)
+            self.gpu_arrays[name] = self.array_handler.to_gpu(cpu_arr)

         assert all(f.name != name for f in self.fields.values()), "Symbolic field with this name already exists"
         self.fields[name] = Field.create_from_numpy_array(name, cpu_arr, index_dimensions=index_dimensions,
@@ -222,12 +238,12 @@ class SerialDataHandling(DataHandling):
                 self.to_gpu(name)

     def run_kernel(self, kernel_function, **kwargs):
-        arrays = self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays
+        arrays = self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays
         kernel_function(**arrays, **kwargs)

     def get_kernel_kwargs(self, kernel_function, **kwargs):
         result = {}
-        result.update(self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays)
+        result.update(self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays)
         result.update(kwargs)
         return [result]
@@ -236,14 +252,14 @@ class SerialDataHandling(DataHandling):
             transfer_func = self._custom_data_transfer_functions[name][1]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.gpu_arrays[name].get(self.cpu_arrays[name])
+            self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])

     def to_gpu(self, name):
         if name in self._custom_data_transfer_functions:
             transfer_func = self._custom_data_transfer_functions[name][0]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.gpu_arrays[name].set(self.cpu_arrays[name])
+            self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])

     def is_on_gpu(self, name):
         return name in self.gpu_arrays
@@ -257,6 +273,8 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, **_):
         if target is None:
             target = self.default_target
+        if target == 'opencl':
+            target = 'gpu'
         assert target in ('cpu', 'gpu')
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]
@@ -298,11 +316,15 @@ class SerialDataHandling(DataHandling):
                     result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls))
             else:
                 from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func
+                target = 'gpu' if not isinstance(self.array_handler, PyOpenClArrayHandler) else 'opencl'
                 result.append(boundary_func(filtered_stencil, self._domainSize,
                                             index_dimensions=self.fields[name].index_dimensions,
                                             index_dim_shape=values_per_cell,
                                             dtype=self.fields[name].dtype.numpy_dtype,
-                                            ghost_layers=gls))
+                                            ghost_layers=gls,
+                                            target=target,
+                                            opencl_queue=self._opencl_queue,
+                                            opencl_ctx=self._opencl_ctx))

         if target == 'cpu':
             def result_functor():
diff --git a/pystencils/display_utils.py b/pystencils/display_utils.py
index 638d1290acbbfc4d86bec12028dc59b37e2f98ea..610c404b709885e7445eba3be412f6811fca0241 100644
--- a/pystencils/display_utils.py
+++ b/pystencils/display_utils.py
@@ -45,7 +45,12 @@ def show_code(ast: KernelFunction, custom_backend=None):
     if isinstance(ast, KernelWrapper):
         ast = ast.ast

-    dialect = 'cuda' if ast.backend == 'gpucuda' else 'c'
+    if ast.backend == 'gpucuda':
+        dialect = 'cuda'
+    elif ast.backend == 'opencl':
+        dialect = 'opencl'
+    else:
+        dialect = 'c'

     class CodeDisplay:
         def __init__(self, ast_input):
diff --git a/pystencils/gpucuda/periodicity.py b/pystencils/gpucuda/periodicity.py
index e94a1796e3a66da626fc4e13fcc0413f5f93961d..080ef44ebd995e1d8dcf4dbe1f0839e0a218fde6 100644
--- a/pystencils/gpucuda/periodicity.py
+++ b/pystencils/gpucuda/periodicity.py
@@ -1,7 +1,8 @@
 import numpy as np

+import pystencils.gpucuda
+import pystencils.opencl
 from pystencils import Assignment, Field
-from pystencils.gpucuda import make_python_function
 from pystencils.gpucuda.kernelcreation import create_cuda_kernel
 from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice

@@ -25,16 +26,22 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
         update_eqs.append(eq)

     ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True)
-    return make_python_function(ast)
+    return ast


 def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=float):
+                                  thickness=None, dtype=float, target='gpu', opencl_queue=None, opencl_ctx=None):
+    assert target in ['gpu', 'opencl']
     src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
     kernels = []
     index_dimensions = index_dimensions
+
     for src_slice, dst_slice in src_dst_slice_tuples:
-        kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype))
+        ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
+        if target == 'gpu':
+            kernels.append(pystencils.gpucuda.make_python_function(ast))
+        else:
+            kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx))

     def functor(pdfs, **_):
         for kernel in kernels:
diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py
index 555196a623ce61fcfd2a9a40272f06c27fb70e54..a2dd55352548a4a28399920599394d53b5205465 100644
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
@@ -1,4 +1,5 @@
-from itertools import combinations
+import functools
+import itertools
 from types import MappingProxyType

 import sympy as sp
@@ -27,13 +28,15 @@ def create_kernel(assignments,
                   gpu_indexing_params=MappingProxyType({}),
                   use_textures_for_interpolation=True,
                   cpu_prepend_optimizations=[],
-                  use_auto_for_assignments=False):
+                  use_auto_for_assignments=False,
+                  opencl_queue=None,
+                  opencl_ctx=None):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.

     Args:
         assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
-        target: 'cpu', 'llvm' or 'gpu'
+        target: 'cpu', 'llvm', 'gpu' or 'opencl'
         data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
                    to type
         iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
@@ -108,13 +111,20 @@ def create_kernel(assignments,
         from pystencils.llvm import create_kernel
         ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                             iteration_slice=iteration_slice, ghost_layers=ghost_layers)
-    elif target == 'gpu':
+    elif target == 'gpu' or target == 'opencl':
         from pystencils.gpucuda import create_cuda_kernel
         ast = create_cuda_kernel(assignments, type_info=data_type,
                                  indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
                                  iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                                  skip_independence_check=skip_independence_check,
                                  use_textures_for_interpolation=use_textures_for_interpolation)
+        if target == 'opencl':
+            from pystencils.opencl.opencljit import make_python_function
+            ast._backend = 'opencl'
+            ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx)
+            ast._target = 'opencl'
+            ast._backend = 'opencl'
+
         return ast
     else:
         raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
@@ -133,7 +143,9 @@ def create_indexed_kernel(assignments,
                           cpu_openmp=True,
                           gpu_indexing='block',
                           gpu_indexing_params=MappingProxyType({}),
-                          use_textures_for_interpolation=True):
+                          use_textures_for_interpolation=True,
+                          opencl_queue=None,
+                          opencl_ctx=None):
     """
     Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
     coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
@@ -183,7 +195,7 @@ def create_indexed_kernel(assignments,
         return ast
     elif target == 'llvm':
         raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
-    elif target == 'gpu':
+    elif target == 'gpu' or target == 'opencl':
         from pystencils.gpucuda import created_indexed_cuda_kernel
         idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
         ast = created_indexed_cuda_kernel(assignments,
@@ -192,6 +204,12 @@ def create_indexed_kernel(assignments,
                                           coordinate_names=coordinate_names,
                                           indexing_creator=idx_creator,
                                           use_textures_for_interpolation=use_textures_for_interpolation)
+        if target == 'opencl':
+            from pystencils.opencl.opencljit import make_python_function
+            ast._backend = 'opencl'
+            ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx)
+            ast._target = 'opencl'
+            ast._backend = 'opencl'
         return ast
     else:
         raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
@@ -288,7 +306,7 @@ def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions=
     outer_assignment = None
     conditions = {direction: condition(direction) for direction in stencil}
     for num_conditions in range(len(stencil)):
-        for combination in combinations(conditions.values(), num_conditions):
+        for combination in itertools.combinations(conditions.values(), num_conditions):
             for assignment in assignments:
                 direction = stencil[assignment.lhs.index[0]]
                 if conditions[direction] in combination:
diff --git a/pystencils/opencl/__init__.py b/pystencils/opencl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e405a2f4e1b22a285cecb833da7764f88762d1ae
--- /dev/null
+++ b/pystencils/opencl/__init__.py
@@ -0,0 +1,8 @@
+"""
+
+"""
+
+from pystencils.opencl.opencljit import (
+    clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
+
+__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
diff --git a/pystencils/opencl/autoinit.py b/pystencils/opencl/autoinit.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d20169b640a83895edd9366a279fc6f2e13f6b4
--- /dev/null
+++ b/pystencils/opencl/autoinit.py
@@ -0,0 +1,16 @@
+"""
+Automatically initializes OpenCL context using any device.
+
+Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
+"""
+
+from pystencils.opencl.opencljit import (
+    clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
+
+__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
+
+try:
+    init_globally()
+except Exception as e:
+    import warnings
+    warnings.warn(str(e))
diff --git a/pystencils/opencl/opencljit.py b/pystencils/opencl/opencljit.py
index 5526c954a1cc25438710576a0119abd65ab9854d..6b5893865545294151269e7aabafdd51b1b7c264 100644
--- a/pystencils/opencl/opencljit.py
+++ b/pystencils/opencl/opencljit.py
@@ -3,10 +3,45 @@ import numpy as np

 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
 from pystencils.include import get_pystencils_include_path
+from pystencils.kernel_wrapper import KernelWrapper

 USE_FAST_MATH = True

+_global_cl_ctx = None
+_global_cl_queue = None
+
+
+def get_global_cl_queue():
+    return _global_cl_queue
+
+
+def get_global_cl_ctx():
+    return _global_cl_ctx
+
+
+def init_globally(device_index=0):
+    import pyopencl as cl
+    global _global_cl_ctx
+    global _global_cl_queue
+    _global_cl_ctx = cl.create_some_context(device_index)
+    _global_cl_queue = cl.CommandQueue(_global_cl_ctx)
+
+
+def init_globally_with_context(opencl_ctx, opencl_queue):
+    global _global_cl_ctx
+    global _global_cl_queue
+    _global_cl_ctx = opencl_ctx
+    _global_cl_queue = opencl_queue
+
+
+def clear_global_ctx():
+    global _global_cl_ctx
+    global _global_cl_queue
+    _global_cl_ctx = None
+    _global_cl_queue = None
+
+
 def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
     """
     Creates a **OpenCL** kernel function from an abstract syntax tree which
@@ -24,8 +59,16 @@ def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argumen
         compiled kernel as Python function
     """
     import pyopencl as cl
-    assert opencl_ctx, "No valid OpenCL context"
-    assert opencl_queue, "No valid OpenCL queue"
+
+    if not opencl_ctx:
+        opencl_ctx = _global_cl_ctx
+    if not opencl_queue:
+        opencl_queue = _global_cl_queue
+
+    assert opencl_ctx, "No valid OpenCL context!\n" \
+        "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
+    assert opencl_queue, "No valid OpenCL queue!\n" \
+        "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"

     if argument_dict is None:
         argument_dict = {}
@@ -74,6 +117,10 @@ def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argumen
         except KeyError:
             full_arguments = argument_dict.copy()
             full_arguments.update(kwargs)
+            assert not any(isinstance(a, np.ndarray)
+                           for a in full_arguments.values()), 'Calling a OpenCL kernel with a Numpy array!'
+            assert not any('pycuda' in str(type(a))
+                           for a in full_arguments.values()), 'Calling a OpenCL kernel with a PyCUDA array!'
             shape = _check_arguments(parameters, full_arguments)

             indexing = kernel_function_node.indexing
@@ -90,4 +137,5 @@ def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argumen

     wrapper.ast = kernel_function_node
     wrapper.parameters = kernel_function_node.get_parameters()
+    wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
     return wrapper
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 77790229f48b736258d343a4670c06e66a417838..f5efd517f0457ffc71d5c8015dd5d4f92aef47ea 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pycuda.autoinit
 import pycuda.gpuarray as gpuarray
 import sympy as sp
 from scipy.ndimage import convolve
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index cdee059c218a67ad9adc824f41fa09ae3df2f19b..1af1a68e3bfdb3681e05443c2412d8bd24903b0d 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -6,6 +6,13 @@ import numpy as np
 import pystencils as ps
 from pystencils import create_data_handling, create_kernel

+try:
+    import pytest
+except ImportError:
+    import unittest.mock
+    pytest = unittest.mock.MagicMock()
+
+
 def basic_iteration(dh):
     dh.add_array('basic_iter_test_gl_default')
@@ -100,15 +107,9 @@ def synchronization(dh, test_gpu=False):
             np.testing.assert_equal(42, b[field_name])


-def kernel_execution_jacobi(dh, test_gpu=False):
-    if test_gpu:
-        try:
-            from pycuda import driver
-            import pycuda.autoinit
-        except ImportError:
-            print("Skipping kernel_execution_jacobi GPU version, because pycuda not available")
-            return
+def kernel_execution_jacobi(dh, target):

+    test_gpu = target == 'gpu' or target == 'opencl'
     dh.add_array('f', gpu=test_gpu)
     dh.add_array('tmp', gpu=test_gpu)
     stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)]
@@ -119,7 +120,7 @@ def kernel_execution_jacobi(dh, test_gpu=False):
     def jacobi():
         dh.fields.tmp.center @= sum(dh.fields.f.neighbors(stencil)) / len(stencil)

-    kernel = create_kernel(jacobi, target='gpu' if test_gpu else 'cpu').compile()
+    kernel = create_kernel(jacobi, target).compile()
     for b in dh.iterate(ghost_layers=1):
         b['f'].fill(42)
     dh.run_kernel(kernel)
@@ -196,17 +197,32 @@ def test_access_and_gather():
 def test_kernel():
     for domain_shape in [(4, 5), (3, 4, 5)]:
         dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-        kernel_execution_jacobi(dh, test_gpu=True)
+        kernel_execution_jacobi(dh, 'cpu')
         reduction(dh)

         try:
             import pycuda
             dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-            kernel_execution_jacobi(dh, test_gpu=False)
+            kernel_execution_jacobi(dh, 'gpu')
         except ImportError:
             pass


+@pytest.mark.parametrize('target', ('cpu', 'gpu', 'opencl'))
+def test_kernel_param(target):
+    for domain_shape in [(4, 5), (3, 4, 5)]:
+        if target == 'gpu':
+            pytest.importorskip('pycuda')
+        if target == 'opencl':
+            pytest.importorskip('pyopencl')
+            from pystencils.opencl.opencljit import init_globally
+            init_globally()
+
+        dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
+        kernel_execution_jacobi(dh, target)
+        reduction(dh)
+
+
 def test_vtk_output():
     for domain_shape in [(4, 5), (3, 4, 5)]:
         dh = create_data_handling(domain_size=domain_shape, periodicity=True)
diff --git a/pystencils_tests/test_datahandling_parallel.py b/pystencils_tests/test_datahandling_parallel.py
index 52ccb6bce34b2ca115b19a8d00fa31da622b8ff4..0bfbfbd02fc86952e7fba84ce6aa080d2080ec0a 100644
--- a/pystencils_tests/test_datahandling_parallel.py
+++ b/pystencils_tests/test_datahandling_parallel.py
@@ -50,13 +50,13 @@ def test_kernel():
     # 3D
     blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
     dh = ParallelDataHandling(blocks)
-    kernel_execution_jacobi(dh, test_gpu=gpu)
+    kernel_execution_jacobi(dh, 'gpu')
     reduction(dh)

     # 2D
     blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 1), cellsPerBlock=(3, 2, 1), oneBlockPerProcess=False)
     dh = ParallelDataHandling(blocks, dim=2)
-    kernel_execution_jacobi(dh, test_gpu=gpu)
+    kernel_execution_jacobi(dh, 'gpu')
     reduction(dh)
diff --git a/pystencils_tests/test_opencl.py b/pystencils_tests/test_opencl.py
index adffeb4750a62f5bb08ded29fffe00ac5900f706..ea5b1ecbc83f21672731b2ecbcb54c78faa1603b 100644
--- a/pystencils_tests/test_opencl.py
+++ b/pystencils_tests/test_opencl.py
@@ -1,11 +1,11 @@
 import numpy as np
 import pytest
+import sympy as sp

 import pystencils
-import sympy as sp
 from pystencils.backends.cuda_backend import CudaBackend
 from pystencils.backends.opencl_backend import OpenClBackend
-from pystencils.opencl.opencljit import make_python_function
+from pystencils.opencl.opencljit import get_global_cl_queue, init_globally, make_python_function

 try:
     import pyopencl as cl
@@ -233,3 +233,41 @@ def test_without_cuda():
     opencl_kernel = make_python_function(ast, queue, ctx)
     assert opencl_kernel is not None
     opencl_kernel(x=x, y=y, z=z)
+
+
+@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
+def test_kernel_creation():
+    global pystencils
+    z, y, x = pystencils.fields("z, y, x: [20,30]")
+
+    assignments = pystencils.AssignmentCollection({
+        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
+    })
+
+    print(assignments)
+
+    pystencils.opencl.clear_global_ctx()
+
+    import pystencils.opencl.autoinit
+    ast = pystencils.create_kernel(assignments, target='opencl')
+
+    print(ast.backend)
+
+    code = str(pystencils.show_code(ast))
+    print(code)
+    assert 'get_local_size' in code
+
+    opencl_kernel = ast.compile()
+
+    x_cpu = np.random.rand(20, 30)
+    y_cpu = np.random.rand(20, 30)
+    z_cpu = np.random.rand(20, 30)
+
+    import pyopencl.array as array
+    assert get_global_cl_queue()
+    x = array.to_device(get_global_cl_queue(), x_cpu)
+    y = array.to_device(get_global_cl_queue(), y_cpu)
+    z = array.to_device(get_global_cl_queue(), z_cpu)
+
+    assert opencl_kernel is not None
+    opencl_kernel(x=x, y=y, z=z)
diff --git a/pystencils_tests/test_staggered_kernel.py b/pystencils_tests/test_staggered_kernel.py
index 4db538bf8f83d6779cb7dfd612fa9928acc23965..a64ff2f00d0ee7abc34e56f7110adb213dcb9992 100644
--- a/pystencils_tests/test_staggered_kernel.py
+++ b/pystencils_tests/test_staggered_kernel.py
@@ -1,16 +1,17 @@
-import pystencils as ps
 import numpy as np
 import sympy as sp

+import pystencils as ps
+

 class TestStaggeredDiffusion:
-    def _run(self, num_neighbors):
+    def _run(self, num_neighbors, target='cpu'):
         L = (40, 40)
         D = 0.066
         dt = 1
         T = 100

-        dh = ps.create_data_handling(L, periodicity=True, default_target='cpu')
+        dh = ps.create_data_handling(L, periodicity=True, default_target=target)

         c = dh.add_array('c', values_per_cell=1)
         j = dh.add_array('j', values_per_cell=num_neighbors, field_type=ps.FieldType.STAGGERED_FLUX)
@@ -23,7 +24,7 @@ class TestStaggeredDiffusion:
         jj = j.staggered_access
         divergence = -1 * D / (1 + sp.sqrt(2) if j.index_shape[0] == 4 else 1) * \
             sum([jj(d) / sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm() for d in j.staggered_stencil
-                + [ps.stencil.inverse_direction_string(d) for d in j.staggered_stencil]])
+                 + [ps.stencil.inverse_direction_string(d) for d in j.staggered_stencil]])

         update = [ps.Assignment(c.center, c.center + dt * divergence)]
         flux = [ps.Assignment(j.staggered_access("W"), x_staggered),
@@ -67,6 +68,12 @@ class TestStaggeredDiffusion:
     def test_diffusion_4(self):
         self._run(4)

+    def test_diffusion_opencl(self):
+        import pytest
+        pytest.importorskip('pyopencl')
+        import pystencils.opencl.autoinit
+        self._run(4, 'opencl')
+

 def test_staggered_subexpressions():
     dh = ps.create_data_handling((10, 10), periodicity=True, default_target='cpu')
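
Usage sketch for the new 'opencl' target, following the test_kernel_creation test added above (assumes pyopencl is installed and an OpenCL device is available; field shapes and the update expression are only illustrative):

    import numpy as np
    import pyopencl.array as cl_array

    import pystencils
    import pystencils.opencl.autoinit  # picks any device and sets the global context/queue
    from pystencils.opencl.opencljit import get_global_cl_queue

    z, y, x = pystencils.fields("z, y, x: [20,30]")
    assignments = pystencils.AssignmentCollection({z[0, 0]: x[0, 0] + y[0, 0]})

    ast = pystencils.create_kernel(assignments, target='opencl')
    kernel = ast.compile()  # compiled through pystencils.opencl.opencljit.make_python_function

    queue = get_global_cl_queue()
    x_gpu = cl_array.to_device(queue, np.random.rand(20, 30))
    y_gpu = cl_array.to_device(queue, np.random.rand(20, 30))
    z_gpu = cl_array.to_device(queue, np.zeros((20, 30)))
    kernel(x=x_gpu, y=y_gpu, z=z_gpu)  # arguments must be pyopencl arrays, not numpy arrays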
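
The SerialDataHandling changes expose the same target through create_data_handling, as exercised by test_kernel_param. A sketch of that workflow under the same assumptions (the Jacobi update mirrors kernel_execution_jacobi; the domain size is arbitrary):

    import numpy as np

    import pystencils as ps
    import pystencils.opencl.autoinit  # provides the global queue picked up by PyOpenClArrayHandler

    dh = ps.create_data_handling(domain_size=(20, 20), periodicity=True, default_target='opencl')
    dh.add_array('f')    # allocated on the host and, because of the default target, on the OpenCL device
    dh.add_array('tmp')
    stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]

    @ps.kernel
    def jacobi():
        dh.fields.tmp.center @= sum(dh.fields.f.neighbors(stencil)) / len(stencil)

    kernel = ps.create_kernel(jacobi, target='opencl').compile()

    for b in dh.iterate(ghost_layers=1):
        b['f'].fill(42)
    dh.all_to_gpu()        # uses PyOpenClArrayHandler.upload under the hood
    dh.run_kernel(kernel)
    dh.all_to_cpu()
    for b in dh.iterate(ghost_layers=0):
        np.testing.assert_allclose(b['tmp'], 42)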
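
The array handler introduced in pystencils/datahandling/pyopencl.py can also be used on its own; a minimal sketch of its transfer interface (again assuming pyopencl plus the globally initialized queue):

    import numpy as np

    import pystencils.opencl.autoinit  # sets the global queue the handler falls back to
    from pystencils.datahandling.pyopencl import PyOpenClArrayHandler

    handler = PyOpenClArrayHandler(queue=None)  # no explicit queue -> get_global_cl_queue() is used
    host = np.ones((8, 8))
    device = handler.to_gpu(host)               # pyopencl array on the device
    result = np.empty_like(host)
    handler.download(device, result)            # copy device data back into the host buffer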