Commit 171d844a authored by Stephan Seitz's avatar Stephan Seitz
Browse files

Merge branch 'opencl-datahandling' into 'master'

Opencl datahandling

See merge request pycodegen/pystencils!119
parents 9b57cf87 226abc3a
Pipeline #20930 passed with stages
in 6 minutes and 34 seconds
import os
import pytest
import tempfile
import runpy
import sys
import warnings
import tempfile
import nbformat
import pytest
from nbconvert import PythonExporter
from pystencils.boundaries.createindexlistcython import * # NOQA
# Trigger config file reading / creation once - to avoid race conditions when multiple instances are creating it
# at the same time
from pystencils.cpu import cpujit
......@@ -15,7 +20,6 @@ try:
pyximport.install(language_level=3)
except ImportError:
pass
from pystencils.boundaries.createindexlistcython import * # NOQA
SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__))
......@@ -29,7 +33,8 @@ def add_path_to_ignore(path):
collect_ignore += [os.path.join(SCRIPT_FOLDER, path, f) for f in os.listdir(os.path.join(SCRIPT_FOLDER, path))]
collect_ignore = [os.path.join(SCRIPT_FOLDER, "doc", "conf.py")]
collect_ignore = [os.path.join(SCRIPT_FOLDER, "doc", "conf.py"),
os.path.join(SCRIPT_FOLDER, "pystencils", "opencl", "opencl.autoinit")]
add_path_to_ignore('pystencils_tests/benchmark')
add_path_to_ignore('_local_tmp')
......@@ -78,8 +83,6 @@ for root, sub_dirs, files in os.walk('.'):
collect_ignore.append(f)
import nbformat
from nbconvert import PythonExporter
class IPythonMockup:
......
......@@ -353,6 +353,9 @@ class CustomSympyPrinter(CCodePrinter):
result = super(CustomSympyPrinter, self)._print_Piecewise(expr)
return result.replace("\n", "")
def _print_Type(self, node):
return str(node)
def _print_Function(self, expr):
infix_functions = {
bitwise_xor: '^',
......@@ -365,7 +368,7 @@ class CustomSympyPrinter(CCodePrinter):
return expr.to_c(self._print)
if isinstance(expr, reinterpret_cast_func):
arg, data_type = expr.args
return "*((%s)(& %s))" % (PointerType(data_type, restrict=False), self._print(arg))
return "*((%s)(& %s))" % (self._print(PointerType(data_type, restrict=False)), self._print(arg))
elif isinstance(expr, address_of):
assert len(expr.args) == 1, "address_of must only have one argument"
return "&(%s)" % self._print(expr.args[0])
......
......@@ -68,6 +68,13 @@ class OpenClSympyPrinter(CudaSympyPrinter):
CustomSympyPrinter.__init__(self)
self.known_functions = OPENCL_KNOWN_FUNCTIONS
def _print_Type(self, node):
code = super()._print_Type(node)
if isinstance(node, pystencils.data_types.PointerType):
return "__global " + code
else:
return code
def _print_ThreadIndexingSymbol(self, node):
symbol_name: str = node.name
function_name, dimension = tuple(symbol_name.split("."))
......
......@@ -87,8 +87,25 @@ class BoundaryHandling:
fi = flag_interface
self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
gpu = self._target == 'gpu'
data_handling.add_custom_class(self._index_array_name, self.IndexFieldBlockData, cpu=True, gpu=gpu)
def to_cpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
gpu_version[obj].get(cpu_arr)
def to_gpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = self.data_handling.array_handler.to_gpu(cpu_arr)
else:
self.data_handling.array_handler.upload(gpu_version[obj], cpu_arr)
class_ = self.IndexFieldBlockData
class_.to_cpu = to_cpu
class_.to_gpu = to_gpu
data_handling.add_custom_class(self._index_array_name, class_)
@property
def data_handling(self):
......@@ -204,7 +221,7 @@ class BoundaryHandling:
if self._dirty:
self.prepare()
for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
kwargs[self._field_name] = b[self._field_name]
kwargs['indexField'] = idx_arr
......@@ -219,7 +236,7 @@ class BoundaryHandling:
if self._dirty:
self.prepare()
for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
arguments = kwargs.copy()
arguments[self._field_name] = b[self._field_name]
......@@ -302,7 +319,7 @@ class BoundaryHandling:
def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
if boundary_obj.additional_data_init_callback:
boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
if self._target == 'gpu':
if self._target in self._data_handling._GPU_LIKE_TARGETS:
self._data_handling.to_gpu(self._index_array_name)
class BoundaryInfo(object):
......@@ -312,7 +329,7 @@ class BoundaryHandling:
self.kernel = kernel
class IndexFieldBlockData:
def __init__(self, *_1, **_2):
def __init__(self):
self.boundary_object_to_index_list = {}
self.boundary_object_to_data_setter = {}
......@@ -320,24 +337,6 @@ class BoundaryHandling:
self.boundary_object_to_index_list.clear()
self.boundary_object_to_data_setter.clear()
@staticmethod
def to_cpu(gpu_version, cpu_version):
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
gpu_version[obj].get(cpu_arr)
@staticmethod
def to_gpu(gpu_version, cpu_version):
from pycuda import gpuarray
gpu_version = gpu_version.boundary_object_to_index_list
cpu_version = cpu_version.boundary_object_to_index_list
for obj, cpu_arr in cpu_version.items():
if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
gpu_version[obj] = gpuarray.to_gpu(cpu_arr)
else:
gpu_version[obj].set(cpu_arr)
class BoundaryDataSetter:
......
......@@ -20,7 +20,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_layout: str = 'SoA',
default_target: str = 'cpu',
parallel: bool = False,
default_ghost_layers: int = 1) -> DataHandling:
default_ghost_layers: int = 1,
opencl_queue=None) -> DataHandling:
"""Creates a data handling instance.
Args:
......@@ -33,6 +34,7 @@ def create_data_handling(domain_size: Tuple[int, ...],
default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
"""
if parallel:
assert not opencl_queue, "OpenCL is only supported for SerialDataHandling"
if wlb is None:
raise ValueError("Cannot create parallel data handling because walberla module is not available")
......@@ -56,8 +58,12 @@ def create_data_handling(domain_size: Tuple[int, ...],
return ParallelDataHandling(blocks=block_storage, dim=dim, default_target=default_target,
default_layout=default_layout, default_ghost_layers=default_ghost_layers)
else:
return SerialDataHandling(domain_size, periodicity=periodicity, default_target=default_target,
default_layout=default_layout, default_ghost_layers=default_ghost_layers)
return SerialDataHandling(domain_size,
periodicity=periodicity,
default_target=default_target,
default_layout=default_layout,
default_ghost_layers=default_ghost_layers,
opencl_queue=opencl_queue)
__all__ = ['create_data_handling']
......@@ -16,6 +16,9 @@ class DataHandling(ABC):
'gather' function that has collects (parts of the) distributed data on a single process.
"""
_GPU_LIKE_TARGETS = ['gpu', 'opencl']
_GPU_LIKE_BACKENDS = ['gpucuda', 'opencl']
# ---------------------------- Adding and accessing data -----------------------------------------------------------
@property
......
try:
import pycuda.gpuarray as gpuarray
except ImportError:
gpuarray = None
import numpy as np
import pystencils
class PyCudaArrayHandler:
def __init__(self):
import pycuda.autoinit # NOQA
def zeros(self, shape, dtype=np.float64, order='C'):
return gpuarray.zeros(shape, dtype, order)
def ones(self, shape, dtype, order='C'):
return gpuarray.ones(shape, dtype, order)
def empty(self, shape, dtype=np.float64, layout=None):
if layout:
cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
return self.from_numpy(cpu_array)
else:
return gpuarray.empty(shape, dtype)
def to_gpu(self, array):
return gpuarray.to_gpu(array)
def upload(self, gpuarray, numpy_array):
gpuarray.set(numpy_array)
def download(self, gpuarray, numpy_array):
gpuarray.get(numpy_array)
def randn(self, shape, dtype=np.float64):
cpu_array = np.random.randn(*shape).astype(dtype)
return self.from_numpy(cpu_array)
try:
import pyopencl.array as gpuarray
except ImportError:
gpuarray = None
import numpy as np
import pystencils
class PyOpenClArrayHandler:
def __init__(self, queue):
if not queue:
from pystencils.opencl.opencljit import get_global_cl_queue
queue = get_global_cl_queue()
assert queue, "OpenCL queue missing!\n" \
"Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
self.queue = queue
def zeros(self, shape, dtype=np.float64, order='C'):
return gpuarray.zeros(shape, dtype, order)
def ones(self, shape, dtype, order='C'):
return gpuarray.ones(self.queue, shape, dtype, order)
def empty(self, shape, dtype=np.float64, layout=None):
if layout:
cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
return self.from_numpy(cpu_array)
else:
return gpuarray.empty(self.queue, shape, dtype)
def to_gpu(self, array):
return gpuarray.to_device(self.queue, array)
def upload(self, gpuarray, numpy_array):
gpuarray.set(numpy_array, self.queue)
def download(self, gpuarray, numpy_array):
gpuarray.get(self.queue, numpy_array)
def randn(self, shape, dtype=np.float64):
cpu_array = np.random.randn(*shape).astype(dtype)
return self.from_numpy(cpu_array)
......@@ -6,22 +6,25 @@ import numpy as np
from pystencils.datahandling.blockiteration import SerialBlock
from pystencils.datahandling.datahandling_interface import DataHandling
from pystencils.datahandling.pycuda import PyCudaArrayHandler
from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
from pystencils.field import (
Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple, spatial_layout_string_to_tuple)
from pystencils.slicing import normalize_slice, remove_ghost_layers
from pystencils.utils import DotDict
try:
import pycuda.gpuarray as gpuarray
import pycuda.autoinit # NOQA
except ImportError:
gpuarray = None
class SerialDataHandling(DataHandling):
def __init__(self, domain_size: Sequence[int], default_ghost_layers: int = 1, default_layout: str = 'SoA',
periodicity: Union[bool, Sequence[bool]] = False, default_target: str = 'cpu') -> None:
def __init__(self,
domain_size: Sequence[int],
default_ghost_layers: int = 1,
default_layout: str = 'SoA',
periodicity: Union[bool, Sequence[bool]] = False,
default_target: str = 'cpu',
opencl_queue=None,
opencl_ctx=None,
array_handler=None) -> None:
"""
Creates a data handling for single node simulations.
......@@ -42,6 +45,19 @@ class SerialDataHandling(DataHandling):
self.custom_data_cpu = DotDict()
self.custom_data_gpu = DotDict()
self._custom_data_transfer_functions = {}
self._opencl_queue = opencl_queue
self._opencl_ctx = opencl_ctx
if not array_handler:
try:
self.array_handler = PyCudaArrayHandler()
except Exception:
self.array_handler = None
if default_target == 'opencl' or opencl_queue:
self.array_handler = PyOpenClArrayHandler(opencl_queue)
else:
self.array_handler = array_handler
if periodicity is None or periodicity is False:
periodicity = [False] * self.dim
......@@ -82,7 +98,7 @@ class SerialDataHandling(DataHandling):
if layout is None:
layout = self.default_layout
if gpu is None:
gpu = self.default_target == 'gpu'
gpu = self.default_target in self._GPU_LIKE_TARGETS
kwargs = {
'shape': tuple(s + 2 * ghost_layers for s in self._domainSize),
......@@ -126,7 +142,7 @@ class SerialDataHandling(DataHandling):
if gpu:
if name in self.gpu_arrays:
raise ValueError("GPU Field with this name already exists")
self.gpu_arrays[name] = gpuarray.to_gpu(cpu_arr)
self.gpu_arrays[name] = self.array_handler.to_gpu(cpu_arr)
assert all(f.name != name for f in self.fields.values()), "Symbolic field with this name already exists"
self.fields[name] = Field.create_from_numpy_array(name, cpu_arr, index_dimensions=index_dimensions,
......@@ -222,12 +238,12 @@ class SerialDataHandling(DataHandling):
self.to_gpu(name)
def run_kernel(self, kernel_function, **kwargs):
arrays = self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays
arrays = self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays
kernel_function(**arrays, **kwargs)
def get_kernel_kwargs(self, kernel_function, **kwargs):
result = {}
result.update(self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays)
result.update(self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays)
result.update(kwargs)
return [result]
......@@ -236,14 +252,14 @@ class SerialDataHandling(DataHandling):
transfer_func = self._custom_data_transfer_functions[name][1]
transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
else:
self.gpu_arrays[name].get(self.cpu_arrays[name])
self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
def to_gpu(self, name):
if name in self._custom_data_transfer_functions:
transfer_func = self._custom_data_transfer_functions[name][0]
transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
else:
self.gpu_arrays[name].set(self.cpu_arrays[name])
self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
def is_on_gpu(self, name):
return name in self.gpu_arrays
......@@ -257,6 +273,8 @@ class SerialDataHandling(DataHandling):
def synchronization_function(self, names, stencil=None, target=None, **_):
if target is None:
target = self.default_target
if target == 'opencl':
target = 'gpu'
assert target in ('cpu', 'gpu')
if not hasattr(names, '__len__') or type(names) is str:
names = [names]
......@@ -298,11 +316,15 @@ class SerialDataHandling(DataHandling):
result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls))
else:
from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func
target = 'gpu' if not isinstance(self.array_handler, PyOpenClArrayHandler) else 'opencl'
result.append(boundary_func(filtered_stencil, self._domainSize,
index_dimensions=self.fields[name].index_dimensions,
index_dim_shape=values_per_cell,
dtype=self.fields[name].dtype.numpy_dtype,
ghost_layers=gls))
ghost_layers=gls,
target=target,
opencl_queue=self._opencl_queue,
opencl_ctx=self._opencl_ctx))
if target == 'cpu':
def result_functor():
......
......@@ -45,7 +45,12 @@ def show_code(ast: KernelFunction, custom_backend=None):
if isinstance(ast, KernelWrapper):
ast = ast.ast
dialect = 'cuda' if ast.backend == 'gpucuda' else 'c'
if ast.backend == 'gpucuda':
dialect = 'cuda'
elif ast.backend == 'opencl':
dialect = 'opencl'
else:
dialect = 'c'
class CodeDisplay:
def __init__(self, ast_input):
......
import numpy as np
import pystencils.gpucuda
import pystencils.opencl
from pystencils import Assignment, Field
from pystencils.gpucuda import make_python_function
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
......@@ -25,16 +26,22 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
update_eqs.append(eq)
ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True)
return make_python_function(ast)
return ast
def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
thickness=None, dtype=float):
thickness=None, dtype=float, target='gpu', opencl_queue=None, opencl_ctx=None):
assert target in ['gpu', 'opencl']
src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
kernels = []
index_dimensions = index_dimensions
for src_slice, dst_slice in src_dst_slice_tuples:
kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype))
ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
if target == 'gpu':
kernels.append(pystencils.gpucuda.make_python_function(ast))
else:
kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx))
def functor(pdfs, **_):
for kernel in kernels:
......
from itertools import combinations
import functools
import itertools
from types import MappingProxyType
import sympy as sp
......@@ -27,13 +28,15 @@ def create_kernel(assignments,
gpu_indexing_params=MappingProxyType({}),
use_textures_for_interpolation=True,
cpu_prepend_optimizations=[],
use_auto_for_assignments=False):
use_auto_for_assignments=False,
opencl_queue=None,
opencl_ctx=None):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
target: 'cpu', 'llvm' or 'gpu'
target: 'cpu', 'llvm', 'gpu' or 'opencl'
data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
to type
iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
......@@ -108,13 +111,20 @@ def create_kernel(assignments,
from pystencils.llvm import create_kernel
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers)
elif target == 'gpu':
elif target == 'gpu' or target == 'opencl':
from pystencils.gpucuda import create_cuda_kernel
ast = create_cuda_kernel(assignments, type_info=data_type,
indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check,
use_textures_for_interpolation=use_textures_for_interpolation)
if target == 'opencl':
from pystencils.opencl.opencljit import make_python_function
ast._backend = 'opencl'
ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx)
ast._target = 'opencl'
ast._backend = 'opencl'
return ast
else:
raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
......@@ -133,7 +143,9 @@ def create_indexed_kernel(assignments,
cpu_openmp=True,
gpu_indexing='block',
gpu_indexing_params=MappingProxyType({}),
use_textures_for_interpolation=True):
use_textures_for_interpolation=True,
opencl_queue=None,
opencl_ctx=None):
"""
Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
......@@ -183,7 +195,7 @@ def create_indexed_kernel(assignments,
return ast
elif target == 'llvm':
raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
elif target == 'gpu':
elif target == 'gpu' or target == 'opencl':
from pystencils.gpucuda import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
ast = created_indexed_cuda_kernel(assignments,
......@@ -192,6 +204,12 @@ def create_indexed_kernel(assignments,
coordinate_names=coordinate_names,
indexing_creator=idx_creator,
use_textures_for_interpolation=use_textures_for_interpolation)
if target == 'opencl':
from pystencils.opencl.opencljit import make_python_function
ast._backend = 'opencl'
ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx)
ast._target = 'opencl'
ast._backend = 'opencl'
return ast
else:
raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
......@@ -288,7 +306,7 @@ def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions=
outer_assignment = None
conditions = {direction: condition(direction) for direction in stencil}
for num_conditions in range(len(stencil)):
for combination in combinations(conditions.values(), num_conditions):
for combination in itertools.combinations(conditions.values(), num_conditions):
for assignment in assignments:
direction = stencil[assignment.lhs.index[0]]
if conditions[direction] in combination:
......
"""
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
"""
Automatically initializes OpenCL context using any device.
Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
"""
from pystencils.opencl.opencljit import (
clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
try:
init_globally()
except Exception as e:
import warnings
warnings.warn(str(e))
......@@ -3,10 +3,45 @@ import numpy as np
from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper