diff --git a/README.md b/README.md index d079128a539b9c9244d39dc5fc0a6c5e88055227..60ab1dd3e0cebc9ba6948ace01382dc127176711 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Without `[interactive]` you get a minimal version with very little dependencies. All options: - `gpu`: use this if an Nvidia GPU is available and CUDA is installed +- `pyopencl`: basic OpenCL support (experimental) - `alltrafos`: pulls in additional dependencies for loop simplification e.g. libisl - `bench_db`: functionality to store benchmark result in object databases - `interactive`: installs dependencies to work in Jupyter including image I/O, plotting etc. diff --git a/pystencils/astnodes.py b/pystencils/astnodes.py index b2f5727c4068852fd6f4ba1b755a1ba5d1215b91..393396aa2d073c77e8df6ff627860164c9cfe526 100644 --- a/pystencils/astnodes.py +++ b/pystencils/astnodes.py @@ -707,8 +707,7 @@ class DestructuringBindingsForFieldClass(Node): corresponding_field_names = {s.field_name for s in undefined_field_symbols if hasattr(s, 'field_name')} corresponding_field_names |= {s.field_names[0] for s in undefined_field_symbols if hasattr(s, 'field_names')} return {TypedSymbol(f, self.CLASS_NAME_TEMPLATE.format(dtype=field_map[f].dtype, ndim=field_map[f].ndim) + '&') - for f in corresponding_field_names} | \ - (self.body.undefined_symbols - undefined_field_symbols) + for f in corresponding_field_names} | (self.body.undefined_symbols - undefined_field_symbols) def subs(self, subs_dict) -> None: """Inplace! substitute, similar to sympy's but modifies the AST inplace.""" diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py index cac2b00a1b52b29a61ccaa0cd73e9e5d37a89e95..239f060eee7de833321f811ed95b05ff8f02adbb 100644 --- a/pystencils/backends/cbackend.py +++ b/pystencils/backends/cbackend.py @@ -59,6 +59,9 @@ def generate_c(ast_node: Node, signature_only: bool = False, dialect='c', custom elif dialect == 'cuda': from pystencils.backends.cuda_backend import CudaBackend printer = CudaBackend(signature_only=signature_only) + elif dialect == 'opencl': + from pystencils.backends.opencl_backend import OpenClBackend + printer = OpenClBackend(signature_only=signature_only) else: raise ValueError("Unknown dialect: " + str(dialect)) code = printer(ast_node) @@ -166,14 +169,19 @@ class CBackend: return result def _print(self, node): + if isinstance(node, str): + return node for cls in type(node).__mro__: method_name = "_print_" + cls.__name__ if hasattr(self, method_name): return getattr(self, method_name)(node) raise NotImplementedError(self.__class__.__name__ + " does not support node of type " + node.__class__.__name__) + def _print_Type(self, node): + return str(node) + def _print_KernelFunction(self, node): - function_arguments = ["%s %s" % (str(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()] + function_arguments = ["%s %s" % (self._print(s.symbol.dtype), s.symbol.name) for s in node.get_parameters()] launch_bounds = "" if self._dialect == 'cuda': max_threads = node.indexing.max_threads_per_block() @@ -208,7 +216,11 @@ class CBackend: def _print_SympyAssignment(self, node): if node.is_declaration: - data_type = "const " + str(node.lhs.dtype) + " " if node.is_const else str(node.lhs.dtype) + " " + if node.is_const: + prefix = 'const ' + else: + prefix = '' + data_type = prefix + self._print(node.lhs.dtype) + " " return "%s%s = %s;" % (data_type, self.sympy_printer.doprint(node.lhs), self.sympy_printer.doprint(node.rhs)) else: @@ -277,10 +289,10 @@ class CBackend: ] destructuring_bindings.sort() # only for code aesthetics return "{\n" + self._indent + \ - ("\n" + self._indent).join(destructuring_bindings) + \ - "\n" + self._indent + \ - ("\n" + self._indent).join(self._print(node.body).splitlines()) + \ - "\n}" + ("\n" + self._indent).join(destructuring_bindings) + \ + "\n" + self._indent + \ + ("\n" + self._indent).join(self._print(node.body).splitlines()) + \ + "\n}" # ------------------------------------------ Helper function & classes ------------------------------------------------- diff --git a/pystencils/backends/opencl1.1_known_functions.txt b/pystencils/backends/opencl1.1_known_functions.txt new file mode 100644 index 0000000000000000000000000000000000000000..abeab29043e2c76b2b2df8e9024c00e7ac34de0f --- /dev/null +++ b/pystencils/backends/opencl1.1_known_functions.txt @@ -0,0 +1,100 @@ +acos +acosh +acospi +asin +asinh +asinpi +atan +atan2 +atanh +atanpi +atan2pi +cbrt +ceil +copysign +cos +cosh +cospi +erfc +erf +exp +exp2 +exp10 +expm1 +fabs +fdim +floor +fma +fmax +fmax +fmin45 +fmin +fmod +fract +frexp +hypot +ilogb +ldexp +lgamma +lgamma_r +log +log2 +log10 +log1p +logb +mad +maxmag +minmag +modf +nextafter +pow +pown +powr +remquo +intn +remquo +rint +rootn +rootn +round +rsqrt +sin +sincos +sinh +sinpi +sqrt +tan +tanh +tanpi +tgamma +trunc + + +half_cos +half_divide +half_exp +half_exp2 +half_exp10 +half_log +half_log2 +half_log10 +half_powr +half_recip +half_rsqrt +half_sin +half_sqrt +half_tan +native_cos +native_divide +native_exp +native_exp2 +native_exp10 +native_log +native_log2 +native_log10 +native_powr +native_recip +native_rsqrt +native_sin +native_sqrt +native_tan diff --git a/pystencils/backends/opencl_backend.py b/pystencils/backends/opencl_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..d2e40a69f05fe1181fc41110ca9a1b07763a75be --- /dev/null +++ b/pystencils/backends/opencl_backend.py @@ -0,0 +1,94 @@ +from os.path import dirname, join + +import pystencils.data_types +from pystencils.astnodes import Node +from pystencils.backends.cbackend import CustomSympyPrinter, generate_c +from pystencils.backends.cuda_backend import CudaBackend, CudaSympyPrinter +from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt + +with open(join(dirname(__file__), 'opencl1.1_known_functions.txt')) as f: + lines = f.readlines() + OPENCL_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l} + + +def generate_opencl(astnode: Node, signature_only: bool = False) -> str: + """Prints an abstract syntax tree node (made for target 'gpu') as OpenCL code. + + Args: + astnode: KernelFunction node to generate code for + signature_only: if True only the signature is printed + + Returns: + C-like code for the ast node and its descendants + """ + return generate_c(astnode, signature_only, dialect='opencl') + + +class OpenClBackend(CudaBackend): + + def __init__(self, + sympy_printer=None, + signature_only=False): + if not sympy_printer: + sympy_printer = OpenClSympyPrinter() + + super().__init__(sympy_printer, signature_only) + self._dialect = 'opencl' + + def _print_Type(self, node): + code = super()._print_Type(node) + if isinstance(node, pystencils.data_types.PointerType): + return "__global " + code + else: + return code + + def _print_ThreadBlockSynchronization(self, node): + raise NotImplementedError() + + def _print_TextureDeclaration(self, node): + raise NotImplementedError() + + +class OpenClSympyPrinter(CudaSympyPrinter): + language = "OpenCL" + + DIMENSION_MAPPING = { + 'x': '0', + 'y': '1', + 'z': '2' + } + INDEXING_FUNCTION_MAPPING = { + 'blockIdx': 'get_group_id', + 'threadIdx': 'get_local_id', + 'blockDim': 'get_local_size', + 'gridDim': 'get_global_size' + } + + def __init__(self): + CustomSympyPrinter.__init__(self) + self.known_functions = OPENCL_KNOWN_FUNCTIONS + + def _print_ThreadIndexingSymbol(self, node): + symbol_name: str = node.name + function_name, dimension = tuple(symbol_name.split(".")) + dimension = self.DIMENSION_MAPPING[dimension] + function_name = self.INDEXING_FUNCTION_MAPPING[function_name] + return f"{function_name}({dimension})" + + def _print_TextureAccess(self, node): + raise NotImplementedError() + + # For math functions, OpenCL is more similar to the C++ printer CustomSympyPrinter + # since built-in math functions are generic. + # In CUDA, you have to differentiate between `sin` and `sinf` + _print_math_func = CustomSympyPrinter._print_math_func + _print_Pow = CustomSympyPrinter._print_Pow + + def _print_Function(self, expr): + if isinstance(expr, fast_division): + return "native_divide(%s, %s)" % tuple(self._print(a) for a in expr.args) + elif isinstance(expr, fast_sqrt): + return "native_sqrt(%s)" % tuple(self._print(a) for a in expr.args) + elif isinstance(expr, fast_inv_sqrt): + return "native_rsqrt(%s)" % tuple(self._print(a) for a in expr.args) + return CustomSympyPrinter._print_Function(self, expr) diff --git a/pystencils/gpucuda/indexing.py b/pystencils/gpucuda/indexing.py index f6f1fbe80c3da6c1dea78cf687fe96dc6810ec10..4c8701b2599dd9b4570c7d7f8875e0e6b58bef36 100644 --- a/pystencils/gpucuda/indexing.py +++ b/pystencils/gpucuda/indexing.py @@ -1,8 +1,8 @@ import abc from functools import partial -from typing import Tuple # noqa import sympy as sp +from sympy.core.cache import cacheit from pystencils.astnodes import Block, Conditional from pystencils.data_types import TypedSymbol, create_type @@ -10,10 +10,24 @@ from pystencils.integer_functions import div_ceil, div_floor from pystencils.slicing import normalize_slice from pystencils.sympyextensions import is_integer_sequence, prod -BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')] -THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')] -BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')] -GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')] + +class ThreadIndexingSymbol(TypedSymbol): + def __new__(cls, *args, **kwds): + obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds) + return obj + + def __new_stage2__(cls, name, dtype, *args, **kwargs): + obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs) + return obj + + __xnew__ = staticmethod(__new_stage2__) + __xnew_cached_ = staticmethod(cacheit(__new_stage2__)) + + +BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')] +THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')] +BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')] +GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')] class AbstractIndexing(abc.ABC): @@ -69,6 +83,7 @@ class AbstractIndexing(abc.ABC): def symbolic_parameters(self): """Set of symbols required in call_parameters code""" + # -------------------------------------------- Implementations --------------------------------------------------------- @@ -82,6 +97,7 @@ class BlockIndexing(AbstractIndexing): gets the largest amount of threads compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used """ + def __init__(self, field, iteration_slice, block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, maximum_block_size=(1024, 1024, 64)): diff --git a/pystencils/include/opencl_stdint.h b/pystencils/include/opencl_stdint.h new file mode 100644 index 0000000000000000000000000000000000000000..14ad55852fe87f42a569fb4159187a27f348e7ce --- /dev/null +++ b/pystencils/include/opencl_stdint.h @@ -0,0 +1,16 @@ +#ifndef OPENCL_STDINT +#define OPENCL_STDINT + +typedef unsigned int uint; +typedef unsigned int uint_t; + +typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef signed long int int64_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long int uint64_t; + +#endif diff --git a/pystencils/opencl/opencljit.py b/pystencils/opencl/opencljit.py new file mode 100644 index 0000000000000000000000000000000000000000..75076db53bd8dd37457fc4bb9c9dea833942d57f --- /dev/null +++ b/pystencils/opencl/opencljit.py @@ -0,0 +1,79 @@ +import numpy as np + +from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments +from pystencils.include import get_pystencils_include_path + +USE_FAST_MATH = True + + +def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None): + """ + Creates a **OpenCL** kernel function from an abstract syntax tree which + was created for the ``target='gpu'`` e.g. by :func:`pystencils.gpucuda.create_cuda_kernel` + or :func:`pystencils.gpucuda.created_indexed_cuda_kernel` + + Args: + opencl_queue: a valid :class:`pyopencl.CommandQueue` + opencl_ctx: a valid :class:`pyopencl.Context` + kernel_function_node: the abstract syntax tree + argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the + returned kernel functor. + + Returns: + compiled kernel as Python function + """ + import pyopencl as cl + assert opencl_ctx, "No valid OpenCL context" + assert opencl_queue, "No valid OpenCL queue" + + if argument_dict is None: + argument_dict = {} + + # Changing of kernel name necessary since compilation with default name "kernel" is not possible (OpenCL keyword!) + kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name + header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node)) + includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) + + code = includes + "\n" + code += "#define FUNC_PREFIX __kernel\n" + code += "#define RESTRICT restrict\n\n" + code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend)) + options = [] + if USE_FAST_MATH: + options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only") + options.append("-I \"" + get_pystencils_include_path() + "\"") + mod = cl.Program(opencl_ctx, code).build(options=options) + func = getattr(mod, kernel_function_node.function_name) + + parameters = kernel_function_node.get_parameters() + + cache = {} + cache_values = [] + + def wrapper(**kwargs): + key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v)) + for k, v in kwargs.items())) + try: + args, block_and_thread_numbers = cache[key] + func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args) + except KeyError: + full_arguments = argument_dict.copy() + full_arguments.update(kwargs) + shape = _check_arguments(parameters, full_arguments) + + indexing = kernel_function_node.indexing + block_and_thread_numbers = indexing.call_parameters(shape) + block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) + block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'], + block_and_thread_numbers['grid'])) + + args = _build_numpy_argument_list(parameters, full_arguments) + args = [a.data if hasattr(a, 'data') else a for a in args] + cache[key] = (args, block_and_thread_numbers) + cache_values.append(kwargs) # keep objects alive such that ids remain unique + func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args) + + wrapper.ast = kernel_function_node + wrapper.parameters = kernel_function_node.get_parameters() + return wrapper diff --git a/pystencils_tests/test_opencl.py b/pystencils_tests/test_opencl.py new file mode 100644 index 0000000000000000000000000000000000000000..24ef56f9b5e868496d4240d33fe708aaf82b43ed --- /dev/null +++ b/pystencils_tests/test_opencl.py @@ -0,0 +1,199 @@ +import numpy as np +import pytest +import sympy as sp + +import pystencils +from pystencils.backends.cuda_backend import CudaBackend +from pystencils.backends.opencl_backend import OpenClBackend +from pystencils.opencl.opencljit import make_python_function + +try: + import pyopencl as cl + HAS_OPENCL = True +except Exception: + HAS_OPENCL = False + + +def test_print_opencl(): + z, y, x = pystencils.fields("z, y, x: [2d]") + + assignments = pystencils.AssignmentCollection({ + z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + }) + + print(assignments) + + ast = pystencils.create_kernel(assignments, target='gpu') + + print(ast) + + code = pystencils.show_code(ast, custom_backend=CudaBackend()) + print(code) + + opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend()) + print(opencl_code) + + assert "__global double * RESTRICT const _data_x" in str(opencl_code) + assert "__global double * RESTRICT" in str(opencl_code) + assert "get_local_id(0)" in str(opencl_code) + + +@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl") +def test_opencl_jit_fixed_size(): + z, y, x = pystencils.fields("z, y, x: [20,30]") + + assignments = pystencils.AssignmentCollection({ + z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + }) + + print(assignments) + + ast = pystencils.create_kernel(assignments, target='gpu') + + print(ast) + + code = pystencils.show_code(ast, custom_backend=CudaBackend()) + print(code) + opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend()) + print(opencl_code) + + cuda_kernel = ast.compile() + assert cuda_kernel is not None + + import pycuda.gpuarray as gpuarray + + x_cpu = np.random.rand(20, 30) + y_cpu = np.random.rand(20, 30) + z_cpu = np.random.rand(20, 30) + + x = gpuarray.to_gpu(x_cpu) + y = gpuarray.to_gpu(y_cpu) + z = gpuarray.to_gpu(z_cpu) + cuda_kernel(x=x, y=y, z=z) + + result_cuda = z.get() + + import pyopencl.array as array + ctx = cl.create_some_context(0) + queue = cl.CommandQueue(ctx) + + x = array.to_device(queue, x_cpu) + y = array.to_device(queue, y_cpu) + z = array.to_device(queue, z_cpu) + + opencl_kernel = make_python_function(ast, queue, ctx) + assert opencl_kernel is not None + opencl_kernel(x=x, y=y, z=z) + + result_opencl = z.get(queue) + + assert np.allclose(result_cuda, result_opencl) + + +@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl") +def test_opencl_jit(): + z, y, x = pystencils.fields("z, y, x: [2d]") + + assignments = pystencils.AssignmentCollection({ + z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + }) + + print(assignments) + + ast = pystencils.create_kernel(assignments, target='gpu') + + print(ast) + + code = pystencils.show_code(ast, custom_backend=CudaBackend()) + print(code) + opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend()) + print(opencl_code) + + cuda_kernel = ast.compile() + assert cuda_kernel is not None + + import pycuda.gpuarray as gpuarray + + x_cpu = np.random.rand(20, 30) + y_cpu = np.random.rand(20, 30) + z_cpu = np.random.rand(20, 30) + + x = gpuarray.to_gpu(x_cpu) + y = gpuarray.to_gpu(y_cpu) + z = gpuarray.to_gpu(z_cpu) + cuda_kernel(x=x, y=y, z=z) + + result_cuda = z.get() + + import pyopencl.array as array + ctx = cl.create_some_context(0) + queue = cl.CommandQueue(ctx) + + x = array.to_device(queue, x_cpu) + y = array.to_device(queue, y_cpu) + z = array.to_device(queue, z_cpu) + + opencl_kernel = make_python_function(ast, queue, ctx) + assert opencl_kernel is not None + opencl_kernel(x=x, y=y, z=z) + + result_opencl = z.get(queue) + + assert np.allclose(result_cuda, result_opencl) + + +@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl") +def test_opencl_jit_with_parameter(): + z, y, x = pystencils.fields("z, y, x: [2d]") + + a = sp.Symbol('a') + assignments = pystencils.AssignmentCollection({ + z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + a + }) + + print(assignments) + + ast = pystencils.create_kernel(assignments, target='gpu') + + print(ast) + + code = pystencils.show_code(ast, custom_backend=CudaBackend()) + print(code) + opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend()) + print(opencl_code) + + cuda_kernel = ast.compile() + assert cuda_kernel is not None + + import pycuda.gpuarray as gpuarray + + x_cpu = np.random.rand(20, 30) + y_cpu = np.random.rand(20, 30) + z_cpu = np.random.rand(20, 30) + + x = gpuarray.to_gpu(x_cpu) + y = gpuarray.to_gpu(y_cpu) + z = gpuarray.to_gpu(z_cpu) + cuda_kernel(x=x, y=y, z=z, a=5.) + + result_cuda = z.get() + + import pyopencl.array as array + ctx = cl.create_some_context(0) + queue = cl.CommandQueue(ctx) + + x = array.to_device(queue, x_cpu) + y = array.to_device(queue, y_cpu) + z = array.to_device(queue, z_cpu) + + opencl_kernel = make_python_function(ast, queue, ctx) + assert opencl_kernel is not None + opencl_kernel(x=x, y=y, z=z, a=5.) + + result_opencl = z.get(queue) + + assert np.allclose(result_cuda, result_opencl) + + +if __name__ == '__main__': + test_opencl_jit() diff --git a/setup.py b/setup.py index 6959f09c28b6d9508802c4d0b1c81999e1b4463c..da6b82856b17e5252941154d43310ec1888eaa39 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,10 @@ setup(name='pystencils', url='https://i10git.cs.fau.de/pycodegen/pystencils/', packages=['pystencils'] + ['pystencils.' + s for s in find_packages('pystencils')], install_requires=['sympy>=1.1', 'numpy', 'appdirs', 'joblib'], - package_data={'pystencils': ['include/*.h', 'backends/cuda_known_functions.txt']}, + package_data={'pystencils': ['include/*.h', + 'backends/cuda_known_functions.txt', + 'backends/opencl1.1_known_functions.txt']}, + ext_modules=cython_extensions("pystencils.boundaries.createindexlistcython"), classifiers=[ 'Development Status :: 4 - Beta', @@ -106,6 +109,7 @@ setup(name='pystencils', }, extras_require={ 'gpu': ['pycuda'], + 'opencl': ['pyopencl'], 'alltrafos': ['islpy', 'py-cpuinfo'], 'bench_db': ['blitzdb', 'pymongo', 'pandas'], 'interactive': ['matplotlib', 'ipy_table', 'imageio', 'jupyter', 'pyevtk'],