diff --git a/pystencils/backends/opencl_backend.py b/pystencils/backends/opencl_backend.py index b3a89ccc6d034921f30182ca10c81f508fc18e82..d44c944ddfe86329b5e31a554ff91ab57cce1568 100644 --- a/pystencils/backends/opencl_backend.py +++ b/pystencils/backends/opencl_backend.py @@ -18,7 +18,8 @@ def generate_opencl(astnode: Node, signature_only: bool = False) -> str: class OpenClBackend(CudaBackend): - def __init__(self, sympy_printer=None, + def __init__(self, + sympy_printer=None, signature_only=False): if not sympy_printer: sympy_printer = OpenClSympyPrinter() @@ -26,12 +27,6 @@ class OpenClBackend(CudaBackend): super().__init__(sympy_printer, signature_only) self._dialect = 'opencl' - # def _print_SympyAssignment(self, node): - # code = super()._print_SympyAssignment(node) - # if node.is_declaration and isinstance(node.lhs.dtype, pystencils.data_types.PointerType): - # return "__global " + code - # else: - # return code def _print_Type(self, node): code = super()._print_Type(node) diff --git a/pystencils/include/opencl_stdint.h b/pystencils/include/opencl_stdint.h new file mode 100644 index 0000000000000000000000000000000000000000..ac74a1454b9031ef4c98f6072e6c695ac402d0a2 --- /dev/null +++ b/pystencils/include/opencl_stdint.h @@ -0,0 +1 @@ +typedef long int int64_t; diff --git a/pystencils/opencl/opencljit.py b/pystencils/opencl/opencljit.py new file mode 100644 index 0000000000000000000000000000000000000000..7f4bdb659f07179e13c04d404a50a6fe17b19a18 --- /dev/null +++ b/pystencils/opencl/opencljit.py @@ -0,0 +1,74 @@ +import numpy as np + +from pystencils.backends.cbackend import generate_c, get_headers +from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments +from pystencils.include import get_pystencils_include_path + +USE_FAST_MATH = True + + +def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None): + """ + Creates a kernel function from an abstract syntax tree which + was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel` + or :func:`pystencils.gpucuda.created_indexed_cuda_kernel` + + Args: + kernel_function_node: the abstract syntax tree + argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the + returned kernel functor. + + Returns: + compiled kernel as Python function + """ + import pyopencl as cl + + if argument_dict is None: + argument_dict = {} + + kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name + header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node)) + includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) + + code = includes + "\n" + code += "#define FUNC_PREFIX __kernel\n" + code += "#define RESTRICT restrict\n\n" + code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend)) + options = [] + if USE_FAST_MATH: + options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only") + options.append("-I \"" + get_pystencils_include_path() + "\"") + mod = cl.Program(opencl_ctx, code).build(options=options) + func = getattr(mod, kernel_function_node.function_name) + + parameters = kernel_function_node.get_parameters() + + cache = {} + cache_values = [] + + def wrapper(**kwargs): + key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v)) + for k, v in kwargs.items())) + try: + args, block_and_thread_numbers = cache[key] + func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args) + except KeyError: + full_arguments = argument_dict.copy() + full_arguments.update(kwargs) + shape = _check_arguments(parameters, full_arguments) + + indexing = kernel_function_node.indexing + block_and_thread_numbers = indexing.call_parameters(shape) + block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) + block_and_thread_numbers['grid'] = tuple(int(b*g) for (b, g) in zip(block_and_thread_numbers['block'], + block_and_thread_numbers['grid'])) + + args = _build_numpy_argument_list(parameters, full_arguments) + args = [a.data for a in args if hasattr(a, 'data')] + cache[key] = (args, block_and_thread_numbers) + cache_values.append(kwargs) # keep objects alive such that ids remain unique + func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args) + + wrapper.ast = kernel_function_node + wrapper.parameters = kernel_function_node.get_parameters() + return wrapper diff --git a/pystencils_tests/test_opencl.py b/pystencils_tests/test_opencl.py index 00d84215cea7d4032326a048a07935a9e8221554..5216d52f8da903dad81c2f3041a202861c31f0f6 100644 --- a/pystencils_tests/test_opencl.py +++ b/pystencils_tests/test_opencl.py @@ -1,11 +1,14 @@ +import numpy as np +import pyopencl as cl import sympy as sp import pystencils from pystencils.backends.cuda_backend import CudaBackend from pystencils.backends.opencl_backend import OpenClBackend +from pystencils.opencl.opencljit import make_python_function -def test_opencl_backend(): +def test_print_opencl(): z, y, x = pystencils.fields("z, y, x: [2d]") assignments = pystencils.AssignmentCollection({ @@ -24,6 +27,61 @@ def test_opencl_backend(): opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend()) print(opencl_code) + assert "__global double * RESTRICT const _data_x" in str(opencl_code) + assert "__global double * RESTRICT" in str(opencl_code) + assert "get_local_id(0)" in str(opencl_code) + + +def test_opencl_jit(): + z, y, x = pystencils.fields("z, y, x: [20,30]") + + assignments = pystencils.AssignmentCollection({ + z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0]) + }) + + print(assignments) + + ast = pystencils.create_kernel(assignments, target='gpu') + + print(ast) + + code = pystencils.show_code(ast, custom_backend=CudaBackend()) + print(code) + opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend()) + print(opencl_code) + + cuda_kernel = ast.compile() + assert cuda_kernel is not None + + import pycuda.gpuarray as gpuarray + + x_cpu = np.random.rand(20, 30) + y_cpu = np.random.rand(20, 30) + z_cpu = np.random.rand(20, 30) + + x = gpuarray.to_gpu(x_cpu) + y = gpuarray.to_gpu(y_cpu) + z = gpuarray.to_gpu(z_cpu) + cuda_kernel(x=x, y=y, z=z) + + result_cuda = z.get() + + import pyopencl.array as array + ctx = cl.create_some_context(0) + queue = cl.CommandQueue(ctx) + + x = array.to_device(queue, x_cpu) + y = array.to_device(queue, y_cpu) + z = array.to_device(queue, z_cpu) + + opencl_kernel = make_python_function(ast, queue, ctx) + assert opencl_kernel is not None + opencl_kernel(x=x, y=y, z=z) + + result_opencl = z.get(queue) + + assert np.allclose(result_cuda, result_opencl) + if __name__ == '__main__': - test_opencl_backend() + test_opencl_jit()