Commit b93d6992 authored by Stephan Seitz

Implement opencljit

parent c07ccb36
1 merge request: !29 Basic support for OpenCL (experimental)
pystencils/backends/opencl_backend.py
@@ -18,7 +18,8 @@ def generate_opencl(astnode: Node, signature_only: bool = False) -> str:
 class OpenClBackend(CudaBackend):
-    def __init__(self, sympy_printer=None,
+    def __init__(self,
+                 sympy_printer=None,
                  signature_only=False):
         if not sympy_printer:
             sympy_printer = OpenClSympyPrinter()
@@ -26,12 +27,6 @@ class OpenClBackend(CudaBackend):
         super().__init__(sympy_printer, signature_only)
         self._dialect = 'opencl'
-    # def _print_SympyAssignment(self, node):
-    #     code = super()._print_SympyAssignment(node)
-    #     if node.is_declaration and isinstance(node.lhs.dtype, pystencils.data_types.PointerType):
-    #         return "__global " + code
-    #     else:
-    #         return code
     def _print_Type(self, node):
         code = super()._print_Type(node)
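For orientation (not part of the commit): the backend plugs into pystencils' code printing, so the generated OpenCL source can be inspected roughly as in the sketch below. The fields and the assignment are made up for illustration; the API calls mirror the tests further down.

import pystencils
from pystencils.backends.opencl_backend import OpenClBackend

# hypothetical kernel, just to have an AST to print
z, y, x = pystencils.fields("z, y, x: [2d]")
assignments = pystencils.AssignmentCollection({z[0, 0]: x[0, 0] + y[0, 0]})
ast = pystencils.create_kernel(assignments, target='gpu')

# print the kernel through the new OpenCL backend
print(pystencils.show_code(ast, custom_backend=OpenClBackend()))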
pystencils/include/opencl_stdint.h (new header; the path is inferred from get_pystencils_include_path() and the include below. OpenCL C ships no <stdint.h>, so the fixed-width types used by generated kernels are typedef'd manually; only the first line is visible in this view)
typedef long int int64_t;
pystencils/opencl/opencljit.py (new file)

import numpy as np

from pystencils.backends.cbackend import generate_c, get_headers
from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
from pystencils.include import get_pystencils_include_path

USE_FAST_MATH = True


def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
    """
    Creates a kernel function from an abstract syntax tree which
    was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`

    Args:
        kernel_function_node: the abstract syntax tree
        opencl_queue: a pyopencl command queue the kernel is enqueued on
        opencl_ctx: the pyopencl context the program is built for
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.

    Returns:
        compiled kernel as Python function
    """
    import pyopencl as cl

    if argument_dict is None:
        argument_dict = {}

    # prefix the name so it cannot clash with the CUDA kernel generated from the same AST
    kernel_function_node.function_name = "opencl_" + kernel_function_node.function_name
    header_list = ['"opencl_stdint.h"'] + list(get_headers(kernel_function_node))
    includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])

    # assemble the OpenCL source: includes, dialect defines, then the generated kernel
    code = includes + "\n"
    code += "#define FUNC_PREFIX __kernel\n"
    code += "#define RESTRICT restrict\n\n"
    code += str(generate_c(kernel_function_node, dialect='opencl', custom_backend=custom_backend))

    options = []
    if USE_FAST_MATH:
        options.append("-cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math -cl-finite-math-only")
    options.append("-I \"" + get_pystencils_include_path() + "\"")
    mod = cl.Program(opencl_ctx, code).build(options=options)
    func = getattr(mod, kernel_function_node.function_name)

    parameters = kernel_function_node.get_parameters()

    cache = {}
    cache_values = []

    def wrapper(**kwargs):
        key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
                         for k, v in kwargs.items()))
        try:
            args, block_and_thread_numbers = cache[key]
            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)
        except KeyError:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
            shape = _check_arguments(parameters, full_arguments)

            indexing = kernel_function_node.indexing
            block_and_thread_numbers = indexing.call_parameters(shape)
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            # OpenCL expects the *global* work size, i.e. block size times the CUDA-style grid size
            block_and_thread_numbers['grid'] = tuple(int(b * g) for (b, g) in zip(block_and_thread_numbers['block'],
                                                                                  block_and_thread_numbers['grid']))

            args = _build_numpy_argument_list(parameters, full_arguments)
            # arrays are passed via their pyopencl buffer; scalars go through unchanged
            # (filtering out arguments without .data would drop scalar kernel parameters)
            args = [a.data if hasattr(a, 'data') else a for a in args]
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
            func(opencl_queue, block_and_thread_numbers['grid'], block_and_thread_numbers['block'], *args)

    wrapper.ast = kernel_function_node
    wrapper.parameters = kernel_function_node.get_parameters()
    return wrapper
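A minimal usage sketch of the new JIT (not part of the commit; field names, sizes, and the assignment are illustrative, and it assumes a working OpenCL runtime is available):

import numpy as np
import pyopencl as cl
import pyopencl.array as array

import pystencils
from pystencils.opencl.opencljit import make_python_function

# hypothetical kernel: double every cell of src into dst
src, dst = pystencils.fields("src, dst: [20,30]")
assignments = pystencils.AssignmentCollection({dst[0, 0]: 2 * src[0, 0]})
ast = pystencils.create_kernel(assignments, target='gpu')

ctx = cl.create_some_context(0)   # 0 -> pick a device non-interactively
queue = cl.CommandQueue(ctx)

kernel = make_python_function(ast, queue, ctx)
src_gpu = array.to_device(queue, np.random.rand(20, 30))
dst_gpu = array.to_device(queue, np.zeros((20, 30)))
kernel(src=src_gpu, dst=dst_gpu)  # arguments are passed by field name
print(dst_gpu.get(queue))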
pystencils_tests/test_opencl.py (test_opencl_backend is renamed to test_print_opencl, and a new test_opencl_jit is added)

import numpy as np
import pyopencl as cl
import sympy as sp

import pystencils
from pystencils.backends.cuda_backend import CudaBackend
from pystencils.backends.opencl_backend import OpenClBackend
from pystencils.opencl.opencljit import make_python_function


def test_print_opencl():
    z, y, x = pystencils.fields("z, y, x: [2d]")

    assignments = pystencils.AssignmentCollection({
    # ... middle of the test collapsed in the diff view (hunk @@ -24,6 +27,61 @@) ...
    opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend())
    print(opencl_code)

    assert "__global double * RESTRICT const _data_x" in str(opencl_code)
    assert "__global double * RESTRICT" in str(opencl_code)
    assert "get_local_id(0)" in str(opencl_code)


def test_opencl_jit():
    z, y, x = pystencils.fields("z, y, x: [20,30]")

    assignments = pystencils.AssignmentCollection({
        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
    })

    print(assignments)

    ast = pystencils.create_kernel(assignments, target='gpu')
    print(ast)

    code = pystencils.show_code(ast, custom_backend=CudaBackend())
    print(code)
    opencl_code = pystencils.show_code(ast, custom_backend=OpenClBackend())
    print(opencl_code)

    # run the same AST through the CUDA JIT first, as a reference
    cuda_kernel = ast.compile()
    assert cuda_kernel is not None

    import pycuda.gpuarray as gpuarray

    x_cpu = np.random.rand(20, 30)
    y_cpu = np.random.rand(20, 30)
    z_cpu = np.random.rand(20, 30)
    x = gpuarray.to_gpu(x_cpu)
    y = gpuarray.to_gpu(y_cpu)
    z = gpuarray.to_gpu(z_cpu)
    cuda_kernel(x=x, y=y, z=z)
    result_cuda = z.get()

    # ... then through the new OpenCL JIT, on the same input data
    import pyopencl.array as array

    ctx = cl.create_some_context(0)
    queue = cl.CommandQueue(ctx)

    x = array.to_device(queue, x_cpu)
    y = array.to_device(queue, y_cpu)
    z = array.to_device(queue, z_cpu)

    opencl_kernel = make_python_function(ast, queue, ctx)
    assert opencl_kernel is not None
    opencl_kernel(x=x, y=y, z=z)

    result_opencl = z.get(queue)

    # both backends must agree on the result
    assert np.allclose(result_cuda, result_opencl)


if __name__ == '__main__':
    test_print_opencl()
    test_opencl_jit()