diff --git a/conftest.py b/conftest.py
index d1da0451aef163be92d96c6133076361d3531b37..c20bde40916e7f7f5163c20e1c48c7b0f3d3b3a2 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,9 +1,13 @@
 import os
-import pytest
-import tempfile
 import runpy
 import sys
 import warnings
+import tempfile
+
+import nbformat
+import pytest
+from nbconvert import PythonExporter
+
 # Trigger config file reading / creation once - to avoid race conditions when multiple instances are creating it
 # at the same time
 from pystencils.cpu import cpujit
@@ -15,7 +19,7 @@ try:
     pyximport.install(language_level=3)
 except ImportError:
     pass
 from pystencils.boundaries.createindexlistcython import *  # NOQA
 
 
 SCRIPT_FOLDER = os.path.dirname(os.path.realpath(__file__))
@@ -29,7 +33,8 @@ def add_path_to_ignore(path):
     collect_ignore += [os.path.join(SCRIPT_FOLDER, path, f) for f in os.listdir(os.path.join(SCRIPT_FOLDER, path))]
 
 
-collect_ignore = [os.path.join(SCRIPT_FOLDER, "doc", "conf.py")]
+collect_ignore = [os.path.join(SCRIPT_FOLDER, "doc", "conf.py"),
+                  os.path.join(SCRIPT_FOLDER, "pystencils", "opencl", "opencl.autoinit")]
 add_path_to_ignore('pystencils_tests/benchmark')
 add_path_to_ignore('_local_tmp')
 
@@ -78,8 +83,4 @@
             collect_ignore.append(f)
 
 
-import nbformat
-from nbconvert import PythonExporter
-
-
 class IPythonMockup:
diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 559295930911c390f90cddafadb4d0c8da2b4b66..9efca1f53a6870d2b0ce2bdd1da547d16beb6a6c 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -353,6 +353,9 @@ class CustomSympyPrinter(CCodePrinter):
         result = super(CustomSympyPrinter, self)._print_Piecewise(expr)
         return result.replace("\n", "")
 
+    def _print_Type(self, node):
+        return str(node)
+
     def _print_Function(self, expr):
         infix_functions = {
             bitwise_xor: '^',
@@ -365,7 +368,7 @@ class CustomSympyPrinter(CCodePrinter):
             return expr.to_c(self._print)
         if isinstance(expr, reinterpret_cast_func):
             arg, data_type = expr.args
-            return "*((%s)(& %s))" % (PointerType(data_type, restrict=False), self._print(arg))
+            return "*((%s)(& %s))" % (self._print(PointerType(data_type, restrict=False)), self._print(arg))
         elif isinstance(expr, address_of):
             assert len(expr.args) == 1, "address_of must only have one argument"
             return "&(%s)" % self._print(expr.args[0])
diff --git a/pystencils/backends/opencl_backend.py b/pystencils/backends/opencl_backend.py
index 60ea06be9723005654155b256f35e094db0005f6..3ab7f820ea30adebf584977625b2e559f897ca27 100644
--- a/pystencils/backends/opencl_backend.py
+++ b/pystencils/backends/opencl_backend.py
@@ -68,6 +68,13 @@ class OpenClSympyPrinter(CudaSympyPrinter):
         CustomSympyPrinter.__init__(self)
         self.known_functions = OPENCL_KNOWN_FUNCTIONS
 
+    def _print_Type(self, node):
+        code = super()._print_Type(node)
+        if isinstance(node, pystencils.data_types.PointerType):
+            return "__global " + code
+        else:
+            return code
+
     def _print_ThreadIndexingSymbol(self, node):
         symbol_name: str = node.name
         function_name, dimension = tuple(symbol_name.split("."))
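
Review note: the `_print_Type` hook added in `cbackend.py` above is what makes this override possible — pointer types are now routed through the printer instead of being `str()`-ed inline. A rough illustration of the intended effect (output format approximate; `PointerType`/`create_type` are from `pystencils.data_types`):

```python
from pystencils.backends.opencl_backend import OpenClSympyPrinter
from pystencils.data_types import PointerType, create_type

printer = OpenClSympyPrinter()
scalar = create_type('double')
pointer = PointerType(scalar, restrict=False)

print(printer._print_Type(scalar))   # plain scalar type: double
print(printer._print_Type(pointer))  # qualified pointer, roughly: __global double *
```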
diff --git a/pystencils/boundaries/boundaryhandling.py b/pystencils/boundaries/boundaryhandling.py
index d258de4b3e5e7d5b85c69d0910ea384b77ecd110..94d12619663190104f80894efebdc0955d5ac592 100644
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
@@ -87,8 +87,26 @@ class BoundaryHandling:
         fi = flag_interface
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
 
-        gpu = self._target == 'gpu'
-        data_handling.add_custom_class(self._index_array_name, self.IndexFieldBlockData, cpu=True, gpu=gpu)
+        def to_cpu(gpu_version, cpu_version):
+            gpu_version = gpu_version.boundary_object_to_index_list
+            cpu_version = cpu_version.boundary_object_to_index_list
+            for obj, cpu_arr in cpu_version.items():
+                self.data_handling.array_handler.download(gpu_version[obj], cpu_arr)
+
+        def to_gpu(gpu_version, cpu_version):
+            gpu_version = gpu_version.boundary_object_to_index_list
+            cpu_version = cpu_version.boundary_object_to_index_list
+            for obj, cpu_arr in cpu_version.items():
+                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
+                    gpu_version[obj] = self.data_handling.array_handler.to_gpu(cpu_arr)
+                else:
+                    self.data_handling.array_handler.upload(gpu_version[obj], cpu_arr)
+
+        class_ = self.IndexFieldBlockData
+        class_.to_cpu = to_cpu
+        class_.to_gpu = to_gpu
+        gpu = self._target in data_handling._GPU_LIKE_TARGETS
+        data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)
 
     @property
     def data_handling(self):
@@ -204,7 +222,7 @@
         if self._dirty:
             self.prepare()
 
-        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 kwargs[self._field_name] = b[self._field_name]
                 kwargs['indexField'] = idx_arr
@@ -219,7 +237,7 @@
         if self._dirty:
             self.prepare()
 
-        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 arguments = kwargs.copy()
                 arguments[self._field_name] = b[self._field_name]
@@ -302,7 +320,7 @@
     def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
         if boundary_obj.additional_data_init_callback:
             boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
-        if self._target == 'gpu':
+        if self._target in self._data_handling._GPU_LIKE_TARGETS:
             self._data_handling.to_gpu(self._index_array_name)
 
     class BoundaryInfo(object):
@@ -312,7 +330,7 @@
             self.kernel = kernel
 
     class IndexFieldBlockData:
-        def __init__(self, *_1, **_2):
+        def __init__(self, *args, **kwargs):
             self.boundary_object_to_index_list = {}
             self.boundary_object_to_data_setter = {}
 
@@ -320,24 +338,6 @@
             self.boundary_object_to_index_list.clear()
             self.boundary_object_to_data_setter.clear()
 
-        @staticmethod
-        def to_cpu(gpu_version, cpu_version):
-            gpu_version = gpu_version.boundary_object_to_index_list
-            cpu_version = cpu_version.boundary_object_to_index_list
-            for obj, cpu_arr in cpu_version.items():
-                gpu_version[obj].get(cpu_arr)
-
-        @staticmethod
-        def to_gpu(gpu_version, cpu_version):
-            from pycuda import gpuarray
-            gpu_version = gpu_version.boundary_object_to_index_list
-            cpu_version = cpu_version.boundary_object_to_index_list
-            for obj, cpu_arr in cpu_version.items():
-                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
-                    gpu_version[obj] = gpuarray.to_gpu(cpu_arr)
-                else:
-                    gpu_version[obj].set(cpu_arr)
-
 
 class BoundaryDataSetter:
 
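Review note: a sketch of how the rebound `to_cpu`/`to_gpu` closures get exercised end to end — the custom index-array class is registered with the data handling, which later calls them as transfer functions. Assumes pyopencl and the exports of `pystencils.boundaries`:

```python
import pystencils as ps
import pystencils.opencl.autoinit  # NOQA -- creates a global OpenCL context/queue
from pystencils.boundaries import BoundaryHandling, Neumann

dh = ps.create_data_handling((20, 20), default_target='opencl')
f = dh.add_array('f')

stencil = [(0, 0), (1, 0), (-1, 0), (0, 1), (0, -1)]
bh = BoundaryHandling(dh, f.name, stencil, target='opencl')

# setting a boundary marks the handling dirty; applying it triggers prepare(),
# which uploads the index lists through array_handler.to_gpu/upload
bh.set_boundary(Neumann(), ps.make_slice[0, :])
bh()
```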
diff --git a/pystencils/datahandling/__init__.py b/pystencils/datahandling/__init__.py
index 583f121b09ffbfdc6dc2103ec6e300e639791891..a4fa55bdc7a52e1b9c2015e2210fcbb48aaeb2e1 100644
--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -20,7 +20,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: str = 'cpu',
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         opencl_queue=None) -> DataHandling:
     """Creates a data handling instance.
 
     Args:
@@ -33,6 +34,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        opencl_queue: a pyopencl CommandQueue; only the serial data handling supports OpenCL
     """
     if parallel:
+        assert not opencl_queue, "OpenCL is only supported for SerialDataHandling"
         if wlb is None:
             raise ValueError("Cannot create parallel data handling because walberla module is not available")
 
@@ -56,8 +58,12 @@ def create_data_handling(domain_size: Tuple[int, ...],
         return ParallelDataHandling(blocks=block_storage, dim=dim, default_target=default_target,
                                     default_layout=default_layout, default_ghost_layers=default_ghost_layers)
     else:
-        return SerialDataHandling(domain_size, periodicity=periodicity, default_target=default_target,
-                                  default_layout=default_layout, default_ghost_layers=default_ghost_layers)
+        return SerialDataHandling(domain_size,
+                                  periodicity=periodicity,
+                                  default_target=default_target,
+                                  default_layout=default_layout,
+                                  default_ghost_layers=default_ghost_layers,
+                                  opencl_queue=opencl_queue)
 
 
 __all__ = ['create_data_handling']
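
Review note: usage sketch for the new parameter (assumes pyopencl). Since `create_data_handling` forwards only the queue, it is safest to also register the context/queue globally so that code paths that receive `None` can fall back to them:

```python
import pyopencl as cl
import pystencils as ps
from pystencils.opencl.opencljit import init_globally_with_context

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
init_globally_with_context(ctx, queue)

dh = ps.create_data_handling((32, 32), default_target='opencl', opencl_queue=queue)
dh.add_array('f')  # allocates host and device arrays ('opencl' is GPU-like)
dh.to_gpu('f')     # upload goes through PyOpenClArrayHandler
```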
diff --git a/pystencils/datahandling/datahandling_interface.py b/pystencils/datahandling/datahandling_interface.py
index ba960edc17b2e3409dc465c257e23d9e9394dc5c..af1a6ba1fc9d003042063023aa1ede5fc08665db 100644
--- a/pystencils/datahandling/datahandling_interface.py
+++ b/pystencils/datahandling/datahandling_interface.py
@@ -16,6 +16,9 @@ class DataHandling(ABC):
     'gather' function that has collects (parts of the) distributed data on a single process.
     """
 
+    _GPU_LIKE_TARGETS = ['gpu', 'opencl']
+    _GPU_LIKE_BACKENDS = ['gpucuda', 'opencl']
+
     # ---------------------------- Adding and accessing data -----------------------------------------------------------
 
     @property
diff --git a/pystencils/datahandling/pycuda.py b/pystencils/datahandling/pycuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..30602a30ce6b87d0e25861b43c5291cda77ed570
--- /dev/null
+++ b/pystencils/datahandling/pycuda.py
@@ -0,0 +1,39 @@
+try:
+    import pycuda.gpuarray as gpuarray
+except ImportError:
+    gpuarray = None
+import numpy as np
+
+import pystencils
+
+
+class PyCudaArrayHandler:
+
+    def __init__(self):
+        import pycuda.autoinit  # NOQA
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        return gpuarray.zeros(shape, dtype, order=order)
+
+    def ones(self, shape, dtype, order='C'):
+        return gpuarray.ones(shape, dtype, order=order)
+
+    def empty(self, shape, dtype=np.float64, layout=None):
+        if layout:
+            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
+            return self.to_gpu(cpu_array)
+        else:
+            return gpuarray.empty(shape, dtype)
+
+    def to_gpu(self, array):
+        return gpuarray.to_gpu(array)
+
+    def upload(self, gpuarray, numpy_array):
+        gpuarray.set(numpy_array)
+
+    def download(self, gpuarray, numpy_array):
+        gpuarray.get(numpy_array)
+
+    def randn(self, shape, dtype=np.float64):
+        cpu_array = np.random.randn(*shape).astype(dtype)
+        return self.to_gpu(cpu_array)
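
Review note: both new array handlers expose the same small contract (`to_gpu`, `upload`, `download`, ...), which is what lets `SerialDataHandling` stay backend-agnostic below. A round-trip sketch for the CUDA flavor (assumes pycuda):

```python
import numpy as np
from pystencils.datahandling.pycuda import PyCudaArrayHandler

handler = PyCudaArrayHandler()      # runs pycuda.autoinit
host = np.ones((4, 4))
dev = handler.to_gpu(host)          # device copy of the numpy array

handler.upload(dev, 2 * host)       # overwrite device data in place
out = np.empty_like(host)
handler.download(dev, out)          # copy device data back to the host
assert np.all(out == 2)
```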
diff --git a/pystencils/datahandling/pyopencl.py b/pystencils/datahandling/pyopencl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b6f44088f60c47d0c57c5185f1afd16ef16bac7
--- /dev/null
+++ b/pystencils/datahandling/pyopencl.py
@@ -0,0 +1,45 @@
+try:
+    import pyopencl.array as gpuarray
+except ImportError:
+    gpuarray = None
+
+import numpy as np
+
+import pystencils
+
+
+class PyOpenClArrayHandler:
+
+    def __init__(self, queue):
+        if not queue:
+            from pystencils.opencl.opencljit import get_global_cl_queue
+            queue = get_global_cl_queue()
+        assert queue, "OpenCL queue missing!\n" \
+            "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
+        self.queue = queue
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        return gpuarray.zeros(self.queue, shape, dtype, order)
+
+    def ones(self, shape, dtype, order='C'):
+        return gpuarray.ones(self.queue, shape, dtype, order)
+
+    def empty(self, shape, dtype=np.float64, layout=None):
+        if layout:
+            cpu_array = pystencils.field.create_numpy_array_with_layout(shape, dtype, layout)
+            return self.to_gpu(cpu_array)
+        else:
+            return gpuarray.empty(self.queue, shape, dtype)
+
+    def to_gpu(self, array):
+        return gpuarray.to_device(self.queue, array)
+
+    def upload(self, gpuarray, numpy_array):
+        gpuarray.set(numpy_array, self.queue)
+
+    def download(self, gpuarray, numpy_array):
+        gpuarray.get(self.queue, numpy_array)
+
+    def randn(self, shape, dtype=np.float64):
+        cpu_array = np.random.randn(*shape).astype(dtype)
+        return self.to_gpu(cpu_array)
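
Review note: the OpenCL twin differs only in the mandatory queue (explicit here; the global one from `pystencils.opencl.autoinit` works too):

```python
import numpy as np
import pyopencl as cl
from pystencils.datahandling.pyopencl import PyOpenClArrayHandler

queue = cl.CommandQueue(cl.create_some_context())
handler = PyOpenClArrayHandler(queue)

host = np.arange(12, dtype=np.float64).reshape(3, 4)
dev = handler.to_gpu(host)          # pyopencl.array.Array on the device
out = np.empty_like(host)
handler.download(dev, out)
assert np.array_equal(host, out)
```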
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index 697b6b6674a83b6786c8a6a3f687df3e49cd8a01..12c9b6f3df2757cbdb1eb03337be8bfe1aaf936d 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -6,22 +6,25 @@ import numpy as np
 
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
+from pystencils.datahandling.pycuda import PyCudaArrayHandler
+from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
 from pystencils.field import (
     Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple, spatial_layout_string_to_tuple)
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict
 
-try:
-    import pycuda.gpuarray as gpuarray
-    import pycuda.autoinit  # NOQA
-except ImportError:
-    gpuarray = None
-
 
 class SerialDataHandling(DataHandling):
 
-    def __init__(self, domain_size: Sequence[int], default_ghost_layers: int = 1, default_layout: str = 'SoA',
-                 periodicity: Union[bool, Sequence[bool]] = False, default_target: str = 'cpu') -> None:
+    def __init__(self,
+                 domain_size: Sequence[int],
+                 default_ghost_layers: int = 1,
+                 default_layout: str = 'SoA',
+                 periodicity: Union[bool, Sequence[bool]] = False,
+                 default_target: str = 'cpu',
+                 opencl_queue=None,
+                 opencl_ctx=None,
+                 array_handler=None) -> None:
         """
         Creates a data handling for single node simulations.
 
@@ -42,6 +45,19 @@ class SerialDataHandling(DataHandling):
         self.custom_data_cpu = DotDict()
         self.custom_data_gpu = DotDict()
         self._custom_data_transfer_functions = {}
+        self._opencl_queue = opencl_queue
+        self._opencl_ctx = opencl_ctx
+
+        if not array_handler:
+            try:
+                self.array_handler = PyCudaArrayHandler()
+            except Exception:
+                self.array_handler = None
+
+            if default_target == 'opencl' or opencl_queue:
+                self.array_handler = PyOpenClArrayHandler(opencl_queue)
+        else:
+            self.array_handler = array_handler
 
         if periodicity is None or periodicity is False:
             periodicity = [False] * self.dim
@@ -82,7 +98,7 @@ class SerialDataHandling(DataHandling):
         if layout is None:
             layout = self.default_layout
         if gpu is None:
-            gpu = self.default_target == 'gpu'
+            gpu = self.default_target in self._GPU_LIKE_TARGETS
 
         kwargs = {
             'shape': tuple(s + 2 * ghost_layers for s in self._domainSize),
@@ -126,7 +142,7 @@ class SerialDataHandling(DataHandling):
         if gpu:
             if name in self.gpu_arrays:
                 raise ValueError("GPU Field with this name already exists")
-            self.gpu_arrays[name] = gpuarray.to_gpu(cpu_arr)
+            self.gpu_arrays[name] = self.array_handler.to_gpu(cpu_arr)
 
         assert all(f.name != name for f in self.fields.values()), "Symbolic field with this name already exists"
         self.fields[name] = Field.create_from_numpy_array(name, cpu_arr, index_dimensions=index_dimensions,
@@ -222,12 +238,12 @@ class SerialDataHandling(DataHandling):
             self.to_gpu(name)
 
     def run_kernel(self, kernel_function, **kwargs):
-        arrays = self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays
+        arrays = self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays
         kernel_function(**arrays, **kwargs)
 
     def get_kernel_kwargs(self, kernel_function, **kwargs):
         result = {}
-        result.update(self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays)
+        result.update(self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays)
         result.update(kwargs)
         return [result]
 
@@ -236,14 +252,14 @@ class SerialDataHandling(DataHandling):
             transfer_func = self._custom_data_transfer_functions[name][1]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.gpu_arrays[name].get(self.cpu_arrays[name])
+            self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
 
     def to_gpu(self, name):
         if name in self._custom_data_transfer_functions:
             transfer_func = self._custom_data_transfer_functions[name][0]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.gpu_arrays[name].set(self.cpu_arrays[name])
+            self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
 
     def is_on_gpu(self, name):
         return name in self.gpu_arrays
@@ -257,6 +273,8 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, **_):
         if target is None:
             target = self.default_target
+        if target == 'opencl':
+            target = 'gpu'  # OpenCL shares the GPU synchronization code path
         assert target in ('cpu', 'gpu')
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]
@@ -298,11 +316,15 @@ class SerialDataHandling(DataHandling):
                     result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls))
                 else:
                     from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func
+                    target = 'opencl' if isinstance(self.array_handler, PyOpenClArrayHandler) else 'gpu'
                     result.append(boundary_func(filtered_stencil, self._domainSize,
                                                 index_dimensions=self.fields[name].index_dimensions,
                                                 index_dim_shape=values_per_cell,
                                                 dtype=self.fields[name].dtype.numpy_dtype,
-                                                ghost_layers=gls))
+                                                ghost_layers=gls,
+                                                target=target,
+                                                opencl_queue=self._opencl_queue,
+                                                opencl_ctx=self._opencl_ctx))
 
         if target == 'cpu':
             def result_functor():
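
Review note: with these changes an OpenCL run through `SerialDataHandling` looks the same as a CUDA one — only the target string changes. A sketch mirroring `kernel_execution_jacobi` from the tests (assumes pyopencl):

```python
import pystencils as ps
import pystencils.opencl.autoinit  # NOQA -- global context/queue

dh = ps.create_data_handling((16, 16), default_target='opencl')
dh.add_array('f')

@ps.kernel
def double():
    dh.fields.f.center @= 2 * dh.fields.f.center

kernel = ps.create_kernel(double, target='opencl').compile()
for block in dh.iterate(ghost_layers=1):
    block['f'].fill(1.0)
dh.all_to_gpu()        # upload via the array handler
dh.run_kernel(kernel)  # 'opencl' backend is GPU-like, so gpu_arrays are passed
dh.all_to_cpu()
```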
diff --git a/pystencils/display_utils.py b/pystencils/display_utils.py
index 638d1290acbbfc4d86bec12028dc59b37e2f98ea..610c404b709885e7445eba3be412f6811fca0241 100644
--- a/pystencils/display_utils.py
+++ b/pystencils/display_utils.py
@@ -45,7 +45,12 @@ def show_code(ast: KernelFunction, custom_backend=None):
     if isinstance(ast, KernelWrapper):
         ast = ast.ast
 
-    dialect = 'cuda' if ast.backend == 'gpucuda' else 'c'
+    if ast.backend == 'gpucuda':
+        dialect = 'cuda'
+    elif ast.backend == 'opencl':
+        dialect = 'opencl'
+    else:
+        dialect = 'c'
 
     class CodeDisplay:
         def __init__(self, ast_input):
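
Review note: sketch of the new dialect selection (assumes an OpenCL AST created as in `kernelcreation.py` below):

```python
import pystencils
import pystencils.opencl.autoinit  # NOQA

x, y = pystencils.fields('x, y: [10, 10]')
ast = pystencils.create_kernel([pystencils.Assignment(y.center, x.center)],
                               target='opencl')
code = str(pystencils.show_code(ast))  # rendered with the 'opencl' dialect
assert 'get_local_size' in code        # same smoke check as test_kernel_creation
```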
diff --git a/pystencils/gpucuda/periodicity.py b/pystencils/gpucuda/periodicity.py
index e94a1796e3a66da626fc4e13fcc0413f5f93961d..080ef44ebd995e1d8dcf4dbe1f0839e0a218fde6 100644
--- a/pystencils/gpucuda/periodicity.py
+++ b/pystencils/gpucuda/periodicity.py
@@ -1,7 +1,8 @@
 import numpy as np
 
+import pystencils.gpucuda
+import pystencils.opencl
 from pystencils import Assignment, Field
-from pystencils.gpucuda import make_python_function
 from pystencils.gpucuda.kernelcreation import create_cuda_kernel
 from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
 
@@ -25,16 +26,22 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
         update_eqs.append(eq)
 
     ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True)
-    return make_python_function(ast)
+    return ast
 
 
 def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=float):
+                                  thickness=None, dtype=float, target='gpu', opencl_queue=None, opencl_ctx=None):
+    assert target in ['gpu', 'opencl']
     src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
     kernels = []
     index_dimensions = index_dimensions
+
     for src_slice, dst_slice in src_dst_slice_tuples:
-        kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype))
+        ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
+        if target == 'gpu':
+            kernels.append(pystencils.gpucuda.make_python_function(ast))
+        else:
+            kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx))
 
     def functor(pdfs, **_):
         for kernel in kernels:
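
Review note: construction sketch for the OpenCL case — `opencl_queue`/`opencl_ctx` may stay `None` when a global pair exists:

```python
import pystencils.opencl.autoinit  # NOQA -- provides the global queue/context
from pystencils.gpucuda.periodicity import get_periodic_boundary_functor

sync = get_periodic_boundary_functor(stencil=[(1, 0), (-1, 0), (0, 1), (0, -1)],
                                     domain_size=(16, 16),
                                     target='opencl')
# called as sync(pdfs=<pyopencl array>): runs one copy kernel per slice pair
```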
diff --git a/pystencils/kernelcreation.py b/pystencils/kernelcreation.py
index 555196a623ce61fcfd2a9a40272f06c27fb70e54..a2dd55352548a4a28399920599394d53b5205465 100644
--- a/pystencils/kernelcreation.py
+++ b/pystencils/kernelcreation.py
@@ -1,4 +1,5 @@
-from itertools import combinations
+import functools
+import itertools
 from types import MappingProxyType
 
 import sympy as sp
@@ -27,13 +28,15 @@ def create_kernel(assignments,
                   gpu_indexing_params=MappingProxyType({}),
                   use_textures_for_interpolation=True,
                   cpu_prepend_optimizations=[],
-                  use_auto_for_assignments=False):
+                  use_auto_for_assignments=False,
+                  opencl_queue=None,
+                  opencl_ctx=None):
     """
     Creates abstract syntax tree (AST) of kernel, using a list of update equations.
 
     Args:
         assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
-        target: 'cpu', 'llvm' or 'gpu'
+        target: 'cpu', 'llvm', 'gpu' or 'opencl'
         data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
                   to type
         iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
@@ -108,13 +111,20 @@ def create_kernel(assignments,
         from pystencils.llvm import create_kernel
         ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                             iteration_slice=iteration_slice, ghost_layers=ghost_layers)
-    elif target == 'gpu':
+    elif target in ('gpu', 'opencl'):
         from pystencils.gpucuda import create_cuda_kernel
         ast = create_cuda_kernel(assignments, type_info=data_type,
                                  indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
                                  iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                                  skip_independence_check=skip_independence_check,
                                  use_textures_for_interpolation=use_textures_for_interpolation)
+        if target == 'opencl':
+            from pystencils.opencl.opencljit import make_python_function
+            # mark the AST as an OpenCL kernel and attach a matching compile function
+            ast._target = 'opencl'
+            ast._backend = 'opencl'
+            ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx)
+        return ast
     else:
         raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
 
@@ -133,7 +143,9 @@ def create_indexed_kernel(assignments,
                           cpu_openmp=True,
                           gpu_indexing='block',
                           gpu_indexing_params=MappingProxyType({}),
-                          use_textures_for_interpolation=True):
+                          use_textures_for_interpolation=True,
+                          opencl_queue=None,
+                          opencl_ctx=None):
     """
     Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
     coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
@@ -183,7 +195,7 @@ def create_indexed_kernel(assignments,
         return ast
     elif target == 'llvm':
         raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
-    elif target == 'gpu':
+    elif target in ('gpu', 'opencl'):
         from pystencils.gpucuda import created_indexed_cuda_kernel
         idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
         ast = created_indexed_cuda_kernel(assignments,
@@ -192,6 +204,12 @@ def create_indexed_kernel(assignments,
                                           coordinate_names=coordinate_names,
                                           indexing_creator=idx_creator,
                                           use_textures_for_interpolation=use_textures_for_interpolation)
+        if target == 'opencl':
+            from pystencils.opencl.opencljit import make_python_function
+            # mark the AST as an OpenCL kernel and attach a matching compile function
+            ast._target = 'opencl'
+            ast._backend = 'opencl'
+            ast.compile = functools.partial(make_python_function, ast, opencl_queue, opencl_ctx)
         return ast
     else:
         raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
@@ -288,7 +306,7 @@ def create_staggered_kernel(assignments, target='cpu', gpu_exclusive_conditions=
         outer_assignment = None
         conditions = {direction: condition(direction) for direction in stencil}
         for num_conditions in range(len(stencil)):
-            for combination in combinations(conditions.values(), num_conditions):
+            for combination in itertools.combinations(conditions.values(), num_conditions):
                 for assignment in assignments:
                     direction = stencil[assignment.lhs.index[0]]
                     if conditions[direction] in combination:
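
Review note: the OpenCL branch returns an AST whose `compile()` is pre-bound to the queue/context, so calling code stays target-agnostic. Minimal sketch (assumes pyopencl):

```python
import sympy as sp
import pystencils
import pystencils.opencl.autoinit  # NOQA -- fallback queue/context for compile()

x, y = pystencils.fields('x, y: [20, 30]')
ast = pystencils.create_kernel([pystencils.Assignment(y.center, sp.cos(x.center))],
                               target='opencl')
kernel = ast.compile()  # the functools.partial set above; no queue needed here
```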
diff --git a/pystencils/opencl/__init__.py b/pystencils/opencl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e405a2f4e1b22a285cecb833da7764f88762d1ae
--- /dev/null
+++ b/pystencils/opencl/__init__.py
@@ -0,0 +1,8 @@
+"""
+
+"""
+
+from pystencils.opencl.opencljit import (
+    clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
+
+__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
diff --git a/pystencils/opencl/autoinit.py b/pystencils/opencl/autoinit.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d20169b640a83895edd9366a279fc6f2e13f6b4
--- /dev/null
+++ b/pystencils/opencl/autoinit.py
@@ -0,0 +1,16 @@
+"""
+Automatically initializes OpenCL context using any device.
+
+Use `pystencils.opencl.{init_globally_with_context,init_globally}` if you want to use a specific device.
+"""
+
+from pystencils.opencl.opencljit import (
+    clear_global_ctx, init_globally, init_globally_with_context, make_python_function)
+
+__all__ = ['init_globally', 'init_globally_with_context', 'clear_global_ctx', 'make_python_function']
+
+try:
+    init_globally()
+except Exception as e:
+    import warnings
+    warnings.warn(str(e))
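
Review note: the import is the whole API; failure degrades to a warning instead of breaking module import:

```python
import pystencils.opencl.autoinit  # NOQA -- runs init_globally(), warns on failure
from pystencils.opencl.opencljit import get_global_cl_queue

assert get_global_cl_queue() is not None  # holds when an OpenCL device was found
```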
diff --git a/pystencils/opencl/opencljit.py b/pystencils/opencl/opencljit.py
index 5526c954a1cc25438710576a0119abd65ab9854d..6b5893865545294151269e7aabafdd51b1b7c264 100644
--- a/pystencils/opencl/opencljit.py
+++ b/pystencils/opencl/opencljit.py
@@ -3,10 +3,45 @@ import numpy as np
 from pystencils.backends.cbackend import generate_c, get_headers
 from pystencils.gpucuda.cudajit import _build_numpy_argument_list, _check_arguments
 from pystencils.include import get_pystencils_include_path
+from pystencils.kernel_wrapper import KernelWrapper
 
 USE_FAST_MATH = True
 
 
+_global_cl_ctx = None
+_global_cl_queue = None
+
+
+def get_global_cl_queue():
+    return _global_cl_queue
+
+
+def get_global_cl_ctx():
+    return _global_cl_ctx
+
+
+def init_globally(device_index=0):
+    import pyopencl as cl
+    global _global_cl_ctx
+    global _global_cl_queue
+    _global_cl_ctx = cl.create_some_context(device_index)
+    _global_cl_queue = cl.CommandQueue(_global_cl_ctx)
+
+
+def init_globally_with_context(opencl_ctx, opencl_queue):
+    global _global_cl_ctx
+    global _global_cl_queue
+    _global_cl_ctx = opencl_ctx
+    _global_cl_queue = opencl_queue
+
+
+def clear_global_ctx():
+    global _global_cl_ctx
+    global _global_cl_queue
+    _global_cl_ctx = None
+    _global_cl_queue = None
+
+
 def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argument_dict=None, custom_backend=None):
     """
     Creates a **OpenCL** kernel function from an abstract syntax tree which
@@ -24,8 +59,16 @@ def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argumen
         compiled kernel as Python function
     """
     import pyopencl as cl
-    assert opencl_ctx, "No valid OpenCL context"
-    assert opencl_queue, "No valid OpenCL queue"
+
+    if not opencl_ctx:
+        opencl_ctx = _global_cl_ctx
+    if not opencl_queue:
+        opencl_queue = _global_cl_queue
+
+    assert opencl_ctx, "No valid OpenCL context!\n" \
+        "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
+    assert opencl_queue, "No valid OpenCL queue!\n" \
+        "Use `import pystencils.opencl.autoinit` if you want it to be automatically created"
 
     if argument_dict is None:
         argument_dict = {}
@@ -74,6 +117,10 @@ def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argumen
         except KeyError:
             full_arguments = argument_dict.copy()
             full_arguments.update(kwargs)
+            assert not any(isinstance(a, np.ndarray)
+                           for a in full_arguments.values()), 'Calling an OpenCL kernel with a numpy array!'
+            assert not any('pycuda' in str(type(a))
+                           for a in full_arguments.values()), 'Calling an OpenCL kernel with a PyCUDA array!'
             shape = _check_arguments(parameters, full_arguments)
 
             indexing = kernel_function_node.indexing
@@ -90,4 +137,5 @@ def make_python_function(kernel_function_node, opencl_queue, opencl_ctx, argumen
 
     wrapper.ast = kernel_function_node
     wrapper.parameters = kernel_function_node.get_parameters()
+    wrapper = KernelWrapper(wrapper, parameters, kernel_function_node)
     return wrapper
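
Review note: the new globals also let callers hand in an existing context once instead of threading it through every call — `make_python_function(ast, None, None)` then falls back to it:

```python
import pyopencl as cl
from pystencils.opencl.opencljit import (
    clear_global_ctx, get_global_cl_queue, init_globally_with_context)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
init_globally_with_context(ctx, queue)
assert get_global_cl_queue() is queue

clear_global_ctx()  # reset, e.g. between tests (see test_opencl.py below)
```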
diff --git a/pystencils_tests/test_cudagpu.py b/pystencils_tests/test_cudagpu.py
index 77790229f48b736258d343a4670c06e66a417838..f5efd517f0457ffc71d5c8015dd5d4f92aef47ea 100644
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pycuda.autoinit  # NOQA
 import pycuda.gpuarray as gpuarray
 import sympy as sp
 from scipy.ndimage import convolve
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index cdee059c218a67ad9adc824f41fa09ae3df2f19b..1af1a68e3bfdb3681e05443c2412d8bd24903b0d 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -6,6 +6,13 @@ import numpy as np
 import pystencils as ps
 from pystencils import create_data_handling, create_kernel
 
+try:
+    import pytest
+except ImportError:
+    # fallback so the module can at least be imported without pytest installed
+    import unittest.mock
+    pytest = unittest.mock.MagicMock()
+
 
 def basic_iteration(dh):
     dh.add_array('basic_iter_test_gl_default')
@@ -100,15 +107,9 @@ def synchronization(dh, test_gpu=False):
         np.testing.assert_equal(42, b[field_name])
 
 
-def kernel_execution_jacobi(dh, test_gpu=False):
-    if test_gpu:
-        try:
-            from pycuda import driver
-            import pycuda.autoinit
-        except ImportError:
-            print("Skipping kernel_execution_jacobi GPU version, because pycuda not available")
-            return
+def kernel_execution_jacobi(dh, target):
 
+    test_gpu = target in ('gpu', 'opencl')
     dh.add_array('f', gpu=test_gpu)
     dh.add_array('tmp', gpu=test_gpu)
     stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)]
@@ -119,7 +120,7 @@ def kernel_execution_jacobi(dh, test_gpu=False):
     def jacobi():
         dh.fields.tmp.center @= sum(dh.fields.f.neighbors(stencil)) / len(stencil)
 
-    kernel = create_kernel(jacobi, target='gpu' if test_gpu else 'cpu').compile()
+    kernel = create_kernel(jacobi, target).compile()
     for b in dh.iterate(ghost_layers=1):
         b['f'].fill(42)
     dh.run_kernel(kernel)
@@ -196,17 +197,32 @@ def test_access_and_gather():
 def test_kernel():
     for domain_shape in [(4, 5), (3, 4, 5)]:
         dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-        kernel_execution_jacobi(dh, test_gpu=True)
+        kernel_execution_jacobi(dh, 'cpu')
         reduction(dh)
 
         try:
             import pycuda
             dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-            kernel_execution_jacobi(dh, test_gpu=False)
+            kernel_execution_jacobi(dh, 'gpu')
         except ImportError:
             pass
 
 
+@pytest.mark.parametrize('target', ('cpu', 'gpu', 'opencl'))
+def test_kernel_param(target):
+    for domain_shape in [(4, 5), (3, 4, 5)]:
+        if target == 'gpu':
+            pytest.importorskip('pycuda')
+        if target == 'opencl':
+            pytest.importorskip('pyopencl')
+            from pystencils.opencl.opencljit import init_globally
+            init_globally()
+
+        dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
+        kernel_execution_jacobi(dh, target)
+        reduction(dh)
+
+
 def test_vtk_output():
     for domain_shape in [(4, 5), (3, 4, 5)]:
         dh = create_data_handling(domain_size=domain_shape, periodicity=True)
diff --git a/pystencils_tests/test_datahandling_parallel.py b/pystencils_tests/test_datahandling_parallel.py
index 52ccb6bce34b2ca115b19a8d00fa31da622b8ff4..0bfbfbd02fc86952e7fba84ce6aa080d2080ec0a 100644
--- a/pystencils_tests/test_datahandling_parallel.py
+++ b/pystencils_tests/test_datahandling_parallel.py
@@ -50,13 +50,13 @@ def test_kernel():
         # 3D
         blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
         dh = ParallelDataHandling(blocks)
-        kernel_execution_jacobi(dh, test_gpu=gpu)
+        kernel_execution_jacobi(dh, 'gpu' if gpu else 'cpu')
         reduction(dh)
 
         # 2D
         blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 1), cellsPerBlock=(3, 2, 1), oneBlockPerProcess=False)
         dh = ParallelDataHandling(blocks, dim=2)
-        kernel_execution_jacobi(dh, test_gpu=gpu)
+        kernel_execution_jacobi(dh, 'gpu' if gpu else 'cpu')
         reduction(dh)
 
 
diff --git a/pystencils_tests/test_opencl.py b/pystencils_tests/test_opencl.py
index adffeb4750a62f5bb08ded29fffe00ac5900f706..ea5b1ecbc83f21672731b2ecbcb54c78faa1603b 100644
--- a/pystencils_tests/test_opencl.py
+++ b/pystencils_tests/test_opencl.py
@@ -1,11 +1,11 @@
 import numpy as np
 import pytest
+import sympy as sp
 
 import pystencils
-import sympy as sp
 from pystencils.backends.cuda_backend import CudaBackend
 from pystencils.backends.opencl_backend import OpenClBackend
-from pystencils.opencl.opencljit import make_python_function
+from pystencils.opencl.opencljit import get_global_cl_queue, init_globally, make_python_function
 
 try:
     import pyopencl as cl
@@ -233,3 +233,41 @@ def test_without_cuda():
     opencl_kernel = make_python_function(ast, queue, ctx)
     assert opencl_kernel is not None
     opencl_kernel(x=x, y=y, z=z)
+
+
+@pytest.mark.skipif(not HAS_OPENCL, reason="Test requires pyopencl")
+def test_kernel_creation():
+    global pystencils  # the function-local import below would otherwise make 'pystencils' a local name
+    z, y, x = pystencils.fields("z, y, x: [20,30]")
+
+    assignments = pystencils.AssignmentCollection({
+        z[0, 0]: x[0, 0] * sp.log(x[0, 0] * y[0, 0])
+    })
+
+    print(assignments)
+
+    pystencils.opencl.clear_global_ctx()
+
+    import pystencils.opencl.autoinit
+    ast = pystencils.create_kernel(assignments, target='opencl')
+
+    print(ast.backend)
+
+    code = str(pystencils.show_code(ast))
+    print(code)
+    assert 'get_local_size' in code
+
+    opencl_kernel = ast.compile()
+
+    x_cpu = np.random.rand(20, 30)
+    y_cpu = np.random.rand(20, 30)
+    z_cpu = np.random.rand(20, 30)
+
+    import pyopencl.array as array
+    assert get_global_cl_queue()
+    x = array.to_device(get_global_cl_queue(), x_cpu)
+    y = array.to_device(get_global_cl_queue(), y_cpu)
+    z = array.to_device(get_global_cl_queue(), z_cpu)
+
+    assert opencl_kernel is not None
+    opencl_kernel(x=x, y=y, z=z)
diff --git a/pystencils_tests/test_staggered_kernel.py b/pystencils_tests/test_staggered_kernel.py
index 4db538bf8f83d6779cb7dfd612fa9928acc23965..a64ff2f00d0ee7abc34e56f7110adb213dcb9992 100644
--- a/pystencils_tests/test_staggered_kernel.py
+++ b/pystencils_tests/test_staggered_kernel.py
@@ -1,16 +1,17 @@
-import pystencils as ps
 import numpy as np
 import sympy as sp
 
+import pystencils as ps
+
 
 class TestStaggeredDiffusion:
-    def _run(self, num_neighbors):
+    def _run(self, num_neighbors, target='cpu'):
         L = (40, 40)
         D = 0.066
         dt = 1
         T = 100
 
-        dh = ps.create_data_handling(L, periodicity=True, default_target='cpu')
+        dh = ps.create_data_handling(L, periodicity=True, default_target=target)
 
         c = dh.add_array('c', values_per_cell=1)
         j = dh.add_array('j', values_per_cell=num_neighbors, field_type=ps.FieldType.STAGGERED_FLUX)
@@ -23,7 +24,7 @@ class TestStaggeredDiffusion:
         jj = j.staggered_access
         divergence = -1 * D / (1 + sp.sqrt(2) if j.index_shape[0] == 4 else 1) * \
             sum([jj(d) / sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm() for d in j.staggered_stencil
-                + [ps.stencil.inverse_direction_string(d) for d in j.staggered_stencil]])
+                 + [ps.stencil.inverse_direction_string(d) for d in j.staggered_stencil]])
 
         update = [ps.Assignment(c.center, c.center + dt * divergence)]
         flux = [ps.Assignment(j.staggered_access("W"), x_staggered),
@@ -67,6 +68,12 @@ class TestStaggeredDiffusion:
     def test_diffusion_4(self):
         self._run(4)
 
+    def test_diffusion_opencl(self):
+        import pytest
+        pytest.importorskip('pyopencl')
+        import pystencils.opencl.autoinit  # NOQA -- creates a global OpenCL context
+        self._run(4, 'opencl')
+
 
 def test_staggered_subexpressions():
     dh = ps.create_data_handling((10, 10), periodicity=True, default_target='cpu')