5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils_tests/test_boundary_indexlist_creation.py
+++ b/pystencils_tests/test_boundary_indexlist_creation.py
--- a/pystencils_tests/test_buffer.py
+++ b/pystencils_tests/test_buffer.py
@@ -2,7 +2,8 @@

 import numpy as np

-from pystencils import Assignment, Field, FieldType, create_kernel, make_slice
+import pystencils as ps
+from pystencils import Assignment, Field, FieldType, create_kernel
 from pystencils.field import create_numpy_array_with_layout, layout_string_to_tuple
 from pystencils.slicing import (
    add_ghost_layers, get_ghost_region_slice, get_slice_before_ghost_layer)
@@ -19,9 +20,9 @@ def _generate_fields(dt=np.uint64, num_directions=1, layout='numpy'):
    fields = []
    for size in field_sizes:
        field_layout = layout_string_to_tuple(layout, len(size))
-        src_arr = create_numpy_array_with_layout(size, field_layout)
+        src_arr = create_numpy_array_with_layout(size, field_layout, dtype=dt)

-        array_data = np.reshape(np.arange(1, int(np.prod(size)+1)), size)
+        array_data = np.reshape(np.arange(1, int(np.prod(size) + 1)), size)
        # Use flat iterator to input data into the array
        src_arr.flat = add_ghost_layers(array_data, index_dimensions=1 if num_directions > 1 else 0).astype(dt).flat
        dst_arr = np.zeros(src_arr.shape, dtype=dt)
@@ -40,13 +41,18 @@ def test_full_scalar_field():
                                      field_type=FieldType.BUFFER, dtype=src_arr.dtype)

        pack_eqs = [Assignment(buffer.center(), src_field.center())]
-        pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        pack_code = create_kernel(pack_eqs, config=config)
+        code = ps.get_code_str(pack_code)
+        ps.show_code(pack_code)

        pack_kernel = pack_code.compile()
        pack_kernel(buffer=buffer_arr, src_field=src_arr)

        unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
-        unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+
+        config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+        unpack_code = create_kernel(unpack_eqs, config=config)

        unpack_kernel = unpack_code.compile()
        unpack_kernel(dst_field=dst_arr, buffer=buffer_arr)
@@ -70,14 +76,18 @@ def test_field_slice():
                                          field_type=FieldType.BUFFER, dtype=src_arr.dtype)

            pack_eqs = [Assignment(buffer.center(), src_field.center())]
-            pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+
+            config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+            pack_code = create_kernel(pack_eqs, config=config)

            pack_kernel = pack_code.compile()
            pack_kernel(buffer=bufferArr, src_field=src_arr[pack_slice])

            # Unpack into ghost layer of dst_field in N direction
            unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
-            unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+
+            config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+            unpack_code = create_kernel(unpack_eqs, config=config)

            unpack_kernel = unpack_code.compile()
            unpack_kernel(buffer=bufferArr, dst_field=dst_arr[unpack_slice])
@@ -102,7 +112,8 @@ def test_all_cell_values():
            eq = Assignment(buffer(idx), src_field(idx))
            pack_eqs.append(eq)

-        pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        pack_code = create_kernel(pack_eqs, config=config)
        pack_kernel = pack_code.compile()
        pack_kernel(buffer=bufferArr, src_field=src_arr)

@@ -112,7 +123,8 @@ def test_all_cell_values():
            eq = Assignment(dst_field(idx), buffer(idx))
            unpack_eqs.append(eq)

-        unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+        config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+        unpack_code = create_kernel(unpack_eqs, config=config)
        unpack_kernel = unpack_code.compile()
        unpack_kernel(buffer=bufferArr, dst_field=dst_arr)

@@ -138,7 +150,8 @@ def test_subset_cell_values():
            eq = Assignment(buffer(buffer_idx), src_field(cell_idx))
            pack_eqs.append(eq)

-        pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        pack_code = create_kernel(pack_eqs, config=config)
        pack_kernel = pack_code.compile()
        pack_kernel(buffer=bufferArr, src_field=src_arr)

@@ -148,7 +161,8 @@ def test_subset_cell_values():
            eq = Assignment(dst_field(cell_idx), buffer(buffer_idx))
            unpack_eqs.append(eq)

-        unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+        config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+        unpack_code = create_kernel(unpack_eqs, config=config)
        unpack_kernel = unpack_code.compile()
        unpack_kernel(buffer=bufferArr, dst_field=dst_arr)

@@ -173,7 +187,8 @@ def test_field_layouts():
                eq = Assignment(buffer(idx), src_field(idx))
                pack_eqs.append(eq)

-            pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+            config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+            pack_code = create_kernel(pack_eqs, config=config)
            pack_kernel = pack_code.compile()
            pack_kernel(buffer=bufferArr, src_field=src_arr)

@@ -183,19 +198,24 @@ def test_field_layouts():
                eq = Assignment(dst_field(idx), buffer(idx))
                unpack_eqs.append(eq)

-            unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+            config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+            unpack_code = create_kernel(unpack_eqs, config=config)
            unpack_kernel = unpack_code.compile()
            unpack_kernel(buffer=bufferArr, dst_field=dst_arr)


 def test_iteration_slices():
    num_cell_values = 19
-    fields = _generate_fields(num_directions=num_cell_values)
+    dt = np.uint64
+    fields = _generate_fields(dt=dt, num_directions=num_cell_values)
    for (src_arr, dst_arr, bufferArr) in fields:
-        src_field = Field.create_from_numpy_array("src_field", src_arr, index_dimensions=1)
-        dst_field = Field.create_from_numpy_array("dst_field", dst_arr, index_dimensions=1)
+        spatial_dimensions = len(src_arr.shape) - 1
+        # src_field = Field.create_from_numpy_array("src_field", src_arr, index_dimensions=1)
+        # dst_field = Field.create_from_numpy_array("dst_field", dst_arr, index_dimensions=1)
+        src_field = Field.create_generic("src_field", spatial_dimensions, index_shape=(num_cell_values,), dtype=dt)
+        dst_field = Field.create_generic("dst_field", spatial_dimensions, index_shape=(num_cell_values,), dtype=dt)
        buffer = Field.create_generic("buffer", spatial_dimensions=1, index_dimensions=1,
-                                        field_type=FieldType.BUFFER, dtype=src_arr.dtype)
+                                      field_type=FieldType.BUFFER, dtype=src_arr.dtype)

        pack_eqs = []
        # Since we are packing all cell values for all cells, then
@@ -207,13 +227,16 @@ def test_iteration_slices():
        dim = src_field.spatial_dimensions

        #   Pack only the leftmost slice, only every second cell
-        pack_slice = (slice(None, None, 2),) * (dim-1) + (0, )
+        pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,)

        #   Fill the entire array with data
        src_arr[(slice(None, None, 1),) * dim] = np.arange(num_cell_values)
-        dst_arr.fill(0.0)
+        dst_arr.fill(0)
+
+        config = ps.CreateKernelConfig(iteration_slice=pack_slice,
+                                       data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})

-        pack_code = create_kernel(pack_eqs, iteration_slice=pack_slice, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
+        pack_code = create_kernel(pack_eqs, config=config)
        pack_kernel = pack_code.compile()
        pack_kernel(buffer=bufferArr, src_field=src_arr)

@@ -223,12 +246,14 @@ def test_iteration_slices():
            eq = Assignment(dst_field(idx), buffer(idx))
            unpack_eqs.append(eq)

-        unpack_code = create_kernel(unpack_eqs, iteration_slice=pack_slice, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+        config = ps.CreateKernelConfig(iteration_slice=pack_slice,
+                                       data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
+
+        unpack_code = create_kernel(unpack_eqs, config=config)
        unpack_kernel = unpack_code.compile()
        unpack_kernel(buffer=bufferArr, dst_field=dst_arr)

        #   Check if only every second entry of the leftmost slice has been copied
        np.testing.assert_equal(dst_arr[pack_slice], src_arr[pack_slice])
-        np.testing.assert_equal(dst_arr[(slice(1, None, 2),)  * (dim-1) + (0,)], 0.0)
-        np.testing.assert_equal(dst_arr[(slice(None, None, 1),)  * (dim-1) + (slice(1,None),)], 0.0)
-
+        np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim - 1) + (0,)], 0)
+        np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)], 0)
--- a/pystencils_tests/test_buffer_gpu.py
+++ b/pystencils_tests/test_buffer_gpu.py
 """Tests for the (un)packing (from)to buffers on a CUDA GPU."""

+from dataclasses import replace
 import numpy as np
 import pytest

-from pystencils import Assignment, Field, FieldType
+import pystencils
+from pystencils import Assignment, Field, FieldType, Target, CreateKernelConfig, create_kernel, fields
+from pystencils.bit_masks import flag_cond
 from pystencils.field import create_numpy_array_with_layout, layout_string_to_tuple
-from pystencils.gpucuda import create_cuda_kernel, make_python_function
 from pystencils.slicing import (
    add_ghost_layers, get_ghost_region_slice, get_slice_before_ghost_layer)
 from pystencils.stencil import direction_string_to_offset

 try:
    # noinspection PyUnresolvedReferences
-    import pycuda.autoinit
-    import pycuda.gpuarray as gpuarray
+    import cupy as cp
 except ImportError:
    pass

@@ -22,7 +23,7 @@ FIELD_SIZES = [(4, 3), (9, 3, 7)]


 def _generate_fields(dt=np.uint8, stencil_directions=1, layout='numpy'):
-    pytest.importorskip('pycuda')
+    pytest.importorskip('cupy')
    field_sizes = FIELD_SIZES
    if stencil_directions > 1:
        field_sizes = [s + (stencil_directions,) for s in field_sizes]
@@ -37,10 +38,10 @@ def _generate_fields(dt=np.uint8, stencil_directions=1, layout='numpy'):
        src_arr.flat = add_ghost_layers(array_data,
                                        index_dimensions=1 if stencil_directions > 1 else 0).astype(dt).flat

-        gpu_src_arr = gpuarray.to_gpu(src_arr)
-        gpu_dst_arr = gpuarray.empty_like(gpu_src_arr)
+        gpu_src_arr = cp.asarray(src_arr)
+        gpu_dst_arr = cp.zeros_like(gpu_src_arr)
        size = int(np.prod(src_arr.shape))
-        gpu_buffer_arr = gpuarray.zeros(size, dtype=dt)
+        gpu_buffer_arr = cp.zeros(size, dtype=dt)

        fields.append((src_arr, gpu_src_arr, gpu_dst_arr, gpu_buffer_arr))
    return fields
@@ -57,16 +58,20 @@ def test_full_scalar_field():

        pack_eqs = [Assignment(buffer.center(), src_field.center())]
        pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)

-        pack_kernel = make_python_function(pack_code)
+        config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
+        pack_ast = create_kernel(pack_eqs, config=config)
+
+        pack_kernel = pack_ast.compile()
        pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)

        unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
        unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)

-        unpack_kernel = make_python_function(unpack_code)
+        config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
+        unpack_ast = create_kernel(unpack_eqs, config=config)
+
+        unpack_kernel = unpack_ast.compile()
        unpack_kernel(dst_field=gpu_dst_arr, buffer=gpu_buffer_arr)

        dst_arr = gpu_dst_arr.get()
@@ -91,17 +96,21 @@ def test_field_slice():

            pack_eqs = [Assignment(buffer.center(), src_field.center())]
            pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)

-            pack_kernel = make_python_function(pack_code)
+            config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
+            pack_ast = create_kernel(pack_eqs, config=config)
+
+            pack_kernel = pack_ast.compile()
            pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr[pack_slice])

            # Unpack into ghost layer of dst_field in N direction
            unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
            unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)

-            unpack_kernel = make_python_function(unpack_code)
+            config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
+            unpack_ast = create_kernel(unpack_eqs, config=config)
+
+            unpack_kernel = unpack_ast.compile()
            unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr[unpack_slice])

            dst_arr = gpu_dst_arr.get()
@@ -127,8 +136,11 @@ def test_all_cell_values():
            pack_eqs.append(eq)

        pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
-        pack_kernel = make_python_function(pack_code)
+
+        config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
+        pack_code = create_kernel(pack_eqs, config=config)
+        pack_kernel = pack_code.compile()
+
        pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)

        unpack_eqs = []
@@ -138,8 +150,10 @@ def test_all_cell_values():
            unpack_eqs.append(eq)

        unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
-        unpack_kernel = make_python_function(unpack_code)
+
+        config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
+        unpack_ast = create_kernel(unpack_eqs, config=config)
+        unpack_kernel = unpack_ast.compile()
        unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)

        dst_arr = gpu_dst_arr.get()
@@ -148,7 +162,7 @@ def test_all_cell_values():


 def test_subset_cell_values():
-    """Tests (un)packing a subset of cell values of the a field (from)to a buffer."""
+    """Tests (un)packing a subset of cell values of a field (from)to a buffer."""
    num_cell_values = 7
    # Cell indices of the field to be (un)packed (from)to the buffer
    cell_indices = [1, 3, 5, 6]
@@ -167,8 +181,9 @@ def test_subset_cell_values():
            pack_eqs.append(eq)

        pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
-        pack_kernel = make_python_function(pack_code)
+        config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
+        pack_ast = create_kernel(pack_eqs, config=config)
+        pack_kernel = pack_ast.compile()
        pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)

        unpack_eqs = []
@@ -178,8 +193,10 @@ def test_subset_cell_values():
            unpack_eqs.append(eq)

        unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
-        unpack_kernel = make_python_function(unpack_code)
+        config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
+        unpack_ast = create_kernel(unpack_eqs, config=config)
+        unpack_kernel = unpack_ast.compile()
+
        unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)

        dst_arr = gpu_dst_arr.get()
@@ -206,8 +223,10 @@ def test_field_layouts():
                pack_eqs.append(eq)

            pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
-            pack_kernel = make_python_function(pack_code)
+            config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
+            pack_ast = create_kernel(pack_eqs, config=config)
+            pack_kernel = pack_ast.compile()
+
            pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)

            unpack_eqs = []
@@ -217,6 +236,99 @@ def test_field_layouts():
                unpack_eqs.append(eq)

            unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
-            unpack_kernel = make_python_function(unpack_code)
+            config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
+            unpack_ast = create_kernel(unpack_eqs, config=config)
+            unpack_kernel = unpack_ast.compile()
+
            unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
+
+
+def test_buffer_indexing():
+    """Check that a strided GPU buffer kernel takes its shape parameters from a single field."""
+    src_field, dst_field = fields('pdfs_src(19), pdfs_dst(19) :double[3D]')
+    mask_field = fields('mask : uint32 [3D]')
+    buffer = Field.create_generic('buffer', spatial_dimensions=1, index_shape=(19,),
+                                  field_type=FieldType.BUFFER, dtype="float64")
+
+    # The packed expression reads the mask field as well as the source field
+    up = Assignment(buffer(0), flag_cond(1, mask_field.center, src_field[0, 1, 0](1)))
+
+    # Visit every second cell in each of the three spatial directions, without ghost layers
+    every_second_cell = (slice(None, None, 2),) * 3
+    config = replace(CreateKernelConfig(target=Target.GPU),
+                     iteration_slice=every_second_cell, ghost_layers=0)
+
+    ast = create_kernel(up, config=config)
+    spatial_shape_symbols = [param.symbol for param in ast.get_parameters() if param.is_field_shape]
+
+    # The loop counters as well as the resolved field access should depend on one common spatial shape
+    first_symbol = spatial_shape_symbols[0]
+    mask_field_size = mask_field.spatial_shape
+    src_field_size = src_field.spatial_shape
+
+    if first_symbol in mask_field_size:
+        assert all(symbol in mask_field_size for symbol in spatial_shape_symbols)
+
+    if first_symbol in src_field_size:
+        assert all(symbol in src_field_size for symbol in spatial_shape_symbols)
+
+    assert len(spatial_shape_symbols) <= 3
+
+
+@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
+def test_iteration_slices(gpu_indexing):
+    """Pack and unpack a strided slice of a field through a buffer on the GPU.
+
+    The same iteration slice is used for packing and unpacking; afterwards the selected
+    cells must match the source and the inspected cells outside the slice must still be zero.
+    """
+    num_cell_values = 19
+    dt = np.uint64
+    # Not called ``fields`` so that the ``fields`` imported from pystencils is not shadowed
+    field_arrays = _generate_fields(dt=dt, stencil_directions=num_cell_values)
+    for (src_arr, gpu_src_arr, gpu_dst_arr, gpu_buffer_arr) in field_arrays:
+        src_field = Field.create_from_numpy_array("src_field", gpu_src_arr, index_dimensions=1)
+        # Describe the destination by its own array instead of reusing the source array
+        dst_field = Field.create_from_numpy_array("dst_field", gpu_dst_arr, index_dimensions=1)
+        buffer = Field.create_generic("buffer", spatial_dimensions=1, index_dimensions=1,
+                                      field_type=FieldType.BUFFER, dtype=src_arr.dtype)
+
+        # Since we are packing all cell values for all cells, then
+        # the buffer index is equivalent to the field index
+        pack_eqs = [Assignment(buffer(idx), src_field(idx)) for idx in range(num_cell_values)]
+        unpack_eqs = [Assignment(dst_field(idx), buffer(idx)) for idx in range(num_cell_values)]
+
+        dim = src_field.spatial_dimensions
+
+        #   Pack only the leftmost slice, only every second cell
+        pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,)
+
+        #   Fill the entire array with data
+        src_arr[(slice(None, None, 1),) * dim] = np.arange(num_cell_values)
+        gpu_src_arr.set(src_arr)
+        gpu_dst_arr.fill(0)
+
+        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
+                                    data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
+                                    gpu_indexing=gpu_indexing)
+
+        pack_code = create_kernel(pack_eqs, config=config)
+        pack_kernel = pack_code.compile()
+        pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
+
+        #   Unpack with the same iteration slice that was used for packing
+        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
+                                    data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
+                                    gpu_indexing=gpu_indexing)
+
+        unpack_code = create_kernel(unpack_eqs, config=config)
+        unpack_kernel = unpack_code.compile()
+        unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
+
+        dst_arr = gpu_dst_arr.get()
+        src_arr = gpu_src_arr.get()
+
+        #   Check if only every second entry of the leftmost slice has been copied
+        np.testing.assert_equal(dst_arr[pack_slice], src_arr[pack_slice])
+        np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim - 1) + (0,)], 0)
+        np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)], 0)
--- a/pystencils_tests/test_conditional_field_access.py
+++ b/pystencils_tests/test_conditional_field_access.py
@@ -35,11 +35,11 @@ def add_fixed_constant_boundary_handling(assignments, with_cse):
            for a in assignment.rhs.atoms(Field.Access) if not a.is_absolute_access
        })) for assignment in assignments.all_assignments]

-    subs = [{a: ConditionalFieldAccess(a, is_out_of_bound(
-        sp.Matrix(a.offsets) + x_vector(ndim), common_shape))
-        for a in assignment.rhs.atoms(Field.Access) if not a.is_absolute_access
-    } for assignment in assignments.all_assignments]
-    print(subs)
+    # subs = [{a: ConditionalFieldAccess(a, is_out_of_bound(
+    #     sp.Matrix(a.offsets) + x_vector(ndim), common_shape))
+    #     for a in assignment.rhs.atoms(Field.Access) if not a.is_absolute_access
+    # } for assignment in assignments.all_assignments]
+    # print(subs)

    if with_cse:
        safe_assignments = sympy_cse(ps.AssignmentCollection(safe_assignments))
@@ -48,22 +48,20 @@ def add_fixed_constant_boundary_handling(assignments, with_cse):
        return ps.AssignmentCollection(safe_assignments)


+@pytest.mark.parametrize('dtype', ('float64', 'float32'))
 @pytest.mark.parametrize('with_cse', (False, 'with_cse'))
-def test_boundary_check(with_cse):
+def test_boundary_check(dtype, with_cse):
+    f, g = ps.fields(f"f, g : {dtype}[2D]")
+    stencil = ps.Assignment(g[0, 0], (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)

-    f, g = ps.fields("f, g : [2D]")
-    stencil = ps.Assignment(g[0, 0],
-                            (f[1, 0] + f[-1, 0] + f[0, 1] + f[0, -1]) / 4)
-
-    f_arr = np.random.rand(1000, 1000)
+    f_arr = np.random.rand(10, 10).astype(dtype=dtype)
    g_arr = np.zeros_like(f_arr)
-    # kernel(f=f_arr, g=g_arr)

    assignments = add_fixed_constant_boundary_handling(ps.AssignmentCollection([stencil]), with_cse)

-    print(assignments)
-    kernel_checked = ps.create_kernel(assignments, ghost_layers=0).compile()
-    ps.show_code(kernel_checked)
+    config = ps.CreateKernelConfig(data_type=dtype, default_number_float=dtype, ghost_layers=0)
+    kernel_checked = ps.create_kernel(assignments, config=config).compile()
+    # ps.show_code(kernel_checked)

    # No SEGFAULT, please!!
    kernel_checked(f=f_arr, g=g_arr)
--- a/pystencils_tests/test_conditional_vec.py
+++ b/pystencils_tests/test_conditional_vec.py
@@ -3,37 +3,39 @@ import sympy as sp
 import pytest

 import pystencils as ps
-from pystencils.astnodes import Block, Conditional
+from pystencils.alignedarray import aligned_zeros
+from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
 from pystencils.enums import Target
 from pystencils.cpu.vectorization import vec_all, vec_any
+from pystencils.node_collection import NodeCollection

 supported_instruction_sets = get_supported_instruction_sets() if get_supported_instruction_sets() else []


 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
-@pytest.mark.parametrize('dtype', ('float', 'double'))
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_any(instruction_set, dtype):
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        width = 4  # we don't know the actual value
    else:
        width = get_vector_instruction_set(dtype, instruction_set)['width']
-    data_arr = np.zeros((4 * width, 4 * width), dtype=np.float64 if dtype == 'double' else np.float32)
+    data_arr = np.zeros((4 * width, 4 * width), dtype=dtype)

    data_arr[3:9, 1:3 * width - 1] = 1.0
    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)

    c = [
-        ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)),
-        Conditional(vec_any(data.center() > 0.0), Block([
-            ps.Assignment(data.center(), 2.0)
-        ]))
+        SympyAssignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)),
+        Conditional(vec_any(data.center() > 0.0), Block([SympyAssignment(data.center(), 2.0)]))
    ]
-    ast = ps.create_kernel(c, target=ps.Target.CPU,
+
+    assignmets = NodeCollection(c)
+    ast = ps.create_kernel(assignments=assignmets, target=ps.Target.CPU,
                           cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(data=data_arr)
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        # we only know that the first value has changed
        np.testing.assert_equal(data_arr[3:9, :3 * width - 1], 2.0)
    else:
@@ -41,27 +43,24 @@ def test_vec_any(instruction_set, dtype):


 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
-@pytest.mark.parametrize('dtype', ('float', 'double'))
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
 def test_vec_all(instruction_set, dtype):
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        width = 1000  # we don't know the actual value, need something guaranteed larger than vector
    else:
        width = get_vector_instruction_set(dtype, instruction_set)['width']
-    data_arr = np.zeros((4 * width, 4 * width), dtype=np.float64 if dtype == 'double' else np.float32)
+    data_arr = np.zeros((4 * width, 4 * width), dtype=dtype)

    data_arr[3:9, 1:3 * width - 1] = 1.0
    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)

-    c = [
-        Conditional(vec_all(data.center() > 0.0), Block([
-            ps.Assignment(data.center(), 2.0)
-        ]))
-    ]
-    ast = ps.create_kernel(c, target=Target.CPU,
+    c = [Conditional(vec_all(data.center() > 0.0), Block([SympyAssignment(data.center(), 2.0)]))]
+    assignmets = NodeCollection(c)
+    ast = ps.create_kernel(assignmets, target=Target.CPU,
                           cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(data=data_arr)
-    if instruction_set in ['sve', 'rvv']:
+    if instruction_set in ['sve', 'sve2', 'sme', 'rvv']:
        # we only know that some values in the middle have been replaced
        assert np.all(data_arr[3:9, :2] <= 1.0)
        assert np.any(data_arr[3:9, 2:] == 2.0)
@@ -88,26 +87,83 @@ def test_boolean_before_loop():
    ast = ps.create_kernel(a, cpu_vectorize_info={'instruction_set': supported_instruction_sets[-1]})
    kernel = ast.compile()
    kernel(f=f_arr, g=g_arr, t2=1.0)
-    print(g)
+    # print(g)
    np.testing.assert_array_equal(g_arr, 1.0)
    kernel(f=f_arr, g=g_arr, t2=-1.0)
    np.testing.assert_array_equal(g_arr, 42.0)


 @pytest.mark.parametrize('instruction_set', supported_instruction_sets)
-@pytest.mark.parametrize('dtype', ('float', 'double'))
-def test_vec_maskstore(instruction_set, dtype):
-    data_arr = np.zeros((16, 16), dtype=np.float64 if dtype == 'double' else np.float32)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+@pytest.mark.parametrize('aligned', [False, True])
+def test_vec_maskstore(instruction_set, dtype, nontemporal, aligned):
+    """Check the vectorized conditional store on a 16x16 array with a three-cell border of zeros.
+
+    If the instruction set provides 'maskStore', the generated code must contain the expected masked
+    store instruction (stream / aligned / plain variant, depending on the parameters).
+    """
+    data_arr = (aligned_zeros if aligned else np.zeros)((16, 16), dtype=dtype)
    data_arr[3:-3, 3:-3] = 1.0
    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)

-    c = [
-        Conditional(data.center() < 1.0, Block([
-            ps.Assignment(data.center(), 2.0)
-        ]))
-    ]
-    ast = ps.create_kernel(c, target=Target.CPU,
-                           cpu_vectorize_info={'instruction_set': instruction_set})
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+
+    assignments = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal,
+                                                       'assume_aligned': aligned},
+                                   default_number_float=dtype)
+    ast = ps.create_kernel(assignments, config=config)
+    if 'maskStore' in ast.instruction_set:
+        instruction = 'maskStream' if nontemporal and 'maskStream' in ast.instruction_set else (
+                      'maskStoreA' if aligned and 'maskStoreA' in ast.instruction_set else 'maskStore')
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
+    kernel = ast.compile()
+    kernel(data=data_arr)
+    np.testing.assert_equal(data_arr[:3, :], 2.0)
+    np.testing.assert_equal(data_arr[-3:, :], 2.0)
+    np.testing.assert_equal(data_arr[:, :3], 2.0)
+    np.testing.assert_equal(data_arr[:, -3:], 2.0)
+    np.testing.assert_equal(data_arr[3:-3, 3:-3], 1.0)
+
+
+@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
+@pytest.mark.parametrize('dtype', ('float32', 'float64'))
+@pytest.mark.parametrize('nontemporal', [False, True])
+def test_vec_maskscatter(instruction_set, dtype, nontemporal):
+    """Check the conditional store for a field that is declared without a concrete array.
+
+    Either the masked scatter instruction ('maskStoreS' / 'maskStreamS') shows up in the generated code,
+    or kernel creation warns with 'Could not vectorize loop'.
+    """
+    import warnings
+
+    data_arr = np.zeros((16, 16), dtype=dtype)
+    data_arr[3:-3, 3:-3] = 1.0
+    data = ps.fields(f"data: {dtype}[2D]")
+
+    c = [Conditional(data.center() < 1.0, Block([SympyAssignment(data.center(), 2.0)]))]
+
+    assignments = NodeCollection(c)
+    config = ps.CreateKernelConfig(cpu_vectorize_info={'instruction_set': instruction_set,
+                                                       'nontemporal': nontemporal},
+                                   default_number_float=dtype)
+    if 'maskStoreS' not in get_vector_instruction_set(dtype, instruction_set) \
+            and not instruction_set.startswith('sve'):
+        with pytest.warns(UserWarning) as warn:
+            ast = ps.create_kernel(assignments, config=config)
+            assert 'Could not vectorize loop' in warn[0].message.args[0]
+    else:
+        # pytest.warns(None) is deprecated since pytest 7 and rejected by pytest 8; escalating
+        # warnings to errors makes any warning raised during kernel creation fail the test
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            ast = ps.create_kernel(assignments, config=config)
+        instruction = 'maskStreamS' if nontemporal and 'maskStreamS' in ast.instruction_set else 'maskStoreS'
+        assert ast.instruction_set[instruction].split('{')[0] in ps.get_code_str(ast)
+    print(ps.get_code_str(ast))
    kernel = ast.compile()
    kernel(data=data_arr)
    np.testing.assert_equal(data_arr[:3, :], 2.0)

--- a/tests/test_config.py
+++ b/tests/test_config.py
+from collections import defaultdict
+import numpy as np
+import pytest
+
+from pystencils import CreateKernelConfig, Target, Backend
+from pystencils.typing import BasicType
+
+
+def test_config():
+    """Check the target/backend pairing and the data-type defaults derived by CreateKernelConfig."""
+    # targets
+    for target, expected_backend in ((Target.CPU, Backend.C), (Target.GPU, Backend.CUDA)):
+        config = CreateKernelConfig(target=target)
+        assert config.target == target
+        assert config.backend == expected_backend
+
+    # typing: (constructor arguments, expected default field type, default float, default int)
+    typing_cases = (
+        ({'data_type': np.float64}, 'float64', 'float64', 'int64'),
+        ({'data_type': np.float32}, 'float32', 'float32', 'int64'),
+        ({'data_type': np.float32, 'default_number_float': np.float64}, 'float32', 'float64', 'int64'),
+        ({'data_type': np.float32, 'default_number_float': np.float64, 'default_number_int': np.int16},
+         'float32', 'float64', 'int16'),
+        ({'data_type': 'float64'}, 'float64', 'float64', 'int64'),
+        ({'data_type': {'a': np.float64, 'b': np.float32}}, 'float64', 'float64', 'int64'),
+        ({'data_type': {'a': np.float32, 'b': np.int32}}, 'float32', 'float32', 'int64'),
+    )
+    for kwargs, field_type, float_type, int_type in typing_cases:
+        config = CreateKernelConfig(**kwargs)
+        assert isinstance(config.data_type, defaultdict)
+        assert config.data_type.default_factory() == BasicType(field_type)
+        assert config.default_number_float == BasicType(float_type)
+        assert config.default_number_int == BasicType(int_type)
+
+
+def test_config_target_as_string():
+    """A target passed as a plain string must be rejected with ValueError."""
+    string_target = 'cpu'
+    with pytest.raises(ValueError):
+        CreateKernelConfig(target=string_target)
+
+
+def test_config_backend_as_string():
+    """A backend passed as a plain string must be rejected with ValueError."""
+    string_backend = 'C'
+    with pytest.raises(ValueError):
+        CreateKernelConfig(backend=string_backend)
+
+
+def test_config_python_types():
+    """The Python builtin ``float`` as data_type must be rejected with ValueError."""
+    builtin_type = float
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=builtin_type)
+
+
+def test_config_python_types2():
+    """A data_type dict that maps a name to the builtin ``float`` must be rejected with ValueError."""
+    per_name_types = {'a': float}
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=per_name_types)
+
+
+def test_config_python_types3():
+    """The builtin ``float`` as default_number_float must be rejected with ValueError."""
+    builtin_type = float
+    with pytest.raises(ValueError):
+        CreateKernelConfig(default_number_float=builtin_type)
+
+
+def test_config_python_types4():
+    """The builtin ``int`` as default_number_int must be rejected with ValueError."""
+    builtin_type = int
+    with pytest.raises(ValueError):
+        CreateKernelConfig(default_number_int=builtin_type)
+
+
+def test_config_python_types5():
+    """The string "float" as data_type must be rejected with ValueError."""
+    type_name = "float"
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=type_name)
+
+
+def test_config_python_types6():
+    """The string "float" as default_number_float must be rejected with ValueError."""
+    type_name = "float"
+    with pytest.raises(ValueError):
+        CreateKernelConfig(default_number_float=type_name)
+
+
+def test_config_python_types7():
+    """A defaultdict whose default factory yields the string 'float' must be rejected with ValueError."""
+    per_name_types = {'a': np.float64, 'b': np.int64}
+    type_mapping = defaultdict(lambda: 'float', per_name_types)
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=type_mapping)
+
+
+def test_config_python_types8():
+    """A defaultdict whose default factory yields the builtin ``float`` must be rejected with ValueError."""
+    per_name_types = {'a': np.float64, 'b': np.int64}
+    type_mapping = defaultdict(lambda: float, per_name_types)
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=type_mapping)
+
+
+def test_config_python_types9():
+    """A defaultdict with the string 'float' among its entries must be rejected with ValueError."""
+    per_name_types = {'a': 'float', 'b': np.int64}
+    type_mapping = defaultdict(lambda: 'float32', per_name_types)
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=type_mapping)
+
+
+def test_config_python_types10():
+    """A defaultdict with the builtin ``float`` among its entries must be rejected with ValueError."""
+    per_name_types = {'a': float, 'b': np.int64}
+    type_mapping = defaultdict(lambda: 'float32', per_name_types)
+    with pytest.raises(ValueError):
+        CreateKernelConfig(data_type=type_mapping)
--- a/tests/test_create_kernel_config.py
+++ b/tests/test_create_kernel_config.py
+import numpy as np
+import sympy as sp
+import pystencils as ps
+import pystencils.config
+
+
+def test_create_kernel_config():
+    """Check which target and backend CreateKernelConfig reports for different constructor arguments."""
+    make_config = pystencils.config.CreateKernelConfig
+
+    # no arguments: C backend on the CPU target
+    default_config = make_config()
+    assert default_config.backend == ps.Backend.C
+    assert default_config.target == ps.Target.CPU
+
+    # requesting the GPU target yields the CUDA backend
+    gpu_config = make_config(target=ps.Target.GPU)
+    assert gpu_config.backend == ps.Backend.CUDA
+
+    # requesting only the CUDA backend keeps the target at CPU
+    cuda_backend_config = make_config(backend=ps.Backend.CUDA)
+    assert cuda_backend_config.target == ps.Target.CPU
+    assert cuda_backend_config.backend == ps.Backend.CUDA
+
+
+def test_kernel_decorator_config():
+    """Check that the result of ``ps.kernel_config(config)`` can be unpacked into ``ps.create_kernel``."""
+    config = pystencils.config.CreateKernelConfig()
+    a, b, c = ps.fields(a=np.ones(100), b=np.ones(100), c=np.ones(100))
+
+    # NOTE(review): the decorated name is unpacked with ``**`` below, so the decorator presumably
+    # returns a mapping of create_kernel keyword arguments - confirm against ps.kernel_config
+    @ps.kernel_config(config)
+    def test():
+        a[0] @= b[0] + c[0]
+
+    ps.create_kernel(**test)
+
+
+def test_kernel_decorator2():
+    """Create a kernel from a ``@ps.kernel``-decorated function and generate its code string."""
+    h = sp.symbols("h")
+    dtype = "float64"
+
+    # note: the Python name ``dst`` is bound to the field declared as "src_tmp"
+    src, dst = ps.fields(f"src, src_tmp: {dtype}[3D]")
+
+    @ps.kernel
+    def kernel_func():
+        dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0]
+                         + src[0, 1, 0] + src[0, -1, 0]
+                         + src[0, 0, 1] + src[0, 0, -1]) / (6 * h ** 2)
+
+    # assignments = ps.assignment_from_stencil(stencil, src, dst, normalization_factor=2)
+    ast = ps.create_kernel(kernel_func)
+
+    # the string is not inspected any further; the call only has to succeed
+    code = ps.get_code_str(ast)
--- a/pystencils_tests/test_custom_backends.py
+++ b/pystencils_tests/test_custom_backends.py
 from subprocess import CalledProcessError

 import pytest
-import sympy

 import pystencils
 import pystencils.cpu.cpujit
@@ -25,10 +24,10 @@ class ScreamingGpuBackend(CudaBackend):


 def test_custom_backends_cpu():
-    z, x, y = pystencils.fields("z, y, x: [2d]")
+    z, y, x = pystencils.fields("z, y, x: [2d]")

    normal_assignments = pystencils.AssignmentCollection([pystencils.Assignment(
-        z[0, 0], x[0, 0] * sympy.log(x[0, 0] * y[0, 0]))], [])
+        z[0, 0], x[0, 0] * x[0, 0] * y[0, 0])], [])

    ast = pystencils.create_kernel(normal_assignments, target=Target.CPU)
    pystencils.show_code(ast, ScreamingBackend())
@@ -37,16 +36,17 @@ def test_custom_backends_cpu():


 def test_custom_backends_gpu():
-    pytest.importorskip('pycuda')
-    import pycuda.driver
-    import pystencils.gpucuda.cudajit
+    """Show a GPU kernel through ScreamingGpuBackend and expect compiling with that backend to fail."""
+    pytest.importorskip('cupy')
+    import cupy
+    import pystencils.gpu.gpujit

-    z, x, y = pystencils.fields("z, y, x: [2d]")
+    z, y, x = pystencils.fields("z, y, x: [2d]")

    normal_assignments = pystencils.AssignmentCollection([pystencils.Assignment(
-        z[0, 0], x[0, 0] * sympy.log(x[0, 0] * y[0, 0]))], [])
+        z[0, 0], x[0, 0] * x[0, 0] * y[0, 0])], [])

    ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
    pystencils.show_code(ast, ScreamingGpuBackend())
-    with pytest.raises(pycuda.driver.CompileError):
-        pystencils.gpucuda.cudajit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
+    with pytest.raises((cupy.cuda.compiler.JitifyException, cupy.cuda.compiler.CompileException)):
+        pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
--- a/pystencils_tests/test_data/datahandling_load_test.npz
+++ b/pystencils_tests/test_data/datahandling_load_test.npz
--- a/pystencils_tests/test_data/datahandling_parallel_load_test/dst.dat
+++ b/pystencils_tests/test_data/datahandling_parallel_load_test/dst.dat
--- a/pystencils_tests/test_data/datahandling_parallel_load_test/src.dat
+++ b/pystencils_tests/test_data/datahandling_parallel_load_test/src.dat
--- a/pystencils_tests/test_data/datahandling_parallel_save_test/dst.dat
+++ b/pystencils_tests/test_data/datahandling_parallel_save_test/dst.dat
--- a/pystencils_tests/test_data/datahandling_parallel_save_test/src.dat
+++ b/pystencils_tests/test_data/datahandling_parallel_save_test/src.dat
--- a/pystencils_tests/test_data/datahandling_save_test.npz
+++ b/pystencils_tests/test_data/datahandling_save_test.npz
--- a/pystencils_tests/test_data/lenna.png
+++ b/pystencils_tests/test_data/lenna.png
--- a/pystencils_tests/test_data/test_vessel2d_mask.png
+++ b/pystencils_tests/test_data/test_vessel2d_mask.png
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -6,8 +6,7 @@ import numpy as np

 import pystencils as ps
 from pystencils import create_data_handling, create_kernel
-from pystencils.datahandling.pycuda import PyCudaArrayHandler
-from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler
 from pystencils.enums import Target

 try:
@@ -16,6 +15,14 @@ except ImportError:
    import unittest.mock
    pytest = unittest.mock.MagicMock()

+# cupy can be importable on a machine without a usable GPU or driver; getDeviceCount() may then raise
+# CUDARuntimeError (a RuntimeError subclass), which must not break the collection of this module
+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except (ImportError, RuntimeError):
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"

@@ -86,11 +91,7 @@ def access_and_gather(dh, domain_size):
 def synchronization(dh, test_gpu=False):
    field_name = 'comm_field_test'
    if test_gpu:
-        try:
-            from pycuda import driver
-            import pycuda.autoinit
-        except ImportError:
-            return
+        pytest.importorskip("cupy")
        field_name += 'Gpu'

    dh.add_array(field_name, ghost_layers=1, dtype=np.int8, cpu=True, gpu=test_gpu)
@@ -117,7 +118,7 @@ def synchronization(dh, test_gpu=False):

 def kernel_execution_jacobi(dh, target):

-    test_gpu = target == Target.GPU or target == Target.OPENCL
+    test_gpu = target == Target.GPU
    dh.add_array('f', gpu=test_gpu)
    dh.add_array('tmp', gpu=test_gpu)

@@ -133,7 +134,7 @@ def kernel_execution_jacobi(dh, target):
    def jacobi():
        dh.fields.tmp.center @= sum(dh.fields.f.neighbors(stencil)) / len(stencil)

-    kernel = create_kernel(jacobi, target=target).compile()
+    kernel = create_kernel(jacobi, config=ps.CreateKernelConfig(target=target)).compile()
    for b in dh.iterate(ghost_layers=1):
        b['f'].fill(42)
    dh.run_kernel(kernel)
@@ -216,22 +217,18 @@ def test_kernel():
        reduction(dh)

        try:
-            import pycuda
+            import cupy
            dh = create_data_handling(domain_size=domain_shape, periodicity=True)
            kernel_execution_jacobi(dh, Target.GPU)
        except ImportError:
            pass


-@pytest.mark.parametrize('target', (Target.CPU, Target.GPU, Target.OPENCL))
+@pytest.mark.parametrize('target', (Target.CPU, Target.GPU))
 def test_kernel_param(target):
    for domain_shape in [(4, 5), (3, 4, 5)]:
        if target == Target.GPU:
-            pytest.importorskip('pycuda')
-        if target == Target.OPENCL:
-            pytest.importorskip('pyopencl')
-            from pystencils.opencl.opencljit import init_globally
-            init_globally()
+            pytest.importorskip('cupy')

        dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
        kernel_execution_jacobi(dh, target)
@@ -260,6 +257,23 @@ def test_add_arrays():
    assert y == dh.fields['y']


+@pytest.mark.parametrize('shape', [(17, 12), (7, 11, 18)])
+@pytest.mark.parametrize('layout', ['zyxf', 'fzyx'])
+def test_add_arrays_with_layout(shape, layout):
+    """CPU and GPU arrays created with an explicit default layout must agree in shape, strides and dtype."""
+    pytest.importorskip('cupy')
+
+    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU)
+    field = dh.add_array("f1", values_per_cell=19)
+    dh.fill(field.name, 1.0)
+
+    cpu_arr = dh.cpu_arrays[field.name]
+    gpu_arr = dh.gpu_arrays[field.name]
+    assert cpu_arr.shape == gpu_arr.shape
+    assert cpu_arr.strides == gpu_arr.strides
+    assert cpu_arr.dtype == gpu_arr.dtype
+
+
 def test_get_kwarg():
    domain_shape = (10, 10)
    field_description = 'src, dst'
@@ -270,7 +281,7 @@ def test_get_kwarg():
    dh.fill("dst", 0.0, ghost_layers=True)

    with pytest.raises(ValueError):
-        dh.add_array('src')
+        dh.add_array('src', values_per_cell=1)

    ur = ps.Assignment(src.center, dst.center)
    kernel = ps.create_kernel(ur).compile()
@@ -281,22 +292,20 @@ def test_get_kwarg():


 def test_add_custom_data():
-    pytest.importorskip('pycuda')
-
-    import pycuda.gpuarray as gpuarray
-    import pycuda.autoinit  # noqa
+    pytest.importorskip('cupy')
+    import cupy as cp

    def cpu_data_create_func():
        return np.ones((2, 2), dtype=np.float64)

    def gpu_data_create_func():
-        return gpuarray.zeros((2, 2), dtype=np.float64)
+        return cp.zeros((2, 2), dtype=np.float64)

    def cpu_to_gpu_transfer_func(gpuarr, cpuarray):
        gpuarr.set(cpuarray)

    def gpu_to_cpu_transfer_func(gpuarr, cpuarray):
-        gpuarr.get(cpuarray)
+        cpuarray[:] = gpuarr.get()

    dh = create_data_handling(domain_size=(10, 10))
    dh.add_custom_data('custom_data',
@@ -362,20 +371,11 @@ def test_load_data():
    assert np.all(dh.cpu_arrays['dst2']) == 0


-@pytest.mark.parametrize('target', (Target.GPU, Target.OPENCL))
-def test_array_handler(target):
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
    size = (2, 2)
-    if target == Target.GPU:
-        pytest.importorskip('pycuda')
-        array_handler = PyCudaArrayHandler()
-    if target == Target.OPENCL:
-        pytest.importorskip('pyopencl')
-        import pyopencl as cl
-        from pystencils.opencl.opencljit import init_globally
-        init_globally()
-        ctx = cl.create_some_context(0)
-        queue = cl.CommandQueue(ctx)
-        array_handler = PyOpenClArrayHandler(queue)
+    pytest.importorskip('cupy')
+    array_handler = GPUArrayHandler(device_number)

    zero_array = array_handler.zeros(size)
    cpu_array = np.empty(size)
@@ -389,8 +389,23 @@ def test_array_handler(target):

    empty = array_handler.empty(size)
    assert empty.strides == (16, 8)
-    empty = array_handler.empty(shape=size, layout=(1, 0))
+    empty = array_handler.empty(shape=size, order="F")
    assert empty.strides == (8, 16)

    random_array = array_handler.randn(size)

+    cpu_array = np.empty((20, 40), dtype=np.float64)
+    gpu_array = array_handler.to_gpu(cpu_array)
+
+    assert cpu_array.base is None
+    assert gpu_array.base is None
+    assert gpu_array.strides == cpu_array.strides
+
+    cpu_array2 = np.empty((20, 40), dtype=np.float64)
+    cpu_array2 = cpu_array2.swapaxes(0, 1)
+    gpu_array2 = array_handler.to_gpu(cpu_array2)
+
+    assert cpu_array2.base is not None
+    assert gpu_array2.base is not None
+    assert gpu_array2.strides == cpu_array2.strides
+
--- a/pystencils_tests/test_datahandling_parallel.py
+++ b/pystencils_tests/test_datahandling_parallel.py
@@ -4,7 +4,6 @@ import waLBerla as wlb
 import pystencils
 from pystencils import make_slice

-from tempfile import TemporaryDirectory
 from pathlib import Path

 from pystencils.boundaries import BoundaryHandling, Neumann
@@ -12,7 +11,7 @@ from pystencils.slicing import slice_from_direction

 from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
 from pystencils.datahandling import create_data_handling
-from pystencils_tests.test_datahandling import (
+from tests.test_datahandling import (
    access_and_gather, kernel_execution_jacobi, reduction, synchronization, vtk_output)

 SCRIPT_FOLDER = Path(__file__).parent.absolute()
@@ -34,14 +33,12 @@ def test_access_and_gather():
    dh = ParallelDataHandling(blocks, default_ghost_layers=2)
    access_and_gather(dh, cells)
    synchronization(dh, test_gpu=False)
-    if hasattr(wlb, 'cuda'):
+    if hasattr(wlb, 'gpu'):
        synchronization(dh, test_gpu=True)


 def test_gpu():
-    if not hasattr(wlb, 'cuda'):
-        print("Skip GPU tests because walberla was built without CUDA")
-        return
+    pytest.importorskip('waLBerla.gpu')

    block_size = (4, 7, 1)
    num_blocks = (3, 2, 1)
@@ -59,24 +56,22 @@ def test_gpu():
        np.testing.assert_equal(b['v'], 42)


-def test_kernel():
-
-    for gpu in (True, False):
-        if gpu and not hasattr(wlb, 'cuda'):
-            print("Skipping CUDA tests because walberla was built without GPU support")
-            continue
-
-        # 3D
-        blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
-        dh = ParallelDataHandling(blocks)
-        kernel_execution_jacobi(dh, pystencils.Target.GPU)
-        reduction(dh)
-
-        # 2D
-        blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 1), cellsPerBlock=(3, 2, 1), oneBlockPerProcess=False)
-        dh = ParallelDataHandling(blocks, dim=2)
-        kernel_execution_jacobi(dh, pystencils.Target.GPU)
-        reduction(dh)
+@pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
+def test_kernel(target):
+    """Call kernel_execution_jacobi and reduction on a 3D and a 2D block grid for the given target."""
+    if target == pystencils.Target.GPU:
+        pytest.importorskip('waLBerla.gpu')
+
+    # (number of blocks, cells per block, extra ParallelDataHandling arguments): 3D first, then 2D
+    setups = (
+        ((3, 2, 4), (3, 2, 5), {}),
+        ((3, 2, 1), (3, 2, 1), {'dim': 2}),
+    )
+    for grid_blocks, block_cells, dh_kwargs in setups:
+        blocks = wlb.createUniformBlockGrid(blocks=grid_blocks, cellsPerBlock=block_cells, oneBlockPerProcess=False)
+        dh = ParallelDataHandling(blocks, default_target=target, **dh_kwargs)
+        kernel_execution_jacobi(dh, target)
+        reduction(dh)


 def test_vtk_output():
@@ -90,7 +85,7 @@ def test_block_iteration():
    num_blocks = (2, 2, 2)
    blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False)
    dh = ParallelDataHandling(blocks, default_ghost_layers=2)
-    dh.add_array('v', values_per_cell=1, dtype=np.int64, ghost_layers=2, gpu=True)
+    dh.add_array('v', values_per_cell=1, dtype=np.int64, ghost_layers=2)

    for b in dh.iterate():
        b['v'].fill(1)
@@ -113,10 +108,12 @@ def test_block_iteration():


 def test_getter_setter():
+    pytest.importorskip('waLBerla.gpu')
+
    block_size = (2, 2, 2)
    num_blocks = (2, 2, 2)
    blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False)
-    dh = ParallelDataHandling(blocks, default_ghost_layers=2)
+    dh = ParallelDataHandling(blocks, default_ghost_layers=2, default_target=pystencils.Target.GPU)
    dh.add_array('v', values_per_cell=1, dtype=np.int64, ghost_layers=2, gpu=True)

    assert dh.shape == (4, 4, 4)
@@ -134,15 +131,20 @@ def test_getter_setter():


 def test_parallel_datahandling_boundary_conditions():
-    pytest.importorskip('waLBerla.cuda')
-    dh = create_data_handling(domain_size=(7, 7), periodicity=True, parallel=True, default_target=pystencils.Target.GPU)
-    src = dh.add_array('src')
-    src2 = dh.add_array('src2')
-    dh.fill("src", 0.0, ghost_layers=True)
-    dh.fill("src", 1.0, ghost_layers=False)
-    src_cpu = dh.add_array('src_cpu', gpu=False)
-    dh.fill("src_cpu", 0.0, ghost_layers=True)
-    dh.fill("src_cpu", 1.0, ghost_layers=False)
+    pytest.importorskip('waLBerla.gpu')
+
+    dh = create_data_handling(domain_size=(7, 7), periodicity=True, parallel=True,
+                              default_target=pystencils.Target.GPU)
+
+    src = dh.add_array('src', values_per_cell=1)
+    dh.fill(src.name, 0.0, ghost_layers=True)
+    dh.fill(src.name, 1.0, ghost_layers=False)
+
+    src2 = dh.add_array('src2', values_per_cell=1)
+
+    src_cpu = dh.add_array('src_cpu', values_per_cell=1, gpu=False)
+    dh.fill(src_cpu.name, 0.0, ghost_layers=True)
+    dh.fill(src_cpu.name, 1.0, ghost_layers=False)

    boundary_stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
    boundary_handling_cpu = BoundaryHandling(dh, src_cpu.name, boundary_stencil,
@@ -165,10 +167,11 @@ def test_parallel_datahandling_boundary_conditions():
    boundary_handling()
    dh.all_to_cpu()
    for block in dh.iterate():
-        np.testing.assert_almost_equal(block["src_cpu"], block["src"])
+        np.testing.assert_almost_equal(block[src_cpu.name], block[src.name])

    assert dh.custom_data_names == ('boundary_handling_cpuIndexArrays', 'boundary_handling_gpuIndexArrays')
-    dh.swap("src", "src2", gpu=True)
+    dh.swap(src.name, src2.name, gpu=True)
+

 def test_save_data():
    domain_shape = (2, 2)

--- a/pystencils_tests/test_derivative.py
+++ b/pystencils_tests/test_derivative.py
--- a/tests/test_dot_printer.ipynb
+++ b/tests/test_dot_printer.ipynb
+%% Cell type:code id: tags:
+
+``` python
+from pystencils.session import *
+from pystencils.astnodes import Block, Conditional, SympyAssignment
+```
+
+%% Cell type:code id: tags:
+
+``` python
+src, dst = ps.fields("src, dst: double[2D]", layout='c')
+
+true_block = Block([SympyAssignment(dst[0, 0], src[-1, 0])])
+false_block = Block([SympyAssignment(dst[0, 0], src[1, 0])])
+ur = [true_block, Conditional(dst.center() > 0.0, true_block, false_block)]
+
+ast = ps.create_kernel(ur)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+ps.show_code(ast)
+```
+
+%% Output
+
+
+
+%% Cell type:code id: tags:
+
+``` python
+```
+%% Cell type:code id: tags:
+
+``` python
+from pystencils.session import *
+from pystencils.astnodes import Block, Conditional, SympyAssignment
+```
+
+%% Cell type:code id: tags:
+
+``` python
+src, dst = ps.fields("src, dst: double[2D]", layout='c')
+
+true_block = Block([SympyAssignment(dst[0, 0], src[-1, 0])])
+false_block = Block([SympyAssignment(dst[0, 0], src[1, 0])])
+ur = [true_block, Conditional(dst.center() > 0.0, true_block, false_block)]
+
+ast = ps.create_kernel(ur)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+ps.show_code(ast)
+```
+
+%% Output
+
+
+
+%% Cell type:code id: tags:
+
+``` python
+```
No results found