5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils_tests/test_data/lenna.png
+++ b/pystencils_tests/test_data/lenna.png
--- a/pystencils_tests/test_data/test_vessel2d_mask.png
+++ b/pystencils_tests/test_data/test_vessel2d_mask.png
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -6,7 +6,7 @@ import numpy as np

 import pystencils as ps
 from pystencils import create_data_handling, create_kernel
-from pystencils.datahandling.pycuda import PyCudaArrayHandler
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler
 from pystencils.enums import Target

 try:
@@ -15,6 +15,12 @@ except ImportError:
    import unittest.mock
    pytest = unittest.mock.MagicMock()

+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"

@@ -85,11 +91,7 @@ def access_and_gather(dh, domain_size):
 def synchronization(dh, test_gpu=False):
    field_name = 'comm_field_test'
    if test_gpu:
-        try:
-            from pycuda import driver
-            import pycuda.autoinit
-        except ImportError:
-            return
+        pytest.importorskip("cupy")
        field_name += 'Gpu'

    dh.add_array(field_name, ghost_layers=1, dtype=np.int8, cpu=True, gpu=test_gpu)
@@ -215,7 +217,7 @@ def test_kernel():
        reduction(dh)

        try:
-            import pycuda
+            import cupy
            dh = create_data_handling(domain_size=domain_shape, periodicity=True)
            kernel_execution_jacobi(dh, Target.GPU)
        except ImportError:
@@ -226,7 +228,7 @@ def test_kernel():
 def test_kernel_param(target):
    for domain_shape in [(4, 5), (3, 4, 5)]:
        if target == Target.GPU:
-            pytest.importorskip('pycuda')
+            pytest.importorskip('cupy')

        dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
        kernel_execution_jacobi(dh, target)
@@ -255,6 +257,20 @@ def test_add_arrays():
    assert y == dh.fields['y']


+@pytest.mark.parametrize('shape', [(17, 12), (7, 11, 18)])
+@pytest.mark.parametrize('layout', ['zyxf', 'fzyx'])
+def test_add_arrays_with_layout(shape, layout):
+    pytest.importorskip('cupy')
+
+    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU)
+    f1 = dh.add_array("f1", values_per_cell=19)
+    dh.fill(f1.name, 1.0)
+
+    assert dh.cpu_arrays[f1.name].shape == dh.gpu_arrays[f1.name].shape
+    assert dh.cpu_arrays[f1.name].strides == dh.gpu_arrays[f1.name].strides
+    assert dh.cpu_arrays[f1.name].dtype == dh.gpu_arrays[f1.name].dtype
+
+
 def test_get_kwarg():
    domain_shape = (10, 10)
    field_description = 'src, dst'
@@ -265,7 +281,7 @@ def test_get_kwarg():
    dh.fill("dst", 0.0, ghost_layers=True)

    with pytest.raises(ValueError):
-        dh.add_array('src')
+        dh.add_array('src', values_per_cell=1)

    ur = ps.Assignment(src.center, dst.center)
    kernel = ps.create_kernel(ur).compile()
@@ -276,22 +292,20 @@ def test_get_kwarg():


 def test_add_custom_data():
-    pytest.importorskip('pycuda')
-
-    import pycuda.gpuarray as gpuarray
-    import pycuda.autoinit  # noqa
+    pytest.importorskip('cupy')
+    import cupy as cp

    def cpu_data_create_func():
        return np.ones((2, 2), dtype=np.float64)

    def gpu_data_create_func():
-        return gpuarray.zeros((2, 2), dtype=np.float64)
+        return cp.zeros((2, 2), dtype=np.float64)

    def cpu_to_gpu_transfer_func(gpuarr, cpuarray):
        gpuarr.set(cpuarray)

    def gpu_to_cpu_transfer_func(gpuarr, cpuarray):
-        gpuarr.get(cpuarray)
+        cpuarray[:] = gpuarr.get()

    dh = create_data_handling(domain_size=(10, 10))
    dh.add_custom_data('custom_data',
@@ -357,10 +371,11 @@ def test_load_data():
    assert np.all(dh.cpu_arrays['dst2']) == 0


-def test_array_handler():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
    size = (2, 2)
-    pytest.importorskip('pycuda')
-    array_handler = PyCudaArrayHandler()
+    pytest.importorskip('cupy')
+    array_handler = GPUArrayHandler(device_number)

    zero_array = array_handler.zeros(size)
    cpu_array = np.empty(size)
@@ -374,8 +389,23 @@ def test_array_handler():

    empty = array_handler.empty(size)
    assert empty.strides == (16, 8)
-    empty = array_handler.empty(shape=size, layout=(1, 0))
+    empty = array_handler.empty(shape=size, order="F")
    assert empty.strides == (8, 16)

    random_array = array_handler.randn(size)

+    cpu_array = np.empty((20, 40), dtype=np.float64)
+    gpu_array = array_handler.to_gpu(cpu_array)
+
+    assert cpu_array.base is None
+    assert gpu_array.base is None
+    assert gpu_array.strides == cpu_array.strides
+
+    cpu_array2 = np.empty((20, 40), dtype=np.float64)
+    cpu_array2 = cpu_array2.swapaxes(0, 1)
+    gpu_array2 = array_handler.to_gpu(cpu_array2)
+
+    assert cpu_array2.base is not None
+    assert gpu_array2.base is not None
+    assert gpu_array2.strides == cpu_array2.strides
+
--- a/pystencils_tests/test_datahandling_parallel.py
+++ b/pystencils_tests/test_datahandling_parallel.py
@@ -11,7 +11,7 @@ from pystencils.slicing import slice_from_direction

 from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
 from pystencils.datahandling import create_data_handling
-from pystencils_tests.test_datahandling import (
+from tests.test_datahandling import (
    access_and_gather, kernel_execution_jacobi, reduction, synchronization, vtk_output)

 SCRIPT_FOLDER = Path(__file__).parent.absolute()
@@ -33,12 +33,12 @@ def test_access_and_gather():
    dh = ParallelDataHandling(blocks, default_ghost_layers=2)
    access_and_gather(dh, cells)
    synchronization(dh, test_gpu=False)
-    if hasattr(wlb, 'cuda'):
+    if hasattr(wlb, 'gpu'):
        synchronization(dh, test_gpu=True)


 def test_gpu():
-    pytest.importorskip('waLBerla.cuda')
+    pytest.importorskip('waLBerla.gpu')

    block_size = (4, 7, 1)
    num_blocks = (3, 2, 1)
@@ -59,7 +59,7 @@ def test_gpu():
 @pytest.mark.parametrize('target', (pystencils.Target.CPU, pystencils.Target.GPU))
 def test_kernel(target):
    if target == pystencils.Target.GPU:
-        pytest.importorskip('waLBerla.cuda')
+        pytest.importorskip('waLBerla.gpu')

    # 3D
    blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
@@ -108,7 +108,7 @@ def test_block_iteration():


 def test_getter_setter():
-    pytest.importorskip('waLBerla.cuda')
+    pytest.importorskip('waLBerla.gpu')

    block_size = (2, 2, 2)
    num_blocks = (2, 2, 2)
@@ -131,7 +131,7 @@ def test_getter_setter():


 def test_parallel_datahandling_boundary_conditions():
-    pytest.importorskip('waLBerla.cuda')
+    pytest.importorskip('waLBerla.gpu')

    dh = create_data_handling(domain_size=(7, 7), periodicity=True, parallel=True,
                              default_target=pystencils.Target.GPU)

--- a/pystencils_tests/test_derivative.py
+++ b/pystencils_tests/test_derivative.py
--- a/pystencils_tests/test_dot_printer.ipynb
+++ b/pystencils_tests/test_dot_printer.ipynb
--- a/pystencils_tests/test_dot_printer.py
+++ b/pystencils_tests/test_dot_printer.py
--- a/pystencils_tests/test_dtype_check.py
+++ b/pystencils_tests/test_dtype_check.py
--- a/pystencils_tests/test_fast_approximation.py
+++ b/pystencils_tests/test_fast_approximation.py
@@ -7,7 +7,7 @@ from pystencils.fast_approximation import (


 def test_fast_sqrt():
-    pytest.importorskip('pycuda')
+    pytest.importorskip('cupy')
    f, g = ps.fields("f, g: double[2D]")
    expr = sp.sqrt(f[0, 0] + f[1, 0])

@@ -30,7 +30,7 @@ def test_fast_sqrt():


 def test_fast_divisions():
-    pytest.importorskip('pycuda')
+    pytest.importorskip('cupy')
    f, g = ps.fields("f, g: double[2D]")
    expr = f[0, 0] / f[1, 0]
    assert len(insert_fast_divisions(expr).atoms(fast_division)) == 1

--- a/pystencils_tests/test_fd_derivation.ipynb
+++ b/pystencils_tests/test_fd_derivation.ipynb
--- a/pystencils_tests/test_fd_derivation_via_rotation.ipynb
+++ b/pystencils_tests/test_fd_derivation_via_rotation.ipynb
--- a/pystencils_tests/test_fd_derivative.py
+++ b/pystencils_tests/test_fd_derivative.py
--- a/pystencils_tests/test_field.py
+++ b/pystencils_tests/test_field.py
@@ -5,21 +5,33 @@ import sympy as sp
 import pystencils as ps
 from pystencils import TypedSymbol
 from pystencils.typing import create_type
-from pystencils.field import Field, FieldType, layout_string_to_tuple
+from pystencils.field import Field, FieldType, layout_string_to_tuple, spatial_layout_string_to_tuple


 def test_field_basic():
-    f = Field.create_generic('f', spatial_dimensions=2)
+    f = Field.create_generic("f", spatial_dimensions=2)
    assert FieldType.is_generic(f)
-    assert f['E'] == f[1, 0]
-    assert f['N'] == f[0, 1]
-    assert '_' in f.center._latex('dummy')
-
-    assert f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[0] == 0
-    assert f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[1] == 0
-
-    assert f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[0] == 0
-    assert f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[1] == 0
+    assert f["E"] == f[1, 0]
+    assert f["N"] == f[0, 1]
+    assert "_" in f.center._latex("dummy")
+
+    assert (
+        f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[0]
+        == 0
+    )
+    assert (
+        f.index_to_physical(index_coordinates=sp.Matrix([0, 0]), staggered=False)[1]
+        == 0
+    )
+
+    assert (
+        f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[0]
+        == 0
+    )
+    assert (
+        f.physical_to_index(physical_coordinates=sp.Matrix([0, 0]), staggered=False)[1]
+        == 0
+    )

    f1 = f.new_field_with_different_name("f1")
    assert f1.ndim == f.ndim
@@ -28,7 +40,7 @@ def test_field_basic():
    fixed = ps.fields("f(5, 5) : double[20, 20]")
    assert fixed.neighbor_vector((1, 1)).shape == (5, 5)

-    f = Field.create_fixed_size('f', (10, 10), strides=(80, 8), dtype=np.float64)
+    f = Field.create_fixed_size("f", (10, 10), strides=(80, 8), dtype=np.float64)
    assert f.spatial_strides == (10, 1)
    assert f.index_strides == ()
    assert f.center_vector == sp.Matrix([f.center])
@@ -37,20 +49,21 @@ def test_field_basic():
    assert f1.ndim == f.ndim
    assert f1.values_per_cell() == f.values_per_cell()

-    f = Field.create_fixed_size('f', (8, 8, 2, 2), index_dimensions=2)
-    assert f.center_vector == sp.Matrix([[f(0, 0), f(0, 1)],
-                                         [f(1, 0), f(1, 1)]])
+    f = Field.create_fixed_size("f", (8, 8, 2, 2), index_dimensions=2)
+    assert f.center_vector == sp.Matrix([[f(0, 0), f(0, 1)], [f(1, 0), f(1, 1)]])
    field_access = f[1, 1]
    assert field_access.nr_of_coordinates == 2
-    assert field_access.offset_name == 'NE'
+    assert field_access.offset_name == "NE"
    neighbor = field_access.neighbor(coord_id=0, offset=-2)
    assert neighbor.offsets == (-1, 1)
-    assert '_' in neighbor._latex('dummy')
+    assert "_" in neighbor._latex("dummy")

-    f = Field.create_fixed_size('f', (8, 8, 2, 2, 2), index_dimensions=3)
-    assert f.center_vector == sp.Array([[[f(i, j, k) for k in range(2)] for j in range(2)] for i in range(2)])
+    f = Field.create_fixed_size("f", (8, 8, 2, 2, 2), index_dimensions=3)
+    assert f.center_vector == sp.Array(
+        [[[f(i, j, k) for k in range(2)] for j in range(2)] for i in range(2)]
+    )

-    f = Field.create_generic('f', spatial_dimensions=5, index_dimensions=2)
+    f = Field.create_generic("f", spatial_dimensions=5, index_dimensions=2)
    field_access = f[1, -1, 2, -3, 0](1, 0)
    assert field_access.offsets == (1, -1, 2, -3, 0)
    assert field_access.index == (1, 0)
@@ -60,61 +73,71 @@ def test_error_handling():
    struct_dtype = np.dtype([('a', np.int32), ('b', np.float64), ('c', np.uint32)])
    Field.create_generic('f', spatial_dimensions=2, index_dimensions=0, dtype=struct_dtype)
    with pytest.raises(ValueError) as e:
-        Field.create_generic('f', spatial_dimensions=2, index_dimensions=1, dtype=struct_dtype)
-    assert 'index dimension' in str(e.value)
+        Field.create_generic(
+            "f", spatial_dimensions=2, index_dimensions=1, dtype=struct_dtype
+        )
+    assert "index dimension" in str(e.value)

-    arr = np.array([[[(1,)*3, (2,)*3, (3,)*3]]*2], dtype=struct_dtype)
-    Field.create_from_numpy_array('f', arr, index_dimensions=0)
+    arr = np.array([[[(1,) * 3, (2,) * 3, (3,) * 3]] * 2], dtype=struct_dtype)
+    Field.create_from_numpy_array("f", arr, index_dimensions=0)
    with pytest.raises(ValueError) as e:
-        Field.create_from_numpy_array('f', arr, index_dimensions=1)
-    assert 'Structured arrays' in str(e.value)
+        Field.create_from_numpy_array("f", arr, index_dimensions=1)
+    assert "Structured arrays" in str(e.value)

    arr = np.zeros([3, 3, 3])
-    Field.create_from_numpy_array('f', arr, index_dimensions=2)
+    Field.create_from_numpy_array("f", arr, index_dimensions=2)
    with pytest.raises(ValueError) as e:
-        Field.create_from_numpy_array('f', arr, index_dimensions=3)
-    assert 'Too many' in str(e.value)
+        Field.create_from_numpy_array("f", arr, index_dimensions=3)
+    assert "Too many" in str(e.value)

-    Field.create_fixed_size('f', (3, 2, 4), index_dimensions=0, dtype=struct_dtype, layout='reverse_numpy')
+    Field.create_fixed_size(
+        "f", (3, 2, 4), index_dimensions=0, dtype=struct_dtype, layout="reverse_numpy"
+    )
    with pytest.raises(ValueError) as e:
-        Field.create_fixed_size('f', (3, 2, 4), index_dimensions=1, dtype=struct_dtype, layout='reverse_numpy')
-    assert 'Structured arrays' in str(e.value)
-
-    f = Field.create_fixed_size('f', (10, 10))
+        Field.create_fixed_size(
+            "f",
+            (3, 2, 4),
+            index_dimensions=1,
+            dtype=struct_dtype,
+            layout="reverse_numpy",
+        )
+    assert "Structured arrays" in str(e.value)
+
+    f = Field.create_fixed_size("f", (10, 10))
    with pytest.raises(ValueError) as e:
        f[1]
-    assert 'Wrong number of spatial indices' in str(e.value)
+    assert "Wrong number of spatial indices" in str(e.value)

-    f = Field.create_generic('f', spatial_dimensions=2, index_shape=(3,))
+    f = Field.create_generic("f", spatial_dimensions=2, index_shape=(3,))
    with pytest.raises(ValueError) as e:
        f(3)
-    assert 'out of bounds' in str(e.value)
+    assert "out of bounds" in str(e.value)

-    f = Field.create_fixed_size('f', (10, 10, 3, 4), index_dimensions=2)
+    f = Field.create_fixed_size("f", (10, 10, 3, 4), index_dimensions=2)
    with pytest.raises(ValueError) as e:
        f(3, 0)
-    assert 'out of bounds' in str(e.value)
+    assert "out of bounds" in str(e.value)

    with pytest.raises(ValueError) as e:
        f(1, 0)(1, 0)
-    assert 'Indexing an already indexed' in str(e.value)
+    assert "Indexing an already indexed" in str(e.value)

    with pytest.raises(ValueError) as e:
        f(1)
-    assert 'Wrong number of indices' in str(e.value)
+    assert "Wrong number of indices" in str(e.value)

    with pytest.raises(ValueError) as e:
-        Field.create_generic('f', spatial_dimensions=2, layout='wrong')
-    assert 'Unknown layout descriptor' in str(e.value)
+        Field.create_generic("f", spatial_dimensions=2, layout="wrong")
+    assert "Unknown layout descriptor" in str(e.value)

-    assert layout_string_to_tuple('fzyx', dim=4) == (3, 2, 1, 0)
+    assert layout_string_to_tuple("fzyx", dim=4) == (3, 2, 1, 0)
    with pytest.raises(ValueError) as e:
-        layout_string_to_tuple('wrong', dim=4)
-    assert 'Unknown layout descriptor' in str(e.value)
+        layout_string_to_tuple("wrong", dim=4)
+    assert "Unknown layout descriptor" in str(e.value)


 def test_decorator_scoping():
-    dst = ps.fields('dst : double[2D]')
+    dst = ps.fields("dst : double[2D]")

    def f1():
        a = sp.Symbol("a")
@@ -134,7 +157,7 @@ def test_decorator_scoping():


 def test_string_creation():
-    x, y, z = ps.fields('  x(4),    y(3,5) z : double[  3,  47]')
+    x, y, z = ps.fields("  x(4),    y(3,5) z : double[  3,  47]")
    assert x.index_shape == (4,)
    assert y.index_shape == (3, 5)
    assert z.spatial_shape == (3, 47)
@@ -142,19 +165,85 @@ def test_string_creation():

 def test_itemsize():

-    x = ps.fields('x: float32[1d]')
-    y = ps.fields('y:  float64[2d]')
-    i = ps.fields('i:  int16[1d]')
+    x = ps.fields("x: float32[1d]")
+    y = ps.fields("y:  float64[2d]")
+    i = ps.fields("i:  int16[1d]")

    assert x.itemsize == 4
    assert y.itemsize == 8
    assert i.itemsize == 2


+def test_spatial_memory_layout_descriptors():
+    assert (
+        spatial_layout_string_to_tuple("AoS", 3)
+        == spatial_layout_string_to_tuple("aos", 3)
+        == spatial_layout_string_to_tuple("ZYXF", 3)
+        == spatial_layout_string_to_tuple("zyxf", 3)
+        == (2, 1, 0)
+    )
+    assert (
+        spatial_layout_string_to_tuple("SoA", 3)
+        == spatial_layout_string_to_tuple("soa", 3)
+        == spatial_layout_string_to_tuple("FZYX", 3)
+        == spatial_layout_string_to_tuple("fzyx", 3)
+        == spatial_layout_string_to_tuple("f", 3)
+        == spatial_layout_string_to_tuple("F", 3)
+        == (2, 1, 0)
+    )
+    assert (
+        spatial_layout_string_to_tuple("c", 3)
+        == spatial_layout_string_to_tuple("C", 3)
+        == (0, 1, 2)
+    )
+
+    assert spatial_layout_string_to_tuple("C", 5) == (0, 1, 2, 3, 4)
+
+    with pytest.raises(ValueError):
+        spatial_layout_string_to_tuple("aos", -1)
+
+    with pytest.raises(ValueError):
+        spatial_layout_string_to_tuple("aos", 4)
+
+
+def test_memory_layout_descriptors():
+    assert (
+        layout_string_to_tuple("AoS", 4)
+        == layout_string_to_tuple("aos", 4)
+        == layout_string_to_tuple("ZYXF", 4)
+        == layout_string_to_tuple("zyxf", 4)
+        == (2, 1, 0, 3)
+    )
+    assert (
+        layout_string_to_tuple("SoA", 4)
+        == layout_string_to_tuple("soa", 4)
+        == layout_string_to_tuple("FZYX", 4)
+        == layout_string_to_tuple("fzyx", 4)
+        == layout_string_to_tuple("f", 4)
+        == layout_string_to_tuple("F", 4)
+        == (3, 2, 1, 0)
+    )
+    assert (
+        layout_string_to_tuple("c", 4)
+        == layout_string_to_tuple("C", 4)
+        == (0, 1, 2, 3)
+    )
+
+    assert layout_string_to_tuple("C", 5) == (0, 1, 2, 3, 4)
+
+    with pytest.raises(ValueError):
+        layout_string_to_tuple("aos", -1)
+
+    with pytest.raises(ValueError):
+        layout_string_to_tuple("aos", 5)
+
+
 def test_staggered():

    # D2Q5
-    j1, j2, j3 = ps.fields('j1(2), j2(2,2), j3(2,2,2) : double[2D]', field_type=FieldType.STAGGERED)
+    j1, j2, j3 = ps.fields(
+        "j1(2), j2(2,2), j3(2,2,2) : double[2D]", field_type=FieldType.STAGGERED
+    )

    assert j1[0, 1](1) == j1.staggered_access((0, sp.Rational(1, 2)))
    assert j1[0, 1](1) == j1.staggered_access(np.array((0, sp.Rational(1, 2))))
@@ -163,7 +252,7 @@ def test_staggered():
    assert j1[0, 1](1) == j1.staggered_access("N")
    assert j1[0, 0](1) == j1.staggered_access("S")
    assert j1.staggered_vector_access("N") == sp.Matrix([j1.staggered_access("N")])
-    assert j1.staggered_stencil_name == 'D2Q5'
+    assert j1.staggered_stencil_name == "D2Q5"

    assert j1.physical_coordinates[0] == TypedSymbol("ctr_0", create_type("int"), nonnegative=True)
    assert j1.physical_coordinates[1] == TypedSymbol("ctr_1", create_type("int"), nonnegative=True)
@@ -176,28 +265,40 @@ def test_staggered():

    assert j2[0, 1](1, 1) == j2.staggered_access((0, sp.Rational(1, 2)), 1)
    assert j2[0, 1](1, 1) == j2.staggered_access("N", 1)
-    assert j2.staggered_vector_access("N") == sp.Matrix([j2.staggered_access("N", 0), j2.staggered_access("N", 1)])
+    assert j2.staggered_vector_access("N") == sp.Matrix(
+        [j2.staggered_access("N", 0), j2.staggered_access("N", 1)]
+    )

    assert j3[0, 1](1, 1, 1) == j3.staggered_access((0, sp.Rational(1, 2)), (1, 1))
    assert j3[0, 1](1, 1, 1) == j3.staggered_access("N", (1, 1))
-    assert j3.staggered_vector_access("N") == sp.Matrix([[j3.staggered_access("N", (i, j))
-                                                        for j in range(2)] for i in range(2)])
+    assert j3.staggered_vector_access("N") == sp.Matrix(
+        [[j3.staggered_access("N", (i, j)) for j in range(2)] for i in range(2)]
+    )

    # D2Q9
-    k1, k2 = ps.fields('k1(4), k2(2) : double[2D]', field_type=FieldType.STAGGERED)
+    k1, k2 = ps.fields("k1(4), k2(2) : double[2D]", field_type=FieldType.STAGGERED)

    assert k1[1, 1](2) == k1.staggered_access("NE")
    assert k1[0, 0](2) == k1.staggered_access("SW")
    assert k1[0, 0](3) == k1.staggered_access("NW")
-    
+
    a = k1.staggered_access("NE")
-    assert a._staggered_offset(a.offsets, a.index[0]) == [sp.Rational(1, 2), sp.Rational(1, 2)]
+    assert a._staggered_offset(a.offsets, a.index[0]) == [
+        sp.Rational(1, 2),
+        sp.Rational(1, 2),
+    ]
    a = k1.staggered_access("SW")
-    assert a._staggered_offset(a.offsets, a.index[0]) == [sp.Rational(-1, 2), sp.Rational(-1, 2)]
+    assert a._staggered_offset(a.offsets, a.index[0]) == [
+        sp.Rational(-1, 2),
+        sp.Rational(-1, 2),
+    ]
    a = k1.staggered_access("NW")
-    assert a._staggered_offset(a.offsets, a.index[0]) == [sp.Rational(-1, 2), sp.Rational(1, 2)]
+    assert a._staggered_offset(a.offsets, a.index[0]) == [
+        sp.Rational(-1, 2),
+        sp.Rational(1, 2),
+    ]

    # sign reversed when using as flux field
-    r = ps.fields('r(2) : double[2D]', field_type=FieldType.STAGGERED_FLUX)
+    r = ps.fields("r(2) : double[2D]", field_type=FieldType.STAGGERED_FLUX)
    assert r[0, 0](0) == r.staggered_access("W")
    assert -r[1, 0](0) == r.staggered_access("E")
--- a/pystencils_tests/test_field_access_poly.py
+++ b/pystencils_tests/test_field_access_poly.py
--- a/pystencils_tests/test_field_equality.ipynb
+++ b/pystencils_tests/test_field_equality.ipynb
--- a/pystencils_tests/test_finite_differences.py
+++ b/pystencils_tests/test_finite_differences.py
--- a/pystencils_tests/test_floor_ceil_int_optimization.py
+++ b/pystencils_tests/test_floor_ceil_int_optimization.py
--- a/pystencils_tests/test_fvm.py
+++ b/pystencils_tests/test_fvm.py
@@ -622,3 +622,20 @@ def test_source_stencil(stencil):
            assert len(diff.atoms(ps.field.Field.Access)) == 1
        else:
            assert len(diff.atoms(ps.field.Field.Access)) == 2
+
+
+def test_fvm_staggered_simplification():
+    D = sp.Symbol("D")
+    data_type = "float64"
+
+    c = ps.fields(f"c: {data_type}[2D]", layout='fzyx')
+    j = ps.fields(f"j(2): {data_type}[2D]", layout='fzyx', field_type=ps.FieldType.STAGGERED_FLUX)
+
+    grad_c = sp.Matrix([ps.fd.diff(c, i) for i in range(c.spatial_dimensions)])
+
+    ek = ps.fd.FVM1stOrder(c, flux=-D * grad_c)
+
+    ast = ps.create_staggered_kernel(ek.discrete_flux(j))
+    code = ps.get_code_str(ast)
+
+    assert '_size_c_0 - 1 < _size_c_0 - 1' not in code
--- a/pystencils_tests/test_global_definitions.py
+++ b/pystencils_tests/test_global_definitions.py
--- a/pystencils_tests/test_cudagpu.py
+++ b/pystencils_tests/test_cudagpu.py
+import pytest
+
 import numpy as np
-import pycuda.autoinit
-import pycuda.gpuarray as gpuarray
 import sympy as sp
+import math
 from scipy.ndimage import convolve

-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
-from pystencils.gpucuda import BlockIndexing
+from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target, get_code_str
+from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
-from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
+from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice
+
+try:
+    import cupy as cp
+    device_numbers = range(cp.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+    cp = None


 def test_averaging_kernel():
+    pytest.importorskip('cupy')
    size = (40, 55)
    src_arr = np.random.rand(*size)
    src_arr = add_ghost_layers(src_arr)
@@ -25,10 +34,10 @@ def test_averaging_kernel():
    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
    kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    gpu_src_arr = cp.asarray(src_arr)
+    gpu_dst_arr = cp.asarray(dst_arr)
    kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+    dst_arr = gpu_dst_arr.get()

    stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
@@ -37,6 +46,7 @@ def test_averaging_kernel():


 def test_variable_sized_fields():
+    pytest.importorskip('cupy')
    src_field = Field.create_generic('src', spatial_dimensions=2)
    dst_field = Field.create_generic('dst', spatial_dimensions=2)

@@ -52,10 +62,10 @@ def test_variable_sized_fields():
    src_arr = add_ghost_layers(src_arr)
    dst_arr = np.zeros_like(src_arr)

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    gpu_src_arr = cp.asarray(src_arr)
+    gpu_dst_arr = cp.asarray(dst_arr)
    kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+    dst_arr = gpu_dst_arr.get()

    stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
@@ -64,6 +74,7 @@ def test_variable_sized_fields():


 def test_multiple_index_dimensions():
+    pytest.importorskip('cupy')
    """Sums along the last axis of a numpy array"""
    src_size = (7, 6, 4)
    dst_size = src_size[:2]
@@ -81,10 +92,10 @@ def test_multiple_index_dimensions():
    ast = create_kernel([update_rule], config=config)
    kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    gpu_src_arr = cp.asarray(src_arr)
+    gpu_dst_arr = cp.asarray(dst_arr)
    kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+    dst_arr = gpu_dst_arr.get()

    reference = np.zeros_like(dst_arr)
    gl = np.max(np.abs(np.array(offset, dtype=int)))
@@ -96,6 +107,7 @@ def test_multiple_index_dimensions():


 def test_ghost_layer():
+    pytest.importorskip('cupy')
    size = (6, 5)
    src_arr = np.ones(size)
    dst_arr = np.zeros_like(src_arr)
@@ -109,10 +121,10 @@ def test_ghost_layer():
    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
    kernel = ast.compile()

-    gpu_src_arr = gpuarray.to_gpu(src_arr)
-    gpu_dst_arr = gpuarray.to_gpu(dst_arr)
+    gpu_src_arr = cp.asarray(src_arr)
+    gpu_dst_arr = cp.asarray(dst_arr)
    kernel(src=gpu_src_arr, dst=gpu_dst_arr)
-    gpu_dst_arr.get(dst_arr)
+    dst_arr = gpu_dst_arr.get()

    reference = np.zeros_like(src_arr)
    reference[ghost_layers[0][0]:-ghost_layers[0][1], ghost_layers[1][0]:-ghost_layers[1][1]] = 1
@@ -120,8 +132,9 @@ def test_ghost_layer():


 def test_setting_value():
+    pytest.importorskip('cupy')
    arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
-    arr_gpu = gpuarray.to_gpu(arr_cpu)
+    arr_gpu = cp.asarray(arr_cpu)

    iteration_slice = make_slice[:, :]
    f = Field.create_generic("f", 2)
@@ -136,11 +149,12 @@ def test_setting_value():


 def test_periodicity():
-    from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as periodic_gpu
+    pytest.importorskip('cupy')
+    from pystencils.gpu.periodicity import get_periodic_boundary_functor as periodic_gpu
    from pystencils.slicing import get_periodic_boundary_functor as periodic_cpu

    arr_cpu = np.arange(50, dtype=np.float64).reshape(5, 5, 2)
-    arr_gpu = gpuarray.to_gpu(arr_cpu)
+    arr_gpu = cp.asarray(arr_cpu)

    periodicity_stencil = [(1, 0), (-1, 0), (1, 1)]
    periodic_gpu_kernel = periodic_gpu(periodicity_stencil, (5, 5), 1, 2)
@@ -149,23 +163,95 @@ def test_periodicity():
    cpu_result = np.copy(arr_cpu)
    periodic_cpu_kernel(cpu_result)

-    gpu_result = np.copy(arr_cpu)
    periodic_gpu_kernel(pdfs=arr_gpu)
-    arr_gpu.get(gpu_result)
+    gpu_result = arr_gpu.get()
    np.testing.assert_equal(cpu_result, gpu_result)


-def test_block_indexing():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_block_indexing(device_number):
+    pytest.importorskip('cupy')
    f = fields("f: [3D]")
-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
+    s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
+    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
+                       permute_block_size_dependent_on_layout=False)
    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)

-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False)
+    bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1),
+                       permute_block_size_dependent_on_layout=False)
    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)

-    bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), maximum_block_size="auto")
+    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
+                       maximum_block_size="auto", device_number=device_number)
+
    # This function should be used if number of needed registers is known. Can be determined with func.num_regs
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], 1000)
+    registers_per_thread = 1000
+    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+
+    if cp.cuda.runtime.is_hip:
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
+    else:
+        device = cp.cuda.Device(device_number)
+        da = device.attributes
+        max_registers_per_block = da.get("MaxRegistersPerBlock")
+
+    assert np.prod(blocks) * registers_per_thread < max_registers_per_block
+

-    assert sum(blocks) < sum([1024, 1024, 1])
+@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
+@pytest.mark.parametrize('layout', ("C", "F"))
+@pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
+def test_four_dimensional_kernel(gpu_indexing, layout, shape):
+    pytest.importorskip('cupy')
+    n_elements = np.prod(shape)
+
+    arr_cpu = np.arange(n_elements, dtype=np.float64).reshape(shape, order=layout)
+    arr_gpu = cp.asarray(arr_cpu)
+
+    iteration_slice = make_slice[:, :, :, :]
+    f = Field.create_from_numpy_array("f", arr_cpu)
+    update_rule = [Assignment(f.center, sp.Symbol("value"))]
+
+    config = CreateKernelConfig(target=Target.GPU, gpu_indexing=gpu_indexing, iteration_slice=iteration_slice)
+    ast = create_kernel(update_rule, config=config)
+    kernel = ast.compile()
+
+    kernel(f=arr_gpu, value=np.float64(42.0))
+    np.testing.assert_equal(arr_gpu.get(), np.ones(shape) * 42.0)
+
+
+@pytest.mark.parametrize('start', (1, 5))
+@pytest.mark.parametrize('end', (-1, -2, -3, -4))
+@pytest.mark.parametrize('step', (1, 2, 3, 4))
+@pytest.mark.parametrize('shape', ([55, 60], [77, 101, 80], [44, 64, 66]))
+def test_guards_with_iteration_slices(start, end, step, shape):
+    iter_slice = tuple([slice(start, end, step)] * len(shape))
+
+    kernel_config_gpu = CreateKernelConfig(target=Target.GPU, iteration_slice=iter_slice)
+    field_1 = fields(f"f(1) : double{list(shape)}")
+    assignment = Assignment(field_1.center, 1)
+    ast = create_kernel(assignment, config=kernel_config_gpu)
+    code_str = get_code_str(ast)
+
+    test_strings = list()
+    iteration_ranges = list()
+    for i, s in enumerate(iter_slice):
+        e = ((shape[i] + end) - s.start) / s.step
+        e = math.ceil(e) + s.start
+        test_strings.append(f"{s.start} < {e}")
+
+        a = s.start
+        counter = 0
+        while a < e:
+            a += 1
+            counter += 1
+        iteration_ranges.append(counter)
+
+    # check if the expected if statement is in the GPU code
+    for s in test_strings:
+        assert s in code_str
+
+    # check if these bounds lead to same lengths as the range function would produce
+    for i in range(len(iter_slice)):
+        assert iteration_ranges[i] == len(range(iter_slice[i].start, shape[i] + end, iter_slice[i].step))
No results found