diff --git a/pystencils/__init__.py b/pystencils/__init__.py
index 3c3ef8f294f9146ed155553154f1477ae8767109..92fdda9c55bed3f28709f95b0088dd2d88e1904b 100644
--- a/pystencils/__init__.py
+++ b/pystencils/__init__.py
@@ -40,12 +40,3 @@ from ._version import get_versions
 
 __version__ = get_versions()['version']
 del get_versions
-
-# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
-try:
-    import cupy
-    if cupy.cuda.runtime.getDeviceCount() > 0:
-        GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
-                            key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
-except ImportError:
-    pass
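
For reference, the device-selection policy dropped from the package __init__ here can be reproduced in user code (it now lives in SerialDataHandling, see below). A minimal sketch, assuming cupy is installed and at least one CUDA device is visible; the helper name pick_default_device is illustrative only:

import cupy

def pick_default_device():
    # mem_info is (free, total) in bytes; pick the device with the largest total memory.
    # Ties go to the lower device id, because max() returns the first maximum it encounters.
    device_count = cupy.cuda.runtime.getDeviceCount()
    return max(range(device_count), key=lambda i: cupy.cuda.Device(i).mem_info[1])
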
diff --git a/pystencils/boundaries/boundaryhandling.py b/pystencils/boundaries/boundaryhandling.py
index 0d76da0bb429d502e08e68961c92d82abdc35751..53c3980e2d2c9c947d45410d03017da135d4a843 100644
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
@@ -18,6 +18,7 @@ try:
     import waLBerla as wlb
     if wlb.cpp_available:
         from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
     else:
         ParallelDataHandling = None
 except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
 
         if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = GPUArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
         else:
             array_handler = self.data_handling.array_handler
 
diff --git a/pystencils/datahandling/__init__.py b/pystencils/datahandling/__init__.py
index 7f142428cf14b62813a7b9b1b245ffae021c1fb8..18053d2d9d6546bcb5ac2093f5f63c1633965a4e 100644
--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
     """Creates a data handling instance.
 
     Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target: `Target`
         parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lowest device number is used
     """
     if isinstance(default_target, str):
         new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 
 
 __all__ = ['create_data_handling']
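
A usage sketch of the extended factory, assuming a machine with at least two CUDA devices; the field name 'f' and the values_per_cell argument are illustrative:

from pystencils import Target
from pystencils.datahandling import create_data_handling

# Pin all GPU arrays of this data handling to device 1 instead of relying on the
# automatic choice of the device with the largest memory.
dh = create_data_handling(domain_size=(64, 64), periodicity=True,
                          default_target=Target.GPU, device_number=1)
f = dh.add_array('f', values_per_cell=1)
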
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index 30be806b0adee90cbd61f47769023ee8f0e68743..0f5ddb431a869f3326f25b46a4f276268d2afd44 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.
 
@@ -30,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if  not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                             default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`; used for the creation and
+                           transfer of GPU arrays. By default, a fresh `GPUArrayHandler` is constructed
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lowest device number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@ class SerialDataHandling(DataHandling):
 
         if not array_handler:
             try:
-                self.array_handler = GPUArrayHandler()
-            except Exception:
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
                 self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler
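
Constructing the serial data handling directly works the same way. A sketch, assuming cupy and at least one CUDA device:

from pystencils import Target
from pystencils.datahandling.serial_datahandling import SerialDataHandling

# Explicit device: every GPU array created through this data handling is
# allocated on CUDA device 0.
dh = SerialDataHandling((32, 32), default_target=Target.GPU, device_number=0)

# No device given: the constructor falls back to the device with the largest
# total memory, or to GPUNotAvailableHandler if cupy cannot be imported.
dh_auto = SerialDataHandling((32, 32), default_target=Target.GPU)
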
diff --git a/pystencils/gpu/gpu_array_handler.py b/pystencils/gpu/gpu_array_handler.py
index 9031036f42a29f5a8adb2e08905c990ec8b441e8..ab5cad9a8e0c9a0a98905343214312ca7f03541c 100644
--- a/pystencils/gpu/gpu_array_handler.py
+++ b/pystencils/gpu/gpu_array_handler.py
@@ -6,30 +6,28 @@ except ImportError:
     cpx = None
 
 import numpy as np
-import pystencils
 
 
 class GPUArrayHandler:
-    @staticmethod
-    def zeros(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def __init__(self, device_number):
+        self._device_number = device_number
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.zeros(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def ones(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def ones(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.ones(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def empty(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def empty(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.empty(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def to_gpu(numpy_array):
+    def to_gpu(self, numpy_array):
         swaps = _get_index_swaps(numpy_array)
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 gpu_array = cp.asarray(numpy_array.base)
             for a, b in reversed(swaps):
                 gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
         else:
             return cp.asarray(numpy_array)
 
-    @staticmethod
-    def upload(array, numpy_array):
+    def upload(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.base.set(numpy_array.base)
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.set(numpy_array)
 
-    @staticmethod
-    def download(array, numpy_array):
+    def download(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array.base[:] = array.base.get()
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array[:] = array.get()
 
-    @staticmethod
-    def randn(shape, dtype=np.float64):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def randn(self, shape, dtype=np.float64):
+        with cp.cuda.Device(self._device_number):
             return cp.random.randn(*shape, dtype=dtype)
 
     @staticmethod
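
A round-trip sketch of the now instance-based handler, assuming cupy and at least one CUDA device; it mirrors how SerialDataHandling and BoundaryHandling use the class (the latter binds it to the currently active device via cupy.cuda.runtime.getDevice()):

import numpy as np
from pystencils.gpu.gpu_array_handler import GPUArrayHandler

handler = GPUArrayHandler(0)          # bind the handler to CUDA device 0

gpu_array = handler.zeros((16, 16))   # allocated with cp.cuda.Device(0) active
cpu_array = np.ones((16, 16))

handler.upload(gpu_array, cpu_array)  # host -> device; asserts matching device id
result = np.empty((16, 16))
handler.download(gpu_array, result)   # device -> host
assert np.array_equal(result, cpu_array)
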
diff --git a/pystencils/gpu/gpujit.py b/pystencils/gpu/gpujit.py
index efa5af826df117b22d26a8f766a0c8b1730ee3f4..52268924126870508c921a8569f6579707e72fda 100644
--- a/pystencils/gpu/gpujit.py
+++ b/pystencils/gpu/gpujit.py
@@ -1,6 +1,5 @@
 import numpy as np
 
-import pystencils
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
 from pystencils.typing import StructType
@@ -75,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                          for k, v in kwargs.items()))
         try:
             args, block_and_thread_numbers = cache[key]
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
         except KeyError:
             full_arguments = argument_dict.copy()
@@ -90,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             args = tuple(_build_numpy_argument_list(parameters, full_arguments))
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
-            # useful for debugging:
-            # with cp.cuda.Device(pystencils.GPU_DEVICE):
-            #     cp.cuda.runtime.deviceSynchronize()
+                # useful for debugging:
+                # cp.cuda.runtime.deviceSynchronize()
 
         # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
     ast = kernel_function_node
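
The device-inference logic added above, in isolation: with the global GPU_DEVICE gone, the launch device is derived from the kernel arguments themselves, so kernels working on different devices can coexist in one process. A minimal sketch, assuming cupy; infer_launch_device is an illustrative name, not part of the pystencils API:

import cupy as cp

def infer_launch_device(args):
    # Collect the device ids of all cupy arrays passed to the kernel; scalars and
    # numpy arrays are ignored. All GPU arrays must live on the same device.
    devices = set(a.device.id for a in args if type(a) is cp.ndarray)
    assert len(devices) == 1, "All arrays used by a kernel need to be allocated on the same device"
    return devices.pop()
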
diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py
index f328b75d57aba97f6294f84a4bf57960e4c174f1..066038cde246934d1694ee20613434ac9e3dc678 100644
--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
@@ -2,7 +2,6 @@ from typing import Union
 
 import numpy as np
 
-import pystencils
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.config import CreateKernelConfig
 from pystencils.typing import StructType, TypedSymbol
@@ -11,7 +10,7 @@ from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
 from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
-from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
+from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.transformations import (
     get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -22,8 +21,6 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                        config: CreateKernelConfig):
 
     function_name = config.function_name
-    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
-        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
     iteration_slice = config.iteration_slice
     ghost_layers = config.ghost_layers
@@ -123,8 +120,6 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
     index_fields = config.index_fields
     function_name = config.function_name
     coordinate_names = config.coordinate_names
-    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
-        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
 
     fields_written = assignments.bound_fields
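
Since the device number is no longer injected into the indexing parameters here, callers that need a specific device for the block-size heuristics can still pass one explicitly. A sketch under the assumption that BlockIndexing accepts device_number via gpu_indexing_params (the tests below pass it to BlockIndexing directly); the fields and assignment are illustrative:

import sympy as sp
from pystencils import Assignment, CreateKernelConfig, Target, create_kernel, fields

src, dst = fields("src, dst: double[2D]")
update = Assignment(dst.center, src.center * sp.Rational(1, 2))

config = CreateKernelConfig(target=Target.GPU,
                            gpu_indexing_params={'device_number': 0})
ast = create_kernel(update, config=config)
kernel = ast.compile()
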
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index 7f73a865b036986e899c233045fe0d6387953696..15e9cd74baf7df4e8922ce5f4a624a1fc0f6fb75 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -15,6 +15,12 @@ except ImportError:
     import unittest.mock
     pytest = unittest.mock.MagicMock()
 
+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
 
@@ -365,10 +371,11 @@ def test_load_data():
     assert np.all(dh.cpu_arrays['dst2']) == 0
 
 
-def test_array_handler():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
     size = (2, 2)
     pytest.importorskip('cupy')
-    array_handler = GPUArrayHandler()
+    array_handler = GPUArrayHandler(device_number)
 
     zero_array = array_handler.zeros(size)
     cpu_array = np.empty(size)
diff --git a/pystencils_tests/test_gpu.py b/pystencils_tests/test_gpu.py
index b0af7950da352c633d73d7115bf1ea661f884937..df452e2b1bd2ec55ecc57c0325ba994a3dac894f 100644
--- a/pystencils_tests/test_gpu.py
+++ b/pystencils_tests/test_gpu.py
@@ -1,14 +1,21 @@
+import pytest
+
 import numpy as np
 import cupy as cp
 import sympy as sp
 from scipy.ndimage import convolve
 
-import pystencils
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
 
+try:
+    import cupy
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 
 def test_averaging_kernel():
     size = (40, 55)
@@ -154,7 +161,8 @@ def test_periodicity():
     np.testing.assert_equal(cpu_result, gpu_result)
 
 
-def test_block_indexing():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_block_indexing(device_number):
     f = fields("f: [3D]")
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -164,16 +172,16 @@ def test_block_indexing():
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
 
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
-                       maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
+                       maximum_block_size="auto", device_number=device_number)
 
     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
     registers_per_thread = 1000
     blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
 
     if cp.cuda.runtime.is_hip:
-        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, pystencils.GPU_DEVICE)
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
     else:
-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
+        device = cp.cuda.Device(device_number)
         da = device.attributes
         max_registers_per_block = da.get("MaxRegistersPerBlock")