diff --git a/pystencils/__init__.py b/pystencils/__init__.py
index 3c3ef8f294f9146ed155553154f1477ae8767109..92fdda9c55bed3f28709f95b0088dd2d88e1904b 100644
--- a/pystencils/__init__.py
+++ b/pystencils/__init__.py
@@ -40,12 +40,3 @@ from ._version import get_versions
 
 __version__ = get_versions()['version']
 del get_versions
-
-# setting the default GPU to the one with maximal memory. GPU_DEVICE is safe to overwrite for different needs
-try:
-    import cupy
-    if cupy.cuda.runtime.getDeviceCount() > 0:
-        GPU_DEVICE = sorted(range(cupy.cuda.runtime.getDeviceCount()),
-                            key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
-except ImportError:
-    pass
diff --git a/pystencils/boundaries/boundaryhandling.py b/pystencils/boundaries/boundaryhandling.py
index 0d76da0bb429d502e08e68961c92d82abdc35751..53c3980e2d2c9c947d45410d03017da135d4a843 100644
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
@@ -18,6 +18,7 @@ try:
     import waLBerla as wlb
     if wlb.cpp_available:
         from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
     else:
         ParallelDataHandling = None
 except ImportError:
@@ -100,7 +101,7 @@ class BoundaryHandling:
         self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
 
         if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = GPUArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
         else:
             array_handler = self.data_handling.array_handler
 
diff --git a/pystencils/datahandling/__init__.py b/pystencils/datahandling/__init__.py
index 7f142428cf14b62813a7b9b1b245ffae021c1fb8..18053d2d9d6546bcb5ac2093f5f63c1633965a4e 100644
--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                          default_layout: str = 'SoA',
                          default_target: Target = Target.CPU,
                          parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
     """Creates a data handling instance.
 
     Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
         default_target: `Target`
         parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
         default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
     """
     if isinstance(default_target, str):
         new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                   periodicity=periodicity,
                                   default_target=default_target,
                                   default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 
 
 __all__ = ['create_data_handling']
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index 30be806b0adee90cbd61f47769023ee8f0e68743..0f5ddb431a869f3326f25b46a4f276268d2afd44 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -22,7 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.
 
@@ -30,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU
                             by default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)
@@ -47,8 +56,13 @@ class SerialDataHandling(DataHandling):
 
         if not array_handler:
             try:
-                self.array_handler = GPUArrayHandler()
-            except Exception:
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
                 self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler
diff --git a/pystencils/gpu/gpu_array_handler.py b/pystencils/gpu/gpu_array_handler.py
index 9031036f42a29f5a8adb2e08905c990ec8b441e8..ab5cad9a8e0c9a0a98905343214312ca7f03541c 100644
--- a/pystencils/gpu/gpu_array_handler.py
+++ b/pystencils/gpu/gpu_array_handler.py
@@ -6,30 +6,28 @@ except ImportError:
     cpx = None
 
 import numpy as np
-import pystencils
 
 
 class GPUArrayHandler:
-    @staticmethod
-    def zeros(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def __init__(self, device_number):
+        self._device_number = device_number
+
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.zeros(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def ones(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def ones(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.ones(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def empty(shape, dtype=np.float64, order='C'):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def empty(self, shape, dtype=np.float64, order='C'):
+        with cp.cuda.Device(self._device_number):
             return cp.empty(shape=shape, dtype=dtype, order=order)
 
-    @staticmethod
-    def to_gpu(numpy_array):
+    def to_gpu(self, numpy_array):
         swaps = _get_index_swaps(numpy_array)
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 gpu_array = cp.asarray(numpy_array.base)
             for a, b in reversed(swaps):
                 gpu_array = gpu_array.swapaxes(a, b)
@@ -37,27 +35,26 @@ class GPUArrayHandler:
         else:
             return cp.asarray(numpy_array)
 
-    @staticmethod
-    def upload(array, numpy_array):
+    def upload(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.base.set(numpy_array.base)
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 array.set(numpy_array)
 
-    @staticmethod
-    def download(array, numpy_array):
+    def download(self, array, numpy_array):
+        assert self._device_number == array.device.id
         if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array.base[:] = array.base.get()
         else:
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            with cp.cuda.Device(self._device_number):
                 numpy_array[:] = array.get()
 
-    @staticmethod
-    def randn(shape, dtype=np.float64):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+    def randn(self, shape, dtype=np.float64):
+        with cp.cuda.Device(self._device_number):
             return cp.random.randn(*shape, dtype=dtype)
 
     @staticmethod
diff --git a/pystencils/gpu/gpujit.py b/pystencils/gpu/gpujit.py
index efa5af826df117b22d26a8f766a0c8b1730ee3f4..52268924126870508c921a8569f6579707e72fda 100644
--- a/pystencils/gpu/gpujit.py
+++ b/pystencils/gpu/gpujit.py
@@ -1,6 +1,5 @@
 import numpy as np
 
-import pystencils
 from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
 from pystencils.typing import StructType
@@ -75,7 +74,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                                        for k, v in kwargs.items()))
         try:
             args, block_and_thread_numbers = cache[key]
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
         except KeyError:
             full_arguments = argument_dict.copy()
@@ -90,11 +91,12 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
             args = tuple(_build_numpy_argument_list(parameters, full_arguments))
             cache[key] = (args, block_and_thread_numbers)
             cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            with cp.cuda.Device(pystencils.GPU_DEVICE):
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
                 func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
-        # useful for debugging:
-        # with cp.cuda.Device(pystencils.GPU_DEVICE):
-        #     cp.cuda.runtime.deviceSynchronize()
+            # useful for debugging:
+            # cp.cuda.runtime.deviceSynchronize()
         # cuda.Context.synchronize()  # useful for debugging, to get errors right after kernel was called
 
     ast = kernel_function_node
diff --git a/pystencils/gpu/kernelcreation.py b/pystencils/gpu/kernelcreation.py
index f328b75d57aba97f6294f84a4bf57960e4c174f1..066038cde246934d1694ee20613434ac9e3dc678 100644
--- a/pystencils/gpu/kernelcreation.py
+++ b/pystencils/gpu/kernelcreation.py
@@ -2,7 +2,6 @@ from typing import Union
 
 import numpy as np
 
-import pystencils
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.config import CreateKernelConfig
 from pystencils.typing import StructType, TypedSymbol
@@ -11,7 +10,7 @@ from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
 from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
-from pystencils.gpu.indexing import indexing_creator_from_params, BlockIndexing
+from pystencils.gpu.indexing import indexing_creator_from_params
 from pystencils.simp.assignment_collection import AssignmentCollection
 from pystencils.transformations import (
     get_base_buffer_index, get_common_field, parse_base_pointer_info,
@@ -22,8 +21,6 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                        config: CreateKernelConfig):
 
     function_name = config.function_name
-    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
-        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
     iteration_slice = config.iteration_slice
     ghost_layers = config.ghost_layers
@@ -123,8 +120,6 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
     index_fields = config.index_fields
     function_name = config.function_name
     coordinate_names = config.coordinate_names
-    if isinstance(config.gpu_indexing, BlockIndexing) and "device_number" not in config.gpu_indexing_params:
-        config.gpu_indexing_params["device_number"] = pystencils.GPU_DEVICE
     indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
 
     fields_written = assignments.bound_fields
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index 7f73a865b036986e899c233045fe0d6387953696..15e9cd74baf7df4e8922ce5f4a624a1fc0f6fb75 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -15,6 +15,12 @@ except ImportError:
     import unittest.mock
     pytest = unittest.mock.MagicMock()
 
+try:
+    import cupy.cuda.runtime
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 SCRIPT_FOLDER = Path(__file__).parent.absolute()
 INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
 
@@ -365,10 +371,11 @@ def test_load_data():
     assert np.all(dh.cpu_arrays['dst2']) == 0
 
 
-def test_array_handler():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_array_handler(device_number):
     size = (2, 2)
     pytest.importorskip('cupy')
-    array_handler = GPUArrayHandler()
+    array_handler = GPUArrayHandler(device_number)
 
     zero_array = array_handler.zeros(size)
     cpu_array = np.empty(size)
diff --git a/pystencils_tests/test_gpu.py b/pystencils_tests/test_gpu.py
index b0af7950da352c633d73d7115bf1ea661f884937..df452e2b1bd2ec55ecc57c0325ba994a3dac894f 100644
--- a/pystencils_tests/test_gpu.py
+++ b/pystencils_tests/test_gpu.py
@@ -1,14 +1,21 @@
+import pytest
+
 import numpy as np
 import cupy as cp
 import sympy as sp
 from scipy.ndimage import convolve
 
-import pystencils
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
 from pystencils.gpu import BlockIndexing
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
 
+try:
+    import cupy
+    device_numbers = range(cupy.cuda.runtime.getDeviceCount())
+except ImportError:
+    device_numbers = []
+
 
 def test_averaging_kernel():
     size = (40, 55)
@@ -154,7 +161,8 @@ def test_periodicity():
     np.testing.assert_equal(cpu_result, gpu_result)
 
 
-def test_block_indexing():
+@pytest.mark.parametrize("device_number", device_numbers)
+def test_block_indexing(device_number):
     f = fields("f: [3D]")
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False)
     assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
@@ -164,16 +172,16 @@
     assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
 
     bi = BlockIndexing(f, make_slice[:, :, :], block_size=(16, 8, 2),
-                       maximum_block_size="auto", device_number=pystencils.GPU_DEVICE)
+                       maximum_block_size="auto", device_number=device_number)
 
     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
     registers_per_thread = 1000
     blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
 
     if cp.cuda.runtime.is_hip:
-        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
+        max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
     else:
-        device = cp.cuda.Device(pystencils.GPU_DEVICE)
+        device = cp.cuda.Device(device_number)
         da = device.attributes
         max_registers_per_block = da.get("MaxRegistersPerBlock")
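
Usage sketch (not part of the patch): with `GPU_DEVICE` gone, the device is now chosen per data handling via the new `device_number` parameter, which is passed down to `GPUArrayHandler`. The snippet below is a minimal illustration, assuming the package-level `create_data_handling` and `Target` exports and a machine with at least two visible CUDA devices; when `device_number` is omitted, the device with the largest amount of memory is selected automatically, as described in the docstrings above.

```python
import pystencils as ps

# Place all arrays of this data handling on GPU 1 explicitly
# (hypothetical setup: requires a second CUDA device to be visible).
dh = ps.create_data_handling(domain_size=(64, 64),
                             default_target=ps.Target.GPU,
                             device_number=1)
src = dh.add_array('src', values_per_cell=1)

# Omitting device_number falls back to the device with the largest memory.
dh_auto = ps.create_data_handling(domain_size=(64, 64), default_target=ps.Target.GPU)
```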