From 08ed41d1bd74040c227fb73601dee7429327a450 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Sat, 24 Jun 2023 08:23:33 +0200
Subject: [PATCH] Implement Pinned GPU memory

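Host arrays that have a GPU counterpart are now allocated as pinned
(page-locked) memory through cupyx.empty_pinned, which enables faster
and potentially asynchronous host/device transfers. Since empty_pinned
only produces contiguous arrays, the new helper
GPUArrayHandler.pinned_numpy_array allocates the array with permuted
axes and swaps them back, so the returned view carries the requested
field layout. to_gpu, upload and download recognise such transposed
views via their base array and replay the same axis swaps on the
device side, keeping CPU and GPU strides identical.

A minimal usage sketch (not part of the patch; requires a working cupy
installation; domain size, layout, dtype and the 19 values per cell are
arbitrary example values):

    import numpy as np
    import pystencils as ps
    from pystencils import create_data_handling
    from pystencils.gpu.gpu_array_handler import GPUArrayHandler

    # Fields added to a GPU-targeted data handling now get a pinned CPU array.
    dh = create_data_handling(domain_size=(17, 12), default_layout='fzyx',
                              default_target=ps.Target.GPU)
    f = dh.add_array("f", values_per_cell=19)
    dh.fill(f.name, 1.0)
    # Shape, strides and dtype of the CPU and GPU arrays match.
    assert dh.cpu_arrays[f.name].strides == dh.gpu_arrays[f.name].strides

    # Direct use of the array handler: a pinned array in a transposed layout.
    pinned = GPUArrayHandler.pinned_numpy_array(layout=(1, 0), shape=(32, 16),
                                                dtype=np.float64)
    gpu_arr = GPUArrayHandler.to_gpu(pinned)
    assert gpu_arr.strides == pinned.strides
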
---
 .../datahandling/serial_datahandling.py       |  8 ++-
 pystencils/gpu/gpu_array_handler.py           | 60 +++++++++++++++++--
 pystencils_tests/test_datahandling.py         | 29 +++++++++
 3 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index eb352202d..30be806b0 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -127,8 +127,12 @@ class SerialDataHandling(DataHandling):
 
         # cpu_arr is always created - since there is no create_gpu_array_with_layout()
         byte_offset = ghost_layers * np.dtype(dtype).itemsize
-        cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
-                                                 byte_offset=byte_offset, **kwargs)
+
+        if gpu:
+            cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
+        else:
+            cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
+                                                     byte_offset=byte_offset, **kwargs)
 
         if alignment and gpu:
             raise NotImplementedError("Alignment for GPU fields not supported")
diff --git a/pystencils/gpu/gpu_array_handler.py b/pystencils/gpu/gpu_array_handler.py
index 9aa0db386..9031036f4 100644
--- a/pystencils/gpu/gpu_array_handler.py
+++ b/pystencils/gpu/gpu_array_handler.py
@@ -1,7 +1,9 @@
 try:
     import cupy as cp
+    import cupyx as cpx
 except ImportError:
     cp = None
+    cpx = None
 
 import numpy as np
 import pystencils
@@ -25,27 +27,75 @@ class GPUArrayHandler:
 
     @staticmethod
     def to_gpu(numpy_array):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
+        swaps = _get_index_swaps(numpy_array)
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            with cp.cuda.Device(pystencils.GPU_DEVICE):
+                gpu_array = cp.asarray(numpy_array.base)
+            for a, b in reversed(swaps):
+                gpu_array = gpu_array.swapaxes(a, b)
+            return gpu_array
+        else:
             return cp.asarray(numpy_array)
 
     @staticmethod
     def upload(array, numpy_array):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
-            array.set(numpy_array)
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            with cp.cuda.Device(pystencils.GPU_DEVICE):
+                array.base.set(numpy_array.base)
+        else:
+            with cp.cuda.Device(pystencils.GPU_DEVICE):
+                array.set(numpy_array)
 
     @staticmethod
     def download(array, numpy_array):
-        with cp.cuda.Device(pystencils.GPU_DEVICE):
-            numpy_array[:] = array.get()
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            with cp.cuda.Device(pystencils.GPU_DEVICE):
+                numpy_array.base[:] = array.base.get()
+        else:
+            with cp.cuda.Device(pystencils.GPU_DEVICE):
+                numpy_array[:] = array.get()
 
     @staticmethod
     def randn(shape, dtype=np.float64):
         with cp.cuda.Device(pystencils.GPU_DEVICE):
             return cp.random.randn(*shape, dtype=dtype)
 
+    @staticmethod
+    def pinned_numpy_array(layout, shape, dtype):
+        assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
+        cur_layout = list(range(len(shape)))
+        swaps = []
+        for i in range(len(layout)):
+            if cur_layout[i] != layout[i]:
+                index_to_swap_with = cur_layout.index(layout[i])
+                swaps.append((i, index_to_swap_with))
+                cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
+        assert tuple(cur_layout) == tuple(layout)
+
+        shape = list(shape)
+        for a, b in swaps:
+            shape[a], shape[b] = shape[b], shape[a]
+
+        res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)
+
+        for a, b in reversed(swaps):
+            res = res.swapaxes(a, b)
+        return res
+
     from_numpy = to_gpu
 
 
 class GPUNotAvailableHandler:
     def __getattribute__(self, name):
         raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")
+
+
+def _get_index_swaps(array):
+    swaps = []
+    if array.base is not None and isinstance(array.base, np.ndarray):
+        for stride in array.base.strides:
+            index_base = array.base.strides.index(stride)
+            index_view = array.strides.index(stride)
+            if index_base != index_view and (index_view, index_base) not in swaps:
+                swaps.append((index_base, index_view))
+    return swaps
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index 0f48ac2ca..7f73a865b 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -251,6 +251,20 @@ def test_add_arrays():
     assert y == dh.fields['y']
 
 
+@pytest.mark.parametrize('shape', [(17, 12), (7, 11, 18)])
+@pytest.mark.parametrize('layout', ['zyxf', 'fzyx'])
+def test_add_arrays_with_layout(shape, layout):
+    pytest.importorskip('cupy')
+
+    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU)
+    f1 = dh.add_array("f1", values_per_cell=19)
+    dh.fill(f1.name, 1.0)
+
+    assert dh.cpu_arrays[f1.name].shape == dh.gpu_arrays[f1.name].shape
+    assert dh.cpu_arrays[f1.name].strides == dh.gpu_arrays[f1.name].strides
+    assert dh.cpu_arrays[f1.name].dtype == dh.gpu_arrays[f1.name].dtype
+
+
 def test_get_kwarg():
     domain_shape = (10, 10)
     field_description = 'src, dst'
@@ -373,3 +387,18 @@ def test_array_handler():
 
     random_array = array_handler.randn(size)
 
+    cpu_array = np.empty((20, 40), dtype=np.float64)
+    gpu_array = array_handler.to_gpu(cpu_array)
+
+    assert cpu_array.base is None
+    assert gpu_array.base is None
+    assert gpu_array.strides == cpu_array.strides
+
+    cpu_array2 = np.empty((20, 40), dtype=np.float64)
+    cpu_array2 = cpu_array2.swapaxes(0, 1)
+    gpu_array2 = array_handler.to_gpu(cpu_array2)
+
+    assert cpu_array2.base is not None
+    assert gpu_array2.base is not None
+    assert gpu_array2.strides == cpu_array2.strides
+
-- 
GitLab