From e20d47da5f269e38052645f510727f323f2d6be9 Mon Sep 17 00:00:00 2001
From: Michael Kuron <m.kuron@gmx.de>
Date: Thu, 29 Jun 2023 20:19:07 +0200
Subject: [PATCH] Make AMD GPU support compatible with both hipcc and hiprtc

---
 pystencils/gpu/gpujit.py                 |  3 +++
 pystencils/include/gpu_defines.h         |  7 +++++++
 pystencils/include/philox_rand.h         | 10 ++++++----
 pystencils_tests/test_custom_backends.py |  2 +-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pystencils/gpu/gpujit.py b/pystencils/gpu/gpujit.py
index e29f85d43..efa5af826 100644
--- a/pystencils/gpu/gpujit.py
+++ b/pystencils/gpu/gpujit.py
@@ -42,6 +42,9 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
     headers = get_headers(kernel_function_node)
     if cp.cuda.runtime.is_hip:
         headers.add('"gpu_defines.h"')
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<hip/hip_fp16.h>')
     else:
         headers.update({'"gpu_defines.h"', '<cstdint>'})
         for field in kernel_function_node.fields_accessed:
diff --git a/pystencils/include/gpu_defines.h b/pystencils/include/gpu_defines.h
index 2d9875db2..67e7722e9 100644
--- a/pystencils/include/gpu_defines.h
+++ b/pystencils/include/gpu_defines.h
@@ -3,3 +3,10 @@
 #define POS_INFINITY __int_as_float(0x7f800000)
 #define INFINITY POS_INFINITY
 #define NEG_INFINITY __int_as_float(0xff800000)
+
+#ifdef __HIPCC_RTC__
+typedef __hip_uint8_t uint8_t;
+typedef __hip_int8_t int8_t;
+typedef __hip_uint16_t uint16_t;
+typedef __hip_int16_t int16_t;
+#endif
diff --git a/pystencils/include/philox_rand.h b/pystencils/include/philox_rand.h
index fab941468..cb91b53b9 100644
--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
@@ -1,4 +1,4 @@
-#ifndef __OPENCL_VERSION__
+#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
 #if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
@@ -38,7 +38,7 @@
 #endif
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
 #define QUALIFIERS static inline
@@ -59,7 +59,9 @@
 typedef uint32_t uint32;
 typedef uint64_t uint64;
 #else
+#ifndef __HIPCC_RTC__
 #include <cstdint>
+#endif
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 #endif
@@ -75,7 +77,7 @@ typedef svfloat64_t svfloat64_st;
 
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     // host code
 #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
     *hip = __mulhwu(a,b);
@@ -186,7 +188,7 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 #endif
 }
 
-#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
+#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
 #if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
diff --git a/pystencils_tests/test_custom_backends.py b/pystencils_tests/test_custom_backends.py
index 9b625f8f9..c7bf7fe24 100644
--- a/pystencils_tests/test_custom_backends.py
+++ b/pystencils_tests/test_custom_backends.py
@@ -47,5 +47,5 @@ def test_custom_backends_gpu():
 
     ast = pystencils.create_kernel(normal_assignments, target=Target.GPU)
     pystencils.show_code(ast, ScreamingGpuBackend())
-    with pytest.raises(cupy.cuda.compiler.JitifyException):
+    with pytest.raises((cupy.cuda.compiler.JitifyException, cupy.cuda.compiler.CompileException)):
         pystencils.gpu.gpujit.make_python_function(ast, custom_backend=ScreamingGpuBackend())
-- 
GitLab