[clang] [clang-repl][CUDA] Move CUDA module registration to beginning of global_ctors (PR #66658)

Mon Sep 18 08:51:00 PDT 2023

https://github.com/argentite updated https://github.com/llvm/llvm-project/pull/66658

>From bed2919f781c5ef71e268c95b31a6b9af5392730 Mon Sep 17 00:00:00 2001
From: Anubhab Ghosh <anubhabghosh.me at gmail.com>
Date: Mon, 18 Sep 2023 20:33:19 +0530
Subject: [PATCH] [clang-repl][CUDA] Move CUDA module registration to beginning
 of global_ctors

CUDA device code needs to be registered to the runtime before kernels
can be launched. This is done through a global constructor.
User code in Clang interpreter, is also executed through global_ctors.
This patch ensures kernels can be launched in the same iteration it is
defined in by making the registration first in the list.
---
 clang/lib/CodeGen/CodeGenModule.cpp           |  2 +-
 .../test/Interpreter/CUDA/launch-same-ptu.cu  | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Interpreter/CUDA/launch-same-ptu.cu

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 8b0c9340775cbe9..647c8922f27a00f 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -794,7 +794,7 @@ void CodeGenModule::Release() {
       AddGlobalCtor(ObjCInitFunction);
   if (Context.getLangOpts().CUDA && CUDARuntime) {
     if (llvm::Function *CudaCtorFunction = CUDARuntime->finalizeModule())
-      AddGlobalCtor(CudaCtorFunction);
+      AddGlobalCtor(CudaCtorFunction, /*Priority=*/0);
   }
   if (OpenMPRuntime) {
     if (llvm::Function *OpenMPRequiresDirectiveRegFun =
diff --git a/clang/test/Interpreter/CUDA/launch-same-ptu.cu b/clang/test/Interpreter/CUDA/launch-same-ptu.cu
new file mode 100644
index 000000000000000..93e203a47212fbf
--- /dev/null
+++ b/clang/test/Interpreter/CUDA/launch-same-ptu.cu
@@ -0,0 +1,21 @@
+// Tests __device__ function calls
+// RUN: cat %s | clang-repl --cuda | FileCheck %s
+
+extern "C" int printf(const char*, ...);
+
+int var;
+int* devptr = nullptr;
+printf("cudaMalloc: %d\n", cudaMalloc((void **) &devptr, sizeof(int)));
+// CHECK: cudaMalloc: 0
+
+__device__ inline void test_device(int* value) { *value = 42; } __global__ void test_kernel(int* value) { test_device(value); } test_kernel<<<1,1>>>(devptr);
+printf("CUDA Error: %d\n", cudaGetLastError());
+// CHECK-NEXT: CUDA Error: 0
+
+printf("cudaMemcpy: %d\n", cudaMemcpy(&var, devptr, sizeof(int), cudaMemcpyDeviceToHost));
+// CHECK-NEXT: cudaMemcpy: 0
+
+printf("Value: %d\n", var);
+// CHECK-NEXT: Value: 42
+
+%quit