[clang] 32c26e2 - CUDA/HIP: Use kernel name to map to symbol

Thu Jan 19 15:02:26 PST 2023

Author: Daniele Castagna
Date: 2023-01-19T15:02:14-08:00
New Revision: 32c26e27b6fcd12703dcd00adf178330d0ad8449

URL: https://github.com/llvm/llvm-project/commit/32c26e27b6fcd12703dcd00adf178330d0ad8449
DIFF: https://github.com/llvm/llvm-project/commit/32c26e27b6fcd12703dcd00adf178330d0ad8449.diff

LOG: CUDA/HIP: Use kernel name to map to symbol

Currently CGCUDANV uses an llvm::Function as a key to map kernels to a
symbol in host code.  HIP adds one level of indirection and uses the
llvm::Function to map to a global variable that will be initialized to
the kernel stub ptr.

Unfortunately there is no garantee that the llvm::Function created
by GetOrCreateLLVMFunction will be the same.  In fact, the first
time we encounter GetOrCrateLLVMFunction for a kernel, the type
might not be completed yet, and the type of llvm::Function will be
a generic {}, since the complete type is not required to get a symbol
to a function.  In this case we end up creating two global variables,
one for the llvm::Function with the incomplete type and one for the
function with the complete type. The first global variable will be
declared by not defined, resulting in a linking error.

This change uses the mangled name of the llvm::Function as key in the
KernelHandles map, in this way the same llvm::Function will be
associated to the same kernel handle even if they types are different.

Reviewed By: yaxunl

Differential Revision: https://reviews.llvm.org/D140663

Added: 
    clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu

Modified: 
    clang/lib/CodeGen/CGCUDANV.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index abf320996dc4d..bb887df3e4e04 100644

--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -49,10 +49,10 @@ class CGNVCUDARuntime : public CGCUDARuntime {
     const Decl *D;
   };
   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
-  // Map a device stub function to a symbol for identifying kernel in host code.
+  // Map a kernel mangled name to a symbol for identifying kernel in host code
   // For CUDA, the symbol for identifying the kernel is the same as the device
   // stub function. For HIP, they are 
diff erent.
-  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
+  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
   // Map a kernel handle to the kernel stub.
   llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
   struct VarInfo {
@@ -310,7 +310,8 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
-  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
+  if (auto *GV =
+          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
     GV->setLinkage(CGF.CurFn->getLinkage());
     GV->setInitializer(CGF.CurFn);
   }
@@ -400,8 +401,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                ShmemSize.getPointer(), Stream.getPointer()});
 
   // Emit the call to cudaLaunch
-  llvm::Value *Kernel =
-      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
+  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(
+      KernelHandles[CGF.CurFn->getName()], VoidPtrTy);
   CallArgList LaunchKernelArgs;
   LaunchKernelArgs.add(RValue::get(Kernel),
                        cudaLaunchKernelFD->getParamDecl(0)->getType());
@@ -456,8 +457,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
 
   // Emit the call to cudaLaunch
   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
-  llvm::Value *Arg =
-      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
+  llvm::Value *Arg = CGF.Builder.CreatePointerCast(
+      KernelHandles[CGF.CurFn->getName()], CharPtrTy);
   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
   CGF.EmitBranch(EndBlock);
 
@@ -551,7 +552,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
     llvm::Value *Args[] = {
         &GpuBinaryHandlePtr,
-        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
+        Builder.CreateBitCast(KernelHandles[I.Kernel->getName()], VoidPtrTy),
         KernelName,
         KernelName,
         llvm::ConstantInt::get(IntTy, -1),
@@ -1130,7 +1131,7 @@ void CGNVCUDARuntime::createOffloadingEntries() {
   StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                             : "cuda_offloading_entries";
   for (KernelInfo &I : EmittedKernels)
-    OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel],
+    OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel->getName()],
                                    getDeviceSideName(cast<NamedDecl>(I.D)), 0,
                                    DeviceVarFlags::OffloadGlobalEntry, Section);
 
@@ -1193,12 +1194,12 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
 
 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                     GlobalDecl GD) {
-  auto Loc = KernelHandles.find(F);
+  auto Loc = KernelHandles.find(F->getName());
   if (Loc != KernelHandles.end())
     return Loc->second;
 
   if (!CGM.getLangOpts().HIP) {
-    KernelHandles[F] = F;
+    KernelHandles[F->getName()] = F;
     KernelStubs[F] = F;
     return F;
   }
@@ -1212,7 +1213,7 @@ llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
   Var->setDSOLocal(F->isDSOLocal());
   Var->setVisibility(F->getVisibility());
   CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
-  KernelHandles[F] = Var;
+  KernelHandles[F->getName()] = Var;
   KernelStubs[Var] = F;
   return Var;
 }

diff  --git a/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu b/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
new file mode 100644
index 0000000000000..bd1da1f05c1eb
--- /dev/null
+++ b/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - \
+// RUN: | FileCheck %s
+
+#define __global__ __attribute__((global))
+// CHECK: @_Z4kern7TempValIjE = constant ptr @_Z19__device_stub__kern7TempValIjE, align 8
+// CHECK: @0 = private unnamed_addr constant [19 x i8] c"_Z4kern7TempValIjE\00", align 1
+template <typename type>
+struct TempVal {
+  type value;
+};
+
+__global__ void kern(TempVal<unsigned int> in_val);
+
+int main(int argc, char ** argv) {
+  auto* fptr = &(kern);
+// CHECK:   store ptr @_Z4kern7TempValIjE, ptr %fptr, align 8
+  return 0;
+}
+// CHECK:  define dso_local void @_Z19__device_stub__kern7TempValIjE(i32 %in_val.coerce) #1 {
+// CHECK:  %2 = call i32 @hipLaunchByPtr(ptr @_Z4kern7TempValIjE)
+
+// CHECK:  define internal void @__hip_register_globals(ptr %0) {
+// CHECK:    %1 = call i32 @__hipRegisterFunction(ptr %0, ptr @_Z4kern7TempValIjE, ptr @0, ptr @0, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
+
+__global__ void kern(TempVal<unsigned int> in_val) {
+}
+