[clang] 32c26e2 - CUDA/HIP: Use kernel name to map to symbol
Daniele Castagna via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 19 15:02:26 PST 2023
Author: Daniele Castagna
Date: 2023-01-19T15:02:14-08:00
New Revision: 32c26e27b6fcd12703dcd00adf178330d0ad8449
URL: https://github.com/llvm/llvm-project/commit/32c26e27b6fcd12703dcd00adf178330d0ad8449
DIFF: https://github.com/llvm/llvm-project/commit/32c26e27b6fcd12703dcd00adf178330d0ad8449.diff
LOG: CUDA/HIP: Use kernel name to map to symbol
Currently CGCUDANV uses an llvm::Function as a key to map kernels to a
symbol in host code. HIP adds one level of indirection and uses the
llvm::Function to map to a global variable that will be initialized to
the kernel stub ptr.
Unfortunately there is no garantee that the llvm::Function created
by GetOrCreateLLVMFunction will be the same. In fact, the first
time we encounter GetOrCrateLLVMFunction for a kernel, the type
might not be completed yet, and the type of llvm::Function will be
a generic {}, since the complete type is not required to get a symbol
to a function. In this case we end up creating two global variables,
one for the llvm::Function with the incomplete type and one for the
function with the complete type. The first global variable will be
declared by not defined, resulting in a linking error.
This change uses the mangled name of the llvm::Function as key in the
KernelHandles map, in this way the same llvm::Function will be
associated to the same kernel handle even if they types are different.
Reviewed By: yaxunl
Differential Revision: https://reviews.llvm.org/D140663
Added:
clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
Modified:
clang/lib/CodeGen/CGCUDANV.cpp
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index abf320996dc4d..bb887df3e4e04 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -49,10 +49,10 @@ class CGNVCUDARuntime : public CGCUDARuntime {
const Decl *D;
};
llvm::SmallVector<KernelInfo, 16> EmittedKernels;
- // Map a device stub function to a symbol for identifying kernel in host code.
+ // Map a kernel mangled name to a symbol for identifying kernel in host code
// For CUDA, the symbol for identifying the kernel is the same as the device
// stub function. For HIP, they are
diff erent.
- llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
+ llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
// Map a kernel handle to the kernel stub.
llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
struct VarInfo {
@@ -310,7 +310,8 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
FunctionArgList &Args) {
EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
- if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
+ if (auto *GV =
+ dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
GV->setLinkage(CGF.CurFn->getLinkage());
GV->setInitializer(CGF.CurFn);
}
@@ -400,8 +401,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
ShmemSize.getPointer(), Stream.getPointer()});
// Emit the call to cudaLaunch
- llvm::Value *Kernel =
- CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
+ llvm::Value *Kernel = CGF.Builder.CreatePointerCast(
+ KernelHandles[CGF.CurFn->getName()], VoidPtrTy);
CallArgList LaunchKernelArgs;
LaunchKernelArgs.add(RValue::get(Kernel),
cudaLaunchKernelFD->getParamDecl(0)->getType());
@@ -456,8 +457,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
// Emit the call to cudaLaunch
llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
- llvm::Value *Arg =
- CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
+ llvm::Value *Arg = CGF.Builder.CreatePointerCast(
+ KernelHandles[CGF.CurFn->getName()], CharPtrTy);
CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
CGF.EmitBranch(EndBlock);
@@ -551,7 +552,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
llvm::Value *Args[] = {
&GpuBinaryHandlePtr,
- Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
+ Builder.CreateBitCast(KernelHandles[I.Kernel->getName()], VoidPtrTy),
KernelName,
KernelName,
llvm::ConstantInt::get(IntTy, -1),
@@ -1130,7 +1131,7 @@ void CGNVCUDARuntime::createOffloadingEntries() {
StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
: "cuda_offloading_entries";
for (KernelInfo &I : EmittedKernels)
- OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel],
+ OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel->getName()],
getDeviceSideName(cast<NamedDecl>(I.D)), 0,
DeviceVarFlags::OffloadGlobalEntry, Section);
@@ -1193,12 +1194,12 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
GlobalDecl GD) {
- auto Loc = KernelHandles.find(F);
+ auto Loc = KernelHandles.find(F->getName());
if (Loc != KernelHandles.end())
return Loc->second;
if (!CGM.getLangOpts().HIP) {
- KernelHandles[F] = F;
+ KernelHandles[F->getName()] = F;
KernelStubs[F] = F;
return F;
}
@@ -1212,7 +1213,7 @@ llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
Var->setDSOLocal(F->isDSOLocal());
Var->setVisibility(F->getVisibility());
CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
- KernelHandles[F] = Var;
+ KernelHandles[F->getName()] = Var;
KernelStubs[Var] = F;
return Var;
}
diff --git a/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu b/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
new file mode 100644
index 0000000000000..bd1da1f05c1eb
--- /dev/null
+++ b/clang/test/CodeGenCUDA/incomplete-func-ptr-type.cu
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - \
+// RUN: | FileCheck %s
+
+#define __global__ __attribute__((global))
+// CHECK: @_Z4kern7TempValIjE = constant ptr @_Z19__device_stub__kern7TempValIjE, align 8
+// CHECK: @0 = private unnamed_addr constant [19 x i8] c"_Z4kern7TempValIjE\00", align 1
+template <typename type>
+struct TempVal {
+ type value;
+};
+
+__global__ void kern(TempVal<unsigned int> in_val);
+
+int main(int argc, char ** argv) {
+ auto* fptr = &(kern);
+// CHECK: store ptr @_Z4kern7TempValIjE, ptr %fptr, align 8
+ return 0;
+}
+// CHECK: define dso_local void @_Z19__device_stub__kern7TempValIjE(i32 %in_val.coerce) #1 {
+// CHECK: %2 = call i32 @hipLaunchByPtr(ptr @_Z4kern7TempValIjE)
+
+// CHECK: define internal void @__hip_register_globals(ptr %0) {
+// CHECK: %1 = call i32 @__hipRegisterFunction(ptr %0, ptr @_Z4kern7TempValIjE, ptr @0, ptr @0, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null)
+
+__global__ void kern(TempVal<unsigned int> in_val) {
+}
+
More information about the cfe-commits
mailing list