[clang] 5cf2a37 - [HIP] Emit kernel symbol

Mon Mar 1 13:32:09 PST 2021

Author: Yaxun (Sam) Liu
Date: 2021-03-01T16:31:40-05:00
New Revision: 5cf2a37f1255700d4da9d5f45e82bdfff09aee8c

URL: https://github.com/llvm/llvm-project/commit/5cf2a37f1255700d4da9d5f45e82bdfff09aee8c
DIFF: https://github.com/llvm/llvm-project/commit/5cf2a37f1255700d4da9d5f45e82bdfff09aee8c.diff

LOG: [HIP] Emit kernel symbol

Currently clang uses stub function to launch kernel. This is inconvenient
to interop with C++ programs since the stub function has different name
as kernel, which is required by ROCm debugger.

This patch emits a variable symbol which has the same name as the kernel
and uses it to register and launch the kernel. This allows C++ program to
launch a kernel by using the original kernel name.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D86376

Added: 
    clang/test/CodeGenCUDA/cxx-call-kernel.cpp

Modified: 
    clang/lib/CodeGen/CGCUDANV.cpp
    clang/lib/CodeGen/CGCUDARuntime.h
    clang/lib/CodeGen/CGExpr.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/test/CodeGenCUDA/Inputs/cuda.h
    clang/test/CodeGenCUDA/kernel-dbg-info.cu
    clang/test/CodeGenCUDA/kernel-stub-name.cu
    clang/test/CodeGenCUDA/unnamed-types.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 57e3b151bfd4..3a311ab395e4 100644

--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -42,12 +42,18 @@ class CGNVCUDARuntime : public CGCUDARuntime {
   llvm::LLVMContext &Context;
   /// Convenience reference to the current module
   llvm::Module &TheModule;
-  /// Keeps track of kernel launch stubs emitted in this module
+  /// Keeps track of kernel launch stubs and handles emitted in this module
   struct KernelInfo {
-    llvm::Function *Kernel;
+    llvm::Function *Kernel; // stub function to help launch kernel
     const Decl *D;
   };
   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
+  // Map a device stub function to a symbol for identifying kernel in host code.
+  // For CUDA, the symbol for identifying the kernel is the same as the device
+  // stub function. For HIP, they are 
diff erent.
+  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
+  // Map a kernel handle to the kernel stub.
+  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
   struct VarInfo {
     llvm::GlobalVariable *Var;
     const VarDecl *D;
@@ -154,6 +160,12 @@ class CGNVCUDARuntime : public CGCUDARuntime {
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
+  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
+  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
+    auto Loc = KernelStubs.find(Handle);
+    assert(Loc != KernelStubs.end());
+    return Loc->second;
+  }
   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
   void handleVarRegistration(const VarDecl *VD,
                              llvm::GlobalVariable &Var) override;
@@ -272,6 +284,10 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
+  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
+    GV->setLinkage(CGF.CurFn->getLinkage());
+    GV->setInitializer(CGF.CurFn);
+  }
   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
@@ -350,7 +366,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                ShmemSize.getPointer(), Stream.getPointer()});
 
   // Emit the call to cudaLaunch
-  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  llvm::Value *Kernel =
+      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
   CallArgList LaunchKernelArgs;
   LaunchKernelArgs.add(RValue::get(Kernel),
                        cudaLaunchKernelFD->getParamDecl(0)->getType());
@@ -405,7 +422,8 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
 
   // Emit the call to cudaLaunch
   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
-  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
+  llvm::Value *Arg =
+      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
   CGF.EmitBranch(EndBlock);
 
@@ -499,7 +517,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
     llvm::Value *Args[] = {
         &GpuBinaryHandlePtr,
-        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
+        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
         KernelName,
         KernelName,
         llvm::ConstantInt::get(IntTy, -1),
@@ -1070,3 +1088,28 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
   }
   return makeModuleCtorFunction();
 }
+
+llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
+                                                    GlobalDecl GD) {
+  auto Loc = KernelHandles.find(F);
+  if (Loc != KernelHandles.end())
+    return Loc->second;
+
+  if (!CGM.getLangOpts().HIP) {
+    KernelHandles[F] = F;
+    KernelStubs[F] = F;
+    return F;
+  }
+
+  auto *Var = new llvm::GlobalVariable(
+      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
+      /*Initializer=*/nullptr,
+      CGM.getMangledName(
+          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
+  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
+  Var->setDSOLocal(F->isDSOLocal());
+  Var->setVisibility(F->getVisibility());
+  KernelHandles[F] = Var;
+  KernelStubs[Var] = F;
+  return Var;
+}

diff  --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 2f4b7ab1dc6d..1c119dc77fd4 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 
+#include "clang/AST/GlobalDecl.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/GlobalValue.h"
 
@@ -94,6 +95,13 @@ class CGCUDARuntime {
   /// compilation is for host.
   virtual std::string getDeviceSideName(const NamedDecl *ND) = 0;
 
+  /// Get kernel handle by stub function.
+  virtual llvm::GlobalValue *getKernelHandle(llvm::Function *Stub,
+                                             GlobalDecl GD) = 0;
+
+  /// Get kernel stub by kernel handle.
+  virtual llvm::Function *getKernelStub(llvm::GlobalValue *Handle) = 0;
+
   /// Adjust linkage of shadow variables in host compilation.
   virtual void
   internalizeDeviceSideVar(const VarDecl *D,

diff  --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 05c553f22d60..d57dd7f49d50 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CGCUDARuntime.h"
 #include "CGCXXABI.h"
 #include "CGCall.h"
 #include "CGCleanup.h"
@@ -4871,8 +4872,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) {
       return CGCallee::forBuiltin(builtinID, FD);
   }
 
-  llvm::Constant *calleePtr = EmitFunctionDeclPointer(CGF.CGM, GD);
-  return CGCallee::forDirect(calleePtr, GD);
+  llvm::Constant *CalleePtr = EmitFunctionDeclPointer(CGF.CGM, GD);
+  if (CGF.CGM.getLangOpts().CUDA && !CGF.CGM.getLangOpts().CUDAIsDevice &&
+      FD->hasAttr<CUDAGlobalAttr>())
+    CalleePtr = CGF.CGM.getCUDARuntime().getKernelStub(
+        cast<llvm::GlobalValue>(CalleePtr->stripPointerCasts()));
+  return CGCallee::forDirect(CalleePtr, GD);
 }
 
 CGCallee CodeGenFunction::EmitCallee(const Expr *E) {
@@ -5266,6 +5271,19 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const CGCallee &OrigCallee
     Callee.setFunctionPointer(CalleePtr);
   }
 
+  // HIP function pointer contains kernel handle when it is used in triple
+  // chevron. The kernel stub needs to be loaded from kernel handle and used
+  // as callee.
+  if (CGM.getLangOpts().HIP && !CGM.getLangOpts().CUDAIsDevice &&
+      isa<CUDAKernelCallExpr>(E) &&
+      (!TargetDecl || !isa<FunctionDecl>(TargetDecl))) {
+    llvm::Value *Handle = Callee.getFunctionPointer();
+    Handle->dump();
+    auto *Cast =
+        Builder.CreateBitCast(Handle, Handle->getType()->getPointerTo());
+    auto *Stub = Builder.CreateLoad(Address(Cast, CGM.getPointerAlign()));
+    Callee.setFunctionPointer(Stub);
+  }
   llvm::CallBase *CallOrInvoke = nullptr;
   RValue Call = EmitCall(FnInfo, Callee, ReturnValue, Args, &CallOrInvoke,
                          E->getExprLoc());

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 0d499a564039..750439dd6844 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3571,9 +3571,19 @@ llvm::Constant *CodeGenModule::GetAddrOfFunction(GlobalDecl GD,
   }
 
   StringRef MangledName = getMangledName(GD);
-  return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
-                                 /*IsThunk=*/false, llvm::AttributeList(),
-                                 IsForDefinition);
+  auto *F = GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
+                                    /*IsThunk=*/false, llvm::AttributeList(),
+                                    IsForDefinition);
+  // Returns kernel handle for HIP kernel stub function.
+  if (LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
+      cast<FunctionDecl>(GD.getDecl())->hasAttr<CUDAGlobalAttr>()) {
+    auto *Handle = getCUDARuntime().getKernelHandle(
+        cast<llvm::Function>(F->stripPointerCasts()), GD);
+    if (IsForDefinition)
+      return F;
+    return llvm::ConstantExpr::getBitCast(Handle, Ty->getPointerTo());
+  }
+  return F;
 }
 
 static const FunctionDecl *

diff  --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index daa6328c9499..af395b3b97bb 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -2,6 +2,7 @@
 
 #include <stddef.h>
 
+#if __HIP__ || __CUDA__
 #define __constant__ __attribute__((constant))
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -11,13 +12,22 @@
 #define __managed__ __attribute__((managed))
 #endif
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#endif
 
 struct dim3 {
   unsigned x, y, z;
   __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
 };
 
-#ifdef __HIP__
+#if __HIP__ || HIP_PLATFORM
 typedef struct hipStream *hipStream_t;
 typedef enum hipError {} hipError_t;
 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,

diff  --git a/clang/test/CodeGenCUDA/cxx-call-kernel.cpp b/clang/test/CodeGenCUDA/cxx-call-kernel.cpp
new file mode 100644
index 000000000000..ae58dcd348ce
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cxx-call-kernel.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc
+// RUN: %clang_cc1 -mlink-bitcode-file %t.hip.bc -DHIP_PLATFORM -emit-llvm \
+// RUN:   %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: @_Z2g1i = constant void (i32)* @_Z17__device_stub__g1i, align 8
+#if __HIP__
+__global__ void g1(int x) {}
+#else
+extern void g1(int x);
+
+// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i
+void test() {
+  hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0);
+}
+
+// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i
+#endif

diff  --git a/clang/test/CodeGenCUDA/kernel-dbg-info.cu b/clang/test/CodeGenCUDA/kernel-dbg-info.cu
index 4aa1f353cc5e..7e8522a371d7 100644
--- a/clang/test/CodeGenCUDA/kernel-dbg-info.cu
+++ b/clang/test/CodeGenCUDA/kernel-dbg-info.cu
@@ -30,6 +30,9 @@ extern "C" __global__ void ckernel(int *a) {
   *a = 1;
 }
 
+// Kernel symbol for launching kernel.
+// CHECK: @[[SYM:ckernel]] = constant void (i32*)* @__device_stub__ckernel, align 8
+
 // Device side kernel names
 // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"
 
@@ -40,7 +43,7 @@ extern "C" __global__ void ckernel(int *a) {
 // Make sure there is no !dbg between function attributes and '{'
 // CHECK: define{{.*}} void @[[CSTUB:__device_stub__ckernel]]{{.*}} #{{[0-9]+}} {
 // CHECK-NOT: call {{.*}}@hipLaunchByPtr{{.*}}!dbg
-// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
+// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[SYM]]
 // CHECK-NOT: ret {{.*}}!dbg
 
 // CHECK-LABEL: define {{.*}}@_Z8hostfuncPi{{.*}}!dbg

diff  --git a/clang/test/CodeGenCUDA/kernel-stub-name.cu b/clang/test/CodeGenCUDA/kernel-stub-name.cu
index b66a7ba190c0..0c504b612ea7 100644
--- a/clang/test/CodeGenCUDA/kernel-stub-name.cu
+++ b/clang/test/CodeGenCUDA/kernel-stub-name.cu
@@ -2,10 +2,17 @@
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CHECK
+// RUN:   | FileCheck %s
 
 #include "Inputs/cuda.h"
 
+// Kernel handles
+
+// CHECK: @[[HCKERN:ckernel]] = constant void ()* @__device_stub__ckernel, align 8
+// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @_ZN2ns23__device_stub__nskernelEv, align 8
+// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @_Z25__device_stub__kernelfuncIiEvv, align 8
+// CHECK: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
+
 extern "C" __global__ void ckernel() {}
 
 namespace ns {
@@ -17,6 +24,11 @@ __global__ void kernelfunc() {}
 
 __global__ void kernel_decl();
 
+void (*kernel_ptr)();
+void *void_ptr;
+
+void launch(void *kern);
+
 // Device side kernel names
 
 // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"
@@ -26,16 +38,20 @@ __global__ void kernel_decl();
 // Non-template kernel stub functions
 
 // CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
 // CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
+
 
-// CHECK-LABEL: define{{.*}}@_Z8hostfuncv()
+// Check kernel stub is used for triple chevron
+
+// CHECK-LABEL: define{{.*}}@_Z4fun1v()
 // CHECK: call void @[[CSTUB]]()
 // CHECK: call void @[[NSSTUB]]()
 // CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]()
 // CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
-void hostfunc(void) {
+
+void fun1(void) {
   ckernel<<<1, 1>>>();
   ns::nskernel<<<1, 1>>>();
   kernelfunc<int><<<1, 1>>>();
@@ -45,11 +61,69 @@ void hostfunc(void) {
 // Template kernel stub functions
 
 // CHECK: define{{.*}}@[[TSTUB]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]]
+
+// Check declaration of stub function for external kernel.
 
 // CHECK: declare{{.*}}@[[DSTUB]]
 
+// Check kernel handle is used for passing the kernel as a function pointer
+
+// CHECK-LABEL: define{{.*}}@_Z4fun2v()
+// CHECK: call void @_Z6launchPv({{.*}}[[HCKERN]]
+// CHECK: call void @_Z6launchPv({{.*}}[[HNSKERN]]
+// CHECK: call void @_Z6launchPv({{.*}}[[HTKERN]]
+// CHECK: call void @_Z6launchPv({{.*}}[[HDKERN]]
+void fun2() {
+  launch((void *)ckernel);
+  launch((void *)ns::nskernel);
+  launch((void *)kernelfunc<int>);
+  launch((void *)kernel_decl);
+}
+
+// Check kernel handle is used for assigning a kernel to a function pointer
+
+// CHECK-LABEL: define{{.*}}@_Z4fun3v()
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
+// CHECK:  store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
+// CHECK:  store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
+void fun3() {
+  kernel_ptr = ckernel;
+  kernel_ptr = &ckernel;
+  void_ptr = (void *)ckernel;
+  void_ptr = (void *)&ckernel;
+}
+
+// Check kernel stub is loaded from kernel handle when function pointer is
+// used with triple chevron
+
+// CHECK-LABEL: define{{.*}}@_Z4fun4v()
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
+// CHECK:  call i32 @_Z16hipConfigureCall4dim3S_mP9hipStream
+// CHECK:  %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
+// CHECK:  %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to void ()**
+// CHECK:  %[[STUB:.*]] = load void ()*, void ()** %[[CAST]], align 8
+// CHECK:  call void %[[STUB]]()
+void fun4() {
+  kernel_ptr = ckernel;
+  kernel_ptr<<<1,1>>>();
+}
+
+// Check kernel handle is passed to a function
+
+// CHECK-LABEL: define{{.*}}@_Z4fun5v()
+// CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
+// CHECK:  %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
+// CHECK:  %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to i8*
+// CHECK:  call void @_Z6launchPv(i8* %[[CAST]])
+void fun5() {
+  kernel_ptr = ckernel;
+  launch((void *)kernel_ptr);
+}
+
 // CHECK-LABEL: define{{.*}}@__hip_register_globals
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
+// CHECK-NOT: call{{.*}}@__hipRegisterFunction{{.*}}@[[HDKERN]]{{.*}}@[[DKERN]]

diff  --git a/clang/test/CodeGenCUDA/unnamed-types.cu b/clang/test/CodeGenCUDA/unnamed-types.cu
index f598117d969d..b59d5f448dde 100644
--- a/clang/test/CodeGenCUDA/unnamed-types.cu
+++ b/clang/test/CodeGenCUDA/unnamed-types.cu
@@ -54,7 +54,7 @@ void f1(float *p) {
               [] __device__ (float x) { return x + 5.f; });
 }
 // HOST: @__hip_register_globals
-// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
-// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
+// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
+// HOST: __hipRegisterFunction{{.*}}@_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
 // MSVC: __hipRegisterFunction{{.*}}@"??$k0 at V<lambda_1>@?0???R1?0??f1@@YAXPEAM at Z@QEBA at 0@Z@@@YAXPEAMV<lambda_1>@?0???R0?0??f1@@YAX0 at Z@QEBA at 0@Z@@Z{{.*}}@0
 // MSVC: __hipRegisterFunction{{.*}}@"??$k1 at V<lambda_2>@?0??f1@@YAXPEAM at Z@V<lambda_3>@?0??2 at YAX0@Z at V<lambda_4>@?0??2 at YAX0@Z@@@YAXPEAMV<lambda_2>@?0??f1@@YAX0 at Z@V<lambda_3>@?0??1 at YAX0@Z at V<lambda_4>@?0??1 at YAX0@Z@@Z{{.*}}@1