[clang] b008ea3 - [CUDA][HIP] Fix device variable linkage

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Fri Feb 5 12:12:14 PST 2021


Author: Yaxun (Sam) Liu
Date: 2021-02-05T15:11:12-05:00
New Revision: b008ea304d438f0aa818918caceb3bd864412304

URL: https://github.com/llvm/llvm-project/commit/b008ea304d438f0aa818918caceb3bd864412304
DIFF: https://github.com/llvm/llvm-project/commit/b008ea304d438f0aa818918caceb3bd864412304.diff

LOG: [CUDA][HIP] Fix device variable linkage

For -fgpu-rdc, shadow variables should not be internalized, otherwise
they cannot be accessed by other TUs. This is necessary because
the shadow variable of external device variables are always
emitted as undefined symbols, which need to resolve to a global
symbols.

Managed variables need to be emitted as undefined symbols
in device compilations.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D95901

Added: 
    clang/test/CodeGenCUDA/device-var-linkage.cu

Modified: 
    clang/lib/AST/ASTContext.cpp
    clang/lib/CodeGen/CGCUDANV.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/test/CodeGenCUDA/device-stub.cu
    clang/test/CodeGenCUDA/managed-var.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 6b8425f5ebcc..80ec858ca0f0 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -11435,16 +11435,22 @@ operator<<(const StreamingDiagnostic &DB,
 }
 
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
-         ((D->hasAttr<CUDADeviceAttr>() &&
-           !D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
-          (D->hasAttr<CUDAConstantAttr>() &&
-           !D->getAttr<CUDAConstantAttr>()->isImplicit())) &&
-         isa<VarDecl>(D) && cast<VarDecl>(D)->isFileVarDecl() &&
-         cast<VarDecl>(D)->getStorageClass() == SC_Static;
+  bool IsStaticVar =
+      isa<VarDecl>(D) && cast<VarDecl>(D)->getStorageClass() == SC_Static;
+  bool IsExplicitDeviceVar = (D->hasAttr<CUDADeviceAttr>() &&
+                              !D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
+                             (D->hasAttr<CUDAConstantAttr>() &&
+                              !D->getAttr<CUDAConstantAttr>()->isImplicit());
+  // CUDA/HIP: static managed variables need to be externalized since it is
+  // a declaration in IR, therefore cannot have internal linkage.
+  // ToDo: externalize static variables for -fgpu-rdc.
+  return IsStaticVar &&
+         (D->hasAttr<HIPManagedAttr>() ||
+          (!getLangOpts().GPURelocatableDeviceCode && IsExplicitDeviceVar));
 }
 
 bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
   return mayExternalizeStaticVar(D) &&
-         CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D));
+         (D->hasAttr<HIPManagedAttr>() ||
+          CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
 }

diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index a0907ec1993e..4c3294ae2d65 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -546,6 +546,8 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
             /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
             Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
             llvm::GlobalVariable::NotThreadLocal);
+        ManagedVar->setDSOLocal(Var->isDSOLocal());
+        ManagedVar->setVisibility(Var->getVisibility());
         replaceManagedVar(Var, ManagedVar);
         llvm::Value *Args[] = {
             &GpuBinaryHandlePtr,
@@ -932,11 +934,16 @@ CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
 
 void CGNVCUDARuntime::internalizeDeviceSideVar(
     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
-  // Host-side shadows of external declarations of device-side
-  // global variables become internal definitions. These have to
-  // be internal in order to prevent name conflicts with global
-  // host variables with the same name in a 
diff erent TUs.
+  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
+  // global variables become internal definitions. These have to be internal in
+  // order to prevent name conflicts with global host variables with the same
+  // name in a 
diff erent TUs.
   //
+  // For -fgpu-rdc, the shadow variables should not be internalized because
+  // they may be accessed by 
diff erent TU.
+  if (CGM.getLangOpts().GPURelocatableDeviceCode)
+    return;
+
   // __shared__ variables are odd. Shadows do get created, but
   // they are not registered with the CUDA runtime, so they
   // can't really be used to access their device-side

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index b133d2a84a59..0cae5ba59620 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4169,8 +4169,12 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
   bool NeedsGlobalDtor =
       D->needsDestruction(getContext()) == QualType::DK_cxx_destructor;
 
+  bool IsHIPManagedVarOnDevice =
+      getLangOpts().CUDAIsDevice && D->hasAttr<HIPManagedAttr>();
+
   const VarDecl *InitDecl;
-  const Expr *InitExpr = D->getAnyInitializer(InitDecl);
+  const Expr *InitExpr =
+      IsHIPManagedVarOnDevice ? nullptr : D->getAnyInitializer(InitDecl);
 
   Optional<ConstantEmitter> emitter;
 
@@ -4190,8 +4194,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
       (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
        D->getType()->isCUDADeviceBuiltinTextureType() ||
        D->hasAttr<HIPManagedAttr>());
-  // HIP pinned shadow of initialized host-side global variables are also
-  // left undefined.
   if (getLangOpts().CUDA &&
       (IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar))
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
@@ -4302,7 +4304,10 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
     }
   }
 
-  GV->setInitializer(Init);
+  // HIP managed variables need to be emitted as declarations in device
+  // compilation.
+  if (!IsHIPManagedVarOnDevice)
+    GV->setInitializer(Init);
   if (emitter)
     emitter->finalize(GV);
 

diff  --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu
index 864ec4b92f9e..e1fa931a13da 100644
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@@ -30,9 +30,13 @@
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
 // RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-NEW,LNX_17,NORDC17
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
 // RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
-// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17
+// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17,RDC17
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@@ -45,7 +49,7 @@
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF
 
@@ -56,15 +60,18 @@
 #include "Inputs/cuda.h"
 
 #ifndef NOGLOBALS
-// LNX-DAG: @device_var = internal global i32
+// NORDC-DAG: @device_var = internal global i32
+// RDC-DAG: @device_var = dso_local global i32
 // WIN-DAG: @"?device_var@@3HA" = internal global i32
 __device__ int device_var;
 
-// LNX-DAG: @constant_var = internal global i32
+// NORDC-DAG: @constant_var = internal global i32
+// RDC-DAG: @constant_var = dso_local global i32
 // WIN-DAG: @"?constant_var@@3HA" = internal global i32
 __constant__ int constant_var;
 
-// LNX-DAG: @shared_var = internal global i32
+// NORDC-DAG: @shared_var = internal global i32
+// RDC-DAG: @shared_var = dso_local global i32
 // WIN-DAG: @"?shared_var@@3HA" = internal global i32
 __shared__ int shared_var;
 
@@ -87,18 +94,21 @@ extern __constant__ int ext_constant_var;
 
 // external device-side variables with definitions should generate
 // definitions for the shadows.
-// LNX-DAG: @ext_device_var_def = internal global i32 undef,
+// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
+// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
 // WIN-DAG: @"?ext_device_var_def@@3HA" = internal global i32 undef
 extern __device__ int ext_device_var_def;
 __device__ int ext_device_var_def = 1;
-// LNX-DAG: @ext_device_var_def = internal global i32 undef,
+// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
+// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
 // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef
 __constant__ int ext_constant_var_def = 2;
 
 #if __cplusplus > 201402L
-/// FIXME: Reject __device__ constexpr and inline variables in Sema.
-// LNX_17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
-// LNX_17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
+// NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
+// RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}}
+// NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
+// RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}}
 __device__ inline int inline_var = 3;
 struct C {
   __device__ static constexpr int member_inline_var = 4;
@@ -151,13 +161,13 @@ void use_pointers() {
 // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
 // HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
 // * constant unnamed string with NVModuleID
-// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
+// CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // LNX: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * Alias to global symbol containing the NVModuleID.
-// RDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
-// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
+// CUDARDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
+// CUDARDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
@@ -214,25 +224,33 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // HIP-NEXT: icmp eq i8** {{.*}}, null
 // HIP-NEXT: br i1 {{.*}}, label %if, label %exit
 // HIP: if:
-// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+// CUDANORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+//   .. stores return value in __[[PREFIX]]_gpubin_handle
+// CUDANORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+//   .. and then calls __[[PREFIX]]_register_globals
+// HIP: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
 //   .. stores return value in __[[PREFIX]]_gpubin_handle
-// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+// HIP-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
 //   .. and then calls __[[PREFIX]]_register_globals
 // HIP-NEXT: br label %exit
 // HIP: exit:
 // HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
-// NORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// CUDANORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// HIP-NEXT: call void @__[[PREFIX]]_register_globals
 // * In separate mode we also register a destructor.
-// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
+// CUDANORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
+// HIP-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
 
 // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
-// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
-// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
-// RDC-SAME: [[MODULE_ID_GLOBAL]]
+// CUDARDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
+// CUDARDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
+// CUDARDC-SAME: [[MODULE_ID_GLOBAL]]
 
 // Test that we've created destructor.
-// NORDC: define internal void @__[[PREFIX]]_module_dtor
-// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// CUDANORDC: define internal void @__[[PREFIX]]_module_dtor
+// HIP: define internal void @__[[PREFIX]]_module_dtor
+// CUDANORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// HIP: load{{.*}}__[[PREFIX]]_gpubin_handle
 // CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
 // HIP-NEXT: icmp ne i8** {{.*}}, null
 // HIP-NEXT: br i1 {{.*}}, label %if, label %exit

diff  --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu
new file mode 100644
index 000000000000..ec67c77be638
--- /dev/null
+++ b/clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=DEV,NORDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=DEV,RDC %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=HOST,RDC-H %s
+
+#include "Inputs/cuda.h"
+
+// DEV-DAG: @v1 = dso_local addrspace(1) externally_initialized global i32 0
+// NORDC-H-DAG: @v1 = internal global i32 undef
+// RDC-H-DAG: @v1 = dso_local global i32 undef
+__device__ int v1;
+// DEV-DAG: @v2 = dso_local addrspace(4) externally_initialized global i32 0
+// NORDC-H-DAG: @v2 = internal global i32 undef
+// RDC-H-DAG: @v2 = dso_local global i32 undef
+__constant__ int v2;
+// DEV-DAG: @v3 = external addrspace(1) externally_initialized global i32
+// NORDC-H-DAG: @v3 = internal global i32 0
+// RDC-H-DAG: @v3 = dso_local global i32 0
+__managed__ int v3;
+
+// DEV-DAG: @ev1 = external addrspace(1) global i32
+// HOST-DAG: @ev1 = external global i32
+extern __device__ int ev1;
+// DEV-DAG: @ev2 = external addrspace(4) global i32
+// HOST-DAG: @ev2 = external global i32
+extern __constant__ int ev2;
+// DEV-DAG: @ev3 = external addrspace(1) global i32
+// HOST-DAG: @ev3 = external global i32
+extern __managed__ int ev3;
+
+// NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
+// HOST-DAG: @_ZL3sv1 = internal global i32 undef
+static __device__ int sv1;
+// NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0
+// HOST-DAG: @_ZL3sv2 = internal global i32 undef
+static __constant__ int sv2;
+// DEV-DAG: @_ZL3sv3 = external addrspace(1) externally_initialized global i32
+// HOST-DAG: @_ZL3sv3 = internal global i32 0
+static __managed__ int sv3;
+
+__device__ __host__ int work(int *x);
+
+__device__ __host__ int fun1() {
+  return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
+}
+
+// HOST: hipRegisterVar({{.*}}@v1
+// HOST: hipRegisterVar({{.*}}@v2
+// HOST: hipRegisterManagedVar({{.*}}@v3
+// HOST-NOT: hipRegisterVar({{.*}}@ev1
+// HOST-NOT: hipRegisterVar({{.*}}@ev2
+// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
+// HOST: hipRegisterVar({{.*}}@_ZL3sv1
+// HOST: hipRegisterVar({{.*}}@_ZL3sv2
+// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3

diff  --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu
index c95139784085..2dceec03e91f 100644
--- a/clang/test/CodeGenCUDA/managed-var.cu
+++ b/clang/test/CodeGenCUDA/managed-var.cu
@@ -10,17 +10,19 @@
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=HOST %s
+// RUN:   -check-prefixes=HOST,NORDC %s
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=HOST %s
+// RUN:   -check-prefixes=HOST,RDC %s
 
 #include "Inputs/cuda.h"
 
-// DEV-DAG: @x = {{.*}}addrspace(1) externally_initialized global i32 undef
-// HOST-DAG: @x = internal global i32 1
-// HOST-DAG: @x.managed = internal global i32* null
+// DEV-DAG: @x = external addrspace(1) externally_initialized global i32
+// NORDC-DAG: @x = internal global i32 1
+// RDC-DAG: @x = dso_local global i32 1
+// NORDC-DAG: @x.managed = internal global i32* null
+// RDC-DAG: @x.managed = dso_local global i32* null
 // HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
 
 struct vec {
@@ -31,11 +33,28 @@ __managed__ int x = 1;
 __managed__ vec v[100];
 __managed__ vec v2[100] = {{1, 1, 1}};
 
+// DEV-DAG: @ex = external addrspace(1) global i32
+// HOST-DAG: @ex = external global i32
+extern __managed__ int ex;
+
+// DEV-DAG: @_ZL2sx = external addrspace(1) externally_initialized global i32
+// HOST-DAG: @_ZL2sx = internal global i32 1
+// HOST-DAG: @_ZL2sx.managed = internal global i32* null
+static __managed__ int sx = 1;
+
+// HOST-NOT: @ex.managed
+
+// Force ex and sx mitted in device compilation.
 __global__ void foo(int *z) {
-  *z = x;
+  *z = x + ex + sx;
   v[1].x = 2;
 }
 
+// Force ex and sx emitted in host compilatioin.
+int foo2() {
+  return ex + sx;
+}
+
 // HOST-LABEL: define {{.*}}@_Z4loadv()
 // HOST:  %ld.managed = load i32*, i32** @x.managed, align 4
 // HOST:  %0 = load i32, i32* %ld.managed, align 4
@@ -97,4 +116,6 @@ float addr_taken2() {
 }
 
 // HOST-DAG: __hipRegisterManagedVar({{.*}}@x.managed {{.*}}@x {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4)
+// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx.managed {{.*}}@_ZL2sx
+// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex.managed {{.*}}@ex
 // HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)


        


More information about the cfe-commits mailing list