[clang] 04fb816 - [CUDA][HIP] Externalize kernels with internal linkage

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Fri Apr 22 14:07:05 PDT 2022


Author: Yaxun (Sam) Liu
Date: 2022-04-22T17:05:36-04:00
New Revision: 04fb81674ed7981397ffe70fe6a07b7168f6fe2f

URL: https://github.com/llvm/llvm-project/commit/04fb81674ed7981397ffe70fe6a07b7168f6fe2f
DIFF: https://github.com/llvm/llvm-project/commit/04fb81674ed7981397ffe70fe6a07b7168f6fe2f.diff

LOG: [CUDA][HIP] Externalize kernels with internal linkage

This patch is a continuation of https://reviews.llvm.org/D123353.

Not only kernels in anonymous namespace, but also template
kernels with template arguments in anonymous namespace
need to be externalized.

To be more generic, this patch checks the linkage of a kernel
assuming the kernel does not have __global__ attribute. If
the linkage is internal then clang will externalize it.

This patch also fixes the postfix for externalized symbol
since nvptx does not allow '.' in symbol name.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D124189

Fixes: https://github.com/llvm/llvm-project/issues/54560

Added: 
    

Modified: 
    clang/lib/AST/ASTContext.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/test/CodeGenCUDA/device-var-linkage.cu
    clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
    clang/test/CodeGenCUDA/managed-var.cu
    clang/test/CodeGenCUDA/static-device-var-rdc.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index eb8d53a20012b..85d2bcf268f3a 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -12298,7 +12298,9 @@ bool ASTContext::mayExternalize(const Decl *D) const {
   // anonymous name space needs to be externalized to avoid duplicate symbols.
   return (IsStaticVar &&
           (D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar)) ||
-         (D->hasAttr<CUDAGlobalAttr>() && D->isInAnonymousNamespace());
+         (D->hasAttr<CUDAGlobalAttr>() &&
+          basicGVALinkageForFunction(*this, cast<FunctionDecl>(D)) ==
+              GVA_Internal);
 }
 
 bool ASTContext::shouldExternalize(const Decl *D) const {

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index e1afe81e5a80e..784f2d0e578fa 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -6809,6 +6809,12 @@ bool CodeGenModule::stopAutoInit() {
 
 void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
                                                     const Decl *D) const {
-  OS << (isa<VarDecl>(D) ? "__static__" : ".anon.")
-     << getContext().getCUIDHash();
+  StringRef Tag;
+  // ptxas does not allow '.' in symbol names. On the other hand, HIP prefers
+  // postfix beginning with '.' since the symbol name can be demangled.
+  if (LangOpts.HIP)
+    Tag = (isa<VarDecl>(D) ? ".static." : ".intern.");
+  else
+    Tag = (isa<VarDecl>(D) ? "__static__" : "__intern__");
+  OS << Tag << getContext().getCUIDHash();
 }

diff  --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu
index 6186d66127878..a214feb0541f9 100644
--- a/clang/test/CodeGenCUDA/device-var-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -1,15 +1,18 @@
-// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
+// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn -fcuda-is-device \
 // RUN:   -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=DEV,NORDC %s
-// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
+// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn -fcuda-is-device \
 // RUN:   -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=DEV,RDC %s
-// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx \
+// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-gnu-linux \
 // RUN:   -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=HOST,NORDC-H %s
-// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx \
+// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-gnu-linux \
 // RUN:   -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=HOST,RDC-H %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
+// RUN:   -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
+// RUN:   | FileCheck -check-prefixes=CUDA %s
 
 #include "Inputs/cuda.h"
 
@@ -24,7 +27,9 @@ __constant__ int v2;
 // DEV-DAG: @v3 = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // NORDC-H-DAG: @v3 = internal externally_initialized global i32* null
 // RDC-H-DAG: @v3 = externally_initialized global i32* null
+#if __HIP__
 __managed__ int v3;
+#endif
 
 // DEV-DAG: @ev1 = external addrspace(1) global i32
 // HOST-DAG: @ev1 = external global i32
@@ -34,25 +39,35 @@ extern __device__ int ev1;
 extern __constant__ int ev2;
 // DEV-DAG: @ev3 = external addrspace(1) externally_initialized global i32 addrspace(1)*
 // HOST-DAG: @ev3 = external externally_initialized global i32*
+#if __HIP__
 extern __managed__ int ev3;
+#endif
 
 // NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 // HOST-DAG: @_ZL3sv1 = internal global i32 undef
+// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 static __device__ int sv1;
 // NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
 // HOST-DAG: @_ZL3sv2 = internal global i32 undef
+// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
 static __constant__ int sv2;
 // NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL3sv3 = internal externally_initialized global i32* null
+#if __HIP__
 static __managed__ int sv3;
+#endif
 
 __device__ __host__ int work(int *x);
 
 __device__ __host__ int fun1() {
-  return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
+  return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
+#if __HIP__
+    + work(&ev3) + work(&sv3)
+#endif
+    ;
 }
 
 // HOST: hipRegisterVar({{.*}}@v1

diff  --git a/clang/test/CodeGenCUDA/kernel-in-anon-ns.cu b/clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
index 4243cec796a86..bc753d76e5c11 100644
--- a/clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
+++ b/clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
@@ -6,19 +6,53 @@
 // RUN:   -aux-triple amdgcn-amd-amdhsa -std=c++11 -fgpu-rdc \
 // RUN:   -emit-llvm -o - -x hip %s > %t.host
 
-// RUN: cat %t.dev %t.host | FileCheck %s
+// RUN: cat %t.dev %t.host | FileCheck -check-prefixes=HIP,COMMON %s
+
+// RUN: echo "GPU binary" > %t.fatbin
+
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
+// RUN:   -emit-llvm -o - %s > %t.dev
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
+// RUN:   -aux-triple nvptx -std=c++11 -fgpu-rdc -fcuda-include-gpubinary %t.fatbin \
+// RUN:   -emit-llvm -o - %s > %t.host
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefixes=CUDA,COMMON %s
 
 #include "Inputs/cuda.h"
 
-// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]](
-// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00"
-// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]]
+// HIP-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]](
+// HIP-DAG: define weak_odr {{.*}}void @[[KERN2:_Z8tempKernIN12_GLOBAL__N_11XEEvT_\.intern\.b04fd23c98500190]](
+// HIP-DAG: define weak_odr {{.*}}void @[[KERN3:_Z8tempKernIN12_GLOBAL__N_1UlvE_EEvT_\.intern\.b04fd23c98500190]](
+
+// CUDA-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
+// CUDA-DAG: define weak_odr {{.*}}void @[[KERN2:_Z8tempKernIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
+// CUDA-DAG: define weak_odr {{.*}}void @[[KERN3:_Z8tempKernIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](
+
+// COMMON-DAG: @[[STR1:.*]] = {{.*}} c"[[KERN1]]\00"
+// COMMON-DAG: @[[STR2:.*]] = {{.*}} c"[[KERN2]]\00"
+// COMMON-DAG: @[[STR3:.*]] = {{.*}} c"[[KERN3]]\00"
+
+// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR1]]
+// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR2]]
+// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR3]]
+
+
+template <typename T>
+__global__ void tempKern(T x) {}
 
 namespace {
-__global__ void kernel() {
-}
+  __global__ void kernel() {}
+  struct X {};
+  X x;
+  auto lambda = [](){};
 }
 
 void test() {
   kernel<<<1, 1>>>();
+
+  tempKern<<<1, 1>>>(x);
+
+  tempKern<<<1, 1>>>(lambda);
 }

diff  --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu
index 71e57a0c2c864..7b5f6bb4771a5 100644
--- a/clang/test/CodeGenCUDA/managed-var.cu
+++ b/clang/test/CodeGenCUDA/managed-var.cu
@@ -1,5 +1,3 @@
-// REQUIRES: x86-registered-target, amdgpu-registered-target
-
 // RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
 // RUN:   -check-prefixes=COMMON,DEV,NORDC-D %s
@@ -52,15 +50,15 @@ extern __managed__ int ex;
 
 // NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
 // NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
-// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
+// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.managed = internal global i32 1
 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
 // NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
-// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
+// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
 
-// POSTFIX:  @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
-// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
+// POSTFIX:  @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
+// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
 static __managed__ int sx = 1;
 
 // DEV-DAG: @llvm.compiler.used

diff  --git a/clang/test/CodeGenCUDA/static-device-var-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-rdc.cu
index cacc67c7dfbfd..81a2bd0fd455c 100644
--- a/clang/test/CodeGenCUDA/static-device-var-rdc.cu
+++ b/clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -40,6 +40,11 @@
 // RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
 // RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s
 
+// Check postfix for CUDA.
+
+// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \
+// RUN:   -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
+// RUN:   -check-prefixes=CUDA %s
 
 #include "Inputs/cuda.h"
 
@@ -55,11 +60,12 @@
 // INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
 
-// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
-// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
 
 static __device__ int x;
 
@@ -73,8 +79,8 @@ static __device__ int x2;
 // INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
 
 // Test externalized static device variables
-// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
-// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
 
 static __constant__ int y;
 


        


More information about the cfe-commits mailing list