[clang] 45f2a56 - [CUDA][HIP] Support accessing static device variable in host code for -fno-gpu-rdc

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Wed Aug 5 04:58:05 PDT 2020


Author: Yaxun (Sam) Liu
Date: 2020-08-05T07:57:38-04:00
New Revision: 45f2a56856e29b8cb038b2e559289b91fb98fedf

URL: https://github.com/llvm/llvm-project/commit/45f2a56856e29b8cb038b2e559289b91fb98fedf
DIFF: https://github.com/llvm/llvm-project/commit/45f2a56856e29b8cb038b2e559289b91fb98fedf.diff

LOG: [CUDA][HIP] Support accessing static device variable in host code for -fno-gpu-rdc

nvcc supports accessing file-scope static device variables in host code by host APIs
like cudaMemcpyToSymbol etc.

CUDA/HIP let users access device variables in host code by shadow variables. In host compilation,
clang emits a shadow variable for each device variable, and calls __*RegisterVariable to
register it in init function. The address of the shadow variable and the device side mangled
name of the device variable is passed to __*RegisterVariable. Runtime looks up the symbol
by name in the device binary  to find the address of the device variable.

The problem with static device variables is that they have internal linkage, therefore their
name may be changed by the linker if there are multiple symbols with the same name. Also
they end up as local symbols in the elf file, whereas the runtime only looks up the global symbols.

Another reason for making the static device variables external linkage is that they may be
initialized externally by host code and their final value may be accessed by host code
after kernel execution, therefore they actually have external linkage. Giving them internal
linkage will cause incorrect optimizations on them.

To support accessing static device var in host code for -fno-gpu-rdc mode, change the intnernal
linkage to external linkage. The name does not need change since there is only one TU for
-fno-gpu-rdc mode. Also the externalization is done only if the device static var is referenced
by host code.

Differential Revision: https://reviews.llvm.org/D80858

Added: 
    clang/test/CodeGenCUDA/static-device-var-no-rdc.cu

Modified: 
    clang/include/clang/AST/ASTContext.h
    clang/lib/AST/ASTContext.cpp
    clang/lib/Sema/SemaExpr.cpp
    clang/test/CodeGenCUDA/constexpr-variables.cu

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6c00fe86f282d..78207a4aad31b 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -43,6 +43,7 @@
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/MapVector.h"
@@ -999,6 +1000,9 @@ class ASTContext : public RefCountedBase<ASTContext> {
   // Implicitly-declared type 'struct _GUID'.
   mutable TagDecl *MSGuidTagDecl = nullptr;
 
+  /// Keep track of CUDA/HIP static device variables referenced by host code.
+  llvm::DenseSet<const VarDecl *> CUDAStaticDeviceVarReferencedByHost;
+
   ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
              SelectorTable &sels, Builtin::Context &builtins);
   ASTContext(const ASTContext &) = delete;
@@ -3030,6 +3034,9 @@ OPT_LIST(V)
   /// Return a new OMPTraitInfo object owned by this context.
   OMPTraitInfo &getNewOMPTraitInfo();
 
+  /// Whether a C++ static variable should be externalized.
+  bool shouldExternalizeStaticVar(const Decl *D) const;
+
 private:
   /// All OMPTraitInfo objects live in this collection, one per
   /// `pragma omp [begin] declare variant` directive.

diff  --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 43bbe41fb6112..04a4c5482db75 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -10325,12 +10325,17 @@ static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
   } else if (D->hasAttr<DLLExportAttr>()) {
     if (L == GVA_DiscardableODR)
       return GVA_StrongODR;
-  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice &&
-             D->hasAttr<CUDAGlobalAttr>()) {
+  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice) {
     // Device-side functions with __global__ attribute must always be
     // visible externally so they can be launched from host.
-    if (L == GVA_DiscardableODR || L == GVA_Internal)
+    if (D->hasAttr<CUDAGlobalAttr>() &&
+        (L == GVA_DiscardableODR || L == GVA_Internal))
       return GVA_StrongODR;
+    // Single source offloading languages like CUDA/HIP need to be able to
+    // access static device variables from host code of the same compilation
+    // unit. This is done by externalizing the static variable.
+    if (Context.shouldExternalizeStaticVar(D))
+      return GVA_StrongExternal;
   }
   return L;
 }
@@ -11185,3 +11190,11 @@ clang::operator<<(const DiagnosticBuilder &DB,
     return DB << Section.Decl;
   return DB << "a prior #pragma section";
 }
+
+bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
+  return !getLangOpts().GPURelocatableDeviceCode &&
+         (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) &&
+         isa<VarDecl>(D) && cast<VarDecl>(D)->isFileVarDecl() &&
+         cast<VarDecl>(D)->getStorageClass() == SC_Static &&
+         CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D));
+}

diff  --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index bb0b1fa49851d..dc867ba8f165d 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -17864,6 +17864,25 @@ static void DoMarkVarDeclReferenced(Sema &SemaRef, SourceLocation Loc,
   if (Var->isInvalidDecl())
     return;
 
+  // Record a CUDA/HIP static device/constant variable if it is referenced
+  // by host code. This is done conservatively, when the variable is referenced
+  // in any of the following contexts:
+  //   - a non-function context
+  //   - a host function
+  //   - a host device function
+  // This also requires the reference of the static device/constant variable by
+  // host code to be visible in the device compilation for the compiler to be
+  // able to externalize the static device/constant variable.
+  if ((Var->hasAttr<CUDADeviceAttr>() || Var->hasAttr<CUDAConstantAttr>()) &&
+      Var->isFileVarDecl() && Var->getStorageClass() == SC_Static) {
+    auto *CurContext = SemaRef.CurContext;
+    if (!CurContext || !isa<FunctionDecl>(CurContext) ||
+        cast<FunctionDecl>(CurContext)->hasAttr<CUDAHostAttr>() ||
+        (!cast<FunctionDecl>(CurContext)->hasAttr<CUDADeviceAttr>() &&
+         !cast<FunctionDecl>(CurContext)->hasAttr<CUDAGlobalAttr>()))
+      SemaRef.getASTContext().CUDAStaticDeviceVarReferencedByHost.insert(Var);
+  }
+
   auto *MSI = Var->getMemberSpecializationInfo();
   TemplateSpecializationKind TSK = MSI ? MSI->getTemplateSpecializationKind()
                                        : Var->getTemplateSpecializationKind();

diff  --git a/clang/test/CodeGenCUDA/constexpr-variables.cu b/clang/test/CodeGenCUDA/constexpr-variables.cu
index b8b0782b4f62f..7ae56341cdf57 100644
--- a/clang/test/CodeGenCUDA/constexpr-variables.cu
+++ b/clang/test/CodeGenCUDA/constexpr-variables.cu
@@ -19,7 +19,7 @@ struct Q {
   // CXX14: @_ZN1Q2k2E = {{.*}}externally_initialized constant i32 6
   // CXX17: @_ZN1Q2k2E = internal {{.*}}constant i32 6
   // CXX14: @_ZN1Q2k1E = available_externally {{.*}}constant i32 5
-  // CXX17: @_ZN1Q2k1E = linkonce_odr {{.*}}constant i32 5
+  // CXX17: @_ZN1Q2k1E = {{.*}} externally_initialized constant i32 5
   static constexpr int k1 = 5;
   static constexpr int k2 = 6;
 };
@@ -30,14 +30,14 @@ __constant__ const int &use_Q_k2 = Q::k2;
 
 template<typename T> struct X {
   // CXX14: @_ZN1XIiE1aE = available_externally {{.*}}constant i32 123
-  // CXX17: @_ZN1XIiE1aE = linkonce_odr {{.*}}constant i32 123
+  // CXX17: @_ZN1XIiE1aE = {{.*}}externally_initialized constant i32 123
   static constexpr int a = 123;
 };
 __constant__ const int &use_X_a = X<int>::a;
 
 template <typename T, T a, T b> struct A {
   // CXX14: @_ZN1AIiLi1ELi2EE1xE = available_externally {{.*}}constant i32 2
-  // CXX17: @_ZN1AIiLi1ELi2EE1xE = linkonce_odr {{.*}}constant i32 2
+  // CXX17: @_ZN1AIiLi1ELi2EE1xE = {{.*}}externally_initialized constant i32 2
   constexpr static T x = a * b;
 };
 __constant__ const int &y = A<int, 1, 2>::x;

diff  --git a/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu
new file mode 100644
index 0000000000000..1aea467c2d490
--- /dev/null
+++ b/clang/test/CodeGenCUDA/static-device-var-no-rdc.cu
@@ -0,0 +1,94 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:   -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST %s
+
+#include "Inputs/cuda.h"
+
+// Test function scope static device variable, which should not be externalized.
+// DEV-DAG: @_ZZ6kernelPiPPKiE1w = internal addrspace(4) constant i32 1
+
+// Check a static device variable referenced by host function is externalized.
+// DEV-DAG: @_ZL1x = addrspace(1) externally_initialized global i32 0
+// HOST-DAG: @_ZL1x = internal global i32 undef
+// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
+
+static __device__ int x;
+
+// Check a static device variables referenced only by device functions and kernels
+// is not externalized.
+// DEV-DAG: @_ZL2x2 = internal addrspace(1) global i32 0
+static __device__ int x2;
+
+// Check a static device variable referenced by host device function is externalized.
+// DEV-DAG: @_ZL2x3 = addrspace(1) externally_initialized global i32 0
+static __device__ int x3;
+
+// Check a static device variable referenced in file scope is externalized.
+// DEV-DAG: @_ZL2x4 = addrspace(1) externally_initialized global i32 0
+static __device__ int x4;
+int& x4_ref = x4;
+
+// Check a static device variable in anonymous namespace.
+// DEV-DAG: @_ZN12_GLOBAL__N_12x5E = addrspace(1) externally_initialized global i32 0
+namespace {
+static __device__ int x5;
+}
+
+// Check a static constant variable referenced by host is externalized.
+// DEV-DAG: @_ZL1y = addrspace(4) externally_initialized global i32 0
+// HOST-DAG: @_ZL1y = internal global i32 undef
+// HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
+
+static __constant__ int y;
+
+// Test static host variable, which should not be externalized nor registered.
+// HOST-DAG: @_ZL1z = internal global i32 0
+// DEV-NOT: @_ZL1z
+static int z;
+
+// Test static device variable in inline function, which should not be
+// externalized nor registered.
+// DEV-DAG: @_ZZ6devfunPPKiE1p = linkonce_odr addrspace(4) constant i32 2, comdat
+
+inline __device__ void devfun(const int ** b) {
+  const static int p = 2;
+  b[0] = &p;
+  b[1] = &x2;
+}
+
+__global__ void kernel(int *a, const int **b) {
+  const static int w = 1;
+  a[0] = x;
+  a[1] = y;
+  a[2] = x2;
+  a[3] = x3;
+  a[4] = x4;
+  a[5] = x5;
+  b[0] = &w;
+  devfun(b);
+}
+
+__host__ __device__ void hdf(int *a) {
+  a[0] = x3;
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo(int *a) {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&x5);
+  getDeviceSymbol(&y);
+  z = 123;
+}
+
+// HOST: __hipRegisterVar({{.*}}@_ZL1x {{.*}}@[[DEVNAMEX]]
+// HOST: __hipRegisterVar({{.*}}@_ZL1y {{.*}}@[[DEVNAMEY]]
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6kernelPiPPKiE1w
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6devfunPPKiE1p


        


More information about the cfe-commits mailing list