[clang] 47acdec - [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Wed Feb 24 15:41:38 PST 2021


Author: Yaxun (Sam) Liu
Date: 2021-02-24T18:23:45-05:00
New Revision: 47acdec1dd5d6d4c279727a97313c586c20e9c6f

URL: https://github.com/llvm/llvm-project/commit/47acdec1dd5d6d4c279727a97313c586c20e9c6f
DIFF: https://github.com/llvm/llvm-project/commit/47acdec1dd5d6d4c279727a97313c586c20e9c6f.diff

LOG: [CUDA][HIP] Support accessing static device variable in host code for -fgpu-rdc

For -fgpu-rdc mode, static device vars in different TU's may have the same name.
To support accessing file-scope static device variables in host code, we need to give them
a distinct name and external linkage. This can be done by postfixing each static device variable with
a distinct CUID (Compilation Unit ID) hash.

Since the static device variables have different name across compilation units, now we let
them have external linkage so that they can be looked up by the runtime.

Reviewed by: Artem Belevich, and Jon Chesterfield

Differential Revision: https://reviews.llvm.org/D85223

Added: 
    clang/test/CodeGenCUDA/static-device-var-rdc.cu
    clang/test/SemaCUDA/static-device-var.cu

Modified: 
    clang/include/clang/AST/ASTContext.h
    clang/lib/AST/ASTContext.cpp
    clang/lib/CodeGen/CGCUDANV.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/lib/CodeGen/CodeGenModule.h
    clang/test/CodeGenCUDA/device-var-linkage.cu
    clang/test/CodeGenCUDA/managed-var.cu

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 8ba4c41aac701..af2979d874382 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -299,6 +299,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// This is lazily created.  This is intentionally not serialized.
   mutable llvm::StringMap<StringLiteral *> StringLiteralCache;
 
+  /// MD5 hash of CUID. It is calculated when first used and cached by this
+  /// data member.
+  mutable std::string CUIDHash;
+
   /// Representation of a "canonical" template template parameter that
   /// is used in canonical template names.
   class CanonicalTemplateTemplateParm : public llvm::FoldingSetNode {
@@ -3117,6 +3121,8 @@ OPT_LIST(V)
   /// Whether a C++ static variable should be externalized.
   bool shouldExternalizeStaticVar(const Decl *D) const;
 
+  StringRef getCUIDHash() const;
+
 private:
   /// All OMPTraitInfo objects live in this collection, one per
   /// `pragma omp [begin] declare variant` directive.

diff  --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index fcbc3f6252b6e..b8231f66908a9 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -84,6 +84,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -10645,7 +10646,10 @@ static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
       return GVA_StrongODR;
     // Single source offloading languages like CUDA/HIP need to be able to
     // access static device variables from host code of the same compilation
-    // unit. This is done by externalizing the static variable.
+    // unit. This is done by externalizing the static variable with a shared
+    // name between the host and device compilation which is the same for the
+    // same compilation unit whereas 
diff erent among 
diff erent compilation
+    // units.
     if (Context.shouldExternalizeStaticVar(D))
       return GVA_StrongExternal;
   }
@@ -11533,10 +11537,8 @@ bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
                               !D->getAttr<CUDAConstantAttr>()->isImplicit());
   // CUDA/HIP: static managed variables need to be externalized since it is
   // a declaration in IR, therefore cannot have internal linkage.
-  // ToDo: externalize static variables for -fgpu-rdc.
   return IsStaticVar &&
-         (D->hasAttr<HIPManagedAttr>() ||
-          (!getLangOpts().GPURelocatableDeviceCode && IsExplicitDeviceVar));
+         (D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar);
 }
 
 bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
@@ -11544,3 +11546,12 @@ bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
          (D->hasAttr<HIPManagedAttr>() ||
           CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
 }
+
+StringRef ASTContext::getCUIDHash() const {
+  if (!CUIDHash.empty())
+    return CUIDHash;
+  if (LangOpts.CUID.empty())
+    return StringRef();
+  CUIDHash = llvm::utohexstr(llvm::MD5Hash(LangOpts.CUID), /*LowerCase=*/true);
+  return CUIDHash;
+}

diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dccbd6f74a981..57e3b151bfd42 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -255,6 +255,17 @@ std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
     DeviceSideName = std::string(Out.str());
   } else
     DeviceSideName = std::string(ND->getIdentifier()->getName());
+
+  // Make unique name for device side static file-scope variable for HIP.
+  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
+      CGM.getLangOpts().GPURelocatableDeviceCode &&
+      !CGM.getLangOpts().CUID.empty()) {
+    SmallString<256> Buffer;
+    llvm::raw_svector_ostream Out(Buffer);
+    Out << DeviceSideName;
+    CGM.printPostfixForExternalizedStaticVar(Out);
+    DeviceSideName = std::string(Out.str());
+  }
   return DeviceSideName;
 }
 

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 7e45ae3acb3e8..9dcdc8da4d923 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1184,6 +1184,11 @@ static std::string getMangledNameImpl(const CodeGenModule &CGM, GlobalDecl GD,
       }
     }
 
+  // Make unique name for device side static file-scope variable for HIP.
+  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
+      CGM.getLangOpts().GPURelocatableDeviceCode &&
+      CGM.getLangOpts().CUDAIsDevice && !CGM.getLangOpts().CUID.empty())
+    CGM.printPostfixForExternalizedStaticVar(Out);
   return std::string(Out.str());
 }
 
@@ -1241,9 +1246,16 @@ StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
     }
   }
 
-  auto FoundName = MangledDeclNames.find(CanonicalGD);
-  if (FoundName != MangledDeclNames.end())
-    return FoundName->second;
+  // In CUDA/HIP device compilation with -fgpu-rdc, the mangled name of a
+  // static device variable depends on whether the variable is referenced by
+  // a host or device host function. Therefore the mangled name cannot be
+  // cached.
+  if (!LangOpts.CUDAIsDevice ||
+      !getContext().mayExternalizeStaticVar(GD.getDecl())) {
+    auto FoundName = MangledDeclNames.find(CanonicalGD);
+    if (FoundName != MangledDeclNames.end())
+      return FoundName->second;
+  }
 
   // Keep the first result in the case of a mangling collision.
   const auto *ND = cast<NamedDecl>(GD.getDecl());
@@ -6249,3 +6261,8 @@ bool CodeGenModule::stopAutoInit() {
   }
   return false;
 }
+
+void CodeGenModule::printPostfixForExternalizedStaticVar(
+    llvm::raw_ostream &OS) const {
+  OS << ".static." << getContext().getCUIDHash();
+}

diff  --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 64118746f116d..49f94042a787d 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1422,6 +1422,10 @@ class CodeGenModule : public CodeGenTypeCache {
                                            TBAAAccessInfo *TBAAInfo = nullptr);
   bool stopAutoInit();
 
+  /// Print the postfix for externalized static variable for single source
+  /// offloading languages CUDA and HIP.
+  void printPostfixForExternalizedStaticVar(llvm::raw_ostream &OS) const;
+
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
       StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,

diff  --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu
index 01fde03ea49d6..d62c746856931 100644
--- a/clang/test/CodeGenCUDA/device-var-linkage.cu
+++ b/clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -2,13 +2,13 @@
 // RUN:   -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=DEV,NORDC %s
 // RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
-// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=DEV,RDC %s
 // RUN: %clang_cc1 -triple nvptx \
 // RUN:   -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=HOST,NORDC-H %s
 // RUN: %clang_cc1 -triple nvptx \
-// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
 // RUN:   | FileCheck -check-prefixes=HOST,RDC-H %s
 
 #include "Inputs/cuda.h"
@@ -37,14 +37,15 @@ extern __constant__ int ev2;
 extern __managed__ int ev3;
 
 // NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
+// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
 // HOST-DAG: @_ZL3sv1 = internal global i32 undef
 static __device__ int sv1;
 // NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0
-// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0
+// RDC-DAG: @_ZL3sv2.static.[[HASH]] = dso_local addrspace(4) externally_initialized global i32 0
 // HOST-DAG: @_ZL3sv2 = internal global i32 undef
 static __constant__ int sv2;
-// DEV-DAG: @_ZL3sv3 = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
+// NORDC-DAG: @_ZL3sv3 = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-DAG: @_ZL3sv3.static.[[HASH]] = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL3sv3 = internal externally_initialized global i32* null
 static __managed__ int sv3;
 

diff  --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu
index 1657487bcceee..df77f81c85815 100644
--- a/clang/test/CodeGenCUDA/managed-var.cu
+++ b/clang/test/CodeGenCUDA/managed-var.cu
@@ -2,19 +2,24 @@
 
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=COMMON,DEV %s
+// RUN:   -check-prefixes=COMMON,DEV,NORDC-D %s
 
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
-// RUN:   -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=COMMON,DEV %s
+// RUN:   -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.dev
+// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,RDC-D %s
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
 // RUN:   -check-prefixes=COMMON,HOST,NORDC %s
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
-// RUN:   -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=COMMON,HOST,RDC %s
+// RUN:   -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.host
+// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s
+
+// Check device and host compilation use the same postfix for static
+// variable name.
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
 
 #include "Inputs/cuda.h"
 
@@ -45,10 +50,17 @@ __managed__ vec v2[100] = {{1, 1, 1}};
 // HOST-DAG: @ex = external externally_initialized global i32*
 extern __managed__ int ex;
 
-// DEV-DAG: @_ZL2sx.managed = dso_local addrspace(1) externally_initialized global i32 1, align 4
-// DEV-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
+// NORDC-D-DAG: @_ZL2sx.managed = dso_local addrspace(1) externally_initialized global i32 1, align 4
+// NORDC-D-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
+// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = dso_local addrspace(1) externally_initialized global i32 1, align 4
+// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
 // HOST-DAG: @_ZL2sx.managed = internal global i32 1
 // HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
+// NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
+// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"
+
+// POSTFIX:  @_ZL2sx.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null
+// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
 static __managed__ int sx = 1;
 
 // DEV-DAG: @llvm.compiler.used
@@ -154,6 +166,6 @@ __device__ __host__ int load4() {
 }
 
 // HOST-DAG: __hipRegisterManagedVar({{.*}}@x {{.*}}@x.managed {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4)
-// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx {{.*}}@_ZL2sx.managed
+// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx {{.*}}@_ZL2sx.managed {{.*}}@[[DEVNAMESX]]
 // HOST-NOT: __hipRegisterManagedVar({{.*}}@ex {{.*}}@ex.managed
 // HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)

diff  --git a/clang/test/CodeGenCUDA/static-device-var-rdc.cu b/clang/test/CodeGenCUDA/static-device-var-rdc.cu
new file mode 100644
index 0000000000000..eac985fb70de6
--- /dev/null
+++ b/clang/test/CodeGenCUDA/static-device-var-rdc.cu
@@ -0,0 +1,97 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=DEV,INT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s | FileCheck \
+// RUN:   -check-prefixes=HOST,INT-HOST %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -cuid=abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s > %t.dev
+// RUN: cat %t.dev | FileCheck -check-prefixes=DEV,EXT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
+// RUN: cat %t.host | FileCheck -check-prefixes=HOST,EXT-HOST %s
+
+// Check host and device compilations use the same postfixes for static
+// variable names.
+
+// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s
+
+#include "Inputs/cuda.h"
+
+// Test function scope static device variable, which should not be externalized.
+// DEV-DAG: @_ZZ6kernelPiPPKiE1w = internal addrspace(4) constant i32 1
+
+
+// HOST-DAG: @_ZL1x = internal global i32 undef
+// HOST-DAG: @_ZL1y = internal global i32 undef
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1x = dso_local addrspace(1) externally_initialized global i32 0
+// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
+
+// POSTFIX: @_ZL1x.static.[[HASH:.*]] = dso_local addrspace(1) externally_initialized global i32 0
+// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"
+
+static __device__ int x;
+
+// Test static device variables not used by host code should not be externalized
+// DEV-DAG: @_ZL2x2 = internal addrspace(1) global i32 0
+
+static __device__ int x2;
+
+// Test normal static device variables
+// INT-DEV-DAG: @_ZL1y = dso_local addrspace(4) externally_initialized global i32 0
+// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
+
+// Test externalized static device variables
+// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = dso_local addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"
+
+static __constant__ int y;
+
+// Test static host variable, which should not be externalized nor registered.
+// HOST-DAG: @_ZL1z = internal global i32 0
+// DEV-NOT: @_ZL1z
+static int z;
+
+// Test static device variable in inline function, which should not be
+// externalized nor registered.
+// DEV-DAG: @_ZZ6devfunPPKiE1p = linkonce_odr addrspace(4) constant i32 2, comdat
+
+inline __device__ void devfun(const int ** b) {
+  const static int p = 2;
+  b[0] = &p;
+}
+
+__global__ void kernel(int *a, const int **b) {
+  const static int w = 1;
+  a[0] = x;
+  a[1] = y;
+  b[0] = &w;
+  b[1] = &x2;
+  devfun(b);
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+  z = 123;
+}
+
+// HOST: __hipRegisterVar({{.*}}@_ZL1x {{.*}}@[[DEVNAMEX]]
+// HOST: __hipRegisterVar({{.*}}@_ZL1y {{.*}}@[[DEVNAMEY]]
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZL2x2
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6kernelPiPPKiE1w
+// HOST-NOT: __hipRegisterVar({{.*}}@_ZZ6devfunPPKiE1p

diff  --git a/clang/test/SemaCUDA/static-device-var.cu b/clang/test/SemaCUDA/static-device-var.cu
new file mode 100644
index 0000000000000..5c8b89853b57b
--- /dev/null
+++ b/clang/test/SemaCUDA/static-device-var.cu
@@ -0,0 +1,50 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:    -emit-llvm -o - %s -fsyntax-only -verify=dev
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:    -emit-llvm -o - %s -fsyntax-only -verify=host
+
+// Checks allowed usage of file-scope and function-scope static variables.
+
+// host-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+// Checks static variables are allowed in device functions.
+
+__device__ void f1() {
+  const static int b = 123;
+  static int a;
+}
+
+// Checks static variables are allowd in global functions.
+
+__global__ void k1() {
+  const static int b = 123;
+  static int a;
+}
+
+// Checks static device and constant variables are allowed in device and
+// host functions, and static host variables are not allowed in device
+// functions.
+
+static __device__ int x;
+static __constant__ int y;
+static int z;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;
+  a[2] = z;
+  // dev-error at -1 {{reference to __host__ variable 'z' in __global__ function}}
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+}


        


More information about the cfe-commits mailing list