[clang] 0424b51 - [CUDA][HIP] Fix host used external kernel in archive

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Wed Apr 13 07:48:23 PDT 2022


Author: Yaxun (Sam) Liu
Date: 2022-04-13T10:47:16-04:00
New Revision: 0424b5115cffad73a0f6e68affed603a7ed9a692

URL: https://github.com/llvm/llvm-project/commit/0424b5115cffad73a0f6e68affed603a7ed9a692
DIFF: https://github.com/llvm/llvm-project/commit/0424b5115cffad73a0f6e68affed603a7ed9a692.diff

LOG: [CUDA][HIP] Fix host used external kernel in archive

For -fgpu-rdc, a host function may call an external kernel
which is defined in an archive of bitcode. Since this external
kernel is only referenced in host function, the device
bitcode does not contain reference to this external
kernel, then the linker will not try to resolve this external
kernel in the archive.

To fix this issue, host-used external kernels and device
variables are tracked. A global array containing pointers
to these external kernels and variables is emitted which
serves as an artificial references to the external kernels
and variables used by host.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D123441

Added: 
    clang/test/CodeGenCUDA/host-used-extern.cu

Modified: 
    clang/include/clang/AST/ASTContext.h
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/lib/Sema/SemaCUDA.cpp
    clang/lib/Sema/SemaExpr.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 490128abb2ef2..9e10571740de4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1160,6 +1160,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// Keep track of CUDA/HIP device-side variables ODR-used by host code.
   llvm::DenseSet<const VarDecl *> CUDADeviceVarODRUsedByHost;
 
+  /// Keep track of CUDA/HIP external kernels or device variables ODR-used by
+  /// host code.
+  llvm::DenseSet<const ValueDecl *> CUDAExternalDeviceDeclODRUsedByHost;
+
   ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
              SelectorTable &sels, Builtin::Context &builtins,
              TranslationUnitKind TUKind);

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index b251a4a7df3d2..76094c73a9279 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -579,6 +579,30 @@ void CodeGenModule::Release() {
     }
   }
 
+  // Emit a global array containing all external kernels or device variables
+  // used by host functions and mark it as used for CUDA/HIP. This is necessary
+  // to get kernels or device variables in archives linked in even if these
+  // kernels or device variables are only used in host functions.
+  if (!Context.CUDAExternalDeviceDeclODRUsedByHost.empty()) {
+    SmallVector<llvm::Constant *, 8> UsedArray;
+    for (auto D : Context.CUDAExternalDeviceDeclODRUsedByHost) {
+      GlobalDecl GD;
+      if (auto *FD = dyn_cast<FunctionDecl>(D))
+        GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
+      else
+        GD = GlobalDecl(D);
+      UsedArray.push_back(llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+          GetAddrOfGlobal(GD), Int8PtrTy));
+    }
+
+    llvm::ArrayType *ATy = llvm::ArrayType::get(Int8PtrTy, UsedArray.size());
+
+    auto *GV = new llvm::GlobalVariable(
+        getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage,
+        llvm::ConstantArray::get(ATy, UsedArray), "gpu.used.external");
+    addCompilerUsedGlobal(GV);
+  }
+
   emitLLVMUsed();
   if (SanStats)
     SanStats->finish();

diff  --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index b0af13044fc29..18f9dd7fb532d 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -819,8 +819,13 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
     }
   }();
 
-  if (DiagKind == SemaDiagnosticBuilder::K_Nop)
+  if (DiagKind == SemaDiagnosticBuilder::K_Nop) {
+    // For -fgpu-rdc, keep track of external kernels used by host functions.
+    if (LangOpts.CUDAIsDevice && LangOpts.GPURelocatableDeviceCode &&
+        Callee->hasAttr<CUDAGlobalAttr>() && !Callee->isDefined())
+      getASTContext().CUDAExternalDeviceDeclODRUsedByHost.insert(Callee);
     return true;
+  }
 
   // Avoid emitting this error twice for the same location.  Using a hashtable
   // like this is unfortunate, but because we must continue parsing as normal

diff  --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 375bd2ec59c26..da1fed4d72aec 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -17908,8 +17908,7 @@ MarkVarDeclODRUsed(VarDecl *Var, SourceLocation Loc, Sema &SemaRef,
       }
     } else if (VarTarget == Sema::CVT_Device &&
                (UserTarget == Sema::CFT_Host ||
-                UserTarget == Sema::CFT_HostDevice) &&
-               !Var->hasExternalStorage()) {
+                UserTarget == Sema::CFT_HostDevice)) {
       // Record a CUDA/HIP device side variable if it is ODR-used
       // by host code. This is done conservatively, when the variable is
       // referenced in any of the following contexts:
@@ -17920,7 +17919,10 @@ MarkVarDeclODRUsed(VarDecl *Var, SourceLocation Loc, Sema &SemaRef,
       // be visible in the device compilation for the compiler to be able to
       // emit template variables instantiated by host code only and to
       // externalize the static device side variable ODR-used by host code.
-      SemaRef.getASTContext().CUDADeviceVarODRUsedByHost.insert(Var);
+      if (!Var->hasExternalStorage())
+        SemaRef.getASTContext().CUDADeviceVarODRUsedByHost.insert(Var);
+      else if (SemaRef.LangOpts.GPURelocatableDeviceCode)
+        SemaRef.getASTContext().CUDAExternalDeviceDeclODRUsedByHost.insert(Var);
     }
   }
 

diff  --git a/clang/test/CodeGenCUDA/host-used-extern.cu b/clang/test/CodeGenCUDA/host-used-extern.cu
new file mode 100644
index 0000000000000..02b55cdb76b4c
--- /dev/null
+++ b/clang/test/CodeGenCUDA/host-used-extern.cu
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \
+// RUN:   -fgpu-rdc -std=c++11 -emit-llvm -o - -target-cpu gfx906 | FileCheck %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \
+// RUN:   -fgpu-rdc -std=c++11 -emit-llvm -o - -target-cpu gfx906 \
+// RUN:   | FileCheck -check-prefix=NEG %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \
+// RUN:   -std=c++11 -emit-llvm -o - -target-cpu gfx906 \
+// RUN:   | FileCheck -check-prefixes=NEG,NORDC %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: @gpu.used.external = appending {{.*}}global
+// CHECK-DAG: @_Z7kernel1v
+// CHECK-DAG: @_Z7kernel4v
+// CHECK-DAG: @var1
+// CHECK-LABEL: @llvm.compiler.used = {{.*}} @gpu.used.external
+
+// NEG-NOT: @gpu.used.external = {{.*}} @_Z7kernel2v
+// NEG-NOT: @gpu.used.external = {{.*}} @_Z7kernel3v
+// NEG-NOT: @gpu.used.external = {{.*}} @var2
+// NEG-NOT: @gpu.used.external = {{.*}} @var3
+// NORDC-NOT: @gpu.used.external = {{.*}} @_Z7kernel1v
+// NORDC-NOT: @gpu.used.external = {{.*}} @_Z7kernel4v
+// NORDC-NOT: @gpu.used.external = {{.*}} @var1
+
+__global__ void kernel1();
+
+// kernel2 is not marked as used since it is a definition.
+__global__ void kernel2() {}
+
+// kernel3 is not marked as used since it is not called by host function.
+__global__ void kernel3();
+
+// kernel4 is marked as used even though it is not called.
+__global__ void kernel4();
+
+extern __device__ int var1;
+
+__device__ int var2;
+
+extern __device__ int var3;
+
+void use(int *p);
+
+void test() {
+  kernel1<<<1, 1>>>();
+  void *p = (void*)kernel4;
+  use(&var1);
+}


        


More information about the cfe-commits mailing list