[clang] 3530c35 - [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility

Sat Apr 23 09:42:56 PDT 2022

Author: Joseph Huber
Date: 2022-04-23T12:42:40-04:00
New Revision: 3530c35c660919b9367f1ac598abfb9a569e7606

URL: https://github.com/llvm/llvm-project/commit/3530c35c660919b9367f1ac598abfb9a569e7606
DIFF: https://github.com/llvm/llvm-project/commit/3530c35c660919b9367f1ac598abfb9a569e7606.diff

LOG: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility

When we do LTO we consider ourselves to have whole program visibility if
every single input file we have contains LLVM bitcode. If we have whole
program visibliity then we can create a single image and utilize CUDA's
non-RDC mode by not passing `-c` to `ptxas` and ignoring the `nvlink`
job. This should be faster for some situations and also saves us the
time executing `nvlink`.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D124292

Added: 
    

Modified: 
    clang/test/Driver/linker-wrapper.c
    clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Removed: 
    


################################################################################
diff  --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 5ec99f5fe5b03..7920fe8c1a990 100644

--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -38,5 +38,5 @@
 // RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
 // RUN:   /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
 
-// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s
-// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin
+// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s
+// LTO-NOT: nvlink

diff  --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index b52dda13ac200..2c14c893c6424 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -595,7 +595,7 @@ extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer,
 // TODO: Move these to a separate file.
 namespace nvptx {
 Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
-                               StringRef Arch) {
+                               StringRef Arch, bool RDC = true) {
   // NVPTX uses the ptxas binary to create device object files.
   Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});
   if (!PtxasPath)
@@ -626,7 +626,8 @@ Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
   CmdArgs.push_back(Opt);
   CmdArgs.push_back("--gpu-name");
   CmdArgs.push_back(Arch);
-  CmdArgs.push_back("-c");
+  if (RDC)
+    CmdArgs.push_back("-c");
 
   CmdArgs.push_back(InputFile);
 
@@ -933,7 +934,8 @@ bool isValidCIdentifier(StringRef S) {
 }
 
 Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
-                       const Triple &TheTriple, StringRef Arch) {
+                       const Triple &TheTriple, StringRef Arch,
+                       bool &WholeProgram) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;
   SmallVector<std::string, 4> NewInputFiles;
@@ -1009,7 +1011,7 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
   };
 
   // We assume visibility of the whole program if every input file was bitcode.
-  bool WholeProgram = BitcodeFiles.size() == InputFiles.size();
+  WholeProgram = BitcodeFiles.size() == InputFiles.size();
   auto LTOBackend =
       (EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode)
                      : createLTO(TheTriple, Arch, WholeProgram);
@@ -1089,7 +1091,7 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
   // Is we are compiling for NVPTX we need to run the assembler first.
   if (TheTriple.isNVPTX() && !EmbedBitcode) {
     for (auto &File : Files) {
-      auto FileOrErr = nvptx::assemble(File, TheTriple, Arch);
+      auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram);
       if (!FileOrErr)
         return FileOrErr.takeError();
       File = *FileOrErr;
@@ -1117,10 +1119,11 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    bool WholeProgram = false;
 
     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err =
-            linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))
+    if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
+                                     File.Arch, WholeProgram))
       return Err;
 
     // If we are embedding bitcode for JIT, skip the final device linking.
@@ -1130,6 +1133,14 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
       continue;
     }
 
+    // If we performed LTO on NVPTX and had whole program visibility, we can use
+    // CUDA in non-RDC mode.
+    if (WholeProgram && TheTriple.isNVPTX()) {
+      assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
+      LinkedImages.push_back(LinkerInput.getSecond().front());
+      continue;
+    }
+
     auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();