[clang] 3530c35 - [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility
Joseph Huber via cfe-commits
cfe-commits at lists.llvm.org
Sat Apr 23 09:42:56 PDT 2022
Author: Joseph Huber
Date: 2022-04-23T12:42:40-04:00
New Revision: 3530c35c660919b9367f1ac598abfb9a569e7606
URL: https://github.com/llvm/llvm-project/commit/3530c35c660919b9367f1ac598abfb9a569e7606
DIFF: https://github.com/llvm/llvm-project/commit/3530c35c660919b9367f1ac598abfb9a569e7606.diff
LOG: [OpenMP] Use CUDA's non-RDC mode when LTO has whole program visibility
When we do LTO we consider ourselves to have whole program visibility if
every single input file we have contains LLVM bitcode. If we have whole
program visibility then we can create a single image and utilize CUDA's
non-RDC mode by not passing `-c` to `ptxas` and ignoring the `nvlink`
job. This should be faster for some situations and also saves us the
time executing `nvlink`.
Reviewed By: tra
Differential Revision: https://reviews.llvm.org/D124292
Added:
Modified:
clang/test/Driver/linker-wrapper.c
clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Removed:
################################################################################
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 5ec99f5fe5b03..7920fe8c1a990 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -38,5 +38,5 @@
// RUN: clang-linker-wrapper --host-triple x86_64-unknown-linux-gnu --dry-run -linker-path \
// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
-// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 -c {{.*}}.s
-// LTO: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.cubin
+// LTO: ptxas{{.*}}-m64 -o {{.*}}.cubin -O2 --gpu-name sm_70 {{.*}}.s
+// LTO-NOT: nvlink
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index b52dda13ac200..2c14c893c6424 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -595,7 +595,7 @@ extractFromBuffer(std::unique_ptr<MemoryBuffer> Buffer,
// TODO: Move these to a separate file.
namespace nvptx {
Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
- StringRef Arch) {
+ StringRef Arch, bool RDC = true) {
// NVPTX uses the ptxas binary to create device object files.
Expected<std::string> PtxasPath = findProgram("ptxas", {CudaBinaryPath});
if (!PtxasPath)
@@ -626,7 +626,8 @@ Expected<std::string> assemble(StringRef InputFile, Triple TheTriple,
CmdArgs.push_back(Opt);
CmdArgs.push_back("--gpu-name");
CmdArgs.push_back(Arch);
- CmdArgs.push_back("-c");
+ if (RDC)
+ CmdArgs.push_back("-c");
CmdArgs.push_back(InputFile);
@@ -933,7 +934,8 @@ bool isValidCIdentifier(StringRef S) {
}
Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
- const Triple &TheTriple, StringRef Arch) {
+ const Triple &TheTriple, StringRef Arch,
+ bool &WholeProgram) {
SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
SmallVector<std::unique_ptr<lto::InputFile>, 4> BitcodeFiles;
SmallVector<std::string, 4> NewInputFiles;
@@ -1009,7 +1011,7 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
};
// We assume visibility of the whole program if every input file was bitcode.
- bool WholeProgram = BitcodeFiles.size() == InputFiles.size();
+ WholeProgram = BitcodeFiles.size() == InputFiles.size();
auto LTOBackend =
(EmbedBitcode) ? createLTO(TheTriple, Arch, WholeProgram, OutputBitcode)
: createLTO(TheTriple, Arch, WholeProgram);
@@ -1089,7 +1091,7 @@ Error linkBitcodeFiles(SmallVectorImpl<std::string> &InputFiles,
// Is we are compiling for NVPTX we need to run the assembler first.
if (TheTriple.isNVPTX() && !EmbedBitcode) {
for (auto &File : Files) {
- auto FileOrErr = nvptx::assemble(File, TheTriple, Arch);
+ auto FileOrErr = nvptx::assemble(File, TheTriple, Arch, !WholeProgram);
if (!FileOrErr)
return FileOrErr.takeError();
File = *FileOrErr;
@@ -1117,10 +1119,11 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
for (auto &LinkerInput : LinkerInputMap) {
DeviceFile &File = LinkerInput.getFirst();
Triple TheTriple = Triple(File.TheTriple);
+ bool WholeProgram = false;
// Run LTO on any bitcode files and replace the input with the result.
- if (Error Err =
- linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))
+ if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
+ File.Arch, WholeProgram))
return Err;
// If we are embedding bitcode for JIT, skip the final device linking.
@@ -1130,6 +1133,14 @@ Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
continue;
}
+ // If we performed LTO on NVPTX and had whole program visibility, we can use
+ // CUDA in non-RDC mode.
+ if (WholeProgram && TheTriple.isNVPTX()) {
+ assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
+ LinkedImages.push_back(LinkerInput.getSecond().front());
+ continue;
+ }
+
auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
if (!ImageOrErr)
return ImageOrErr.takeError();
More information about the cfe-commits
mailing list