[clang] 0f7e863 - [LinkerWrapper] Perform device linking steps in parallel

Fri Nov 11 11:46:40 PST 2022

Author: Joseph Huber
Date: 2022-11-11T13:46:33-06:00
New Revision: 0f7e8631547ad2cbbe5bcf1933e1353e58f58406

URL: https://github.com/llvm/llvm-project/commit/0f7e8631547ad2cbbe5bcf1933e1353e58f58406
DIFF: https://github.com/llvm/llvm-project/commit/0f7e8631547ad2cbbe5bcf1933e1353e58f58406.diff

LOG: [LinkerWrapper] Perform device linking steps in parallel

This patch changes the device linking steps to be performed in parallel
when multiple offloading architectures are being used. We use the LLVM
parallelism support to accomplish this by simply doing each inidividual
device linking job in a single thread. This change required re-parsing
the input arguments as these arguments have internal state that would
not be properly shared between the threads otherwise.

By default, the parallelism uses all threads availible. But this can be
controlled with the `--wrapper-jobs=` option. This was required in a few
tests to ensure the ordering was still deterministic.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D136701

Added: 
    

Modified: 
    clang/test/Driver/linker-wrapper.c
    clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
    clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td

Removed: 
    


################################################################################
diff  --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index b2d73f3621087..7d958301f727b 100644

--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -107,7 +107,19 @@
 
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_52 {{.*}}.o
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o
-// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_52,file={{.*}}.out --image=profile=sm_70,file={{.*}}.out
+// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file={{.*}}.out --image=profile=sm_52,file={{.*}}.out 
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_80 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_75 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_70 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=4 \
+// RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-PAR
+
+// CUDA-PAR: fatbinary{{.*}}-64 --create {{.*}}.fatbin
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \
@@ -120,7 +132,7 @@
 
 // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o
 // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o
-// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
+// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \

diff  --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 6a12b64f7d7dd..42fde3d790fbf 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
@@ -1119,19 +1120,34 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
 /// be registered by the runtime.
 Expected<SmallVector<StringRef>>
 linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
-                       const InputArgList &Args) {
+                       const InputArgList &Args, char **Argv, int Argc) {
   llvm::TimeTraceScope TimeScope("Handle all device input");
 
-  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile, 4>> InputsForTarget;
+  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
   for (auto &File : LinkerInputFiles)
-    InputsForTarget[File].emplace_back(std::move(File));
+    InputMap[File].emplace_back(std::move(File));
   LinkerInputFiles.clear();
 
-  DenseMap<OffloadKind, SmallVector<OffloadingImage, 2>> Images;
-  for (auto &[ID, Input] : InputsForTarget) {
+  SmallVector<SmallVector<OffloadFile>> InputsForTarget;
+  for (auto &[ID, Input] : InputMap)
+    InputsForTarget.emplace_back(std::move(Input));
+  InputMap.clear();
+
+  std::mutex ImageMtx;
+  DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images;
+  auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
     llvm::TimeTraceScope TimeScope("Link device input");
 
-    auto LinkerArgs = getLinkerArgs(Input, Args);
+    // Each thread needs its own copy of the base arguments to maintain
+    // per-device argument storage of synthetic strings.
+    const OptTable &Tbl = getOptTable();
+    BumpPtrAllocator Alloc;
+    StringSaver Saver(Alloc);
+    auto BaseArgs =
+        Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [](StringRef Err) {
+          reportError(createStringError(inconvertibleErrorCode(), Err));
+        });
+    auto LinkerArgs = getLinkerArgs(Input, BaseArgs);
 
     DenseSet<OffloadKind> ActiveOffloadKinds;
     for (const auto &File : Input)
@@ -1142,7 +1158,7 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
     if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
       return std::move(Err);
 
-    // Write any remaining device inputs to an output file for the linker job.
+    // Write any remaining device inputs to an output file for the linker.
     for (const OffloadFile &File : Input) {
       auto FileNameOrErr = writeOffloadFile(File);
       if (!FileNameOrErr)
@@ -1150,7 +1166,7 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
       InputFiles.emplace_back(*FileNameOrErr);
     }
 
-    // Link the remaining device files, if necessary, using the device linker.
+    // Link the remaining device files using the device linker.
     llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
     bool RequiresLinking =
         !Args.hasArg(OPT_embed_bitcode) &&
@@ -1171,17 +1187,31 @@ linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
       TheImage.TheImageKind = IMG_Object;
       TheImage.TheOffloadKind = Kind;
       TheImage.StringData = {
-          {"triple", LinkerArgs.getLastArgValue(OPT_triple_EQ)},
-          {"arch", LinkerArgs.getLastArgValue(OPT_arch_EQ)}};
+          {"triple",
+           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ))},
+          {"arch",
+           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ))}};
       TheImage.Image = std::move(*FileOrErr);
+
+      std::lock_guard<decltype(ImageMtx)> Guard(ImageMtx);
       Images[Kind].emplace_back(std::move(TheImage));
     }
-  }
+    return Error::success();
+  });
+  if (Err)
+    return std::move(Err);
 
   // Create a binary image of each offloading image and embed it into a new
   // object file.
   SmallVector<StringRef> WrappedOutput;
-  for (const auto &[Kind, Input] : Images) {
+  for (auto &[Kind, Input] : Images) {
+    // We sort the entries before bundling so they appear in a deterministic
+    // order in the final binary.
+    llvm::sort(Input, [](OffloadingImage &A, OffloadingImage &B) {
+      return A.StringData["triple"].compare(B.StringData["triple"]) == 1 ||
+             A.StringData["arch"].compare(B.StringData["arch"]) == 1 ||
+             A.TheOffloadKind < B.TheOffloadKind;
+    });
     auto BundledImagesOrErr = bundleLinkedOutput(Input, Args, Kind);
     if (!BundledImagesOrErr)
       return BundledImagesOrErr.takeError();
@@ -1362,6 +1392,16 @@ int main(int Argc, char **Argv) {
   if (!CudaBinaryPath.empty())
     CudaBinaryPath = CudaBinaryPath + "/bin";
 
+  parallel::strategy = hardware_concurrency(1);
+  if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) {
+    unsigned Threads = 0;
+    if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0)
+      reportError(createStringError(
+          inconvertibleErrorCode(), "%s: expected a positive integer, got '%s'",
+          Arg->getSpelling().data(), Arg->getValue()));
+    parallel::strategy = hardware_concurrency(Threads);
+  }
+
   if (Args.hasArg(OPT_wrapper_time_trace_eq)) {
     unsigned Granularity;
     Args.getLastArgValue(OPT_wrapper_time_trace_granularity, "500")
@@ -1378,7 +1418,8 @@ int main(int Argc, char **Argv) {
       reportError(DeviceInputFiles.takeError());
 
     // Link and wrap the device images extracted from the linker input.
-    auto FilesOrErr = linkAndWrapDeviceFiles(*DeviceInputFiles, Args);
+    auto FilesOrErr =
+        linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, Argc);
     if (!FilesOrErr)
       reportError(FilesOrErr.takeError());
 

diff  --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
index 07ce8b03a06ef..c12aac77c3b69 100644
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -59,6 +59,10 @@ def wrapper_time_trace_granularity : Joined<["--"], "wrapper-time-trace-granular
   Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">,
   HelpText<"Set the granularity of time-trace updates">;
 
+def wrapper_jobs : Joined<["--"], "wrapper-jobs=">,
+  Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">,
+  HelpText<"Sets the number of parallel jobs to use for device linking">;
+
 // Flags passed to the device linker.
 def arch_EQ : Joined<["--"], "arch=">,
   Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"<arch>">,