[lld] [llvm] Make WriteIndexesThinBackend multi threaded (PR #109847)

Sun Oct 6 21:27:07 PDT 2024

https://github.com/NuriAmari updated https://github.com/llvm/llvm-project/pull/109847

>From 445bd565090292274bccf596aee3eae792c78929 Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Tue, 24 Sep 2024 11:29:54 -0700
Subject: [PATCH 1/4] Make WriteIndexesThinBackend multi threaded

We've noticed that for large builds executing thin-link can take on the
order of 10s of minutes. We are only using a single thread to write the
sharded indices and import files for each input bitcode file. While we
need to ensure the index files produced list modules in a deterministic
order, that doesn't prevent us from executing the rest of the work in parallel.

In this change we use a thread pool to execute as much of the backend's
work as possible in parallel. In local testing on a machine with 80
cores, this change makes a thin-link for ~100,000 input files run in ~2 minutes.
Without this change it takes upwards of 10 minutes.
---
 llvm/lib/LTO/LTO.cpp | 70 ++++++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index b5eb7953f23b09..3604b17b176dea 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1395,11 +1395,12 @@ class lto::ThinBackendProc {
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
   virtual Error wait() = 0;
   virtual unsigned getThreadCount() = 0;
+  virtual bool isSensitiveToInputOrder() { return false; }
 
   // Write sharded indices and (optionally) imports to disk
   Error emitFiles(const FunctionImporter::ImportMapTy &ImportList,
                   llvm::StringRef ModulePath,
-                  const std::string &NewModulePath) {
+                  const std::string &NewModulePath) const {
     ModuleToSummariesForIndexTy ModuleToSummariesForIndex;
     GVSummaryPtrSet DeclarationSummaries;
 
@@ -1614,6 +1615,10 @@ namespace {
 class WriteIndexesThinBackend : public ThinBackendProc {
   std::string OldPrefix, NewPrefix, NativeObjectPrefix;
   raw_fd_ostream *LinkedObjectsFile;
+  DefaultThreadPool BackendThreadPool;
+  std::optional<Error> Err;
+  std::mutex ErrMu;
+  std::mutex OnWriteMu;
 
 public:
   WriteIndexesThinBackend(
@@ -1635,8 +1640,6 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
     StringRef ModulePath = BM.getModuleIdentifier();
-    std::string NewModulePath =
-        getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
 
     if (LinkedObjectsFile) {
       std::string ObjectPrefix =
@@ -1646,19 +1649,48 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       *LinkedObjectsFile << LinkedObjectsFilePath << '\n';
     }
 
-    if (auto E = emitFiles(ImportList, ModulePath, NewModulePath))
-      return E;
+    BackendThreadPool.async(
+        [this](const StringRef ModulePath,
+               const FunctionImporter::ImportMapTy &ImportList,
+               const std::string &OldPrefix, const std::string &NewPrefix) {
+          std::string NewModulePath =
+              getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
+          auto E = emitFiles(ImportList, ModulePath, NewModulePath);
+          if (E) {
+            std::unique_lock<std::mutex> L(ErrMu);
+            if (Err)
+              Err = joinErrors(std::move(*Err), std::move(E));
+            else
+              Err = std::move(E);
+            return;
+          }
+          if (OnWrite) {
+            // Serialize calls to the on write callback in case it is not thread
+            // safe
+            std::unique_lock<std::mutex> L(OnWriteMu);
+            OnWrite(std::string(ModulePath));
+          }
+        },
+        ModulePath, ImportList, OldPrefix, NewPrefix);
+    return Error::success();
+  }
 
-    if (OnWrite)
-      OnWrite(std::string(ModulePath));
+  Error wait() override {
+    BackendThreadPool.wait();
+    if (Err)
+      return std::move(*Err);
     return Error::success();
   }
 
-  Error wait() override { return Error::success(); }
+  unsigned getThreadCount() override {
+    return BackendThreadPool.getMaxConcurrency();
+  }
 
-  // WriteIndexesThinBackend should always return 1 to prevent module
-  // re-ordering and avoid non-determinism in the final link.
-  unsigned getThreadCount() override { return 1; }
+  bool isSensitiveToInputOrder() override {
+    // The order which modules are written to LinkedObjectsFile should be
+    // deterministic and match the order they are passed on the command line.
+    return true;
+  }
 };
 } // end anonymous namespace
 
@@ -1854,20 +1886,20 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
           ResolvedODR[Mod.first], ThinLTO.ModuleMap);
     };
 
-    if (BackendProcess->getThreadCount() == 1) {
-      // Process the modules in the order they were provided on the
-      // command-line. It is important for this codepath to be used for
-      // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
-      // ThinLTO objects in the same order as the inputs, which otherwise would
-      // affect the final link order.
+    if (BackendProcess->getThreadCount() == 1 ||
+        BackendProcess->isSensitiveToInputOrder()) {
+      // Process the modules in the order they were provided on the command-line.
+      // It is important for this codepath to be used for WriteIndexesThinBackend,
+      // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
+      // order as the inputs, which otherwise would affect the final link order.
       for (int I = 0, E = ModuleMap.size(); I != E; ++I)
         if (Error E = ProcessOneModule(I))
           return E;
     } else {
       // When executing in parallel, process largest bitsize modules first to
       // improve parallelism, and avoid starving the thread pool near the end.
-      // This saves about 15 sec on a 36-core machine while link `clang.exe`
-      // (out of 100 sec).
+      // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
+      // of 100 sec).
       std::vector<BitcodeModule *> ModulesVec;
       ModulesVec.reserve(ModuleMap.size());
       for (auto &Mod : ModuleMap)

>From 00ad4a0b6406da0a4aa7a511ef6f41b06f93445f Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Wed, 25 Sep 2024 13:03:59 -0700
Subject: [PATCH 2/4] Address PR Feedback #1

---
 llvm/lib/LTO/LTO.cpp | 63 +++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 3604b17b176dea..fbbc9da5323d94 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1376,15 +1376,21 @@ class lto::ThinBackendProc {
   const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries;
   lto::IndexWriteCallback OnWrite;
   bool ShouldEmitImportsFiles;
+  DefaultThreadPool BackendThreadPool;
+  std::optional<Error> Err;
+  std::mutex ErrMu;
+  std::mutex OnWriteMu;
 
 public:
   ThinBackendProc(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
-      lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles)
+      lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles,
+      ThreadPoolStrategy ThinLTOParallelism)
       : Conf(Conf), CombinedIndex(CombinedIndex),
         ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries),
-        OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {}
+        OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles),
+        BackendThreadPool(ThinLTOParallelism) {}
 
   virtual ~ThinBackendProc() = default;
   virtual Error start(
@@ -1393,8 +1399,13 @@ class lto::ThinBackendProc {
       const FunctionImporter::ExportSetTy &ExportList,
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
-  virtual Error wait() = 0;
-  virtual unsigned getThreadCount() = 0;
+  Error wait() {
+    BackendThreadPool.wait();
+    if (Err)
+      return std::move(*Err);
+    return Error::success();
+  }
+  unsigned getThreadCount() { return BackendThreadPool.getMaxConcurrency(); }
   virtual bool isSensitiveToInputOrder() { return false; }
 
   // Write sharded indices and (optionally) imports to disk
@@ -1429,15 +1440,11 @@ class lto::ThinBackendProc {
 
 namespace {
 class InProcessThinBackend : public ThinBackendProc {
-  DefaultThreadPool BackendThreadPool;
   AddStreamFn AddStream;
   FileCache Cache;
   DenseSet<GlobalValue::GUID> CfiFunctionDefs;
   DenseSet<GlobalValue::GUID> CfiFunctionDecls;
 
-  std::optional<Error> Err;
-  std::mutex ErrMu;
-
   bool ShouldEmitIndexFiles;
 
 public:
@@ -1448,9 +1455,9 @@ class InProcessThinBackend : public ThinBackendProc {
       AddStreamFn AddStream, FileCache Cache, lto::IndexWriteCallback OnWrite,
       bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
-                        OnWrite, ShouldEmitImportsFiles),
-        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
-        Cache(std::move(Cache)), ShouldEmitIndexFiles(ShouldEmitIndexFiles) {
+                        OnWrite, ShouldEmitImportsFiles, ThinLTOParallelism),
+        AddStream(std::move(AddStream)), Cache(std::move(Cache)),
+        ShouldEmitIndexFiles(ShouldEmitIndexFiles) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
@@ -1547,18 +1554,6 @@ class InProcessThinBackend : public ThinBackendProc {
       OnWrite(std::string(ModulePath));
     return Error::success();
   }
-
-  Error wait() override {
-    BackendThreadPool.wait();
-    if (Err)
-      return std::move(*Err);
-    else
-      return Error::success();
-  }
-
-  unsigned getThreadCount() override {
-    return BackendThreadPool.getMaxConcurrency();
-  }
 };
 } // end anonymous namespace
 
@@ -1615,10 +1610,6 @@ namespace {
 class WriteIndexesThinBackend : public ThinBackendProc {
   std::string OldPrefix, NewPrefix, NativeObjectPrefix;
   raw_fd_ostream *LinkedObjectsFile;
-  DefaultThreadPool BackendThreadPool;
-  std::optional<Error> Err;
-  std::mutex ErrMu;
-  std::mutex OnWriteMu;
 
 public:
   WriteIndexesThinBackend(
@@ -1628,7 +1619,8 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       std::string NativeObjectPrefix, bool ShouldEmitImportsFiles,
       raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
-                        OnWrite, ShouldEmitImportsFiles),
+                        OnWrite, ShouldEmitImportsFiles,
+                        /* ThinLTOParallelism */ hardware_concurrency()),
         OldPrefix(OldPrefix), NewPrefix(NewPrefix),
         NativeObjectPrefix(NativeObjectPrefix),
         LinkedObjectsFile(LinkedObjectsFile) {}
@@ -1641,6 +1633,10 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
     StringRef ModulePath = BM.getModuleIdentifier();
 
+    // The contents of this file may be used as input to a native link, and must
+    // therefore contain the processed modules in a determinstic order than
+    // match the order they are provided on the command line. For that reason,
+    // we cannot include this in the asynchronously executed lambda below.
     if (LinkedObjectsFile) {
       std::string ObjectPrefix =
           NativeObjectPrefix.empty() ? NewPrefix : NativeObjectPrefix;
@@ -1675,17 +1671,6 @@ class WriteIndexesThinBackend : public ThinBackendProc {
     return Error::success();
   }
 
-  Error wait() override {
-    BackendThreadPool.wait();
-    if (Err)
-      return std::move(*Err);
-    return Error::success();
-  }
-
-  unsigned getThreadCount() override {
-    return BackendThreadPool.getMaxConcurrency();
-  }
-
   bool isSensitiveToInputOrder() override {
     // The order which modules are written to LinkedObjectsFile should be
     // deterministic and match the order they are passed on the command line.

>From 82a849f133ba21650ad2d70227c99c84e544f2e3 Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Fri, 4 Oct 2024 08:13:24 -0700
Subject: [PATCH 3/4] Address PR Feedback #2

---
 llvm/lib/LTO/LTO.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index fbbc9da5323d94..3a18814168e425 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1634,7 +1634,7 @@ class WriteIndexesThinBackend : public ThinBackendProc {
     StringRef ModulePath = BM.getModuleIdentifier();
 
     // The contents of this file may be used as input to a native link, and must
-    // therefore contain the processed modules in a determinstic order than
+    // therefore contain the processed modules in a deterministic order that
     // match the order they are provided on the command line. For that reason,
     // we cannot include this in the asynchronously executed lambda below.
     if (LinkedObjectsFile) {

>From 33fb21bddd893ce28bd0de602ee0de37ff75d57e Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Sun, 6 Oct 2024 18:31:12 -0700
Subject: [PATCH 4/4] Address PR Comments #3

---
 lld/COFF/LTO.cpp                              |  1 +
 lld/ELF/LTO.cpp                               |  1 +
 lld/MachO/LTO.cpp                             |  1 +
 lld/test/COFF/thinlto-emit-imports.ll         |  2 +-
 lld/test/ELF/lto/thinlto-cant-write-index.ll  |  2 +-
 lld/test/ELF/lto/thinlto-emit-imports.ll      |  2 +-
 lld/test/MachO/thinlto-emit-imports.ll        |  2 +-
 llvm/include/llvm/LTO/LTO.h                   |  3 +-
 llvm/include/llvm/Support/Threading.h         | 12 +++++++
 .../llvm/Transforms/IPO/FunctionImport.h      |  6 ++--
 llvm/lib/LTO/LTO.cpp                          | 36 +++++++++----------
 llvm/lib/LTO/ThinLTOCodeGenerator.cpp         |  5 ++-
 llvm/lib/Transforms/IPO/FunctionImport.cpp    |  7 ++--
 llvm/tools/gold/gold-plugin.cpp               |  2 +-
 llvm/tools/llvm-lto2/llvm-lto2.cpp            |  3 +-
 15 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp
index 5c881bc01c663d..da73fe7763ceea 100644
--- a/lld/COFF/LTO.cpp
+++ b/lld/COFF/LTO.cpp
@@ -118,6 +118,7 @@ BitcodeCompiler::BitcodeCompiler(COFFLinkerContext &c) : ctx(c) {
   if (ctx.config.thinLTOIndexOnly) {
     auto OnIndexWrite = [&](StringRef S) { thinIndices.erase(S); };
     backend = lto::createWriteIndexesThinBackend(
+        llvm::hardware_concurrency(ctx.config.thinLTOJobs),
         std::string(ctx.config.thinLTOPrefixReplaceOld),
         std::string(ctx.config.thinLTOPrefixReplaceNew),
         std::string(ctx.config.thinLTOPrefixReplaceNativeObject),
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index 4df1b0c289eef8..4773c5bf7870c9 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -179,6 +179,7 @@ BitcodeCompiler::BitcodeCompiler(Ctx &ctx) : ctx(ctx) {
   auto onIndexWrite = [&](StringRef s) { thinIndices.erase(s); };
   if (ctx.arg.thinLTOIndexOnly) {
     backend = lto::createWriteIndexesThinBackend(
+        llvm::hardware_concurrency(ctx.arg.thinLTOJobs),
         std::string(ctx.arg.thinLTOPrefixReplaceOld),
         std::string(ctx.arg.thinLTOPrefixReplaceNew),
         std::string(ctx.arg.thinLTOPrefixReplaceNativeObject),
diff --git a/lld/MachO/LTO.cpp b/lld/MachO/LTO.cpp
index 6527cbb68f2498..28f5290edb58e3 100644
--- a/lld/MachO/LTO.cpp
+++ b/lld/MachO/LTO.cpp
@@ -87,6 +87,7 @@ BitcodeCompiler::BitcodeCompiler() {
   auto onIndexWrite = [&](StringRef S) { thinIndices.erase(S); };
   if (config->thinLTOIndexOnly) {
     backend = lto::createWriteIndexesThinBackend(
+        llvm::hardware_concurrency(config->thinLTOJobs),
         std::string(config->thinLTOPrefixReplaceOld),
         std::string(config->thinLTOPrefixReplaceNew),
         std::string(config->thinLTOPrefixReplaceNativeObject),
diff --git a/lld/test/COFF/thinlto-emit-imports.ll b/lld/test/COFF/thinlto-emit-imports.ll
index b47a6cea4eb7df..26af017b17b2c5 100644
--- a/lld/test/COFF/thinlto-emit-imports.ll
+++ b/lld/test/COFF/thinlto-emit-imports.ll
@@ -35,7 +35,7 @@
 ; RUN: not lld-link -entry:main -thinlto-index-only \
 ; RUN:     -thinlto-emit-imports-files %t1.obj %t2.obj %t3.obj \
 ; RUN:     -out:%t4.exe 2>&1 | FileCheck -DMSG=%errc_EACCES %s --check-prefix=ERR
-; ERR: cannot open {{.*}}3.obj.imports: [[MSG]]
+; ERR: 'cannot open {{.*}}3.obj.imports': [[MSG]]
 
 ; Ensure lld doesn't generate import files when thinlto-index-only is not enabled
 ; RUN: rm -f %t1.obj.imports
diff --git a/lld/test/ELF/lto/thinlto-cant-write-index.ll b/lld/test/ELF/lto/thinlto-cant-write-index.ll
index 286fcddd4238a1..550305986ecd5b 100644
--- a/lld/test/ELF/lto/thinlto-cant-write-index.ll
+++ b/lld/test/ELF/lto/thinlto-cant-write-index.ll
@@ -10,7 +10,7 @@
 ; RUN: chmod u-w %t2.o.thinlto.bc
 ; RUN: not ld.lld --plugin-opt=thinlto-index-only -shared %t1.o %t2.o -o /dev/null 2>&1 | FileCheck -DMSG=%errc_EACCES %s
 ; RUN: chmod u+w %t2.o.thinlto.bc
-; CHECK: cannot open {{.*}}2.o.thinlto.bc: [[MSG]]
+; CHECK: 'cannot open {{.*}}2.o.thinlto.bc': [[MSG]]
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll
index 253ec08619c982..1807a3b59d81cb 100644
--- a/lld/test/ELF/lto/thinlto-emit-imports.ll
+++ b/lld/test/ELF/lto/thinlto-emit-imports.ll
@@ -10,7 +10,7 @@
 ; RUN: touch %t3.o.imports
 ; RUN: chmod 400 %t3.o.imports
 ; RUN: not ld.lld --plugin-opt=thinlto-index-only --plugin-opt=thinlto-emit-imports-files -shared %t1.o %t2.o %t3.o -o /dev/null 2>&1 | FileCheck -DMSG=%errc_EACCES %s --check-prefix=ERR
-; ERR: cannot open {{.*}}3.o.imports: [[MSG]]
+; ERR: 'cannot open {{.*}}3.o.imports': [[MSG]]
 
 ; RUN: rm -f %t1.o.imports %t2.o.imports rm -f %t3.o.imports
 ; RUN: ld.lld --plugin-opt=thinlto-emit-imports-files -shared %t1.o %t2.o %t3.o -o %t4
diff --git a/lld/test/MachO/thinlto-emit-imports.ll b/lld/test/MachO/thinlto-emit-imports.ll
index 88f766f59c8877..90ee6a56b93b8f 100644
--- a/lld/test/MachO/thinlto-emit-imports.ll
+++ b/lld/test/MachO/thinlto-emit-imports.ll
@@ -33,7 +33,7 @@
 ; RUN: chmod 400 %t3.o.imports
 ; RUN: not %lld --thinlto-index-only --thinlto-emit-imports-files -dylib %t1.o %t2.o %t3.o -o /dev/null 2>&1 \
 ; RUN:     | FileCheck -DMSG=%errc_EACCES %s --check-prefix=ERR
-; ERR: cannot open {{.*}}3.o.imports: [[MSG]]
+; ERR: 'cannot open {{.*}}3.o.imports': [[MSG]]
 
 ; Ensure lld doesn't generate import files when thinlto-index-only is not enabled
 ; RUN: rm -f %t1.o.imports
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index a281c377f2601d..5c47c4df7f6a38 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -231,7 +231,8 @@ ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism,
 /// the objects with NativeObjectPrefix instead of NewPrefix. OnWrite is
 /// callback which receives module identifier and notifies LTO user that index
 /// file for the module (and optionally imports file) was created.
-ThinBackend createWriteIndexesThinBackend(std::string OldPrefix,
+ThinBackend createWriteIndexesThinBackend(ThreadPoolStrategy Parallelism,
+                                          std::string OldPrefix,
                                           std::string NewPrefix,
                                           std::string NativeObjectPrefix,
                                           bool ShouldEmitImportsFiles,
diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h
index ba6c531ab4db21..d8e2cb0514ddd7 100644
--- a/llvm/include/llvm/Support/Threading.h
+++ b/llvm/include/llvm/Support/Threading.h
@@ -188,6 +188,18 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; }
     return S;
   }
 
+  /// Like hardware_concurrency() above, but builds a strategy
+  /// based on the rules described for get_threadpool_strategy().
+  /// If \p Num is invalid, returns a default strategy where one thread per
+  /// hardware core is used.
+  inline ThreadPoolStrategy hardware_concurrency(StringRef Num) {
+    std::optional<ThreadPoolStrategy> S =
+        get_threadpool_strategy(Num, hardware_concurrency());
+    if (S)
+      return *S;
+    return hardware_concurrency();
+  }
+
   /// Returns an optimal thread strategy to execute specified amount of tasks.
   /// This strategy should prevent us from creating too many threads if we
   /// occasionaly have an unexpectedly small amount of tasks.
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 4b29d3f40ab7b5..3623f9194d4d13 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -417,9 +417,9 @@ void gatherImportedSummariesForModule(
     GVSummaryPtrSet &DecSummaries);
 
 /// Emit into \p OutputFilename the files module \p ModulePath will import from.
-std::error_code
-EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
-                 const ModuleToSummariesForIndexTy &ModuleToSummariesForIndex);
+Error EmitImportsFiles(
+    StringRef ModulePath, StringRef OutputFilename,
+    const ModuleToSummariesForIndexTy &ModuleToSummariesForIndex);
 
 /// Based on the information recorded in the summaries during global
 /// summary-based analysis:
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 3a18814168e425..900dd2e2b15487 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1379,7 +1379,6 @@ class lto::ThinBackendProc {
   DefaultThreadPool BackendThreadPool;
   std::optional<Error> Err;
   std::mutex ErrMu;
-  std::mutex OnWriteMu;
 
 public:
   ThinBackendProc(
@@ -1423,16 +1422,17 @@ class lto::ThinBackendProc {
     raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
                       sys::fs::OpenFlags::OF_None);
     if (EC)
-      return errorCodeToError(EC);
+      return createFileError("cannot open " + NewModulePath + ".thinlto.bc",
+                             EC);
 
     writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
                      &DeclarationSummaries);
 
     if (ShouldEmitImportsFiles) {
-      EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
-                            ModuleToSummariesForIndex);
-      if (EC)
-        return errorCodeToError(EC);
+      Error ImportFilesError = EmitImportsFiles(
+          ModulePath, NewModulePath + ".imports", ModuleToSummariesForIndex);
+      if (ImportFilesError)
+        return ImportFilesError;
     }
     return Error::success();
   }
@@ -1614,13 +1614,13 @@ class WriteIndexesThinBackend : public ThinBackendProc {
 public:
   WriteIndexesThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       std::string OldPrefix, std::string NewPrefix,
       std::string NativeObjectPrefix, bool ShouldEmitImportsFiles,
       raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
-                        OnWrite, ShouldEmitImportsFiles,
-                        /* ThinLTOParallelism */ hardware_concurrency()),
+                        OnWrite, ShouldEmitImportsFiles, ThinLTOParallelism),
         OldPrefix(OldPrefix), NewPrefix(NewPrefix),
         NativeObjectPrefix(NativeObjectPrefix),
         LinkedObjectsFile(LinkedObjectsFile) {}
@@ -1660,14 +1660,11 @@ class WriteIndexesThinBackend : public ThinBackendProc {
               Err = std::move(E);
             return;
           }
-          if (OnWrite) {
-            // Serialize calls to the on write callback in case it is not thread
-            // safe
-            std::unique_lock<std::mutex> L(OnWriteMu);
-            OnWrite(std::string(ModulePath));
-          }
         },
         ModulePath, ImportList, OldPrefix, NewPrefix);
+
+    if (OnWrite)
+      OnWrite(std::string(ModulePath));
     return Error::success();
   }
 
@@ -1680,16 +1677,17 @@ class WriteIndexesThinBackend : public ThinBackendProc {
 } // end anonymous namespace
 
 ThinBackend lto::createWriteIndexesThinBackend(
-    std::string OldPrefix, std::string NewPrefix,
-    std::string NativeObjectPrefix, bool ShouldEmitImportsFiles,
-    raw_fd_ostream *LinkedObjectsFile, IndexWriteCallback OnWrite) {
+    ThreadPoolStrategy Parallelism, std::string OldPrefix,
+    std::string NewPrefix, std::string NativeObjectPrefix,
+    bool ShouldEmitImportsFiles, raw_fd_ostream *LinkedObjectsFile,
+    IndexWriteCallback OnWrite) {
   return
       [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
           const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
           AddStreamFn AddStream, FileCache Cache) {
         return std::make_unique<WriteIndexesThinBackend>(
-            Conf, CombinedIndex, ModuleToDefinedGVSummaries, OldPrefix,
-            NewPrefix, NativeObjectPrefix, ShouldEmitImportsFiles,
+            Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
+            OldPrefix, NewPrefix, NativeObjectPrefix, ShouldEmitImportsFiles,
             LinkedObjectsFile, OnWrite);
       };
 }
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 76268c950cf581..8074f8690cc1ce 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -837,9 +837,8 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName,
       ModuleIdentifier, ModuleToDefinedGVSummaries,
       ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
 
-  std::error_code EC;
-  if ((EC = EmitImportsFiles(ModuleIdentifier, OutputName,
-                             ModuleToSummariesForIndex)))
+  if (Error EC = EmitImportsFiles(ModuleIdentifier, OutputName,
+                                  ModuleToSummariesForIndex))
     report_fatal_error(Twine("Failed to open ") + OutputName +
                        " to save imports lists\n");
 }
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 261731fd565b02..fee27f72f208b0 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1553,20 +1553,21 @@ void llvm::gatherImportedSummariesForModule(
 }
 
 /// Emit the files \p ModulePath will import from into \p OutputFilename.
-std::error_code llvm::EmitImportsFiles(
+Error llvm::EmitImportsFiles(
     StringRef ModulePath, StringRef OutputFilename,
     const ModuleToSummariesForIndexTy &ModuleToSummariesForIndex) {
   std::error_code EC;
   raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_Text);
   if (EC)
-    return EC;
+    return createFileError("cannot open " + OutputFilename,
+                           errorCodeToError(EC));
   for (const auto &ILI : ModuleToSummariesForIndex)
     // The ModuleToSummariesForIndex map includes an entry for the current
     // Module (needed for writing out the index files). We don't want to
     // include it in the imports file, however, so filter it out.
     if (ILI.first != ModulePath)
       ImportsOS << ILI.first << "\n";
-  return std::error_code();
+  return Error::success();
 }
 
 bool llvm::convertToDeclaration(GlobalValue &GV) {
diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
index 0377791d85b3f8..9304bd4188d9ed 100644
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -899,7 +899,7 @@ static std::unique_ptr<LTO> createLTO(IndexWriteCallback OnIndexWrite,
     std::string OldPrefix, NewPrefix;
     getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
     Backend = createWriteIndexesThinBackend(
-        OldPrefix, NewPrefix,
+        llvm::hardware_concurrency(options::Parallelism) OldPrefix, NewPrefix,
         // TODO: Add support for optional native object path in
         // thinlto_prefix_replace option to match lld.
         /*NativeObjectPrefix=*/"", options::thinlto_emit_imports_files,
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index 5dd961a603c9e8..d4f022ef021a44 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -346,7 +346,8 @@ static int run(int argc, char **argv) {
 
   ThinBackend Backend;
   if (ThinLTODistributedIndexes)
-    Backend = createWriteIndexesThinBackend(/*OldPrefix=*/"",
+    Backend = createWriteIndexesThinBackend(llvm::hardware_concurrency(Threads),
+                                            /*OldPrefix=*/"",
                                             /*NewPrefix=*/"",
                                             /*NativeObjectPrefix=*/"",
                                             ThinLTOEmitImports,