[llvm] Make WriteIndexesThinBackend multi threaded (PR #109847)

Fri Oct 4 08:16:03 PDT 2024

https://github.com/NuriAmari updated https://github.com/llvm/llvm-project/pull/109847

>From fa9222b90084422e63c67924874cfd0c1a61da45 Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Tue, 24 Sep 2024 11:29:54 -0700
Subject: [PATCH 1/3] Make WriteIndexesThinBackend multi threaded

We've noticed that for large builds executing thin-link can take on the
order of 10s of minutes. We are only using a single thread to write the
sharded indices and import files for each input bitcode file. While we
need to ensure the index files produced lists modules in a deterministic
order, that doesn't prevent us from executing the rest of the work in parallel.

In this change we use a thread pool to execute as much of the backend's
work as possible in parallel. In local testing on a machine with 80
cores, this change makes a thin-link for ~100,000 input files run in ~2 minutes.
Without this change is takes upwards of 10 minutes.
---
 llvm/lib/LTO/LTO.cpp | 57 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a88124dacfaefd..78084c7aedcd91 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1395,11 +1395,12 @@ class lto::ThinBackendProc {
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
   virtual Error wait() = 0;
   virtual unsigned getThreadCount() = 0;
+  virtual bool isSensitiveToInputOrder() { return false; }
 
   // Write sharded indices and (optionally) imports to disk
   Error emitFiles(const FunctionImporter::ImportMapTy &ImportList,
                   llvm::StringRef ModulePath,
-                  const std::string &NewModulePath) {
+                  const std::string &NewModulePath) const {
     ModuleToSummariesForIndexTy ModuleToSummariesForIndex;
     GVSummaryPtrSet DeclarationSummaries;
 
@@ -1613,6 +1614,10 @@ namespace {
 class WriteIndexesThinBackend : public ThinBackendProc {
   std::string OldPrefix, NewPrefix, NativeObjectPrefix;
   raw_fd_ostream *LinkedObjectsFile;
+  DefaultThreadPool BackendThreadPool;
+  std::optional<Error> Err;
+  std::mutex ErrMu;
+  std::mutex OnWriteMu;
 
 public:
   WriteIndexesThinBackend(
@@ -1634,8 +1639,6 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
     StringRef ModulePath = BM.getModuleIdentifier();
-    std::string NewModulePath =
-        getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
 
     if (LinkedObjectsFile) {
       std::string ObjectPrefix =
@@ -1645,19 +1648,48 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       *LinkedObjectsFile << LinkedObjectsFilePath << '\n';
     }
 
-    if (auto E = emitFiles(ImportList, ModulePath, NewModulePath))
-      return E;
+    BackendThreadPool.async(
+        [this](const StringRef ModulePath,
+               const FunctionImporter::ImportMapTy &ImportList,
+               const std::string &OldPrefix, const std::string &NewPrefix) {
+          std::string NewModulePath =
+              getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
+          auto E = emitFiles(ImportList, ModulePath, NewModulePath);
+          if (E) {
+            std::unique_lock<std::mutex> L(ErrMu);
+            if (Err)
+              Err = joinErrors(std::move(*Err), std::move(E));
+            else
+              Err = std::move(E);
+            return;
+          }
+          if (OnWrite) {
+            // Serialize calls to the on write callback in case it is not thread
+            // safe
+            std::unique_lock<std::mutex> L(OnWriteMu);
+            OnWrite(std::string(ModulePath));
+          }
+        },
+        ModulePath, ImportList, OldPrefix, NewPrefix);
+    return Error::success();
+  }
 
-    if (OnWrite)
-      OnWrite(std::string(ModulePath));
+  Error wait() override {
+    BackendThreadPool.wait();
+    if (Err)
+      return std::move(*Err);
     return Error::success();
   }
 
-  Error wait() override { return Error::success(); }
+  unsigned getThreadCount() override {
+    return BackendThreadPool.getMaxConcurrency();
+  }
 
-  // WriteIndexesThinBackend should always return 1 to prevent module
-  // re-ordering and avoid non-determinism in the final link.
-  unsigned getThreadCount() override { return 1; }
+  bool isSensitiveToInputOrder() override {
+    // The order which modules are written to LinkedObjectsFile should be
+    // deterministic and match the order they are passed on the command line.
+    return true;
+  }
 };
 } // end anonymous namespace
 
@@ -1856,7 +1888,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
                               ThinLTO.ModuleMap);
   };
 
-  if (BackendProc->getThreadCount() == 1) {
+  if (BackendProc->getThreadCount() == 1 ||
+      BackendProc->isSensitiveToInputOrder()) {
     // Process the modules in the order they were provided on the command-line.
     // It is important for this codepath to be used for WriteIndexesThinBackend,
     // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same

>From f00ed250315469d3c38c26627e4df636e0fbea44 Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Wed, 25 Sep 2024 13:03:59 -0700
Subject: [PATCH 2/3] Address PR Feedback #1

---
 llvm/lib/LTO/LTO.cpp | 63 +++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 78084c7aedcd91..dd9e1c21c7e27c 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1376,15 +1376,21 @@ class lto::ThinBackendProc {
   const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries;
   lto::IndexWriteCallback OnWrite;
   bool ShouldEmitImportsFiles;
+  DefaultThreadPool BackendThreadPool;
+  std::optional<Error> Err;
+  std::mutex ErrMu;
+  std::mutex OnWriteMu;
 
 public:
   ThinBackendProc(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
-      lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles)
+      lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles,
+      ThreadPoolStrategy ThinLTOParallelism)
       : Conf(Conf), CombinedIndex(CombinedIndex),
         ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries),
-        OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {}
+        OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles),
+        BackendThreadPool(ThinLTOParallelism) {}
 
   virtual ~ThinBackendProc() = default;
   virtual Error start(
@@ -1393,8 +1399,13 @@ class lto::ThinBackendProc {
       const FunctionImporter::ExportSetTy &ExportList,
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
-  virtual Error wait() = 0;
-  virtual unsigned getThreadCount() = 0;
+  Error wait() {
+    BackendThreadPool.wait();
+    if (Err)
+      return std::move(*Err);
+    return Error::success();
+  }
+  unsigned getThreadCount() { return BackendThreadPool.getMaxConcurrency(); }
   virtual bool isSensitiveToInputOrder() { return false; }
 
   // Write sharded indices and (optionally) imports to disk
@@ -1429,15 +1440,11 @@ class lto::ThinBackendProc {
 
 namespace {
 class InProcessThinBackend : public ThinBackendProc {
-  DefaultThreadPool BackendThreadPool;
   AddStreamFn AddStream;
   FileCache Cache;
   DenseSet<GlobalValue::GUID> CfiFunctionDefs;
   DenseSet<GlobalValue::GUID> CfiFunctionDecls;
 
-  std::optional<Error> Err;
-  std::mutex ErrMu;
-
   bool ShouldEmitIndexFiles;
 
 public:
@@ -1448,9 +1455,9 @@ class InProcessThinBackend : public ThinBackendProc {
       AddStreamFn AddStream, FileCache Cache, lto::IndexWriteCallback OnWrite,
       bool ShouldEmitIndexFiles, bool ShouldEmitImportsFiles)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
-                        OnWrite, ShouldEmitImportsFiles),
-        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
-        Cache(std::move(Cache)), ShouldEmitIndexFiles(ShouldEmitIndexFiles) {
+                        OnWrite, ShouldEmitImportsFiles, ThinLTOParallelism),
+        AddStream(std::move(AddStream)), Cache(std::move(Cache)),
+        ShouldEmitIndexFiles(ShouldEmitIndexFiles) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
@@ -1546,18 +1553,6 @@ class InProcessThinBackend : public ThinBackendProc {
       OnWrite(std::string(ModulePath));
     return Error::success();
   }
-
-  Error wait() override {
-    BackendThreadPool.wait();
-    if (Err)
-      return std::move(*Err);
-    else
-      return Error::success();
-  }
-
-  unsigned getThreadCount() override {
-    return BackendThreadPool.getMaxConcurrency();
-  }
 };
 } // end anonymous namespace
 
@@ -1614,10 +1609,6 @@ namespace {
 class WriteIndexesThinBackend : public ThinBackendProc {
   std::string OldPrefix, NewPrefix, NativeObjectPrefix;
   raw_fd_ostream *LinkedObjectsFile;
-  DefaultThreadPool BackendThreadPool;
-  std::optional<Error> Err;
-  std::mutex ErrMu;
-  std::mutex OnWriteMu;
 
 public:
   WriteIndexesThinBackend(
@@ -1627,7 +1618,8 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       std::string NativeObjectPrefix, bool ShouldEmitImportsFiles,
       raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
-                        OnWrite, ShouldEmitImportsFiles),
+                        OnWrite, ShouldEmitImportsFiles,
+                        /* ThinLTOParallelism */ hardware_concurrency()),
         OldPrefix(OldPrefix), NewPrefix(NewPrefix),
         NativeObjectPrefix(NativeObjectPrefix),
         LinkedObjectsFile(LinkedObjectsFile) {}
@@ -1640,6 +1632,10 @@ class WriteIndexesThinBackend : public ThinBackendProc {
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
     StringRef ModulePath = BM.getModuleIdentifier();
 
+    // The contents of this file may be used as input to a native link, and must
+    // therefore contain the processed modules in a determinstic order than
+    // match the order they are provided on the command line. For that reason,
+    // we cannot include this in the asynchronously executed lambda below.
     if (LinkedObjectsFile) {
       std::string ObjectPrefix =
           NativeObjectPrefix.empty() ? NewPrefix : NativeObjectPrefix;
@@ -1674,17 +1670,6 @@ class WriteIndexesThinBackend : public ThinBackendProc {
     return Error::success();
   }
 
-  Error wait() override {
-    BackendThreadPool.wait();
-    if (Err)
-      return std::move(*Err);
-    return Error::success();
-  }
-
-  unsigned getThreadCount() override {
-    return BackendThreadPool.getMaxConcurrency();
-  }
-
   bool isSensitiveToInputOrder() override {
     // The order which modules are written to LinkedObjectsFile should be
     // deterministic and match the order they are passed on the command line.

>From 846ac06d523ea3e2f3febf4bf9c38cde2fec24c2 Mon Sep 17 00:00:00 2001
From: Nuri Amari <nuriamari at fb.com>
Date: Fri, 4 Oct 2024 08:13:24 -0700
Subject: [PATCH 3/3] Address PR Feedback #2

---
 llvm/lib/LTO/LTO.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index dd9e1c21c7e27c..bad872c73b4d2a 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1633,7 +1633,7 @@ class WriteIndexesThinBackend : public ThinBackendProc {
     StringRef ModulePath = BM.getModuleIdentifier();
 
     // The contents of this file may be used as input to a native link, and must
-    // therefore contain the processed modules in a determinstic order than
+    // therefore contain the processed modules in a determinstic order that
     // match the order they are provided on the command line. For that reason,
     // we cannot include this in the asynchronously executed lambda below.
     if (LinkedObjectsFile) {