[lld] 617d64f - Re-land [ThinLTO] Re-order modules for optimal multi-threaded processing

Tue Oct 13 18:54:30 PDT 2020

Author: Alexandre Ganea
Date: 2020-10-13T21:54:15-04:00
New Revision: 617d64f6c5f8fdcdacc4401704146247152b96aa

URL: https://github.com/llvm/llvm-project/commit/617d64f6c5f8fdcdacc4401704146247152b96aa
DIFF: https://github.com/llvm/llvm-project/commit/617d64f6c5f8fdcdacc4401704146247152b96aa.diff

LOG: Re-land [ThinLTO] Re-order modules for optimal multi-threaded processing

This reverts 9b5b3050237db3642ed7ab1bdb3ffa2202511b99 and fixes the unwanted re-ordering when generating ThinLTO indexes.

The goal of this patch is to better balance thread utilization during ThinLTO in-process linking (in llvm-lto2 or in LLD). Before this patch, large modules would often be scheduled late during execution, taking a long time to complete, thus starving the thread pool.

We now sort modules in descending order, based on each module's bitcode size, so that larger modules are processed first. By doing so, smaller modules have a better chance to keep the thread pool active, and thus avoid starvation when the bitcode compilation is almost complete.

In our case (on dual Intel Xeon Gold 6140, Windows 10 version 2004, two-stage build), this saves 15 sec when linking `clang.exe` with LLD & -flto=thin, /opt:lldltojobs=all, no ThinLTO cache, -DLLVM_INTEGRATED_CRT_ALLOC=d:\git\rpmalloc.

Before patch: 100 sec
After patch: 85 sec

Inspired by the work done by David Callahan in D60495.

Differential Revision: https://reviews.llvm.org/D87966

Added: 
    lld/test/COFF/thinlto-module-order.ll

Modified: 
    llvm/include/llvm/LTO/LTO.h
    llvm/lib/LTO/LTO.cpp
    llvm/lib/LTO/ThinLTOCodeGenerator.cpp

Removed: 
    


################################################################################
diff  --git a/lld/test/COFF/thinlto-module-order.ll b/lld/test/COFF/thinlto-module-order.ll
new file mode 100644
index 000000000000..ce2d50ea6126

--- /dev/null
+++ b/lld/test/COFF/thinlto-module-order.ll
@@ -0,0 +1,26 @@
+; REQUIRES: x86
+
+; RUN: opt -thinlto-bc %s -o %t1.obj
+; RUN: opt -thinlto-bc %p/Inputs/thinlto.ll -o %t2.obj
+
+; Ensure module re-ordering in LTO::runThinLTO does not affect the processing order.
+
+; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t1.obj %t2.obj
+; RUN: cat %t3 | FileCheck %s --check-prefix=NORMAL
+; NORMAL: thinlto-module-order.ll.tmp1.o
+; NORMAL: thinlto-module-order.ll.tmp2.o
+
+; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t2.obj %t1.obj 
+; RUN: cat %t3 | FileCheck %s --check-prefix=REVERSED
+; REVERSED: thinlto-module-order.ll.tmp2.o
+; REVERSED: thinlto-module-order.ll.tmp1.o
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+declare void @g(...)
+
+define void @main() {
+  call void (...) @g()
+  ret void
+}

diff  --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 93456c0ae7ae..a47f0cc0c3c0 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -91,6 +91,10 @@ setupLLVMOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
 Expected<std::unique_ptr<ToolOutputFile>>
 setupStatsFile(StringRef StatsFilename);
 
+/// Produces a container ordering for optimal multi-threaded processing. Returns
+/// ordered indices to elements in the input array.
+std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;

diff  --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 6230216aa446..c2427dc07771 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1107,6 +1107,7 @@ class lto::ThinBackendProc {
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
   virtual Error wait() = 0;
+  virtual unsigned getThreadCount() = 0;
 };
 
 namespace {
@@ -1221,6 +1222,10 @@ class InProcessThinBackend : public ThinBackendProc {
     else
       return Error::success();
   }
+
+  unsigned getThreadCount() override {
+    return BackendThreadPool.getThreadCount();
+  }
 };
 } // end anonymous namespace
 
@@ -1309,6 +1314,10 @@ class WriteIndexesThinBackend : public ThinBackendProc {
   }
 
   Error wait() override { return Error::success(); }
+
+  // WriteIndexesThinBackend should always return 1 to prevent module
+  // re-ordering and avoid non-determinism in the final link.
+  unsigned getThreadCount() override { return 1; }
 };
 } // end anonymous namespace
 
@@ -1443,17 +1452,37 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
-  // module and parallel code generation partitions.
-  unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
-  for (auto &Mod : ModuleMap) {
-    if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
-                                     ExportLists[Mod.first],
-                                     ResolvedODR[Mod.first], ThinLTO.ModuleMap))
-      return E;
-    ++Task;
-  }
+  auto ProcessOneModule = [&](int I) -> Error {
+    auto &Mod = *(ModuleMap.begin() + I);
+    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+    // combined module and parallel code generation partitions.
+    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
+                              Mod.second, ImportLists[Mod.first],
+                              ExportLists[Mod.first], ResolvedODR[Mod.first],
+                              ThinLTO.ModuleMap);
+  };
 
+  if (BackendProc->getThreadCount() == 1) {
+    // Process the modules in the order they were provided on the command-line.
+    // It is important for this codepath to be used for WriteIndexesThinBackend,
+    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
+    // order as the inputs, which otherwise would affect the final link order.
+    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+      if (Error E = ProcessOneModule(I))
+        return E;
+  } else {
+    // When executing in parallel, process largest bitsize modules first to
+    // improve parallelism, and avoid starving the thread pool near the end.
+    // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
+    // of 100 sec).
+    std::vector<BitcodeModule *> ModulesVec;
+    ModulesVec.reserve(ModuleMap.size());
+    for (auto &Mod : ModuleMap)
+      ModulesVec.push_back(&Mod.second);
+    for (int I : generateModulesOrdering(ModulesVec))
+      if (Error E = ProcessOneModule(I))
+        return E;
+  }
   return BackendProc->wait();
 }
 
@@ -1495,3 +1524,18 @@ lto::setupStatsFile(StringRef StatsFilename) {
   StatsFile->keep();
   return std::move(StatsFile);
 }
+
+// Compute the ordering we will process the inputs: the rough heuristic here
+// is to sort them per size so that the largest module get schedule as soon as
+// possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+  std::vector<int> ModulesOrdering;
+  ModulesOrdering.resize(R.size());
+  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+    auto LSize = R[LeftIndex]->getBuffer().size();
+    auto RSize = R[RightIndex]->getBuffer().size();
+    return LSize > RSize;
+  });
+  return ModulesOrdering;
+}

diff  --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 14dae848b362..3f71487951c8 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -1054,19 +1054,11 @@ void ThinLTOCodeGenerator::run() {
     ModuleToDefinedGVSummaries[ModuleIdentifier];
   }
 
-  // Compute the ordering we will process the inputs: the rough heuristic here
-  // is to sort them per size so that the largest module get schedule as soon as
-  // possible. This is purely a compile-time optimization.
-  std::vector<int> ModulesOrdering;
-  ModulesOrdering.resize(Modules.size());
-  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
-  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
-    auto LSize =
-        Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
-    auto RSize =
-        Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
-    return LSize > RSize;
-  });
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(Modules.size());
+  for (auto &Mod : Modules)
+    ModulesVec.push_back(&Mod->getSingleBitcodeModule());
+  std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
 
   // Parallel optimizer + codegen
   {