[llvm] 6537004 - [ThinLTO] Re-order modules for optimal multi-threaded processing

Tue Sep 22 08:26:09 PDT 2020

Author: Alexandre Ganea
Date: 2020-09-22T11:25:59-04:00
New Revision: 6537004913f3009d896bc30856698e7d22199ba7

URL: https://github.com/llvm/llvm-project/commit/6537004913f3009d896bc30856698e7d22199ba7
DIFF: https://github.com/llvm/llvm-project/commit/6537004913f3009d896bc30856698e7d22199ba7.diff

LOG: [ThinLTO] Re-order modules for optimal multi-threaded processing

Re-use an optimizition from the old LTO API (used by ld64).
This sorts modules in ascending order, based on bitcode size, so that larger modules are processed first. This allows for smaller modules to be process last, and better fill free threads 'slots', and thusly allow for better multi-thread load balancing.

In our case (on dual Intel Xeon Gold 6140, Windows 10 version 2004, two-stage build), this saves 15 sec when linking `clang.exe` with LLD & `-flto=thin`, `/opt:lldltojobs=all`, no ThinLTO cache, -DLLVM_INTEGRATED_CRT_ALLOC=d:\git\rpmalloc.

Before patch: 102 sec
After patch: 85 sec

Inspired by the work done by David Callahan in D60495.

Differential Revision: https://reviews.llvm.org/D87966

Added: 
    

Modified: 
    llvm/include/llvm/LTO/LTO.h
    llvm/lib/LTO/LTO.cpp
    llvm/lib/LTO/ThinLTOCodeGenerator.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 93456c0ae7ae..a47f0cc0c3c0 100644

--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -91,6 +91,10 @@ setupLLVMOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
 Expected<std::unique_ptr<ToolOutputFile>>
 setupStatsFile(StringRef StatsFilename);
 
+/// Produces a container ordering for optimal multi-threaded processing. Returns
+/// ordered indices to elements in the input array.
+std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;

diff  --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 6230216aa446..4fbb3ad5bb4a 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1443,15 +1443,21 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(ModuleMap.size());
+  for (auto &Mod : ModuleMap)
+    ModulesVec.push_back(&Mod.second);
+  std::vector<int> ModulesOrdering = generateModulesOrdering(ModulesVec);
+
   // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
   // module and parallel code generation partitions.
-  unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
-  for (auto &Mod : ModuleMap) {
-    if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
-                                     ExportLists[Mod.first],
-                                     ResolvedODR[Mod.first], ThinLTO.ModuleMap))
+  for (auto IndexCount : ModulesOrdering) {
+    auto &Mod = *(ModuleMap.begin() + IndexCount);
+    if (Error E = BackendProc->start(
+            RegularLTO.ParallelCodeGenParallelismLevel + IndexCount, Mod.second,
+            ImportLists[Mod.first], ExportLists[Mod.first],
+            ResolvedODR[Mod.first], ThinLTO.ModuleMap))
       return E;
-    ++Task;
   }
 
   return BackendProc->wait();
@@ -1495,3 +1501,18 @@ lto::setupStatsFile(StringRef StatsFilename) {
   StatsFile->keep();
   return std::move(StatsFile);
 }
+
+// Compute the ordering we will process the inputs: the rough heuristic here
+// is to sort them per size so that the largest module get schedule as soon as
+// possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+  std::vector<int> ModulesOrdering;
+  ModulesOrdering.resize(R.size());
+  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+    auto LSize = R[LeftIndex]->getBuffer().size();
+    auto RSize = R[RightIndex]->getBuffer().size();
+    return LSize > RSize;
+  });
+  return ModulesOrdering;
+}

diff  --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 14dae848b362..3f71487951c8 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -1054,19 +1054,11 @@ void ThinLTOCodeGenerator::run() {
     ModuleToDefinedGVSummaries[ModuleIdentifier];
   }
 
-  // Compute the ordering we will process the inputs: the rough heuristic here
-  // is to sort them per size so that the largest module get schedule as soon as
-  // possible. This is purely a compile-time optimization.
-  std::vector<int> ModulesOrdering;
-  ModulesOrdering.resize(Modules.size());
-  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
-  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
-    auto LSize =
-        Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
-    auto RSize =
-        Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
-    return LSize > RSize;
-  });
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(Modules.size());
+  for (auto &Mod : Modules)
+    ModulesVec.push_back(&Mod->getSingleBitcodeModule());
+  std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
 
   // Parallel optimizer + codegen
   {