[PATCH] D87966: [ThinLTO] Re-order modules for optimal multi-threaded processing

Tue Sep 22 07:55:02 PDT 2020

aganea updated this revision to Diff 293455.
aganea edited the summary of this revision.
aganea added a comment.

Works as intended, thanks @tejohnson!
All tests pass. Please have another look.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D87966/new/

https://reviews.llvm.org/D87966

Files:
  llvm/include/llvm/LTO/LTO.h
  llvm/lib/LTO/LTO.cpp
  llvm/lib/LTO/ThinLTOCodeGenerator.cpp


Index: llvm/lib/LTO/ThinLTOCodeGenerator.cpp
===================================================================

--- llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -1054,19 +1054,11 @@
     ModuleToDefinedGVSummaries[ModuleIdentifier];
   }
 
-  // Compute the ordering we will process the inputs: the rough heuristic here
-  // is to sort them per size so that the largest module get schedule as soon as
-  // possible. This is purely a compile-time optimization.
-  std::vector<int> ModulesOrdering;
-  ModulesOrdering.resize(Modules.size());
-  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
-  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
-    auto LSize =
-        Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
-    auto RSize =
-        Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
-    return LSize > RSize;
-  });
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(Modules.size());
+  for (auto &Mod : Modules)
+    ModulesVec.push_back(&Mod->getSingleBitcodeModule());
+  std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
 
   // Parallel optimizer + codegen
   {
Index: llvm/lib/LTO/LTO.cpp
===================================================================
--- llvm/lib/LTO/LTO.cpp
+++ llvm/lib/LTO/LTO.cpp
@@ -1443,15 +1443,21 @@
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(ModuleMap.size());
+  for (auto &Mod : ModuleMap)
+    ModulesVec.push_back(&Mod.second);
+  std::vector<int> ModulesOrdering = generateModulesOrdering(ModulesVec);
+
   // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
   // module and parallel code generation partitions.
-  unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
-  for (auto &Mod : ModuleMap) {
-    if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
-                                     ExportLists[Mod.first],
-                                     ResolvedODR[Mod.first], ThinLTO.ModuleMap))
+  for (auto IndexCount : ModulesOrdering) {
+    auto &Mod = *(ModuleMap.begin() + IndexCount);
+    if (Error E = BackendProc->start(
+            RegularLTO.ParallelCodeGenParallelismLevel + IndexCount, Mod.second,
+            ImportLists[Mod.first], ExportLists[Mod.first],
+            ResolvedODR[Mod.first], ThinLTO.ModuleMap))
       return E;
-    ++Task;
   }
 
   return BackendProc->wait();
@@ -1495,3 +1501,18 @@
   StatsFile->keep();
   return std::move(StatsFile);
 }
+
+// Compute the ordering we will process the inputs: the rough heuristic here
+// is to sort them per size so that the largest module get schedule as soon as
+// possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+  std::vector<int> ModulesOrdering;
+  ModulesOrdering.resize(R.size());
+  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+    auto LSize = R[LeftIndex]->getBuffer().size();
+    auto RSize = R[RightIndex]->getBuffer().size();
+    return LSize > RSize;
+  });
+  return ModulesOrdering;
+}
Index: llvm/include/llvm/LTO/LTO.h
===================================================================
--- llvm/include/llvm/LTO/LTO.h
+++ llvm/include/llvm/LTO/LTO.h
@@ -91,6 +91,10 @@
 Expected<std::unique_ptr<ToolOutputFile>>
 setupStatsFile(StringRef StatsFilename);
 
+/// Produce a re-ordered container for optimal multi-threaded processing.
+/// Returns indices to elements into the input array.
+std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D87966.293455.patch
Type: text/x-patch
Size: 3884 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200922/d96a2d7e/attachment.bin>