[clang] [llvm] [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds (PR #90933)

Sun Sep 15 18:52:43 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-lto

Author: Kyungwoo Lee (kyulee-com)

<details>
<summary>Changes</summary>

This feature is enabled by `-codegen-data-thinlto-two-rounds`, which effectively runs `-codegen-data-generate` and `-codegen-data-use` in two consecutive rounds to enable global outlining with ThinLTO.
 1. The first round: Run both optimization and codegen, writing to a scratch output.
     Before running codegen, we serialize the optimized bitcode modules to a temporary path.
 2. We extract the codegen data from the scratch object files and merge it.
 3. The second round: Read the optimized bitcode modules back and run only codegen this time.
      Using the merged codegen data, the machine outliner effectively performs global outlining.
 
Depends on https://github.com/llvm/llvm-project/pull/90934.
This is a patch for https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753.

---

Patch is 26.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90933.diff


10 Files Affected:

- (modified) clang/lib/CodeGen/BackendUtil.cpp (+4-4) 
- (modified) llvm/include/llvm/CGData/CodeGenData.h (+16) 
- (modified) llvm/include/llvm/LTO/LTOBackend.h (+1) 
- (modified) llvm/lib/CGData/CodeGenData.cpp (+80-1) 
- (modified) llvm/lib/LTO/CMakeLists.txt (+1) 
- (modified) llvm/lib/LTO/LTO.cpp (+133-35) 
- (modified) llvm/lib/LTO/LTOBackend.cpp (+15-2) 
- (added) llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll (+42) 
- (added) llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll (+94) 
- (added) llvm/test/ThinLTO/AArch64/lit.local.cfg (+2) 


``````````diff

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7fa69420298160..a1909d45b4d944 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1286,10 +1286,10 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E =
-          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-                      /* ModuleMap */ nullptr, CGOpts.CmdArgs)) {
+  if (Error E = thinBackend(
+          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 84133a433170fe..1e1afe99327650 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+/// Initialize the two-codegen rounds.
+void initializeTwoCodegenRounds();
+
+/// Save the current module before the first codegen round.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+
+/// Load the current module before the second codegen round.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context);
+
+/// Merge the codegen data from the input files in scratch vector in ThinLTO
+/// two-codegen rounds.
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index de89f4bb10dff2..8516398510d4b8 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
+                  bool CodeGenOnly,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 55d2504231c744..e8fda7ad7454d7 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/WithColor.h"
 
 #define DEBUG_TYPE "cg-data"
@@ -30,6 +31,14 @@ cl::opt<bool>
 cl::opt<std::string>
     CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
                        cl::desc("File path to where .cgdata file is read"));
+cl::opt<bool> CodeGenDataThinLTOTwoRounds(
+    "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
+    cl::desc("Enable two-round ThinLTO code generation. The first round "
+             "generates code and emits CodeGen data, while the second round "
+             "uses the emitted data for further optimizations."));
+
+// Path to where the optimized bitcodes are saved and restored for ThinLTO.
+static SmallString<128> CodeGenDataThinLTOTwoRoundsPath;
 
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
@@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    if (CodeGenDataGenerate)
+    if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds)
       Instance->EmitCGData = true;
     else if (!CodeGenDataUsePath.empty()) {
       // Initialize the global CGData if the input file name is given.
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
+static std::string getPath(StringRef Dir, unsigned Task) {
+  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+}
+
+void initializeTwoCodegenRounds() {
+  assert(CodeGenDataThinLTOTwoRounds);
+  if (auto EC = llvm::sys::fs::createUniqueDirectory(
+          "cgdata", CodeGenDataThinLTOTwoRoundsPath))
+    report_fatal_error(Twine("Failed to create directory: ") + EC.message());
+}
+
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
+  if (EC)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to save optimized bitcode: " + EC.message());
+  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  auto FileOrError = MemoryBuffer::getFile(Path);
+  if (auto EC = FileOrError.getError())
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to load optimized bitcode: " + EC.message());
+
+  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  if (!RestoredModule)
+    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
+                       Path + "\n");
+
+  // Restore the original module identifier.
+  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+  return std::move(*RestoredModule);
+}
+
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
+
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &InputFile : *(InputFiles)) {
+    if (InputFile.empty())
+      continue;
+    StringRef File = StringRef(InputFile.data(), InputFile.size());
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        File, "in-memory object file", /*RequiresNullTerminator=*/false);
+    Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+    if (!BinOrErr)
+      return BinOrErr.takeError();
+
+    std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
+                                                        GlobalOutlineRecord))
+      return E;
+  }
+
+  if (!GlobalOutlineRecord.empty())
+    cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+  return Error::success();
+}
+
 } // end namespace cgdata
 
 } // end namespace llvm
diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt
index 69ff08e1f374c4..057d73b6349cf1 100644
--- a/llvm/lib/LTO/CMakeLists.txt
+++ b/llvm/lib/LTO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO
   BinaryFormat
   BitReader
   BitWriter
+  CGData
   CodeGen
   CodeGenTypes
   Core
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a88124dacfaefd..945f8c859365ea 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -70,6 +71,8 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 /// Enable global value internalization in LTO.
 cl::opt<bool> EnableLTOInternalization(
@@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc {
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
   }
 
-  Error runThinLTOBackendThread(
+  virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
       ModuleSummaryIndex &CombinedIndex,
       const FunctionImporter::ImportMapTy &ImportList,
@@ -1473,7 +1476,8 @@ class InProcessThinBackend : public ThinBackendProc {
         return MOrErr.takeError();
 
       return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
-                         ImportList, DefinedGlobals, &ModuleMap);
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly);
     };
 
     auto ModuleID = BM.getModuleIdentifier();
@@ -1558,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc {
     return BackendThreadPool.getMaxConcurrency();
   }
 };
+
+/// This Backend will run ThinBackend process but throw away all the output from
+/// the codegen. This class facilitates the first codegen round.
+class NoOutputThinBackend : public InProcessThinBackend {
+public:
+  NoOutputThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
+      : InProcessThinBackend(
+            Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
+            // Allocate a scratch buffer for each task to write output to.
+            [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
+              return std::make_unique<CachedFileStream>(
+                  std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
+            },
+            FileCache(), nullptr, false, false),
+        Scratch(std::move(Scratch)) {}
+
+  /// Scratch space for writing output during the codegen.
+  std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+};
+
+/// This Backend performs codegen on bitcode that was previously saved after
+/// going through optimization. This class facilitates the second codegen round.
+class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+public:
+  OptimizedBitcodeThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, AddStream, FileCache(),
+                             nullptr, false, false) {}
+
+  virtual Error runThinLTOBackendThread(
+      AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+      ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    LTOLLVMContext BackendContext(Conf);
+    std::unique_ptr<Module> LoadedModule =
+        cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+
+    return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                       ImportList, DefinedGlobals, &ModuleMap,
+                       /*CodeGenOnly=*/true);
+  }
+};
 } // end anonymous namespace
 
 ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1839,45 +1897,85 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   TimeTraceScopeExit.release();
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  auto ProcessOneModule = [&](int I) -> Error {
-    auto &Mod = *(ModuleMap.begin() + I);
-    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
-    // combined module and parallel code generation partitions.
-    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
-                              Mod.second, ImportLists[Mod.first],
-                              ExportLists[Mod.first], ResolvedODR[Mod.first],
-                              ThinLTO.ModuleMap);
+  auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
+    auto ProcessOneModule = [&](int I) -> Error {
+      auto &Mod = *(ModuleMap.begin() + I);
+      // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+      // combined module and parallel code generation partitions.
+      return BackendProcess->start(
+          RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
+          ImportLists[Mod.first], ExportLists[Mod.first],
+          ResolvedODR[Mod.first], ThinLTO.ModuleMap);
+    };
+
+    if (BackendProcess->getThreadCount() == 1) {
+      // Process the modules in the order they were provided on the
+      // command-line. It is important for this codepath to be used for
+      // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
+      // ThinLTO objects in the same order as the inputs, which otherwise would
+      // affect the final link order.
+      for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+        if (Error E = ProcessOneModule(I))
+          return E;
+    } else {
+      // When executing in parallel, process largest bitsize modules first to
+      // improve parallelism, and avoid starving the thread pool near the end.
+      // This saves about 15 sec on a 36-core machine while link `clang.exe`
+      // (out of 100 sec).
+      std::vector<BitcodeModule *> ModulesVec;
+      ModulesVec.reserve(ModuleMap.size());
+      for (auto &Mod : ModuleMap)
+        ModulesVec.push_back(&Mod.second);
+      for (int I : generateModulesOrdering(ModulesVec))
+        if (Error E = ProcessOneModule(I))
+          return E;
+    }
+    return BackendProcess->wait();
   };
 
-  if (BackendProc->getThreadCount() == 1) {
-    // Process the modules in the order they were provided on the command-line.
-    // It is important for this codepath to be used for WriteIndexesThinBackend,
-    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
-    // order as the inputs, which otherwise would affect the final link order.
-    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
-      if (Error E = ProcessOneModule(I))
-        return E;
-  } else {
-    // When executing in parallel, process largest bitsize modules first to
-    // improve parallelism, and avoid starving the thread pool near the end.
-    // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
-    // of 100 sec).
-    std::vector<BitcodeModule *> ModulesVec;
-    ModulesVec.reserve(ModuleMap.size());
-    for (auto &Mod : ModuleMap)
-      ModulesVec.push_back(&Mod.second);
-    for (int I : generateModulesOrdering(ModulesVec))
-      if (Error E = ProcessOneModule(I))
-        return E;
+  if (!CodeGenDataThinLTOTwoRounds) {
+    std::unique_ptr<ThinBackendProc> BackendProc =
+        ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                        AddStream, Cache);
+    return RunBackends(BackendProc.get());
   }
-  return BackendProc->wait();
+
+  // Perform two rounds of code generation for ThinLTO:
+  // 1. First round: Run optimization and code generation with a scratch output.
+  // 2. Merge codegen data extracted from the scratch output.
+  // 3. Second round: Run code generation again using the merged data.
+  LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+
+  // Initialize a temporary path to store and retrieve optimized IRs for
+  // two-round code generation.
+  cgdata::initializeTwoCodegenRounds();
+
+  // Create a scratch output to hold intermediate results.
+  auto Outputs =
+      std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
+  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+      Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+      ModuleToDefinedGVSummaries, std::move(Outputs));
+  // First round: Run optimization and code generation with a scratch output.
+  // Before code generation, serialize modules.
+  if (Error E = RunBackends(FirstRoundLTO.get()))
+    return E;
+
+  // Merge codegen data extracted from the scratch output.
+  if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+    return E;
+
+  // Second round: Run code generation by reading IRs.
+  std::unique_ptr<ThinBackendProc> SecondRoundLTO =
+      std::make_unique<OptimizedBitcodeThinBackend>(
+          Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+          ModuleToDefinedGVSummaries, AddStream);
+  Error E = RunBackends(SecondRoundLTO.get());
+
+  return E;
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 4e58cd369c3ac9..d198e8e5102009 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/IR/LLVMRemarkStreamer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
     cl::desc("Assume the input has already undergone ThinLTO function "
              "importing and the other pre-optimization pipeline changes."));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
@@ -565,7 +568,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -586,7 +589,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
-  if (Conf.CodeGenOnly) {
+  if (CodeGenOnly) {
+    // If CodeGenOnly is set, we only perform code generation and skip
+    // optimization.
     codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }
@@ -597,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   auto OptimizeAndCodegen =
       [&](Module &Mod, TargetMachine *TM,
           std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) {
+        // Perform optimization and code generation for ThinLTO.
         if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
                  /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
                  CmdArgs))
           return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
+        // Save the current module before the first codege...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/90933