[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (PR #104763)

Tue Aug 20 10:33:40 PDT 2024

================
@@ -651,138 +1317,176 @@ static void splitAMDGPUModule(
   if (!NoExternalizeGlobals) {
     for (auto &GV : M.globals()) {
       if (GV.hasLocalLinkage())
-        SML << "[externalize] GV " << GV.getName() << '\n';
+        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
       externalize(GV);
     }
   }
 
   // Start by calculating the cost of every function in the module, as well as
   // the module's overall cost.
-  DenseMap<const Function *, CostType> FnCosts;
-  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
-
-  // First, gather ever kernel into the worklist.
-  SmallVector<FunctionWithDependencies> WorkList;
-  for (auto &Fn : M) {
-    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
-      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+  FunctionsCostMap FnCosts;
+  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
+
+  // Build the SplitGraph, which represents the module's functions and models
+  // their dependencies accurately.
+  SplitGraph SG(M, FnCosts, ModuleCost);
+  SG.buildGraph(CG);
+
+  if (SG.empty()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "[!] no nodes in graph, input is empty - no splitting possible\n");
+    ModuleCallback(cloneAll(M));
+    return;
   }
 
-  // Then, find missing functions that need to be considered as additional
-  // roots. These can't be called in theory, but in practice we still have to
-  // handle them to avoid linker errors.
-  {
-    DenseSet<const Function *> SeenFunctions;
-    for (const auto &FWD : WorkList) {
-      SeenFunctions.insert(FWD.Fn);
-      SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+  LLVM_DEBUG({
+    dbgs() << "[graph] nodes:\n";
+    for (const SplitGraph::Node *N : SG.nodes()) {
+      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
+             << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
     }
+  });
 
-    for (auto &Fn : M) {
-      // If this function is not part of any kernel's dependencies and isn't
-      // directly called, consider it as a root.
-      if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
-          !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
-        WorkList.emplace_back(SML, CG, FnCosts, &Fn);
-      }
-    }
-  }
+  writeDOTGraph(SG);
 
-  // Sort the worklist so the most expensive roots are seen first.
-  sort(WorkList, [&](auto &A, auto &B) {
-    // Sort by total cost, and if the total cost is identical, sort
-    // alphabetically.
-    if (A.TotalCost == B.TotalCost)
-      return A.Fn->getName() < B.Fn->getName();
-    return A.TotalCost > B.TotalCost;
-  });
+  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
 
-  if (SML) {
-    SML << "Worklist\n";
-    for (const auto &FWD : WorkList) {
-      SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
-          << " indirect:" << FWD.HasIndirectCall
-          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
-          << ")\n";
-      // Sort function names before printing to ensure determinism.
-      SmallVector<std::string> SortedDepNames;
-      SortedDepNames.reserve(FWD.Dependencies.size());
-      for (const auto *Dep : FWD.Dependencies)
-        SortedDepNames.push_back(getName(*Dep));
-      sort(SortedDepNames);
-
-      for (const auto &Name : SortedDepNames)
-        SML << "  [dependency] " << Name << '\n';
-    }
+  std::optional<SplitProposal> Proposal;
+  const auto EvaluateProposal = [&](SplitProposal SP) {
+    if (!Proposal)
+      Proposal = std::move(SP);
+    else
+      evaluateProposal(*Proposal, std::move(SP));
+  };
+
+  // TODO: It would be very easy to create new strategies by just adding a base
+  // class to RecursiveSearchSplitting and abstracting it away.
+  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
+  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
+                                  << Proposal->getName() << "\n";);
+
+  if (!Proposal) {
+    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
+    ModuleCallback(cloneAll(M));
+    return;
   }
 
-  // This performs all of the partitioning work.
-  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
-  assert(Partitions.size() == N);
+  LLVM_DEBUG(Proposal->print(dbgs()););
 
-  // If we didn't externalize GVs, then local GVs need to be conservatively
-  // imported into every module (including their initializers), and then cleaned
-  // up afterwards.
-  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
-    // We conservatively import private/internal GVs into every module and clean
-    // them up afterwards.
-    const auto *Var = dyn_cast<GlobalVariable>(GV);
-    return Var && Var->hasLocalLinkage();
-  };
+  std::optional<raw_fd_ostream> SummariesOS;
+  if (!PartitionSummariesOutput.empty()) {
+    std::error_code EC;
+    SummariesOS.emplace(PartitionSummariesOutput, EC);
+    if (EC)
+      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
+             << "' - Partition summaries will not be printed\n";
+  }
+
+  for (unsigned PID = 0; PID < NumParts; ++PID) {
+    SplitModuleTimer SMT2("modules_creation",
+                          "creating modules for each partition");
+    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
 
-  SML << "Creating " << N << " modules...\n";
-  unsigned TotalFnImpls = 0;
-  for (unsigned I = 0; I < N; ++I) {
-    const auto &FnsInPart = Partitions[I];
+    DenseSet<const Function *> FnsInPart;
+    for (unsigned NodeID : (*Proposal)[PID].set_bits())
+      FnsInPart.insert(&SG.getNode(NodeID).getFunction());
 
     ValueToValueMapTy VMap;
+    CostType PartCost = 0;
     std::unique_ptr<Module> MPart(
         CloneModule(M, VMap, [&](const GlobalValue *GV) {
           // Functions go in their assigned partition.
-          if (const auto *Fn = dyn_cast<Function>(GV))
-            return FnsInPart.contains(Fn);
-
-          if (NeedsConservativeImport(GV))
-            return true;
+          if (const auto *Fn = dyn_cast<Function>(GV)) {
+            if (FnsInPart.contains(Fn)) {
+              PartCost += SG.getCost(*Fn);
+              return true;
+            }
+            return false;
+          }
 
           // Everything else goes in the first partition.
-          return I == 0;
+          return needsConservativeImport(GV) || PID == 0;
         }));
 
+    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
+    // bugs are possible.
+
     // Clean-up conservatively imported GVs without any users.
-    for (auto &GV : make_early_inc_range(MPart->globals())) {
-      if (NeedsConservativeImport(&GV) && GV.use_empty())
+    for (auto &GV : make_early_inc_range(MPart->global_values())) {
+      if (needsConservativeImport(&GV) && GV.use_empty())
         GV.eraseFromParent();
     }
 
-    unsigned NumAllFns = 0, NumKernels = 0;
-    for (auto &Cur : *MPart) {
-      if (!Cur.isDeclaration()) {
-        ++NumAllFns;
-        if (isEntryPoint(&Cur))
-          ++NumKernels;
-      }
-    }
-    TotalFnImpls += NumAllFns;
-    SML << "  - Module " << I << " with " << NumAllFns << " functions ("
-        << NumKernels << " kernels)\n";
+    if (SummariesOS)
+      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
+
+    LLVM_DEBUG(
+        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));
+
     ModuleCallback(std::move(MPart));
   }
-
-  SML << TotalFnImpls << " function definitions across all modules ("
-      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
-      << "% of original module)\n";
 }
 } // namespace
 
 PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
+  SplitModuleTimer SMT(
+      "total", "total pass runtime (incl. potentially waiting for lockfile)");
+
   FunctionAnalysisManager &FAM =
       MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
   const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
     return FAM.getResult<TargetIRAnalysis>(F);
   };
-  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
-  // We don't change the original module.
-  return PreservedAnalyses::all();
+
+  bool Done = false;
+#ifndef NDEBUG
+  if (UseLockFile) {
+    SmallString<128> LockFilePath;
+    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
+    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
+                      << "'\n");
+
+    while (true) {
+      llvm::LockFileManager Locked(LockFilePath.str());
+      switch (Locked) {
+      case LockFileManager::LFS_Error:
+        errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+                  "output may be mangled by other processes\n";
+        Locked.unsafeRemoveLockFile();
+        break;
+      case LockFileManager::LFS_Owned:
+        break;
+      case LockFileManager::LFS_Shared: {
+        switch (Locked.waitForUnlock()) {
+        case LockFileManager::Res_Success:
+          break;
+        case LockFileManager::Res_OwnerDied:
+          continue; // try again to get the lock.
+        case LockFileManager::Res_Timeout:
+          errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+                    "output may be mangled by other processes\n";
----------------
arsenm wrote:

It's not about the number of times, it's the compiler as a library should never spam to console ever 

https://github.com/llvm/llvm-project/pull/104763