[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (PR #104763)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 21 00:07:59 PDT 2024


https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/104763

From ce1813b86e68834de54d01d8350e77ee381998dc Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 19 Aug 2024 13:16:08 +0200
Subject: [PATCH 1/3] [AMDGPU] Graph-based Module Splitting Rewrite

Major rewrite of the AMDGPUSplitModule pass.
Highlights:
 - Removal of the "SML" logging system in favor of just using CL options, LLVM_DEBUG, etc.
 - Graph-based module representation with DOTGraph printing support
 - No more defaulting to "P0" for external calls.
 - Graph-search algorithm that can explore multiple branches/assignments for a cluster of functions, up to a maximum depth.
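
An illustrative invocation using the options added in this patch (flag values
chosen for the example; the new recursive-search tests exercise similar
configurations):

  llvm-split -o out input.ll -j 3 -mtriple amdgcn-amd-amdhsa \
      -amdgpu-module-splitting-max-depth=2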
---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp  | 1766 ++++++++++++-----
 .../address-taken-externalize-with-call.ll    |   36 +-
 .../AMDGPU/address-taken-externalize.ll       |    2 +-
 .../llvm-split/AMDGPU/debug-name-hiding.ll    |   20 -
 .../AMDGPU/debug-non-kernel-root.ll           |   36 -
 .../tools/llvm-split/AMDGPU/declarations.ll   |    9 +-
 .../AMDGPU/kernels-alias-dependencies.ll      |   18 +-
 .../llvm-split/AMDGPU/kernels-cost-ranking.ll |   12 +-
 .../AMDGPU/kernels-dependency-external.ll     |   33 +-
 .../AMDGPU/kernels-dependency-indirect.ll     |   30 +-
 .../AMDGPU/kernels-dependency-overridable.ll  |   28 +-
 .../kernels-global-variables-noexternal.ll    |   12 +-
 .../AMDGPU/kernels-global-variables.ll        |   12 +-
 .../AMDGPU/large-kernels-merging.ll           |   26 +-
 .../AMDGPU/non-kernels-dependency-indirect.ll |   30 +-
 .../llvm-split/AMDGPU/recursive-search-2.ll   |  128 ++
 .../llvm-split/AMDGPU/recursive-search-8.ll   |  128 ++
 17 files changed, 1587 insertions(+), 739 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
 delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index df084cf41c4783..6d755a7c61d4f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -7,33 +7,36 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file Implements a module splitting algorithm designed to support the
-/// FullLTO --lto-partitions option for parallel codegen. This is completely
-/// different from the common SplitModule pass, as this system is designed with
-/// AMDGPU in mind.
+/// FullLTO --lto-partitions option for parallel codegen.
 ///
-/// The basic idea of this module splitting implementation is the same as
-/// SplitModule: load-balance the module's functions across a set of N
-/// partitions to allow parallel codegen. However, it does it very
-/// differently than the target-agnostic variant:
-///   - The module has "split roots", which are kernels in the vast
-//      majority of cases.
-///   - Each root has a set of dependencies, and when a root and its
-///     dependencies is considered "big", we try to put it in a partition where
-///     most dependencies are already imported, to avoid duplicating large
-///     amounts of code.
-///   - There's special care for indirect calls in order to ensure
-///     AMDGPUResourceUsageAnalysis can work correctly.
+/// The role of this module splitting pass is the same as
+/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
+/// across a set of N partitions to allow for parallel codegen.
 ///
-/// This file also includes a more elaborate logging system to enable
-/// users to easily generate logs that (if desired) do not include any value
-/// names, in order to not leak information about the source file.
-/// Such logs are very helpful to understand and fix potential issues with
-/// module splitting.
+/// The similarities mostly end here, as this pass achieves load-balancing in a
+/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
+/// advantage of the structure of AMDGPU modules (which are mostly
+/// self-contained) to allow for more efficient splitting without affecting
+/// codegen negatively, or causing inaccurate resource usage analysis.
+///
+/// High-level pass overview:
+///   - SplitGraph & associated classes
+///     - Graph representation of the module and of the dependencies that
+///       matter for splitting.
+///   - RecursiveSearchSplitting
+///     - Core splitting algorithm.
+///   - SplitProposal
+///     - Represents a suggested solution for splitting the input module. These
+///       solutions can be scored to determine the best one when multiple
+///       solutions are available.
+///   - Driver/pass "run" function glues everything together.
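+///
+/// A rough sketch of how these pieces fit together (illustrative only; the
+/// actual driver does more bookkeeping):
+/// \code
+///   SplitGraph SG(M, CostMap, ModuleCost);
+///   SG.buildGraph(CG);
+///   SmallVector<SplitProposal> Proposals;
+///   RecursiveSearchSplitting(SG, NumParts, [&](SplitProposal SP) {
+///     SP.calculateScores();
+///     Proposals.push_back(std::move(SP));
+///   }).run();
+///   // Compare the proposals' scores, then clone the module into each
+///   // partition following the best one.
+/// \endcode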
 
 #include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -44,44 +47,56 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/SHA256.h"
-#include "llvm/Support/Threading.h"
+#include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
 #include <cassert>
+#include <cmath>
 #include <iterator>
 #include <memory>
 #include <utility>
 #include <vector>
 
-using namespace llvm;
+#ifndef NDEBUG
+#include "llvm/Support/LockFileManager.h"
+#endif
 
 #define DEBUG_TYPE "amdgpu-split-module"
 
+namespace llvm {
 namespace {
 
+static cl::opt<unsigned> MaxDepth(
+    "amdgpu-module-splitting-max-depth",
+    cl::desc(
+        "maximum search depth. 0 forces a greedy approach. "
+        "warning: the algorithm is up to O(2^N), where N is the max depth."),
+    cl::init(8));
+
 static cl::opt<float> LargeFnFactor(
-    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
-    cl::Hidden,
+    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
     cl::desc(
-        "consider a function as large and needing special treatment when the "
-        "cost of importing it into a partition"
-        "exceeds the average cost of a partition by this factor; e;g. 2.0 "
-        "means if the function and its dependencies is 2 times bigger than "
-        "an average partition; 0 disables large functions handling entirely"));
+        "when max depth is reached and we can no longer branch out, this "
+        "value determines if a function is worth merging into an already "
+        "existing partition to reduce code duplication. This is a factor "
+        "of the ideal partition size, e.g. 2.0 means we consider the "
+        "function for merging if its cost (including its callees) is 2x the "
+        "size of an ideal partition."));
 
 static cl::opt<float> LargeFnOverlapForMerge(
-    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
-    cl::Hidden,
-    cl::desc(
-        "defines how much overlap between two large function's dependencies "
-        "is needed to put them in the same partition"));
+    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
+    cl::desc("when a function is considered for merging into a partition that "
+             "already contains some of its callees, do the merge if at least "
+             "n% of the code it can reach is already present inside the "
+             "partition; e.g. 0.7 means only merge >70%"));
 
 static cl::opt<bool> NoExternalizeGlobals(
     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -89,142 +104,95 @@ static cl::opt<bool> NoExternalizeGlobals(
              "may cause globals to be duplicated which increases binary size"));
 
 static cl::opt<std::string>
-    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
-              cl::desc("output directory for AMDGPU module splitting logs"));
+    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
+                       cl::Hidden,
+                       cl::desc("output file to write out the dotgraph "
+                                "representation of the input module"));
+
+static cl::opt<std::string> PartitionSummariesOutput(
+    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
+    cl::desc("output file to write out a summary of "
+             "the partitions created for each module"));
+
+#ifndef NDEBUG
+static cl::opt<bool> TimeBuild("amdgpu-module-splitting-time-trace", cl::Hidden,
+                               cl::desc("enable and print timers"));
+
+static cl::opt<bool>
+    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
+                cl::desc("use a lock file so only one process in the system "
+                         "can run this pass at once. useful to avoid mangled "
+                         "debug output in multithreaded environments."));
 
 static cl::opt<bool>
-    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
-               cl::desc("hash value names before printing them in the AMDGPU "
-                        "module splitting logs"));
+    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
+                        cl::Hidden,
+                        cl::desc("print all proposals received and whether "
+                                 "they were rejected or accepted"));
+
+struct SplitModuleTimer : NamedRegionTimer {
+  SplitModuleTimer(StringRef Name, StringRef Desc)
+      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
+                         TimeBuild) {}
+};
+#else
+struct SplitModuleTimer {
+  SplitModuleTimer(StringRef Name, StringRef Desc) {}
+};
+#endif
+
+//===----------------------------------------------------------------------===//
+// Utils
+//===----------------------------------------------------------------------===//
 
 using CostType = InstructionCost::CostType;
-using PartitionID = unsigned;
+using FunctionsCostMap = DenseMap<const Function *, CostType>;
 using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
+static constexpr unsigned InvalidPID = -1;
+
+/// \param Num numerator
+/// \param Dem denominator
+/// \param FmtString printf-like format string
+/// \returns a printable object to print (Num/Dem) as a percentage using
+/// FmtString.
+static auto formatRatioOf(CostType Num, CostType Dem,
+                          const char *FmtString = "%0.2f") {
+  return format(FmtString, (double(Num) / Dem) * 100);
+}
 
-static bool isEntryPoint(const Function *F) {
+static bool isKernel(const Function *F) {
   return AMDGPU::isEntryFunctionCC(F->getCallingConv());
 }
 
-static std::string getName(const Value &V) {
-  static bool HideNames;
-
-  static llvm::once_flag HideNameInitFlag;
-  llvm::call_once(HideNameInitFlag, [&]() {
-    if (LogPrivate.getNumOccurrences())
-      HideNames = LogPrivate;
-    else {
-      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
-      HideNames = (EV.value_or("0") != "0");
-    }
-  });
-
-  if (!HideNames)
-    return V.getName().str();
-  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
-               /*LowerCase=*/true);
+static bool isNonCopyable(const Function &F) {
+  return isKernel(&F) || F.hasExternalLinkage() || !F.isDefinitionExact();
 }
 
-/// Main logging helper.
-///
-/// Logging can be configured by the following environment variable.
-///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
-///     If set, uses <filepath> as the directory to write logfiles to
-///     each time module splitting is used.
-///   AMD_SPLIT_MODULE_LOG_PRIVATE
-///     If set to anything other than zero, all names are hidden.
-///
-/// Both environment variables have corresponding CL options which
-/// takes priority over them.
-///
-/// Any output printed to the log files is also printed to dbgs() when -debug is
-/// used and LLVM_DEBUG is defined.
-///
-/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
-/// cannot be removed from the code (by building without debug). This probably
-/// has a small performance cost because if some computation/formatting is
-/// needed for logging purpose, it may be done everytime only to be ignored
-/// by the logger.
-///
-/// As this pass only runs once and is not doing anything computationally
-/// expensive, this is likely a reasonable trade-off.
-///
-/// If some computation should really be avoided when unused, users of the class
-/// can check whether any logging will occur by using the bool operator.
-///
-/// \code
-///   if (SML) {
-///     // Executes only if logging to a file or if -debug is available and
-///     used.
-///   }
-/// \endcode
-class SplitModuleLogger {
-public:
-  SplitModuleLogger(const Module &M) {
-    std::string LogDir = LogDirOpt;
-    if (LogDir.empty())
-      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
-
-    // No log dir specified means we don't need to log to a file.
-    // We may still log to dbgs(), though.
-    if (LogDir.empty())
-      return;
-
-    // If a log directory is specified, create a new file with a unique name in
-    // that directory.
-    int Fd;
-    SmallString<0> PathTemplate;
-    SmallString<0> RealPath;
-    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
-    if (auto Err =
-            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
-      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
-                             "': " + Err.message(),
-                         /*CrashDiag=*/false);
-    }
-
-    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
-  }
-
-  bool hasLogFile() const { return FileOS != nullptr; }
-
-  raw_ostream &logfile() {
-    assert(FileOS && "no logfile!");
-    return *FileOS;
-  }
-
-  /// \returns true if this SML will log anything either to a file or dbgs().
-  /// Can be used to avoid expensive computations that are ignored when logging
-  /// is disabled.
-  operator bool() const {
-    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+/// If \p GV has local linkage, make it external + hidden.
+static void externalize(GlobalValue &GV) {
+  if (GV.hasLocalLinkage()) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    GV.setVisibility(GlobalValue::HiddenVisibility);
   }
 
-private:
-  std::unique_ptr<raw_fd_ostream> FileOS;
-};
-
-template <typename Ty>
-static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
-  static_assert(
-      !std::is_same_v<Ty, Value>,
-      "do not print values to logs directly, use handleName instead!");
-  LLVM_DEBUG(dbgs() << Val);
-  if (SML.hasLogFile())
-    SML.logfile() << Val;
-  return SML;
+  // Unnamed entities must be named consistently between modules. setName will
+  // give a distinct name to each such entity.
+  if (!GV.hasName())
+    GV.setName("__llvmsplit_unnamed");
 }
 
-/// Calculate the cost of each function in \p M
-/// \param SML Log Helper
+/// Cost analysis function. Calculates the cost of each function in \p M
+///
 /// \param GetTTI Abstract getter for TargetTransformInfo.
 /// \param M Module to analyze.
 /// \param CostMap[out] Resulting Function -> Cost map.
 /// \return The module's total cost.
-static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
-                       DenseMap<const Function *, CostType> &CostMap) {
+static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
+                                       FunctionsCostMap &CostMap) {
+  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");
+
+  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
   CostType ModuleCost = 0;
-  CostType KernelCost = 0;
+  [[maybe_unused]] CostType KernelCost = 0;
 
   for (auto &Fn : M) {
     if (Fn.isDeclaration())
@@ -251,23 +219,30 @@ calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
     assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
     ModuleCost += FnCost;
 
-    if (isEntryPoint(&Fn))
+    if (isKernel(&Fn))
       KernelCost += FnCost;
   }
 
-  CostType FnCost = (ModuleCost - KernelCost);
-  CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
-  SML << "=> Total Module Cost: " << ModuleCost << '\n'
-      << "  => KernelCost: " << KernelCost << " ("
-      << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
-      << "  => FnsCost: " << FnCost << " ("
-      << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
+  if (CostMap.empty())
+    return 0;
+
+  assert(ModuleCost);
+  LLVM_DEBUG({
+    const CostType FnCost = ModuleCost - KernelCost;
+    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
+           << KernelCost << " ("
+           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
+           << "% of the module), functions cost " << FnCost << " ("
+           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
+           << "% of the module)\n";
+  });
 
   return ModuleCost;
 }
 
+/// \return true if \p F can be indirectly called
 static bool canBeIndirectlyCalled(const Function &F) {
-  if (F.isDeclaration() || isEntryPoint(&F))
+  if (F.isDeclaration() || isKernel(&F))
     return false;
   return !F.hasLocalLinkage() ||
          F.hasAddressTaken(/*PutOffender=*/nullptr,
@@ -278,351 +253,1042 @@ static bool canBeIndirectlyCalled(const Function &F) {
                            /*IgnoreCastedDirectCall=*/true);
 }
 
-/// When a function or any of its callees performs an indirect call, this
-/// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the function.
+//===----------------------------------------------------------------------===//
+// Graph-based Module Representation
+//===----------------------------------------------------------------------===//
+
+/// AMDGPUSplitModule's view of the source Module, as a graph of all components
+/// that can be split into different modules.
 ///
-/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
-/// presence of an indirect call, the function's resource usage is the same as
-/// the most expensive function in the module.
-/// \param M    The module.
-/// \param Fns[out] Resulting list of functions.
-static void addAllIndirectCallDependencies(const Module &M,
-                                           DenseSet<const Function *> &Fns) {
-  for (const auto &Fn : M) {
-    if (canBeIndirectlyCalled(Fn))
-      Fns.insert(&Fn);
+/// The most trivial instance of this graph is just the CallGraph of the module,
+/// but it is not guaranteed that the graph is strictly equal to the call
+/// graph. It currently always is, but it's designed in a way that would
+/// eventually allow us to create abstract nodes, or nodes for different
+/// entities such as global
+/// variables or any other meaningful constraint we must consider.
+///
+/// The graph is only mutable by this class, and is generally not modified
+/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
+/// mutate it.
+class SplitGraph {
+public:
+  class Node;
+
+  enum class EdgeKind : uint8_t {
+    /// The nodes are related through a direct call. This is a "strong" edge as
+    /// it means the Src will directly reference the Dst.
+    DirectCall,
+    /// The nodes are related through an indirect call.
+    /// This is a "weaker" edge and is only considered when traversing the graph
+    /// starting from a kernel. We need this edge for resource usage analysis.
+    ///
+    /// The reason why we have this edge in the first place is due to how
+    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
+    /// the resource usage of the kernel containing the indirect call is the
+    /// max resource usage of all functions that can be indirectly called.
+    IndirectCall,
+  };
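+
+  // Illustrative example of IndirectCall edges: if a kernel K contains an
+  // indirect call, and F and G are the module's indirectly callable
+  // functions, buildGraph adds K -> F and K -> G IndirectCall edges. A
+  // partition that receives K thus also receives F and G, so
+  // AMDGPUResourceUsageAnalysis sees every possible callee of K.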
+
+  /// An edge between two nodes. Edges are directional, and tagged with a
+  /// "kind".
+  struct Edge {
+    Edge(Node *Src, Node *Dst, EdgeKind Kind)
+        : Src(Src), Dst(Dst), Kind(Kind) {}
+
+    Node *Src; ///< Source
+    Node *Dst; ///< Destination
+    EdgeKind Kind;
+  };
+
+  using EdgesVec = SmallVector<const Edge *, 0>;
+  using edges_iterator = EdgesVec::const_iterator;
+  using nodes_iterator = const Node *const *;
+
+  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
+             CostType ModuleCost)
+      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
+
+  void buildGraph(CallGraph &CG);
+
+#ifndef NDEBUG
+  void verifyGraph() const;
+#endif
+
+  bool empty() const { return Nodes.empty(); }
+  const iterator_range<nodes_iterator> nodes() const {
+    return {Nodes.begin(), Nodes.end()};
   }
-}
+  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
 
-/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
-/// callee until all reachable functions have been gathered.
+  unsigned getNumNodes() const { return Nodes.size(); }
+  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }
+
+  const Module &getModule() const { return M; }
+
+  CostType getModuleCost() const { return ModuleCost; }
+  CostType getCost(const Function &F) const { return CostMap.at(&F); }
+
+  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
+  /// IDs).
+  CostType calculateCost(const BitVector &BV) const;
+
+private:
+  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
+  /// updates \p Cache.
+  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
+                const GlobalValue &GV);
+
+  // Create a new edge between two nodes and add it to both nodes.
+  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);
+
+  const Module &M;
+  const FunctionsCostMap &CostMap;
+  CostType ModuleCost;
+
+  // Final list of nodes with stable ordering.
+  SmallVector<Node *> Nodes;
+
+  SpecificBumpPtrAllocator<Node> NodesPool;
+
+  // Edges are trivially destructible objects, so as a small optimization we
+  // use a BumpPtrAllocator which avoids destructor calls but also makes
+  // allocation faster.
+  static_assert(
+      std::is_trivially_destructible_v<Edge>,
+      "Edge must be trivially destructible to use the BumpPtrAllocator");
+  BumpPtrAllocator EdgesPool;
+};
+
+/// Nodes in the SplitGraph contain both incoming and outgoing edges.
+/// Incoming edges have this node as their Dst, and outgoing ones have this
+/// node as their Src.
 ///
-/// \param SML Log Helper
-/// \param CG Call graph for \p Fn's module.
-/// \param Fn Current function to look at.
-/// \param Fns[out] Resulting list of functions.
-/// \param OnlyDirect Whether to only consider direct callees.
-/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
-/// point, either in \p Fn or in one of the function it calls. When that
-/// happens, we fall back to adding all callable functions inside \p Fn's module
-/// to \p Fns.
-static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
-                               const Function &Fn,
-                               DenseSet<const Function *> &Fns, bool OnlyDirect,
-                               bool &HadIndirectCall) {
-  assert(!Fn.isDeclaration());
-
-  const Module &M = *Fn.getParent();
-  SmallVector<const Function *> WorkList({&Fn});
+/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
+/// feedback on how two nodes are related, and in which direction they are
+/// related, which is valuable information to make splitting decisions.
+///
+/// Nodes are fundamentally abstract, and any consumers of the graph should
+/// treat them as such. While a node will be a function most of the time, we
+/// could also create nodes for any other reason. In the future, we could have
+/// single nodes for multiple functions, or nodes for GVs, etc.
+class SplitGraph::Node {
+  friend class SplitGraph;
+
+public:
+  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
+       bool IsNonCopyable)
+      : ID(ID), GV(GV), IndividualCost(IndividualCost),
+        IsNonCopyable(IsNonCopyable), IsEntry(false) {
+    if (auto *Fn = dyn_cast<Function>(&GV))
+      IsKernel = ::llvm::isKernel(Fn);
+    else
+      IsKernel = false;
+  }
+
+  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
+  /// nodes in the graph. This ID can be used as an index in a BitVector.
+  unsigned getID() const { return ID; }
+
+  const Function &getFunction() const { return cast<Function>(GV); }
+
+  /// \returns the cost to import this component into a given module, not
+  /// accounting for any dependencies that may need to be imported as well.
+  CostType getIndividualCost() const { return IndividualCost; }
+
+  bool isNonCopyable() const { return IsNonCopyable; }
+  bool isKernel() const { return IsKernel; }
+
+  /// \returns whether this is an entry point in the graph. Entry points are
+  /// defined as follows: if you take all entry points in the graph, and iterate
+  /// their dependencies, you are guaranteed to visit all nodes in the graph at
+  /// least once.
+  bool isGraphEntryPoint() const { return IsEntry; }
+
+  std::string getName() const { return GV.getName().str(); }
+
+  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
+  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
+    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
+  }
+
+  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
+  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
+    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
+  }
+
+  iterator_range<edges_iterator> incoming_edges() const {
+    return IncomingEdges;
+  }
+
+  iterator_range<edges_iterator> outgoing_edges() const {
+    return OutgoingEdges;
+  }
+
+  bool shouldFollowIndirectCalls() const { return isKernel(); }
+
+  /// Visit all children of this node in a recursive fashion. Also visits Self.
+  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
+  /// DirectCall edges.
+  ///
+  /// \param Visitor Visitor Function.
+  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;
+
+  /// Adds the dependencies of this node to \p BV by setting the bit
+  /// corresponding to each node.
+  ///
+  /// Implemented using \ref visitAllDependencies, hence it follows the same
+  /// rules regarding dependencies traversal.
+  ///
+  /// \param[out] BV The bitvector where the bits should be set.
+  void setDependenciesBits(BitVector &BV) const {
+    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
+  }
+
+  /// Uses \ref visitAllDependencies to aggregate the individual cost of this
+  /// node and all of its dependencies.
+  ///
+  /// This is cached.
+  CostType getFullCost() const;
+
+private:
+  void markAsEntry() { IsEntry = true; }
+
+  unsigned ID;
+  const GlobalValue &GV;
+  CostType IndividualCost;
+  bool IsNonCopyable : 1;
+  bool IsKernel : 1;
+  bool IsEntry : 1;
+
+  // TODO: Cache dependencies as well?
+  mutable CostType FullCost = 0;
+
+  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
+  // together)
+  EdgesVec IncomingEdges;
+  EdgesVec OutgoingEdges;
+};
+
+void SplitGraph::Node::visitAllDependencies(
+    std::function<void(const Node &)> Visitor) const {
+  const bool FollowIndirect = shouldFollowIndirectCalls();
+  // FIXME: If this can access SplitGraph in the future, use a BitVector
+  // instead.
+  DenseSet<const Node *> Seen;
+  SmallVector<const Node *, 8> WorkList({this});
   while (!WorkList.empty()) {
-    const auto &CurFn = *WorkList.pop_back_val();
-    assert(!CurFn.isDeclaration());
+    const Node *CurN = WorkList.pop_back_val();
+    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
+      continue;
+
+    Visitor(*CurN);
 
-    // Scan for an indirect call. If such a call is found, we have to
-    // conservatively assume this can call all non-entrypoint functions in the
-    // module.
+    for (const Edge *E : CurN->outgoing_edges()) {
+      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
+        continue;
+      WorkList.push_back(E->Dst);
+    }
+  }
+}
+
+CostType SplitGraph::Node::getFullCost() const {
+  if (FullCost)
+    return FullCost;
+
+  assert(FullCost == 0);
+  visitAllDependencies(
+      [&](const Node &N) { FullCost += N.getIndividualCost(); });
+  return FullCost;
+}
+
+void SplitGraph::buildGraph(CallGraph &CG) {
+  SplitModuleTimer SMT("buildGraph", "graph construction");
+  LLVM_DEBUG(
+      dbgs()
+      << "[build graph] constructing graph representation of the input\n");
+
+  // We build the graph by just iterating all functions in the module and
+  // working on their direct callees. At the end, all nodes should be linked
+  // together as expected.
+  DenseMap<const GlobalValue *, Node *> Cache;
+  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
+  for (const Function &Fn : M) {
+    if (Fn.isDeclaration())
+      continue;
 
-    for (auto &CGEntry : *CG[&CurFn]) {
+    // Look at direct callees and create the necessary edges in the graph.
+    bool HasIndirectCall = false;
+    Node &N = getNode(Cache, Fn);
+    for (auto &CGEntry : *CG[&Fn]) {
       auto *CGNode = CGEntry.second;
       auto *Callee = CGNode->getFunction();
       if (!Callee) {
-        if (OnlyDirect)
-          continue;
-
-        // Functions have an edge towards CallsExternalNode if they're external
-        // declarations, or if they do an indirect call. As we only process
-        // definitions here, we know this means the function has an indirect
-        // call. We then have to conservatively assume this can call all
-        // non-entrypoint functions in the module.
-        if (CGNode != CG.getCallsExternalNode())
-          continue; // this is another function-less node we don't care about.
-
-        SML << "Indirect call detected in " << getName(CurFn)
-            << " - treating all non-entrypoint functions as "
-               "potential dependencies\n";
-
-        // TODO: Print an ORE as well ?
-        addAllIndirectCallDependencies(M, Fns);
-        HadIndirectCall = true;
+        // TODO: Don't consider inline assembly as indirect calls.
+        if (CGNode == CG.getCallsExternalNode())
+          HasIndirectCall = true;
         continue;
       }
 
-      if (Callee->isDeclaration())
-        continue;
+      if (!Callee->isDeclaration())
+        createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);
+    }
+
+    // Keep track of this function if it contains an indirect call and/or if it
+    // can be indirectly called.
+    if (HasIndirectCall) {
+      LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n");
+      FnsWithIndirectCalls.push_back(&Fn);
+    }
+
+    if (canBeIndirectlyCalled(Fn))
+      IndirectlyCallableFns.push_back(&Fn);
+  }
 
-      auto [It, Inserted] = Fns.insert(Callee);
-      if (Inserted)
-        WorkList.push_back(Callee);
+  // Post-process functions with indirect calls.
+  for (const Function *Fn : FnsWithIndirectCalls) {
+    for (const Function *Candidate : IndirectlyCallableFns) {
+      Node &Src = getNode(Cache, *Fn);
+      Node &Dst = getNode(Cache, *Candidate);
+      createEdge(Src, Dst, EdgeKind::IndirectCall);
     }
   }
+
+  // Now, find all entry points.
+  SmallVector<Node *, 16> CandidateEntryPoints;
+  BitVector NodesReachableByKernels = createNodesBitVector();
+  for (Node *N : Nodes) {
+    // Kernels are always entry points.
+    if (N->isKernel()) {
+      N->markAsEntry();
+      N->setDependenciesBits(NodesReachableByKernels);
+    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall)) {
+      CandidateEntryPoints.push_back(N);
+    }
+  }
+
+  for (Node *N : CandidateEntryPoints) {
+    // This can be another entry point if it's not reachable by a kernel.
+    // TODO: We could sort all of the possible new entries in a stable order
+    // (e.g. by cost), then consume them one by one until
+    // NodesReachableByKernels is all 1s. It'd allow us to avoid
+    // considering some nodes as non-entries in some specific cases.
+    if (!NodesReachableByKernels.test(N->getID()))
+      N->markAsEntry();
+  }
+
+#ifndef NDEBUG
+  verifyGraph();
+#endif
 }
 
-/// Contains information about a function and its dependencies.
-/// This is a splitting root. The splitting algorithm works by
-/// assigning these to partitions.
-struct FunctionWithDependencies {
-  FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
-                           const DenseMap<const Function *, CostType> &FnCosts,
-                           const Function *Fn)
-      : Fn(Fn) {
-    // When Fn is not a kernel, we don't need to collect indirect callees.
-    // Resource usage analysis is only performed on kernels, and we collect
-    // indirect callees for resource usage analysis.
-    addAllDependencies(SML, CG, *Fn, Dependencies,
-                       /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
-    TotalCost = FnCosts.at(Fn);
-    for (const auto *Dep : Dependencies) {
-      TotalCost += FnCosts.at(Dep);
-
-      // We cannot duplicate functions with external linkage, or functions that
-      // may be overriden at runtime.
-      HasNonDuplicatableDependecy |=
-          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
+#ifndef NDEBUG
+void SplitGraph::verifyGraph() const {
+  unsigned ExpectedID = 0;
+  // Exceptionally using a set here in case IDs are messed up.
+  DenseSet<const Node *> SeenNodes;
+  DenseSet<const Function *> SeenFunctionNodes;
+  for (const Node *N : Nodes) {
+    assert(N->getID() == (ExpectedID++) && "Node IDs are incorrect!");
+    assert(SeenNodes.insert(N).second && "Node seen more than once!");
+    assert(&getNode(N->getID()) == N);
+
+    for (const Edge *E : N->IncomingEdges) {
+      assert(E->Src && E->Dst);
+      assert(E->Dst == N);
+      assert(find(E->Src->OutgoingEdges, E) != E->Src->OutgoingEdges.end());
     }
+
+    for (const Edge *E : N->OutgoingEdges) {
+      assert(E->Src && E->Dst);
+      assert(E->Src == N);
+      assert(find(E->Dst->IncomingEdges, E) != E->Dst->IncomingEdges.end());
+    }
+
+    const Function &Fn = N->getFunction();
+    if (isKernel(&Fn)) {
+      assert(!N->hasAnyIncomingEdges() && "Kernels cannot have incoming edges");
+    }
+    assert(!Fn.isDeclaration() && "declarations shouldn't have nodes!");
+
+    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
+    assert(Inserted && "one function has multiple nodes!");
   }
+  assert(ExpectedID == Nodes.size() && "Node IDs out of sync!");
 
-  const Function *Fn = nullptr;
-  DenseSet<const Function *> Dependencies;
-  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
-  bool HasIndirectCall = false;
-  /// Whether any of \p Fn's dependencies cannot be duplicated.
-  bool HasNonDuplicatableDependecy = false;
+  assert(createNodesBitVector().size() == getNumNodes());
 
-  CostType TotalCost = 0;
+  // Check we respect the promise of Node::isGraphEntryPoint.
+  BitVector BV = createNodesBitVector();
+  for (const Node *N : nodes()) {
+    if (N->isGraphEntryPoint())
+      N->setDependenciesBits(BV);
+  }
 
-  /// \returns true if this function and its dependencies can be considered
-  /// large according to \p Threshold.
-  bool isLarge(CostType Threshold) const {
-    return TotalCost > Threshold && !Dependencies.empty();
+  // Ensure each function in the module has an associated node.
+  for (const auto &Fn : M) {
+    if (!Fn.isDeclaration())
+      assert(SeenFunctionNodes.contains(&Fn) &&
+             "Fn has no associated node in the graph!");
   }
+
+  assert(BV.all() &&
+         "not all nodes are reachable through the graph's entry points!");
+}
+#endif
+
+CostType SplitGraph::calculateCost(const BitVector &BV) const {
+  CostType Cost = 0;
+  for (unsigned NodeID : BV.set_bits())
+    Cost += getNode(NodeID).getIndividualCost();
+  return Cost;
+}
+
+SplitGraph::Node &
+SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
+                    const GlobalValue &GV) {
+  auto &N = Cache[&GV];
+  if (N)
+    return *N;
+
+  CostType Cost = 0;
+  bool NonCopyable = false;
+  if (const Function *Fn = dyn_cast<Function>(&GV)) {
+    NonCopyable = isNonCopyable(*Fn);
+    Cost = CostMap.at(Fn);
+  }
+  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
+  Nodes.push_back(N);
+  assert(&getNode(N->getID()) == N);
+  return *N;
+}
+
+const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
+                                               EdgeKind EK) {
+  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
+  Src.OutgoingEdges.push_back(E);
+  Dst.IncomingEdges.push_back(E);
+  return *E;
+}
+
+//===----------------------------------------------------------------------===//
+// Split Proposals
+//===----------------------------------------------------------------------===//
+
+/// Represents a module splitting proposal.
+///
+/// Proposals are made of N BitVectors, one for each partition, where each bit
+/// set indicates that the node is present and should be copied inside that
+/// partition.
+///
+/// Proposals have several metrics attached so they can be compared/sorted.
+/// This allows the driver to try multiple strategies, resulting in multiple
+/// proposals, and to choose the best one out of them.
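+///
+/// For instance (illustrative), a proposal for a 4-node graph split across
+/// two partitions could be:
+/// \code
+///   P0: {0, 1, 3}
+///   P1: {1, 2}  // node 1 is duplicated into both partitions
+/// \endcode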
+class SplitProposal {
+public:
+  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
+    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
+  }
+
+  void setName(StringRef NewName) { Name = NewName; }
+  StringRef getName() const { return Name; }
+
+  const BitVector &operator[](unsigned PID) const {
+    return Partitions[PID].second;
+  }
+
+  void add(unsigned PID, const BitVector &BV) {
+    Partitions[PID].second |= BV;
+    updateScore(PID);
+  }
+
+  void print(raw_ostream &OS) const;
+  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+
+  // Find the cheapest partition (lowest cost). In case of ties, always returns
+  // the highest partition number.
+  unsigned findCheapestPartition() const;
+
+  /// Calculate the CodeSize and Bottleneck scores.
+  void calculateScores();
+
+#ifndef NDEBUG
+  void verifyCompleteness() const;
+#endif
+
+  /// Only available after \ref calculateScores is called.
+  ///
+  /// A number >= 1.0 indicating how much bigger the proposal's aggregated
+  /// code size is compared to the original module, duplication included.
+  /// e.g. 1.2 means this proposal adds roughly 20% code size by duplicating
+  /// some functions across partitions.
+  ///
+  /// Value is always rounded up to 2 decimal places.
+  ///
+  /// A perfect score would be 1.0 (no duplication), and anything approaching
+  /// 2.0 is very bad.
+  double getCodeSizeScore() const { return CodeSizeScore; }
+
+  /// Only available after \ref calculateScores is called.
+  ///
+  /// A number between [0, 1] which indicates how big of a bottleneck is
+  /// expected from the largest partition.
+  ///
+  /// A score of 1.0 means the biggest partition is as big as the source module,
+  /// so build time will be equal to or greater than the build time of the
+  /// initial input.
+  ///
+  /// Value is always rounded up to 2 decimal places.
+  ///
+  /// This is one of the metrics used to estimate this proposal's build time.
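+  ///
+  /// Worked example (illustrative): for a module of cost 100 split into
+  /// partitions of cost 60, 40 and 30, TotalCost is 130 so the code size
+  /// score is 1.3, and the largest partition costs 60 so the bottleneck
+  /// score is 0.6.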
+  double getBottleneckScore() const { return BottleneckScore; }
+
+private:
+  void updateScore(unsigned PID) {
+    assert(SG);
+    for (auto &[PCost, Nodes] : Partitions) {
+      TotalCost -= PCost;
+      PCost = SG->calculateCost(Nodes);
+      TotalCost += PCost;
+    }
+  }
+
+  /// \see getCodeSizeScore
+  double CodeSizeScore = 0.0;
+  /// \see getBottleneckScore
+  double BottleneckScore = 0.0;
+  /// Aggregated cost of all partitions
+  CostType TotalCost = 0;
+
+  const SplitGraph *SG = nullptr;
+  std::string Name;
+
+  std::vector<std::pair<CostType, BitVector>> Partitions;
 };
 
-/// Calculates how much overlap there is between \p A and \p B.
-/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
-/// and B have no shared elements. Kernels do not count in overlap calculation.
-static float calculateOverlap(const DenseSet<const Function *> &A,
-                              const DenseSet<const Function *> &B) {
-  DenseSet<const Function *> Total;
-  for (const auto *F : A) {
-    if (!isEntryPoint(F))
-      Total.insert(F);
+void SplitProposal::print(raw_ostream &OS) const {
+  assert(SG);
+
+  OS << "[proposal] " << Name << ", total cost:" << TotalCost
+     << ", code size score:" << format("%0.3f", CodeSizeScore)
+     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << "\n";
+  for (const auto &[PID, Part] : enumerate(Partitions)) {
+    const auto &[Cost, NodeIDs] = Part;
+    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
+       << "|" << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
   }
+}
 
-  if (Total.empty())
-    return 0.0f;
+unsigned SplitProposal::findCheapestPartition() const {
+  assert(!Partitions.empty());
+  CostType CurCost = std::numeric_limits<CostType>::max();
+  unsigned CurPID = InvalidPID;
+  for (const auto &[Idx, Part] : enumerate(Partitions)) {
+    if (Part.first <= CurCost) {
+      CurPID = Idx;
+      CurCost = Part.first;
+    }
+  }
+  assert(CurPID != InvalidPID);
+  return CurPID;
+}
 
-  unsigned NumCommon = 0;
-  for (const auto *F : B) {
-    if (isEntryPoint(F))
-      continue;
+void SplitProposal::calculateScores() {
+  if (Partitions.empty())
+    return;
 
-    auto [It, Inserted] = Total.insert(F);
-    if (!Inserted)
-      ++NumCommon;
+  assert(SG);
+  CostType LargestPCost = 0;
+  for (auto &[PCost, Nodes] : Partitions) {
+    if (PCost > LargestPCost)
+      LargestPCost = PCost;
   }
 
-  return static_cast<float>(NumCommon) / Total.size();
+  CostType ModuleCost = SG->getModuleCost();
+  CodeSizeScore = double(TotalCost) / ModuleCost;
+  assert(CodeSizeScore >= 0.0);
+
+  BottleneckScore = double(LargestPCost) / ModuleCost;
+
+  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
+  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
 }
 
-/// Performs all of the partitioning work on \p M.
-/// \param SML Log Helper
-/// \param M Module to partition.
-/// \param NumParts Number of partitions to create.
-/// \param ModuleCost Total cost of all functions in \p M.
-/// \param FnCosts Map of Function -> Cost
-/// \param WorkList Functions and their dependencies to process in order.
-/// \returns The created partitions (a vector of size \p NumParts )
-static std::vector<DenseSet<const Function *>>
-doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
-               CostType ModuleCost,
-               const DenseMap<const Function *, CostType> &FnCosts,
-               const SmallVector<FunctionWithDependencies> &WorkList) {
-
-  SML << "\n--Partitioning Starts--\n";
-
-  // Calculate a "large function threshold". When more than one function's total
-  // import cost exceeds this value, we will try to assign it to an existing
-  // partition to reduce the amount of duplication needed.
-  //
-  // e.g. let two functions X and Y have a import cost of ~10% of the module, we
-  // assign X to a partition as usual, but when we get to Y, we check if it's
-  // worth also putting it in Y's partition.
-  const CostType LargeFnThreshold =
-      LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
-                    : std::numeric_limits<CostType>::max();
-
-  std::vector<DenseSet<const Function *>> Partitions;
-  Partitions.resize(NumParts);
-
-  // Assign functions to partitions, and try to keep the partitions more or
-  // less balanced. We do that through a priority queue sorted in reverse, so we
-  // can always look at the partition with the least content.
-  //
-  // There are some cases where we will be deliberately unbalanced though.
-  //  - Large functions: we try to merge with existing partitions to reduce code
-  //  duplication.
-  //  - Functions with indirect or external calls always go in the first
-  //  partition (P0).
-  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
-                              const std::pair<PartitionID, CostType> &b) {
-    // When two partitions have the same cost, assign to the one with the
-    // biggest ID first. This allows us to put things in P0 last, because P0 may
-    // have other stuff added later.
-    if (a.second == b.second)
-      return a.first < b.first;
-    return a.second > b.second;
+#ifndef NDEBUG
+void SplitProposal::verifyCompleteness() const {
+  if (Partitions.empty())
+    return;
+
+  BitVector Result = Partitions[0].second;
+  for (const auto &P : drop_begin(Partitions))
+    Result |= P.second;
+  assert(Result.all() && "some nodes are missing from this proposal!");
+}
+#endif
+
+//===-- RecursiveSearchStrategy -------------------------------------------===//
+
+/// Partitioning algorithm.
+///
+/// This is a recursive search algorithm that can explore multiple
+/// possibilities.
+///
+/// When a cluster of nodes can go into more than one partition, and we haven't
+/// reached maximum search depth, we recurse and explore both options and their
+/// consequences. Both branches will yield a proposal, and the driver will grade
+/// both and choose the best one.
+///
+/// If max depth is reached, we will use some heuristics to make a choice. Most
+/// of the time we will just use the least-pressured (cheapest) partition, but
+/// if a cluster is particularly big and there is a good amount of overlap with
+/// an existing partition, we will choose that partition instead.
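+///
+/// Illustrative search tree with MaxDepth >= 2, for two clusters A and B that
+/// each have two viable assignments (lb = load-balance into the cheapest
+/// partition, ms = merge into the most similar partition):
+/// \code
+///            A:lb                  A:ms
+///          /      \              /      \
+///       B:lb     B:ms         B:lb     B:ms
+/// \endcode
+/// Each leaf of the search tree yields one SplitProposal, which the driver
+/// scores to pick the best one.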
+class RecursiveSearchSplitting {
+public:
+  using SubmitProposalFn = function_ref<void(SplitProposal)>;
+
+  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
+                           SubmitProposalFn SubmitProposal);
+
+  void run();
+
+private:
+  struct WorkListEntry {
+    WorkListEntry(const BitVector &BV) : Cluster(BV) {}
+
+    unsigned NumNonEntryNodes = 0;
+    CostType TotalCost = 0;
+    CostType CostExcludingGraphEntryPoints = 0;
+    BitVector Cluster;
   };
 
-  // We can't use priority_queue here because we need to be able to access any
-  // element. This makes this a bit inefficient as we need to sort it again
-  // everytime we change it, but it's a very small array anyway (likely under 64
-  // partitions) so it's a cheap operation.
-  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
-  for (unsigned I = 0; I < NumParts; ++I)
-    BalancingQueue.emplace_back(I, 0);
-
-  // Helper function to handle assigning a function to a partition. This takes
-  // care of updating the balancing queue.
-  const auto AssignToPartition = [&](PartitionID PID,
-                                     const FunctionWithDependencies &FWD) {
-    auto &FnsInPart = Partitions[PID];
-    FnsInPart.insert(FWD.Fn);
-    FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
-
-    SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n  ->  ";
-    if (!FWD.Dependencies.empty()) {
-      SML << FWD.Dependencies.size() << " dependencies added\n";
-    };
-
-    // Update the balancing queue. we scan backwards because in the common case
-    // the partition is at the end.
-    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
-      if (QueuePID == PID) {
-        CostType NewCost = 0;
-        for (auto *Fn : Partitions[PID])
-          NewCost += FnCosts.at(Fn);
-
-        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
-        if (Cost) {
-          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
-              << "% increase)";
-        }
-        SML << '\n';
+  /// Collects all graph entry points' clusters and sorts them so the most
+  /// expensive clusters are viewed first. This will merge clusters together if
+  /// they share a non-copyable dependency.
+  void setupWorkList();
+
+  /// Recursive function that assigns the worklist item at \p Idx into a
+  /// partition of \p SP.
+  ///
+  /// \p Depth is the current search depth. When this value is equal to
+  /// \ref MaxDepth, we can no longer recurse.
+  ///
+  /// This function only recurses if there is more than one possible assignment,
+  /// otherwise it is iterative to avoid creating a call stack that is as big as
+  /// \ref WorkList.
+  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);
+
+  /// \return A pair: first element is the PID of the partition that has the
+  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
+  /// found with at least one element in common. The second element is the
+  /// aggregated cost of all dependencies in common between \p Entry and that
+  /// partition.
+  std::pair<unsigned, CostType>
+  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);
+
+  const SplitGraph &SG;
+  unsigned NumParts;
+  SubmitProposalFn SubmitProposal;
+
+  // A Cluster is considered large when its cost, excluding entry points,
+  // exceeds this value.
+  CostType LargeClusterThreshold = 0;
+  unsigned NumProposalsSubmitted = 0;
+  SmallVector<WorkListEntry> WorkList;
+};
+
+RecursiveSearchSplitting::RecursiveSearchSplitting(
+    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
+    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
+  // Arbitrary max value as a safeguard. Anything above 10 will already be
+  // slow; this is just a limit to prevent extreme resource exhaustion or
+  // unbounded run time.
+  if (MaxDepth > 16)
+    report_fatal_error("[amdgpu-split-module] search depth of " +
+                       Twine(MaxDepth) + " is too high!");
+  LargeClusterThreshold =
+      (LargeFnFactor != 0.0)
+          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
+          : std::numeric_limits<CostType>::max();
+  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
+                    << LargeClusterThreshold << "\n");
+}
+
+void RecursiveSearchSplitting::run() {
+  {
+    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
+    setupWorkList();
+  }
+
+  {
+    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
+    SplitProposal SP(SG, NumParts);
+    pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
+  }
+}
 
-        Cost = NewCost;
+void RecursiveSearchSplitting::setupWorkList() {
+  // e.g. if A and B are two worklist items, and they both call a non-copyable
+  // dependency C, this does:
+  //    A=C
+  //    B=C
+  // => NodeEC will create a single group (A, B, C) and we create a new
+  // WorkList entry for that group.
+
+  EquivalenceClasses<unsigned> NodeEC;
+  for (const SplitGraph::Node *N : SG.nodes()) {
+    if (!N->isGraphEntryPoint())
+      continue;
+
+    NodeEC.insert(N->getID());
+    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
+      if (&Dep != N && Dep.isNonCopyable())
+        NodeEC.unionSets(N->getID(), Dep.getID());
+    });
+  }
+
+  for (auto I = NodeEC.begin(), E = NodeEC.end(); I != E; ++I) {
+    if (!I->isLeader())
+      continue;
+
+    BitVector Cluster = SG.createNodesBitVector();
+    for (auto MI = NodeEC.member_begin(I); MI != NodeEC.member_end(); ++MI) {
+      const SplitGraph::Node &N = SG.getNode(*MI);
+      if (N.isGraphEntryPoint())
+        N.setDependenciesBits(Cluster);
+    }
+    WorkList.emplace_back(std::move(Cluster));
+  }
+
+  // Calculate costs and other useful information.
+  for (WorkListEntry &Entry : WorkList) {
+    for (unsigned NodeID : Entry.Cluster.set_bits()) {
+      const SplitGraph::Node &N = SG.getNode(NodeID);
+      const CostType Cost = N.getIndividualCost();
+
+      Entry.TotalCost += Cost;
+      if (!N.isGraphEntryPoint()) {
+        Entry.CostExcludingGraphEntryPoints += Cost;
+        ++Entry.NumNonEntryNodes;
       }
     }
+  }
 
-    sort(BalancingQueue, ComparePartitions);
-  };
+  sort(WorkList, [](const WorkListEntry &LHS, const WorkListEntry &RHS) {
+    return LHS.TotalCost > RHS.TotalCost;
+  });
 
-  for (auto &CurFn : WorkList) {
-    // When a function has indirect calls, it must stay in the first partition
-    // alongside every reachable non-entry function. This is a nightmare case
-    // for splitting as it severely limits what we can do.
-    if (CurFn.HasIndirectCall) {
-      SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
-          << " defaulting to P0\n";
-      AssignToPartition(0, CurFn);
-      continue;
+  LLVM_DEBUG({
+    dbgs() << "[recursive search] worklist:\n";
+    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
+      dbgs() << "  - [" << Idx << "]: ";
+      for (unsigned NodeID : Entry.Cluster.set_bits())
+        dbgs() << NodeID << " ";
+      dbgs() << "(total_cost:" << Entry.TotalCost
+             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
+             << ")\n";
     }
+  });
+}
 
-    // When a function has non duplicatable dependencies, we have to keep it in
-    // the first partition as well. This is a conservative approach, a
-    // finer-grained approach could keep track of which dependencies are
-    // non-duplicatable exactly and just make sure they're grouped together.
-    if (CurFn.HasNonDuplicatableDependecy) {
-      SML << "Function with externally visible dependency "
-          << getName(*CurFn.Fn) << " defaulting to P0\n";
-      AssignToPartition(0, CurFn);
+void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
+                                             SplitProposal SP) {
+  while (Idx < WorkList.size()) {
+    // Step 1: Determine candidate PIDs.
+    const WorkListEntry &Entry = WorkList[Idx];
+    const BitVector &Cluster = Entry.Cluster;
+
+    // Default option is to do load-balancing, AKA assign to least pressured
+    // partition.
+    const unsigned CheapestPID = SP.findCheapestPartition();
+    assert(CheapestPID != InvalidPID);
+
+    // Explore assigning to the partition that contains the most dependencies
+    // in common.
+    const auto [MostSimilarPID, SimilarDepsCost] =
+        findMostSimilarPartition(Entry, SP);
+
+    // We can choose to explore only one path if we only have one valid path,
+    // or if we reached maximum search depth and can no longer branch out.
+    unsigned SinglePIDToTry = InvalidPID;
+    if (MostSimilarPID == InvalidPID) // no similar PID found
+      SinglePIDToTry = CheapestPID;
+    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
+      SinglePIDToTry = CheapestPID;
+    else if (Depth >= MaxDepth) {
+      // We have to choose one path. Use a heuristic to guess which one will be
+      // more appropriate.
+      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
+        // Check if the amount of code in common makes it worth it.
+        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
+        const double Ratio = static_cast<double>(SimilarDepsCost) /
+                             Entry.CostExcludingGraphEntryPoints;
+        assert(Ratio >= 0.0 && Ratio <= 1.0);
+        if (Ratio > LargeFnOverlapForMerge) {
+          // For debug, just print "L", so we'll see "L3=P3" for instance, which
+          // will mean we reached max depth and chose P3 based on this
+          // heuristic.
+          LLVM_DEBUG(dbgs() << "L");
+          SinglePIDToTry = MostSimilarPID;
+        }
+      } else
+        SinglePIDToTry = CheapestPID;
+    }
+
+    // Step 2: Explore candidates.
+
+    // When we only explore one possible path, and thus branch depth doesn't
+    // increase, do not recurse, iterate instead.
+    if (SinglePIDToTry != InvalidPID) {
+      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
+      // Only one path to explore, don't clone SP, don't increase depth.
+      SP.add(SinglePIDToTry, Cluster);
+      ++Idx;
       continue;
     }
 
-    // Be smart with large functions to avoid duplicating their dependencies.
-    if (CurFn.isLarge(LargeFnThreshold)) {
-      assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
-      SML << "Large Function: " << getName(*CurFn.Fn)
-          << " - looking for partition with at least "
-          << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
-
-      bool Assigned = false;
-      for (const auto &[PID, Fns] : enumerate(Partitions)) {
-        float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
-        SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
-            << PID << '\n';
-        if (Overlap > LargeFnOverlapForMerge) {
-          SML << "  selecting P" << PID << '\n';
-          AssignToPartition(PID, CurFn);
-          Assigned = true;
-        }
-      }
+    assert(MostSimilarPID != InvalidPID);
 
-      if (Assigned)
-        continue;
+    // We explore multiple paths: recurse at increased depth, then stop this
+    // function.
+
+    LLVM_DEBUG(dbgs() << '\n');
+
+    // lb = load balancing = put in cheapest partition
+    {
+      SplitProposal BranchSP = SP;
+      LLVM_DEBUG(dbgs() << " [lb] " << std::string(Depth, ' ') << Idx << "=P"
+                        << CheapestPID << "? ");
+      BranchSP.add(CheapestPID, Cluster);
+      pickPartition(Depth + 1, Idx + 1, BranchSP);
     }
 
-    // Normal "load-balancing", assign to partition with least pressure.
-    auto [PID, CurCost] = BalancingQueue.back();
-    AssignToPartition(PID, CurFn);
+    // ms = most similar = put in partition with the most in common
+    {
+      SplitProposal BranchSP = SP;
+      LLVM_DEBUG(dbgs() << " [ms] " << std::string(Depth, ' ') << Idx << "=P"
+                        << MostSimilarPID << "? ");
+      BranchSP.add(MostSimilarPID, Cluster);
+      pickPartition(Depth + 1, Idx + 1, BranchSP);
+    }
+
+    return;
   }
 
-  if (SML) {
-    CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
-    for (const auto &[Idx, Part] : enumerate(Partitions)) {
-      CostType Cost = 0;
-      for (auto *Fn : Part)
-        Cost += FnCosts.at(Fn);
-      SML << "P" << Idx << " has a total cost of " << Cost << " ("
-          << format("%0.2f", (float(Cost) / ModuleCostOr1) * 100)
-          << "% of source module)\n";
+  // Step 3: If we assigned all WorkList items, submit the proposal.
+
+  assert(Idx == WorkList.size());
+  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
+         "Search got out of bounds?");
+  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
+             std::to_string(NumProposalsSubmitted++));
+  LLVM_DEBUG(dbgs() << '\n');
+  SubmitProposal(SP);
+}
+
+std::pair<unsigned, CostType>
+RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
+                                                   const SplitProposal &SP) {
+  if (!Entry.NumNonEntryNodes)
+    return {InvalidPID, 0};
+
+  // We take the partition that is the most similar using Cost as a metric.
+  // So we take the set of nodes in common, compute their aggregated cost, and
+  // pick the partition with the highest cost in common.
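+  // For instance, if P1 already contains nodes of this cluster worth a cost
+  // of 200 and P2 contains nodes worth 500, P2 is the most similar partition.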
+  unsigned ChosenPID = InvalidPID;
+  CostType ChosenCost = 0;
+  for (unsigned PID = 0; PID < NumParts; ++PID) {
+    BitVector BV = SP[PID];
+    BV &= Entry.Cluster; // FIXME: BitVector has no binary operator&, only &=.
+
+    if (BV.none())
+      continue;
+
+    const CostType Cost = SG.calculateCost(BV);
+
+    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
+        (ChosenCost == Cost && PID > ChosenPID)) {
+      ChosenPID = PID;
+      ChosenCost = Cost;
     }
+  }
+
+  return {ChosenPID, ChosenCost};
+}
+
+//===----------------------------------------------------------------------===//
+// DOTGraph Printing Support
+//===----------------------------------------------------------------------===//
+
+const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
+  return E->Dst;
+}
 
-    SML << "--Partitioning Done--\n\n";
+using SplitGraphEdgeDstIterator =
+    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;
+
+} // namespace
+
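+// Specializing GraphTraits (together with DOTGraphTraits below) lets LLVM's
+// generic GraphWriter machinery, used by writeDOTGraph further down, traverse
+// and print a SplitGraph.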
+template <> struct GraphTraits<SplitGraph> {
+  using NodeRef = const SplitGraph::Node *;
+  using nodes_iterator = SplitGraph::nodes_iterator;
+  using ChildIteratorType = SplitGraphEdgeDstIterator;
+
+  using EdgeRef = const SplitGraph::Edge *;
+  using ChildEdgeIteratorType = SplitGraph::edges_iterator;
+
+  static NodeRef getEntryNode(NodeRef N) { return N; }
+
+  static ChildIteratorType child_begin(NodeRef Ref) {
+    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
+  }
+  static ChildIteratorType child_end(NodeRef Ref) {
+    return {Ref->outgoing_edges().end(), mapEdgeToDst};
   }
 
-  // Check no functions were missed.
-#ifndef NDEBUG
-  DenseSet<const Function *> AllFunctions;
-  for (const auto &Part : Partitions)
-    AllFunctions.insert(Part.begin(), Part.end());
+  static nodes_iterator nodes_begin(const SplitGraph &G) {
+    return G.nodes().begin();
+  }
+  static nodes_iterator nodes_end(const SplitGraph &G) {
+    return G.nodes().end();
+  }
+};
 
-  for (auto &Fn : M) {
-    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
-      assert(AllFunctions.contains(&Fn) && "Missed a function?!");
+template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
+  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
+
+  static std::string getGraphName(const SplitGraph &SG) {
+    return SG.getModule().getName().str();
+  }
+
+  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
+    return N->getName();
+  }
+
+  static std::string getNodeDescription(const SplitGraph::Node *N,
+                                        const SplitGraph &SG) {
+    std::string Result;
+    if (N->isKernel())
+      Result += "kernel ";
+    if (N->isNonCopyable())
+      Result += "non-copyable ";
+    Result += "cost:" + std::to_string(N->getIndividualCost());
+    return Result;
+  }
+
+  static std::string getNodeAttributes(const SplitGraph::Node *N,
+                                       const SplitGraph &SG) {
+    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
+  }
+
+  static std::string getEdgeAttributes(const SplitGraph::Node *N,
+                                       SplitGraphEdgeDstIterator EI,
+                                       const SplitGraph &SG) {
+    switch ((*EI.getCurrent())->Kind) {
+    case SplitGraph::EdgeKind::DirectCall:
+      return "";
+    case SplitGraph::EdgeKind::IndirectCall:
+      return "style=\"dashed\"";
     }
+    llvm_unreachable("Unknown SplitGraph::EdgeKind");
   }
-#endif
+};
 
-  return Partitions;
+//===----------------------------------------------------------------------===//
+// Driver
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// If we didn't externalize GVs, then local GVs need to be conservatively
+// imported into every module (including their initializers), and then cleaned
+// up afterwards.
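+// Aliases are imported conservatively regardless of linkage: the
+// externalization step in splitAMDGPUModule covers functions and global
+// variables, but not aliases.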
+static bool needsConservativeImport(const GlobalValue *GV) {
+  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
+    return Var->hasLocalLinkage();
+  return isa<GlobalAlias>(GV);
 }
 
-static void externalize(GlobalValue &GV) {
-  if (GV.hasLocalLinkage()) {
-    GV.setLinkage(GlobalValue::ExternalLinkage);
-    GV.setVisibility(GlobalValue::HiddenVisibility);
+/// Prints a summary of the partition \p N, represented by module \p M, to
+/// \p OS.
+static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
+                                  CostType PartCost, CostType ModuleCost) {
+  OS << "*** Partition P" << N << " ***\n";
+
+  for (const auto &Fn : M) {
+    if (!Fn.isDeclaration())
+      OS << " - [function] " << Fn.getName() << "\n";
   }
 
-  // Unnamed entities must be named consistently between modules. setName will
-  // give a distinct name to each such entity.
-  if (!GV.hasName())
-    GV.setName("__llvmsplit_unnamed");
+  for (const auto &GV : M.globals()) {
+    if (GV.hasInitializer())
+      OS << " - [global] " << GV.getName() << "\n";
+  }
+
+  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
+     << "% of the source\n";
 }
 
-static bool hasDirectCaller(const Function &Fn) {
-  for (auto &U : Fn.uses()) {
-    if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
-      return true;
+static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
+  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");
+
+  New.calculateScores();
+
+  LLVM_DEBUG({
+    New.verifyCompleteness();
+    if (DebugProposalSearch)
+      New.print(dbgs());
+  });
+
+  const double CurBScore = Best.getBottleneckScore();
+  const double CurCSScore = Best.getCodeSizeScore();
+  const double NewBScore = New.getBottleneckScore();
+  const double NewCSScore = New.getCodeSizeScore();
+
+  // TODO: Improve this
+  //    We can probably lower the precision of the comparison at first
+  //    e.g. if we have
+  //      - (Current): BScore: 0.489 CSScore 1.105
+  //      - (New): BScore: 0.475 CSScore 1.305
+  //    Currently we'd choose the new one because the bottleneck score is
+  //    lower, but the new one duplicates more code. It may be worth it to
+  //    discard the new proposal as the impact on build time is negligible.
+
+  // Compare them
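+  // A lower bottleneck score means the most expensive partition is cheaper;
+  // a lower code size score means less code is duplicated across partitions.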
+  bool IsBest = false;
+  if (NewBScore < CurBScore)
+    IsBest = true;
+  else if (NewBScore == CurBScore)
+    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.
+
+  if (IsBest)
+    Best = std::move(New);
+
+  LLVM_DEBUG(if (DebugProposalSearch) {
+    if (IsBest)
+      dbgs() << "[search] new best proposal!\n";
+    else
+      dbgs() << "[search] discarding - not profitable\n";
+  });
+}
+
+/// Trivial helper to create an identical copy of \p M.
+static std::unique_ptr<Module> cloneAll(const Module &M) {
+  ValueToValueMapTy VMap;
+  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
+}
+
+/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
+static void writeDOTGraph(const SplitGraph &SG) {
+  if (ModuleDotCfgOutput.empty())
+    return;
+
+  std::error_code EC;
+  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
+  if (EC) {
+    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
+           << "' - DOTGraph will not be printed\n";
+    return;
   }
-  return false;
+  WriteGraph(OS, SG, /*ShortName=*/false,
+             /*Title=*/SG.getModule().getName());
 }
 
 static void splitAMDGPUModule(
-    GetTTIFn GetTTI, Module &M, unsigned N,
+    GetTTIFn GetTTI, Module &M, unsigned NumParts,
     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
-
-  SplitModuleLogger SML(M);
-
   CallGraph CG(M);
 
   // Externalize functions whose address are taken.
@@ -639,8 +1305,8 @@ static void splitAMDGPUModule(
   for (auto &Fn : M) {
     if (Fn.hasAddressTaken()) {
       if (Fn.hasLocalLinkage()) {
-        SML << "[externalize] " << Fn.getName()
-            << " because its address is taken\n";
+        LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
+                          << " because its address is taken\n");
       }
       externalize(Fn);
     }
@@ -651,138 +1317,176 @@ static void splitAMDGPUModule(
   if (!NoExternalizeGlobals) {
     for (auto &GV : M.globals()) {
       if (GV.hasLocalLinkage())
-        SML << "[externalize] GV " << GV.getName() << '\n';
+        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
       externalize(GV);
     }
   }
 
   // Start by calculating the cost of every function in the module, as well as
   // the module's overall cost.
-  DenseMap<const Function *, CostType> FnCosts;
-  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
-
-  // First, gather ever kernel into the worklist.
-  SmallVector<FunctionWithDependencies> WorkList;
-  for (auto &Fn : M) {
-    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
-      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+  FunctionsCostMap FnCosts;
+  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
+
+  // Build the SplitGraph, which represents the module's functions and models
+  // their dependencies accurately.
+  SplitGraph SG(M, FnCosts, ModuleCost);
+  SG.buildGraph(CG);
+
+  if (SG.empty()) {
+    LLVM_DEBUG(
+        dbgs()
+        << "[!] no nodes in graph, input is empty - no splitting possible\n");
+    ModuleCallback(cloneAll(M));
+    return;
   }
 
-  // Then, find missing functions that need to be considered as additional
-  // roots. These can't be called in theory, but in practice we still have to
-  // handle them to avoid linker errors.
-  {
-    DenseSet<const Function *> SeenFunctions;
-    for (const auto &FWD : WorkList) {
-      SeenFunctions.insert(FWD.Fn);
-      SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+  LLVM_DEBUG({
+    dbgs() << "[graph] nodes:\n";
+    for (const SplitGraph::Node *N : SG.nodes()) {
+      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
+             << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
     }
+  });
 
-    for (auto &Fn : M) {
-      // If this function is not part of any kernel's dependencies and isn't
-      // directly called, consider it as a root.
-      if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
-          !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
-        WorkList.emplace_back(SML, CG, FnCosts, &Fn);
-      }
-    }
-  }
+  writeDOTGraph(SG);
 
-  // Sort the worklist so the most expensive roots are seen first.
-  sort(WorkList, [&](auto &A, auto &B) {
-    // Sort by total cost, and if the total cost is identical, sort
-    // alphabetically.
-    if (A.TotalCost == B.TotalCost)
-      return A.Fn->getName() < B.Fn->getName();
-    return A.TotalCost > B.TotalCost;
-  });
+  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
 
-  if (SML) {
-    SML << "Worklist\n";
-    for (const auto &FWD : WorkList) {
-      SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
-          << " indirect:" << FWD.HasIndirectCall
-          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
-          << ")\n";
-      // Sort function names before printing to ensure determinism.
-      SmallVector<std::string> SortedDepNames;
-      SortedDepNames.reserve(FWD.Dependencies.size());
-      for (const auto *Dep : FWD.Dependencies)
-        SortedDepNames.push_back(getName(*Dep));
-      sort(SortedDepNames);
-
-      for (const auto &Name : SortedDepNames)
-        SML << "  [dependency] " << Name << '\n';
-    }
+  std::optional<SplitProposal> Proposal;
+  const auto EvaluateProposal = [&](SplitProposal SP) {
+    if (!Proposal)
+      Proposal = std::move(SP);
+    else
+      evaluateProposal(*Proposal, std::move(SP));
+  };
+
+  // TODO: It would be very easy to create new strategies by just adding a base
+  // class to RecursiveSearchSplitting and abstracting it away.
+  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
+  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
+                                  << Proposal->getName() << "\n";);
+
+  if (!Proposal) {
+    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
+    ModuleCallback(cloneAll(M));
+    return;
   }
 
-  // This performs all of the partitioning work.
-  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
-  assert(Partitions.size() == N);
+  LLVM_DEBUG(Proposal->print(dbgs()););
 
-  // If we didn't externalize GVs, then local GVs need to be conservatively
-  // imported into every module (including their initializers), and then cleaned
-  // up afterwards.
-  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
-    // We conservatively import private/internal GVs into every module and clean
-    // them up afterwards.
-    const auto *Var = dyn_cast<GlobalVariable>(GV);
-    return Var && Var->hasLocalLinkage();
-  };
+  std::optional<raw_fd_ostream> SummariesOS;
+  if (!PartitionSummariesOutput.empty()) {
+    std::error_code EC;
+    SummariesOS.emplace(PartitionSummariesOutput, EC);
+    if (EC) {
+      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
+             << "' - Partition summaries will not be printed\n";
+      SummariesOS.reset();
+    }
+  }
+
+  for (unsigned PID = 0; PID < NumParts; ++PID) {
+    SplitModuleTimer SMT2("modules_creation",
+                          "creating modules for each partition");
+    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
 
-  SML << "Creating " << N << " modules...\n";
-  unsigned TotalFnImpls = 0;
-  for (unsigned I = 0; I < N; ++I) {
-    const auto &FnsInPart = Partitions[I];
+    DenseSet<const Function *> FnsInPart;
+    for (unsigned NodeID : (*Proposal)[PID].set_bits())
+      FnsInPart.insert(&SG.getNode(NodeID).getFunction());
 
     ValueToValueMapTy VMap;
+    CostType PartCost = 0;
     std::unique_ptr<Module> MPart(
         CloneModule(M, VMap, [&](const GlobalValue *GV) {
           // Functions go in their assigned partition.
-          if (const auto *Fn = dyn_cast<Function>(GV))
-            return FnsInPart.contains(Fn);
-
-          if (NeedsConservativeImport(GV))
-            return true;
+          if (const auto *Fn = dyn_cast<Function>(GV)) {
+            if (FnsInPart.contains(Fn)) {
+              PartCost += SG.getCost(*Fn);
+              return true;
+            }
+            return false;
+          }
 
           // Everything else goes in the first partition.
-          return I == 0;
+          return needsConservativeImport(GV) || PID == 0;
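+          // needsConservativeImport GVs are cloned into every partition and
+          // cleaned up below if they end up unused.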
         }));
 
+    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
+    // bugs are possible.
+
     // Clean-up conservatively imported GVs without any users.
-    for (auto &GV : make_early_inc_range(MPart->globals())) {
-      if (NeedsConservativeImport(&GV) && GV.use_empty())
+    for (auto &GV : make_early_inc_range(MPart->global_values())) {
+      if (needsConservativeImport(&GV) && GV.use_empty())
         GV.eraseFromParent();
     }
 
-    unsigned NumAllFns = 0, NumKernels = 0;
-    for (auto &Cur : *MPart) {
-      if (!Cur.isDeclaration()) {
-        ++NumAllFns;
-        if (isEntryPoint(&Cur))
-          ++NumKernels;
-      }
-    }
-    TotalFnImpls += NumAllFns;
-    SML << "  - Module " << I << " with " << NumAllFns << " functions ("
-        << NumKernels << " kernels)\n";
+    if (SummariesOS)
+      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
+
+    LLVM_DEBUG(
+        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));
+
     ModuleCallback(std::move(MPart));
   }
-
-  SML << TotalFnImpls << " function definitions across all modules ("
-      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
-      << "% of original module)\n";
 }
 } // namespace
 
 PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
+  SplitModuleTimer SMT(
+      "total", "total pass runtime (incl. potentially waiting for lockfile)");
+
   FunctionAnalysisManager &FAM =
       MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
   const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
     return FAM.getResult<TargetIRAnalysis>(F);
   };
-  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
-  // We don't change the original module.
-  return PreservedAnalyses::all();
+
+  bool Done = false;
+#ifndef NDEBUG
+  if (UseLockFile) {
+    SmallString<128> LockFilePath;
+    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
+    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
+                      << "'\n");
+
+    while (true) {
+      llvm::LockFileManager Locked(LockFilePath.str());
+      switch (Locked) {
+      case LockFileManager::LFS_Error:
+        errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+                  "output may be mangled by other processes\n";
+        Locked.unsafeRemoveLockFile();
+        break;
+      case LockFileManager::LFS_Owned:
+        break;
+      case LockFileManager::LFS_Shared: {
+        switch (Locked.waitForUnlock()) {
+        case LockFileManager::Res_Success:
+          break;
+        case LockFileManager::Res_OwnerDied:
+          continue; // try again to get the lock.
+        case LockFileManager::Res_Timeout:
+          errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+                    "output may be mangled by other processes\n";
+          Locked.unsafeRemoveLockFile();
+          break; // give up
+        }
+        break;
+      }
+      }
+
+      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+      Done = true;
+      break;
+    }
+  }
+#endif
+
+  if (!Done)
+    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+
+  // We may change the linkage/visibility of globals in the input module, so
+  // conservatively report that nothing is preserved. This pass runs last
+  // anyway.
+  return PreservedAnalyses::none();
 }
+} // namespace llvm
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
index d269f92763853c..708b5a006be60e 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
@@ -1,30 +1,24 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; 3 kernels:
 ;   - A does a direct call to HelperA
 ;   - B is storing @HelperA
 ;   - C does a direct call to HelperA
 ;
-; The helper functions will get externalized, which will force A and C into P0 as
-; external functions cannot be duplicated.
-
-; CHECK0: define hidden void @HelperA()
-; CHECK0: define amdgpu_kernel void @A()
-; CHECK0: declare amdgpu_kernel void @B(ptr)
-; CHECK0: define amdgpu_kernel void @C()
-
-; CHECK1: declare hidden void @HelperA()
-; CHECK1: declare amdgpu_kernel void @A()
-; CHECK1: declare amdgpu_kernel void @B(ptr)
-; CHECK1: declare amdgpu_kernel void @C()
-
-; CHECK2: declare hidden void @HelperA()
-; CHECK2: declare amdgpu_kernel void @A()
-; CHECK2: define amdgpu_kernel void @B(ptr %dst)
-; CHECK2: declare amdgpu_kernel void @C()
+; The helper function gets externalized and becomes non-copyable, so A and C
+; (its direct callers) end up in the same partition.
+
+; P0 is empty.
+; CHECK0: declare
+
+; CHECK1: define amdgpu_kernel void @B(ptr %dst)
+
+; CHECK2: define hidden void @HelperA()
+; CHECK2: define amdgpu_kernel void @A()
+; CHECK2: define amdgpu_kernel void @C()
 
 define internal void @HelperA() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
index 731cf4b374c95b..81f6c8f0fbb3a6 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
+; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
deleted file mode 100644
index 6a07ed51ba1beb..00000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel
-; REQUIRES: asserts
-
-; SHA256 of the kernel names.
-
-; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c
-; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59
-; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55
-
-define amdgpu_kernel void @MyCustomKernel0() {
-  ret void
-}
-
-define amdgpu_kernel void @MyCustomKernel1() {
-  ret void
-}
-
-define amdgpu_kernel void @MyCustomKernel2() {
-  ret void
-}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
deleted file mode 100644
index 836b5c05d0653d..00000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -debug 2>&1 | FileCheck %s --implicit-check-not="[root]"
-; REQUIRES: asserts
-
-; func_3 is never directly called, it needs to be considered
-; as a root to handle this module correctly.
-
-; CHECK:      [root] kernel_1
-; CHECK-NEXT:   [dependency] func_1
-; CHECK-NEXT:   [dependency] func_2
-; CHECK-NEXT: [root] func_3
-; CHECK-NEXT:   [dependency] func_2
-
-define amdgpu_kernel void @kernel_1() {
-entry:
-  call void @func_1()
-  ret void
-}
-
-define linkonce_odr hidden void @func_1() {
-entry:
-  %call = call i32 @func_2()
-  ret void
-}
-
-define linkonce_odr hidden i32 @func_2() #0 {
-entry:
-  ret i32 0
-}
-
-define void @func_3() {
-entry:
-  %call = call i32 @func_2()
-  ret void
-}
-
-attributes #0 = { noinline optnone }
diff --git a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll
index f579056e914aa4..987a6b8f467cd5 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll
@@ -1,15 +1,12 @@
 ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: not llvm-dis -o - %t1
 
-; Check that all declarations are put into each partition.
+; A module with no function definitions should result in a single output
+; module that is an exact copy of the input. Only one output file is produced,
+; so llvm-dis on %t1 is expected to fail.
 
 ; CHECK0: declare void @A
 ; CHECK0: declare void @B
 
-; CHECK1: declare void @A
-; CHECK1: declare void @B
-
 declare void @A()
-
 declare void @B()
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
index c2746d1398924c..d7e84abd5f968d 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
 
 ; 3 kernels:
 ;   - A calls nothing
@@ -13,16 +13,12 @@
 ; Additionally, @PerryThePlatypus gets externalized as
 ; the alias counts as taking its address.
 
-; CHECK0-NOT: define
-; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus
-; CHECK0: define hidden void @PerryThePlatypus()
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
+; CHECK0: define amdgpu_kernel void @A
 
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @A
-; CHECK1-NOT: define
+; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus
+; CHECK1: define hidden void @PerryThePlatypus()
+; CHECK1: define amdgpu_kernel void @B
+; CHECK1: define amdgpu_kernel void @C
 
 @Perry = internal alias ptr(), ptr @PerryThePlatypus
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
index 4635264aefb39a..c7e13304dc6dec 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
@@ -1,27 +1,21 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; 3 kernels with each their own dependencies should go into 3
 ; distinct partitions. The most expensive kernel should be
 ; seen first and go into the last partition.
 
-; CHECK0-NOT: define
 ; CHECK0: define amdgpu_kernel void @C
 ; CHECK0: define internal void @HelperC
 ; CHECK0-NOT: define
 
-; CHECK1-NOT: define
 ; CHECK1: define amdgpu_kernel void @A
 ; CHECK1: define internal void @HelperA
-; CHECK1-NOT: define
 
-; CHECK2-NOT: define
 ; CHECK2: define amdgpu_kernel void @B
 ; CHECK2: define internal void @HelperB
-; CHECK2-NOT: define
-
 
 define amdgpu_kernel void @A() {
   call void @HelperA()
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
index 435e97a5813400..332344a776e82e 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
@@ -1,29 +1,20 @@
 ; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 --implicit-check-not=define %s
 
-; Both overridable helper should go in P0.
+; CHECK0: define internal void @PrivateHelper1()
+; CHECK0: define amdgpu_kernel void @D
 
-; CHECK0-NOT: define
-; CHECK0: define available_externally void @OverridableHelper0()
-; CHECK0: define internal void @OverridableHelper1()
-; CHECK0: define amdgpu_kernel void @A
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0-NOT: define
+; CHECK1: define internal void @PrivateHelper0()
+; CHECK1: define amdgpu_kernel void @C
 
-; CHECK1-NOT: define
+; CHECK2: define internal void @OverridableHelper1()
+; CHECK2: define amdgpu_kernel void @B
 
-; CHECK2-NOT: define
-; CHECK2: define internal void @PrivateHelper1()
-; CHECK2: define amdgpu_kernel void @D
-; CHECK2-NOT: define
-
-; CHECK3-NOT: define
-; CHECK3: define internal void @PrivateHelper0()
-; CHECK3: define amdgpu_kernel void @C
-; CHECK3-NOT: define
+; CHECK3: define available_externally void @OverridableHelper0()
+; CHECK3: define amdgpu_kernel void @A
 
 define available_externally void @OverridableHelper0() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
index 2d870039112cbf..5be945bda48bf4 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; We have 4 kernels:
 ;   - Each kernel has an internal helper
@@ -15,25 +15,19 @@
 ; indirect call. HelperC/D should also end up in P0 as they
 ; are dependencies of HelperB.
 
-; CHECK0-NOT: define
-; CHECK0: define hidden void @HelperA
-; CHECK0: define hidden void @HelperB
-; CHECK0: define hidden void @CallCandidate
-; CHECK0: define internal void @HelperC
 ; CHECK0: define internal void @HelperD
-; CHECK0: define amdgpu_kernel void @A
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0-NOT: define
+; CHECK0: define amdgpu_kernel void @D
 
-; CHECK1-NOT: define
-; CHECK1: define internal void @HelperD
-; CHECK1: define amdgpu_kernel void @D
-; CHECK1-NOT: define
+; CHECK1: define internal void @HelperC
+; CHECK1: define amdgpu_kernel void @C
 
-; CHECK2-NOT: define
+; CHECK2: define hidden void @HelperA
+; CHECK2: define hidden void @HelperB
+; CHECK2: define hidden void @CallCandidate
 ; CHECK2: define internal void @HelperC
-; CHECK2: define amdgpu_kernel void @C
-; CHECK2-NOT: define
+; CHECK2: define internal void @HelperD
+; CHECK2: define amdgpu_kernel void @A
+; CHECK2: define amdgpu_kernel void @B
 
 @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
index dc2c5c3c07bee6..9205a5d1930e52 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
@@ -1,21 +1,15 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; CHECK0-NOT: define
-; CHECK0: define void @ExternalHelper
-; CHECK0: define amdgpu_kernel void @A
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @D
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define amdgpu_kernel void @C
-; CHECK2-NOT: define
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+
+; CHECK0: define amdgpu_kernel void @D
+
+; CHECK1: define amdgpu_kernel void @C
+
+; CHECK2: define void @ExternalHelper
+; CHECK2: define amdgpu_kernel void @A
+; CHECK2: define amdgpu_kernel void @B
 
 define void @ExternalHelper() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
index 0fc76934afc548..a184d92aea9b9f 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
@@ -1,26 +1,20 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; 3 kernels use private/internal global variables.
 ; The GVs should be copied in each partition as needed.
 
-; CHECK0-NOT: define
 ; CHECK0: @bar = internal constant ptr
 ; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
 
-; CHECK1-NOT: define
 ; CHECK1: @foo = private constant ptr
 ; CHECK1: define amdgpu_kernel void @A
-; CHECK1-NOT: define
 
-; CHECK2-NOT: define
 ; CHECK2: @foo = private constant ptr
 ; CHECK2: @bar = internal constant ptr
 ; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
 
 @foo = private constant ptr poison
 @bar = internal constant ptr poison
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
index 7564662e7c7c0c..be84a0b5916f0d 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
@@ -1,28 +1,22 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; 3 kernels use private/internal global variables.
 ; The GVs should be copied in each partition as needed.
 
-; CHECK0-NOT: define
 ; CHECK0: @foo = hidden constant ptr poison
 ; CHECK0: @bar = hidden constant ptr poison
 ; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
 
-; CHECK1-NOT: define
 ; CHECK1: @foo = external hidden constant ptr{{$}}
 ; CHECK1: @bar = external hidden constant ptr{{$}}
 ; CHECK1: define amdgpu_kernel void @A
-; CHECK1-NOT: define
 
-; CHECK2-NOT: define
 ; CHECK2: @foo = external hidden constant ptr{{$}}
 ; CHECK2: @bar = external hidden constant ptr{{$}}
 ; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
 
 @foo = private constant ptr poison
 @bar = internal constant ptr poison
diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
index 459c5a7f1a2db3..807fb2e5f33cea 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
@@ -1,12 +1,12 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=1.2 -amdgpu-module-splitting-large-function-merge-overlap=0.5
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
-; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
-; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s
-; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s
-; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s
+; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
+; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 --implicit-check-not=define %s
 
 ; 2 kernels (A/B) are large and share all their dependencies.
 ; They should go in the same partition, the remaining kernel should
@@ -15,14 +15,12 @@
 ; Also check w/o large kernels processing to verify they are indeed handled
 ; differently.
 
-; CHECK0-NOT: define
+; P0 is empty
+; CHECK0: declare
 
-; CHECK1-NOT: define
 ; CHECK1: define internal void @HelperC()
 ; CHECK1: define amdgpu_kernel void @C
-; CHECK1-NOT: define
 
-; CHECK2-NOT: define
 ; CHECK2: define internal void @large2()
 ; CHECK2: define internal void @large1()
 ; CHECK2: define internal void @large0()
@@ -30,12 +28,9 @@
 ; CHECK2: define internal void @HelperB()
 ; CHECK2: define amdgpu_kernel void @A
 ; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
 
-; NOLARGEKERNELS-CHECK0-NOT: define
 ; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
 ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C
-; NOLARGEKERNELS-CHECK0-NOT: define
 
 ; NOLARGEKERNELS-CHECK1: define internal void @large2()
 ; NOLARGEKERNELS-CHECK1: define internal void @large1()
@@ -49,6 +44,7 @@
 ; NOLARGEKERNELS-CHECK2: define internal void @HelperA()
 ; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A
 
+
 define internal void @large2() {
   store volatile i32 42, ptr null
   call void @large2()
diff --git a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
index 167930ce0e8063..1314a78b42f3b0 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=DEFINE %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=DEFINE %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=DEFINE %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; We have 4 function:
 ;   - Each function has an internal helper
@@ -11,19 +11,19 @@
 ; @CallCandidate doesn't have to be in A/B's partition, unlike
 ; in the corresponding tests for kernels where it has to.
 
-; CHECK0: define hidden void @HelperA
-; CHECK0: define hidden void @HelperB
 ; CHECK0: define internal void @HelperC
 ; CHECK0: define internal void @HelperD
-; CHECK0: define void @A
-; CHECK0: define void @B
+; CHECK0: define internal void @C
+; CHECK0: define internal void @D
 
-; CHECK1: define internal void @HelperD
-; CHECK1: define void @D
+; CHECK1: define hidden void @HelperA
+; CHECK1: define hidden void @CallCandidate()
+; CHECK1: define internal void @A
 
-; CHECK2: define hidden void @CallCandidate
+; CHECK2: define hidden void @HelperB
 ; CHECK2: define internal void @HelperC
-; CHECK2: define void @C
+; CHECK2: define internal void @HelperD
+; CHECK2: define internal void @B
 
 @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
 
@@ -51,22 +51,22 @@ define internal void @HelperD() {
   ret void
 }
 
-define void @A(ptr %call) {
+define internal void @A(ptr %call) {
   call void @HelperA(ptr %call)
   ret void
 }
 
-define void @B(ptr %call) {
+define internal void @B(ptr %call) {
   call void @HelperB(ptr %call)
   ret void
 }
 
-define void @C() {
+define internal void @C() {
   call void @HelperC()
   ret void
 }
 
-define void @D() {
+define internal void @D() {
   call void @HelperD()
   ret void
 }
diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll
new file mode 100644
index 00000000000000..01f2f3627f9905
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll
@@ -0,0 +1,128 @@
+; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2
+; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s
+
+; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2
+; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s
+
+; Test the specifics of the search algorithm.
+; This test will change depending on new heuristics we add or remove.
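+;
+; Kernels A, B and C each have their own helper; AB, BC and ABC share those
+; helpers. The resulting overlap gives the search several viable assignments
+; to weigh against each other at each depth.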
+
+; --------------------------------------------
+
+; SPLIT3-CHECK0: define internal void @HelperA()
+; SPLIT3-CHECK0: define internal void @HelperB()
+; SPLIT3-CHECK0: define internal void @HelperC()
+; SPLIT3-CHECK0: define amdgpu_kernel void @AB()
+; SPLIT3-CHECK0: define amdgpu_kernel void @BC()
+
+; SPLIT3-CHECK1: define amdgpu_kernel void @A()
+; SPLIT3-CHECK1: define internal void @HelperA()
+; SPLIT3-CHECK1: define amdgpu_kernel void @C()
+; SPLIT3-CHECK1: define internal void @HelperC()
+
+; SPLIT3-CHECK2: define internal void @HelperA()
+; SPLIT3-CHECK2: define amdgpu_kernel void @B()
+; SPLIT3-CHECK2: define internal void @HelperB()
+; SPLIT3-CHECK2: define internal void @HelperC()
+; SPLIT3-CHECK2: define amdgpu_kernel void @ABC()
+
+; --------------------------------------------
+
+; SPLIT5-CHECK0: define amdgpu_kernel void @A()
+; SPLIT5-CHECK0: define internal void @HelperA()
+; SPLIT5-CHECK0: define amdgpu_kernel void @B()
+; SPLIT5-CHECK0: define internal void @HelperB()
+
+; SPLIT5-CHECK1: define internal void @HelperB()
+; SPLIT5-CHECK1: define internal void @HelperC()
+; SPLIT5-CHECK1: define amdgpu_kernel void @BC
+
+; SPLIT5-CHECK2: define internal void @HelperA()
+; SPLIT5-CHECK2: define internal void @HelperB()
+; SPLIT5-CHECK2: define amdgpu_kernel void @AB()
+
+; SPLIT5-CHECK3: define amdgpu_kernel void @C()
+; SPLIT5-CHECK3: define internal void @HelperC()
+
+; SPLIT5-CHECK4: define internal void @HelperA()
+; SPLIT5-CHECK4: define internal void @HelperB()
+; SPLIT5-CHECK4: define internal void @HelperC()
+; SPLIT5-CHECK4: define amdgpu_kernel void @ABC()
+
+define amdgpu_kernel void @A() {
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  call void @HelperA()
+  ret void
+}
+
+define internal void @HelperA() {
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  ret void
+}
+
+define amdgpu_kernel void @B() {
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  call void @HelperB()
+  ret void
+}
+
+define internal void @HelperB() {
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  ret void
+}
+
+define amdgpu_kernel void @C() {
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  call void @HelperC()
+  ret void
+}
+
+define internal void @HelperC() {
+  store volatile i32 42, ptr null
+  ret void
+}
+
+define amdgpu_kernel void @AB() {
+  store volatile i32 42, ptr null
+  call void @HelperA()
+  call void @HelperB()
+  ret void
+}
+
+define amdgpu_kernel void @BC() {
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  call void @HelperB()
+  call void @HelperC()
+  ret void
+}
+
+define amdgpu_kernel void @ABC() {
+  call void @HelperA()
+  call void @HelperB()
+  call void @HelperC()
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll
new file mode 100644
index 00000000000000..eae57a19883106
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll
@@ -0,0 +1,128 @@
+; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=8
+; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s
+
+; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=8
+; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s
+
+; Test the specifics of the search algorithm.
+; This test will change depending on new heuristics we add or remove.
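+;
+; Kernels A, B and C each have their own helper; AB, BC and ABC share those
+; helpers. The resulting overlap gives the search several viable assignments
+; to weigh against each other at each depth.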
+
+; --------------------------------------------
+
+; SPLIT3-CHECK0: define internal void @HelperA()
+; SPLIT3-CHECK0: define internal void @HelperB()
+; SPLIT3-CHECK0: define internal void @HelperC()
+; SPLIT3-CHECK0: define amdgpu_kernel void @AB()
+; SPLIT3-CHECK0: define amdgpu_kernel void @BC()
+
+; SPLIT3-CHECK1: define amdgpu_kernel void @A()
+; SPLIT3-CHECK1: define internal void @HelperA()
+; SPLIT3-CHECK1: define amdgpu_kernel void @C()
+; SPLIT3-CHECK1: define internal void @HelperC()
+
+; SPLIT3-CHECK2: define internal void @HelperA()
+; SPLIT3-CHECK2: define amdgpu_kernel void @B()
+; SPLIT3-CHECK2: define internal void @HelperB()
+; SPLIT3-CHECK2: define internal void @HelperC()
+; SPLIT3-CHECK2: define amdgpu_kernel void @ABC()
+
+; --------------------------------------------
+
+; SPLIT5-CHECK0: define amdgpu_kernel void @A()
+; SPLIT5-CHECK0: define internal void @HelperA()
+; SPLIT5-CHECK0: define amdgpu_kernel void @B()
+; SPLIT5-CHECK0: define internal void @HelperB()
+
+; SPLIT5-CHECK1: define internal void @HelperB()
+; SPLIT5-CHECK1: define internal void @HelperC()
+; SPLIT5-CHECK1: define amdgpu_kernel void @BC
+
+; SPLIT5-CHECK2: define internal void @HelperA()
+; SPLIT5-CHECK2: define internal void @HelperB()
+; SPLIT5-CHECK2: define amdgpu_kernel void @AB()
+
+; SPLIT5-CHECK3: define amdgpu_kernel void @C()
+; SPLIT5-CHECK3: define internal void @HelperC()
+
+; SPLIT5-CHECK4: define internal void @HelperA()
+; SPLIT5-CHECK4: define internal void @HelperB()
+; SPLIT5-CHECK4: define internal void @HelperC()
+; SPLIT5-CHECK4: define amdgpu_kernel void @ABC()
+
+define amdgpu_kernel void @A() {
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  call void @HelperA()
+  ret void
+}
+
+define internal void @HelperA() {
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  ret void
+}
+
+define amdgpu_kernel void @B() {
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  call void @HelperB()
+  ret void
+}
+
+define internal void @HelperB() {
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  ret void
+}
+
+define amdgpu_kernel void @C() {
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  store volatile i64 42, ptr null
+  call void @HelperC()
+  ret void
+}
+
+define internal void @HelperC() {
+  store volatile i32 42, ptr null
+  ret void
+}
+
+define amdgpu_kernel void @AB() {
+  store volatile i32 42, ptr null
+  call void @HelperA()
+  call void @HelperB()
+  ret void
+}
+
+define amdgpu_kernel void @BC() {
+  store volatile i32 42, ptr null
+  store volatile i32 42, ptr null
+  call void @HelperB()
+  call void @HelperC()
+  ret void
+}
+
+define amdgpu_kernel void @ABC() {
+  call void @HelperA()
+  call void @HelperB()
+  call void @HelperC()
+  ret void
+}
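
A note for reviewers on the TODO in splitAMDGPUModule about additional
strategies: below is a minimal sketch of what plugging a second strategy into
the same proposal callback could look like. It is illustrative only: it
assumes SplitProposal is constructible from (SG, NumParts), that
createNodesBitVector and setDependenciesBits are accessible from strategy
code, and the "round_robin" placement is hypothetical; only add/setName and
the submit callback are used exactly as in the patch.

  // Hypothetical sketch, not part of this patch: a strategy that assigns each
  // graph entry point and its dependencies to partitions in round-robin order,
  // then submits a single proposal through the shared ranking callback.
  class RoundRobinSplitting {
  public:
    RoundRobinSplitting(const SplitGraph &SG, unsigned NumParts,
                        function_ref<void(SplitProposal)> SubmitProposal)
        : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {}

    void run() {
      SplitProposal SP(SG, NumParts); // assumed constructor
      unsigned PID = 0;
      for (const SplitGraph::Node *N : SG.nodes()) {
        if (!N->isGraphEntryPoint())
          continue;
        // Cluster = this entry point plus everything it depends on.
        BitVector Cluster = SG.createNodesBitVector();
        Cluster.set(N->getID());
        N->setDependenciesBits(Cluster);
        SP.add(PID, Cluster);       // same API pickPartition uses
        PID = (PID + 1) % NumParts; // naive round-robin placement
      }
      SP.setName("round_robin");
      SubmitProposal(std::move(SP)); // ranked against other proposals
    }

  private:
    const SplitGraph &SG;
    unsigned NumParts;
    function_ref<void(SplitProposal)> SubmitProposal;
  };

Since the driver already funnels every strategy through EvaluateProposal, such
a proposal would be ranked against the recursive-search ones with no driver
changes.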

>From 63c8f3079883614e736a63145d7d00a0ae3e5135 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 20 Aug 2024 09:41:34 +0200
Subject: [PATCH 2/3] Address comments

---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 83 ++++++++++----------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index 6d755a7c61d4f6..c971d7b111e6eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -152,19 +152,14 @@ static constexpr unsigned InvalidPID = -1;
 
 /// \param Num numerator
 /// \param Dem denominator
-/// \param FmtString printf-like format string
-/// \returns a printable object to print (Num/Dem) using FmtString.
-static auto formatRatioOf(CostType Num, CostType Dem,
-                          const char *FmtString = "%0.2f") {
-  return format(FmtString, (double(Num) / Dem) * 100);
-}
-
-static bool isKernel(const Function *F) {
-  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
+/// \returns a printable object to print (Num/Dem) using "%0.2f".
+static auto formatRatioOf(CostType Num, CostType Dem) {
+  return format("%0.2f", (static_cast<double>(Num) / Dem) * 100);
 }
 
 static bool isNonCopyable(const Function &F) {
-  return isKernel(&F) || F.hasExternalLinkage() || !F.isDefinitionExact();
+  return AMDGPU::isEntryFunctionCC(F.getCallingConv()) ||
+         F.hasExternalLinkage() || !F.isDefinitionExact();
 }
 
 /// If \p GV has local linkage, make it external + hidden.
@@ -219,7 +214,7 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
     assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
     ModuleCost += FnCost;
 
-    if (isKernel(&Fn))
+    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
       KernelCost += FnCost;
   }
 
@@ -242,7 +237,7 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
 
 /// \return true if \p F can be indirectly called
 static bool canBeIndirectlyCalled(const Function &F) {
-  if (F.isDeclaration() || isKernel(&F))
+  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
     return false;
   return !F.hasLocalLinkage() ||
          F.hasAddressTaken(/*PutOffender=*/nullptr,
@@ -261,7 +256,7 @@ static bool canBeIndirectlyCalled(const Function &F) {
 /// that can be split into different modules.
 ///
 /// The most trivial instance of this graph is just the CallGraph of the module,
-/// but it is not guaranteed that the graph is strictly equal to the CFG. It
+/// but it is not guaranteed that the graph is strictly equal to the CG. It
 /// currently always is but it's designed in a way that would eventually allow
 /// us to create abstract nodes, or nodes for different entities such as global
 /// variables or any other meaningful constraint we must consider.
@@ -377,11 +372,11 @@ class SplitGraph::Node {
   Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
        bool IsNonCopyable)
       : ID(ID), GV(GV), IndividualCost(IndividualCost),
-        IsNonCopyable(IsNonCopyable), IsEntry(false) {
+        IsNonCopyable(IsNonCopyable), IsGraphEntry(false) {
     if (auto *Fn = dyn_cast<Function>(&GV))
-      IsKernel = ::llvm::isKernel(Fn);
+      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
     else
-      IsKernel = false;
+      IsEntryFnCC = false;
   }
 
   /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
@@ -395,15 +390,15 @@ class SplitGraph::Node {
   CostType getIndividualCost() const { return IndividualCost; }
 
   bool isNonCopyable() const { return IsNonCopyable; }
-  bool isKernel() const { return IsKernel; }
+  bool isEntryFunctionCC() const { return IsEntryFnCC; }
 
   /// \returns whether this is an entry point in the graph. Entry points are
   /// defined as follows: if you take all entry points in the graph, and iterate
   /// their dependencies, you are guaranteed to visit all nodes in the graph at
   /// least once.
-  bool isGraphEntryPoint() const { return IsEntry; }
+  bool isGraphEntryPoint() const { return IsGraphEntry; }
 
-  std::string getName() const { return GV.getName().str(); }
+  StringRef getName() const { return GV.getName(); }
 
   bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
   bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
@@ -423,7 +418,7 @@ class SplitGraph::Node {
     return OutgoingEdges;
   }
 
-  bool shouldFollowIndirectCalls() const { return isKernel(); }
+  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }
 
   /// Visit all children of this node in a recursive fashion. Also visits Self.
   /// If \ref shouldFollowIndirectCalls returns false, then this only follows
@@ -450,14 +445,14 @@ class SplitGraph::Node {
   CostType getFullCost() const;
 
 private:
-  void markAsEntry() { IsEntry = true; }
+  void markAsGraphEntry() { IsGraphEntry = true; }
 
   unsigned ID;
   const GlobalValue &GV;
   CostType IndividualCost;
   bool IsNonCopyable : 1;
-  bool IsKernel : 1;
-  bool IsEntry : 1;
+  bool IsEntryFnCC : 1;
+  bool IsGraphEntry : 1;
 
   // TODO: Cache dependencies as well?
   mutable CostType FullCost = 0;
@@ -489,6 +484,7 @@ void SplitGraph::Node::visitAllDependencies(
     }
   }
 }
+
 CostType SplitGraph::Node::getFullCost() const {
   if (FullCost)
     return FullCost;
@@ -555,9 +551,9 @@ void SplitGraph::buildGraph(CallGraph &CG) {
   SmallVector<Node *, 16> CandidateEntryPoints;
   BitVector NodesReachableByKernels = createNodesBitVector();
   for (Node *N : Nodes) {
-    // Kernels are always entry points.
-    if (N->isKernel()) {
-      N->markAsEntry();
+    // Functions with an Entry CC are always graph entry points too.
+    if (N->isEntryFunctionCC()) {
+      N->markAsGraphEntry();
       N->setDependenciesBits(NodesReachableByKernels);
     } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
       CandidateEntryPoints.push_back(N);
@@ -570,7 +566,7 @@ void SplitGraph::buildGraph(CallGraph &CG) {
     // NodesReachableByKernels is all 1s. It'd allow us to avoid
     // considering some nodes as non-entries in some specific cases.
     if (!NodesReachableByKernels.test(N->getID()))
-      N->markAsEntry();
+      N->markAsGraphEntry();
   }
 
 #ifndef NDEBUG
@@ -602,7 +598,7 @@ void SplitGraph::verifyGraph() const {
     }
 
     const Function &Fn = N->getFunction();
-    if (isKernel(&Fn)) {
+    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
       assert(!N->hasAnyIncomingEdges() && "Kernels cannot have incoming edges");
     }
     assert(!Fn.isDeclaration() && "declarations shouldn't have nodes!");
@@ -765,11 +761,11 @@ void SplitProposal::print(raw_ostream &OS) const {
 
   OS << "[proposal] " << Name << ", total cost:" << TotalCost
      << ", code size score:" << format("%0.3f", CodeSizeScore)
-     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << "\n";
+     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
   for (const auto &[PID, Part] : enumerate(Partitions)) {
     const auto &[Cost, NodeIDs] = Part;
     OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
-       << "|" << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
+       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
   }
 }
 
@@ -1021,7 +1017,7 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
           // For debug, just print "L", so we'll see "L3=P3" for instance, which
           // will mean we reached max depth and chose P3 based on this
           // heuristic.
-          LLVM_DEBUG(dbgs() << "L");
+          LLVM_DEBUG(dbgs() << 'L');
           SinglePIDToTry = MostSimilarPID;
         }
       } else
@@ -1050,8 +1046,8 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
     // lb = load balancing = put in cheapest partition
     {
       SplitProposal BranchSP = SP;
-      LLVM_DEBUG(dbgs() << " [lb] " << std::string(Depth, ' ') << Idx << "=P"
-                        << CheapestPID << "? ");
+      LLVM_DEBUG(dbgs().indent(Depth)
+                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
       BranchSP.add(CheapestPID, Cluster);
       pickPartition(Depth + 1, Idx + 1, BranchSP);
     }
@@ -1059,8 +1055,8 @@ void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
     // ms = most similar = put in partition with the most in common
     {
       SplitProposal BranchSP = SP;
-      LLVM_DEBUG(dbgs() << " [ms] " << std::string(Depth, ' ') << Idx << "=P"
-                        << MostSimilarPID << "? ");
+      LLVM_DEBUG(dbgs().indent(Depth)
+                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
       BranchSP.add(MostSimilarPID, Cluster);
       pickPartition(Depth + 1, Idx + 1, BranchSP);
     }
@@ -1155,14 +1151,14 @@ template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
   }
 
   std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
-    return N->getName();
+    return N->getName().str();
   }
 
   static std::string getNodeDescription(const SplitGraph::Node *N,
                                         const SplitGraph &SG) {
     std::string Result;
-    if (N->isKernel())
-      Result += "kernel ";
+    if (N->isEntryFunctionCC())
+      Result += "entry-fn-cc ";
     if (N->isNonCopyable())
       Result += "non-copyable ";
     Result += "cost:" + std::to_string(N->getIndividualCost());
@@ -1453,8 +1449,9 @@ PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
       llvm::LockFileManager Locked(LockFilePath.str());
       switch (Locked) {
       case LockFileManager::LFS_Error:
-        errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
-                  "output may be mangled by other processes\n";
+        LLVM_DEBUG(
+            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
+                      "output may be mangled by other processes\n");
         Locked.unsafeRemoveLockFile();
         break;
       case LockFileManager::LFS_Owned:
@@ -1466,8 +1463,10 @@ PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
         case LockFileManager::Res_OwnerDied:
           continue; // try again to get the lock.
         case LockFileManager::Res_Timeout:
-          errs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
-                    "output may be mangled by other processes\n";
+          LLVM_DEBUG(
+              dbgs()
+              << "[amdgpu-split-module] unable to acquire lockfile, debug "
+                 "output may be mangled by other processes\n");
           Locked.unsafeRemoveLockFile();
           break; // give up
         }

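The [lb]/[ms] hunks above only touch debug output, but the branching they
trace is the heart of the pass. Below is a minimal, self-contained C++ sketch
of that exploration pattern; all names (Proposal, explore, ClusterCosts) are
hypothetical stand-ins rather than the pass's actual API, and the similarity
heuristic is deliberately replaced by a labeled placeholder:

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for SplitProposal: just per-partition costs.
struct Proposal {
  std::vector<uint64_t> PartCosts;
};

// Recursively assign cluster Idx to a partition, forking between the
// cheapest partition ("lb") and the most-similar one ("ms"), up to MaxDepth.
void explore(unsigned Depth, unsigned Idx, Proposal P,
             const std::vector<uint64_t> &ClusterCosts, unsigned MaxDepth,
             std::vector<Proposal> &Out) {
  if (Idx == ClusterCosts.size()) {
    Out.push_back(std::move(P)); // Complete assignment; score it later.
    return;
  }

  size_t Cheapest = 0;
  for (size_t I = 1; I < P.PartCosts.size(); ++I)
    if (P.PartCosts[I] < P.PartCosts[Cheapest])
      Cheapest = I;
  // Placeholder for the similarity heuristic the pass computes from shared
  // dependencies; any partition index may come out of it.
  size_t MostSimilar = (Cheapest + 1) % P.PartCosts.size();

  if (Depth >= MaxDepth || Cheapest == MostSimilar) {
    // Depth budget exhausted (or both heuristics agree): commit to a single
    // choice instead of forking, like the "L" case in the debug output.
    P.PartCosts[MostSimilar] += ClusterCosts[Idx];
    explore(Depth + 1, Idx + 1, std::move(P), ClusterCosts, MaxDepth, Out);
    return;
  }

  // [lb] branch: load balancing, put the cluster in the cheapest partition.
  Proposal LB = P;
  LB.PartCosts[Cheapest] += ClusterCosts[Idx];
  explore(Depth + 1, Idx + 1, std::move(LB), ClusterCosts, MaxDepth, Out);

  // [ms] branch: put it in the partition with the most in common.
  P.PartCosts[MostSimilar] += ClusterCosts[Idx];
  explore(Depth + 1, Idx + 1, std::move(P), ClusterCosts, MaxDepth, Out);
}

Collecting every completed Proposal in Out and keeping the best-scoring one
corresponds to the pass's accept/reject step over split proposals.
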
>From 93d98005743c45f52d26f6eb23b03d6a08b47257 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 21 Aug 2024 09:07:47 +0200
Subject: [PATCH 3/3] Comments, refactor verify

---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 119 ++++++++++++-------
 1 file changed, 79 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index c971d7b111e6eb..ca371a883b897e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -115,9 +115,6 @@ static cl::opt<std::string> PartitionSummariesOutput(
              "the partitions created for each module"));
 
 #ifndef NDEBUG
-static cl::opt<bool> TimeBuild("amdgpu-module-splitting-time-trace", cl::Hidden,
-                               cl::desc("enable and print timers"));
-
 static cl::opt<bool>
     UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                 cl::desc("use a lock file so only one process in the system "
@@ -129,17 +126,13 @@ static cl::opt<bool>
                         cl::Hidden,
                         cl::desc("print all proposals received and whether "
                                  "they were rejected or accepted"));
+#endif
 
 struct SplitModuleTimer : NamedRegionTimer {
   SplitModuleTimer(StringRef Name, StringRef Desc)
       : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
-                         TimeBuild) {}
+                         TimePassesIsEnabled) {}
 };
-#else
-struct SplitModuleTimer {
-  SplitModuleTimer(StringRef Name, StringRef Desc) {}
-};
-#endif
 
 //===----------------------------------------------------------------------===//
 // Utils
@@ -157,9 +150,18 @@ static auto formatRatioOf(CostType Num, CostType Dem) {
   return format("%0.2f", (static_cast<double>(Num) / Dem) * 100);
 }
 
+/// Checks whether a given function is non-copyable.
+///
+/// Non-copyable functions cannot be cloned into multiple partitions, and only
+/// one copy of the function can be present across all partitions.
+///
+/// External functions fall into this category. If we were to clone them, we
+/// would end up with multiple symbol definitions and a very unhappy linker.
 static bool isNonCopyable(const Function &F) {
-  return AMDGPU::isEntryFunctionCC(F.getCallingConv()) ||
-         F.hasExternalLinkage() || !F.isDefinitionExact();
+  assert((!AMDGPU::isEntryFunctionCC(F.getCallingConv()) ||
+          F.hasExternalLinkage()) &&
+         "Kernel w/o external linkage?");
+  return F.hasExternalLinkage() || !F.isDefinitionExact();
 }
 
 /// If \p GV has local linkage, make it external + hidden.
@@ -305,7 +307,7 @@ class SplitGraph {
   void buildGraph(CallGraph &CG);
 
 #ifndef NDEBUG
-  void verifyGraph() const;
+  bool verifyGraph() const;
 #endif
 
   bool empty() const { return Nodes.empty(); }
@@ -372,11 +374,9 @@ class SplitGraph::Node {
   Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
        bool IsNonCopyable)
       : ID(ID), GV(GV), IndividualCost(IndividualCost),
-        IsNonCopyable(IsNonCopyable), IsGraphEntry(false) {
+        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
     if (auto *Fn = dyn_cast<Function>(&GV))
       IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
-    else
-      IsEntryFnCC = false;
   }
 
   /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
@@ -434,7 +434,7 @@ class SplitGraph::Node {
   /// rules regarding dependencies traversal.
   ///
   /// \param[out] BV The bitvector where the bits should be set.
-  void setDependenciesBits(BitVector &BV) const {
+  void getDependencies(BitVector &BV) const {
     visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
   }
 
@@ -554,7 +554,7 @@ void SplitGraph::buildGraph(CallGraph &CG) {
     // Functions with an Entry CC are always graph entry points too.
     if (N->isEntryFunctionCC()) {
       N->markAsGraphEntry();
-      N->setDependenciesBits(NodesReachableByKernels);
+      N->getDependencies(NodesReachableByKernels);
     } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
       CandidateEntryPoints.push_back(N);
   }
@@ -570,62 +570,101 @@ void SplitGraph::buildGraph(CallGraph &CG) {
   }
 
 #ifndef NDEBUG
-  verifyGraph();
+  assert(verifyGraph());
 #endif
 }
 
 #ifndef NDEBUG
-void SplitGraph::verifyGraph() const {
+bool SplitGraph::verifyGraph() const {
   unsigned ExpectedID = 0;
   // Exceptionally using a set here in case IDs are messed up.
   DenseSet<const Node *> SeenNodes;
   DenseSet<const Function *> SeenFunctionNodes;
   for (const Node *N : Nodes) {
-    assert(N->getID() == (ExpectedID++) && "Node IDs are incorrect!");
-    assert(SeenNodes.insert(N).second && "Node seen more than once!");
-    assert(&getNode(N->getID()) == N);
+    if (N->getID() != (ExpectedID++)) {
+      errs() << "Node IDs are incorrect!\n";
+      return false;
+    }
+
+    if (!SeenNodes.insert(N).second) {
+      errs() << "Node seen more than once!\n";
+      return false;
+    }
+
+    if (&getNode(N->getID()) != N) {
+      errs() << "getNode doesn't return the right node\n";
+      return false;
+    }
 
     for (const Edge *E : N->IncomingEdges) {
-      assert(E->Src && E->Dst);
-      assert(E->Dst == N);
-      assert(find(E->Src->OutgoingEdges, E) != E->Src->OutgoingEdges.end());
+      if (!E->Src || !E->Dst || (E->Dst != N) ||
+          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
+        errs() << "ill-formed incoming edges\n";
+        return false;
+      }
     }
 
     for (const Edge *E : N->OutgoingEdges) {
-      assert(E->Src && E->Dst);
-      assert(E->Src == N);
-      assert(find(E->Dst->IncomingEdges, E) != E->Dst->IncomingEdges.end());
+      if (!E->Src || !E->Dst || (E->Src != N) ||
+          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
+        errs() << "ill-formed outgoing edges\n";
+        return false;
+      }
     }
 
     const Function &Fn = N->getFunction();
     if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
-      assert(!N->hasAnyIncomingEdges() && "Kernels cannot have incoming edges");
+      if (N->hasAnyIncomingEdges()) {
+        errs() << "Kernels cannot have incoming edges\n";
+        return false;
+      }
+    }
+
+    if (Fn.isDeclaration()) {
+      errs() << "declarations shouldn't have nodes!\n";
+      return false;
     }
-    assert(!Fn.isDeclaration() && "declarations shouldn't have nodes!");
 
     auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
-    assert(Inserted && "one function has multiple nodes!");
+    if (!Inserted) {
+      errs() << "one function has multiple nodes!\n";
+      return false;
+    }
   }
-  assert(ExpectedID == Nodes.size() && "Node IDs out of sync!");
 
-  assert(createNodesBitVector().size() == getNumNodes());
+  if (ExpectedID != Nodes.size()) {
+    errs() << "Node IDs out of sync!\n";
+    return false;
+  }
+
+  if (createNodesBitVector().size() != getNumNodes()) {
+    errs() << "nodes bit vector doesn't have the right size!\n";
+    return false;
+  }
 
   // Check we respect the promise of Node::isGraphEntryPoint
   BitVector BV = createNodesBitVector();
   for (const Node *N : nodes()) {
     if (N->isGraphEntryPoint())
-      N->setDependenciesBits(BV);
+      N->getDependencies(BV);
   }
 
   // Ensure each function in the module has an associated node.
   for (const auto &Fn : M) {
-    if (!Fn.isDeclaration())
-      assert(SeenFunctionNodes.contains(&Fn) &&
-             "Fn has no associated node in the graph!");
+    if (!Fn.isDeclaration()) {
+      if (!SeenFunctionNodes.contains(&Fn)) {
+        errs() << "Fn has no associated node in the graph!\n";
+        return false;
+      }
+    }
+  }
+
+  if (!BV.all()) {
+    errs() << "not all nodes are reachable through the graph's entry points!\n";
+    return false;
   }
 
-  assert(BV.all() &&
-         "not all nodes are reachable through the graph's entry points!");
+  return true;
 }
 #endif
 
@@ -943,7 +982,7 @@ void RecursiveSearchSplitting::setupWorkList() {
     for (auto MI = NodeEC.member_begin(I); MI != NodeEC.member_end(); ++MI) {
       const SplitGraph::Node &N = SG.getNode(*MI);
       if (N.isGraphEntryPoint())
-        N.setDependenciesBits(Cluster);
+        N.getDependencies(Cluster);
     }
     WorkList.emplace_back(std::move(Cluster));
   }


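A closing note on the verifyGraph rework in the last patch: returning bool and
printing to errs() before each early return is the usual shape for verifiers
wrapped in assert(), since assert(verifyGraph()) compiles away in release
builds while debug builds still fail with a targeted message. A minimal sketch
of the idiom, with hypothetical names:

#include "llvm/Support/raw_ostream.h"
#include <cassert>

// Hypothetical verifier in the same shape as SplitGraph::verifyGraph: print
// a diagnostic first, then return false so the caller's assert fires.
bool verifyWidgetCount(unsigned Count, unsigned Expected) {
  if (Count != Expected) {
    llvm::errs() << "widget count out of sync!\n";
    return false;
  }
  return true;
}

void checkWidgets(unsigned Count) {
#ifndef NDEBUG
  // Mirrors the patch: the verifier itself is only defined when !NDEBUG.
  assert(verifyWidgetCount(Count, 3));
#endif
}
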
