[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (llvm#104763) (PR #106707)

Danial Klimkin via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 30 03:36:58 PDT 2024


https://github.com/dklimkin created https://github.com/llvm/llvm-project/pull/106707

* Revert "Fix MSVC "not all control paths return a value" warning. NFC."
Dependency needed to revert c9b6e01b2e4fc930dac91dd44c0592ad7e36d967

* Revert "[AMDGPU] Graph-based Module Splitting Rewrite (#104763)"
Breaks tests.


From 5686e6a4e08df82af6ee8e2bda32fdd195fb3f6c Mon Sep 17 00:00:00 2001
From: Danial Klimkin <dklimkin at google.com>
Date: Fri, 30 Aug 2024 12:36:08 +0200
Subject: [PATCH] [AMDGPU] Graph-based Module Splitting Rewrite (llvm#104763)

* Revert "Fix MSVC "not all control paths return a value" warning. NFC."

Dependency needed to revert c9b6e01b2e4fc930dac91dd44c0592ad7e36d967

* Revert "[AMDGPU] Graph-based Module Splitting Rewrite (#104763)"

This reverts commit c9b6e01b2e4fc930dac91dd44c0592ad7e36d967.
---
 llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp  | 1803 +++++------------
 .../address-taken-externalize-with-call.ll    |   36 +-
 .../AMDGPU/address-taken-externalize.ll       |    2 +-
 .../llvm-split/AMDGPU/debug-name-hiding.ll    |   20 +
 .../AMDGPU/debug-non-kernel-root.ll           |   36 +
 .../tools/llvm-split/AMDGPU/declarations.ll   |    9 +-
 .../AMDGPU/kernels-alias-dependencies.ll      |   18 +-
 .../llvm-split/AMDGPU/kernels-cost-ranking.ll |   12 +-
 .../AMDGPU/kernels-dependency-external.ll     |   33 +-
 .../AMDGPU/kernels-dependency-indirect.ll     |   30 +-
 .../AMDGPU/kernels-dependency-overridable.ll  |   28 +-
 .../kernels-global-variables-noexternal.ll    |   12 +-
 .../AMDGPU/kernels-global-variables.ll        |   12 +-
 .../AMDGPU/large-kernels-merging.ll           |   26 +-
 .../AMDGPU/non-kernels-dependency-indirect.ll |   30 +-
 .../llvm-split/AMDGPU/recursive-search-2.ll   |  128 --
 .../llvm-split/AMDGPU/recursive-search-8.ll   |  128 --
 17 files changed, 738 insertions(+), 1625 deletions(-)
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
 delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll
 delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index a5807a70582b39..df084cf41c4783 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -7,36 +7,33 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file Implements a module splitting algorithm designed to support the
-/// FullLTO --lto-partitions option for parallel codegen.
+/// FullLTO --lto-partitions option for parallel codegen. This is completely
+/// different from the common SplitModule pass, as this system is designed with
+/// AMDGPU in mind.
 ///
-/// The role of this module splitting pass is the same as
-/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
-/// across a set of N partitions to allow for parallel codegen.
+/// The basic idea of this module splitting implementation is the same as
+/// SplitModule: load-balance the module's functions across a set of N
+/// partitions to allow parallel codegen. However, it does it very
+/// differently from the target-agnostic variant:
+///   - The module has "split roots", which are kernels in the vast
+///     majority of cases.
+///   - Each root has a set of dependencies, and when a root and its
+///     dependencies are considered "big", we try to put it in a partition where
+///     most dependencies are already imported, to avoid duplicating large
+///     amounts of code.
+///   - There's special care for indirect calls in order to ensure
+///     AMDGPUResourceUsageAnalysis can work correctly.
 ///
-/// The similarities mostly end here, as this pass achieves load-balancing in a
-/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
-/// advantage of the structure of AMDGPU modules (which are mostly
-/// self-contained) to allow for more efficient splitting without affecting
-/// codegen negatively, or causing inaccurate resource usage analysis.
-///
-/// High-level pass overview:
-///   - SplitGraph & associated classes
-///      - Graph representation of the module and of the dependencies that
-///      matter for splitting.
-///   - RecursiveSearchSplitting
-///     - Core splitting algorithm.
-///   - SplitProposal
-///     - Represents a suggested solution for splitting the input module. These
-///     solutions can be scored to determine the best one when multiple
-///     solutions are available.
-///   - Driver/pass "run" function glues everything together.
+/// This file also includes a more elaborate logging system to enable
+/// users to easily generate logs that (if desired) do not include any value
+/// names, in order to not leak information about the source file.
+/// Such logs are very helpful to understand and fix potential issues with
+/// module splitting.
 
 #include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -47,56 +44,44 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Timer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SHA256.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <iterator>
 #include <memory>
 #include <utility>
 #include <vector>
 
-#ifndef NDEBUG
-#include "llvm/Support/LockFileManager.h"
-#endif
+using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-split-module"
 
-namespace llvm {
 namespace {
 
-static cl::opt<unsigned> MaxDepth(
-    "amdgpu-module-splitting-max-depth",
-    cl::desc(
-        "maximum search depth. 0 forces a greedy approach. "
-        "warning: the algorithm is up to O(2^N), where N is the max depth."),
-    cl::init(8));
-
 static cl::opt<float> LargeFnFactor(
-    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
+    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
+    cl::Hidden,
     cl::desc(
-        "when max depth is reached and we can no longer branch out, this "
-        "value determines if a function is worth merging into an already "
-        "existing partition to reduce code duplication. This is a factor "
-        "of the ideal partition size, e.g. 2.0 means we consider the "
-        "function for merging if its cost (including its callees) is 2x the "
-        "size of an ideal partition."));
+        "consider a function as large and needing special treatment when the "
+        "cost of importing it into a partition"
+        "exceeds the average cost of a partition by this factor; e;g. 2.0 "
+        "means if the function and its dependencies is 2 times bigger than "
+        "an average partition; 0 disables large functions handling entirely"));
 
 static cl::opt<float> LargeFnOverlapForMerge(
-    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
-    cl::desc("when a function is considered for merging into a partition that "
-             "already contains some of its callees, do the merge if at least "
-             "n% of the code it can reach is already present inside the "
-             "partition; e.g. 0.7 means only merge >70%"));
+    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
+    cl::Hidden,
+    cl::desc(
+        "defines how much overlap between two large function's dependencies "
+        "is needed to put them in the same partition"));
 
 static cl::opt<bool> NoExternalizeGlobals(
     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -104,92 +89,142 @@ static cl::opt<bool> NoExternalizeGlobals(
              "may cause globals to be duplicated which increases binary size"));
 
 static cl::opt<std::string>
-    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
-                       cl::Hidden,
-                       cl::desc("output file to write out the dotgraph "
-                                "representation of the input module"));
+    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
+              cl::desc("output directory for AMDGPU module splitting logs"));
 
-static cl::opt<std::string> PartitionSummariesOutput(
-    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
-    cl::desc("output file to write out a summary of "
-             "the partitions created for each module"));
-
-#ifndef NDEBUG
 static cl::opt<bool>
-    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
-                cl::desc("use a lock file so only one process in the system "
-                         "can run this pass at once. useful to avoid mangled "
-                         "debug output in multithreaded environments."));
+    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
+               cl::desc("hash value names before printing them in the AMDGPU "
+                        "module splitting logs"));
 
-static cl::opt<bool>
-    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
-                        cl::Hidden,
-                        cl::desc("print all proposals received and whether "
-                                 "they were rejected or accepted"));
-#endif
+using CostType = InstructionCost::CostType;
+using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
 
-struct SplitModuleTimer : NamedRegionTimer {
-  SplitModuleTimer(StringRef Name, StringRef Desc)
-      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
-                         TimePassesIsEnabled) {}
-};
+static bool isEntryPoint(const Function *F) {
+  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
+}
 
-//===----------------------------------------------------------------------===//
-// Utils
-//===----------------------------------------------------------------------===//
+static std::string getName(const Value &V) {
+  static bool HideNames;
 
-using CostType = InstructionCost::CostType;
-using FunctionsCostMap = DenseMap<const Function *, CostType>;
-using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
-static constexpr unsigned InvalidPID = -1;
+  static llvm::once_flag HideNameInitFlag;
+  llvm::call_once(HideNameInitFlag, [&]() {
+    if (LogPrivate.getNumOccurrences())
+      HideNames = LogPrivate;
+    else {
+      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
+      HideNames = (EV.value_or("0") != "0");
+    }
+  });
 
-/// \param Num numerator
-/// \param Dem denominator
-/// \returns a printable object to print (Num/Dem) using "%0.2f".
-static auto formatRatioOf(CostType Num, CostType Dem) {
-  return format("%0.2f", (static_cast<double>(Num) / Dem) * 100);
+  if (!HideNames)
+    return V.getName().str();
+  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
+               /*LowerCase=*/true);
 }
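
For reference, the name-hiding logic above reduces to the following standalone sketch (illustrative only; hideName is a hypothetical helper reusing the same LLVM Support APIs used in this file):

  #include "llvm/ADT/StringExtras.h" // toHex, arrayRefFromStringRef
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/SHA256.h"
  #include <string>

  // Returns the name unchanged, or a stable 64-character lowercase hex
  // SHA256 digest of it when names must be hidden.
  static std::string hideName(llvm::StringRef Name, bool HideNames) {
    if (!HideNames)
      return Name.str();
    return llvm::toHex(llvm::SHA256::hash(llvm::arrayRefFromStringRef(Name)),
                       /*LowerCase=*/true);
  }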
 
-/// Checks whether a given function is non-copyable.
+/// Main logging helper.
 ///
-/// Non-copyable functions cannot be cloned into multiple partitions, and only
-/// one copy of the function can be present across all partitions.
+/// Logging can be configured by the following environment variable.
+///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
+///     If set, uses <filepath> as the directory to write logfiles to
+///     each time module splitting is used.
+///   AMD_SPLIT_MODULE_LOG_PRIVATE
+///     If set to anything other than zero, all names are hidden.
 ///
-/// External functions fall into this category. If we were to clone them, we
-/// would end up with multiple symbol definitions and a very unhappy linker.
-static bool isNonCopyable(const Function &F) {
-  assert(AMDGPU::isEntryFunctionCC(F.getCallingConv())
-             ? F.hasExternalLinkage()
-             : true && "Kernel w/o external linkage?");
-  return F.hasExternalLinkage() || !F.isDefinitionExact();
-}
+/// Both environment variables have corresponding CL options which
+/// take priority over them.
+///
+/// Any output printed to the log files is also printed to dbgs() when -debug is
+/// used and LLVM_DEBUG is defined.
+///
+/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
+/// cannot be removed from the code (by building without debug). This probably
+/// has a small performance cost because if some computation/formatting is
+/// needed for logging purposes, it may be done every time only to be ignored
+/// by the logger.
+///
+/// As this pass only runs once and is not doing anything computationally
+/// expensive, this is likely a reasonable trade-off.
+///
+/// If some computation should really be avoided when unused, users of the class
+/// can check whether any logging will occur by using the bool operator.
+///
+/// \code
+///   if (SML) {
+///     // Executes only if logging to a file or if -debug is available and
+///     used.
+///   }
+/// \endcode
+class SplitModuleLogger {
+public:
+  SplitModuleLogger(const Module &M) {
+    std::string LogDir = LogDirOpt;
+    if (LogDir.empty())
+      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
+
+    // No log dir specified means we don't need to log to a file.
+    // We may still log to dbgs(), though.
+    if (LogDir.empty())
+      return;
+
+    // If a log directory is specified, create a new file with a unique name in
+    // that directory.
+    int Fd;
+    SmallString<0> PathTemplate;
+    SmallString<0> RealPath;
+    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err =
+            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
+      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
+                             "': " + Err.message(),
+                         /*CrashDiag=*/false);
+    }
 
-/// If \p GV has local linkage, make it external + hidden.
-static void externalize(GlobalValue &GV) {
-  if (GV.hasLocalLinkage()) {
-    GV.setLinkage(GlobalValue::ExternalLinkage);
-    GV.setVisibility(GlobalValue::HiddenVisibility);
+    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
   }
 
-  // Unnamed entities must be named consistently between modules. setName will
-  // give a distinct name to each such entity.
-  if (!GV.hasName())
-    GV.setName("__llvmsplit_unnamed");
+  bool hasLogFile() const { return FileOS != nullptr; }
+
+  raw_ostream &logfile() {
+    assert(FileOS && "no logfile!");
+    return *FileOS;
+  }
+
+  /// \returns true if this SML will log anything either to a file or dbgs().
+  /// Can be used to avoid expensive computations that are ignored when logging
+  /// is disabled.
+  operator bool() const {
+    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+  }
+
+private:
+  std::unique_ptr<raw_fd_ostream> FileOS;
+};
+
+template <typename Ty>
+static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
+  static_assert(
+      !std::is_same_v<Ty, Value>,
+      "do not print values to logs directly, use handleName instead!");
+  LLVM_DEBUG(dbgs() << Val);
+  if (SML.hasLogFile())
+    SML.logfile() << Val;
+  return SML;
 }
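
A minimal usage sketch of the logger defined above (hypothetical call site; assumes SplitModuleLogger and its operator<< are in scope):

  #include "llvm/IR/Module.h"

  static void logModuleSummary(const llvm::Module &M) {
    SplitModuleLogger SML(M);
    // The bool conversion lets callers skip building expensive log output
    // when neither a log file nor -debug output is active.
    if (SML)
      SML << "splitting module with " << M.size() << " functions\n";
  }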
 
-/// Cost analysis function. Calculates the cost of each function in \p M
-///
+/// Calculate the cost of each function in \p M
+/// \param SML Log Helper
 /// \param GetTTI Abstract getter for TargetTransformInfo.
 /// \param M Module to analyze.
 /// \param CostMap[out] Resulting Function -> Cost map.
 /// \return The module's total cost.
-static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
-                                       FunctionsCostMap &CostMap) {
-  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");
-
-  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
+static CostType
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
+                       DenseMap<const Function *, CostType> &CostMap) {
   CostType ModuleCost = 0;
-  [[maybe_unused]] CostType KernelCost = 0;
+  CostType KernelCost = 0;
 
   for (auto &Fn : M) {
     if (Fn.isDeclaration())
@@ -216,30 +251,23 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
     assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
     ModuleCost += FnCost;
 
-    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
+    if (isEntryPoint(&Fn))
       KernelCost += FnCost;
   }
 
-  if (CostMap.empty())
-    return 0;
-
-  assert(ModuleCost);
-  LLVM_DEBUG({
-    const CostType FnCost = ModuleCost - KernelCost;
-    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
-           << "" << KernelCost << " ("
-           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
-           << "% of the module), functions cost " << FnCost << " ("
-           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
-           << "% of the module)\n";
-  });
+  CostType FnCost = (ModuleCost - KernelCost);
+  CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
+  SML << "=> Total Module Cost: " << ModuleCost << '\n'
+      << "  => KernelCost: " << KernelCost << " ("
+      << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
+      << "  => FnsCost: " << FnCost << " ("
+      << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
 
   return ModuleCost;
 }
 
-/// \return true if \p F can be indirectly called
 static bool canBeIndirectlyCalled(const Function &F) {
-  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+  if (F.isDeclaration() || isEntryPoint(&F))
     return false;
   return !F.hasLocalLinkage() ||
          F.hasAddressTaken(/*PutOffender=*/nullptr,
@@ -250,1081 +278,351 @@ static bool canBeIndirectlyCalled(const Function &F) {
                            /*IgnoreCastedDirectCall=*/true);
 }
 
-//===----------------------------------------------------------------------===//
-// Graph-based Module Representation
-//===----------------------------------------------------------------------===//
-
-/// AMDGPUSplitModule's view of the source Module, as a graph of all components
-/// that can be split into different modules.
-///
-/// The most trivial instance of this graph is just the CallGraph of the module,
-/// but it is not guaranteed that the graph is strictly equal to the CG. It
-/// currently always is but it's designed in a way that would eventually allow
-/// us to create abstract nodes, or nodes for different entities such as global
-/// variables or any other meaningful constraint we must consider.
+/// When a function or any of its callees performs an indirect call, this
+/// takes over \ref addAllDependencies and adds all potentially callable
+/// functions to \p Fns so they can be counted as dependencies of the function.
 ///
-/// The graph is only mutable by this class, and is generally not modified
-/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
-/// mutate it.
-class SplitGraph {
-public:
-  class Node;
-
-  enum class EdgeKind : uint8_t {
-    /// The nodes are related through a direct call. This is a "strong" edge as
-    /// it means the Src will directly reference the Dst.
-    DirectCall,
-    /// The nodes are related through an indirect call.
-    /// This is a "weaker" edge and is only considered when traversing the graph
-    /// starting from a kernel. We need this edge for resource usage analysis.
-    ///
-    /// The reason why we have this edge in the first place is due to how
-    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
-    /// the resource usage of the kernel containing the indirect call is the
-    /// max resource usage of all functions that can be indirectly called.
-    IndirectCall,
-  };
-
-  /// An edge between two nodes. Edges are directional, and tagged with a
-  /// "kind".
-  struct Edge {
-    Edge(Node *Src, Node *Dst, EdgeKind Kind)
-        : Src(Src), Dst(Dst), Kind(Kind) {}
-
-    Node *Src; ///< Source
-    Node *Dst; ///< Destination
-    EdgeKind Kind;
-  };
-
-  using EdgesVec = SmallVector<const Edge *, 0>;
-  using edges_iterator = EdgesVec::const_iterator;
-  using nodes_iterator = const Node *const *;
-
-  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
-             CostType ModuleCost)
-      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
-
-  void buildGraph(CallGraph &CG);
-
-#ifndef NDEBUG
-  bool verifyGraph() const;
-#endif
-
-  bool empty() const { return Nodes.empty(); }
-  const iterator_range<nodes_iterator> nodes() const {
-    return {Nodes.begin(), Nodes.end()};
+/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
+/// presence of an indirect call, the function's resource usage is the same as
+/// the most expensive function in the module.
+/// \param M    The module.
+/// \param Fns[out] Resulting list of functions.
+static void addAllIndirectCallDependencies(const Module &M,
+                                           DenseSet<const Function *> &Fns) {
+  for (const auto &Fn : M) {
+    if (canBeIndirectlyCalled(Fn))
+      Fns.insert(&Fn);
   }
-  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
-
-  unsigned getNumNodes() const { return Nodes.size(); }
-  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }
-
-  const Module &getModule() const { return M; }
-
-  CostType getModuleCost() const { return ModuleCost; }
-  CostType getCost(const Function &F) const { return CostMap.at(&F); }
-
-  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
-  /// IDs).
-  CostType calculateCost(const BitVector &BV) const;
-
-private:
-  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
-  /// updates \p Cache.
-  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
-                const GlobalValue &GV);
-
-  // Create a new edge between two nodes and add it to both nodes.
-  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);
-
-  const Module &M;
-  const FunctionsCostMap &CostMap;
-  CostType ModuleCost;
-
-  // Final list of nodes with stable ordering.
-  SmallVector<Node *> Nodes;
-
-  SpecificBumpPtrAllocator<Node> NodesPool;
-
-  // Edges are trivially destructible objects, so as a small optimization we
-  // use a BumpPtrAllocator which avoids destructor calls but also makes
-  // allocation faster.
-  static_assert(
-      std::is_trivially_destructible_v<Edge>,
-      "Edge must be trivially destructible to use the BumpPtrAllocator");
-  BumpPtrAllocator EdgesPool;
-};
+}
 
-/// Nodes in the SplitGraph contain both incoming, and outgoing edges.
-/// Incoming edges have this node as their Dst, and Outgoing ones have this node
-/// as their Src.
+/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
+/// callee until all reachable functions have been gathered.
 ///
-/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
-/// feedback on how two nodes are related, and in which direction they are
-/// related, which is valuable information to make splitting decisions.
-///
-/// Nodes are fundamentally abstract, and any consumers of the graph should
-/// treat them as such. While a node will be a function most of the time, we
-/// could also create nodes for any other reason. In the future, we could have
-/// single nodes for multiple functions, or nodes for GVs, etc.
-class SplitGraph::Node {
-  friend class SplitGraph;
-
-public:
-  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
-       bool IsNonCopyable)
-      : ID(ID), GV(GV), IndividualCost(IndividualCost),
-        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
-    if (auto *Fn = dyn_cast<Function>(&GV))
-      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
-  }
-
-  /// An 0-indexed ID for the node. The maximum ID (exclusive) is the number of
-  /// nodes in the graph. This ID can be used as an index in a BitVector.
-  unsigned getID() const { return ID; }
-
-  const Function &getFunction() const { return cast<Function>(GV); }
-
-  /// \returns the cost to import this component into a given module, not
-  /// accounting for any dependencies that may need to be imported as well.
-  CostType getIndividualCost() const { return IndividualCost; }
-
-  bool isNonCopyable() const { return IsNonCopyable; }
-  bool isEntryFunctionCC() const { return IsEntryFnCC; }
-
-  /// \returns whether this is an entry point in the graph. Entry points are
-  /// defined as follows: if you take all entry points in the graph, and iterate
-  /// their dependencies, you are guaranteed to visit all nodes in the graph at
-  /// least once.
-  bool isGraphEntryPoint() const { return IsGraphEntry; }
-
-  StringRef getName() const { return GV.getName(); }
-
-  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
-  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
-    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
-  }
-
-  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
-  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
-    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
-  }
-
-  iterator_range<edges_iterator> incoming_edges() const {
-    return IncomingEdges;
-  }
-
-  iterator_range<edges_iterator> outgoing_edges() const {
-    return OutgoingEdges;
-  }
-
-  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }
-
-  /// Visit all children of this node in a recursive fashion. Also visits Self.
-  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
-  /// DirectCall edges.
-  ///
-  /// \param Visitor Visitor Function.
-  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;
-
-  /// Adds the dependencies of this node in \p BV by setting the bit
-  /// corresponding to each node.
-  ///
-  /// Implemented using \ref visitAllDependencies, hence it follows the same
-  /// rules regarding dependencies traversal.
-  ///
-  /// \param[out] BV The bitvector where the bits should be set.
-  void getDependencies(BitVector &BV) const {
-    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
-  }
-
-  /// Uses \ref visitAllDependencies to aggregate the individual cost of this
-  /// node and all of its dependencies.
-  ///
-  /// This is cached.
-  CostType getFullCost() const;
-
-private:
-  void markAsGraphEntry() { IsGraphEntry = true; }
-
-  unsigned ID;
-  const GlobalValue &GV;
-  CostType IndividualCost;
-  bool IsNonCopyable : 1;
-  bool IsEntryFnCC : 1;
-  bool IsGraphEntry : 1;
-
-  // TODO: Cache dependencies as well?
-  mutable CostType FullCost = 0;
-
-  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
-  // together)
-  EdgesVec IncomingEdges;
-  EdgesVec OutgoingEdges;
-};
-
-void SplitGraph::Node::visitAllDependencies(
-    std::function<void(const Node &)> Visitor) const {
-  const bool FollowIndirect = shouldFollowIndirectCalls();
-  // FIXME: If this can access SplitGraph in the future, use a BitVector
-  // instead.
-  DenseSet<const Node *> Seen;
-  SmallVector<const Node *, 8> WorkList({this});
+/// \param SML Log Helper
+/// \param CG Call graph for \p Fn's module.
+/// \param Fn Current function to look at.
+/// \param Fns[out] Resulting list of functions.
+/// \param OnlyDirect Whether to only consider direct callees.
+/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
+/// point, either in \p Fn or in one of the functions it calls. When that
+/// happens, we fall back to adding all callable functions inside \p Fn's module
+/// to \p Fns.
+static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
+                               const Function &Fn,
+                               DenseSet<const Function *> &Fns, bool OnlyDirect,
+                               bool &HadIndirectCall) {
+  assert(!Fn.isDeclaration());
+
+  const Module &M = *Fn.getParent();
+  SmallVector<const Function *> WorkList({&Fn});
   while (!WorkList.empty()) {
-    const Node *CurN = WorkList.pop_back_val();
-    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
-      continue;
-
-    Visitor(*CurN);
-
-    for (const Edge *E : CurN->outgoing_edges()) {
-      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
-        continue;
-      WorkList.push_back(E->Dst);
-    }
-  }
-}
-
-CostType SplitGraph::Node::getFullCost() const {
-  if (FullCost)
-    return FullCost;
-
-  assert(FullCost == 0);
-  visitAllDependencies(
-      [&](const Node &N) { FullCost += N.getIndividualCost(); });
-  return FullCost;
-}
+    const auto &CurFn = *WorkList.pop_back_val();
+    assert(!CurFn.isDeclaration());
 
-void SplitGraph::buildGraph(CallGraph &CG) {
-  SplitModuleTimer SMT("buildGraph", "graph construction");
-  LLVM_DEBUG(
-      dbgs()
-      << "[build graph] constructing graph representation of the input\n");
-
-  // We build the graph by just iterating all functions in the module and
-  // working on their direct callees. At the end, all nodes should be linked
-  // together as expected.
-  DenseMap<const GlobalValue *, Node *> Cache;
-  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
-  for (const Function &Fn : M) {
-    if (Fn.isDeclaration())
-      continue;
+    // Scan for an indirect call. If such a call is found, we have to
+    // conservatively assume this can call all non-entrypoint functions in the
+    // module.
 
-    // Look at direct callees and create the necessary edges in the graph.
-    bool HasIndirectCall = false;
-    Node &N = getNode(Cache, Fn);
-    for (auto &CGEntry : *CG[&Fn]) {
+    for (auto &CGEntry : *CG[&CurFn]) {
       auto *CGNode = CGEntry.second;
       auto *Callee = CGNode->getFunction();
       if (!Callee) {
-        // TODO: Don't consider inline assembly as indirect calls.
-        if (CGNode == CG.getCallsExternalNode())
-          HasIndirectCall = true;
+        if (OnlyDirect)
+          continue;
+
+        // Functions have an edge towards CallsExternalNode if they're external
+        // declarations, or if they do an indirect call. As we only process
+        // definitions here, we know this means the function has an indirect
+        // call. We then have to conservatively assume this can call all
+        // non-entrypoint functions in the module.
+        if (CGNode != CG.getCallsExternalNode())
+          continue; // this is another function-less node we don't care about.
+
+        SML << "Indirect call detected in " << getName(CurFn)
+            << " - treating all non-entrypoint functions as "
+               "potential dependencies\n";
+
+        // TODO: Print an ORE as well ?
+        addAllIndirectCallDependencies(M, Fns);
+        HadIndirectCall = true;
         continue;
       }
 
-      if (!Callee->isDeclaration())
-        createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);
-    }
-
-    // Keep track of this function if it contains an indirect call and/or if it
-    // can be indirectly called.
-    if (HasIndirectCall) {
-      LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n");
-      FnsWithIndirectCalls.push_back(&Fn);
-    }
-
-    if (canBeIndirectlyCalled(Fn))
-      IndirectlyCallableFns.push_back(&Fn);
-  }
+      if (Callee->isDeclaration())
+        continue;
 
-  // Post-process functions with indirect calls.
-  for (const Function *Fn : FnsWithIndirectCalls) {
-    for (const Function *Candidate : IndirectlyCallableFns) {
-      Node &Src = getNode(Cache, *Fn);
-      Node &Dst = getNode(Cache, *Candidate);
-      createEdge(Src, Dst, EdgeKind::IndirectCall);
+      auto [It, Inserted] = Fns.insert(Callee);
+      if (Inserted)
+        WorkList.push_back(Callee);
     }
   }
-
-  // Now, find all entry points.
-  SmallVector<Node *, 16> CandidateEntryPoints;
-  BitVector NodesReachableByKernels = createNodesBitVector();
-  for (Node *N : Nodes) {
-    // Functions with an Entry CC are always graph entry points too.
-    if (N->isEntryFunctionCC()) {
-      N->markAsGraphEntry();
-      N->getDependencies(NodesReachableByKernels);
-    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
-      CandidateEntryPoints.push_back(N);
-  }
-
-  for (Node *N : CandidateEntryPoints) {
-    // This can be another entry point if it's not reachable by a kernel
-    // TODO: We could sort all of the possible new entries in a stable order
-    // (e.g. by cost), then consume them one by one until
-    // NodesReachableByKernels is all 1s. It'd allow us to avoid
-    // considering some nodes as non-entries in some specific cases.
-    if (!NodesReachableByKernels.test(N->getID()))
-      N->markAsGraphEntry();
-  }
-
-#ifndef NDEBUG
-  assert(verifyGraph());
-#endif
 }
 
-#ifndef NDEBUG
-bool SplitGraph::verifyGraph() const {
-  unsigned ExpectedID = 0;
-  // Exceptionally using a set here in case IDs are messed up.
-  DenseSet<const Node *> SeenNodes;
-  DenseSet<const Function *> SeenFunctionNodes;
-  for (const Node *N : Nodes) {
-    if (N->getID() != (ExpectedID++)) {
-      errs() << "Node IDs are incorrect!\n";
-      return false;
-    }
-
-    if (!SeenNodes.insert(N).second) {
-      errs() << "Node seen more than once!\n";
-      return false;
-    }
-
-    if (&getNode(N->getID()) != N) {
-      errs() << "getNode doesn't return the right node\n";
-      return false;
-    }
-
-    for (const Edge *E : N->IncomingEdges) {
-      if (!E->Src || !E->Dst || (E->Dst != N) ||
-          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
-        errs() << "ill-formed incoming edges\n";
-        return false;
-      }
-    }
-
-    for (const Edge *E : N->OutgoingEdges) {
-      if (!E->Src || !E->Dst || (E->Src != N) ||
-          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
-        errs() << "ill-formed outgoing edges\n";
-        return false;
-      }
-    }
-
-    const Function &Fn = N->getFunction();
-    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
-      if (N->hasAnyIncomingEdges()) {
-        errs() << "Kernels cannot have incoming edges\n";
-        return false;
-      }
-    }
-
-    if (Fn.isDeclaration()) {
-      errs() << "declarations shouldn't have nodes!\n";
-      return false;
-    }
-
-    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
-    if (!Inserted) {
-      errs() << "one function has multiple nodes!\n";
-      return false;
+/// Contains information about a function and its dependencies.
+/// This is a splitting root. The splitting algorithm works by
+/// assigning these to partitions.
+struct FunctionWithDependencies {
+  FunctionWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+                           const DenseMap<const Function *, CostType> &FnCosts,
+                           const Function *Fn)
+      : Fn(Fn) {
+    // When Fn is not a kernel, we don't need to collect indirect callees.
+    // Resource usage analysis is only performed on kernels, and we collect
+    // indirect callees for resource usage analysis.
+    addAllDependencies(SML, CG, *Fn, Dependencies,
+                       /*OnlyDirect*/ !isEntryPoint(Fn), HasIndirectCall);
+    TotalCost = FnCosts.at(Fn);
+    for (const auto *Dep : Dependencies) {
+      TotalCost += FnCosts.at(Dep);
+
+      // We cannot duplicate functions with external linkage, or functions that
+      // may be overridden at runtime.
+      HasNonDuplicatableDependecy |=
+          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
     }
   }
 
-  if (ExpectedID != Nodes.size()) {
-    errs() << "Node IDs out of sync!\n";
-    return false;
-  }
-
-  if (createNodesBitVector().size() != getNumNodes()) {
-    errs() << "nodes bit vector doesn't have the right size!\n";
-    return false;
-  }
-
-  // Check we respect the promise of Node::isKernel
-  BitVector BV = createNodesBitVector();
-  for (const Node *N : nodes()) {
-    if (N->isGraphEntryPoint())
-      N->getDependencies(BV);
-  }
-
-  // Ensure each function in the module has an associated node.
-  for (const auto &Fn : M) {
-    if (!Fn.isDeclaration()) {
-      if (!SeenFunctionNodes.contains(&Fn)) {
-        errs() << "Fn has no associated node in the graph!\n";
-        return false;
-      }
-    }
-  }
-
-  if (!BV.all()) {
-    errs() << "not all nodes are reachable through the graph's entry points!\n";
-    return false;
-  }
-
-  return true;
-}
-#endif
-
-CostType SplitGraph::calculateCost(const BitVector &BV) const {
-  CostType Cost = 0;
-  for (unsigned NodeID : BV.set_bits())
-    Cost += getNode(NodeID).getIndividualCost();
-  return Cost;
-}
-
-SplitGraph::Node &
-SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
-                    const GlobalValue &GV) {
-  auto &N = Cache[&GV];
-  if (N)
-    return *N;
-
-  CostType Cost = 0;
-  bool NonCopyable = false;
-  if (const Function *Fn = dyn_cast<Function>(&GV)) {
-    NonCopyable = isNonCopyable(*Fn);
-    Cost = CostMap.at(Fn);
-  }
-  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
-  Nodes.push_back(N);
-  assert(&getNode(N->getID()) == N);
-  return *N;
-}
-
-const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
-                                               EdgeKind EK) {
-  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
-  Src.OutgoingEdges.push_back(E);
-  Dst.IncomingEdges.push_back(E);
-  return *E;
-}
-
-//===----------------------------------------------------------------------===//
-// Split Proposals
-//===----------------------------------------------------------------------===//
-
-/// Represents a module splitting proposal.
-///
-/// Proposals are made of N BitVectors, one for each partition, where each bit
-/// set indicates that the node is present and should be copied inside that
-/// partition.
-///
-/// Proposals have several metrics attached so they can be compared/sorted,
-/// which allows the driver to try multiple strategies resulting in multiple
-/// proposals and choose the best one out of them.
-class SplitProposal {
-public:
-  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
-    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
-  }
+  const Function *Fn = nullptr;
+  DenseSet<const Function *> Dependencies;
+  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
+  bool HasIndirectCall = false;
+  /// Whether any of \p Fn's dependencies cannot be duplicated.
+  bool HasNonDuplicatableDependecy = false;
 
-  void setName(StringRef NewName) { Name = NewName; }
-  StringRef getName() const { return Name; }
-
-  const BitVector &operator[](unsigned PID) const {
-    return Partitions[PID].second;
-  }
-
-  void add(unsigned PID, const BitVector &BV) {
-    Partitions[PID].second |= BV;
-    updateScore(PID);
-  }
-
-  void print(raw_ostream &OS) const;
-  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
-
-  // Find the cheapest partition (lowest cost). In case of ties, always returns
-  // the highest partition number.
-  unsigned findCheapestPartition() const;
-
-  /// Calculate the CodeSize and Bottleneck scores.
-  void calculateScores();
-
-#ifndef NDEBUG
-  void verifyCompleteness() const;
-#endif
-
-  /// Only available after \ref calculateScores is called.
-  ///
-  /// A positive number indicating the % of code duplication that this proposal
-  /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by
-  /// duplicating some functions across partitions.
-  ///
-  /// Value is always rounded up to 3 decimal places.
-  ///
-  /// A perfect score would be 0.0, and anything approaching 1.0 is very bad.
-  double getCodeSizeScore() const { return CodeSizeScore; }
-
-  /// Only available after \ref calculateScores is called.
-  ///
-  /// A number between [0, 1] which indicates how big of a bottleneck is
-  /// expected from the largest partition.
-  ///
-  /// A score of 1.0 means the biggest partition is as big as the source module,
-  /// so build time will be equal to or greater than the build time of the
-  /// initial input.
-  ///
-  /// Value is always rounded up to 3 decimal places.
-  ///
-  /// This is one of the metrics used to estimate this proposal's build time.
-  double getBottleneckScore() const { return BottleneckScore; }
-
-private:
-  void updateScore(unsigned PID) {
-    assert(SG);
-    for (auto &[PCost, Nodes] : Partitions) {
-      TotalCost -= PCost;
-      PCost = SG->calculateCost(Nodes);
-      TotalCost += PCost;
-    }
-  }
-
-  /// \see getCodeSizeScore
-  double CodeSizeScore = 0.0;
-  /// \see getBottleneckScore
-  double BottleneckScore = 0.0;
-  /// Aggregated cost of all partitions
   CostType TotalCost = 0;
 
-  const SplitGraph *SG = nullptr;
-  std::string Name;
-
-  std::vector<std::pair<CostType, BitVector>> Partitions;
-};
-
-void SplitProposal::print(raw_ostream &OS) const {
-  assert(SG);
-
-  OS << "[proposal] " << Name << ", total cost:" << TotalCost
-     << ", code size score:" << format("%0.3f", CodeSizeScore)
-     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
-  for (const auto &[PID, Part] : enumerate(Partitions)) {
-    const auto &[Cost, NodeIDs] = Part;
-    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
-       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
-  }
-}
-
-unsigned SplitProposal::findCheapestPartition() const {
-  assert(!Partitions.empty());
-  CostType CurCost = std::numeric_limits<CostType>::max();
-  unsigned CurPID = InvalidPID;
-  for (const auto &[Idx, Part] : enumerate(Partitions)) {
-    if (Part.first <= CurCost) {
-      CurPID = Idx;
-      CurCost = Part.first;
-    }
-  }
-  assert(CurPID != InvalidPID);
-  return CurPID;
-}
-
-void SplitProposal::calculateScores() {
-  if (Partitions.empty())
-    return;
-
-  assert(SG);
-  CostType LargestPCost = 0;
-  for (auto &[PCost, Nodes] : Partitions) {
-    if (PCost > LargestPCost)
-      LargestPCost = PCost;
+  /// \returns true if this function and its dependencies can be considered
+  /// large according to \p Threshold.
+  bool isLarge(CostType Threshold) const {
+    return TotalCost > Threshold && !Dependencies.empty();
   }
-
-  CostType ModuleCost = SG->getModuleCost();
-  CodeSizeScore = double(TotalCost) / ModuleCost;
-  assert(CodeSizeScore >= 0.0);
-
-  BottleneckScore = double(LargestPCost) / ModuleCost;
-
-  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
-  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
-}
-
-#ifndef NDEBUG
-void SplitProposal::verifyCompleteness() const {
-  if (Partitions.empty())
-    return;
-
-  BitVector Result = Partitions[0].second;
-  for (const auto &P : drop_begin(Partitions))
-    Result |= P.second;
-  assert(Result.all() && "some nodes are missing from this proposal!");
-}
-#endif
-
-//===-- RecursiveSearchStrategy -------------------------------------------===//
-
-/// Partitioning algorithm.
-///
-/// This is a recursive search algorithm that can explore multiple possibilities.
-///
-/// When a cluster of nodes can go into more than one partition, and we haven't
-/// reached maximum search depth, we recurse and explore both options and their
-/// consequences. Both branches will yield a proposal, and the driver will grade
-/// both and choose the best one.
-///
-/// If max depth is reached, we will use some heuristics to make a choice. Most
-/// of the time we will just use the least-pressured (cheapest) partition, but
-/// if a cluster is particularly big and there is a good amount of overlap with
-/// an existing partition, we will choose that partition instead.
-class RecursiveSearchSplitting {
-public:
-  using SubmitProposalFn = function_ref<void(SplitProposal)>;
-
-  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
-                           SubmitProposalFn SubmitProposal);
-
-  void run();
-
-private:
-  struct WorkListEntry {
-    WorkListEntry(const BitVector &BV) : Cluster(BV) {}
-
-    unsigned NumNonEntryNodes = 0;
-    CostType TotalCost = 0;
-    CostType CostExcludingGraphEntryPoints = 0;
-    BitVector Cluster;
-  };
-
-  /// Collects all graph entry points' clusters and sorts them so the most
-  /// expensive clusters are viewed first. This will merge clusters together if
-  /// they share a non-copyable dependency.
-  void setupWorkList();
-
-  /// Recursive function that assigns the worklist item at \p Idx into a
-  /// partition of \p SP.
-  ///
-  /// \p Depth is the current search depth. When this value is equal to
-  /// \ref MaxDepth, we can no longer recurse.
-  ///
-  /// This function only recurses if there is more than one possible assignment,
-  /// otherwise it is iterative to avoid creating a call stack that is as big as
-  /// \ref WorkList.
-  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);
-
-  /// \return A pair: first element is the PID of the partition that has the
-  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
-  /// found with at least one element in common. The second element is the
-  /// aggregated cost of all dependencies in common between \p Entry and that
-  /// partition.
-  std::pair<unsigned, CostType>
-  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);
-
-  const SplitGraph &SG;
-  unsigned NumParts;
-  SubmitProposalFn SubmitProposal;
-
-  // A Cluster is considered large when its cost, excluding entry points,
-  // exceeds this value.
-  CostType LargeClusterThreshold = 0;
-  unsigned NumProposalsSubmitted = 0;
-  SmallVector<WorkListEntry> WorkList;
 };
 
-RecursiveSearchSplitting::RecursiveSearchSplitting(
-    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
-    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
-  // arbitrary max value as a safeguard. Anything above 10 will already be
-  // slow, this is just a max value to prevent extreme resource exhaustion or
-  // unbounded run time.
-  if (MaxDepth > 16)
-    report_fatal_error("[amdgpu-split-module] search depth of " +
-                       Twine(MaxDepth) + " is too high!");
-  LargeClusterThreshold =
-      (LargeFnFactor != 0.0)
-          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
-          : std::numeric_limits<CostType>::max();
-  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
-                    << LargeClusterThreshold << "\n");
-}
-
-void RecursiveSearchSplitting::run() {
-  {
-    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
-    setupWorkList();
+/// Calculates how much overlap there is between \p A and \p B.
+/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
+/// and B have no shared elements. Kernels do not count in overlap calculation.
+static float calculateOverlap(const DenseSet<const Function *> &A,
+                              const DenseSet<const Function *> &B) {
+  DenseSet<const Function *> Total;
+  for (const auto *F : A) {
+    if (!isEntryPoint(F))
+      Total.insert(F);
   }
 
-  {
-    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
-    SplitProposal SP(SG, NumParts);
-    pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
-  }
-}
+  if (Total.empty())
+    return 0.0f;
 
-void RecursiveSearchSplitting::setupWorkList() {
-  // e.g. if A and B are two worklist items, and they both call a non-copyable
-  // dependency C, this does:
-  //    A=C
-  //    B=C
-  // => NodeEC will create a single group (A, B, C) and we create a new
-  // WorkList entry for that group.
-
-  EquivalenceClasses<unsigned> NodeEC;
-  for (const SplitGraph::Node *N : SG.nodes()) {
-    if (!N->isGraphEntryPoint())
+  unsigned NumCommon = 0;
+  for (const auto *F : B) {
+    if (isEntryPoint(F))
       continue;
 
-    NodeEC.insert(N->getID());
-    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
-      if (&Dep != N && Dep.isNonCopyable())
-        NodeEC.unionSets(N->getID(), Dep.getID());
-    });
+    auto [It, Inserted] = Total.insert(F);
+    if (!Inserted)
+      ++NumCommon;
   }
 
-  for (auto I = NodeEC.begin(), E = NodeEC.end(); I != E; ++I) {
-    if (!I->isLeader())
-      continue;
+  return static_cast<float>(NumCommon) / Total.size();
+}
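
As a worked example of the ratio above: if A's non-kernel dependencies are {f, g, h} and B's are {g, h, k}, the shared set is {g, h} and the union is {f, g, h, k}, giving an overlap of 2 / 4 = 0.5. A standalone sketch of the same computation (using std::set instead of DenseSet; names are hypothetical):

  #include <set>
  #include <string>

  // Overlap = (number of shared elements) / (size of the union), matching
  // calculateOverlap above once kernels have been filtered out of both sets.
  static float overlapSketch(const std::set<std::string> &A,
                             const std::set<std::string> &B) {
    std::set<std::string> Union = A;
    unsigned NumCommon = 0;
    for (const std::string &Name : B) {
      if (!Union.insert(Name).second)
        ++NumCommon;
    }
    return Union.empty() ? 0.0f : float(NumCommon) / Union.size();
  }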
 
-    BitVector Cluster = SG.createNodesBitVector();
-    for (auto MI = NodeEC.member_begin(I); MI != NodeEC.member_end(); ++MI) {
-      const SplitGraph::Node &N = SG.getNode(*MI);
-      if (N.isGraphEntryPoint())
-        N.getDependencies(Cluster);
-    }
-    WorkList.emplace_back(std::move(Cluster));
-  }
+/// Performs all of the partitioning work on \p M.
+/// \param SML Log Helper
+/// \param M Module to partition.
+/// \param NumParts Number of partitions to create.
+/// \param ModuleCost Total cost of all functions in \p M.
+/// \param FnCosts Map of Function -> Cost
+/// \param WorkList Functions and their dependencies to process in order.
+/// \returns The created partitions (a vector of size \p NumParts )
+static std::vector<DenseSet<const Function *>>
+doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
+               CostType ModuleCost,
+               const DenseMap<const Function *, CostType> &FnCosts,
+               const SmallVector<FunctionWithDependencies> &WorkList) {
+
+  SML << "\n--Partitioning Starts--\n";
+
+  // Calculate a "large function threshold". When more than one function's total
+  // import cost exceeds this value, we will try to assign it to an existing
+  // partition to reduce the amount of duplication needed.
+  //
+  // e.g. let two functions X and Y have a import cost of ~10% of the module, we
+  // assign X to a partition as usual, but when we get to Y, we check if it's
+  // worth also putting it in Y's partition.
+  const CostType LargeFnThreshold =
+      LargeFnFactor ? CostType(((ModuleCost / NumParts) * LargeFnFactor))
+                    : std::numeric_limits<CostType>::max();
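
For instance, with ModuleCost = 1000, NumParts = 4 and the default LargeFnFactor of 2.0, LargeFnThreshold = (1000 / 4) * 2.0 = 500: any root whose cost, including its dependencies, exceeds 500 is treated as large.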
+
+  std::vector<DenseSet<const Function *>> Partitions;
+  Partitions.resize(NumParts);
+
+  // Assign functions to partitions, and try to keep the partitions more or
+  // less balanced. We do that through a priority queue sorted in reverse, so we
+  // can always look at the partition with the least content.
+  //
+  // There are some cases where we will be deliberately unbalanced though.
+  //  - Large functions: we try to merge with existing partitions to reduce code
+  //  duplication.
+  //  - Functions with indirect or external calls always go in the first
+  //  partition (P0).
+  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
+                              const std::pair<PartitionID, CostType> &b) {
+    // When two partitions have the same cost, assign to the one with the
+    // biggest ID first. This allows us to put things in P0 last, because P0 may
+    // have other stuff added later.
+    if (a.second == b.second)
+      return a.first < b.first;
+    return a.second > b.second;
+  };
 
-  // Calculate costs and other useful information.
-  for (WorkListEntry &Entry : WorkList) {
-    for (unsigned NodeID : Entry.Cluster.set_bits()) {
-      const SplitGraph::Node &N = SG.getNode(NodeID);
-      const CostType Cost = N.getIndividualCost();
+  // We can't use priority_queue here because we need to be able to access any
+  // element. This makes this a bit inefficient as we need to sort it again
+  // every time we change it, but it's a very small array anyway (likely under 64
+  // partitions) so it's a cheap operation.
+  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
+  for (unsigned I = 0; I < NumParts; ++I)
+    BalancingQueue.emplace_back(I, 0);
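
A standalone sketch of how this reverse-sorted queue behaves (hypothetical costs; the most expensive partition sorts first, and on ties the higher ID sorts later, so the cheapest partition always sits at the back):

  #include <algorithm>
  #include <utility>
  #include <vector>

  int main() {
    // (PartitionID, Cost) pairs, e.g. P0 already holds cost 30.
    std::vector<std::pair<unsigned, unsigned>> Queue = {{0, 30}, {1, 10}, {2, 10}};
    std::sort(Queue.begin(), Queue.end(),
              [](const auto &A, const auto &B) {
                if (A.second == B.second)
                  return A.first < B.first; // equal costs: bigger ID goes last
                return A.second > B.second; // otherwise most expensive first
              });
    // Queue is now {{0, 30}, {1, 10}, {2, 10}}; Queue.back() picks P2, the
    // least loaded partition with the highest ID, mirroring ComparePartitions.
    return 0;
  }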
+
+  // Helper function to handle assigning a function to a partition. This takes
+  // care of updating the balancing queue.
+  const auto AssignToPartition = [&](PartitionID PID,
+                                     const FunctionWithDependencies &FWD) {
+    auto &FnsInPart = Partitions[PID];
+    FnsInPart.insert(FWD.Fn);
+    FnsInPart.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
+
+    SML << "assign " << getName(*FWD.Fn) << " to P" << PID << "\n  ->  ";
+    if (!FWD.Dependencies.empty())
+      SML << FWD.Dependencies.size() << " dependencies added\n";
+
+    // Update the balancing queue. We scan backwards because in the common case
+    // the partition is at the end.
+    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
+      if (QueuePID == PID) {
+        CostType NewCost = 0;
+        for (auto *Fn : Partitions[PID])
+          NewCost += FnCosts.at(Fn);
+
+        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
+        if (Cost) {
+          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
+              << "% increase)";
+        }
+        SML << '\n';
 
-      Entry.TotalCost += Cost;
-      if (!N.isGraphEntryPoint()) {
-        Entry.CostExcludingGraphEntryPoints += Cost;
-        ++Entry.NumNonEntryNodes;
+        Cost = NewCost;
       }
     }
-  }
 
-  sort(WorkList, [](const WorkListEntry &LHS, const WorkListEntry &RHS) {
-    return LHS.TotalCost > RHS.TotalCost;
-  });
-
-  LLVM_DEBUG({
-    dbgs() << "[recursive search] worklist:\n";
-    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
-      dbgs() << "  - [" << Idx << "]: ";
-      for (unsigned NodeID : Entry.Cluster.set_bits())
-        dbgs() << NodeID << " ";
-      dbgs() << "(total_cost:" << Entry.TotalCost
-             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
-             << ")\n";
-    }
-  });
-}
+    sort(BalancingQueue, ComparePartitions);
+  };
 
-void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
-                                             SplitProposal SP) {
-  while (Idx < WorkList.size()) {
-    // Step 1: Determine candidate PIDs.
-    //
-    const WorkListEntry &Entry = WorkList[Idx];
-    const BitVector &Cluster = Entry.Cluster;
-
-    // Default option is to do load-balancing, AKA assign to least pressured
-    // partition.
-    const unsigned CheapestPID = SP.findCheapestPartition();
-    assert(CheapestPID != InvalidPID);
-
-    // Explore assigning to the kernel that contains the most dependencies in
-    // common.
-    const auto [MostSimilarPID, SimilarDepsCost] =
-        findMostSimilarPartition(Entry, SP);
-
-    // We can choose to explore only one path if we only have one valid path, or
-    // if we reached maximum search depth and can no longer branch out.
-    unsigned SinglePIDToTry = InvalidPID;
-    if (MostSimilarPID == InvalidPID) // no similar PID found
-      SinglePIDToTry = CheapestPID;
-    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
-      SinglePIDToTry = CheapestPID;
-    else if (Depth >= MaxDepth) {
-      // We have to choose one path. Use a heuristic to guess which one will be
-      // more appropriate.
-      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
-        // Check if the amount of code in common makes it worth it.
-        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
-        const double Ratio =
-            SimilarDepsCost / Entry.CostExcludingGraphEntryPoints;
-        assert(Ratio >= 0.0 && Ratio <= 1.0);
-        if (LargeFnOverlapForMerge > Ratio) {
-          // For debug, just print "L", so we'll see "L3=P3" for instance, which
-          // will mean we reached max depth and chose P3 based on this
-          // heuristic.
-          LLVM_DEBUG(dbgs() << 'L');
-          SinglePIDToTry = MostSimilarPID;
-        }
-      } else
-        SinglePIDToTry = CheapestPID;
+  for (auto &CurFn : WorkList) {
+    // When a function has indirect calls, it must stay in the first partition
+    // alongside every reachable non-entry function. This is a nightmare case
+    // for splitting as it severely limits what we can do.
+    if (CurFn.HasIndirectCall) {
+      SML << "Function with indirect call(s): " << getName(*CurFn.Fn)
+          << " defaulting to P0\n";
+      AssignToPartition(0, CurFn);
+      continue;
     }
 
-    // Step 2: Explore candidates.
-
-    // When we only explore one possible path, and thus branch depth doesn't
-    // increase, do not recurse, iterate instead.
-    if (SinglePIDToTry != InvalidPID) {
-      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
-      // Only one path to explore, don't clone SP, don't increase depth.
-      SP.add(SinglePIDToTry, Cluster);
-      ++Idx;
+    // When a function has non-duplicatable dependencies, we have to keep it
+    // in the first partition as well. This is a conservative approach; a
+    // finer-grained one could track exactly which dependencies are
+    // non-duplicatable and just make sure those are grouped together.
+    if (CurFn.HasNonDuplicatableDependecy) {
+      SML << "Function with externally visible dependency "
+          << getName(*CurFn.Fn) << " defaulting to P0\n";
+      AssignToPartition(0, CurFn);
       continue;
     }
 
-    assert(MostSimilarPID != InvalidPID);
-
-    // We explore multiple paths: recurse at increased depth, then stop this
-    // function.
-
-    LLVM_DEBUG(dbgs() << '\n');
-
-    // lb = load balancing = put in cheapest partition
-    {
-      SplitProposal BranchSP = SP;
-      LLVM_DEBUG(dbgs().indent(Depth)
-                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
-      BranchSP.add(CheapestPID, Cluster);
-      pickPartition(Depth + 1, Idx + 1, BranchSP);
-    }
+    // Be smart with large functions to avoid duplicating their dependencies.
+    if (CurFn.isLarge(LargeFnThreshold)) {
+      assert(LargeFnOverlapForMerge >= 0.0f && LargeFnOverlapForMerge <= 1.0f);
+      SML << "Large Function: " << getName(*CurFn.Fn)
+          << " - looking for partition with at least "
+          << format("%0.2f", LargeFnOverlapForMerge * 100) << "% overlap\n";
+
+      bool Assigned = false;
+      for (const auto &[PID, Fns] : enumerate(Partitions)) {
+        float Overlap = calculateOverlap(CurFn.Dependencies, Fns);
+        SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
+            << PID << '\n';
+        if (Overlap > LargeFnOverlapForMerge) {
+          SML << "  selecting P" << PID << '\n';
+          AssignToPartition(PID, CurFn);
+          Assigned = true;
+        }
+      }
 
-    // ms = most similar = put in partition with the most in common
-    {
-      SplitProposal BranchSP = SP;
-      LLVM_DEBUG(dbgs().indent(Depth)
-                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
-      BranchSP.add(MostSimilarPID, Cluster);
-      pickPartition(Depth + 1, Idx + 1, BranchSP);
+      if (Assigned)
+        continue;
     }
 
-    return;
+    // Normal "load-balancing": assign to the least pressured partition.
+    auto [PID, CurCost] = BalancingQueue.back();
+    AssignToPartition(PID, CurFn);
   }
 
-  // Step 3: If we assigned all WorkList items, submit the proposal.
-
-  assert(Idx == WorkList.size());
-  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
-         "Search got out of bounds?");
-  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
-             std::to_string(NumProposalsSubmitted++));
-  LLVM_DEBUG(dbgs() << '\n');
-  SubmitProposal(SP);
-}
-
-std::pair<unsigned, CostType>
-RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
-                                                   const SplitProposal &SP) {
-  if (!Entry.NumNonEntryNodes)
-    return {InvalidPID, 0};
-
-  // We take the partition that is the most similar using Cost as a metric.
-  // So we take the set of nodes in common, compute their aggregated cost, and
-  // pick the partition with the highest cost in common.
-  unsigned ChosenPID = InvalidPID;
-  CostType ChosenCost = 0;
-  for (unsigned PID = 0; PID < NumParts; ++PID) {
-    BitVector BV = SP[PID];
-    BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!
-
-    if (BV.none())
-      continue;
-
-    const CostType Cost = SG.calculateCost(BV);
-
-    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
-        (ChosenCost == Cost && PID > ChosenPID)) {
-      ChosenPID = PID;
-      ChosenCost = Cost;
+  if (SML) {
+    CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
+    for (const auto &[Idx, Part] : enumerate(Partitions)) {
+      CostType Cost = 0;
+      for (auto *Fn : Part)
+        Cost += FnCosts.at(Fn);
+      SML << "P" << Idx << " has a total cost of " << Cost << " ("
+          << format("%0.2f", (float(Cost) / ModuleCostOr1) * 100)
+          << "% of source module)\n";
     }
-  }
-
-  return {ChosenPID, ChosenCost};
-}
 
-//===----------------------------------------------------------------------===//
-// DOTGraph Printing Support
-//===----------------------------------------------------------------------===//
-
-const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
-  return E->Dst;
-}
-
-using SplitGraphEdgeDstIterator =
-    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;
-
-} // namespace
-
-template <> struct GraphTraits<SplitGraph> {
-  using NodeRef = const SplitGraph::Node *;
-  using nodes_iterator = SplitGraph::nodes_iterator;
-  using ChildIteratorType = SplitGraphEdgeDstIterator;
-
-  using EdgeRef = const SplitGraph::Edge *;
-  using ChildEdgeIteratorType = SplitGraph::edges_iterator;
-
-  static NodeRef getEntryNode(NodeRef N) { return N; }
-
-  static ChildIteratorType child_begin(NodeRef Ref) {
-    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
-  }
-  static ChildIteratorType child_end(NodeRef Ref) {
-    return {Ref->outgoing_edges().end(), mapEdgeToDst};
-  }
-
-  static nodes_iterator nodes_begin(const SplitGraph &G) {
-    return G.nodes().begin();
-  }
-  static nodes_iterator nodes_end(const SplitGraph &G) {
-    return G.nodes().end();
-  }
-};
-
-template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
-  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
-
-  static std::string getGraphName(const SplitGraph &SG) {
-    return SG.getModule().getName().str();
-  }
-
-  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
-    return N->getName().str();
-  }
-
-  static std::string getNodeDescription(const SplitGraph::Node *N,
-                                        const SplitGraph &SG) {
-    std::string Result;
-    if (N->isEntryFunctionCC())
-      Result += "entry-fn-cc ";
-    if (N->isNonCopyable())
-      Result += "non-copyable ";
-    Result += "cost:" + std::to_string(N->getIndividualCost());
-    return Result;
-  }
-
-  static std::string getNodeAttributes(const SplitGraph::Node *N,
-                                       const SplitGraph &SG) {
-    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
+    SML << "--Partitioning Done--\n\n";
   }
 
-  static std::string getEdgeAttributes(const SplitGraph::Node *N,
-                                       SplitGraphEdgeDstIterator EI,
-                                       const SplitGraph &SG) {
+  // Check no functions were missed.
+#ifndef NDEBUG
+  DenseSet<const Function *> AllFunctions;
+  for (const auto &Part : Partitions)
+    AllFunctions.insert(Part.begin(), Part.end());
 
-    switch ((*EI.getCurrent())->Kind) {
-    case SplitGraph::EdgeKind::DirectCall:
-      return "";
-    case SplitGraph::EdgeKind::IndirectCall:
-      return "style=\"dashed\"";
+  for (auto &Fn : M) {
+    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
+      assert(AllFunctions.contains(&Fn) && "Missed a function?!");
     }
-    llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
   }
-};
-
-//===----------------------------------------------------------------------===//
-// Driver
-//===----------------------------------------------------------------------===//
-
-namespace {
+#endif
 
-// If we didn't externalize GVs, then local GVs need to be conservatively
-// imported into every module (including their initializers), and then cleaned
-// up afterwards.
-static bool needsConservativeImport(const GlobalValue *GV) {
-  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
-    return Var->hasLocalLinkage();
-  return isa<GlobalAlias>(GV);
+  return Partitions;
 }
 
-/// Prints a summary of the partition \p N, represented by module \p M, to \p
-/// OS.
-static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
-                                  unsigned PartCost, unsigned ModuleCost) {
-  OS << "*** Partition P" << N << " ***\n";
-
-  for (const auto &Fn : M) {
-    if (!Fn.isDeclaration())
-      OS << " - [function] " << Fn.getName() << "\n";
-  }
-
-  for (const auto &GV : M.globals()) {
-    if (GV.hasInitializer())
-      OS << " - [global] " << GV.getName() << "\n";
+static void externalize(GlobalValue &GV) {
+  if (GV.hasLocalLinkage()) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    GV.setVisibility(GlobalValue::HiddenVisibility);
   }
 
-  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
-     << "% of the source\n";
-}
-
-static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
-  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");
-
-  New.calculateScores();
-
-  LLVM_DEBUG({
-    New.verifyCompleteness();
-    if (DebugProposalSearch)
-      New.print(dbgs());
-  });
-
-  const double CurBScore = Best.getBottleneckScore();
-  const double CurCSScore = Best.getCodeSizeScore();
-  const double NewBScore = New.getBottleneckScore();
-  const double NewCSScore = New.getCodeSizeScore();
-
-  // TODO: Improve this
-  //    We can probably lower the precision of the comparison at first
-  //    e.g. if we have
-  //      - (Current): BScore: 0.489 CSCore 1.105
-  //      - (New): BScore: 0.475 CSCore 1.305
-  //    Currently we'd choose the new one because the bottleneck score is
-  //    lower, but the new one duplicates more code. It may be worth it to
-  //    discard the new proposal as the impact on build time is negligible.
-
-  // Compare them
-  bool IsBest = false;
-  if (NewBScore < CurBScore)
-    IsBest = true;
-  else if (NewBScore == CurBScore)
-    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.
-
-  if (IsBest)
-    Best = std::move(New);
-
-  LLVM_DEBUG(if (DebugProposalSearch) {
-    if (IsBest)
-      dbgs() << "[search] new best proposal!\n";
-    else
-      dbgs() << "[search] discarding - not profitable\n";
-  });
-}
-
-/// Trivial helper to create an identical copy of \p M.
-static std::unique_ptr<Module> cloneAll(const Module &M) {
-  ValueToValueMapTy VMap;
-  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
+  // Unnamed entities must be named consistently between modules. setName will
+  // give a distinct name to each such entity.
+  if (!GV.hasName())
+    GV.setName("__llvmsplit_unnamed");
 }
 
-/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
-static void writeDOTGraph(const SplitGraph &SG) {
-  if (ModuleDotCfgOutput.empty())
-    return;
-
-  std::error_code EC;
-  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
-  if (EC) {
-    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
-           << "' - DOTGraph will not be printed\n";
+static bool hasDirectCaller(const Function &Fn) {
+  for (auto &U : Fn.uses()) {
+    if (auto *CB = dyn_cast<CallBase>(U.getUser()); CB && CB->isCallee(&U))
+      return true;
   }
-  WriteGraph(OS, SG, /*ShortName=*/false,
-             /*Title=*/SG.getModule().getName());
+  return false;
 }
 
 static void splitAMDGPUModule(
-    GetTTIFn GetTTI, Module &M, unsigned NumParts,
+    GetTTIFn GetTTI, Module &M, unsigned N,
     function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
+
+  SplitModuleLogger SML(M);
+
   CallGraph CG(M);
 
   // Externalize functions whose address are taken.
@@ -1341,8 +639,8 @@ static void splitAMDGPUModule(
   for (auto &Fn : M) {
     if (Fn.hasAddressTaken()) {
       if (Fn.hasLocalLinkage()) {
-        LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName()
-                          << " because its address is taken\n");
+        SML << "[externalize] " << Fn.getName()
+            << " because its address is taken\n";
       }
       externalize(Fn);
     }
@@ -1353,179 +651,138 @@ static void splitAMDGPUModule(
   if (!NoExternalizeGlobals) {
     for (auto &GV : M.globals()) {
       if (GV.hasLocalLinkage())
-        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
+        SML << "[externalize] GV " << GV.getName() << '\n';
       externalize(GV);
     }
   }
 
   // Start by calculating the cost of every function in the module, as well as
   // the module's overall cost.
-  FunctionsCostMap FnCosts;
-  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);
-
-  // Build the SplitGraph, which represents the module's functions and models
-  // their dependencies accurately.
-  SplitGraph SG(M, FnCosts, ModuleCost);
-  SG.buildGraph(CG);
-
-  if (SG.empty()) {
-    LLVM_DEBUG(
-        dbgs()
-        << "[!] no nodes in graph, input is empty - no splitting possible\n");
-    ModuleCallback(cloneAll(M));
-    return;
+  DenseMap<const Function *, CostType> FnCosts;
+  const CostType ModuleCost = calculateFunctionCosts(SML, GetTTI, M, FnCosts);
+
+  // First, gather every kernel into the worklist.
+  SmallVector<FunctionWithDependencies> WorkList;
+  for (auto &Fn : M) {
+    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
+      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
   }
 
-  LLVM_DEBUG({
-    dbgs() << "[graph] nodes:\n";
-    for (const SplitGraph::Node *N : SG.nodes()) {
-      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
-             << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n";
+  // Then, find missing functions that need to be considered as additional
+  // roots. These can't be called in theory, but in practice we still have to
+  // handle them to avoid linker errors.
+  {
+    DenseSet<const Function *> SeenFunctions;
+    for (const auto &FWD : WorkList) {
+      SeenFunctions.insert(FWD.Fn);
+      SeenFunctions.insert(FWD.Dependencies.begin(), FWD.Dependencies.end());
     }
-  });
 
-  writeDOTGraph(SG);
-
-  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");
-
-  std::optional<SplitProposal> Proposal;
-  const auto EvaluateProposal = [&](SplitProposal SP) {
-    if (!Proposal)
-      Proposal = std::move(SP);
-    else
-      evaluateProposal(*Proposal, std::move(SP));
-  };
-
-  // TODO: It would be very easy to create new strategies by just adding a base
-  // class to RecursiveSearchSplitting and abstracting it away.
-  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
-  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
-                                  << Proposal->getName() << "\n";);
-
-  if (!Proposal) {
-    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
-    ModuleCallback(cloneAll(M));
-    return;
+    for (auto &Fn : M) {
+      // If this function is not part of any kernel's dependencies and isn't
+      // directly called, consider it as a root.
+      if (!Fn.isDeclaration() && !isEntryPoint(&Fn) &&
+          !SeenFunctions.count(&Fn) && !hasDirectCaller(Fn)) {
+        WorkList.emplace_back(SML, CG, FnCosts, &Fn);
+      }
+    }
   }
 
-  LLVM_DEBUG(Proposal->print(dbgs()););
+  // Sort the worklist so the most expensive roots are seen first.
+  sort(WorkList, [&](auto &A, auto &B) {
+    // Sort by total cost, and if the total cost is identical, sort
+    // alphabetically.
+    if (A.TotalCost == B.TotalCost)
+      return A.Fn->getName() < B.Fn->getName();
+    return A.TotalCost > B.TotalCost;
+  });
 
-  std::optional<raw_fd_ostream> SummariesOS;
-  if (!PartitionSummariesOutput.empty()) {
-    std::error_code EC;
-    SummariesOS.emplace(PartitionSummariesOutput, EC);
-    if (EC)
-      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
-             << "' - Partition summaries will not be printed\n";
+  if (SML) {
+    SML << "Worklist\n";
+    for (const auto &FWD : WorkList) {
+      SML << "[root] " << getName(*FWD.Fn) << " (totalCost:" << FWD.TotalCost
+          << " indirect:" << FWD.HasIndirectCall
+          << " hasNonDuplicatableDep:" << FWD.HasNonDuplicatableDependecy
+          << ")\n";
+      // Sort function names before printing to ensure determinism.
+      SmallVector<std::string> SortedDepNames;
+      SortedDepNames.reserve(FWD.Dependencies.size());
+      for (const auto *Dep : FWD.Dependencies)
+        SortedDepNames.push_back(getName(*Dep));
+      sort(SortedDepNames);
+
+      for (const auto &Name : SortedDepNames)
+        SML << "  [dependency] " << Name << '\n';
+    }
   }
 
-  for (unsigned PID = 0; PID < NumParts; ++PID) {
-    SplitModuleTimer SMT2("modules_creation",
-                          "creating modules for each partition");
-    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");
+  // This performs all of the partitioning work.
+  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
+  assert(Partitions.size() == N);
+
+  // If we didn't externalize GVs, then local GVs need to be conservatively
+  // imported into every module (including their initializers), and then cleaned
+  // up afterwards.
+  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
+    // We conservatively import private/internal GVs into every module and clean
+    // them up afterwards.
+    const auto *Var = dyn_cast<GlobalVariable>(GV);
+    return Var && Var->hasLocalLinkage();
+  };
 
-    DenseSet<const Function *> FnsInPart;
-    for (unsigned NodeID : (*Proposal)[PID].set_bits())
-      FnsInPart.insert(&SG.getNode(NodeID).getFunction());
+  SML << "Creating " << N << " modules...\n";
+  unsigned TotalFnImpls = 0;
+  for (unsigned I = 0; I < N; ++I) {
+    const auto &FnsInPart = Partitions[I];
 
     ValueToValueMapTy VMap;
-    CostType PartCost = 0;
     std::unique_ptr<Module> MPart(
         CloneModule(M, VMap, [&](const GlobalValue *GV) {
           // Functions go in their assigned partition.
-          if (const auto *Fn = dyn_cast<Function>(GV)) {
-            if (FnsInPart.contains(Fn)) {
-              PartCost += SG.getCost(*Fn);
-              return true;
-            }
-            return false;
-          }
+          if (const auto *Fn = dyn_cast<Function>(GV))
+            return FnsInPart.contains(Fn);
+
+          if (NeedsConservativeImport(GV))
+            return true;
 
           // Everything else goes in the first partition.
-          return needsConservativeImport(GV) || PID == 0;
+          return I == 0;
         }));
 
-    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
-    // bugs are possible.
-
     // Clean-up conservatively imported GVs without any users.
-    for (auto &GV : make_early_inc_range(MPart->global_values())) {
-      if (needsConservativeImport(&GV) && GV.use_empty())
+    for (auto &GV : make_early_inc_range(MPart->globals())) {
+      if (NeedsConservativeImport(&GV) && GV.use_empty())
         GV.eraseFromParent();
     }
 
-    if (SummariesOS)
-      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);
-
-    LLVM_DEBUG(
-        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));
-
+    unsigned NumAllFns = 0, NumKernels = 0;
+    for (auto &Cur : *MPart) {
+      if (!Cur.isDeclaration()) {
+        ++NumAllFns;
+        if (isEntryPoint(&Cur))
+          ++NumKernels;
+      }
+    }
+    TotalFnImpls += NumAllFns;
+    SML << "  - Module " << I << " with " << NumAllFns << " functions ("
+        << NumKernels << " kernels)\n";
     ModuleCallback(std::move(MPart));
   }
+
+  SML << TotalFnImpls << " function definitions across all modules ("
+      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
+      << "% of original module)\n";
 }
 } // namespace
 
 PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
-  SplitModuleTimer SMT(
-      "total", "total pass runtime (incl. potentially waiting for lockfile)");
-
   FunctionAnalysisManager &FAM =
       MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
   const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
     return FAM.getResult<TargetIRAnalysis>(F);
   };
-
-  bool Done = false;
-#ifndef NDEBUG
-  if (UseLockFile) {
-    SmallString<128> LockFilePath;
-    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
-    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
-                      << "'\n");
-
-    while (true) {
-      llvm::LockFileManager Locked(LockFilePath.str());
-      switch (Locked) {
-      case LockFileManager::LFS_Error:
-        LLVM_DEBUG(
-            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
-                      "output may be mangled by other processes\n");
-        Locked.unsafeRemoveLockFile();
-        break;
-      case LockFileManager::LFS_Owned:
-        break;
-      case LockFileManager::LFS_Shared: {
-        switch (Locked.waitForUnlock()) {
-        case LockFileManager::Res_Success:
-          break;
-        case LockFileManager::Res_OwnerDied:
-          continue; // try again to get the lock.
-        case LockFileManager::Res_Timeout:
-          LLVM_DEBUG(
-              dbgs()
-              << "[amdgpu-split-module] unable to acquire lockfile, debug "
-                 "output may be mangled by other processes\n");
-          Locked.unsafeRemoveLockFile();
-          break; // give up
-        }
-        break;
-      }
-      }
-
-      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
-      Done = true;
-      break;
-    }
-  }
-#endif
-
-  if (!Done)
-    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
-
-  // We can change linkage/visibilities in the input, consider that nothing is
-  // preserved just to be safe. This pass runs last anyway.
-  return PreservedAnalyses::none();
+  splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
+  // We don't change the original module.
+  return PreservedAnalyses::all();
 }
-} // namespace llvm
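
For readers skimming the restored pass, the per-root assignment loop in doPartitioning above boils down to a small greedy heuristic: roots with indirect calls or non-duplicatable dependencies are pinned to P0, large roots are merged into a partition that already holds enough of their dependencies, and everything else goes to the currently cheapest partition. The following standalone C++ sketch illustrates the idea only; names, thresholds and the overlap formula are illustrative and not the pass's actual API (the real code weights overlap by estimated cost, and its loop assigns a large root to every partition whose overlap clears the threshold, while this sketch stops at the first match).

// Standalone sketch of the greedy assignment heuristic; illustrative only.
#include <algorithm>
#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Root {
  std::string Name;
  std::set<std::string> Deps; // non-kernel functions this root pulls in
  unsigned Cost = 0;          // estimated codegen cost of root + deps
  bool HasIndirectCall = false;
  bool HasNonDuplicatableDep = false;
};

int main() {
  const unsigned NumParts = 3;
  const unsigned LargeThreshold = 50; // assumed: roots above this are "large"
  const double MergeOverlap = 0.7;    // assumed: required dependency overlap

  std::vector<Root> Roots = {
      {"kernelA", {"helpA", "helpShared"}, 80},
      {"kernelB", {"helpB", "helpShared"}, 75},
      {"kernelC", {"helpC"}, 10},
  };

  // Most expensive roots first, ties broken by name for determinism.
  std::sort(Roots.begin(), Roots.end(), [](const Root &L, const Root &R) {
    return L.Cost != R.Cost ? L.Cost > R.Cost : L.Name < R.Name;
  });

  std::vector<std::set<std::string>> PartDeps(NumParts);
  std::vector<unsigned> PartCost(NumParts, 0);

  auto Assign = [&](unsigned PID, const Root &R) {
    PartCost[PID] += R.Cost;
    PartDeps[PID].insert(R.Deps.begin(), R.Deps.end());
    std::printf("%s -> P%u\n", R.Name.c_str(), PID);
  };

  for (const Root &R : Roots) {
    // Roots that cannot be duplicated safely are pinned to P0.
    if (R.HasIndirectCall || R.HasNonDuplicatableDep) {
      Assign(0, R);
      continue;
    }
    // Large roots: prefer a partition that already holds most of their deps.
    if (R.Cost > LargeThreshold) {
      bool Done = false;
      for (unsigned PID = 0; PID != NumParts && !Done; ++PID) {
        unsigned Shared = 0;
        for (const auto &D : R.Deps)
          Shared += PartDeps[PID].count(D);
        double Overlap = R.Deps.empty() ? 0.0 : double(Shared) / R.Deps.size();
        if (Overlap > MergeOverlap) {
          Assign(PID, R);
          Done = true;
        }
      }
      if (Done)
        continue;
    }
    // Otherwise, plain load balancing: the cheapest partition wins.
    unsigned PID = std::min_element(PartCost.begin(), PartCost.end()) -
                   PartCost.begin();
    Assign(PID, R);
  }
}
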
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
index 708b5a006be60e..d269f92763853c 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
@@ -1,24 +1,30 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
 ; 3 kernels:
 ;   - A does a direct call to HelperA
 ;   - B is storing @HelperA
 ;   - C does a direct call to HelperA
 ;
-; The helper functions will get externalized, so C/A will end up
-; in the same partition.
-
-; P0 is empty.
-; CHECK0: declare
-
-; CHECK1: define amdgpu_kernel void @B(ptr %dst)
-
-; CHECK2: define hidden void @HelperA()
-; CHECK2: define amdgpu_kernel void @A()
-; CHECK2: define amdgpu_kernel void @C()
+; The helper functions will get externalized, which will force A and C into
+; P0, as external functions cannot be duplicated.
+
+; CHECK0: define hidden void @HelperA()
+; CHECK0: define amdgpu_kernel void @A()
+; CHECK0: declare amdgpu_kernel void @B(ptr)
+; CHECK0: define amdgpu_kernel void @C()
+
+; CHECK1: declare hidden void @HelperA()
+; CHECK1: declare amdgpu_kernel void @A()
+; CHECK1: declare amdgpu_kernel void @B(ptr)
+; CHECK1: declare amdgpu_kernel void @C()
+
+; CHECK2: declare hidden void @HelperA()
+; CHECK2: declare amdgpu_kernel void @A()
+; CHECK2: define amdgpu_kernel void @B(ptr %dst)
+; CHECK2: declare amdgpu_kernel void @C()
 
 define internal void @HelperA() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
index 81f6c8f0fbb3a6..731cf4b374c95b 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0
+; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
new file mode 100644
index 00000000000000..6a07ed51ba1beb
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
@@ -0,0 +1,20 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel
+; REQUIRES: asserts
+
+; SHA256 of the kernel names.
+
+; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c
+; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59
+; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55
+
+define amdgpu_kernel void @MyCustomKernel0() {
+  ret void
+}
+
+define amdgpu_kernel void @MyCustomKernel1() {
+  ret void
+}
+
+define amdgpu_kernel void @MyCustomKernel2() {
+  ret void
+}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
new file mode 100644
index 00000000000000..836b5c05d0653d
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll
@@ -0,0 +1,36 @@
+; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -debug 2>&1 | FileCheck %s --implicit-check-not="[root]"
+; REQUIRES: asserts
+
+; func_3 is never directly called, so it needs to be considered
+; as a root to handle this module correctly.
+
+; CHECK:      [root] kernel_1
+; CHECK-NEXT:   [dependency] func_1
+; CHECK-NEXT:   [dependency] func_2
+; CHECK-NEXT: [root] func_3
+; CHECK-NEXT:   [dependency] func_2
+
+define amdgpu_kernel void @kernel_1() {
+entry:
+  call void @func_1()
+  ret void
+}
+
+define linkonce_odr hidden void @func_1() {
+entry:
+  %call = call i32 @func_2()
+  ret void
+}
+
+define linkonce_odr hidden i32 @func_2() #0 {
+entry:
+  ret i32 0
+}
+
+define void @func_3() {
+entry:
+  %call = call i32 @func_2()
+  ret void
+}
+
+attributes #0 = { noinline optnone }
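
The test above exercises the extra-root discovery done by the restored splitAMDGPUModule: any defined function that is neither a kernel, nor part of a kernel's dependency set, nor directly called, is promoted to an additional root so it still lands in some partition. Below is a toy C++ model of that filter for illustration; it uses a hand-rolled call-graph map rather than LLVM's CallGraph, and all names are made up for the example.

// Toy model of the extra-root filter; not LLVM's API.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Fn {
  bool IsKernel = false;
  bool IsDeclaration = false;
  std::set<std::string> Callees; // direct callees only
};

int main() {
  // Mirrors debug-non-kernel-root.ll: func_3 has no direct caller.
  std::map<std::string, Fn> M = {
      {"kernel_1", {true, false, {"func_1"}}},
      {"func_1", {false, false, {"func_2"}}},
      {"func_2", {false, false, {}}},
      {"func_3", {false, false, {"func_2"}}},
  };

  std::set<std::string> Seen, DirectlyCalled;
  std::vector<std::string> Roots;

  // Kernels are always roots; record them and their transitive dependencies.
  for (const auto &[Name, F] : M) {
    for (const auto &Callee : F.Callees)
      DirectlyCalled.insert(Callee);
    if (!F.IsKernel || F.IsDeclaration)
      continue;
    Roots.push_back(Name);
    Seen.insert(Name);
    std::vector<std::string> Stack(F.Callees.begin(), F.Callees.end());
    while (!Stack.empty()) {
      std::string Cur = Stack.back();
      Stack.pop_back();
      if (!Seen.insert(Cur).second)
        continue;
      for (const auto &Next : M[Cur].Callees)
        Stack.push_back(Next);
    }
  }

  // Promote leftover definitions with no direct caller to extra roots.
  for (const auto &[Name, F] : M) {
    if (!F.IsDeclaration && !F.IsKernel && !Seen.count(Name) &&
        !DirectlyCalled.count(Name))
      Roots.push_back(Name);
  }

  for (const auto &R : Roots)
    std::printf("[root] %s\n", R.c_str()); // prints kernel_1, then func_3
}
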
diff --git a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll
index 755676061b2557..10b6cdfef4055f 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/declarations.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/declarations.ll
@@ -1,13 +1,16 @@
 ; RUN: rm -rf %t0 %t1
 ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: not llvm-dis -o - %t1
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 
-; Empty module without any defs should result in a single output module that is
-; an exact copy of the input.
+; Check that all declarations are put into each partition.
 
 ; CHECK0: declare void @A
 ; CHECK0: declare void @B
 
+; CHECK1: declare void @A
+; CHECK1: declare void @B
+
 declare void @A()
+
 declare void @B()
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
index d7e84abd5f968d..c2746d1398924c 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
 
 ; 3 kernels:
 ;   - A calls nothing
@@ -13,12 +13,16 @@
 ; Additionally, @PerryThePlatypus gets externalized as
 ; the alias counts as taking its address.
 
-; CHECK0: define amdgpu_kernel void @A
+; CHECK0-NOT: define
+; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus
+; CHECK0: define hidden void @PerryThePlatypus()
+; CHECK0: define amdgpu_kernel void @B
+; CHECK0: define amdgpu_kernel void @C
+; CHECK0-NOT: define
 
-; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus
-; CHECK1: define hidden void @PerryThePlatypus()
-; CHECK1: define amdgpu_kernel void @B
-; CHECK1: define amdgpu_kernel void @C
+; CHECK1-NOT: define
+; CHECK1: define amdgpu_kernel void @A
+; CHECK1-NOT: define
 
 @Perry = internal alias ptr(), ptr @PerryThePlatypus
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
index c7e13304dc6dec..4635264aefb39a 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
@@ -1,21 +1,27 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
 ; 3 kernels, each with their own dependencies, should go into 3
 ; distinct partitions. The most expensive kernel should be
 ; seen first and go into the last partition.
 
+; CHECK0-NOT: define
 ; CHECK0: define amdgpu_kernel void @C
 ; CHECK0: define internal void @HelperC
 ; CHECK0-NOT: define
 
+; CHECK1-NOT: define
 ; CHECK1: define amdgpu_kernel void @A
 ; CHECK1: define internal void @HelperA
+; CHECK1-NOT: define
 
+; CHECK2-NOT: define
 ; CHECK2: define amdgpu_kernel void @B
 ; CHECK2: define internal void @HelperB
+; CHECK2-NOT: define
+
 
 define amdgpu_kernel void @A() {
   call void @HelperA()
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
index 332344a776e82e..435e97a5813400 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
@@ -1,20 +1,29 @@
 ; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s
 
-; CHECK0: define internal void @PrivateHelper1()
-; CHECK0: define amdgpu_kernel void @D
+; Both overridable helpers should go in P0.
 
-; CHECK1: define internal void @PrivateHelper0()
-; CHECK1: define amdgpu_kernel void @C
+; CHECK0-NOT: define
+; CHECK0: define available_externally void @OverridableHelper0()
+; CHECK0: define internal void @OverridableHelper1()
+; CHECK0: define amdgpu_kernel void @A
+; CHECK0: define amdgpu_kernel void @B
+; CHECK0-NOT: define
 
-; CHECK2: define internal void @OverridableHelper1()
-; CHECK2: define amdgpu_kernel void @B
+; CHECK1-NOT: define
 
-; CHECK3: define available_externally void @OverridableHelper0()
-; CHECK3: define amdgpu_kernel void @A
+; CHECK2-NOT: define
+; CHECK2: define internal void @PrivateHelper1()
+; CHECK2: define amdgpu_kernel void @D
+; CHECK2-NOT: define
+
+; CHECK3-NOT: define
+; CHECK3: define internal void @PrivateHelper0()
+; CHECK3: define amdgpu_kernel void @C
+; CHECK3-NOT: define
 
 define available_externally void @OverridableHelper0() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
index 5be945bda48bf4..2d870039112cbf 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
 ; We have 4 kernels:
 ;   - Each kernel has an internal helper
@@ -15,19 +15,25 @@
 ; indirect call. HelperC/D should also end up in P0 as they
 ; are dependencies of HelperB.
 
+; CHECK0-NOT: define
+; CHECK0: define hidden void @HelperA
+; CHECK0: define hidden void @HelperB
+; CHECK0: define hidden void @CallCandidate
+; CHECK0: define internal void @HelperC
 ; CHECK0: define internal void @HelperD
-; CHECK0: define amdgpu_kernel void @D
+; CHECK0: define amdgpu_kernel void @A
+; CHECK0: define amdgpu_kernel void @B
+; CHECK0-NOT: define
 
-; CHECK1: define internal void @HelperC
-; CHECK1: define amdgpu_kernel void @C
+; CHECK1-NOT: define
+; CHECK1: define internal void @HelperD
+; CHECK1: define amdgpu_kernel void @D
+; CHECK1-NOT: define
 
-; CHECK2: define hidden void @HelperA
-; CHECK2: define hidden void @HelperB
-; CHECK2: define hidden void @CallCandidate
+; CHECK2-NOT: define
 ; CHECK2: define internal void @HelperC
-; CHECK2: define internal void @HelperD
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define amdgpu_kernel void @B
+; CHECK2: define amdgpu_kernel void @C
+; CHECK2-NOT: define
 
 @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
 
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
index 9205a5d1930e52..dc2c5c3c07bee6 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
@@ -1,15 +1,21 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
-
-; CHECK0: define amdgpu_kernel void @D
-
-; CHECK1: define amdgpu_kernel void @C
-
-; CHECK2: define void @ExternalHelper
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define amdgpu_kernel void @B
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
+
+; CHECK0-NOT: define
+; CHECK0: define void @ExternalHelper
+; CHECK0: define amdgpu_kernel void @A
+; CHECK0: define amdgpu_kernel void @B
+; CHECK0-NOT: define
+
+; CHECK1-NOT: define
+; CHECK1: define amdgpu_kernel void @D
+; CHECK1-NOT: define
+
+; CHECK2-NOT: define
+; CHECK2: define amdgpu_kernel void @C
+; CHECK2-NOT: define
 
 define void @ExternalHelper() {
   ret void
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
index a184d92aea9b9f..0fc76934afc548 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
@@ -1,20 +1,26 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
 ; 3 kernels use private/internal global variables.
 ; The GVs should be copied in each partition as needed.
 
+; CHECK0-NOT: define
 ; CHECK0: @bar = internal constant ptr
 ; CHECK0: define amdgpu_kernel void @C
+; CHECK0-NOT: define
 
+; CHECK1-NOT: define
 ; CHECK1: @foo = private constant ptr
 ; CHECK1: define amdgpu_kernel void @A
+; CHECK1-NOT: define
 
+; CHECK2-NOT: define
 ; CHECK2: @foo = private constant ptr
 ; CHECK2: @bar = internal constant ptr
 ; CHECK2: define amdgpu_kernel void @B
+; CHECK2-NOT: define
 
 @foo = private constant ptr poison
 @bar = internal constant ptr poison
diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
index be84a0b5916f0d..7564662e7c7c0c 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
@@ -1,22 +1,28 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
 ; 3 kernels use private/internal global variables.
 ; The GVs should be copied in each partition as needed.
 
+; CHECK0-NOT: define
 ; CHECK0: @foo = hidden constant ptr poison
 ; CHECK0: @bar = hidden constant ptr poison
 ; CHECK0: define amdgpu_kernel void @C
+; CHECK0-NOT: define
 
+; CHECK1-NOT: define
 ; CHECK1: @foo = external hidden constant ptr{{$}}
 ; CHECK1: @bar = external hidden constant ptr{{$}}
 ; CHECK1: define amdgpu_kernel void @A
+; CHECK1-NOT: define
 
+; CHECK2-NOT: define
 ; CHECK2: @foo = external hidden constant ptr{{$}}
 ; CHECK2: @bar = external hidden constant ptr{{$}}
 ; CHECK2: define amdgpu_kernel void @B
+; CHECK2-NOT: define
 
 @foo = private constant ptr poison
 @bar = internal constant ptr poison
diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
index 807fb2e5f33cea..459c5a7f1a2db3 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
@@ -1,12 +1,12 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=1.2 -amdgpu-module-splitting-large-function-merge-overlap=0.5
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 
-; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
-; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 --implicit-check-not=define %s
+; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-function-threshold=0
+; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s
+; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s
+; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s
 
 ; 2 kernels (A/B) are large and share all their dependencies.
 ; They should go in the same partition, the remaining kernel should
@@ -15,12 +15,14 @@
 ; Also check w/o large kernels processing to verify they are indeed handled
 ; differently.
 
-; P0 is empty
-; CHECK0: declare
+; CHECK0-NOT: define
 
+; CHECK1-NOT: define
 ; CHECK1: define internal void @HelperC()
 ; CHECK1: define amdgpu_kernel void @C
+; CHECK1-NOT: define
 
+; CHECK2-NOT: define
 ; CHECK2: define internal void @large2()
 ; CHECK2: define internal void @large1()
 ; CHECK2: define internal void @large0()
@@ -28,9 +30,12 @@
 ; CHECK2: define internal void @HelperB()
 ; CHECK2: define amdgpu_kernel void @A
 ; CHECK2: define amdgpu_kernel void @B
+; CHECK2-NOT: define
 
+; NOLARGEKERNELS-CHECK0-NOT: define
 ; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
 ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C
+; NOLARGEKERNELS-CHECK0-NOT: define
 
 ; NOLARGEKERNELS-CHECK1: define internal void @large2()
 ; NOLARGEKERNELS-CHECK1: define internal void @large1()
@@ -44,7 +49,6 @@
 ; NOLARGEKERNELS-CHECK2: define internal void @HelperA()
 ; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A
 
-
 define internal void @large2() {
   store volatile i32 42, ptr null
   call void @large2()
diff --git a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
index 1314a78b42f3b0..167930ce0e8063 100644
--- a/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
+++ b/llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=DEFINE %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=DEFINE %s
+; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=DEFINE %s
 
 ; We have 4 functions:
 ;   - Each function has an internal helper
@@ -11,19 +11,19 @@
 ; @CallCandidate doesn't have to be in A/B's partition, unlike
 ; in the corresponding tests for kernels where it has to.
 
+; CHECK0: define hidden void @HelperA
+; CHECK0: define hidden void @HelperB
 ; CHECK0: define internal void @HelperC
 ; CHECK0: define internal void @HelperD
-; CHECK0: define internal void @C
-; CHECK0: define internal void @D
+; CHECK0: define void @A
+; CHECK0: define void @B
 
-; CHECK1: define hidden void @HelperA
-; CHECK1: define hidden void @CallCandidate()
-; CHECK1: define internal void @A
+; CHECK1: define internal void @HelperD
+; CHECK1: define void @D
 
-; CHECK2: define hidden void @HelperB
+; CHECK2: define hidden void @CallCandidate
 ; CHECK2: define internal void @HelperC
-; CHECK2: define internal void @HelperD
-; CHECK2: define internal void @B
+; CHECK2: define void @C
 
 @addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
 
@@ -51,22 +51,22 @@ define internal void @HelperD() {
   ret void
 }
 
-define internal void @A(ptr %call) {
+define void @A(ptr %call) {
   call void @HelperA(ptr %call)
   ret void
 }
 
-define internal void @B(ptr %call) {
+define void @B(ptr %call) {
   call void @HelperB(ptr %call)
   ret void
 }
 
-define internal void @C() {
+define void @C() {
   call void @HelperC()
   ret void
 }
 
-define internal void @D() {
+define void @D() {
   call void @HelperD()
   ret void
 }
diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll
deleted file mode 100644
index 01f2f3627f9905..00000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2
-; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s
-
-; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=2
-; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s
-
-; Test the specifics of the search algorithm.
-; This test will change depending on new heuristics we add or remove.
-
-; --------------------------------------------
-
-; SPLIT3-CHECK0: define internal void @HelperA()
-; SPLIT3-CHECK0: define internal void @HelperB()
-; SPLIT3-CHECK0: define internal void @HelperC()
-; SPLIT3-CHECK0: define amdgpu_kernel void @AB()
-; SPLIT3-CHECK0: define amdgpu_kernel void @BC()
-
-; SPLIT3-CHECK1: define amdgpu_kernel void @A()
-; SPLIT3-CHECK1: define internal void @HelperA()
-; SPLIT3-CHECK1: define amdgpu_kernel void @C()
-; SPLIT3-CHECK1: define internal void @HelperC()
-
-; SPLIT3-CHECK2: define internal void @HelperA()
-; SPLIT3-CHECK2: define amdgpu_kernel void @B()
-; SPLIT3-CHECK2: define internal void @HelperB()
-; SPLIT3-CHECK2: define internal void @HelperC()
-; SPLIT3-CHECK2: define amdgpu_kernel void @ABC()
-
-; --------------------------------------------
-
-; SPLIT5-CHECK0: define amdgpu_kernel void @A()
-; SPLIT5-CHECK0: define internal void @HelperA()
-; SPLIT5-CHECK0: define amdgpu_kernel void @B()
-; SPLIT5-CHECK0: define internal void @HelperB()
-
-; SPLIT5-CHECK1: define internal void @HelperB()
-; SPLIT5-CHECK1: define internal void @HelperC()
-; SPLIT5-CHECK1: define amdgpu_kernel void @BC
-
-; SPLIT5-CHECK2: define internal void @HelperA()
-; SPLIT5-CHECK2: define internal void @HelperB()
-; SPLIT5-CHECK2: define amdgpu_kernel void @AB()
-
-; SPLIT5-CHECK3: define amdgpu_kernel void @C()
-; SPLIT5-CHECK3: define internal void @HelperC()
-
-; SPLIT5-CHECK4: define internal void @HelperA()
-; SPLIT5-CHECK4: define internal void @HelperB()
-; SPLIT5-CHECK4: define internal void @HelperC()
-; SPLIT5-CHECK4: define amdgpu_kernel void @ABC()
-
-define amdgpu_kernel void @A() {
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperA()
-  ret void
-}
-
-define internal void @HelperA() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperB()
-  ret void
-}
-
-define internal void @HelperB() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperC()
-  ret void
-}
-
-define internal void @HelperC() {
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @AB() {
-  store volatile i32 42, ptr null
-  call void @HelperA()
-  call void @HelperB()
-  ret void
-}
-
-define amdgpu_kernel void @BC() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  call void @HelperB()
-  call void @HelperC()
-  ret void
-}
-
-define amdgpu_kernel void @ABC() {
-  call void @HelperA()
-  call void @HelperB()
-  call void @HelperC()
-  ret void
-}
diff --git a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll b/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll
deleted file mode 100644
index eae57a19883106..00000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; RUN: llvm-split -o %t_s3_ %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=8
-; RUN: llvm-dis -o - %t_s3_0 | FileCheck --check-prefix=SPLIT3-CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s3_1 | FileCheck --check-prefix=SPLIT3-CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s3_2 | FileCheck --check-prefix=SPLIT3-CHECK2 --implicit-check-not=define %s
-
-; RUN: llvm-split -o %t_s5_ %s -j 5 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=8
-; RUN: llvm-dis -o - %t_s5_0 | FileCheck --check-prefix=SPLIT5-CHECK0 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_1 | FileCheck --check-prefix=SPLIT5-CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_2 | FileCheck --check-prefix=SPLIT5-CHECK2 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_3 | FileCheck --check-prefix=SPLIT5-CHECK3 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t_s5_4 | FileCheck --check-prefix=SPLIT5-CHECK4 --implicit-check-not=define %s
-
-; Test the specifics of the search algorithm.
-; This test will change depending on new heuristics we add or remove.
-
-; --------------------------------------------
-
-; SPLIT3-CHECK0: define internal void @HelperA()
-; SPLIT3-CHECK0: define internal void @HelperB()
-; SPLIT3-CHECK0: define internal void @HelperC()
-; SPLIT3-CHECK0: define amdgpu_kernel void @AB()
-; SPLIT3-CHECK0: define amdgpu_kernel void @BC()
-
-; SPLIT3-CHECK1: define amdgpu_kernel void @A()
-; SPLIT3-CHECK1: define internal void @HelperA()
-; SPLIT3-CHECK1: define amdgpu_kernel void @C()
-; SPLIT3-CHECK1: define internal void @HelperC()
-
-; SPLIT3-CHECK2: define internal void @HelperA()
-; SPLIT3-CHECK2: define amdgpu_kernel void @B()
-; SPLIT3-CHECK2: define internal void @HelperB()
-; SPLIT3-CHECK2: define internal void @HelperC()
-; SPLIT3-CHECK2: define amdgpu_kernel void @ABC()
-
-; --------------------------------------------
-
-; SPLIT5-CHECK0: define amdgpu_kernel void @A()
-; SPLIT5-CHECK0: define internal void @HelperA()
-; SPLIT5-CHECK0: define amdgpu_kernel void @B()
-; SPLIT5-CHECK0: define internal void @HelperB()
-
-; SPLIT5-CHECK1: define internal void @HelperB()
-; SPLIT5-CHECK1: define internal void @HelperC()
-; SPLIT5-CHECK1: define amdgpu_kernel void @BC
-
-; SPLIT5-CHECK2: define internal void @HelperA()
-; SPLIT5-CHECK2: define internal void @HelperB()
-; SPLIT5-CHECK2: define amdgpu_kernel void @AB()
-
-; SPLIT5-CHECK3: define amdgpu_kernel void @C()
-; SPLIT5-CHECK3: define internal void @HelperC()
-
-; SPLIT5-CHECK4: define internal void @HelperA()
-; SPLIT5-CHECK4: define internal void @HelperB()
-; SPLIT5-CHECK4: define internal void @HelperC()
-; SPLIT5-CHECK4: define amdgpu_kernel void @ABC()
-
-define amdgpu_kernel void @A() {
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperA()
-  ret void
-}
-
-define internal void @HelperA() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperB()
-  ret void
-}
-
-define internal void @HelperB() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  store volatile i64 42, ptr null
-  call void @HelperC()
-  ret void
-}
-
-define internal void @HelperC() {
-  store volatile i32 42, ptr null
-  ret void
-}
-
-define amdgpu_kernel void @AB() {
-  store volatile i32 42, ptr null
-  call void @HelperA()
-  call void @HelperB()
-  ret void
-}
-
-define amdgpu_kernel void @BC() {
-  store volatile i32 42, ptr null
-  store volatile i32 42, ptr null
-  call void @HelperB()
-  call void @HelperC()
-  ret void
-}
-
-define amdgpu_kernel void @ABC() {
-  call void @HelperA()
-  call void @HelperB()
-  call void @HelperC()
-  ret void
-}


