[llvm-branch-commits] [llvm] 92f6036 - Revert "[AMDGPU] Add AMDGPU-specific module splitting (#89245)"

Thu May 23 23:44:24 PDT 2024

Author: Vitaly Buka
Date: 2024-05-23T23:44:21-07:00
New Revision: 92f6036dfea9e64b141dcba8f8329ab17e53c3a5

URL: https://github.com/llvm/llvm-project/commit/92f6036dfea9e64b141dcba8f8329ab17e53c3a5
DIFF: https://github.com/llvm/llvm-project/commit/92f6036dfea9e64b141dcba8f8329ab17e53c3a5.diff

LOG: Revert "[AMDGPU] Add AMDGPU-specific module splitting (#89245)"

This reverts commit d7c37130008374341e79c355ad85cc48942136ff.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
    llvm/lib/Target/AMDGPU/CMakeLists.txt

Removed: 
    llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
    llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
    llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
    llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll
    llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll
    llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
    llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
deleted file mode 100644
index 56e275ce707b6..0000000000000

--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ /dev/null
@@ -1,744 +0,0 @@
-//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Implements a module splitting algorithm designed to support the
-/// FullLTO --lto-partitions option for parallel codegen. This is completely
-/// 
diff erent from the common SplitModule pass, as this system is designed with
-/// AMDGPU in mind.
-///
-/// The basic idea of this module splitting implementation is the same as
-/// SplitModule: load-balance the module's functions across a set of N
-/// partitions to allow parallel codegen. However, it does it very
-/// 
diff erently than the target-agnostic variant:
-///   - Kernels are used as the module's "roots".
-///     They're known entry points on AMDGPU, and everything else is often
-///     internal only.
-///   - Each kernel has a set of dependencies, and when a kernel and its
-///     dependencies is considered "big", we try to put it in a partition where
-///     most dependencies are already imported, to avoid duplicating large
-///     amounts of code.
-///   - There's special care for indirect calls in order to ensure
-///     AMDGPUResourceUsageAnalysis can work correctly.
-///
-/// This file also includes a more elaborate logging system to enable
-/// users to easily generate logs that (if desired) do not include any value
-/// names, in order to not leak information about the source file.
-/// Such logs are very helpful to understand and fix potential issues with
-/// module splitting.
-
-#include "AMDGPUSplitModule.h"
-#include "AMDGPUTargetMachine.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/SHA256.h"
-#include "llvm/Support/Threading.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-split-module"
-
-namespace {
-
-static cl::opt<float> LargeKernelFactor(
-    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
-    cl::Hidden,
-    cl::desc(
-        "consider a kernel as large and needing special treatment when it "
-        "exceeds the average cost of a partition by this factor; e;g. 2.0 "
-        "means if the kernel and its dependencies is 2 times bigger than "
-        "an average partition; 0 disables large kernels handling entirely"));
-
-static cl::opt<float> LargeKernelOverlapForMerge(
-    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
-    cl::Hidden,
-    cl::desc("defines how much overlap between two large kernel's dependencies "
-             "is needed to put them in the same partition"));
-
-static cl::opt<bool> NoExternalizeGlobals(
-    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
-    cl::desc("disables externalization of global variable with local linkage; "
-             "may cause globals to be duplicated which increases binary size"));
-
-static cl::opt<std::string>
-    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
-              cl::desc("output directory for AMDGPU module splitting logs"));
-
-static cl::opt<bool>
-    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
-               cl::desc("hash value names before printing them in the AMDGPU "
-                        "module splitting logs"));
-
-using CostType = InstructionCost::CostType;
-using PartitionID = unsigned;
-
-static bool isEntryPoint(const Function *F) {
-  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
-}
-
-static std::string getName(const Value &V) {
-  static bool HideNames;
-
-  static llvm::once_flag HideNameInitFlag;
-  llvm::call_once(HideNameInitFlag, [&]() {
-    if (LogPrivate.getNumOccurrences())
-      HideNames = LogPrivate;
-    else {
-      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
-      HideNames = (EV.value_or("0") != "0");
-    }
-  });
-
-  if (!HideNames)
-    return V.getName().str();
-  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
-               /*LowerCase=*/true);
-}
-
-/// Main logging helper.
-///
-/// Logging can be configured by the following environment variable.
-///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
-///     If set, uses <filepath> as the directory to write logfiles to
-///     each time module splitting is used.
-///   AMD_SPLIT_MODULE_LOG_PRIVATE
-///     If set to anything other than zero, all names are hidden.
-///
-/// Both environment variables have corresponding CL options which
-/// takes priority over them.
-///
-/// Any output printed to the log files is also printed to dbgs() when -debug is
-/// used and LLVM_DEBUG is defined.
-///
-/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
-/// cannot be removed from the code (by building without debug). This probably
-/// has a small performance cost because if some computation/formatting is
-/// needed for logging purpose, it may be done everytime only to be ignored
-/// by the logger.
-///
-/// As this pass only runs once and is not doing anything computationally
-/// expensive, this is likely a reasonable trade-off.
-///
-/// If some computation should really be avoided when unused, users of the class
-/// can check whether any logging will occur by using the bool operator.
-///
-/// \code
-///   if (SML) {
-///     // Executes only if logging to a file or if -debug is available and
-///     used.
-///   }
-/// \endcode
-class SplitModuleLogger {
-public:
-  SplitModuleLogger(const Module &M) {
-    std::string LogDir = LogDirOpt;
-    if (LogDir.empty())
-      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
-
-    // No log dir specified means we don't need to log to a file.
-    // We may still log to dbgs(), though.
-    if (LogDir.empty())
-      return;
-
-    // If a log directory is specified, create a new file with a unique name in
-    // that directory.
-    int Fd;
-    SmallString<0> PathTemplate;
-    SmallString<0> RealPath;
-    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
-    if (auto Err =
-            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
-      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
-                             "': " + Err.message(),
-                         /*CrashDiag=*/false);
-    }
-
-    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
-  }
-
-  bool hasLogFile() const { return FileOS != nullptr; }
-
-  raw_ostream &logfile() {
-    assert(FileOS && "no logfile!");
-    return *FileOS;
-  }
-
-  /// \returns true if this SML will log anything either to a file or dbgs().
-  /// Can be used to avoid expensive computations that are ignored when logging
-  /// is disabled.
-  operator bool() const {
-    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
-  }
-
-private:
-  std::unique_ptr<raw_fd_ostream> FileOS;
-};
-
-template <typename Ty>
-static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
-  static_assert(
-      !std::is_same_v<Ty, Value>,
-      "do not print values to logs directly, use handleName instead!");
-  LLVM_DEBUG(dbgs() << Val);
-  if (SML.hasLogFile())
-    SML.logfile() << Val;
-  return SML;
-}
-
-/// Calculate the cost of each function in \p M
-/// \param SML Log Helper
-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
-/// \param M Module to analyze.
-/// \param CostMap[out] Resulting Function -> Cost map.
-/// \return The module's total cost.
-static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
-                       Module &M,
-                       DenseMap<const Function *, CostType> &CostMap) {
-  CostType ModuleCost = 0;
-  CostType KernelCost = 0;
-
-  for (auto &Fn : M) {
-    if (Fn.isDeclaration())
-      continue;
-
-    CostType FnCost = 0;
-    TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
-
-    for (const auto &BB : Fn) {
-      for (const auto &I : BB) {
-        auto Cost =
-            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
-        assert(Cost != InstructionCost::getMax());
-        // Assume expensive if we can't tell the cost of an instruction.
-        CostType CostVal =
-            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
-        assert((FnCost + CostVal) >= FnCost && "Overflow!");
-        FnCost += CostVal;
-      }
-    }
-
-    assert(FnCost != 0);
-
-    CostMap[&Fn] = FnCost;
-    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
-    ModuleCost += FnCost;
-
-    if (isEntryPoint(&Fn))
-      KernelCost += FnCost;
-  }
-
-  CostType FnCost = (ModuleCost - KernelCost);
-  SML << "=> Total Module Cost: " << ModuleCost << '\n'
-      << "  => KernelCost: " << KernelCost << " ("
-      << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
-      << "  => FnsCost: " << FnCost << " ("
-      << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n";
-
-  return ModuleCost;
-}
-
-static bool canBeIndirectlyCalled(const Function &F) {
-  if (F.isDeclaration() || isEntryPoint(&F))
-    return false;
-  return !F.hasLocalLinkage() ||
-         F.hasAddressTaken(/*PutOffender=*/nullptr,
-                           /*IgnoreCallbackUses=*/false,
-                           /*IgnoreAssumeLikeCalls=*/true,
-                           /*IgnoreLLVMUsed=*/true,
-                           /*IgnoreARCAttachedCall=*/false,
-                           /*IgnoreCastedDirectCall=*/true);
-}
-
-/// When a kernel or any of its callees performs an indirect call, this function
-/// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the kernel.
-///
-/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
-/// presence of an indirect call, the function's resource usage is the same as
-/// the most expensive function in the module.
-/// \param M    The module.
-/// \param Fns[out] Resulting list of functions.
-static void addAllIndirectCallDependencies(const Module &M,
-                                           DenseSet<const Function *> &Fns) {
-  for (const auto &Fn : M) {
-    if (canBeIndirectlyCalled(Fn))
-      Fns.insert(&Fn);
-  }
-}
-
-/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
-/// callee until all reachable functions have been gathered.
-///
-/// \param SML Log Helper
-/// \param CG Call graph for \p Fn's module.
-/// \param Fn Current function to look at.
-/// \param Fns[out] Resulting list of functions.
-/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
-/// point, either in \p Fn or in one of the function it calls. When that
-/// happens, we fall back to adding all callable functions inside \p Fn's module
-/// to \p Fns.
-static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
-                               const Function &Fn,
-                               DenseSet<const Function *> &Fns,
-                               bool &HadIndirectCall) {
-  assert(!Fn.isDeclaration());
-
-  const Module &M = *Fn.getParent();
-  SmallVector<const Function *> WorkList({&Fn});
-  while (!WorkList.empty()) {
-    const auto &CurFn = *WorkList.pop_back_val();
-    assert(!CurFn.isDeclaration());
-
-    // Scan for an indirect call. If such a call is found, we have to
-    // conservatively assume this can call all non-entrypoint functions in the
-    // module.
-
-    for (auto &CGEntry : *CG[&CurFn]) {
-      auto *CGNode = CGEntry.second;
-      auto *Callee = CGNode->getFunction();
-      if (!Callee) {
-        // Functions have an edge towards CallsExternalNode if they're external
-        // declarations, or if they do an indirect call. As we only process
-        // definitions here, we know this means the function has an indirect
-        // call. We then have to conservatively assume this can call all
-        // non-entrypoint functions in the module.
-        if (CGNode != CG.getCallsExternalNode())
-          continue; // this is another function-less node we don't care about.
-
-        SML << "Indirect call detected in " << getName(CurFn)
-            << " - treating all non-entrypoint functions as "
-               "potential dependencies\n";
-
-        // TODO: Print an ORE as well ?
-        addAllIndirectCallDependencies(M, Fns);
-        HadIndirectCall = true;
-        return;
-      }
-
-      if (Callee->isDeclaration())
-        continue;
-
-      auto [It, Inserted] = Fns.insert(Callee);
-      if (Inserted)
-        WorkList.push_back(Callee);
-    }
-  }
-}
-
-/// Contains information about a kernel and its dependencies.
-struct KernelWithDependencies {
-  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
-                         const DenseMap<const Function *, CostType> &FnCosts,
-                         const Function *Fn)
-      : Fn(Fn) {
-    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
-    TotalCost = FnCosts.at(Fn);
-    for (const auto *Dep : Dependencies) {
-      TotalCost += FnCosts.at(Dep);
-
-      // We cannot duplicate functions with external linkage, or functions that
-      // may be overriden at runtime.
-      HasNonDuplicatableDependecy |=
-          (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
-    }
-  }
-
-  const Function *Fn = nullptr;
-  DenseSet<const Function *> Dependencies;
-  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
-  bool HasIndirectCall = false;
-  /// Whether any of \p Fn's dependencies cannot be duplicated.
-  bool HasNonDuplicatableDependecy = false;
-
-  CostType TotalCost = 0;
-
-  /// \returns true if this kernel and its dependencies can be considered large
-  /// according to \p Threshold.
-  bool isLarge(CostType Threshold) const {
-    return TotalCost > Threshold && !Dependencies.empty();
-  }
-};
-
-/// Calculates how much overlap there is between \p A and \p B.
-/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
-/// and B have no shared elements. Kernels do not count in overlap calculation.
-static float calculateOverlap(const DenseSet<const Function *> &A,
-                              const DenseSet<const Function *> &B) {
-  DenseSet<const Function *> Total;
-  for (const auto *F : A) {
-    if (!isEntryPoint(F))
-      Total.insert(F);
-  }
-
-  if (Total.empty())
-    return 0.0f;
-
-  unsigned NumCommon = 0;
-  for (const auto *F : B) {
-    if (isEntryPoint(F))
-      continue;
-
-    auto [It, Inserted] = Total.insert(F);
-    if (!Inserted)
-      ++NumCommon;
-  }
-
-  return static_cast<float>(NumCommon) / Total.size();
-}
-
-/// Performs all of the partitioning work on \p M.
-/// \param SML Log Helper
-/// \param M Module to partition.
-/// \param NumParts Number of partitions to create.
-/// \param ModuleCost Total cost of all functions in \p M.
-/// \param FnCosts Map of Function -> Cost
-/// \param WorkList Kernels and their dependencies to process in order.
-/// \returns The created partitions (a vector of size \p NumParts )
-static std::vector<DenseSet<const Function *>>
-doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
-               CostType ModuleCost,
-               const DenseMap<const Function *, CostType> &FnCosts,
-               const SmallVector<KernelWithDependencies> &WorkList) {
-
-  SML << "\n--Partitioning Starts--\n";
-
-  // Calculate a "large kernel threshold". When more than one kernel's total
-  // import cost exceeds this value, we will try to merge it with other,
-  // similarly large kernels.
-  //
-  // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
-  // assign X to a partition as usual, but when we get to Y, we check if it's
-  // worth also putting it in Y's partition.
-  const CostType LargeKernelThreshold =
-      LargeKernelFactor ? ((ModuleCost / NumParts) * LargeKernelFactor)
-                        : std::numeric_limits<CostType>::max();
-
-  std::vector<DenseSet<const Function *>> Partitions;
-  Partitions.resize(NumParts);
-
-  // Assign a partition to each kernel, and try to keep the partitions more or
-  // less balanced. We do that through a priority queue sorted in reverse, so we
-  // can always look at the partition with the least content.
-  //
-  // There are some cases where we will be deliberately unbalanced though.
-  //  - Large kernels: we try to merge with existing partitions to reduce code
-  //  duplication.
-  //  - Kernels with indirect or external calls always go in the first partition
-  //  (P0).
-  auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
-                              const std::pair<PartitionID, CostType> &b) {
-    // When two partitions have the same cost, assign to the one with the
-    // biggest ID first. This allows us to put things in P0 last, because P0 may
-    // have other stuff added later.
-    if (a.second == b.second)
-      return a.first < b.first;
-    return a.second > b.second;
-  };
-
-  // We can't use priority_queue here because we need to be able to access any
-  // element. This makes this a bit inefficient as we need to sort it again
-  // everytime we change it, but it's a very small array anyway (likely under 64
-  // partitions) so it's a cheap operation.
-  std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
-  for (unsigned I = 0; I < NumParts; ++I)
-    BalancingQueue.push_back(std::make_pair(I, 0));
-
-  // Helper function to handle assigning a kernel to a partition. This takes
-  // care of updating the balancing queue.
-  const auto AssignToPartition = [&](PartitionID PID,
-                                     const KernelWithDependencies &KWD) {
-    auto &FnsInPart = Partitions[PID];
-    FnsInPart.insert(KWD.Fn);
-    FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
-
-    SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n  ->  ";
-    if (!KWD.Dependencies.empty()) {
-      SML << KWD.Dependencies.size() << " dependencies added\n";
-    };
-
-    // Update the balancing queue. we scan backwards because in the common case
-    // the partition is at the end.
-    for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
-      if (QueuePID == PID) {
-        CostType NewCost = 0;
-        for (auto *Fn : Partitions[PID])
-          NewCost += FnCosts.at(Fn);
-
-        SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
-        if (Cost) {
-          SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
-              << "% increase)";
-        }
-        SML << '\n';
-
-        Cost = NewCost;
-      }
-    }
-
-    sort(BalancingQueue, ComparePartitions);
-  };
-
-  for (auto &CurKernel : WorkList) {
-    // When a kernel has indirect calls, it must stay in the first partition
-    // alongside every reachable non-entry function. This is a nightmare case
-    // for splitting as it severely limits what we can do.
-    if (CurKernel.HasIndirectCall) {
-      SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
-          << " defaulting to P0\n";
-      AssignToPartition(0, CurKernel);
-      continue;
-    }
-
-    // When a kernel has non duplicatable dependencies, we have to keep it in
-    // the first partition as well. This is a conservative approach, a
-    // finer-grained approach could keep track of which dependencies are
-    // non-duplicatable exactly and just make sure they're grouped together.
-    if (CurKernel.HasNonDuplicatableDependecy) {
-      SML << "Kernel with externally visible dependency "
-          << getName(*CurKernel.Fn) << " defaulting to P0\n";
-      AssignToPartition(0, CurKernel);
-      continue;
-    }
-
-    // Be smart with large kernels to avoid duplicating their dependencies.
-    if (CurKernel.isLarge(LargeKernelThreshold)) {
-      assert(LargeKernelOverlapForMerge >= 0.0f &&
-             LargeKernelOverlapForMerge <= 1.0f);
-      SML << "Large Kernel: " << getName(*CurKernel.Fn)
-          << " - looking for partition with at least "
-          << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n";
-
-      bool Assigned = false;
-      for (const auto &[PID, Fns] : enumerate(Partitions)) {
-        float Overlap = calculateOverlap(CurKernel.Dependencies, Fns);
-        SML << "  => " << format("%0.2f", Overlap * 100) << "% overlap with P"
-            << PID << '\n';
-        if (Overlap > LargeKernelOverlapForMerge) {
-          SML << "  selecting P" << PID << '\n';
-          AssignToPartition(PID, CurKernel);
-          Assigned = true;
-        }
-      }
-
-      if (Assigned)
-        continue;
-    }
-
-    // Normal "load-balancing", assign to partition with least pressure.
-    auto [PID, CurCost] = BalancingQueue.back();
-    AssignToPartition(PID, CurKernel);
-  }
-
-  // Work is mostly done now, verify the partioning and add all functions we may
-  // have missed (= unreachable, or we don't understand how they're reached) to
-  // P0.
-  DenseSet<const Function *> AllFunctions;
-  for (const auto &[Idx, Part] : enumerate(Partitions)) {
-    CostType Cost = 0;
-    for (auto *Fn : Part) {
-      // external linkage functions should exclusively be in the first partition
-      // at this stage. In theory, we should only ever see external linkage
-      // functions here if they're kernels, or if they've been added due to a
-      // kernel using indirect calls somewhere in its CallGraph.
-      assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn)));
-      Cost += FnCosts.at(Fn);
-    }
-    SML << "P" << Idx << " has a total cost of " << Cost << " ("
-        << format("%0.2f", (float(Cost) / ModuleCost) * 100)
-        << "% of source module)\n";
-    AllFunctions.insert(Part.begin(), Part.end());
-  }
-
-  // Add missed functions to P0. This will take care of adding things like
-  // external functions with no callers in the module to P0. This should be
-  // fairly rare as AMDGPU internalizes everything in most cases, so unused
-  // internal functions would get removed.
-  for (auto &Fn : M) {
-    if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) {
-      SML << getName(Fn) << " has no partition assigned, defaulting to P0\n";
-      Partitions[0].insert(&Fn);
-    }
-  }
-
-  SML << "--Partitioning Done--\n\n";
-
-  return Partitions;
-}
-
-static void externalize(GlobalValue &GV) {
-  if (GV.hasLocalLinkage()) {
-    GV.setLinkage(GlobalValue::ExternalLinkage);
-    GV.setVisibility(GlobalValue::HiddenVisibility);
-  }
-
-  // Unnamed entities must be named consistently between modules. setName will
-  // give a distinct name to each such entity.
-  if (!GV.hasName())
-    GV.setName("__llvmsplit_unnamed");
-}
-} // end anonymous namespace
-
-void llvm::splitAMDGPUModule(
-    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
-
-  SplitModuleLogger SML(M);
-
-  CallGraph CG(M);
-
-  // Externalize functions whose address are taken.
-  //
-  // This is needed because partitioning is purely based on calls, but sometimes
-  // a kernel/function may just look at the address of another local function
-  // and not do anything (no calls). After partitioning, that local function may
-  // end up in a 
diff erent module (so it's just a declaration in the module
-  // where its address is taken), which emits a "undefined hidden symbol" linker
-  // error.
-  //
-  // Additionally, it guides partitioning to not duplicate this function if it's
-  // called directly at some point.
-  for (auto &Fn : M) {
-    if (Fn.hasAddressTaken()) {
-      if (Fn.hasLocalLinkage()) {
-        SML << "[externalize] " << Fn.getName()
-            << " because its address is taken\n";
-      }
-      externalize(Fn);
-    }
-  }
-
-  // Externalize local GVs, which avoids duplicating their initializers, which
-  // in turns helps keep code size in check.
-  if (!NoExternalizeGlobals) {
-    for (auto &GV : M.globals()) {
-      if (GV.hasLocalLinkage())
-        SML << "[externalize] GV " << GV.getName() << '\n';
-      externalize(GV);
-    }
-  }
-
-  // Start by calculating the cost of every function in the module, as well as
-  // the module's overall cost.
-  DenseMap<const Function *, CostType> FnCosts;
-  const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts);
-
-  // Gather every kernel into a WorkList, then sort it by descending total cost
-  // of the kernel so the biggest kernels are seen first.
-  SmallVector<KernelWithDependencies> WorkList;
-  for (auto &Fn : M) {
-    if (isEntryPoint(&Fn) && !Fn.isDeclaration())
-      WorkList.emplace_back(SML, CG, FnCosts, &Fn);
-  }
-  sort(WorkList, [&](auto &A, auto &B) {
-    // Sort by total cost, and if the total cost is identical, sort
-    // alphabetically.
-    if (A.TotalCost == B.TotalCost)
-      return A.Fn->getName() < B.Fn->getName();
-    return A.TotalCost > B.TotalCost;
-  });
-
-  if (SML) {
-    SML << "Worklist\n";
-    for (const auto &KWD : WorkList) {
-      SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost
-          << " indirect:" << KWD.HasIndirectCall
-          << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy
-          << ")\n";
-      for (const auto *Dep : KWD.Dependencies)
-        SML << "  [Dep] " << getName(*Dep) << '\n';
-    }
-  }
-
-  // This performs all of the partitioning work.
-  auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList);
-  assert(Partitions.size() == N);
-
-  // If we didn't externalize GVs, then local GVs need to be conservatively
-  // imported into every module (including their initializers), and then cleaned
-  // up afterwards.
-  const auto NeedsConservativeImport = [&](const GlobalValue *GV) {
-    // We conservatively import private/internal GVs into every module and clean
-    // them up afterwards.
-    const auto *Var = dyn_cast<GlobalVariable>(GV);
-    return Var && Var->hasLocalLinkage();
-  };
-
-  SML << "Creating " << N << " modules...\n";
-  unsigned TotalFnImpls = 0;
-  for (unsigned I = 0; I < N; ++I) {
-    const auto &FnsInPart = Partitions[I];
-
-    ValueToValueMapTy VMap;
-    std::unique_ptr<Module> MPart(
-        CloneModule(M, VMap, [&](const GlobalValue *GV) {
-          // Functions go in their assigned partition.
-          if (const auto *Fn = dyn_cast<Function>(GV)) {
-// Check we don't import an external linkage function in any
-// partition other than P0.
-#ifndef NDEBUG
-            if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) {
-              assert((I == 0) == FnsInPart.contains(Fn));
-            }
-#endif
-            return FnsInPart.contains(Fn);
-          }
-
-          if (NeedsConservativeImport(GV))
-            return true;
-
-          // Everything else goes in the first partition.
-          return I == 0;
-        }));
-
-    // Clean-up conservatively imported GVs without any users.
-    for (auto &GV : make_early_inc_range(MPart->globals())) {
-      if (NeedsConservativeImport(&GV) && GV.use_empty())
-        GV.eraseFromParent();
-    }
-
-    unsigned NumAllFns = 0, NumKernels = 0;
-    for (auto &Cur : *MPart) {
-      if (!Cur.isDeclaration()) {
-        ++NumAllFns;
-        if (isEntryPoint(&Cur))
-          ++NumKernels;
-      }
-    }
-    TotalFnImpls += NumAllFns;
-    SML << "  - Module " << I << " with " << NumAllFns << " functions ("
-        << NumKernels << " kernels)\n";
-    ModuleCallback(std::move(MPart));
-  }
-
-  SML << TotalFnImpls << " function definitions across all modules ("
-      << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100)
-      << "% of original module)\n";
-}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
deleted file mode 100644
index 6171643bd4adc..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H
-#define LLVM_TARGET_AMDGPUSPLITMODULE_H
-
-#include "llvm/ADT/STLFunctionalExtras.h"
-#include <memory>
-
-namespace llvm {
-
-class Module;
-class AMDGPUTargetMachine;
-
-/// Splits the module M into N linkable partitions. The function ModuleCallback
-/// is called N times passing each individual partition as the MPart argument.
-void splitAMDGPUModule(
-    const AMDGPUTargetMachine &TM, Module &M, unsigned N,
-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
-
-} // end namespace llvm
-
-#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index dbbfe34a63863..20329dea60275 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -21,7 +21,6 @@
 #include "AMDGPUIGroupLP.h"
 #include "AMDGPUMacroFusion.h"
 #include "AMDGPURegBankSelect.h"
-#include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUUnifyDivergentExitNodes.h"
@@ -816,13 +815,6 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
   return AMDGPUAS::FLAT_ADDRESS;
 }
 
-bool AMDGPUTargetMachine::splitModule(
-    Module &M, unsigned NumParts,
-    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
-  splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 // GCN Target Machine (SI+)
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 2cfd232483a8a..e48cb8fdc6576 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -73,10 +73,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
   getPredicatedAddrSpace(const Value *V) const override;
 
   unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
-
-  bool splitModule(Module &M, unsigned NumParts,
-                   function_ref<void(std::unique_ptr<Module> MPart)>
-                       ModuleCallback) const override;
 };
 
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c992352cb78da..ead81b402eb76 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -98,7 +98,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPURewriteOutArguments.cpp
   AMDGPURewriteUndefForPHI.cpp
   AMDGPUSetWavePriority.cpp
-  AMDGPUSplitModule.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
deleted file mode 100644
index 8b76237efa325..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; 3 kernels:
-;   - A does a direct call to HelperA
-;   - B is storing @HelperA
-;   - C does a direct call to HelperA
-;
-; The helper functions will get externalized, which will force A and C into P0 as
-; external functions cannot be duplicated.
-
-; CHECK0: define hidden void @HelperA()
-; CHECK0: define amdgpu_kernel void @A()
-; CHECK0: declare amdgpu_kernel void @B(ptr)
-; CHECK0: define amdgpu_kernel void @C()
-
-; CHECK1: declare hidden void @HelperA()
-; CHECK1: declare amdgpu_kernel void @A()
-; CHECK1: declare amdgpu_kernel void @B(ptr)
-; CHECK1: declare amdgpu_kernel void @C()
-
-; CHECK2: declare hidden void @HelperA()
-; CHECK2: declare amdgpu_kernel void @A()
-; CHECK2: define amdgpu_kernel void @B(ptr %dst)
-; CHECK2: declare amdgpu_kernel void @C()
-
-define internal void @HelperA() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  call void @HelperA()
-  ret void
-}
-
-define amdgpu_kernel void @B(ptr %dst) {
-  store ptr @HelperA, ptr %dst
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @HelperA()
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
deleted file mode 100644
index 46d7d9783aeae..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-
-; 2 kernels:
-;   - A is isolated
-;   - B is storing @HelperA/B's address
-;
-; The helper functions should get externalized (become hidden w/ external linkage)
-
-; CHECK0: define hidden void @HelperA()
-; CHECK0: define hidden void @HelperB()
-; CHECK0: define amdgpu_kernel void @A()
-; CHECK0: declare amdgpu_kernel void @B(i1, ptr)
-
-; CHECK1: declare hidden void @HelperA()
-; CHECK1: declare hidden void @HelperB()
-; CHECK1: declare amdgpu_kernel void @A()
-; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst)
-
-define internal void @HelperA() {
-  ret void
-}
-
-define internal void @HelperB() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  ret void
-}
-
-define amdgpu_kernel void @B(i1 %cond, ptr %dst) {
-  %addr = select i1 %cond, ptr @HelperA, ptr @HelperB
-  store ptr %addr, ptr %dst
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
deleted file mode 100644
index 6a07ed51ba1be..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel
-; REQUIRES: asserts
-
-; SHA256 of the kernel names.
-
-; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c
-; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59
-; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55
-
-define amdgpu_kernel void @MyCustomKernel0() {
-  ret void
-}
-
-define amdgpu_kernel void @MyCustomKernel1() {
-  ret void
-}
-
-define amdgpu_kernel void @MyCustomKernel2() {
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
deleted file mode 100644
index c2746d1398924..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-
-; 3 kernels:
-;   - A calls nothing
-;   - B calls @PerryThePlatypus
-;   - C calls @Perry, an alias of @PerryThePlatypus
-;
-; We should see through the alias and put B/C in the same
-; partition.
-;
-; Additionally, @PerryThePlatypus gets externalized as
-; the alias counts as taking its address.
-
-; CHECK0-NOT: define
-; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus
-; CHECK0: define hidden void @PerryThePlatypus()
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @A
-; CHECK1-NOT: define
-
- at Perry = internal alias ptr(), ptr @PerryThePlatypus
-
-define internal void @PerryThePlatypus() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @PerryThePlatypus()
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @Perry()
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
deleted file mode 100644
index 4635264aefb39..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; 3 kernels with each their own dependencies should go into 3
-; distinct partitions. The most expensive kernel should be
-; seen first and go into the last partition.
-
-; CHECK0-NOT: define
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0: define internal void @HelperC
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @A
-; CHECK1: define internal void @HelperA
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define amdgpu_kernel void @B
-; CHECK2: define internal void @HelperB
-; CHECK2-NOT: define
-
-
-define amdgpu_kernel void @A() {
-  call void @HelperA()
-  ret void
-}
-
-define internal void @HelperA() {
-  ret void
-}
-
-define amdgpu_kernel void @B(ptr %x) {
-  store i64 42, ptr %x
-  store i64 43, ptr %x
-  store i64 44, ptr %x
-  call void @HelperB()
-  ret void
-}
-
-define internal void @HelperB() {
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @HelperC()
-  ret void
-}
-
-define internal void @HelperC() {
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll
deleted file mode 100644
index bea527f15bbaa..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; 3 kernels with each their own dependencies should go into 3
-; distinct partitions.
-
-; CHECK0-NOT: define
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0: define internal void @HelperC
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @B
-; CHECK1: define internal void @HelperB
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define internal void @HelperA
-; CHECK2-NOT: define
-
-
-define amdgpu_kernel void @A() {
-  call void @HelperA()
-  ret void
-}
-
-define internal void @HelperA() {
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @HelperB()
-  ret void
-}
-
-define internal void @HelperB() {
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @HelperC()
-  ret void
-}
-
-define internal void @HelperC() {
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll
deleted file mode 100644
index 64839f8d8456a..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; 3 kernels share a common helper, that helper should be
-; cloned in all partitions.
-
-; CHECK0-NOT: define
-; CHECK0: define internal void @Helper
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define internal void @Helper
-; CHECK1: define amdgpu_kernel void @B
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define internal void @Helper
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2-NOT: define
-
-define internal void @Helper() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  call void @Helper()
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @Helper()
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @Helper()
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
deleted file mode 100644
index 435e97a581340..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s
-
-; Both overridable helper should go in P0.
-
-; CHECK0-NOT: define
-; CHECK0: define available_externally void @OverridableHelper0()
-; CHECK0: define internal void @OverridableHelper1()
-; CHECK0: define amdgpu_kernel void @A
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define internal void @PrivateHelper1()
-; CHECK2: define amdgpu_kernel void @D
-; CHECK2-NOT: define
-
-; CHECK3-NOT: define
-; CHECK3: define internal void @PrivateHelper0()
-; CHECK3: define amdgpu_kernel void @C
-; CHECK3-NOT: define
-
-define available_externally void @OverridableHelper0() {
-  ret void
-}
-
-define internal void @OverridableHelper1() #0 {
-  ret void
-}
-
-define internal void @PrivateHelper0() {
-  ret void
-}
-
-define internal void @PrivateHelper1() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  call void @OverridableHelper0()
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @OverridableHelper1()
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @PrivateHelper0()
-  ret void
-}
-
-define amdgpu_kernel void @D() {
-  call void @PrivateHelper1()
-  ret void
-}
-
-attributes #0 = { nobuiltin }

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
deleted file mode 100644
index 9701ac35ce54e..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; We have 4 kernels:
-;   - Each kernel has an internal helper
-;   - @A and @B's helpers does an indirect call.
-;
-; We default to putting A/B in P0, alongside a copy
-; of all helpers who have their address taken.
-; The other kernels can still go into separate partitions.
-
-; CHECK0-NOT: define
-; CHECK0: define hidden void @HelperA
-; CHECK0: define hidden void @HelperB
-; CHECK0: define hidden void @CallCandidate
-; CHECK0-NOT: define {{.*}} @HelperC
-; CHECK0-NOT: define {{.*}} @HelperD
-; CHECK0: define amdgpu_kernel void @A
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define internal void @HelperD
-; CHECK1: define amdgpu_kernel void @D
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define internal void @HelperC
-; CHECK2: define amdgpu_kernel void @C
-; CHECK2-NOT: define
-
- at addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate]
-
-define internal void @HelperA(ptr %call) {
-  call void %call()
-  ret void
-}
-
-define internal void @HelperB(ptr %call) {
-  call void %call()
-  ret void
-}
-
-define internal void @CallCandidate() {
-  ret void
-}
-
-define internal void @HelperC() {
-  ret void
-}
-
-define internal void @HelperD() {
-  ret void
-}
-
-define amdgpu_kernel void @A(ptr %call) {
-  call void @HelperA(ptr %call)
-  ret void
-}
-
-define amdgpu_kernel void @B(ptr %call) {
-  call void @HelperB(ptr %call)
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @HelperC()
-  ret void
-}
-
-define amdgpu_kernel void @D() {
-  call void @HelperD()
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
deleted file mode 100644
index dc2c5c3c07bee..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; CHECK0-NOT: define
-; CHECK0: define void @ExternalHelper
-; CHECK0: define amdgpu_kernel void @A
-; CHECK0: define amdgpu_kernel void @B
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @D
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define amdgpu_kernel void @C
-; CHECK2-NOT: define
-
-define void @ExternalHelper() {
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  call void @ExternalHelper()
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @ExternalHelper()
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  ret void
-}
-
-define amdgpu_kernel void @D() {
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
deleted file mode 100644
index 0fc76934afc54..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; 3 kernels use private/internal global variables.
-; The GVs should be copied in each partition as needed.
-
-; CHECK0-NOT: define
-; CHECK0: @bar = internal constant ptr
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: @foo = private constant ptr
-; CHECK1: define amdgpu_kernel void @A
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: @foo = private constant ptr
-; CHECK2: @bar = internal constant ptr
-; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
-
- at foo = private constant ptr poison
- at bar = internal constant ptr poison
-
-define amdgpu_kernel void @A() {
-  store i32 42, ptr @foo
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  store i32 42, ptr @foo
-  store i32 42, ptr @bar
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  store i32 42, ptr @bar
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
deleted file mode 100644
index 7564662e7c7c0..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; 3 kernels use private/internal global variables.
-; The GVs should be copied in each partition as needed.
-
-; CHECK0-NOT: define
-; CHECK0: @foo = hidden constant ptr poison
-; CHECK0: @bar = hidden constant ptr poison
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: @foo = external hidden constant ptr{{$}}
-; CHECK1: @bar = external hidden constant ptr{{$}}
-; CHECK1: define amdgpu_kernel void @A
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: @foo = external hidden constant ptr{{$}}
-; CHECK2: @bar = external hidden constant ptr{{$}}
-; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
-
- at foo = private constant ptr poison
- at bar = internal constant ptr poison
-
-define amdgpu_kernel void @A() {
-  store i32 42, ptr @foo
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  store i32 42, ptr @foo
-  store i32 42, ptr @bar
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  store i32 42, ptr @bar
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll
deleted file mode 100644
index 5dfb95c5fc660..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; Test load balancing logic with 6 kernels.
-;
-; Kernels go from most expensive (A == 6) to least expensive (F == 1)
-;
-; Load balancing should work like this (current partition cost is in parens)
-;
-; Initial    -> [P0(0), P1(0), P2(0)]
-;
-; A(6) goes in 2 -> [P2(6), P0(0), P1(0)]
-; B(5) goes in 1 -> [P2(6), P1(5), P0(4)]
-; C(4) goes in 0 -> [P2(6), P1(5), P0(4)]
-
-; D(3) goes in 0 -> [P0(7), P2(6), P1(5)]
-; E(2) goes in 1 -> [P0(7), P1(7), P2(6)]
-; F(1) goes in 2 -> [P0(7), P1(7), P2(7)]
-
-; CHECK0-NOT: define
-; CHECK0: define amdgpu_kernel void @C
-; CHECK0: define amdgpu_kernel void @D
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @B
-; CHECK1: define amdgpu_kernel void @E
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define amdgpu_kernel void @F
-; CHECK2-NOT: define
-
-
-define amdgpu_kernel void @A(ptr %x) {
-  store i64 42, ptr %x
-  store i64 43, ptr %x
-  store i64 44, ptr %x
-  store i64 45, ptr %x
-  store i64 46, ptr %x
-  ret void
-}
-
-define amdgpu_kernel void @B(ptr %x) {
-  store i64 42, ptr %x
-  store i64 43, ptr %x
-  store i64 44, ptr %x
-  store i64 45, ptr %x
-  ret void
-}
-
-define amdgpu_kernel void @C(ptr %x) {
-  store i64 42, ptr %x
-  store i64 43, ptr %x
-  store i64 44, ptr %x
-  ret void
-}
-
-define amdgpu_kernel void @D(ptr %x) {
-  store i64 42, ptr %x
-  store i64 43, ptr %x
-  ret void
-}
-
-define amdgpu_kernel void @E(ptr %x) {
-  store i64 42, ptr %x
-  ret void
-}
-
-define amdgpu_kernel void @F() {
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll
deleted file mode 100644
index 8959acfcae542..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s
-
-; Check that 4 independent kernels get put into 4 
diff erent partitions.
-
-; CHECK0-NOT: define
-; CHECK0: define amdgpu_kernel void @D
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define amdgpu_kernel void @C
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
-
-; CHECK3-NOT: define
-; CHECK3: define amdgpu_kernel void @A
-; CHECK3-NOT: define
-
-define amdgpu_kernel void @A() {
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  ret void
-}
-
-define amdgpu_kernel void @D() {
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
deleted file mode 100644
index 4fdbac7d17897..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
+++ /dev/null
@@ -1,98 +0,0 @@
-; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=1.2 -amdgpu-module-splitting-large-kernel-merge-overlap=0.5
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
-
-; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
-; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s
-; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s
-; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s
-
-; 2 kernels (A/B) are large and share all their dependencies.
-; They should go in the same partition, the remaining kernel should
-; go somewhere else, and one partition should be empty.
-;
-; Also check w/o large kernels processing to verify they are indeed handled
-; 
diff erently.
-
-; CHECK0-NOT: define
-
-; CHECK1-NOT: define
-; CHECK1: define internal void @HelperC()
-; CHECK1: define amdgpu_kernel void @C
-; CHECK1-NOT: define
-
-; CHECK2-NOT: define
-; CHECK2: define internal void @large2()
-; CHECK2: define internal void @large1()
-; CHECK2: define internal void @large0()
-; CHECK2: define internal void @HelperA()
-; CHECK2: define internal void @HelperB()
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define amdgpu_kernel void @B
-; CHECK2-NOT: define
-
-; NOLARGEKERNELS-CHECK0-NOT: define
-; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
-; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C
-; NOLARGEKERNELS-CHECK0-NOT: define
-
-; NOLARGEKERNELS-CHECK1: define internal void @large2()
-; NOLARGEKERNELS-CHECK1: define internal void @large1()
-; NOLARGEKERNELS-CHECK1: define internal void @large0()
-; NOLARGEKERNELS-CHECK1: define internal void @HelperB()
-; NOLARGEKERNELS-CHECK1: define amdgpu_kernel void @B
-
-; NOLARGEKERNELS-CHECK2: define internal void @large2()
-; NOLARGEKERNELS-CHECK2: define internal void @large1()
-; NOLARGEKERNELS-CHECK2: define internal void @large0()
-; NOLARGEKERNELS-CHECK2: define internal void @HelperA()
-; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A
-
-define internal void @large2() {
-  store volatile i32 42, ptr null
-  call void @large2()
-  ret void
-}
-
-define internal void @large1() {
-  call void @large1()
-  call void @large2()
-  ret void
-}
-
-define internal void @large0() {
-  call void @large0()
-  call void @large1()
-  call void @large2()
-  ret void
-}
-
-define internal void @HelperA() {
-  call void @large0()
-  ret void
-}
-
-define internal void @HelperB() {
-  call void @large0()
-  ret void
-}
-
-define amdgpu_kernel void @A() {
-  call void @HelperA()
-  ret void
-}
-
-define amdgpu_kernel void @B() {
-  call void @HelperB()
-  ret void
-}
-
-define internal void @HelperC() {
-  ret void
-}
-
-define amdgpu_kernel void @C() {
-  call void @HelperC()
-  ret void
-}

diff  --git a/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg b/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg
deleted file mode 100644
index 6154a6c1c9061..0000000000000
--- a/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-if not "AMDGPU" in config.root.targets:
-  config.unsupported = True