[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (llvm#104763) (PR #106707)

Fri Aug 30 03:37:36 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Danial Klimkin (dklimkin)

<details>
<summary>Changes</summary>

* Revert "Fix MSVC "not all control paths return a value" warning. NFC."
Dep to revert c9b6e01b2e4fc930dac91dd44c0592ad7e36d967

* Revert "[AMDGPU] Graph-based Module Splitting Rewrite (#104763)"
Breaks tests.


---

Patch is 107.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106707.diff


17 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp (+530-1273) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll (+21-15) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll (+1-1) 
- (added) llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll (+20) 
- (added) llvm/test/tools/llvm-split/AMDGPU/debug-non-kernel-root.ll (+36) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/declarations.ll (+6-3) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll (+11-7) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll (+9-3) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll (+21-12) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll (+18-12) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll (+17-11) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll (+9-3) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll (+9-3) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll (+15-11) 
- (modified) llvm/test/tools/llvm-split/AMDGPU/non-kernels-dependency-indirect.ll (+15-15) 
- (removed) llvm/test/tools/llvm-split/AMDGPU/recursive-search-2.ll (-128) 
- (removed) llvm/test/tools/llvm-split/AMDGPU/recursive-search-8.ll (-128) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
index a5807a70582b39..df084cf41c4783 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -7,36 +7,33 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file Implements a module splitting algorithm designed to support the
-/// FullLTO --lto-partitions option for parallel codegen.
+/// FullLTO --lto-partitions option for parallel codegen. This is completely
+/// different from the common SplitModule pass, as this system is designed with
+/// AMDGPU in mind.
 ///
-/// The role of this module splitting pass is the same as
-/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
-/// across a set of N partitions to allow for parallel codegen.
+/// The basic idea of this module splitting implementation is the same as
+/// SplitModule: load-balance the module's functions across a set of N
+/// partitions to allow parallel codegen. However, it does it very
+/// differently than the target-agnostic variant:
+///   - The module has "split roots", which are kernels in the vast
+//      majority of cases.
+///   - Each root has a set of dependencies, and when a root and its
+///     dependencies is considered "big", we try to put it in a partition where
+///     most dependencies are already imported, to avoid duplicating large
+///     amounts of code.
+///   - There's special care for indirect calls in order to ensure
+///     AMDGPUResourceUsageAnalysis can work correctly.
 ///
-/// The similarities mostly end here, as this pass achieves load-balancing in a
-/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
-/// advantage of the structure of AMDGPU modules (which are mostly
-/// self-contained) to allow for more efficient splitting without affecting
-/// codegen negatively, or causing innaccurate resource usage analysis.
-///
-/// High-level pass overview:
-///   - SplitGraph & associated classes
-///      - Graph representation of the module and of the dependencies that
-///      matter for splitting.
-///   - RecursiveSearchSplitting
-///     - Core splitting algorithm.
-///   - SplitProposal
-///     - Represents a suggested solution for splitting the input module. These
-///     solutions can be scored to determine the best one when multiple
-///     solutions are available.
-///   - Driver/pass "run" function glues everything together.
+/// This file also includes a more elaborate logging system to enable
+/// users to easily generate logs that (if desired) do not include any value
+/// names, in order to not leak information about the source file.
+/// Such logs are very helpful to understand and fix potential issues with
+/// module splitting.
 
 #include "AMDGPUSplitModule.h"
 #include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -47,56 +44,44 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Timer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SHA256.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <iterator>
 #include <memory>
 #include <utility>
 #include <vector>
 
-#ifndef NDEBUG
-#include "llvm/Support/LockFileManager.h"
-#endif
+using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-split-module"
 
-namespace llvm {
 namespace {
 
-static cl::opt<unsigned> MaxDepth(
-    "amdgpu-module-splitting-max-depth",
-    cl::desc(
-        "maximum search depth. 0 forces a greedy approach. "
-        "warning: the algorithm is up to O(2^N), where N is the max depth."),
-    cl::init(8));
-
 static cl::opt<float> LargeFnFactor(
-    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
+    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
+    cl::Hidden,
     cl::desc(
-        "when max depth is reached and we can no longer branch out, this "
-        "value determines if a function is worth merging into an already "
-        "existing partition to reduce code duplication. This is a factor "
-        "of the ideal partition size, e.g. 2.0 means we consider the "
-        "function for merging if its cost (including its callees) is 2x the "
-        "size of an ideal partition."));
+        "consider a function as large and needing special treatment when the "
+        "cost of importing it into a partition"
+        "exceeds the average cost of a partition by this factor; e;g. 2.0 "
+        "means if the function and its dependencies is 2 times bigger than "
+        "an average partition; 0 disables large functions handling entirely"));
 
 static cl::opt<float> LargeFnOverlapForMerge(
-    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
-    cl::desc("when a function is considered for merging into a partition that "
-             "already contains some of its callees, do the merge if at least "
-             "n% of the code it can reach is already present inside the "
-             "partition; e.g. 0.7 means only merge >70%"));
+    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
+    cl::Hidden,
+    cl::desc(
+        "defines how much overlap between two large function's dependencies "
+        "is needed to put them in the same partition"));
 
 static cl::opt<bool> NoExternalizeGlobals(
     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
@@ -104,92 +89,142 @@ static cl::opt<bool> NoExternalizeGlobals(
              "may cause globals to be duplicated which increases binary size"));
 
 static cl::opt<std::string>
-    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
-                       cl::Hidden,
-                       cl::desc("output file to write out the dotgraph "
-                                "representation of the input module"));
+    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
+              cl::desc("output directory for AMDGPU module splitting logs"));
 
-static cl::opt<std::string> PartitionSummariesOutput(
-    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
-    cl::desc("output file to write out a summary of "
-             "the partitions created for each module"));
-
-#ifndef NDEBUG
 static cl::opt<bool>
-    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
-                cl::desc("use a lock file so only one process in the system "
-                         "can run this pass at once. useful to avoid mangled "
-                         "debug output in multithreaded environments."));
+    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
+               cl::desc("hash value names before printing them in the AMDGPU "
+                        "module splitting logs"));
 
-static cl::opt<bool>
-    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
-                        cl::Hidden,
-                        cl::desc("print all proposals received and whether "
-                                 "they were rejected or accepted"));
-#endif
+using CostType = InstructionCost::CostType;
+using PartitionID = unsigned;
+using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
 
-struct SplitModuleTimer : NamedRegionTimer {
-  SplitModuleTimer(StringRef Name, StringRef Desc)
-      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
-                         TimePassesIsEnabled) {}
-};
+static bool isEntryPoint(const Function *F) {
+  return AMDGPU::isEntryFunctionCC(F->getCallingConv());
+}
 
-//===----------------------------------------------------------------------===//
-// Utils
-//===----------------------------------------------------------------------===//
+static std::string getName(const Value &V) {
+  static bool HideNames;
 
-using CostType = InstructionCost::CostType;
-using FunctionsCostMap = DenseMap<const Function *, CostType>;
-using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
-static constexpr unsigned InvalidPID = -1;
+  static llvm::once_flag HideNameInitFlag;
+  llvm::call_once(HideNameInitFlag, [&]() {
+    if (LogPrivate.getNumOccurrences())
+      HideNames = LogPrivate;
+    else {
+      const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
+      HideNames = (EV.value_or("0") != "0");
+    }
+  });
 
-/// \param Num numerator
-/// \param Dem denominator
-/// \returns a printable object to print (Num/Dem) using "%0.2f".
-static auto formatRatioOf(CostType Num, CostType Dem) {
-  return format("%0.2f", (static_cast<double>(Num) / Dem) * 100);
+  if (!HideNames)
+    return V.getName().str();
+  return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
+               /*LowerCase=*/true);
 }
 
-/// Checks whether a given function is non-copyable.
+/// Main logging helper.
 ///
-/// Non-copyable functions cannot be cloned into multiple partitions, and only
-/// one copy of the function can be present across all partitions.
+/// Logging can be configured by the following environment variable.
+///   AMD_SPLIT_MODULE_LOG_DIR=<filepath>
+///     If set, uses <filepath> as the directory to write logfiles to
+///     each time module splitting is used.
+///   AMD_SPLIT_MODULE_LOG_PRIVATE
+///     If set to anything other than zero, all names are hidden.
 ///
-/// External functions fall into this category. If we were to clone them, we
-/// would end up with multiple symbol definitions and a very unhappy linker.
-static bool isNonCopyable(const Function &F) {
-  assert(AMDGPU::isEntryFunctionCC(F.getCallingConv())
-             ? F.hasExternalLinkage()
-             : true && "Kernel w/o external linkage?");
-  return F.hasExternalLinkage() || !F.isDefinitionExact();
-}
+/// Both environment variables have corresponding CL options which
+/// takes priority over them.
+///
+/// Any output printed to the log files is also printed to dbgs() when -debug is
+/// used and LLVM_DEBUG is defined.
+///
+/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
+/// cannot be removed from the code (by building without debug). This probably
+/// has a small performance cost because if some computation/formatting is
+/// needed for logging purpose, it may be done everytime only to be ignored
+/// by the logger.
+///
+/// As this pass only runs once and is not doing anything computationally
+/// expensive, this is likely a reasonable trade-off.
+///
+/// If some computation should really be avoided when unused, users of the class
+/// can check whether any logging will occur by using the bool operator.
+///
+/// \code
+///   if (SML) {
+///     // Executes only if logging to a file or if -debug is available and
+///     used.
+///   }
+/// \endcode
+class SplitModuleLogger {
+public:
+  SplitModuleLogger(const Module &M) {
+    std::string LogDir = LogDirOpt;
+    if (LogDir.empty())
+      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
+
+    // No log dir specified means we don't need to log to a file.
+    // We may still log to dbgs(), though.
+    if (LogDir.empty())
+      return;
+
+    // If a log directory is specified, create a new file with a unique name in
+    // that directory.
+    int Fd;
+    SmallString<0> PathTemplate;
+    SmallString<0> RealPath;
+    sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err =
+            sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
+      report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
+                             "': " + Err.message(),
+                         /*CrashDiag=*/false);
+    }
 
-/// If \p GV has local linkage, make it external + hidden.
-static void externalize(GlobalValue &GV) {
-  if (GV.hasLocalLinkage()) {
-    GV.setLinkage(GlobalValue::ExternalLinkage);
-    GV.setVisibility(GlobalValue::HiddenVisibility);
+    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
   }
 
-  // Unnamed entities must be named consistently between modules. setName will
-  // give a distinct name to each such entity.
-  if (!GV.hasName())
-    GV.setName("__llvmsplit_unnamed");
+  bool hasLogFile() const { return FileOS != nullptr; }
+
+  raw_ostream &logfile() {
+    assert(FileOS && "no logfile!");
+    return *FileOS;
+  }
+
+  /// \returns true if this SML will log anything either to a file or dbgs().
+  /// Can be used to avoid expensive computations that are ignored when logging
+  /// is disabled.
+  operator bool() const {
+    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+  }
+
+private:
+  std::unique_ptr<raw_fd_ostream> FileOS;
+};
+
+template <typename Ty>
+static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
+  static_assert(
+      !std::is_same_v<Ty, Value>,
+      "do not print values to logs directly, use handleName instead!");
+  LLVM_DEBUG(dbgs() << Val);
+  if (SML.hasLogFile())
+    SML.logfile() << Val;
+  return SML;
 }
 
-/// Cost analysis function. Calculates the cost of each function in \p M
-///
+/// Calculate the cost of each function in \p M
+/// \param SML Log Helper
 /// \param GetTTI Abstract getter for TargetTransformInfo.
 /// \param M Module to analyze.
 /// \param CostMap[out] Resulting Function -> Cost map.
 /// \return The module's total cost.
-static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
-                                       FunctionsCostMap &CostMap) {
-  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");
-
-  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
+static CostType
+calculateFunctionCosts(SplitModuleLogger &SML, GetTTIFn GetTTI, Module &M,
+                       DenseMap<const Function *, CostType> &CostMap) {
   CostType ModuleCost = 0;
-  [[maybe_unused]] CostType KernelCost = 0;
+  CostType KernelCost = 0;
 
   for (auto &Fn : M) {
     if (Fn.isDeclaration())
@@ -216,30 +251,23 @@ static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
     assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
     ModuleCost += FnCost;
 
-    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
+    if (isEntryPoint(&Fn))
       KernelCost += FnCost;
   }
 
-  if (CostMap.empty())
-    return 0;
-
-  assert(ModuleCost);
-  LLVM_DEBUG({
-    const CostType FnCost = ModuleCost - KernelCost;
-    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
-           << "" << KernelCost << " ("
-           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
-           << "% of the module), functions cost " << FnCost << " ("
-           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
-           << "% of the module)\n";
-  });
+  CostType FnCost = (ModuleCost - KernelCost);
+  CostType ModuleCostOr1 = ModuleCost ? ModuleCost : 1;
+  SML << "=> Total Module Cost: " << ModuleCost << '\n'
+      << "  => KernelCost: " << KernelCost << " ("
+      << format("%0.2f", (float(KernelCost) / ModuleCostOr1) * 100) << "%)\n"
+      << "  => FnsCost: " << FnCost << " ("
+      << format("%0.2f", (float(FnCost) / ModuleCostOr1) * 100) << "%)\n";
 
   return ModuleCost;
 }
 
-/// \return true if \p F can be indirectly called
 static bool canBeIndirectlyCalled(const Function &F) {
-  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+  if (F.isDeclaration() || isEntryPoint(&F))
     return false;
   return !F.hasLocalLinkage() ||
          F.hasAddressTaken(/*PutOffender=*/nullptr,
@@ -250,1081 +278,351 @@ static bool canBeIndirectlyCalled(const Function &F) {
                            /*IgnoreCastedDirectCall=*/true);
 }
 
-//===----------------------------------------------------------------------===//
-// Graph-based Module Representation
-//===----------------------------------------------------------------------===//
-
-/// AMDGPUSplitModule's view of the source Module, as a graph of all components
-/// that can be split into different modules.
-///
-/// The most trivial instance of this graph is just the CallGraph of the module,
-/// but it is not guaranteed that the graph is strictly equal to the CG. It
-/// currently always is but it's designed in a way that would eventually allow
-/// us to create abstract nodes, or nodes for different entities such as global
-/// variables or any other meaningful constraint we must consider.
+/// When a function or any of its callees performs an indirect call, this
+/// takes over \ref addAllDependencies and adds all potentially callable
+/// functions to \p Fns so they can be counted as dependencies of the function.
 ///
-/// The graph is only mutable by this class, and is generally not modified
-/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
-/// mutate it.
-class SplitGraph {
-public:
-  class Node;
-
-  enum class EdgeKind : uint8_t {
-    /// The nodes are related through a direct call. This is a "strong" edge as
-    /// it means the Src will directly reference the Dst.
-    DirectCall,
-    /// The nodes are related through an indirect call.
-    /// This is a "weaker" edge and is only considered when traversing the graph
-    /// starting from a kernel. We need this edge for resource usage analysis.
-    ///
-    /// The reason why we have this edge in the first place is due to how
-    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
-    /// the resource usage of the kernel containing the indirect call is the
-    /// max resource usage of all functions that can be indirectly called.
-    IndirectCall,
-  };
-
-  /// An edge between two nodes. Edges are directional, and tagged with a
-  /// "kind".
-  struct Edge {
-    Edge(Node *Src, Node *Dst, EdgeKind Kind)
-        : Src(Src), Dst(Dst), Kind(Kind) {}
-
-    Node *Src; ///< Source
-    Node *Dst; ///< Destination
-    EdgeKind Kind;
-  };
-
-  using EdgesVec = SmallVector<const Edge *, 0>;
-  using edges_iterator = EdgesVec::const_iterator;
-  using nodes_iterator = const Node *const *;
-
-  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
-             CostType ModuleCost)
-      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}
-
-  void buildGraph(CallGraph &CG);
-
-#ifndef NDEBUG
-  bool verifyGraph() const;
-#endif
-
-  bool empty() const { return Nodes.empty(); }
-  const iterator_range<nodes_iterator> nodes() const {
-    return {Nodes.begin(), Nodes.end()};
+/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
+/// presence of an indirect call, the function's resource usage is the same as
+/// the most expensive function in the module.
+/// \param M    The module.
+/// \param Fns[out] Resulting list of functions.
+static void addAllIndirectCallDependencies(const Module &M,
+                                           DenseSet<const Function *> &Fns) {
+  for (const auto &Fn : M) {
+    if (canBeIndirectlyCalled(Fn))
+      Fns.insert(&Fn);
   }
-  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }
-
-  unsigned getNumNodes() const { return Nodes.size(); }
-  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }
-
-  const Module &getModule() const { return M; }
-
-  CostType getModuleCost() const { return ModuleCost; }
-  CostType getCost(const Function &F) const { return CostMap.at(&F); }
-
-  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
-  /// IDs).
-  CostType calculateCost(const BitVector &BV) const;
-
-private:
-  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
-  /// updates \p Cache.
-  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
-                const GlobalValue &GV);
-
-  // Create a new edge between two nodes and add it to both nodes.
-  cons...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/106707