[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (PR #104763)

Mon Aug 19 07:05:13 PDT 2024

================
@@ -44,187 +47,152 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/DOTGraphTraits.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/SHA256.h"
-#include "llvm/Support/Threading.h"
+#include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm>
 #include <cassert>
+#include <cmath>
 #include <iterator>
 #include <memory>
 #include <utility>
 #include <vector>
 
-using namespace llvm;
+#ifndef NDEBUG
+#include "llvm/Support/LockFileManager.h"
+#endif
 
 #define DEBUG_TYPE "amdgpu-split-module"
 
+namespace llvm {
 namespace {
 
+static cl::opt<unsigned> MaxDepth(
+    "amdgpu-module-splitting-max-depth",
+    cl::desc(
+        "maximum search depth. 0 forces a greedy approach. "
+        "warning: the algorithm is up to O(2^N), where N is the max depth."),
+    cl::init(8));
+
 static cl::opt<float> LargeFnFactor(
-    "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
-    cl::Hidden,
+    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
     cl::desc(
-        "consider a function as large and needing special treatment when the "
-        "cost of importing it into a partition"
-        "exceeds the average cost of a partition by this factor; e;g. 2.0 "
-        "means if the function and its dependencies is 2 times bigger than "
-        "an average partition; 0 disables large functions handling entirely"));
+        "when max depth is reached and we can no longer branch out, this "
+        "value determines if a function is worth merging into an already "
+        "existing partition to reduce code duplication. This is a factor "
+        "of the ideal partition size, e.g. 2.0 means we consider the "
+        "function for merging if its cost (including its callees) is 2x the "
+        "size of an ideal partition."));
 
 static cl::opt<float> LargeFnOverlapForMerge(
-    "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
-    cl::Hidden,
-    cl::desc(
-        "defines how much overlap between two large function's dependencies "
-        "is needed to put them in the same partition"));
+    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
+    cl::desc("when a function is considered for merging into a partition that "
+             "already contains some of its callees, do the merge if at least "
+             "n% of the code it can reach is already present inside the "
+             "partition; e.g. 0.7 means only merge >70%"));
 
 static cl::opt<bool> NoExternalizeGlobals(
     "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
     cl::desc("disables externalization of global variable with local linkage; "
              "may cause globals to be duplicated which increases binary size"));
 
 static cl::opt<std::string>
-    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
-              cl::desc("output directory for AMDGPU module splitting logs"));
+    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
+                       cl::Hidden,
+                       cl::desc("output file to write out the dotgraph "
+                                "representation of the input module"));
+
+static cl::opt<std::string> PartitionSummariesOutput(
+    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
+    cl::desc("output file to write out a summary of "
+             "the partitions created for each module"));
+
+#ifndef NDEBUG
+static cl::opt<bool> TimeBuild("amdgpu-module-splitting-time-trace", cl::Hidden,
+                               cl::desc("enable and print timers"));
+
+static cl::opt<bool>
+    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
+                cl::desc("use a lock file so only one process in the system "
+                         "can run this pass at once. useful to avoid mangled "
+                         "debug output in multithreaded environments."));
 
 static cl::opt<bool>
-    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
-               cl::desc("hash value names before printing them in the AMDGPU "
-                        "module splitting logs"));
+    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
+                        cl::Hidden,
+                        cl::desc("print all proposals received and whether "
+                                 "they were rejected or accepted"));
+
+struct SplitModuleTimer : NamedRegionTimer {
+  SplitModuleTimer(StringRef Name, StringRef Desc)
+      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
+                         TimeBuild) {}
+};
+#else
+struct SplitModuleTimer {
+  SplitModuleTimer(StringRef Name, StringRef Desc) {}
+};
+#endif
+
+//===----------------------------------------------------------------------===//
+// Utils
+//===----------------------------------------------------------------------===//
 
 using CostType = InstructionCost::CostType;
-using PartitionID = unsigned;
+using FunctionsCostMap = DenseMap<const Function *, CostType>;
 using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
+static constexpr unsigned InvalidPID = -1;
+
+/// \param Num numerator
+/// \param Dem denominator
+/// \param FmtString printf-like format string
+/// \returns a printable object to print (Num/Dem) using FmtString.
+static auto formatRatioOf(CostType Num, CostType Dem,
+                          const char *FmtString = "%0.2f") {
----------------
arsenm wrote:

The format string is allowed to not be a compile time constant? Make this a template argument? 

https://github.com/llvm/llvm-project/pull/104763