[llvm] [AMDGPU] Graph-based Module Splitting Rewrite (PR #104763)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 19 07:05:13 PDT 2024
================
@@ -44,187 +47,152 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/SHA256.h"
-#include "llvm/Support/Threading.h"
+#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
+#include <cmath>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>
-using namespace llvm;
+#ifndef NDEBUG
+#include "llvm/Support/LockFileManager.h"
+#endif
#define DEBUG_TYPE "amdgpu-split-module"
+namespace llvm {
namespace {
+static cl::opt<unsigned> MaxDepth(
+ "amdgpu-module-splitting-max-depth",
+ cl::desc(
+ "maximum search depth. 0 forces a greedy approach. "
+ "warning: the algorithm is up to O(2^N), where N is the max depth."),
+ cl::init(8));
+
static cl::opt<float> LargeFnFactor(
- "amdgpu-module-splitting-large-function-threshold", cl::init(2.0f),
- cl::Hidden,
+ "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
cl::desc(
- "consider a function as large and needing special treatment when the "
- "cost of importing it into a partition"
- "exceeds the average cost of a partition by this factor; e;g. 2.0 "
- "means if the function and its dependencies is 2 times bigger than "
- "an average partition; 0 disables large functions handling entirely"));
+ "when max depth is reached and we can no longer branch out, this "
+ "value determines if a function is worth merging into an already "
+ "existing partition to reduce code duplication. This is a factor "
+ "of the ideal partition size, e.g. 2.0 means we consider the "
+ "function for merging if its cost (including its callees) is 2x the "
+ "size of an ideal partition."));
static cl::opt<float> LargeFnOverlapForMerge(
- "amdgpu-module-splitting-large-function-merge-overlap", cl::init(0.8f),
- cl::Hidden,
- cl::desc(
- "defines how much overlap between two large function's dependencies "
- "is needed to put them in the same partition"));
+ "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
+ cl::desc("when a function is considered for merging into a partition that "
+ "already contains some of its callees, do the merge if at least "
+ "n% of the code it can reach is already present inside the "
+ "partition; e.g. 0.7 means only merge >70%"));
static cl::opt<bool> NoExternalizeGlobals(
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
cl::desc("disables externalization of global variable with local linkage; "
"may cause globals to be duplicated which increases binary size"));
static cl::opt<std::string>
- LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
- cl::desc("output directory for AMDGPU module splitting logs"));
+ ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
+ cl::Hidden,
+ cl::desc("output file to write out the dotgraph "
+ "representation of the input module"));
+
+static cl::opt<std::string> PartitionSummariesOutput(
+ "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
+ cl::desc("output file to write out a summary of "
+ "the partitions created for each module"));
+
+#ifndef NDEBUG
+static cl::opt<bool> TimeBuild("amdgpu-module-splitting-time-trace", cl::Hidden,
+ cl::desc("enable and print timers"));
+
+static cl::opt<bool>
+ UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
+ cl::desc("use a lock file so only one process in the system "
+ "can run this pass at once. useful to avoid mangled "
+ "debug output in multithreaded environments."));
static cl::opt<bool>
- LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
- cl::desc("hash value names before printing them in the AMDGPU "
- "module splitting logs"));
+ DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
+ cl::Hidden,
+ cl::desc("print all proposals received and whether "
+ "they were rejected or accepted"));
+
+struct SplitModuleTimer : NamedRegionTimer {
+ SplitModuleTimer(StringRef Name, StringRef Desc)
+ : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
+ TimeBuild) {}
+};
+#else
+struct SplitModuleTimer {
+ SplitModuleTimer(StringRef Name, StringRef Desc) {}
+};
+#endif
+
+//===----------------------------------------------------------------------===//
+// Utils
+//===----------------------------------------------------------------------===//
using CostType = InstructionCost::CostType;
-using PartitionID = unsigned;
+using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
+static constexpr unsigned InvalidPID = -1;
+
+/// \param Num numerator
+/// \param Dem denominator
+/// \param FmtString printf-like format string
+/// \returns a printable object to print (Num/Dem) using FmtString.
+static auto formatRatioOf(CostType Num, CostType Dem,
+ const char *FmtString = "%0.2f") {
----------------
arsenm wrote:
Is the format string allowed to not be a compile-time constant? If not, make this a template argument?
https://github.com/llvm/llvm-project/pull/104763
More information about the llvm-commits mailing list