[llvm] [AMDGPU] Add AMDGPU-specific module splitting (PR #89245)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 20 02:24:45 PDT 2024
================
@@ -0,0 +1,733 @@
+//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements a module splitting algorithm designed to support the
+/// FullLTO --lto-partitions option for parallel codegen. This is completely
+/// different from the common SplitModule pass, as this system is designed with
+/// AMDGPU in mind.
+///
+/// The basic idea of this module splitting implementation is the same as
+/// SplitModule: load-balance the module's functions across a set of N
+/// partitions to allow parallel codegen. However, it does it very
+/// differently than the target-agnostic variant:
+/// - Kernels are used as the module's "roots".
+/// They're known entry points on AMDGPU, and everything else is often
+/// internal only.
+/// - Each kernel has a set of dependencies, and when a kernel and its
+///   dependencies are considered "big", we try to put it in a partition where
+///   most dependencies are already imported, to avoid duplicating large
+///   amounts of code.
+/// - There's special care for indirect calls in order to ensure
+/// AMDGPUResourceUsageAnalysis can work correctly.
+///
+/// This file also includes a more elaborate logging system to enable
+/// users to easily generate logs that (if desired) do not include any value
+/// names, in order to not leak information about the source file.
+/// Such logs are very helpful to understand and fix potential issues with
+/// module splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSplitModule.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/SHA256.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-split-module"
+
+namespace {
+
+/// Factor applied to the average partition cost to decide when a kernel (plus
+/// its dependencies) is "large" and needs special placement handling.
+static cl::opt<float> LargeKernelFactor(
+    "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0), cl::Hidden,
+    cl::desc(
+        "consider a kernel as large and needing special treatment when it "
+        "exceeds the average cost of a partition by this factor; e.g. 2.0 "
+        "means if the kernel and its dependencies are 2 times bigger than "
+        "an average partition; 0 disables large kernels handling entirely"));
+
+/// Minimum dependency overlap (0.0 to 1.0) required before two large kernels
+/// are merged into the same partition.
+static cl::opt<float> LargeKernelOverlapForMerge(
+    "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8),
+    cl::Hidden,
+    cl::desc("defines how much overlap between two large kernel's dependencies "
+             "is needed to put them in the same partition"));
+
+static cl::opt<bool> NoExternalizeGlobals(
+    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
+    cl::desc("disables externalization of global variables with local linkage; "
+             "may cause globals to be duplicated which increases binary size"));
+
+static cl::opt<std::string>
+    LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
+              cl::desc("output directory for AMDGPU module splitting logs"));
+
+static cl::opt<bool>
+    LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
+               cl::desc("hash value names before printing them in the AMDGPU "
+                        "module splitting logs"));
+
+// Costs are expressed in the integral unit used by InstructionCost.
+using CostType = InstructionCost::CostType;
+using PartitionID = unsigned;
+
+/// \returns \p V's name for logging purposes, hashed with SHA256 when private
+/// logging is requested (either through the -amdgpu-module-splitting-log-private
+/// option or the AMD_SPLIT_MODULE_LOG_PRIVATE environment variable).
+static std::string getName(const Value &V) {
+  // Decide once whether names must be hidden; the CL option, when present on
+  // the command line, takes precedence over the environment variable.
+  static std::optional<bool> HideNames;
+  if (!HideNames) {
+    if (LogPrivate.getNumOccurrences()) {
+      HideNames = LogPrivate;
+    } else {
+      const auto Env = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
+      HideNames = (Env.value_or("0") != "0");
+    }
+  }
+
+  if (*HideNames)
+    return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
+                 /*LowerCase*/ true);
+  return V.getName().str();
+}
+
+/// Main logging helper.
+///
+/// Logging can be configured by the following environment variable.
+/// AMD_SPLIT_MODULE_LOG_DIR=<filepath>
+/// If set, uses <filepath> as the directory to write logfiles to
+/// each time module splitting is used.
+/// AMD_SPLIT_MODULE_LOG_PRIVATE
+/// If set to anything other than zero, all names are hidden.
+///
+/// Both environment variables have corresponding CL options which
+/// take priority over them.
+///
+/// Any output printed to the log files is also printed to dbgs() when -debug is
+/// used and LLVM_DEBUG is defined.
+///
+/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
+/// cannot be removed from the code (by building without debug). This probably
+/// has a small performance cost because if some computation/formatting is
+/// needed for logging purposes, it may be done every time only to be ignored
+/// by the logger.
+///
+/// As this pass only runs once and is not doing anything computationally
+/// expensive, this is likely a reasonable trade-off.
+///
+/// If some computation should really be avoided when unused, users of the class
+/// can check whether any logging will occur by using the bool operator.
+///
+/// \code
+/// if (SML) {
+/// // Executes only if logging to a file or if -debug is available and
+/// used.
+/// }
+/// \endcode
+class SplitModuleLogger {
+public:
+  /// Opens a uniquely-named log file in the directory given by
+  /// -amdgpu-module-splitting-log-dir or, failing that, by the
+  /// AMD_SPLIT_MODULE_LOG_DIR environment variable. When neither is set,
+  /// output only goes to dbgs() (when -debug is active).
+  SplitModuleLogger(const Module &M) {
+    std::string LogDir = LogDirOpt;
+    if (LogDir.empty())
+      LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
+
+    // No log dir specified means we don't need to log to a file.
+    // We may still log to dbgs(), though.
+    if (LogDir.empty())
+      return;
+
+    if (!sys::fs::is_directory(LogDir)) {
+      report_fatal_error("invalid AMDGPU split module log directory: '" +
+                             Twine(LogDir) + "' is not a directory",
+                         /*CrashDiag=*/false);
+    }
+
+    // If a log directory is specified, create a new file with a unique name in
+    // that directory.
+    SmallString<0> FilePath;
+    int Fd;
+    std::string LogFile = (LogDir + "/" + "Module-%%-%%-%%-%%-%%-%%-%%.txt");
+    if (auto Err = sys::fs::createUniqueFile(LogFile, Fd, FilePath)) {
+      std::string Msg =
+          "Failed to create log file at '" + LogDir + "': " + Err.message();
+      report_fatal_error(StringRef(Msg),
+                         /*CrashDiag=*/false);
+    }
+
+    FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose*/ true);
+  }
+
+  /// \returns true if logging to a file is enabled.
+  bool hasLogFile() const { return FileOS != nullptr; }
+
+  /// \returns the log file's stream; only valid when \ref hasLogFile is true.
+  raw_ostream &logfile() {
+    assert(FileOS && "no logfile!");
+    return *FileOS;
+  }
+
+  /// \returns true if this SML will log anything either to a file or dbgs().
+  /// Can be used to avoid expensive computations that are ignored when logging
+  /// is disabled.
+  operator bool() const {
+    return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
+  }
+
+private:
+  std::unique_ptr<raw_fd_ostream> FileOS;
+};
+
+/// Streams \p Val both to dbgs() (under -debug) and to the log file, if open.
+template <typename Ty>
+static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
+  // Guard against printing raw IR values: names must be routed through
+  // getName so they can be anonymized when private logging is enabled.
+  static_assert(
+      !std::is_same_v<Ty, Value>,
+      "do not print values to logs directly, use getName instead!");
+  LLVM_DEBUG(dbgs() << Val);
+  if (SML.hasLogFile())
+    SML.logfile() << Val;
+  return SML;
+}
+
+/// Computes the cost of every defined function in \p M.
+/// \param SML Log Helper
+/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
+/// \param M Module to analyze.
+/// \param CostMap[out] Resulting Function -> Cost map.
+/// \return The module's total cost.
+static CostType
+calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
+                       Module &M,
+                       DenseMap<const Function *, CostType> &CostMap) {
+  CostType ModuleCost = 0;
+  CostType KernelCost = 0;
+
+  for (auto &F : M) {
+    if (F.isDeclaration())
+      continue;
+
+    CostType FuncCost = 0;
+    auto TTI = TM.getTargetTransformInfo(F);
+
+    // Sum the code-size cost of every instruction in the function.
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        auto Cost =
+            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+        assert(Cost != InstructionCost::getMax());
+        // Assume expensive if we can't tell the cost of an instruction.
+        CostType InstCost =
+            Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
+        assert((FuncCost + InstCost) >= FuncCost && "Overflow!");
+        FuncCost += InstCost;
+      }
+    }
+
+    assert(FuncCost != 0);
+
+    CostMap[&F] = FuncCost;
+    assert((ModuleCost + FuncCost) >= ModuleCost && "Overflow!");
+    ModuleCost += FuncCost;
+
+    if (AMDGPU::isKernelCC(&F))
+      KernelCost += FuncCost;
+  }
+
+  // Whatever isn't kernel cost is non-kernel function cost.
+  const CostType NonKernelCost = ModuleCost - KernelCost;
+  SML << "=> Total Module Cost: " << ModuleCost << "\n"
+      << " => KernelCost: " << KernelCost << " ("
+      << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
+      << " => FnsCost: " << NonKernelCost << " ("
+      << format("%0.2f", (float(NonKernelCost) / ModuleCost) * 100) << "%)\n";
+
+  return ModuleCost;
+}
+
+/// Fallback used by \ref addAllDependencies when an indirect call is seen:
+/// conservatively registers every defined non-entry-point function of \p M
+/// as a potential dependency of the kernel.
+///
+/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
+/// presence of an indirect call, the function's resource usage is the same as
+/// the most expensive function in the module.
+/// \param M The module.
+/// \param Fns[out] Resulting list of functions.
+static void addAllIndirectCallDependencies(const Module &M,
+                                           DenseSet<const Function *> &Fns) {
+  for (const auto &CurFn : M) {
+    const bool IsCandidate =
+        !CurFn.isDeclaration() &&
+        !AMDGPU::isEntryFunctionCC(CurFn.getCallingConv());
+    if (IsCandidate)
+      Fns.insert(&CurFn);
+  }
+}
+
+/// Collects into \p Fns every defined function transitively callable from
+/// \p Fn.
+///
+/// \param SML Log Helper
+/// \param CG Call graph for \p Fn's module.
+/// \param Fn Current function to look at.
+/// \param Fns[out] Resulting list of functions.
+/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
+/// point, either in \p Fn or in one of the function it calls. When that
+/// happens, we fall back to adding all callable functions inside \p Fn's module
+/// to \p Fns.
+/// \param HadExternalCall[out] Set to true if a call to an external function
+/// was seen at some point, either in \p Fn or in one of the function it calls
+static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
+                               const Function &Fn,
+                               DenseSet<const Function *> &Fns,
+                               bool &HadIndirectCall, bool &HadExternalCall) {
+  assert(!Fn.isDeclaration());
+
+  // \returns true if \p F contains at least one indirect call.
+  const auto ContainsIndirectCall = [](const Function &F) {
+    for (const auto &BB : F) {
+      for (const auto &I : BB) {
+        if (const auto *CB = dyn_cast<CallBase>(&I))
+          if (CB->isIndirectCall())
+            return true;
+      }
+    }
+    return false;
+  };
+
+  const Module &M = *Fn.getParent();
+  SmallVector<const Function *> WorkList({&Fn});
+  while (!WorkList.empty()) {
+    const Function &CurFn = *WorkList.pop_back_val();
+
+    // An indirect call forces us to conservatively treat every non-entry-point
+    // function in the module as a potential dependency.
+    if (ContainsIndirectCall(CurFn)) {
+      SML << "Indirect call detected in " << getName(CurFn)
+          << " - treating all non-entrypoint functions as "
+             "potential dependencies\n";
+
+      // TODO: Print an ORE as well ?
+      addAllIndirectCallDependencies(M, Fns);
+      HadIndirectCall = true;
+      return;
+    }
+
+    for (const auto &CGEntry : *CG[&CurFn]) {
+      const Function *Callee = CGEntry.second->getFunction();
+      if (!Callee)
+        continue;
+
+      // The call graph should never report a kernel as a callee.
+      assert(!AMDGPU::isKernelCC(Callee));
+
+      if (Callee->isDeclaration())
+        continue;
+
+      if (Callee->hasExternalLinkage())
+        HadExternalCall = true;
+
+      // Only recurse into callees we haven't visited yet.
+      if (Fns.insert(Callee).second)
+        WorkList.push_back(Callee);
+    }
+  }
+}
+
+/// Contains information about a kernel and its dependencies.
+struct KernelWithDependencies {
+  /// Gathers \p Fn's transitive callees and sums up their costs.
+  KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
+                         const DenseMap<const Function *, CostType> &FnCosts,
+                         const Function *Fn)
+      : Fn(Fn) {
+    addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall,
+                       HasExternalCall);
+    CostType Sum = FnCosts.at(Fn);
+    for (const auto *Dep : Dependencies)
+      Sum += FnCosts.at(Dep);
+    TotalCost = Sum;
+  }
+
+  const Function *Fn = nullptr;
+  DenseSet<const Function *> Dependencies;
+  /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
+  bool HasIndirectCall = false;
+  /// Whether \p Fn or any of its \ref Dependencies contains a call to a
+  /// function with external linkage.
+  bool HasExternalCall = false;
+
+  /// Cost of \p Fn plus the cost of all of its \ref Dependencies.
+  CostType TotalCost = 0;
+
+  /// \returns true if this kernel and its dependencies can be considered large
+  /// according to \p Threshold.
+  bool isLarge(CostType Threshold) const {
+    return !Dependencies.empty() && TotalCost > Threshold;
+  }
+};
+
+/// Calculates how much overlap there is between \p A and \p B.
+/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
+/// and B have no shared elements. Kernels do not count in overlap calculation.
+static float calculateOverlap(const DenseSet<const Function *> &A,
+ const DenseSet<const Function *> &B) {
+ DenseSet<const Function *> Total;
+ for (const auto *F : A) {
+ if (!AMDGPU::isKernelCC(F))
+ Total.insert(F);
+ }
+
+ if (Total.empty())
+ return 0.0f;
+
+ unsigned NumCommon = 0;
+ for (const auto *F : B) {
+ if (AMDGPU::isKernelCC(F))
+ continue;
+
+ auto [It, Inserted] = Total.insert(F);
+ if (!Inserted)
+ ++NumCommon;
+ }
+
+ return float(NumCommon) / Total.size();
----------------
arsenm wrote:
```suggestion
return static_cast<float>(NumCommon) / Total.size();
```
https://github.com/llvm/llvm-project/pull/89245
More information about the llvm-commits
mailing list