[llvm] Revert "[AMDGPU] Add AMDGPU-specific module splitting" (PR #93275)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 23 23:45:33 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Vitaly Buka (vitalybuka)
<details>
<summary>Changes</summary>
Fails on https://lab.llvm.org/buildbot/#/builders/85/builds/24181
Reverts llvm/llvm-project#89245
---
Patch is 56.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93275.diff
21 Files Affected:
- (removed) llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp (-744)
- (removed) llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h (-30)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (-8)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h (-4)
- (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (-1)
- (removed) llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll (-46)
- (removed) llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll (-37)
- (removed) llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll (-20)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll (-45)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll (-54)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll (-50)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll (-41)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll (-64)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll (-76)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll (-40)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll (-42)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll (-44)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll (-75)
- (removed) llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll (-39)
- (removed) llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll (-98)
- (removed) llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg (-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
deleted file mode 100644
index 56e275ce707b6..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
+++ /dev/null
@@ -1,744 +0,0 @@
-//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Implements a module splitting algorithm designed to support the
-/// FullLTO --lto-partitions option for parallel codegen. This is completely
-/// different from the common SplitModule pass, as this system is designed with
-/// AMDGPU in mind.
-///
-/// The basic idea of this module splitting implementation is the same as
-/// SplitModule: load-balance the module's functions across a set of N
-/// partitions to allow parallel codegen. However, it does it very
-/// differently than the target-agnostic variant:
-/// - Kernels are used as the module's "roots".
-/// They're known entry points on AMDGPU, and everything else is often
-/// internal only.
-/// - Each kernel has a set of dependencies, and when a kernel and its
-/// dependencies is considered "big", we try to put it in a partition where
-/// most dependencies are already imported, to avoid duplicating large
-/// amounts of code.
-/// - There's special care for indirect calls in order to ensure
-/// AMDGPUResourceUsageAnalysis can work correctly.
-///
-/// This file also includes a more elaborate logging system to enable
-/// users to easily generate logs that (if desired) do not include any value
-/// names, in order to not leak information about the source file.
-/// Such logs are very helpful to understand and fix potential issues with
-/// module splitting.
-
-#include "AMDGPUSplitModule.h"
-#include "AMDGPUTargetMachine.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/SHA256.h"
-#include "llvm/Support/Threading.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-split-module"
-
-namespace {
-
-static cl::opt<float> LargeKernelFactor(
- "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f),
- cl::Hidden,
- cl::desc(
- "consider a kernel as large and needing special treatment when it "
- "exceeds the average cost of a partition by this factor; e;g. 2.0 "
- "means if the kernel and its dependencies is 2 times bigger than "
- "an average partition; 0 disables large kernels handling entirely"));
-
-static cl::opt<float> LargeKernelOverlapForMerge(
- "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f),
- cl::Hidden,
- cl::desc("defines how much overlap between two large kernel's dependencies "
- "is needed to put them in the same partition"));
-
-static cl::opt<bool> NoExternalizeGlobals(
- "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
- cl::desc("disables externalization of global variable with local linkage; "
- "may cause globals to be duplicated which increases binary size"));
-
-static cl::opt<std::string>
- LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden,
- cl::desc("output directory for AMDGPU module splitting logs"));
-
-static cl::opt<bool>
- LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden,
- cl::desc("hash value names before printing them in the AMDGPU "
- "module splitting logs"));
-
-using CostType = InstructionCost::CostType;
-using PartitionID = unsigned;
-
-static bool isEntryPoint(const Function *F) {
- return AMDGPU::isEntryFunctionCC(F->getCallingConv());
-}
-
-static std::string getName(const Value &V) {
- static bool HideNames;
-
- static llvm::once_flag HideNameInitFlag;
- llvm::call_once(HideNameInitFlag, [&]() {
- if (LogPrivate.getNumOccurrences())
- HideNames = LogPrivate;
- else {
- const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE");
- HideNames = (EV.value_or("0") != "0");
- }
- });
-
- if (!HideNames)
- return V.getName().str();
- return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())),
- /*LowerCase=*/true);
-}
-
-/// Main logging helper.
-///
-/// Logging can be configured by the following environment variable.
-/// AMD_SPLIT_MODULE_LOG_DIR=<filepath>
-/// If set, uses <filepath> as the directory to write logfiles to
-/// each time module splitting is used.
-/// AMD_SPLIT_MODULE_LOG_PRIVATE
-/// If set to anything other than zero, all names are hidden.
-///
-/// Both environment variables have corresponding CL options which
-/// takes priority over them.
-///
-/// Any output printed to the log files is also printed to dbgs() when -debug is
-/// used and LLVM_DEBUG is defined.
-///
-/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic
-/// cannot be removed from the code (by building without debug). This probably
-/// has a small performance cost because if some computation/formatting is
-/// needed for logging purpose, it may be done everytime only to be ignored
-/// by the logger.
-///
-/// As this pass only runs once and is not doing anything computationally
-/// expensive, this is likely a reasonable trade-off.
-///
-/// If some computation should really be avoided when unused, users of the class
-/// can check whether any logging will occur by using the bool operator.
-///
-/// \code
-/// if (SML) {
-/// // Executes only if logging to a file or if -debug is available and
-/// used.
-/// }
-/// \endcode
-class SplitModuleLogger {
-public:
- SplitModuleLogger(const Module &M) {
- std::string LogDir = LogDirOpt;
- if (LogDir.empty())
- LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or("");
-
- // No log dir specified means we don't need to log to a file.
- // We may still log to dbgs(), though.
- if (LogDir.empty())
- return;
-
- // If a log directory is specified, create a new file with a unique name in
- // that directory.
- int Fd;
- SmallString<0> PathTemplate;
- SmallString<0> RealPath;
- sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt");
- if (auto Err =
- sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) {
- report_fatal_error("Failed to create log file at '" + Twine(LogDir) +
- "': " + Err.message(),
- /*CrashDiag=*/false);
- }
-
- FileOS = std::make_unique<raw_fd_ostream>(Fd, /*shouldClose=*/true);
- }
-
- bool hasLogFile() const { return FileOS != nullptr; }
-
- raw_ostream &logfile() {
- assert(FileOS && "no logfile!");
- return *FileOS;
- }
-
- /// \returns true if this SML will log anything either to a file or dbgs().
- /// Can be used to avoid expensive computations that are ignored when logging
- /// is disabled.
- operator bool() const {
- return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE));
- }
-
-private:
- std::unique_ptr<raw_fd_ostream> FileOS;
-};
-
-template <typename Ty>
-static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) {
- static_assert(
- !std::is_same_v<Ty, Value>,
- "do not print values to logs directly, use handleName instead!");
- LLVM_DEBUG(dbgs() << Val);
- if (SML.hasLogFile())
- SML.logfile() << Val;
- return SML;
-}
-
-/// Calculate the cost of each function in \p M
-/// \param SML Log Helper
-/// \param TM TargetMachine instance used to retrieve TargetTransformInfo.
-/// \param M Module to analyze.
-/// \param CostMap[out] Resulting Function -> Cost map.
-/// \return The module's total cost.
-static CostType
-calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM,
- Module &M,
- DenseMap<const Function *, CostType> &CostMap) {
- CostType ModuleCost = 0;
- CostType KernelCost = 0;
-
- for (auto &Fn : M) {
- if (Fn.isDeclaration())
- continue;
-
- CostType FnCost = 0;
- TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn);
-
- for (const auto &BB : Fn) {
- for (const auto &I : BB) {
- auto Cost =
- TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
- assert(Cost != InstructionCost::getMax());
- // Assume expensive if we can't tell the cost of an instruction.
- CostType CostVal =
- Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive);
- assert((FnCost + CostVal) >= FnCost && "Overflow!");
- FnCost += CostVal;
- }
- }
-
- assert(FnCost != 0);
-
- CostMap[&Fn] = FnCost;
- assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
- ModuleCost += FnCost;
-
- if (isEntryPoint(&Fn))
- KernelCost += FnCost;
- }
-
- CostType FnCost = (ModuleCost - KernelCost);
- SML << "=> Total Module Cost: " << ModuleCost << '\n'
- << " => KernelCost: " << KernelCost << " ("
- << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n"
- << " => FnsCost: " << FnCost << " ("
- << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n";
-
- return ModuleCost;
-}
-
-static bool canBeIndirectlyCalled(const Function &F) {
- if (F.isDeclaration() || isEntryPoint(&F))
- return false;
- return !F.hasLocalLinkage() ||
- F.hasAddressTaken(/*PutOffender=*/nullptr,
- /*IgnoreCallbackUses=*/false,
- /*IgnoreAssumeLikeCalls=*/true,
- /*IgnoreLLVMUsed=*/true,
- /*IgnoreARCAttachedCall=*/false,
- /*IgnoreCastedDirectCall=*/true);
-}
-
-/// When a kernel or any of its callees performs an indirect call, this function
-/// takes over \ref addAllDependencies and adds all potentially callable
-/// functions to \p Fns so they can be counted as dependencies of the kernel.
-///
-/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the
-/// presence of an indirect call, the function's resource usage is the same as
-/// the most expensive function in the module.
-/// \param M The module.
-/// \param Fns[out] Resulting list of functions.
-static void addAllIndirectCallDependencies(const Module &M,
- DenseSet<const Function *> &Fns) {
- for (const auto &Fn : M) {
- if (canBeIndirectlyCalled(Fn))
- Fns.insert(&Fn);
- }
-}
-
-/// Adds the functions that \p Fn may call to \p Fns, then recurses into each
-/// callee until all reachable functions have been gathered.
-///
-/// \param SML Log Helper
-/// \param CG Call graph for \p Fn's module.
-/// \param Fn Current function to look at.
-/// \param Fns[out] Resulting list of functions.
-/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some
-/// point, either in \p Fn or in one of the function it calls. When that
-/// happens, we fall back to adding all callable functions inside \p Fn's module
-/// to \p Fns.
-static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG,
- const Function &Fn,
- DenseSet<const Function *> &Fns,
- bool &HadIndirectCall) {
- assert(!Fn.isDeclaration());
-
- const Module &M = *Fn.getParent();
- SmallVector<const Function *> WorkList({&Fn});
- while (!WorkList.empty()) {
- const auto &CurFn = *WorkList.pop_back_val();
- assert(!CurFn.isDeclaration());
-
- // Scan for an indirect call. If such a call is found, we have to
- // conservatively assume this can call all non-entrypoint functions in the
- // module.
-
- for (auto &CGEntry : *CG[&CurFn]) {
- auto *CGNode = CGEntry.second;
- auto *Callee = CGNode->getFunction();
- if (!Callee) {
- // Functions have an edge towards CallsExternalNode if they're external
- // declarations, or if they do an indirect call. As we only process
- // definitions here, we know this means the function has an indirect
- // call. We then have to conservatively assume this can call all
- // non-entrypoint functions in the module.
- if (CGNode != CG.getCallsExternalNode())
- continue; // this is another function-less node we don't care about.
-
- SML << "Indirect call detected in " << getName(CurFn)
- << " - treating all non-entrypoint functions as "
- "potential dependencies\n";
-
- // TODO: Print an ORE as well ?
- addAllIndirectCallDependencies(M, Fns);
- HadIndirectCall = true;
- return;
- }
-
- if (Callee->isDeclaration())
- continue;
-
- auto [It, Inserted] = Fns.insert(Callee);
- if (Inserted)
- WorkList.push_back(Callee);
- }
- }
-}
-
-/// Contains information about a kernel and its dependencies.
-struct KernelWithDependencies {
- KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG,
- const DenseMap<const Function *, CostType> &FnCosts,
- const Function *Fn)
- : Fn(Fn) {
- addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall);
- TotalCost = FnCosts.at(Fn);
- for (const auto *Dep : Dependencies) {
- TotalCost += FnCosts.at(Dep);
-
- // We cannot duplicate functions with external linkage, or functions that
- // may be overriden at runtime.
- HasNonDuplicatableDependecy |=
- (Dep->hasExternalLinkage() || !Dep->isDefinitionExact());
- }
- }
-
- const Function *Fn = nullptr;
- DenseSet<const Function *> Dependencies;
- /// Whether \p Fn or any of its \ref Dependencies contains an indirect call.
- bool HasIndirectCall = false;
- /// Whether any of \p Fn's dependencies cannot be duplicated.
- bool HasNonDuplicatableDependecy = false;
-
- CostType TotalCost = 0;
-
- /// \returns true if this kernel and its dependencies can be considered large
- /// according to \p Threshold.
- bool isLarge(CostType Threshold) const {
- return TotalCost > Threshold && !Dependencies.empty();
- }
-};
-
-/// Calculates how much overlap there is between \p A and \p B.
-/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A
-/// and B have no shared elements. Kernels do not count in overlap calculation.
-static float calculateOverlap(const DenseSet<const Function *> &A,
- const DenseSet<const Function *> &B) {
- DenseSet<const Function *> Total;
- for (const auto *F : A) {
- if (!isEntryPoint(F))
- Total.insert(F);
- }
-
- if (Total.empty())
- return 0.0f;
-
- unsigned NumCommon = 0;
- for (const auto *F : B) {
- if (isEntryPoint(F))
- continue;
-
- auto [It, Inserted] = Total.insert(F);
- if (!Inserted)
- ++NumCommon;
- }
-
- return static_cast<float>(NumCommon) / Total.size();
-}
-
-/// Performs all of the partitioning work on \p M.
-/// \param SML Log Helper
-/// \param M Module to partition.
-/// \param NumParts Number of partitions to create.
-/// \param ModuleCost Total cost of all functions in \p M.
-/// \param FnCosts Map of Function -> Cost
-/// \param WorkList Kernels and their dependencies to process in order.
-/// \returns The created partitions (a vector of size \p NumParts )
-static std::vector<DenseSet<const Function *>>
-doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts,
- CostType ModuleCost,
- const DenseMap<const Function *, CostType> &FnCosts,
- const SmallVector<KernelWithDependencies> &WorkList) {
-
- SML << "\n--Partitioning Starts--\n";
-
- // Calculate a "large kernel threshold". When more than one kernel's total
- // import cost exceeds this value, we will try to merge it with other,
- // similarly large kernels.
- //
- // e.g. let two kernels X and Y have a import cost of ~10% of the module, we
- // assign X to a partition as usual, but when we get to Y, we check if it's
- // worth also putting it in Y's partition.
- const CostType LargeKernelThreshold =
- LargeKernelFactor ? ((ModuleCost / NumParts) * LargeKernelFactor)
- : std::numeric_limits<CostType>::max();
-
- std::vector<DenseSet<const Function *>> Partitions;
- Partitions.resize(NumParts);
-
- // Assign a partition to each kernel, and try to keep the partitions more or
- // less balanced. We do that through a priority queue sorted in reverse, so we
- // can always look at the partition with the least content.
- //
- // There are some cases where we will be deliberately unbalanced though.
- // - Large kernels: we try to merge with existing partitions to reduce code
- // duplication.
- // - Kernels with indirect or external calls always go in the first partition
- // (P0).
- auto ComparePartitions = [](const std::pair<PartitionID, CostType> &a,
- const std::pair<PartitionID, CostType> &b) {
- // When two partitions have the same cost, assign to the one with the
- // biggest ID first. This allows us to put things in P0 last, because P0 may
- // have other stuff added later.
- if (a.second == b.second)
- return a.first < b.first;
- return a.second > b.second;
- };
-
- // We can't use priority_queue here because we need to be able to access any
- // element. This makes this a bit inefficient as we need to sort it again
- // everytime we change it, but it's a very small array anyway (likely under 64
- // partitions) so it's a cheap operation.
- std::vector<std::pair<PartitionID, CostType>> BalancingQueue;
- for (unsigned I = 0; I < NumParts; ++I)
- BalancingQueue.push_back(std::make_pair(I, 0));
-
- // Helper function to handle assigning a kernel to a partition. This takes
- // care of updating the balancing queue.
- const auto AssignToPartition = [&](PartitionID PID,
- const KernelWithDependencies &KWD) {
- auto &FnsInPart = Partitions[PID];
- FnsInPart.insert(KWD.Fn);
- FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end());
-
- SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> ";
- if (!KWD.Dependencies.empty()) {
- SML << KWD.Dependencies.size() << " dependencies added\n";
- };
-
- // Update the balancing queue. we scan backwards because in the common case
- // the partition is at the end.
- for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) {
- if (QueuePID == PID) {
- CostType NewCost = 0;
- for (auto *Fn : Partitions[PID])
- NewCost += FnCosts.at(Fn);
-
- SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost;
- if (Cost) {
- SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100)
- << "% increase)";
- }
- SML << '\n';
-
- Cost = NewCost;
- }
- }
-
- sort(BalancingQueue, ComparePartitions);
- };
-
- for (auto &CurKernel : WorkList) {
- // When a kernel has indirect calls, it must stay in the first partition
- // alongside every reachable non-entry function. This is a nightmare case
- // for splitting as it severely limits what we can do.
- if (CurKernel.HasIndirectCall) {
- SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn)
- << " defaulting to P0\n";
- AssignToPartition(0, CurKernel);
- continue;
- }
-
- // When a kernel has non duplicatable dependencies...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/93275
More information about the llvm-commits mailing list