[llvm] 0640875 - [NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal.
Mircea Trofin via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 10 15:30:47 PST 2020
Author: Mircea Trofin
Date: 2020-01-10T15:30:24-08:00
New Revision: 064087581ab98cca7254b4d0f12ecbed13da2692
URL: https://github.com/llvm/llvm-project/commit/064087581ab98cca7254b4d0f12ecbed13da2692
DIFF: https://github.com/llvm/llvm-project/commit/064087581ab98cca7254b4d0f12ecbed13da2692.diff
LOG: [NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal.
Summary:
The goal is to simplify experimentation on the cost model. Today,
CallAnalyzer decides 2 things: legality, and benefit. The refactoring
keeps legality assessment in CallAnalyzer, and factors benefit
evaluation out, as an extension.
Reviewers: davidxl, eraman
Reviewed By: davidxl
Subscribers: kamleshbhalui, fedor.sergeev, hiraditya, baloghadamsoftware, haicheng, a.sidorin, Szelethus, donat.nagy, dkrupp, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71733
Added:
Modified:
llvm/lib/Analysis/InlineCost.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index b5f4192bf856..de83a48aad16 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -93,11 +93,13 @@ static cl::opt<bool> OptComputeFullInlineCost(
"exceeds the threshold."));
namespace {
-
+class InlineCostCallAnalyzer;
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
typedef InstVisitor<CallAnalyzer, bool> Base;
friend class InstVisitor<CallAnalyzer, bool>;
+protected:
+ virtual ~CallAnalyzer() {}
/// The TargetTransformInfo available for this compilation.
const TargetTransformInfo &TTI;
@@ -124,20 +126,86 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// easily cacheable. Instead, use the cover function paramHasAttr.
CallBase &CandidateCall;
- /// Tunable parameters that control the analysis.
- const InlineParams &Params;
+ /// Extension points for handling callsite features.
+ /// Called after a basic block was analyzed.
+ virtual void onBlockAnalyzed(const BasicBlock *BB) {}
- /// Upper bound for the inlining cost. Bonuses are being applied to account
- /// for speculative "expected profit" of the inlining decision.
- int Threshold;
+ /// Called at the end of the analysis of the callsite. Return the outcome of
+ /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
+ /// the reason it can't.
+ virtual InlineResult finalizeAnalysis() { return true; }
- /// Inlining cost measured in abstract units, accounts for all the
- /// instructions expected to be executed for a given function invocation.
- /// Instructions that are statically proven to be dead based on call-site
- /// arguments are not counted here.
- int Cost = 0;
+ /// Called when we're about to start processing a basic block, and every time
+ /// we are done processing an instruction. Return true if there is no point in
+ /// continuing the analysis (e.g. we've determined already the call site is
+ /// too expensive to inline)
+ virtual bool shouldStop() { return false; }
+
+ /// Called before the analysis of the callee body starts (with callsite
+ /// contexts propagated). It checks callsite-specific information. Return a
+ /// reason analysis can't continue if that's the case, or 'true' if it may
+ /// continue.
+ virtual InlineResult onAnalysisStart() { return true; }
+
+ /// Called if the analysis engine decides SROA cannot be done for the given
+ /// alloca.
+ virtual void onDisableSROA(AllocaInst *Arg) {}
+
+ /// Called the analysis engine determines load elimination won't happen.
+ virtual void onDisableLoadElimination() {}
+
+ /// Called to account for a call.
+ virtual void onCallPenalty() {}
- bool ComputeFullInlineCost;
+ /// Called to account for the expectation the inlining would result in a load
+ /// elimination.
+ virtual void onLoadEliminationOpportunity() {}
+
+ /// Called to account for the cost of argument setup for the Call in the
+ /// callee's body (not the callsite currently under analysis).
+ virtual void onCallArgumentSetup(const CallBase &Call) {}
+
+ /// Called to account for a load relative intrinsic.
+ virtual void onLoadRelativeIntrinsic() {}
+
+ /// Called to account for a lowered call.
+ virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
+ }
+
+ /// Account for a jump table of given size. Return false to stop further
+ /// processing the switch instruction
+ virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
+
+ /// Account for a case cluster of given size. Return false to stop further
+ /// processing of the instruction.
+ virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
+
+ /// Called at the end of processing a switch instruction, with the given
+ /// number of case clusters.
+ virtual void onFinalizeSwitch(unsigned JumpTableSize,
+ unsigned NumCaseCluster) {}
+
+ /// Called to account for any other instruction not specifically accounted
+ /// for.
+ virtual void onCommonInstructionSimplification() {}
+
+ /// Start accounting potential benefits due to SROA for the given alloca.
+ virtual void onInitializeSROAArg(AllocaInst *Arg) {}
+
+ /// Account SROA savings for the AllocaInst value.
+ virtual void onAggregateSROAUse(AllocaInst *V) {}
+
+ bool handleSROA(Value *V, bool DoNotDisable) {
+ // Check for SROA candidates in comparisons.
+ if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
+ if (DoNotDisable) {
+ onAggregateSROAUse(SROAArg);
+ return true;
+ }
+ disableSROAForArg(SROAArg);
+ }
+ return false;
+ }
bool IsCallerRecursive = false;
bool IsRecursiveCall = false;
@@ -149,20 +217,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool HasUninlineableIntrinsic = false;
bool InitsVargArgs = false;
- /// Attempt to evaluate indirect calls to boost its inline cost.
- bool BoostIndirectCalls;
-
/// Number of bytes allocated statically by the callee.
uint64_t AllocatedSize = 0;
unsigned NumInstructions = 0;
unsigned NumVectorInstructions = 0;
- /// Bonus to be applied when percentage of vector instructions in callee is
- /// high (see more details in updateThreshold).
- int VectorBonus = 0;
- /// Bonus to be applied when the callee has only one reachable basic block.
- int SingleBBBonus = 0;
-
/// While we walk the potentially-inlined instructions, we build up and
/// maintain a mapping of simplified values specific to this callsite. The
/// idea is to propagate any special information we have about arguments to
@@ -174,12 +233,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Keep track of the values which map back (through function arguments) to
/// allocas on the caller stack which could be simplified through SROA.
- DenseMap<Value *, Value *> SROAArgValues;
+ DenseMap<Value *, AllocaInst *> SROAArgValues;
- /// The mapping of caller Alloca values to their accumulated cost savings. If
- /// we have to disable SROA for one of the allocas, this tells us how much
- /// cost must be added.
- DenseMap<Value *, int> SROAArgCosts;
+ /// Keep track of Allocas for which we believe we may get SROA optimization.
+ /// We don't delete entries in SROAArgValue because we still want
+ /// isAllocaDerivedArg to function correctly.
+ DenseSet<AllocaInst *> EnabledSROAArgValues;
/// Keep track of values which map to a pointer base and constant offset.
DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
@@ -196,17 +255,20 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// loads.
bool EnableLoadElimination;
SmallPtrSet<Value *, 16> LoadAddrSet;
- int LoadEliminationCost = 0;
+
+ AllocaInst *getSROAArgForValueOrNull(Value *V) const {
+ auto It = SROAArgValues.find(V);
+ if (It == SROAArgValues.end() ||
+ EnabledSROAArgValues.count(It->second) == 0)
+ return nullptr;
+ return It->second;
+ }
// Custom simplification helper routines.
bool isAllocaDerivedArg(Value *V);
- bool lookupSROAArgAndCost(Value *V, Value *&Arg,
- DenseMap<Value *, int>::iterator &CostIt);
- void disableSROA(DenseMap<Value *, int>::iterator CostIt);
+ void disableSROAForArg(AllocaInst *SROAArg);
void disableSROA(Value *V);
void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
- void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
- int InstructionCost);
void disableLoadElimination();
bool isGEPFree(GetElementPtrInst &GEP);
bool canFoldInboundsGEP(GetElementPtrInst &I);
@@ -227,32 +289,13 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// inlined through this particular callsite.
bool isKnownNonNullInCallee(Value *V);
- /// Update Threshold based on callsite properties such as callee
- /// attributes and callee hotness for PGO builds. The Callee is explicitly
- /// passed to support analyzing indirect calls whose target is inferred by
- /// analysis.
- void updateThreshold(CallBase &Call, Function &Callee);
-
/// Return true if size growth is allowed when inlining the callee at \p Call.
bool allowSizeGrowth(CallBase &Call);
- /// Return true if \p Call is a cold callsite.
- bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
-
- /// Return a higher threshold if \p Call is a hot callsite.
- Optional<int> getHotCallSiteThreshold(CallBase &Call,
- BlockFrequencyInfo *CallerBFI);
-
// Custom analysis routines.
InlineResult analyzeBlock(BasicBlock *BB,
SmallPtrSetImpl<const Value *> &EphValues);
- /// Handle a capped 'int' increment for Cost.
- void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
- assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
- Cost = (int)std::min(UpperBound, Cost + Inc);
- }
-
// Disable several entry points to the visitor so we don't accidentally use
// them by declaring but not defining them here.
void visit(Module *);
@@ -298,20 +341,13 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
- Function &Callee, CallBase &Call, const InlineParams &Params,
- bool BoostIndirect = true)
+ Function &Callee, CallBase &Call)
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
- CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
- ComputeFullInlineCost(OptComputeFullInlineCost ||
- Params.ComputeFullInlineCost || ORE),
- BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {}
+ CandidateCall(Call), EnableLoadElimination(true) {}
InlineResult analyze();
- int getThreshold() { return Threshold; }
- int getCost() { return Cost; }
-
// Keep a bunch of stats about the cost savings found so we can print them
// out when debugging.
unsigned NumConstantArgs = 0;
@@ -320,12 +356,291 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
unsigned NumConstantPtrCmps = 0;
unsigned NumConstantPtrDiffs = 0;
unsigned NumInstructionsSimplified = 0;
+
+ void dump();
+};
+
+/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
+/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
+class InlineCostCallAnalyzer final : public CallAnalyzer {
+ const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+ const bool ComputeFullInlineCost;
+ int LoadEliminationCost = 0;
+ /// Bonus to be applied when percentage of vector instructions in callee is
+ /// high (see more details in updateThreshold).
+ int VectorBonus = 0;
+ /// Bonus to be applied when the callee has only one reachable basic block.
+ int SingleBBBonus = 0;
+
+ /// Tunable parameters that control the analysis.
+ const InlineParams &Params;
+
+ /// Upper bound for the inlining cost. Bonuses are being applied to account
+ /// for speculative "expected profit" of the inlining decision.
+ int Threshold = 0;
+
+ /// Attempt to evaluate indirect calls to boost its inline cost.
+ const bool BoostIndirectCalls;
+
+ /// Inlining cost measured in abstract units, accounts for all the
+ /// instructions expected to be executed for a given function invocation.
+ /// Instructions that are statically proven to be dead based on call-site
+ /// arguments are not counted here.
+ int Cost = 0;
+
+ bool SingleBB = true;
+
unsigned SROACostSavings = 0;
unsigned SROACostSavingsLost = 0;
+ /// The mapping of caller Alloca values to their accumulated cost savings. If
+ /// we have to disable SROA for one of the allocas, this tells us how much
+ /// cost must be added.
+ DenseMap<AllocaInst *, int> SROAArgCosts;
+
+ /// Return true if \p Call is a cold callsite.
+ bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
+
+ /// Update Threshold based on callsite properties such as callee
+ /// attributes and callee hotness for PGO builds. The Callee is explicitly
+ /// passed to support analyzing indirect calls whose target is inferred by
+ /// analysis.
+ void updateThreshold(CallBase &Call, Function &Callee);
+ /// Return a higher threshold if \p Call is a hot callsite.
+ Optional<int> getHotCallSiteThreshold(CallBase &Call,
+ BlockFrequencyInfo *CallerBFI);
+
+ /// Handle a capped 'int' increment for Cost.
+ void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
+ assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
+ Cost = (int)std::min(UpperBound, Cost + Inc);
+ }
+
+ void onDisableSROA(AllocaInst *Arg) override {
+ auto CostIt = SROAArgCosts.find(Arg);
+ if (CostIt == SROAArgCosts.end())
+ return;
+ addCost(CostIt->second);
+ SROACostSavings -= CostIt->second;
+ SROACostSavingsLost += CostIt->second;
+ SROAArgCosts.erase(CostIt);
+ }
+
+ void onDisableLoadElimination() override {
+ addCost(LoadEliminationCost);
+ LoadEliminationCost = 0;
+ }
+ void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
+ void onCallArgumentSetup(const CallBase &Call) override {
+ // Pay the price of the argument setup. We account for the average 1
+ // instruction per call argument setup here.
+ addCost(Call.arg_size() * InlineConstants::InstrCost);
+ }
+ void onLoadRelativeIntrinsic() override {
+ // This is normally lowered to 4 LLVM instructions.
+ addCost(3 * InlineConstants::InstrCost);
+ }
+ void onLoweredCall(Function *F, CallBase &Call,
+ bool IsIndirectCall) override {
+ // We account for the average 1 instruction per call argument setup here.
+ addCost(Call.arg_size() * InlineConstants::InstrCost);
+
+ // If we have a constant that we are calling as a function, we can peer
+ // through it and see the function target. This happens not infrequently
+ // during devirtualization and so we want to give it a hefty bonus for
+ // inlining, but cap that bonus in the event that inlining wouldn't pan out.
+ // Pretend to inline the function, with a custom threshold.
+ if (IsIndirectCall && BoostIndirectCalls) {
+ auto IndirectCallParams = Params;
+ IndirectCallParams.DefaultThreshold =
+ InlineConstants::IndirectCallThreshold;
+ /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
+ /// to instantiate the derived class.
+ InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
+ Call, IndirectCallParams, false);
+ if (CA.analyze()) {
+ // We were able to inline the indirect call! Subtract the cost from the
+ // threshold to get the bonus we want to apply, but don't go below zero.
+ Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+ }
+ } else
+ // Otherwise simply add the cost for merely making the call.
+ addCost(InlineConstants::CallPenalty);
+ }
+
+ void onFinalizeSwitch(unsigned JumpTableSize,
+ unsigned NumCaseCluster) override {
+ // If suitable for a jump table, consider the cost for the table size and
+ // branch to destination.
+ // Maximum valid cost increased in this function.
+ if (JumpTableSize) {
+ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+ 4 * InlineConstants::InstrCost;
+
+ addCost(JTCost, (int64_t)CostUpperBound);
+ return;
+ }
+ // Considering forming a binary search, we should find the number of nodes
+ // which is same as the number of comparisons when lowered. For a given
+ // number of clusters, n, we can define a recursive function, f(n), to find
+ // the number of nodes in the tree. The recursion is :
+ // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
+ // and f(n) = n, when n <= 3.
+ // This will lead a binary tree where the leaf should be either f(2) or f(3)
+ // when n > 3. So, the number of comparisons from leaves should be n, while
+ // the number of non-leaf should be :
+ // 2^(log2(n) - 1) - 1
+ // = 2^log2(n) * 2^-1 - 1
+ // = n / 2 - 1.
+ // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+ // number of comparisons in a simple closed form :
+ // n + n / 2 - 1 = n * 3 / 2 - 1
+ if (NumCaseCluster <= 3) {
+ // Suppose a comparison includes one compare and one conditional branch.
+ addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
+ return;
+ }
+
+ int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+ int64_t SwitchCost =
+ ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+ addCost(SwitchCost, (int64_t)CostUpperBound);
+ }
+ void onCommonInstructionSimplification() override {
+ addCost(InlineConstants::InstrCost);
+ }
+
+ void onInitializeSROAArg(AllocaInst *Arg) override {
+ assert(Arg != nullptr &&
+ "Should not initialize SROA costs for null value.");
+ SROAArgCosts[Arg] = 0;
+ EnabledSROAArgValues.insert(Arg);
+ }
+
+ void onAggregateSROAUse(AllocaInst *SROAArg) override {
+ auto CostIt = SROAArgCosts.find(SROAArg);
+ assert(CostIt != SROAArgCosts.end() &&
+ "expected this argument to have a cost");
+ CostIt->second += InlineConstants::InstrCost;
+ SROACostSavings += InlineConstants::InstrCost;
+ }
+
+ void onBlockAnalyzed(const BasicBlock *BB) override {
+ auto *TI = BB->getTerminator();
+ // If we had any successors at this point, than post-inlining is likely to
+ // have them as well. Note that we assume any basic blocks which existed
+ // due to branches or switches which folded above will also fold after
+ // inlining.
+ if (SingleBB && TI->getNumSuccessors() > 1) {
+ // Take off the bonus we applied to the threshold.
+ Threshold -= SingleBBBonus;
+ SingleBB = false;
+ }
+ }
+ InlineResult finalizeAnalysis() override {
+ // Loops generally act a lot like calls in that they act like barriers to
+ // movement, require a certain amount of setup, etc. So when optimising for
+ // size, we penalise any call sites that perform loops. We do this after all
+ // other costs here, so will likely only be dealing with relatively small
+ // functions (and hence DT and LI will hopefully be cheap).
+ auto *Caller = CandidateCall.getFunction();
+ if (Caller->hasMinSize()) {
+ DominatorTree DT(F);
+ LoopInfo LI(DT);
+ int NumLoops = 0;
+ for (Loop *L : LI) {
+ // Ignore loops that will not be executed
+ if (DeadBlocks.count(L->getHeader()))
+ continue;
+ NumLoops++;
+ }
+ addCost(NumLoops * InlineConstants::CallPenalty);
+ }
+
+ // We applied the maximum possible vector bonus at the beginning. Now,
+ // subtract the excess bonus, if any, from the Threshold before
+ // comparing against Cost.
+ if (NumVectorInstructions <= NumInstructions / 10)
+ Threshold -= VectorBonus;
+ else if (NumVectorInstructions <= NumInstructions / 2)
+ Threshold -= VectorBonus / 2;
+
+ return Cost < std::max(1, Threshold);
+ }
+ bool shouldStop() override {
+ // Bail out the moment we cross the threshold. This means we'll under-count
+ // the cost, but only when undercounting doesn't matter.
+ return Cost >= Threshold && !ComputeFullInlineCost;
+ }
+
+ void onLoadEliminationOpportunity() override {
+ LoadEliminationCost += InlineConstants::InstrCost;
+ }
+
+ InlineResult onAnalysisStart() override {
+ // Perform some tweaks to the cost and threshold based on the direct
+ // callsite information.
+
+ // We want to more aggressively inline vector-dense kernels, so up the
+ // threshold, and we'll lower it if the % of vector instructions gets too
+ // low. Note that these bonuses are some what arbitrary and evolved over
+ // time by accident as much as because they are principled bonuses.
+ //
+ // FIXME: It would be nice to remove all such bonuses. At least it would be
+ // nice to base the bonus values on something more scientific.
+ assert(NumInstructions == 0);
+ assert(NumVectorInstructions == 0);
+
+ // Update the threshold based on callsite properties
+ updateThreshold(CandidateCall, F);
+
+ // While Threshold depends on commandline options that can take negative
+ // values, we want to enforce the invariant that the computed threshold and
+ // bonuses are non-negative.
+ assert(Threshold >= 0);
+ assert(SingleBBBonus >= 0);
+ assert(VectorBonus >= 0);
+
+ // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+ // this Threshold any time, and cost cannot decrease, we can stop processing
+ // the rest of the function body.
+ Threshold += (SingleBBBonus + VectorBonus);
+
+ // Give out bonuses for the callsite, as the instructions setting them up
+ // will be gone after inlining.
+ addCost(-getCallsiteCost(this->CandidateCall, DL));
+
+ // If this function uses the coldcc calling convention, prefer not to inline
+ // it.
+ if (F.getCallingConv() == CallingConv::Cold)
+ Cost += InlineConstants::ColdccPenalty;
+
+ // Check if we're done. This can happen due to bonuses and penalties.
+ if (Cost >= Threshold && !ComputeFullInlineCost)
+ return "high cost";
+
+ return true;
+ }
+
+public:
+ InlineCostCallAnalyzer(
+ const TargetTransformInfo &TTI,
+ std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
+ ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
+ CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
+ : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
+ ComputeFullInlineCost(OptComputeFullInlineCost ||
+ Params.ComputeFullInlineCost || ORE),
+ Params(Params), Threshold(Params.DefaultThreshold),
+ BoostIndirectCalls(BoostIndirect) {}
void dump();
-};
+ virtual ~InlineCostCallAnalyzer() {}
+ int getThreshold() { return Threshold; }
+ int getCost() { return Cost; }
+};
} // namespace
/// Test whether the given value is an Alloca-derived function argument.
@@ -333,55 +648,21 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
return SROAArgValues.count(V);
}
-/// Lookup the SROA-candidate argument and cost iterator which V maps to.
-/// Returns false if V does not map to a SROA-candidate.
-bool CallAnalyzer::lookupSROAArgAndCost(
- Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
- if (SROAArgValues.empty() || SROAArgCosts.empty())
- return false;
-
- DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
- if (ArgIt == SROAArgValues.end())
- return false;
-
- Arg = ArgIt->second;
- CostIt = SROAArgCosts.find(Arg);
- return CostIt != SROAArgCosts.end();
-}
-
-/// Disable SROA for the candidate marked by this cost iterator.
-///
-/// This marks the candidate as no longer viable for SROA, and adds the cost
-/// savings associated with it back into the inline cost measurement.
-void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
- // If we're no longer able to perform SROA we need to undo its cost savings
- // and prevent subsequent analysis.
- addCost(CostIt->second);
- SROACostSavings -= CostIt->second;
- SROACostSavingsLost += CostIt->second;
- SROAArgCosts.erase(CostIt);
+void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
+ onDisableSROA(SROAArg);
+ EnabledSROAArgValues.erase(SROAArg);
disableLoadElimination();
}
-
/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(V, SROAArg, CostIt))
- disableSROA(CostIt);
-}
-
-/// Accumulate the given cost for a particular SROA candidate.
-void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
- int InstructionCost) {
- CostIt->second += InstructionCost;
- SROACostSavings += InstructionCost;
+ if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
+ disableSROAForArg(SROAArg);
+ }
}
void CallAnalyzer::disableLoadElimination() {
if (EnableLoadElimination) {
- addCost(LoadEliminationCost);
- LoadEliminationCost = 0;
+ onDisableLoadElimination();
EnableLoadElimination = false;
}
}
@@ -553,9 +834,7 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
if (FirstBaseAndOffset.first) {
ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
+ if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
SROAArgValues[&I] = SROAArg;
}
@@ -585,10 +864,7 @@ bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
}
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- bool SROACandidate =
- lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
+ auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
// Lambda to check whether a GEP's indices are all constant.
auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
@@ -599,7 +875,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
};
if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
- if (SROACandidate)
+ if (SROAArg)
SROAArgValues[&I] = SROAArg;
// Constant GEPs are modeled as free.
@@ -607,8 +883,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
}
// Variable GEPs will require math and will disable SROA.
- if (SROACandidate)
- disableSROA(CostIt);
+ if (SROAArg)
+ disableSROAForArg(SROAArg);
return isGEPFree(I);
}
@@ -648,9 +924,7 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
ConstantOffsetPtrs[&I] = BaseAndOffset;
// Also look for SROA candidates here.
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
+ if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
SROAArgValues[&I] = SROAArg;
// Bitcasts are always zero cost.
@@ -682,9 +956,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
// and so we can just add the integer in here. The only places where SROA is
// preserved either cannot fire on an integer, or won't in-and-of themselves
// disable SROA (ext) w/o some later use that we would see and disable.
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
+ if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
SROAArgValues[&I] = SROAArg;
return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -708,9 +980,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
}
// "Propagate" SROA here in the same manner as we do for ptrtoint above.
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
+ if (auto *SROAArg = getSROAArgForValueOrNull(Op))
SROAArgValues[&I] = SROAArg;
return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -737,7 +1007,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
case Instruction::FPToUI:
case Instruction::FPToSI:
if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
- addCost(InlineConstants::CallPenalty);
+ onCallPenalty();
break;
default:
break;
@@ -810,8 +1080,8 @@ bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
return true;
}
-bool CallAnalyzer::isColdCallSite(CallBase &Call,
- BlockFrequencyInfo *CallerBFI) {
+bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
+ BlockFrequencyInfo *CallerBFI) {
// If global profile summary is available, then callsite's coldness is
// determined based on that.
if (PSI && PSI->hasProfileSummary())
@@ -834,8 +1104,8 @@ bool CallAnalyzer::isColdCallSite(CallBase &Call,
}
Optional<int>
-CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
- BlockFrequencyInfo *CallerBFI) {
+InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
+ BlockFrequencyInfo *CallerBFI) {
// If global profile summary is available, then callsite's hotness is
// determined based on that.
@@ -862,7 +1132,7 @@ CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
return None;
}
-void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
+void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
// If no size growth is allowed for this inlining, set Threshold to 0.
if (!allowSizeGrowth(Call)) {
Threshold = 0;
@@ -1024,19 +1294,7 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
: ConstantInt::getFalse(I.getType());
return true;
}
- // Finally check for SROA candidates in comparisons.
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
- if (isa<ConstantPointerNull>(I.getOperand(1))) {
- accumulateSROACost(CostIt, InlineConstants::InstrCost);
- return true;
- }
-
- disableSROA(CostIt);
- }
-
- return false;
+ return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
}
bool CallAnalyzer::visitSub(BinaryOperator &I) {
@@ -1100,7 +1358,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
if (I.getType()->isFloatingPointTy() &&
TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
!match(&I, m_FNeg(m_Value())))
- addCost(InlineConstants::CallPenalty);
+ onCallPenalty();
return false;
}
@@ -1127,23 +1385,15 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
}
bool CallAnalyzer::visitLoad(LoadInst &I) {
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
- if (I.isSimple()) {
- accumulateSROACost(CostIt, InlineConstants::InstrCost);
- return true;
- }
-
- disableSROA(CostIt);
- }
+ if (handleSROA(I.getPointerOperand(), I.isSimple()))
+ return true;
// If the data is already loaded from this address and hasn't been clobbered
// by any stores or calls, this load is likely to be redundant and can be
// eliminated.
if (EnableLoadElimination &&
!LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
- LoadEliminationCost += InlineConstants::InstrCost;
+ onLoadEliminationOpportunity();
return true;
}
@@ -1151,16 +1401,8 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
}
bool CallAnalyzer::visitStore(StoreInst &I) {
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
- if (I.isSimple()) {
- accumulateSROACost(CostIt, InlineConstants::InstrCost);
- return true;
- }
-
- disableSROA(CostIt);
- }
+ if (handleSROA(I.getPointerOperand(), I.isSimple()))
+ return true;
// The store can potentially clobber loads and prevent repeated loads from
// being eliminated.
@@ -1250,9 +1492,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
// in this inline context. If not, we've done all we can.
F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
if (!F) {
- // Pay the price of the argument setup. We account for the average 1
- // instruction per call argument setup here.
- addCost(Call.arg_size() * InlineConstants::InstrCost);
+ onCallArgumentSetup(Call);
if (!Call.onlyReadsMemory())
disableLoadElimination();
@@ -1276,8 +1516,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
return Base::visitCallBase(Call);
case Intrinsic::load_relative:
- // This is normally lowered to 4 LLVM instructions.
- addCost(3 * InlineConstants::InstrCost);
+ onLoadRelativeIntrinsic();
return false;
case Intrinsic::memset:
@@ -1304,28 +1543,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
}
if (TTI.isLoweredToCall(F)) {
- // We account for the average 1 instruction per call argument setup here.
- addCost(Call.arg_size() * InlineConstants::InstrCost);
-
- // If we have a constant that we are calling as a function, we can peer
- // through it and see the function target. This happens not infrequently
- // during devirtualization and so we want to give it a hefty bonus for
- // inlining, but cap that bonus in the event that inlining wouldn't pan out.
- // Pretend to inline the function, with a custom threshold.
- if (IsIndirectCall && BoostIndirectCalls) {
- auto IndirectCallParams = Params;
- IndirectCallParams.DefaultThreshold =
- InlineConstants::IndirectCallThreshold;
- CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
- IndirectCallParams, false);
- if (CA.analyze()) {
- // We were able to inline the indirect call! Subtract the cost from the
- // threshold to get the bonus we want to apply, but don't go below zero.
- Cost -= std::max(0, CA.getThreshold() - CA.getCost());
- }
- } else
- // Otherwise simply add the cost for merely making the call.
- addCost(InlineConstants::CallPenalty);
+ onLoweredCall(F, Call, IsIndirectCall);
}
if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
@@ -1381,9 +1599,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
+ if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
SROAArgValues[&SI] = SROAArg;
return true;
}
@@ -1422,9 +1638,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
if (BaseAndOffset.first) {
ConstantOffsetPtrs[&SI] = BaseAndOffset;
- Value *SROAArg;
- DenseMap<Value *, int>::iterator CostIt;
- if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
+ if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
SROAArgValues[&SI] = SROAArg;
}
@@ -1452,49 +1666,12 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
// inlining those. It will prevent inlining in cases where the optimization
// does not (yet) fire.
- // Maximum valid cost increased in this function.
- int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
-
unsigned JumpTableSize = 0;
BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
unsigned NumCaseCluster =
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
- // If suitable for a jump table, consider the cost for the table size and
- // branch to destination.
- if (JumpTableSize) {
- int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
- 4 * InlineConstants::InstrCost;
-
- addCost(JTCost, (int64_t)CostUpperBound);
- return false;
- }
-
- // Considering forming a binary search, we should find the number of nodes
- // which is same as the number of comparisons when lowered. For a given
- // number of clusters, n, we can define a recursive function, f(n), to find
- // the number of nodes in the tree. The recursion is :
- // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
- // and f(n) = n, when n <= 3.
- // This will lead a binary tree where the leaf should be either f(2) or f(3)
- // when n > 3. So, the number of comparisons from leaves should be n, while
- // the number of non-leaf should be :
- // 2^(log2(n) - 1) - 1
- // = 2^log2(n) * 2^-1 - 1
- // = n / 2 - 1.
- // Considering comparisons from leaf and non-leaf nodes, we can estimate the
- // number of comparisons in a simple closed form :
- // n + n / 2 - 1 = n * 3 / 2 - 1
- if (NumCaseCluster <= 3) {
- // Suppose a comparison includes one compare and one conditional branch.
- addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
- return false;
- }
-
- int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
- int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
-
- addCost(SwitchCost, (int64_t)CostUpperBound);
+ onFinalizeSwitch(JumpTableSize, NumCaseCluster);
return false;
}
@@ -1587,7 +1764,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
if (Base::visit(&*I))
++NumInstructionsSimplified;
else
- addCost(InlineConstants::InstrCost);
+ onCommonInstructionSimplification();
using namespace ore;
// If the visit this instruction detected an uninlinable pattern, abort.
@@ -1632,9 +1809,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
return IR;
}
- // Check if we've passed the maximum possible threshold so we don't spin in
- // huge basic blocks that will never inline.
- if (Cost >= Threshold && !ComputeFullInlineCost)
+ if (shouldStop())
return false;
}
@@ -1728,46 +1903,9 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
InlineResult CallAnalyzer::analyze() {
++NumCallsAnalyzed;
- // Perform some tweaks to the cost and threshold based on the direct
- // callsite information.
-
- // We want to more aggressively inline vector-dense kernels, so up the
- // threshold, and we'll lower it if the % of vector instructions gets too
- // low. Note that these bonuses are some what arbitrary and evolved over time
- // by accident as much as because they are principled bonuses.
- //
- // FIXME: It would be nice to remove all such bonuses. At least it would be
- // nice to base the bonus values on something more scientific.
- assert(NumInstructions == 0);
- assert(NumVectorInstructions == 0);
-
- // Update the threshold based on callsite properties
- updateThreshold(CandidateCall, F);
-
- // While Threshold depends on commandline options that can take negative
- // values, we want to enforce the invariant that the computed threshold and
- // bonuses are non-negative.
- assert(Threshold >= 0);
- assert(SingleBBBonus >= 0);
- assert(VectorBonus >= 0);
-
- // Speculatively apply all possible bonuses to Threshold. If cost exceeds
- // this Threshold any time, and cost cannot decrease, we can stop processing
- // the rest of the function body.
- Threshold += (SingleBBBonus + VectorBonus);
-
- // Give out bonuses for the callsite, as the instructions setting them up
- // will be gone after inlining.
- addCost(-getCallsiteCost(CandidateCall, DL));
-
- // If this function uses the coldcc calling convention, prefer not to inline
- // it.
- if (F.getCallingConv() == CallingConv::Cold)
- Cost += InlineConstants::ColdccPenalty;
-
- // Check if we're done. This can happen due to bonuses and penalties.
- if (Cost >= Threshold && !ComputeFullInlineCost)
- return "high cost";
+ auto Result = onAnalysisStart();
+ if (!Result)
+ return Result;
if (F.empty())
return true;
@@ -1796,9 +1934,9 @@ InlineResult CallAnalyzer::analyze() {
ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
// We can SROA any pointer arguments derived from alloca instructions.
- if (isa<AllocaInst>(PtrArg)) {
- SROAArgValues[&*FAI] = PtrArg;
- SROAArgCosts[PtrArg] = 0;
+ if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
+ SROAArgValues[&*FAI] = SROAArg;
+ onInitializeSROAArg(SROAArg);
}
}
}
@@ -1824,12 +1962,10 @@ InlineResult CallAnalyzer::analyze() {
BBSetVector;
BBSetVector BBWorklist;
BBWorklist.insert(&F.getEntryBlock());
- bool SingleBB = true;
+
// Note that we *must not* cache the size, this loop grows the worklist.
for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
- // Bail out the moment we cross the threshold. This means we'll under-count
- // the cost, but only when undercounting doesn't matter.
- if (Cost >= Threshold && !ComputeFullInlineCost)
+ if (shouldStop())
break;
BasicBlock *BB = BBWorklist[Idx];
@@ -1889,15 +2025,7 @@ InlineResult CallAnalyzer::analyze() {
++TIdx)
BBWorklist.insert(TI->getSuccessor(TIdx));
- // If we had any successors at this point, than post-inlining is likely to
- // have them as well. Note that we assume any basic blocks which existed
- // due to branches or switches which folded above will also fold after
- // inlining.
- if (SingleBB && TI->getNumSuccessors() > 1) {
- // Take off the bonus we applied to the threshold.
- Threshold -= SingleBBBonus;
- SingleBB = false;
- }
+ onBlockAnalyzed(BB);
}
bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
@@ -1908,38 +2036,12 @@ InlineResult CallAnalyzer::analyze() {
if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
return "noduplicate";
- // Loops generally act a lot like calls in that they act like barriers to
- // movement, require a certain amount of setup, etc. So when optimising for
- // size, we penalise any call sites that perform loops. We do this after all
- // other costs here, so will likely only be dealing with relatively small
- // functions (and hence DT and LI will hopefully be cheap).
- if (Caller->hasMinSize()) {
- DominatorTree DT(F);
- LoopInfo LI(DT);
- int NumLoops = 0;
- for (Loop *L : LI) {
- // Ignore loops that will not be executed
- if (DeadBlocks.count(L->getHeader()))
- continue;
- NumLoops++;
- }
- addCost(NumLoops * InlineConstants::CallPenalty);
- }
-
- // We applied the maximum possible vector bonus at the beginning. Now,
- // subtract the excess bonus, if any, from the Threshold before
- // comparing against Cost.
- if (NumVectorInstructions <= NumInstructions / 10)
- Threshold -= VectorBonus;
- else if (NumVectorInstructions <= NumInstructions / 2)
- Threshold -= VectorBonus / 2;
-
- return Cost < std::max(1, Threshold);
+ return finalizeAnalysis();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
-LLVM_DUMP_METHOD void CallAnalyzer::dump() {
+LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
DEBUG_PRINT_STAT(NumConstantArgs);
DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
@@ -2073,8 +2175,8 @@ InlineCost llvm::getInlineCost(
LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "... (caller:" << Caller->getName() << ")\n");
- CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee,
- Call, Params);
+ InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
+ *Callee, Call, Params);
InlineResult ShouldInline = CA.analyze();
LLVM_DEBUG(CA.dump());
@@ -2121,16 +2223,17 @@ InlineResult llvm::isInlineViable(Function &F) {
switch (Call->getCalledFunction()->getIntrinsicID()) {
default:
break;
- // Disallow inlining of @llvm.icall.branch.funnel because current
- // backend can't separate call targets from call arguments.
case llvm::Intrinsic::icall_branch_funnel:
+ // Disallow inlining of @llvm.icall.branch.funnel because current
+ // backend can't separate call targets from call arguments.
return "disallowed inlining of @llvm.icall.branch.funnel";
- // Disallow inlining functions that call @llvm.localescape. Doing this
- // correctly would require major changes to the inliner.
case llvm::Intrinsic::localescape:
+ // Disallow inlining functions that call @llvm.localescape. Doing this
+ // correctly would require major changes to the inliner.
return "disallowed inlining of @llvm.localescape";
- // Disallow inlining of functions that initialize VarArgs with va_start.
case llvm::Intrinsic::vastart:
+ // Disallow inlining of functions that initialize VarArgs with
+ // va_start.
return "contains VarArgs initialized with va_start";
}
}
More information about the llvm-commits
mailing list