[llvm] 338a601 - Revert "[NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal."
Mircea Trofin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 8 17:43:35 PST 2020
Author: Mircea Trofin
Date: 2020-01-08T17:42:23-08:00
New Revision: 338a601612ca36e112b14f622eb310985b93192a
URL: https://github.com/llvm/llvm-project/commit/338a601612ca36e112b14f622eb310985b93192a
DIFF: https://github.com/llvm/llvm-project/commit/338a601612ca36e112b14f622eb310985b93192a.diff
LOG: Revert "[NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal."
This reverts commit 76aab66d34446ccf764cf8127b73e1517df75fb4.
Failure:
http://lab.llvm.org:8011/builders/clang-with-thin-lto-ubuntu/builds/20562,
will investigate and resubmit.
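For context: the reverted refactoring separated the instruction traversal from the cost accounting by giving CallAnalyzer virtual extension points (onCallPenalty, onBlockAnalyzed, finalizeAnalysis, etc.) that a cost-modeling subclass, InlineCostCallAnalyzer, overrides. A minimal standalone sketch of that shape (simplified names and costs, not the actual LLVM classes):

    #include <iostream>

    // Traversal engine: walks the callee and reports events through hooks.
    class Analyzer {
    public:
      virtual ~Analyzer() = default;
      void analyze() {
        // A real analyzer would visit instructions here and fire a hook
        // per event; we fake two events for illustration.
        onCallPenalty();
        onCommonInstructionSimplification();
      }
    protected:
      // Extension points; the base engine carries no cost model.
      virtual void onCallPenalty() {}
      virtual void onCommonInstructionSimplification() {}
    };

    // Cost model: lives entirely in the subclass.
    class CostAnalyzer final : public Analyzer {
      int Cost = 0;
    public:
      int getCost() const { return Cost; }
    protected:
      void onCallPenalty() override { Cost += 25; }
      void onCommonInstructionSimplification() override { Cost += 5; }
    };

    int main() {
      CostAnalyzer CA;
      CA.analyze();
      std::cout << CA.getCost() << "\n"; // 30
    }

The revert below folds the hook bodies back into CallAnalyzer itself.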
Added:
Modified:
llvm/lib/Analysis/InlineCost.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 017301fce9b7..b5f4192bf856 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -93,12 +93,11 @@ static cl::opt<bool> OptComputeFullInlineCost(
"exceeds the threshold."));
namespace {
-class InlineCostCallAnalyzer;
+
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
typedef InstVisitor<CallAnalyzer, bool> Base;
friend class InstVisitor<CallAnalyzer, bool>;
-protected:
/// The TargetTransformInfo available for this compilation.
const TargetTransformInfo &TTI;
@@ -125,86 +124,20 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// easily cacheable. Instead, use the cover function paramHasAttr.
CallBase &CandidateCall;
- /// Extension points for handling callsite features.
- /// Called after a basic block was analyzed.
- virtual void onBlockAnalyzed(const BasicBlock *BB) {}
-
- /// Called at the end of the analysis of the callsite. Return the outcome of
- /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
- /// the reason it can't.
- virtual InlineResult finalizeAnalysis() { return true; }
-
- /// Called when we're about to start processing a basic block, and every time
- /// we are done processing an instruction. Return true if there is no point in
- /// continuing the analysis (e.g. we've already determined the call site is
- /// too expensive to inline).
- virtual bool shouldStop() { return false; }
-
- /// Called before the analysis of the callee body starts (with callsite
- /// contexts propagated). It checks callsite-specific information. Return a
- /// reason the analysis can't continue if that's the case, or 'true' if it
- /// may continue.
- virtual InlineResult onAnalysisStart() { return true; }
-
- /// Called if the analysis engine decides SROA cannot be done for the given
- /// alloca.
- virtual void onDisableSROA(AllocaInst *Arg) {}
-
- /// Called when the analysis engine determines load elimination won't happen.
- virtual void onDisableLoadElimination() {}
-
- /// Called to account for a call.
- virtual void onCallPenalty() {}
-
- /// Called to account for the expectation that inlining would result in a
- /// load elimination.
- virtual void onLoadEliminationOpportunity() {}
-
- /// Called to account for the cost of argument setup for the Call in the
- /// callee's body (not the callsite currently under analysis).
- virtual void onCallArgumentSetup(const CallBase &Call) {}
-
- /// Called to account for a load relative intrinsic.
- virtual void onLoadRelativeIntrinsic() {}
-
- /// Called to account for a lowered call.
- virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
- }
-
- /// Account for a jump table of given size. Return false to stop further
- /// processing of the switch instruction.
- virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
-
- /// Account for a case cluster of given size. Return false to stop further
- /// processing of the instruction.
- virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
-
- /// Called at the end of processing a switch instruction, with the given
- /// number of case clusters.
- virtual void onFinalizeSwitch(unsigned JumpTableSize,
- unsigned NumCaseCluster) {}
-
- /// Called to account for any other instruction not specifically accounted
- /// for.
- virtual void onCommonInstructionSimplification() {}
+ /// Tunable parameters that control the analysis.
+ const InlineParams &Params;
- /// Start accounting potential benefits due to SROA for the given alloca.
- virtual void onInitializeSROAArg(AllocaInst *Arg) {}
+ /// Upper bound for the inlining cost. Bonuses are applied to account
+ /// for speculative "expected profit" of the inlining decision.
+ int Threshold;
- /// Account SROA savings for the AllocaInst value.
- virtual void onAggregateSROAUse(AllocaInst *V) {}
+ /// Inlining cost measured in abstract units, accounts for all the
+ /// instructions expected to be executed for a given function invocation.
+ /// Instructions that are statically proven to be dead based on call-site
+ /// arguments are not counted here.
+ int Cost = 0;
- bool handleSROA(Value *V, bool DoNotDisable) {
- // Check for SROA candidates in comparisons.
- if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
- if (DoNotDisable) {
- onAggregateSROAUse(SROAArg);
- return true;
- }
- disableSROA(SROAArg);
- }
- return false;
- }
+ bool ComputeFullInlineCost;
bool IsCallerRecursive = false;
bool IsRecursiveCall = false;
@@ -216,11 +149,20 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool HasUninlineableIntrinsic = false;
bool InitsVargArgs = false;
+ /// Attempt to evaluate indirect calls to boost their inline cost.
+ bool BoostIndirectCalls;
+
/// Number of bytes allocated statically by the callee.
uint64_t AllocatedSize = 0;
unsigned NumInstructions = 0;
unsigned NumVectorInstructions = 0;
+ /// Bonus to be applied when the percentage of vector instructions in the
+ /// callee is high (see more details in updateThreshold).
+ int VectorBonus = 0;
+ /// Bonus to be applied when the callee has only one reachable basic block.
+ int SingleBBBonus = 0;
+
/// While we walk the potentially-inlined instructions, we build up and
/// maintain a mapping of simplified values specific to this callsite. The
/// idea is to propagate any special information we have about arguments to
@@ -232,10 +174,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Keep track of the values which map back (through function arguments) to
/// allocas on the caller stack which could be simplified through SROA.
- /// We can disable an association (because for some reason the SROA opportunity
- /// is lost) by setting the value to nullptr. We don't delete because we still
- /// want isAllocaDerivedArg to function correctly.
- DenseMap<Value *, AllocaInst *> SROAArgValues;
+ DenseMap<Value *, Value *> SROAArgValues;
+
+ /// The mapping of caller Alloca values to their accumulated cost savings. If
+ /// we have to disable SROA for one of the allocas, this tells us how much
+ /// cost must be added.
+ DenseMap<Value *, int> SROAArgCosts;
/// Keep track of values which map to a pointer base and constant offset.
DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
@@ -252,18 +196,17 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// loads.
bool EnableLoadElimination;
SmallPtrSet<Value *, 16> LoadAddrSet;
-
- AllocaInst *getSROAArgForValueOrNull(Value *V) const {
- auto It = SROAArgValues.find(V);
- if (It == SROAArgValues.end())
- return nullptr;
- return It->second;
- }
+ int LoadEliminationCost = 0;
// Custom simplification helper routines.
bool isAllocaDerivedArg(Value *V);
+ bool lookupSROAArgAndCost(Value *V, Value *&Arg,
+ DenseMap<Value *, int>::iterator &CostIt);
+ void disableSROA(DenseMap<Value *, int>::iterator CostIt);
void disableSROA(Value *V);
void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
+ void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+ int InstructionCost);
void disableLoadElimination();
bool isGEPFree(GetElementPtrInst &GEP);
bool canFoldInboundsGEP(GetElementPtrInst &I);
@@ -284,13 +227,32 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// inlined through this particular callsite.
bool isKnownNonNullInCallee(Value *V);
+ /// Update Threshold based on callsite properties such as callee
+ /// attributes and callee hotness for PGO builds. The Callee is explicitly
+ /// passed to support analyzing indirect calls whose target is inferred by
+ /// analysis.
+ void updateThreshold(CallBase &Call, Function &Callee);
+
/// Return true if size growth is allowed when inlining the callee at \p Call.
bool allowSizeGrowth(CallBase &Call);
+ /// Return true if \p Call is a cold callsite.
+ bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
+
+ /// Return a higher threshold if \p Call is a hot callsite.
+ Optional<int> getHotCallSiteThreshold(CallBase &Call,
+ BlockFrequencyInfo *CallerBFI);
+
// Custom analysis routines.
InlineResult analyzeBlock(BasicBlock *BB,
SmallPtrSetImpl<const Value *> &EphValues);
+ /// Handle a capped 'int' increment for Cost.
+ void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
+ assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
+ Cost = (int)std::min(UpperBound, Cost + Inc);
+ }
+
// Disable several entry points to the visitor so we don't accidentally use
// them by declaring but not defining them here.
void visit(Module *);
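The addCost helper restored above clamps the running Cost at a caller-supplied bound so the accounting saturates instead of overflowing int. A standalone illustration of that behavior (assumed values, mirroring the helper rather than reusing LLVM headers):

    #include <algorithm>
    #include <cassert>
    #include <climits>
    #include <cstdint>
    #include <iostream>

    static int Cost = 0;

    // Capped increment: the addition happens in 64 bits, then the result
    // is clamped to UpperBound, so Cost can never pass the bound.
    static void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
      assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
      Cost = (int)std::min(UpperBound, Cost + Inc);
    }

    int main() {
      Cost = INT_MAX - 10;
      addCost(100); // would overflow if added naively in 32 bits
      std::cout << (Cost == INT_MAX) << "\n"; // 1: clamped, not wrapped
    }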
@@ -336,13 +298,20 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
- Function &Callee, CallBase &Call)
+ Function &Callee, CallBase &Call, const InlineParams &Params,
+ bool BoostIndirect = true)
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
- CandidateCall(Call), EnableLoadElimination(true) {}
+ CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
+ ComputeFullInlineCost(OptComputeFullInlineCost ||
+ Params.ComputeFullInlineCost || ORE),
+ BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {}
InlineResult analyze();
+ int getThreshold() { return Threshold; }
+ int getCost() { return Cost; }
+
// Keep a bunch of stats about the cost savings found so we can print them
// out when debugging.
unsigned NumConstantArgs = 0;
@@ -351,283 +320,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
unsigned NumConstantPtrCmps = 0;
unsigned NumConstantPtrDiffs = 0;
unsigned NumInstructionsSimplified = 0;
-
- void dump();
-};
-
-/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
-/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
-class InlineCostCallAnalyzer final : public CallAnalyzer {
- const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
- const bool ComputeFullInlineCost;
- int LoadEliminationCost = 0;
- /// Bonus to be applied when the percentage of vector instructions in the
- /// callee is high (see more details in updateThreshold).
- int VectorBonus = 0;
- /// Bonus to be applied when the callee has only one reachable basic block.
- int SingleBBBonus = 0;
-
- /// Tunable parameters that control the analysis.
- const InlineParams &Params;
-
- /// Upper bound for the inlining cost. Bonuses are applied to account
- /// for speculative "expected profit" of the inlining decision.
- int Threshold = 0;
-
- /// Attempt to evaluate indirect calls to boost their inline cost.
- const bool BoostIndirectCalls;
-
- /// Inlining cost measured in abstract units, accounts for all the
- /// instructions expected to be executed for a given function invocation.
- /// Instructions that are statically proven to be dead based on call-site
- /// arguments are not counted here.
- int Cost = 0;
-
- bool SingleBB = true;
-
unsigned SROACostSavings = 0;
unsigned SROACostSavingsLost = 0;
- /// The mapping of caller Alloca values to their accumulated cost savings. If
- /// we have to disable SROA for one of the allocas, this tells us how much
- /// cost must be added.
- DenseMap<AllocaInst *, int> SROAArgCosts;
-
- /// Return true if \p Call is a cold callsite.
- bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
-
- /// Update Threshold based on callsite properties such as callee
- /// attributes and callee hotness for PGO builds. The Callee is explicitly
- /// passed to support analyzing indirect calls whose target is inferred by
- /// analysis.
- void updateThreshold(CallBase &Call, Function &Callee);
- /// Return a higher threshold if \p Call is a hot callsite.
- Optional<int> getHotCallSiteThreshold(CallBase &Call,
- BlockFrequencyInfo *CallerBFI);
-
- /// Handle a capped 'int' increment for Cost.
- void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
- assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
- Cost = (int)std::min(UpperBound, Cost + Inc);
- }
-
- void onDisableSROA(AllocaInst *Arg) override {
- auto CostIt = SROAArgCosts.find(Arg);
- if (CostIt == SROAArgCosts.end())
- return;
- addCost(CostIt->second);
- SROACostSavings -= CostIt->second;
- SROACostSavingsLost += CostIt->second;
- SROAArgCosts.erase(CostIt);
- }
-
- void onDisableLoadElimination() override {
- addCost(LoadEliminationCost);
- LoadEliminationCost = 0;
- }
- void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
- void onCallArgumentSetup(const CallBase &Call) override {
- // Pay the price of the argument setup. We account for the average 1
- // instruction per call argument setup here.
- addCost(Call.arg_size() * InlineConstants::InstrCost);
- }
- void onLoadRelativeIntrinsic() override {
- // This is normally lowered to 4 LLVM instructions.
- addCost(3 * InlineConstants::InstrCost);
- }
- void onLoweredCall(Function *F, CallBase &Call,
- bool IsIndirectCall) override {
- // We account for the average 1 instruction per call argument setup here.
- addCost(Call.arg_size() * InlineConstants::InstrCost);
-
- // If we have a constant that we are calling as a function, we can peer
- // through it and see the function target. This happens not infrequently
- // during devirtualization and so we want to give it a hefty bonus for
- // inlining, but cap that bonus in the event that inlining wouldn't pan out.
- // Pretend to inline the function, with a custom threshold.
- if (IsIndirectCall && BoostIndirectCalls) {
- auto IndirectCallParams = Params;
- IndirectCallParams.DefaultThreshold =
- InlineConstants::IndirectCallThreshold;
- /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
- /// to instantiate the derived class.
- InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
- Call, IndirectCallParams, false);
- if (CA.analyze()) {
- // We were able to inline the indirect call! Subtract the cost from the
- // threshold to get the bonus we want to apply, but don't go below zero.
- Cost -= std::max(0, CA.getThreshold() - CA.getCost());
- }
- } else
- // Otherwise simply add the cost for merely making the call.
- addCost(InlineConstants::CallPenalty);
- }
-
- void onFinalizeSwitch(unsigned JumpTableSize,
- unsigned NumCaseCluster) override {
- // If suitable for a jump table, consider the cost for the table size and
- // branch to destination.
- // Maximum valid cost increased in this function.
- if (JumpTableSize) {
- int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
- 4 * InlineConstants::InstrCost;
-
- addCost(JTCost, (int64_t)CostUpperBound);
- return;
- }
- // Considering forming a binary search, we should find the number of nodes
- // which is the same as the number of comparisons when lowered. For a given
- // number of clusters, n, we can define a recursive function, f(n), to find
- // the number of nodes in the tree. The recursion is:
- // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
- // and f(n) = n, when n <= 3.
- // This will lead to a binary tree where the leaf should be either f(2) or
- // f(3) when n > 3. So, the number of comparisons from leaves should be n,
- // while the number from non-leaf nodes should be:
- // 2^(log2(n) - 1) - 1
- // = 2^log2(n) * 2^-1 - 1
- // = n / 2 - 1.
- // Considering comparisons from leaf and non-leaf nodes, we can estimate the
- // number of comparisons in a simple closed form:
- // n + n / 2 - 1 = n * 3 / 2 - 1
- if (NumCaseCluster <= 3) {
- // Suppose a comparison includes one compare and one conditional branch.
- addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
- return;
- }
-
- int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
- int64_t SwitchCost =
- ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
-
- addCost(SwitchCost, (int64_t)CostUpperBound);
- }
- void onCommonInstructionSimplification() override {
- addCost(InlineConstants::InstrCost);
- }
-
- void onInitializeSROAArg(AllocaInst *Arg) override { SROAArgCosts[Arg] = 0; }
-
- void onAggregateSROAUse(AllocaInst *SROAArg) override {
- auto CostIt = SROAArgCosts.find(SROAArg);
- CostIt->second += InlineConstants::InstrCost;
- SROACostSavings += InlineConstants::InstrCost;
- }
-
- virtual void onBlockAnalyzed(const BasicBlock *BB) override {
- auto *TI = BB->getTerminator();
- // If we had any successors at this point, then post-inlining is likely to
- // have them as well. Note that we assume any basic blocks which existed
- // due to branches or switches which folded above will also fold after
- // inlining.
- if (SingleBB && TI->getNumSuccessors() > 1) {
- // Take off the bonus we applied to the threshold.
- Threshold -= SingleBBBonus;
- SingleBB = false;
- }
- }
- virtual InlineResult finalizeAnalysis() override {
- // Loops generally act a lot like calls in that they act like barriers to
- // movement, require a certain amount of setup, etc. So when optimising for
- // size, we penalise any call sites that perform loops. We do this after all
- // other costs here, so will likely only be dealing with relatively small
- // functions (and hence DT and LI will hopefully be cheap).
- auto *Caller = CandidateCall.getFunction();
- if (Caller->hasMinSize()) {
- DominatorTree DT(F);
- LoopInfo LI(DT);
- int NumLoops = 0;
- for (Loop *L : LI) {
- // Ignore loops that will not be executed
- if (DeadBlocks.count(L->getHeader()))
- continue;
- NumLoops++;
- }
- addCost(NumLoops * InlineConstants::CallPenalty);
- }
-
- // We applied the maximum possible vector bonus at the beginning. Now,
- // subtract the excess bonus, if any, from the Threshold before
- // comparing against Cost.
- if (NumVectorInstructions <= NumInstructions / 10)
- Threshold -= VectorBonus;
- else if (NumVectorInstructions <= NumInstructions / 2)
- Threshold -= VectorBonus / 2;
-
- return Cost < std::max(1, Threshold);
- }
- virtual bool shouldStop() override {
- // Bail out the moment we cross the threshold. This means we'll under-count
- // the cost, but only when undercounting doesn't matter.
- return Cost >= Threshold && !ComputeFullInlineCost;
- }
-
- virtual void onLoadEliminationOpportunity() {
- LoadEliminationCost += InlineConstants::InstrCost;
- }
-
- InlineResult onAnalysisStart() override {
- // Perform some tweaks to the cost and threshold based on the direct
- // callsite information.
-
- // We want to more aggressively inline vector-dense kernels, so up the
- // threshold, and we'll lower it if the % of vector instructions gets too
- // low. Note that these bonuses are somewhat arbitrary and evolved over
- // time by accident as much as because they are principled bonuses.
- //
- // FIXME: It would be nice to remove all such bonuses. At least it would be
- // nice to base the bonus values on something more scientific.
- assert(NumInstructions == 0);
- assert(NumVectorInstructions == 0);
-
- // Update the threshold based on callsite properties
- updateThreshold(CandidateCall, F);
-
- // While Threshold depends on command-line options that can take negative
- // values, we want to enforce the invariant that the computed threshold and
- // bonuses are non-negative.
- assert(Threshold >= 0);
- assert(SingleBBBonus >= 0);
- assert(VectorBonus >= 0);
-
- // Speculatively apply all possible bonuses to Threshold. If cost exceeds
- // this Threshold any time, and cost cannot decrease, we can stop processing
- // the rest of the function body.
- Threshold += (SingleBBBonus + VectorBonus);
-
- // Give out bonuses for the callsite, as the instructions setting them up
- // will be gone after inlining.
- addCost(-getCallsiteCost(this->CandidateCall, DL));
-
- // If this function uses the coldcc calling convention, prefer not to inline
- // it.
- if (F.getCallingConv() == CallingConv::Cold)
- Cost += InlineConstants::ColdccPenalty;
-
- // Check if we're done. This can happen due to bonuses and penalties.
- if (Cost >= Threshold && !ComputeFullInlineCost)
- return "high cost";
-
- return true;
- }
-
-public:
- InlineCostCallAnalyzer(
- const TargetTransformInfo &TTI,
- std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
- ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
- CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
- : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
- ComputeFullInlineCost(OptComputeFullInlineCost ||
- Params.ComputeFullInlineCost || ORE),
- Params(Params), Threshold(Params.DefaultThreshold),
- BoostIndirectCalls(BoostIndirect) {}
void dump();
-
- int getThreshold() { return Threshold; }
- int getCost() { return Cost; }
};
+
} // namespace
/// Test whether the given value is an Alloca-derived function argument.
@@ -635,22 +333,55 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
return SROAArgValues.count(V);
}
+/// Lookup the SROA-candidate argument and cost iterator which V maps to.
+/// Returns false if V does not map to a SROA-candidate.
+bool CallAnalyzer::lookupSROAArgAndCost(
+ Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
+ if (SROAArgValues.empty() || SROAArgCosts.empty())
+ return false;
+
+ DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
+ if (ArgIt == SROAArgValues.end())
+ return false;
+
+ Arg = ArgIt->second;
+ CostIt = SROAArgCosts.find(Arg);
+ return CostIt != SROAArgCosts.end();
+}
+
+/// Disable SROA for the candidate marked by this cost iterator.
+///
+/// This marks the candidate as no longer viable for SROA, and adds the cost
+/// savings associated with it back into the inline cost measurement.
+void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
+ // If we're no longer able to perform SROA we need to undo its cost savings
+ // and prevent subsequent analysis.
+ addCost(CostIt->second);
+ SROACostSavings -= CostIt->second;
+ SROACostSavingsLost += CostIt->second;
+ SROAArgCosts.erase(CostIt);
+ disableLoadElimination();
+}
+
/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
- auto It = SROAArgValues.find(V);
- if (It == SROAArgValues.end())
- return;
- auto *SROAArg = It->second;
- if (!SROAArg)
- return;
- It->second = nullptr;
- onDisableSROA(SROAArg);
- disableLoadElimination();
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(V, SROAArg, CostIt))
+ disableSROA(CostIt);
+}
+
+/// Accumulate the given cost for a particular SROA candidate.
+void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+ int InstructionCost) {
+ CostIt->second += InstructionCost;
+ SROACostSavings += InstructionCost;
}
void CallAnalyzer::disableLoadElimination() {
if (EnableLoadElimination) {
- onDisableLoadElimination();
+ addCost(LoadEliminationCost);
+ LoadEliminationCost = 0;
EnableLoadElimination = false;
}
}
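The bookkeeping restored in this hunk defers costs optimistically: every SROA-friendly use of a candidate alloca grows a per-alloca savings tally instead of charging Cost, and load elimination does the same with LoadEliminationCost; if the optimization later becomes impossible, the whole tally is charged back at once. A minimal sketch of that invariant (simplified key type and illustrative constants):

    #include <iostream>
    #include <map>

    static int Cost = 0;
    static std::map<int, int> SROAArgCosts; // alloca id -> deferred savings
    static const int InstrCost = 5;         // stand-in for InlineConstants

    static void accumulateSROACost(int Alloca) {
      SROAArgCosts[Alloca] += InstrCost; // the use looks free if SROA fires
    }

    static void disableSROA(int Alloca) {
      auto It = SROAArgCosts.find(Alloca);
      if (It == SROAArgCosts.end())
        return;
      Cost += It->second; // pay back everything we skipped
      SROAArgCosts.erase(It);
    }

    int main() {
      accumulateSROACost(0); // e.g. a simple load of the alloca
      accumulateSROACost(0); // e.g. a simple store to it
      disableSROA(0);        // e.g. a variable GEP killed the candidate
      std::cout << Cost << "\n"; // 10: both deferred costs are now paid
    }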
@@ -822,7 +553,9 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
if (FirstBaseAndOffset.first) {
ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
- if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
SROAArgValues[&I] = SROAArg;
}
@@ -852,8 +585,10 @@ bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
}
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
+ Value *SROAArg;
DenseMap<Value *, int>::iterator CostIt;
- auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
+ bool SROACandidate =
+ lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
// Lambda to check whether a GEP's indices are all constant.
auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
@@ -864,7 +599,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
};
if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
- if (SROAArg)
+ if (SROACandidate)
SROAArgValues[&I] = SROAArg;
// Constant GEPs are modeled as free.
@@ -872,8 +607,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
}
// Variable GEPs will require math and will disable SROA.
- if (SROAArg)
- disableSROA(SROAArg);
+ if (SROACandidate)
+ disableSROA(CostIt);
return isGEPFree(I);
}
@@ -913,7 +648,9 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
ConstantOffsetPtrs[&I] = BaseAndOffset;
// Also look for SROA candidates here.
- if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
SROAArgValues[&I] = SROAArg;
// Bitcasts are always zero cost.
@@ -945,7 +682,9 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
// and so we can just add the integer in here. The only places where SROA is
// preserved either cannot fire on an integer, or won't in-and-of themselves
// disable SROA (ext) w/o some later use that we would see and disable.
- if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
SROAArgValues[&I] = SROAArg;
return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -969,7 +708,9 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
}
// "Propagate" SROA here in the same manner as we do for ptrtoint above.
- if (auto *SROAArg = getSROAArgForValueOrNull(Op))
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
SROAArgValues[&I] = SROAArg;
return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -996,7 +737,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
case Instruction::FPToUI:
case Instruction::FPToSI:
if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
- onCallPenalty();
+ addCost(InlineConstants::CallPenalty);
break;
default:
break;
@@ -1069,8 +810,8 @@ bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
return true;
}
-bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
- BlockFrequencyInfo *CallerBFI) {
+bool CallAnalyzer::isColdCallSite(CallBase &Call,
+ BlockFrequencyInfo *CallerBFI) {
// If global profile summary is available, then callsite's coldness is
// determined based on that.
if (PSI && PSI->hasProfileSummary())
@@ -1093,8 +834,8 @@ bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
}
Optional<int>
-InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
- BlockFrequencyInfo *CallerBFI) {
+CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
+ BlockFrequencyInfo *CallerBFI) {
// If global profile summary is available, then callsite's hotness is
// determined based on that.
@@ -1121,7 +862,7 @@ InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
return None;
}
-void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
+void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
// If no size growth is allowed for this inlining, set Threshold to 0.
if (!allowSizeGrowth(Call)) {
Threshold = 0;
@@ -1283,7 +1024,19 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
: ConstantInt::getFalse(I.getType());
return true;
}
- return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
+ // Finally check for SROA candidates in comparisons.
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+ if (isa<ConstantPointerNull>(I.getOperand(1))) {
+ accumulateSROACost(CostIt, InlineConstants::InstrCost);
+ return true;
+ }
+
+ disableSROA(CostIt);
+ }
+
+ return false;
}
bool CallAnalyzer::visitSub(BinaryOperator &I) {
@@ -1347,7 +1100,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
if (I.getType()->isFloatingPointTy() &&
TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
!match(&I, m_FNeg(m_Value())))
- onCallPenalty();
+ addCost(InlineConstants::CallPenalty);
return false;
}
@@ -1374,15 +1127,23 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
}
bool CallAnalyzer::visitLoad(LoadInst &I) {
- if (handleSROA(I.getPointerOperand(), I.isSimple()))
- return true;
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
+ if (I.isSimple()) {
+ accumulateSROACost(CostIt, InlineConstants::InstrCost);
+ return true;
+ }
+
+ disableSROA(CostIt);
+ }
// If the data is already loaded from this address and hasn't been clobbered
// by any stores or calls, this load is likely to be redundant and can be
// eliminated.
if (EnableLoadElimination &&
!LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
- onLoadEliminationOpportunity();
+ LoadEliminationCost += InlineConstants::InstrCost;
return true;
}
@@ -1390,8 +1151,16 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
}
bool CallAnalyzer::visitStore(StoreInst &I) {
- if (handleSROA(I.getPointerOperand(), I.isSimple()))
- return true;
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
+ if (I.isSimple()) {
+ accumulateSROACost(CostIt, InlineConstants::InstrCost);
+ return true;
+ }
+
+ disableSROA(CostIt);
+ }
// The store can potentially clobber loads and prevent repeated loads from
// being eliminated.
@@ -1481,7 +1250,9 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
// in this inline context. If not, we've done all we can.
F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
if (!F) {
- onCallArgumentSetup(Call);
+ // Pay the price of the argument setup. We account for the average 1
+ // instruction per call argument setup here.
+ addCost(Call.arg_size() * InlineConstants::InstrCost);
if (!Call.onlyReadsMemory())
disableLoadElimination();
@@ -1505,7 +1276,8 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
return Base::visitCallBase(Call);
case Intrinsic::load_relative:
- onLoadRelativeIntrinsic();
+ // This is normally lowered to 4 LLVM instructions.
+ addCost(3 * InlineConstants::InstrCost);
return false;
case Intrinsic::memset:
@@ -1532,7 +1304,28 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
}
if (TTI.isLoweredToCall(F)) {
- onLoweredCall(F, Call, IsIndirectCall);
+ // We account for the average 1 instruction per call argument setup here.
+ addCost(Call.arg_size() * InlineConstants::InstrCost);
+
+ // If we have a constant that we are calling as a function, we can peer
+ // through it and see the function target. This happens not infrequently
+ // during devirtualization and so we want to give it a hefty bonus for
+ // inlining, but cap that bonus in the event that inlining wouldn't pan out.
+ // Pretend to inline the function, with a custom threshold.
+ if (IsIndirectCall && BoostIndirectCalls) {
+ auto IndirectCallParams = Params;
+ IndirectCallParams.DefaultThreshold =
+ InlineConstants::IndirectCallThreshold;
+ CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
+ IndirectCallParams, false);
+ if (CA.analyze()) {
+ // We were able to inline the indirect call! Subtract the cost from the
+ // threshold to get the bonus we want to apply, but don't go below zero.
+ Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+ }
+ } else
+ // Otherwise simply add the cost for merely making the call.
+ addCost(InlineConstants::CallPenalty);
}
if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
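To make the indirect-call bonus arithmetic concrete: the nested analyzer runs with DefaultThreshold set to InlineConstants::IndirectCallThreshold, and on success the outer Cost drops by the nested run's remaining headroom, never by a negative amount. A small numeric sketch (all values assumed for illustration):

    #include <algorithm>
    #include <iostream>

    int main() {
      int Cost = 1200;            // outer running cost at the callsite
      int NestedThreshold = 8000; // IndirectCallThreshold in the nested run
      int NestedCost = 5500;      // what "pretend inlining" the target cost
      // Bonus is the nested headroom, floored at zero.
      Cost -= std::max(0, NestedThreshold - NestedCost);
      std::cout << Cost << "\n"; // 1200 - 2500 = -1300 (Cost may go negative)
    }

If the nested analyze() fails, no bonus applies; otherwise, when the call is direct or boosting is off, only InlineConstants::CallPenalty is added.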
@@ -1588,7 +1381,9 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
- if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
SROAArgValues[&SI] = SROAArg;
return true;
}
@@ -1627,7 +1422,9 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
if (BaseAndOffset.first) {
ConstantOffsetPtrs[&SI] = BaseAndOffset;
- if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
+ Value *SROAArg;
+ DenseMap<Value *, int>::iterator CostIt;
+ if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
SROAArgValues[&SI] = SROAArg;
}
@@ -1655,12 +1452,49 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
// inlining those. It will prevent inlining in cases where the optimization
// does not (yet) fire.
+ // Maximum valid cost increased in this function.
+ int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+
unsigned JumpTableSize = 0;
BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
unsigned NumCaseCluster =
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
- onFinalizeSwitch(JumpTableSize, NumCaseCluster);
+ // If suitable for a jump table, consider the cost for the table size and
+ // branch to destination.
+ if (JumpTableSize) {
+ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+ 4 * InlineConstants::InstrCost;
+
+ addCost(JTCost, (int64_t)CostUpperBound);
+ return false;
+ }
+
+ // Considering forming a binary search, we should find the number of nodes
+ // which is the same as the number of comparisons when lowered. For a given
+ // number of clusters, n, we can define a recursive function, f(n), to find
+ // the number of nodes in the tree. The recursion is:
+ // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
+ // and f(n) = n, when n <= 3.
+ // This will lead to a binary tree where the leaf should be either f(2) or
+ // f(3) when n > 3. So, the number of comparisons from leaves should be n,
+ // while the number from non-leaf nodes should be:
+ // 2^(log2(n) - 1) - 1
+ // = 2^log2(n) * 2^-1 - 1
+ // = n / 2 - 1.
+ // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+ // number of comparisons in a simple closed form:
+ // n + n / 2 - 1 = n * 3 / 2 - 1
+ if (NumCaseCluster <= 3) {
+ // Suppose a comparison includes one compare and one conditional branch.
+ addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
+ return false;
+ }
+
+ int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+ int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+ addCost(SwitchCost, (int64_t)CostUpperBound);
return false;
}
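Working both branches of this model with InlineConstants::InstrCost taken as 5: a 16-entry jump table costs (16 + 4) * 5 = 100, while 8 case clusters lowered as a binary search cost (3 * 8 / 2 - 1) * 2 * 5 = 110. A compact sketch of the same arithmetic (the constant is assumed; the CostUpperBound clamp is omitted for brevity):

    #include <cstdint>
    #include <iostream>

    static const int64_t InstrCost = 5; // stand-in for InlineConstants::InstrCost

    // Switch cost per the model above, without the CostUpperBound clamp.
    static int64_t switchCost(unsigned JumpTableSize, unsigned NumCaseCluster) {
      if (JumpTableSize) // table entries plus ~4 instructions of dispatch
        return (int64_t)JumpTableSize * InstrCost + 4 * InstrCost;
      if (NumCaseCluster <= 3) // one compare + one branch per cluster
        return NumCaseCluster * 2 * InstrCost;
      int64_t ExpectedCompares = 3 * (int64_t)NumCaseCluster / 2 - 1;
      return ExpectedCompares * 2 * InstrCost;
    }

    int main() {
      std::cout << switchCost(16, 0) << "\n"; // 100
      std::cout << switchCost(0, 8) << "\n";  // 110
    }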
@@ -1753,7 +1587,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
if (Base::visit(&*I))
++NumInstructionsSimplified;
else
- onCommonInstructionSimplification();
+ addCost(InlineConstants::InstrCost);
using namespace ore;
// If the visit this instruction detected an uninlinable pattern, abort.
@@ -1798,7 +1632,9 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
return IR;
}
- if (shouldStop())
+ // Check if we've passed the maximum possible threshold so we don't spin in
+ // huge basic blocks that will never inline.
+ if (Cost >= Threshold && !ComputeFullInlineCost)
return false;
}
@@ -1892,9 +1728,46 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
InlineResult CallAnalyzer::analyze() {
++NumCallsAnalyzed;
- auto Result = onAnalysisStart();
- if (!Result)
- return Result;
+ // Perform some tweaks to the cost and threshold based on the direct
+ // callsite information.
+
+ // We want to more aggressively inline vector-dense kernels, so up the
+ // threshold, and we'll lower it if the % of vector instructions gets too
+ // low. Note that these bonuses are somewhat arbitrary and evolved over time
+ // by accident as much as because they are principled bonuses.
+ //
+ // FIXME: It would be nice to remove all such bonuses. At least it would be
+ // nice to base the bonus values on something more scientific.
+ assert(NumInstructions == 0);
+ assert(NumVectorInstructions == 0);
+
+ // Update the threshold based on callsite properties
+ updateThreshold(CandidateCall, F);
+
+ // While Threshold depends on command-line options that can take negative
+ // values, we want to enforce the invariant that the computed threshold and
+ // bonuses are non-negative.
+ assert(Threshold >= 0);
+ assert(SingleBBBonus >= 0);
+ assert(VectorBonus >= 0);
+
+ // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+ // this Threshold at any time, and cost cannot decrease, we can stop processing
+ // the rest of the function body.
+ Threshold += (SingleBBBonus + VectorBonus);
+
+ // Give out bonuses for the callsite, as the instructions setting them up
+ // will be gone after inlining.
+ addCost(-getCallsiteCost(CandidateCall, DL));
+
+ // If this function uses the coldcc calling convention, prefer not to inline
+ // it.
+ if (F.getCallingConv() == CallingConv::Cold)
+ Cost += InlineConstants::ColdccPenalty;
+
+ // Check if we're done. This can happen due to bonuses and penalties.
+ if (Cost >= Threshold && !ComputeFullInlineCost)
+ return "high cost";
if (F.empty())
return true;
@@ -1923,9 +1796,9 @@ InlineResult CallAnalyzer::analyze() {
ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
// We can SROA any pointer arguments derived from alloca instructions.
- if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
- SROAArgValues[&*FAI] = SROAArg;
- onInitializeSROAArg(SROAArg);
+ if (isa<AllocaInst>(PtrArg)) {
+ SROAArgValues[&*FAI] = PtrArg;
+ SROAArgCosts[PtrArg] = 0;
}
}
}
@@ -1951,10 +1824,12 @@ InlineResult CallAnalyzer::analyze() {
BBSetVector;
BBSetVector BBWorklist;
BBWorklist.insert(&F.getEntryBlock());
-
+ bool SingleBB = true;
// Note that we *must not* cache the size, this loop grows the worklist.
for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
- if (shouldStop())
+ // Bail out the moment we cross the threshold. This means we'll under-count
+ // the cost, but only when undercounting doesn't matter.
+ if (Cost >= Threshold && !ComputeFullInlineCost)
break;
BasicBlock *BB = BBWorklist[Idx];
@@ -2014,7 +1889,15 @@ InlineResult CallAnalyzer::analyze() {
++TIdx)
BBWorklist.insert(TI->getSuccessor(TIdx));
- onBlockAnalyzed(BB);
+ // If we had any successors at this point, then post-inlining is likely to
+ // have them as well. Note that we assume any basic blocks which existed
+ // due to branches or switches which folded above will also fold after
+ // inlining.
+ if (SingleBB && TI->getNumSuccessors() > 1) {
+ // Take off the bonus we applied to the threshold.
+ Threshold -= SingleBBBonus;
+ SingleBB = false;
+ }
}
bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
@@ -2025,12 +1908,38 @@ InlineResult CallAnalyzer::analyze() {
if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
return "noduplicate";
- return finalizeAnalysis();
+ // Loops generally act a lot like calls in that they act like barriers to
+ // movement, require a certain amount of setup, etc. So when optimising for
+ // size, we penalise any call sites that perform loops. We do this after all
+ // other costs here, so will likely only be dealing with relatively small
+ // functions (and hence DT and LI will hopefully be cheap).
+ if (Caller->hasMinSize()) {
+ DominatorTree DT(F);
+ LoopInfo LI(DT);
+ int NumLoops = 0;
+ for (Loop *L : LI) {
+ // Ignore loops that will not be executed
+ if (DeadBlocks.count(L->getHeader()))
+ continue;
+ NumLoops++;
+ }
+ addCost(NumLoops * InlineConstants::CallPenalty);
+ }
+
+ // We applied the maximum possible vector bonus at the beginning. Now,
+ // subtract the excess bonus, if any, from the Threshold before
+ // comparing against Cost.
+ if (NumVectorInstructions <= NumInstructions / 10)
+ Threshold -= VectorBonus;
+ else if (NumVectorInstructions <= NumInstructions / 2)
+ Threshold -= VectorBonus / 2;
+
+ return Cost < std::max(1, Threshold);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
-LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
+LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
DEBUG_PRINT_STAT(NumConstantArgs);
DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
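The threshold finalization at the end of analyze() above is easiest to see with numbers. With illustrative values Threshold = 225, SingleBBBonus = 112, and VectorBonus = 144, the speculative threshold starts at 481; if the callee turns out to have 100 instructions of which only 8 are vector (8 <= 100/10), the full vector bonus is taken back and inlining requires Cost < max(1, 337). A sketch of that arithmetic (all constants assumed):

    #include <algorithm>
    #include <iostream>

    int main() {
      int Threshold = 225, SingleBBBonus = 112, VectorBonus = 144;
      Threshold += SingleBBBonus + VectorBonus; // speculative start: 481
      int NumInstructions = 100, NumVectorInstructions = 8, Cost = 300;
      // Take back the vector bonus if the callee isn't vector-dense.
      if (NumVectorInstructions <= NumInstructions / 10)
        Threshold -= VectorBonus;
      else if (NumVectorInstructions <= NumInstructions / 2)
        Threshold -= VectorBonus / 2;
      // The SingleBBBonus would likewise have been removed on the first
      // multi-successor block; assume a single-block callee here.
      std::cout << (Cost < std::max(1, Threshold)) << "\n"; // 1: inline
    }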
@@ -2164,8 +2073,8 @@ InlineCost llvm::getInlineCost(
LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "... (caller:" << Caller->getName() << ")\n");
- InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
- *Callee, Call, Params);
+ CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee,
+ Call, Params);
InlineResult ShouldInline = CA.analyze();
LLVM_DEBUG(CA.dump());
@@ -2212,16 +2121,15 @@ InlineResult llvm::isInlineViable(Function &F) {
switch (Call->getCalledFunction()->getIntrinsicID()) {
default:
break;
- // Disallow inlining of @llvm.icall.branch.funnel because current
- // backend can't separate call targets from call arguments.
+ // Disallow inlining of @llvm.icall.branch.funnel because current
+ // backend can't separate call targets from call arguments.
case llvm::Intrinsic::icall_branch_funnel:
return "disallowed inlining of @llvm.icall.branch.funnel";
- // Disallow inlining functions that call @llvm.localescape. Doing this
- // correctly would require major changes to the inliner.
+ // Disallow inlining functions that call @llvm.localescape. Doing this
+ // correctly would require major changes to the inliner.
case llvm::Intrinsic::localescape:
return "disallowed inlining of @llvm.localescape";
- // Disallow inlining of functions that initialize VarArgs with
- // va_start.
+ // Disallow inlining of functions that initialize VarArgs with va_start.
case llvm::Intrinsic::vastart:
return "contains VarArgs initialized with va_start";
}