[llvm] r302967 - [PartialInlining] Profile based cost analysis
Xinliang David Li via llvm-commits
llvm-commits at lists.llvm.org
Fri May 12 16:41:44 PDT 2017
Author: davidxl
Date: Fri May 12 18:41:43 2017
New Revision: 302967
URL: http://llvm.org/viewvc/llvm-project?rev=302967&view=rev
Log:
[PartialInlining] Profile based cost analysis
Implemented frequency based cost/saving analysis
and related options.
The pass is now in a state ready to be turned on
in the pipeline (in a follow-up).
Differential Revision: http://reviews.llvm.org/D32783
Added:
llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll
Modified:
llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll
llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll
llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll
llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
Modified: llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp (original)
+++ llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp Fri May 12 18:41:43 2017
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
static cl::opt<bool>
DisablePartialInlining("disable-partial-inlining", cl::init(false),
cl::Hidden, cl::desc("Disable partial ininling"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
static cl::opt<unsigned> MaxNumInlineBlocks(
"max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlinin
"max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
cl::desc("Max number of partial inlining. The default is unlimited"));
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
namespace {
struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
bool run(Module &M);
Function *unswitchFunction(Function *F);
- std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
private:
int NumPartialInlining = 0;
std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
ProfileSummaryInfo *PSI;
- bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+ // Return the frequency of the OutliningCallBB relative to F's entry point.
+ // The result is no larger than 1 and is represented using BP.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+ FunctionOutliningInfo *OI,
+ Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI,
+ BasicBlock *OutliningCallBB);
+
+ // Return true if the callee of CS should be partially inlined with
+ // profit.
+ bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI,
+ BasicBlock *OutliningCallBB,
+ int OutliningCallOverhead,
+ OptimizationRemarkEmitter &ORE);
+
+ // Try to inline DuplicateFunction (cloned from F with call to
+ // the OutlinedFunction) into its callers. Return true
+ // if there is any successful inlining.
+ bool tryPartialInline(Function *DuplicateFunction,
+ Function *F, /*orignal function */
+ FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI);
+
+ // Compute the mapping from use site of DuplicateFunction to the enclosing
+ // BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
bool IsLimitReached() {
return (MaxNumPartialInlining != -1 &&
NumPartialInlining >= MaxNumPartialInlining);
}
+
+ CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the estimated size of the new call sequence in
+ // basic block 'OutliningCallBB';
+ // - The second value is the non-weighted runtime cost for making the call
+ // to the outlined function 'OutlinedFunction', including the additional
+ // setup cost in the outlined function itself;
+ // - The third value is the estimated size of the original code from
+ // function 'F' that is extracted into the outlined function.
+ std::tuple<int, int, int>
+ computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+ Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB);
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+ // approximate both the size and runtime cost (Note that in the current
+ // inline cost analysis, there is no clear distinction there either).
+ int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
};
struct PartialInlinerLegacyPass : public ModulePass {
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo
// Do sanity check of the entries: threre should not
// be any successors (not in the entry set) other than
// {ReturnBlock, NonReturnBlock}
- assert(OutliningInfo->Entries[0] == &F->front());
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
DenseSet<BasicBlock *> Entries;
for (BasicBlock *E : OutliningInfo->Entries)
Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo
return OutliningInfo;
}
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
- OptimizationRemarkEmitter &ORE) {
- // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+ // Now check if any of the entry block has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+ Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+ BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+ auto EntryFreq =
+ BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+ auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(F, OI))
+ return OutlineRegionRelFreq;
+
+ // When profile data is not available, we need to be very
+ // conservative in estimating the overall savings. We need to make sure
+ // the outline region relative frequency is not below the threshold
+ // specified by the option.
+ OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+ BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+ int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
@@ -302,36 +433,166 @@ bool PartialInlinerImpl::shouldPartialIn
if (IC.isAlways()) {
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
- << NV("Callee", Callee)
+ << NV("Callee", F)
<< " should always be fully inlined, not partially");
return false;
}
if (IC.isNever()) {
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)");
return false;
}
if (!IC) {
- ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not partially inlined into "
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", F) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
}
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ auto RelativeFreq =
+ getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+ auto NormWeightedRcost =
+ BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+ // Weighted saving is smaller than weighted cost, return false
+ if (NormWeightedSavings < NormWeightedRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", F) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
- << NV("Callee", Callee) << " can be partially inlined into "
+ << NV("Callee", F) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return true;
}
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+ Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+ BasicBlock *OutliningCallBB) {
+ // First compute the cost of the outlined region 'OI' in the original
+ // function 'F':
+ int OutlinedRegionCost = 0;
+ for (BasicBlock &BB : *F) {
+ if (&BB != OI->ReturnBlock &&
+ // Assuming Entry set is small -- do a linear search here:
+ std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+ OI->Entries.end()) {
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
+ }
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *OutlinedFunction) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+ OutlinedRegionCost);
+}
+
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ if (CurrentCallerBFI)
+ delete CurrentCallerBFI;
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+}
+
Function *PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->hasAddressTaken())
@@ -347,21 +608,21 @@ Function *PartialInlinerImpl::unswitchFu
if (PSI->isFunctionEntryCold(F))
return nullptr;
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- computeOutliningInfo(F);
+ if (F->user_begin() == F->user_end())
+ return nullptr;
- if (!OutliningInfo)
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+ if (!OI)
return nullptr;
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
Function *DuplicateFunction = CloneFunction(F, VMap);
- BasicBlock *NewReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
- BasicBlock *NewNonReturnBlock =
- cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+ BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
DenseSet<BasicBlock *> NewEntries;
- for (BasicBlock *BB : OutliningInfo->Entries) {
+ for (BasicBlock *BB : OI->Entries) {
NewEntries.insert(cast<BasicBlock>(VMap[BB]));
}
@@ -390,7 +651,7 @@ Function *PartialInlinerImpl::unswitchFu
BasicBlock *PreReturn = NewReturnBlock;
// only split block when necessary:
PHINode *FirstPhi = getFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+ unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +669,14 @@ Function *PartialInlinerImpl::unswitchFu
Ins = NewReturnBlock->getFirstNonPHI();
RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
OldPhi->removeIncomingValue(NewE);
}
++I;
}
- for (auto E : OutliningInfo->ReturnBlockPreds) {
+ for (auto E : OI->ReturnBlockPreds) {
BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
}
@@ -443,50 +704,107 @@ Function *PartialInlinerImpl::unswitchFu
BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
// Extract the body of the if.
- Function *ExtractedFunction =
+ Function *OutlinedFunction =
CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
.extractCodeRegion();
- // Inline the top-level if test into all callers.
+ bool AnyInline =
+ tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ DuplicateFunction->replaceAllUsesWith(F);
+ DuplicateFunction->eraseFromParent();
+ if (!AnyInline && OutlinedFunction)
+ OutlinedFunction->eraseFromParent();
+ return OutlinedFunction;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+ Function *F,
+ FunctionOutliningInfo *OI,
+ Function *OutlinedFunction,
+ BlockFrequencyInfo *CalleeBFI) {
+ if (OutlinedFunction == nullptr)
+ return false;
+
+ int NonWeightedRcost;
+ int SizeCost;
+ int OutlinedRegionSizeCost;
+
+ auto OutliningCallBB =
+ getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+ std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+ computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+ // If the call sequence to the outlined function is larger than the original
+ // outlined region size, outlining does not increase the chances of inlining
+ // 'F' (the inliner uses the size increase to model the
+ // cost of inlining a callee).
+ if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(F);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", F)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(F->user_begin() == F->user_end() &&
+ "F's users should all be replaced!");
std::vector<User *> Users(DuplicateFunction->user_begin(),
DuplicateFunction->user_end());
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (F->getEntryCount())
+ computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = F->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+ bool AnyInline = false;
for (User *User : Users) {
- CallSite CS;
- if (CallInst *CI = dyn_cast<CallInst>(User))
- CS = CallSite(CI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
- CS = CallSite(II);
- else
- llvm_unreachable("All uses must be calls");
+ CallSite CS = getCallSite(User);
if (IsLimitReached())
continue;
OptimizationRemarkEmitter ORE(CS.getCaller());
- if (!shouldPartialInline(CS, ORE))
+
+ if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+ NonWeightedRcost, ORE))
continue;
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- BasicBlock *Block = CS.getParent();
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
- << ore::NV("Callee", F) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller()));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", F) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
NumPartialInlining++;
- // update stats
+ // Update the stats
NumPartialInlined++;
}
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
-
+ if (AnyInline && CalleeEntryCount)
+ F->setEntryCount(CalleeEntryCountV);
- return ExtractedFunction;
+ return AnyInline;
}
bool PartialInlinerImpl::run(Module &M) {
Modified: llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll Fri May 12 18:41:43 2017
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
; This test checks to make sure that the CodeExtractor
; properly sets the entry count for the function that is
Modified: llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll Fri May 12 18:41:43 2017
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s
; This test checks to make sure that CodeExtractor updates
; the exit branch probabilities for multiple exit blocks.
Modified: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll Fri May 12 18:41:43 2017
@@ -1,7 +1,7 @@
; RUN: opt < %s -partial-inliner -S | FileCheck %s
; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
; Function Attrs: nounwind uwtable
define i32 @bar(i32 %arg) local_unnamed_addr #0 {
Added: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll?rev=302967&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll (added)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll Fri May 12 18:41:43 2017
@@ -0,0 +1,41 @@
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s
+
+define i32 @Func(i1 %cond, i32* align 4 %align.val) !prof !1 {
+; CHECK: @Func({{.*}}) !prof [[REMAINCOUNT:![0-9]+]]
+entry:
+ br i1 %cond, label %if.then, label %return
+if.then:
+ ; Dummy store to have more than 0 uses
+ store i32 10, i32* %align.val, align 4
+ br label %return
+return: ; preds = %entry
+ ret i32 0
+}
+
+define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{
+entry:
+; CHECK-LABEL: @Caller1
+; CHECK: br
+; CHECK: call void @Func.1_
+; CHECK: br
+; CHECK: call void @Func.1_
+ %val = call i32 @Func(i1 %cond, i32* %align.val)
+ %val2 = call i32 @Func(i1 %cond, i32* %align.val)
+ ret i32 %val
+}
+
+define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{
+entry:
+; CHECK-LABEL: @Caller2
+; CHECK: br
+; CHECK: call void @Func.1_
+ %val = call i32 @Func(i1 %cond, i32* %align.val)
+ ret i32 %val
+}
+
+; CHECK: [[REMAINCOUNT]] = !{!"function_entry_count", i64 150}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 10}
+!3 = !{!"function_entry_count", i64 20}
+
Added: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll?rev=302967&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll (added)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll Fri May 12 18:41:43 2017
@@ -0,0 +1,107 @@
+; The outlined region has high frequency and the outlining
+; call sequence is expensive (input, output, multiple exit etc)
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s
+
+
+; Function Attrs: nounwind
+define i32 @bar_hot_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = icmp slt i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb16, !prof !1
+
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 (...) @foo() #0
+ %tmp3 = tail call i32 (...) @foo() #0
+ %tmp4 = tail call i32 (...) @foo() #0
+ %tmp5 = tail call i32 (...) @foo() #0
+ %tmp6 = tail call i32 (...) @foo() #0
+ %tmp7 = tail call i32 (...) @foo() #0
+ %tmp8 = add nsw i32 %arg, 1
+ %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+ %tmp10 = tail call i32 (...) @foo() #0
+ %tmp11 = icmp eq i32 %tmp10, 0
+ br i1 %tmp11, label %bb12, label %bb16
+
+bb12: ; preds = %bb1
+ %tmp13 = tail call i32 (...) @foo() #0
+ %tmp14 = icmp eq i32 %tmp13, 0
+ %tmp15 = select i1 %tmp14, i32 0, i32 3
+ br label %bb16
+
+bb16: ; preds = %bb12, %bb1, %bb
+ %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+ ret i32 %tmp17
+}
+
+define i32 @bar_cold_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+ %tmp = icmp slt i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb16, !prof !2
+
+bb1: ; preds = %bb
+ %tmp2 = tail call i32 (...) @foo() #0
+ %tmp3 = tail call i32 (...) @foo() #0
+ %tmp4 = tail call i32 (...) @foo() #0
+ %tmp5 = tail call i32 (...) @foo() #0
+ %tmp6 = tail call i32 (...) @foo() #0
+ %tmp7 = tail call i32 (...) @foo() #0
+ %tmp8 = add nsw i32 %arg, 1
+ %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+ %tmp10 = tail call i32 (...) @foo() #0
+ %tmp11 = icmp eq i32 %tmp10, 0
+ br i1 %tmp11, label %bb12, label %bb16
+
+bb12: ; preds = %bb1
+ %tmp13 = tail call i32 (...) @foo() #0
+ %tmp14 = icmp eq i32 %tmp13, 0
+ %tmp15 = select i1 %tmp14, i32 0, i32 3
+ br label %bb16
+
+bb16: ; preds = %bb12, %bb1, %bb
+ %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+ ret i32 %tmp17
+}
+
+; Function Attrs: nounwind
+declare i32 @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i32 @goo(i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller
+; CHECK-NOT: br i1
+; CHECK-NOT: call{{.*}}bar_hot_outline_region.
+; NOCOST-LABEL: @dummy_caller
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_hot_outline_region.
+
+ %tmp = tail call i32 @bar_hot_outline_region(i32 %arg)
+ ret i32 %tmp
+}
+
+define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller2
+; CHECK: br i1
+; CHECK: call{{.*}}bar_cold_outline_region.
+; NOCOST-LABEL: @dummy_caller2
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_cold_outline_region.
+
+ %tmp = tail call i32 @bar_cold_outline_region(i32 %arg)
+ ret i32 %tmp
+}
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301898)"}
+!1 = !{!"branch_weights", i32 2000, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 100}
Modified: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll Fri May 12 18:41:43 2017
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
Modified: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll Fri May 12 18:41:43 2017
@@ -1,7 +1,7 @@
; RUN: opt < %s -partial-inliner -S | FileCheck %s
; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=LIMIT3 %s
; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s
; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s
Modified: llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll Fri May 12 18:41:43 2017
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S | FileCheck %s
define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) {
entry:
Modified: llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll Fri May 12 18:41:43 2017
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner | llc -filetype=null
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
; This testcase checks to see if CodeExtractor properly inherits
; target specific attributes for the extracted function. This can
; cause certain instructions that depend on the attributes to not
More information about the llvm-commits
mailing list