[llvm] r302967 - [PartialInlining] Profile based cost analysis

Xinliang David Li via llvm-commits llvm-commits at lists.llvm.org
Fri May 12 16:41:44 PDT 2017


Author: davidxl
Date: Fri May 12 18:41:43 2017
New Revision: 302967

URL: http://llvm.org/viewvc/llvm-project?rev=302967&view=rev
Log:
[PartialInlining] Profile based cost analysis

Implemented frequency based cost/saving analysis
and related options.

The pass is now in a state ready to be turned on
in the pipeline (in a follow-up).

Differential Revision: http://reviews.llvm.org/D32783

Added:
    llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
    llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll
Modified:
    llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
    llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
    llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
    llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll
    llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll
    llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
    llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll
    llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll

Modified: llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp (original)
+++ llvm/trunk/lib/Transforms/IPO/PartialInlining.cpp Fri May 12 18:41:43 2017
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
 static cl::opt<bool>
     DisablePartialInlining("disable-partial-inlining", cl::init(false),
                            cl::Hidden, cl::desc("Disable partial ininling"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+                                      cl::init(false), cl::ZeroOrMore,
+                                      cl::ReallyHidden,
+                                      cl::desc("Skip Cost Analysis"));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlinin
     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
     cl::desc("Max number of partial inlining. The default is unlimited"));
 
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces larger value, the BFI value will be used.
+static cl::opt<int>
+    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+                             cl::Hidden, cl::ZeroOrMore,
+                             cl::desc("Relative frequency of outline region to "
+                                      "the entry block"));
+
 namespace {
 
 struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
-  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
   ProfileSummaryInfo *PSI;
 
-  bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  // Return the frequency of the OutlininingBB relative to F's entry point.
+  // The result is no larger than 1 and is represented using BP.
+  // (Note that the outlined region's 'head' block can only have incoming
+  // edges from the guarding entry blocks).
+  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+                                                   FunctionOutliningInfo *OI,
+                                                   Function *DuplicateFunction,
+                                                   BlockFrequencyInfo *BFI,
+                                                   BasicBlock *OutliningCallBB);
+
+  // Return true if the callee of CS should be partially inlined with
+  // profit.
+  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+                           BlockFrequencyInfo *CalleeBFI,
+                           BasicBlock *OutliningCallBB,
+                           int OutliningCallOverhead,
+                           OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F with call to
+  // the OutlinedFunction into its callers. Return true
+  // if there is any successful inlining.
+  bool tryPartialInline(Function *DuplicateFunction,
+                        Function *F, /*orignal function */
+                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
+                        BlockFrequencyInfo *CalleeBFI);
+
+  // Compute the mapping from use site of DuplicationFunction to the enclosing
+  // BB's profile count.
+  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+                                     DenseMap<User *, uint64_t> &SiteCountMap);
+
   bool IsLimitReached() {
     return (MaxNumPartialInlining != -1 &&
             NumPartialInlining >= MaxNumPartialInlining);
   }
+
+  CallSite getCallSite(User *U) {
+    CallSite CS;
+    if (CallInst *CI = dyn_cast<CallInst>(U))
+      CS = CallSite(CI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+      CS = CallSite(II);
+    else
+      llvm_unreachable("All uses must be calls");
+    return CS;
+  }
+
+  CallSite getOneCallSiteTo(Function *F) {
+    User *User = *F->user_begin();
+    return getCallSite(User);
+  }
+
+  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+    CallSite CS = getOneCallSiteTo(F);
+    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+    BasicBlock *Block = CS.getParent();
+    return std::make_tuple(DLoc, Block);
+  }
+
+  // Returns the costs associated with function outlining:
+  // - The first value is the estimated size of the new call sequence in
+  //   basic block 'OutliningCallBB';
+  // - The second value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the additional
+  //   setup cost in the outlined function itself;
+  // - The third value is the estimated size of the original code from
+  //   function 'F' that is extracted into the outlined function.
+  std::tuple<int, int, int>
+  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+                        Function *OutlinedFunction,
+                        BasicBlock *OutliningCallBB);
+  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and runtime cost (Note that in the current
+  // inline cost analysis, there is no clear distinction there either).
+  int computeBBInlineCost(BasicBlock *BB);
+
+  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo
   // Do sanity check of the entries: threre should not
   // be any successors (not in the entry set) other than
   // {ReturnBlock, NonReturnBlock}
-  assert(OutliningInfo->Entries[0] == &F->front());
+  assert(OutliningInfo->Entries[0] == &F->front() &&
+         "Function Entry must be the first in Entries vector");
   DenseSet<BasicBlock *> Entries;
   for (BasicBlock *E : OutliningInfo->Entries)
     Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo
   return OutliningInfo;
 }
 
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
-                                             OptimizationRemarkEmitter &ORE) {
-  // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user annoated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+  if (F->getEntryCount())
+    return true;
+  // Now check if any of the entry block has MD_prof data:
+  for (auto *E : OI->Entries) {
+    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+    if (!BR || BR->isUnconditional())
+      continue;
+    uint64_t T, F;
+    if (BR->extractProfMetadata(T, F))
+      return true;
+  }
+  return false;
+}
+
+BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
+    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
+    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
+
+  auto EntryFreq =
+      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
+  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
+
+  auto OutlineRegionRelFreq =
+      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+                                              EntryFreq.getFrequency());
+
+  if (hasProfileData(F, OI))
+    return OutlineRegionRelFreq;
+
+  // When profile data is not available, we need to be very
+  // conservative in estimating the overall savings. We need to make sure
+  // the outline region relative frequency is not below the threshold
+  // specified by the option.
+  OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+  return OutlineRegionRelFreq;
+}
+
+bool PartialInlinerImpl::shouldPartialInline(
+    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
   using namespace ore;
+  if (SkipCostAnalysis)
+    return true;
+
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
@@ -302,36 +433,166 @@ bool PartialInlinerImpl::shouldPartialIn
 
   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", Callee)
+             << NV("Callee", F)
              << " should always be fully inlined, not partially");
     return false;
   }
 
   if (IC.isNever()) {
     ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+             << NV("Callee", F) << " not partially inlined into "
              << NV("Caller", Caller)
              << " because it should never be inlined (cost=never)");
     return false;
   }
 
   if (!IC) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+             << NV("Callee", F) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
              << NV("Cost", IC.getCost()) << ", threshold="
              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
     return false;
   }
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
+  // The savings of eliminating the call:
+  int NonWeightedSavings = getCallsiteCost(CS, DL);
+  BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+  auto RelativeFreq =
+      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+  auto NormWeightedRcost =
+      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+  // Weighted saving is smaller than weighted cost, return false
+  if (NormWeightedSavings < NormWeightedRcost) {
+    ORE.emit(
+        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+        << NV("Callee", F) << " not partially inlined into "
+        << NV("Caller", Caller) << " runtime overhead (overhead="
+        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << ", savings="
+        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+        << " of making the outlined call is too high");
+
+    return false;
+  }
 
   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
-           << NV("Callee", Callee) << " can be partially inlined into "
+           << NV("Callee", F) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
            << " (threshold="
            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
   return true;
 }
 
+// TODO: Ideally  we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to esimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+  int InlineCost = 0;
+  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(CI), DL);
+      continue;
+    }
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(II), DL);
+      continue;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+      continue;
+    }
+    InlineCost += InlineConstants::InstrCost;
+  }
+  return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+    BasicBlock *OutliningCallBB) {
+  // First compute the cost of the outlined region 'OI' in the original
+  // function 'F':
+  int OutlinedRegionCost = 0;
+  for (BasicBlock &BB : *F) {
+    if (&BB != OI->ReturnBlock &&
+        // Assuming Entry set is small -- do a linear search here:
+        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+            OI->Entries.end()) {
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
+  }
+
+  // Now compute the cost of the call sequence to the outlined function
+  // 'OutlinedFunction' in BB 'OutliningCallBB':
+  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+  // Now compute the cost of the extracted/outlined function itself:
+  int OutlinedFunctionCost = 0;
+  for (BasicBlock &BB : *OutlinedFunction) {
+    OutlinedFunctionCost += computeBBInlineCost(&BB);
+  }
+
+  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+         "Outlined function cost should be no less than the outlined region");
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+                         OutlinedRegionCost);
+}
+
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+    Function *DuplicateFunction,
+    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+  std::vector<User *> Users(DuplicateFunction->user_begin(),
+                            DuplicateFunction->user_end());
+  Function *CurrentCaller = nullptr;
+  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+  auto ComputeCurrBFI = [&,this](Function *Caller) {
+      // For the old pass manager:
+      if (!GetBFI) {
+        if (CurrentCallerBFI)
+          delete CurrentCallerBFI;
+        DominatorTree DT(*Caller);
+        LoopInfo LI(DT);
+        BranchProbabilityInfo BPI(*Caller, LI);
+        CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+      } else {
+        // New pass manager:
+        CurrentCallerBFI = &(*GetBFI)(*Caller);
+      }
+  };
+
+  for (User *User : Users) {
+    CallSite CS = getCallSite(User);
+    Function *Caller = CS.getCaller();
+    if (CurrentCaller != Caller) {
+      CurrentCaller = Caller;
+      ComputeCurrBFI(Caller);
+    } else {
+      assert(CurrentCallerBFI && "CallerBFI is not set");
+    }
+    BasicBlock *CallBB = CS.getInstruction()->getParent();
+    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+    if (Count)
+      CallSiteToProfCountMap[User] = *Count;
+    else
+      CallSiteToProfCountMap[User] = 0;
+  }
+}
+
 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
 
   if (F->hasAddressTaken())
@@ -347,21 +608,21 @@ Function *PartialInlinerImpl::unswitchFu
   if (PSI->isFunctionEntryCold(F))
     return nullptr;
 
-  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
-      computeOutliningInfo(F);
+  if (F->user_begin() == F->user_end())
+    return nullptr;
 
-  if (!OutliningInfo)
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+
+  if (!OI)
     return nullptr;
 
   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
   Function *DuplicateFunction = CloneFunction(F, VMap);
-  BasicBlock *NewReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
-  BasicBlock *NewNonReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
   DenseSet<BasicBlock *> NewEntries;
-  for (BasicBlock *BB : OutliningInfo->Entries) {
+  for (BasicBlock *BB : OI->Entries) {
     NewEntries.insert(cast<BasicBlock>(VMap[BB]));
   }
 
@@ -390,7 +651,7 @@ Function *PartialInlinerImpl::unswitchFu
   BasicBlock *PreReturn = NewReturnBlock;
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
-  unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
   if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
 
     NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +669,14 @@ Function *PartialInlinerImpl::unswitchFu
       Ins = NewReturnBlock->getFirstNonPHI();
 
       RetPhi->addIncoming(&*I, PreReturn);
-      for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+      for (BasicBlock *E : OI->ReturnBlockPreds) {
         BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
         RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
         OldPhi->removeIncomingValue(NewE);
       }
       ++I;
     }
-    for (auto E : OutliningInfo->ReturnBlockPreds) {
+    for (auto E : OI->ReturnBlockPreds) {
       BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
       NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
     }
@@ -443,50 +704,107 @@ Function *PartialInlinerImpl::unswitchFu
   BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
 
   // Extract the body of the if.
-  Function *ExtractedFunction =
+  Function *OutlinedFunction =
       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
           .extractCodeRegion();
 
-  // Inline the top-level if test into all callers.
+  bool AnyInline =
+      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+  // Ditch the duplicate, since we're done with it, and rewrite all remaining
+  // users (function pointers, etc.) back to the original function.
+  DuplicateFunction->replaceAllUsesWith(F);
+  DuplicateFunction->eraseFromParent();
+  if (!AnyInline && OutlinedFunction)
+    OutlinedFunction->eraseFromParent();
+  return OutlinedFunction;
+}
+
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+                                          Function *F,
+                                          FunctionOutliningInfo *OI,
+                                          Function *OutlinedFunction,
+                                          BlockFrequencyInfo *CalleeBFI) {
+  if (OutlinedFunction == nullptr)
+    return false;
+
+  int NonWeightedRcost;
+  int SizeCost;
+  int OutlinedRegionSizeCost;
+
+  auto OutliningCallBB =
+      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+  // If the call sequence to the outlined function is larger than the original
+  // outlined region, outlining does not increase the chances of inlining
+  // 'F' (the inliner uses the size increase to model the cost of inlining
+  // a callee).
+  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(F);
+    DebugLoc DLoc;
+    BasicBlock *Block;
+    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+                                        DLoc, Block)
+             << ore::NV("Function", F)
+             << " not partially inlined into callers (Original Size = "
+             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ", Size of call sequence to outlined function = "
+             << ore::NV("NewSize", SizeCost) << ")");
+    return false;
+  }
+
+  assert(F->user_begin() == F->user_end() &&
+         "F's users should all be replaced!");
   std::vector<User *> Users(DuplicateFunction->user_begin(),
                             DuplicateFunction->user_end());
 
+  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+  if (F->getEntryCount())
+    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+  auto CalleeEntryCount = F->getEntryCount();
+  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+  bool AnyInline = false;
   for (User *User : Users) {
-    CallSite CS;
-    if (CallInst *CI = dyn_cast<CallInst>(User))
-      CS = CallSite(CI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
-      CS = CallSite(II);
-    else
-      llvm_unreachable("All uses must be calls");
+    CallSite CS = getCallSite(User);
 
     if (IsLimitReached())
       continue;
 
     OptimizationRemarkEmitter ORE(CS.getCaller());
-    if (!shouldPartialInline(CS, ORE))
+
+    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+                             NonWeightedRcost, ORE))
       continue;
 
-    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
-    BasicBlock *Block = CS.getParent();
-    ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
-             << ore::NV("Callee", F) << " partially inlined into "
-             << ore::NV("Caller", CS.getCaller()));
+    ORE.emit(
+        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+        << ore::NV("Callee", F) << " partially inlined into "
+        << ore::NV("Caller", CS.getCaller()));
 
     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
     InlineFunction(CS, IFI);
+
+    // Now update the entry count:
+    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+    }
+
+    AnyInline = true;
     NumPartialInlining++;
-    // update stats
+    // Update the stats
     NumPartialInlined++;
   }
 
-  // Ditch the duplicate, since we're done with it, and rewrite all remaining
-  // users (function pointers, etc.) back to the original function.
-  DuplicateFunction->replaceAllUsesWith(F);
-  DuplicateFunction->eraseFromParent();
-
+  if (AnyInline && CalleeEntryCount)
+    F->setEntryCount(CalleeEntryCountV);
 
-  return ExtractedFunction;
+  return AnyInline;
 }
 
 bool PartialInlinerImpl::run(Module &M) {

Modified: llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll Fri May 12 18:41:43 2017
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 
 ; This test checks to make sure that the CodeExtractor
 ;  properly sets the entry count for the function that is

Modified: llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll Fri May 12 18:41:43 2017
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s
 
 ; This test checks to make sure that CodeExtractor updates
 ;  the exit branch probabilities for multiple exit blocks.

Modified: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineAnd.ll Fri May 12 18:41:43 2017
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -partial-inliner -S | FileCheck %s
 ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=LIMIT %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=LIMIT %s
 
 ; Function Attrs: nounwind uwtable
 define i32 @bar(i32 %arg) local_unnamed_addr #0 {

Added: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll?rev=302967&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll (added)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll Fri May 12 18:41:43 2017
@@ -0,0 +1,41 @@
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S  | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S  | FileCheck %s
+
+define i32 @Func(i1 %cond, i32* align 4 %align.val) !prof !1 {
+; CHECK: @Func({{.*}}) !prof [[REMAINCOUNT:![0-9]+]]
+entry:
+  br i1 %cond, label %if.then, label %return
+if.then:
+  ; Dummy store to have more than 0 uses
+  store i32 10, i32* %align.val, align 4
+  br label %return
+return:             ; preds = %entry
+  ret i32 0
+}
+
+define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{
+entry:
+; CHECK-LABEL: @Caller1
+; CHECK: br
+; CHECK: call void @Func.1_ 
+; CHECK: br
+; CHECK: call void @Func.1_ 
+  %val = call i32 @Func(i1 %cond, i32* %align.val)
+  %val2 = call i32 @Func(i1 %cond, i32* %align.val)
+  ret i32 %val
+}
+
+define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{
+entry:
+; CHECK-LABEL: @Caller2
+; CHECK: br
+; CHECK: call void @Func.1_ 
+  %val = call i32 @Func(i1 %cond, i32* %align.val)
+  ret i32 %val
+}
+
+; CHECK: [[REMAINCOUNT]] = !{!"function_entry_count", i64 150}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 10}
+!3 = !{!"function_entry_count", i64 20}
+

Added: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll?rev=302967&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll (added)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineHighCost.ll Fri May 12 18:41:43 2017
@@ -0,0 +1,107 @@
+; The outlined region has high frequency  and the outlining
+; call sequence is expensive (input, output, multiple exit etc)
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=NOCOST %s
+
+
+; Function Attrs: nounwind
+define i32 @bar_hot_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp slt i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb16, !prof !1
+
+bb1:                                              ; preds = %bb
+  %tmp2 = tail call i32 (...) @foo() #0
+  %tmp3 = tail call i32 (...) @foo() #0
+  %tmp4 = tail call i32 (...) @foo() #0
+  %tmp5 = tail call i32 (...) @foo() #0
+  %tmp6 = tail call i32 (...) @foo() #0
+  %tmp7 = tail call i32 (...) @foo() #0
+  %tmp8 = add nsw i32 %arg, 1
+  %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+  %tmp10 = tail call i32 (...) @foo() #0
+  %tmp11 = icmp eq i32 %tmp10, 0
+  br i1 %tmp11, label %bb12, label %bb16
+
+bb12:                                             ; preds = %bb1
+  %tmp13 = tail call i32 (...) @foo() #0
+  %tmp14 = icmp eq i32 %tmp13, 0
+  %tmp15 = select i1 %tmp14, i32 0, i32 3
+  br label %bb16
+
+bb16:                                             ; preds = %bb12, %bb1, %bb
+  %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+  ret i32 %tmp17
+}
+
+define i32 @bar_cold_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp slt i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb16, !prof !2
+
+bb1:                                              ; preds = %bb
+  %tmp2 = tail call i32 (...) @foo() #0
+  %tmp3 = tail call i32 (...) @foo() #0
+  %tmp4 = tail call i32 (...) @foo() #0
+  %tmp5 = tail call i32 (...) @foo() #0
+  %tmp6 = tail call i32 (...) @foo() #0
+  %tmp7 = tail call i32 (...) @foo() #0
+  %tmp8 = add nsw i32 %arg, 1
+  %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+  %tmp10 = tail call i32 (...) @foo() #0
+  %tmp11 = icmp eq i32 %tmp10, 0
+  br i1 %tmp11, label %bb12, label %bb16
+
+bb12:                                             ; preds = %bb1
+  %tmp13 = tail call i32 (...) @foo() #0
+  %tmp14 = icmp eq i32 %tmp13, 0
+  %tmp15 = select i1 %tmp14, i32 0, i32 3
+  br label %bb16
+
+bb16:                                             ; preds = %bb12, %bb1, %bb
+  %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+  ret i32 %tmp17
+}
+
+; Function Attrs: nounwind
+declare i32 @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i32 @goo(i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller
+; CHECK-NOT: br i1
+; CHECK-NOT: call{{.*}}bar_hot_outline_region. 
+; NOCOST-LABEL: @dummy_caller
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_hot_outline_region.
+
+  %tmp = tail call i32 @bar_hot_outline_region(i32 %arg)
+  ret i32 %tmp
+}
+
+define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller2
+; CHECK: br i1
+; CHECK: call{{.*}}bar_cold_outline_region.
+; NOCOST-LABEL: @dummy_caller2
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_cold_outline_region.
+
+  %tmp = tail call i32 @bar_cold_outline_region(i32 %arg)
+  ret i32 %tmp
+}
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301898)"}
+!1 = !{!"branch_weights", i32 2000, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 100}

Modified: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOr.ll Fri May 12 18:41:43 2017
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT %s
 ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=LIMIT %s
 

Modified: llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll Fri May 12 18:41:43 2017
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -partial-inliner -S | FileCheck %s
 ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -S | FileCheck  --check-prefix=LIMIT3 %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis  -S | FileCheck --check-prefix=LIMIT3 %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck  --check-prefix=LIMIT3 %s
 ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s
 ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=LIMIT2 %s
 

Modified: llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/SingleCondition.ll Fri May 12 18:41:43 2017
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S  | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S  | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S  | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S  | FileCheck %s
 
 define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) {
 entry:

Modified: llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll?rev=302967&r1=302966&r2=302967&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll (original)
+++ llvm/trunk/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll Fri May 12 18:41:43 2017
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner | llc -filetype=null
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 ; This testcase checks to see if CodeExtractor properly inherits
 ;   target specific attributes for the extracted function. This can
 ;   cause certain instructions that depend on the attributes to not




More information about the llvm-commits mailing list