[llvm] r309441 - [Inliner] Do not apply any bonus for cold callsites.

Easwaran Raman via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 28 14:47:36 PDT 2017


Author: eraman
Date: Fri Jul 28 14:47:36 2017
New Revision: 309441

URL: http://llvm.org/viewvc/llvm-project?rev=309441&view=rev
Log:
[Inliner] Do not apply any bonus for cold callsites.

Summary:
Inlining threshold is increased by application of bonuses when the
callee has a single reachable basic block or is rich in vector
instructions. Similarly, inlining cost is reduced by applying a large
bonus when the last call to a static function is considered for
inlining. This patch disables the application of these bonuses when the
callsite or the callee is cold. The intention here is to prevent a large
cold callsite from being inlined into a non-cold caller, since the resulting
growth could prevent that caller from itself being inlined. This is
especially important when the cold callsite is the last call to a static
function, since the associated bonus is very high.

Reviewers: chandlerc, davidxl

Subscribers: danielcdh, llvm-commits

Differential Revision: https://reviews.llvm.org/D35823

Added:
    llvm/trunk/test/Transforms/Inline/last-call-no-bonus.ll
    llvm/trunk/test/Transforms/Inline/vector-no-bonus.ll
Modified:
    llvm/trunk/lib/Analysis/InlineCost.cpp
    llvm/trunk/test/Transforms/Inline/last-call-bonus.ll

Modified: llvm/trunk/lib/Analysis/InlineCost.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/InlineCost.cpp?rev=309441&r1=309440&r2=309441&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/InlineCost.cpp (original)
+++ llvm/trunk/lib/Analysis/InlineCost.cpp Fri Jul 28 14:47:36 2017
@@ -119,8 +119,9 @@ class CallAnalyzer : public InstVisitor<
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
   unsigned NumInstructions, NumVectorInstructions;
-  int FiftyPercentVectorBonus, TenPercentVectorBonus;
-  int VectorBonus;
+  int VectorBonus, TenPercentVectorBonus;
+  // Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus;
 
   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
@@ -235,11 +236,11 @@ public:
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -678,11 +679,49 @@ void CallAnalyzer::updateThreshold(CallS
     return B ? std::max(A, B.getValue()) : A;
   };
 
+  // Various bonus percentages. These are multiplied by Threshold to get the
+  // bonus values.
+  // SingleBBBonus: This bonus is applied if the callee has a single reachable
+  // basic block at the given callsite context. This is speculatively applied
+  // and withdrawn if more than one basic block is seen.
+  //
+  // Vector bonuses: We want to more aggressively inline vector-dense kernels
+  // and apply this bonus based on the percentage of vector instructions. A
+  // bonus is applied if the vector instructions exceed 50% and half that amount
+  // is applied if it exceeds 10%. Note that these bonuses are somewhat
+  // arbitrary and evolved over time by accident as much as because they are
+  // principled bonuses.
+  // FIXME: It would be nice to base the bonus values on something more
+  // scientific.
+  //
+  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
+  // of the last call to a static function as inlining such functions is
+  // guaranteed to reduce code size.
+  //
+  // These bonus percentages may be set to 0 based on properties of the caller
+  // and the callsite.
+  int SingleBBBonusPercent = 50;
+  int VectorBonusPercent = 150;
+  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
+
+  // Lambda to set all the above bonus and bonus percentages to 0.
+  auto DisallowAllBonuses = [&]() {
+    SingleBBBonusPercent = 0;
+    VectorBonusPercent = 0;
+    LastCallToStaticBonus = 0;
+  };
+
   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
   // and reduce the threshold if the caller has the necessary attribute.
-  if (Caller->optForMinSize())
+  if (Caller->optForMinSize()) {
     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
-  else if (Caller->optForSize())
+    // For minsize, we want to disable the single BB bonus and the vector
+    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
+    // a static function will, at the minimum, eliminate the parameter setup and
+    // call/return instructions.
+    SingleBBBonusPercent = 0;
+    VectorBonusPercent = 0;
+  } else if (Caller->optForSize())
     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
 
   // Adjust the threshold based on inlinehint attribute and profile based
@@ -706,6 +745,11 @@ void CallAnalyzer::updateThreshold(CallS
           Threshold = Params.HotCallSiteThreshold.getValue();
         } else if (isColdCallSite(CS, CallerBFI)) {
           DEBUG(dbgs() << "Cold callsite.\n");
+          // Do not apply bonuses for a cold callsite including the
+          // LastCallToStatic bonus. While this bonus might result in code size
+          // reduction, it can cause the size of a non-cold caller to increase
+          // preventing it from being inlined.
+          DisallowAllBonuses();
           Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
         }
       } else {
@@ -717,6 +761,11 @@ void CallAnalyzer::updateThreshold(CallS
           Threshold = MaxIfValid(Threshold, Params.HintThreshold);
         } else if (PSI->isFunctionEntryCold(&Callee)) {
           DEBUG(dbgs() << "Cold callee.\n");
+          // Do not apply bonuses for a cold callee including the
+          // LastCallToStatic bonus. While this bonus might result in code size
+          // reduction, it can cause the size of a non-cold caller to increase
+          // preventing it from being inlined.
+          DisallowAllBonuses();
           Threshold = MinIfValid(Threshold, Params.ColdThreshold);
         }
       }
@@ -726,6 +775,17 @@ void CallAnalyzer::updateThreshold(CallS
   // Finally, take the target-specific inlining threshold multiplier into
   // account.
   Threshold *= TTI.getInliningThresholdMultiplier();
+
+  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
+  VectorBonus = Threshold * VectorBonusPercent / 100;
+
+  bool OnlyOneCallAndLocalLinkage =
+      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+  // If there is only one call of the function, and it has internal linkage,
+  // the cost of inlining it drops dramatically. It may seem odd to update
+  // Cost in updateThreshold, but the bonus depends on the logic in this method.
+  if (OnlyOneCallAndLocalLinkage)
+    Cost -= LastCallToStaticBonus;
 }
 
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
@@ -1295,31 +1355,15 @@ bool CallAnalyzer::analyzeCall(CallSite
   // Update the threshold based on callsite properties
   updateThreshold(CS, F);
 
-  FiftyPercentVectorBonus = 3 * Threshold / 2;
-  TenPercentVectorBonus = 3 * Threshold / 4;
-
-  // Track whether the post-inlining function would have more than one basic
-  // block. A single basic block is often intended for inlining. Balloon the
-  // threshold by 50% until we pass the single-BB phase.
-  bool SingleBB = true;
-  int SingleBBBonus = Threshold / 2;
-
   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
   // this Threshold any time, and cost cannot decrease, we can stop processing
   // the rest of the function body.
-  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
+  Threshold += (SingleBBBonus + VectorBonus);
 
   // Give out bonuses for the callsite, as the instructions setting them up
   // will be gone after inlining.
   Cost -= getCallsiteCost(CS, DL);
 
-  // If there is only one call of the function, and it has internal linkage,
-  // the cost of inlining it drops dramatically.
-  bool OnlyOneCallAndLocalLinkage =
-      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
-  if (OnlyOneCallAndLocalLinkage)
-    Cost -= InlineConstants::LastCallToStaticBonus;
-
   // If this function uses the coldcc calling convention, prefer not to inline
   // it.
   if (F.getCallingConv() == CallingConv::Cold)
@@ -1387,6 +1431,7 @@ bool CallAnalyzer::analyzeCall(CallSite
       BBSetVector;
   BBSetVector BBWorklist;
   BBWorklist.insert(&F.getEntryBlock());
+  bool SingleBB = true;
   // Note that we *must not* cache the size, this loop grows the worklist.
   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
     // Bail out the moment we cross the threshold. This means we'll under-count
@@ -1451,6 +1496,8 @@ bool CallAnalyzer::analyzeCall(CallSite
     }
   }
 
+  bool OnlyOneCallAndLocalLinkage =
+      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
   // If this is a noduplicate call, we can still inline as long as
   // inlining this would cause the removal of the caller (so the instruction
   // is not actually duplicated, just moved).
@@ -1461,9 +1508,9 @@ bool CallAnalyzer::analyzeCall(CallSite
   // subtract the excess bonus, if any, from the Threshold before
   // comparing against Cost.
   if (NumVectorInstructions <= NumInstructions / 10)
-    Threshold -= FiftyPercentVectorBonus;
+    Threshold -= VectorBonus;
   else if (NumVectorInstructions <= NumInstructions / 2)
-    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
+    Threshold -= VectorBonus/2;
 
   return Cost < std::max(1, Threshold);
 }

Modified: llvm/trunk/test/Transforms/Inline/last-call-bonus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Inline/last-call-bonus.ll?rev=309441&r1=309440&r2=309441&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/Inline/last-call-bonus.ll (original)
+++ llvm/trunk/test/Transforms/Inline/last-call-bonus.ll Fri Jul 28 14:47:36 2017
@@ -10,6 +10,7 @@
 ; preprocess the test.
 
 ; RUN: opt < %s -loop-unroll -inline -unroll-threshold=15000 -inline-threshold=250 -S | FileCheck %s
+; RUN: opt < %s -passes='function(require<opt-remark-emit>,loop(unroll)),require<profile-summary>,cgscc(inline)' -unroll-threshold=15000 -inline-threshold=250 -S | FileCheck %s
 ; CHECK-LABEL: define internal i32 @bar()
 
 define internal i32 @baz() {

Added: llvm/trunk/test/Transforms/Inline/last-call-no-bonus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Inline/last-call-no-bonus.ll?rev=309441&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Inline/last-call-no-bonus.ll (added)
+++ llvm/trunk/test/Transforms/Inline/last-call-no-bonus.ll Fri Jul 28 14:47:36 2017
@@ -0,0 +1,58 @@
+; This code is virtually identical to last-call-bonus.ll, but the callsites
+; to the internal functions are cold, thereby preventing the last call to
+; static bonus from being applied.
+
+; RUN: opt < %s -passes='function(require<opt-remark-emit>,loop(unroll)),require<profile-summary>,cgscc(inline)' -unroll-threshold=15000 -inline-threshold=250 -S | FileCheck %s
+
+; CHECK-LABEL: define internal i32 @baz
+define internal i32 @baz() {
+entry:
+  br label %bb1
+
+bb1:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %bb1 ]
+  call void @extern()
+  %inc = add nsw i32 %ind, 1
+  %cmp = icmp sgt i32 %inc, 510
+  br i1 %cmp, label %ret, label %bb1
+
+ret:
+  ret i32 0
+}
+
+; CHECK-LABEL: define internal i32 @bar
+define internal i32 @bar(i1 %b) {
+entry:
+  br label %bb1
+
+bb1:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %bb1 ]
+  call void @extern()
+  %inc = add nsw i32 %ind, 1
+  %cmp = icmp sgt i32 %inc, 510
+  br i1 %cmp, label %for.exit, label %bb1
+
+for.exit:
+  br i1 %b, label %bb2, label %ret, !prof !0
+bb2:
+; CHECK: call i32 @baz
+  call i32 @baz()
+  br label %ret
+ret:
+  ret i32 0
+}
+; CHECK-LABEL: define i32 @foo
+define i32 @foo(i1 %b) {
+entry:
+  br i1 %b, label %bb1, label %ret, !prof !0
+bb1:
+; CHECK: call i32 @bar
+  call i32 @bar(i1 %b)
+  br label %ret
+ret:
+  ret i32 0
+}
+
+declare void @extern()
+
+!0 = !{!"branch_weights", i32 1, i32 2500}

Added: llvm/trunk/test/Transforms/Inline/vector-no-bonus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/Inline/vector-no-bonus.ll?rev=309441&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/Inline/vector-no-bonus.ll (added)
+++ llvm/trunk/test/Transforms/Inline/vector-no-bonus.ll Fri Jul 28 14:47:36 2017
@@ -0,0 +1,47 @@
+; The code in this test is very similar to vector-bonus.ll except for
+; the fact that the call to bar is cold thereby preventing the application of
+; the vector bonus.
+; RUN: opt < %s -inline -inline-threshold=35  -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=35  -S | FileCheck %s
+
+define i32 @bar(<4 x i32> %v, i32 %i) #0 {
+entry:
+  %cmp = icmp sgt i32 %i, 4
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %mul1 = mul nsw i32 %i, %i
+  br label %return
+
+if.else:                                          ; preds = %entry
+  %add1 = add nsw i32 %i, %i
+  %add2 = add nsw i32 %i, %i
+  %add3 = add nsw i32 %i, %i
+  %add4 = add nsw i32 %i, %i
+  %add5 = add nsw i32 %i, %i
+  %add6 = add nsw i32 %i, %i
+  %vecext = extractelement <4 x i32> %v, i32 0
+  %vecext7 = extractelement <4 x i32> %v, i32 1
+  %add7 = add nsw i32 %vecext, %vecext7
+  br label %return
+
+return:                                           ; preds = %if.else, %if.then
+  %retval.0 = phi i32 [ %mul1, %if.then ], [ %add7, %if.else ]
+  ret i32 %retval.0
+}
+
+define i32 @foo(<4 x i32> %v, i32 %a) #1 {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: call i32 @bar
+; CHECK: ret
+entry:
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %callbb, label %ret
+callbb:
+  %call = call i32 @bar(<4 x i32> %v, i32 %a)
+  br label %ret
+ret:
+  %call1 = phi i32 [%call, %callbb], [0, %entry]
+  ret i32 %call1
+}
+




More information about the llvm-commits mailing list