[llvm] d1b376f - [FuncSpec] Rework the discardment logic for unprofitable specializations.

Wed Aug 9 02:33:10 PDT 2023

Author: Alexandros Lamprineas
Date: 2023-08-09T10:28:46+01:00
New Revision: d1b376fd7bf73bca557f3c174d4c129ed4d45ae5

URL: https://github.com/llvm/llvm-project/commit/d1b376fd7bf73bca557f3c174d4c129ed4d45ae5
DIFF: https://github.com/llvm/llvm-project/commit/d1b376fd7bf73bca557f3c174d4c129ed4d45ae5.diff

LOG: [FuncSpec] Rework the discardment logic for unprofitable specializations.

Currently we make an arbitrary comparison between codesize and latency
in order to decide whether to keep a specialization or not. Sometimes
the latency savings are biased in favor of loops because of imprecise
block frequencies, so the latency metric contains a lot of noise. This
patch tries to address the problem as follows:

* Reject specializations whose codesize savings are less than X% of
  the original function size.
* Reject specializations whose latency savings are less than Y% of
  the original function size.
* Reject specializations whose inlining bonus is less than Z% of
  the original function size.

This is not perfectly precise, but X, Y and Z are at least
configurable, which allows us to tweak the cost model. Moreover, it lets
us prioritize codesize, which is a less noisy metric than latency.

I am also increasing the minimum size a function should have to be
considered a candidate for specialization. Initially the cost of
a function was calculated as

  CodeMetrics::NumInsts * InlineConstants::getInstrCost()

which was later changed in D150464 to CodeMetrics::NumInsts, since
the metric is supposed to model TargetTransformInfo::TCK_CodeSize.
However, we neglected to adjust MinFunctionSize in that commit.

Differential Revision: https://reviews.llvm.org/D157123

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
    llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
    llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index 8f67365334bf84..f87dccba0eda0a 100644

--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -176,8 +176,7 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
     return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
   }
 
-  Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
-                     Constant *C = nullptr);
+  Bonus getSpecializationBonus(Argument *A, Constant *C);
 
   Bonus getBonusFromPendingPHIs();
 
@@ -187,6 +186,9 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   static bool canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ,
                                     DenseSet<BasicBlock *> &DeadBlocks);
 
+  Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
+                     Constant *C = nullptr);
+
   Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
   Cost estimateSwitchInst(SwitchInst &I);
   Cost estimateBranchInst(BranchInst &I);
@@ -244,10 +246,6 @@ class FunctionSpecializer {
     return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver);
   }
 
-  /// Compute a bonus for replacing argument \p A with constant \p C.
-  Bonus getSpecializationBonus(Argument *A, Constant *C,
-                               InstCostVisitor &Visitor);
-
 private:
   Constant *getPromotableAlloca(AllocaInst *Alloca, CallInst *Call);
 
@@ -268,14 +266,16 @@ class FunctionSpecializer {
 
   /// @brief  Find potential specialization opportunities.
   /// @param F Function to specialize
-  /// @param SpecCost Cost of specializing a function. Final score is benefit
-  /// minus this cost.
+  /// @param FuncSize Size of \p F, measured as its number of instructions.
   /// @param AllSpecs A vector to add potential specializations to.
   /// @param SM  A map for a function's specialisation range
   /// @return True, if any potential specializations were found
-  bool findSpecializations(Function *F, unsigned SpecCost,
+  bool findSpecializations(Function *F, unsigned FuncSize,
                            SmallVectorImpl<Spec> &AllSpecs, SpecMap &SM);
 
+  /// Compute the inlining bonus for replacing argument \p A with constant \p C.
+  unsigned getInliningBonus(Argument *A, Constant *C);
+
   bool isCandidateFunction(Function *F);
 
   /// @brief Create a specialization of \p F and prime the SCCPSolver

diff  --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index dd2751965853b3..1582e871de89a1 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -89,10 +89,25 @@ static cl::opt<unsigned> MaxBlockPredecessors(
     "considered during the estimation of dead code"));
 
 static cl::opt<unsigned> MinFunctionSize(
-    "funcspec-min-function-size", cl::init(100), cl::Hidden, cl::desc(
+    "funcspec-min-function-size", cl::init(300), cl::Hidden, cl::desc(
     "Don't specialize functions that have less than this number of "
     "instructions"));
 
+static cl::opt<unsigned> MinCodeSizeSavings(
+    "funcspec-min-codesize-savings", cl::init(20), cl::Hidden, cl::desc(
+    "Reject specializations whose codesize savings are less than this "
+    "much percent of the original function size"));
+
+static cl::opt<unsigned> MinLatencySavings(
+    "funcspec-min-latency-savings", cl::init(70), cl::Hidden, cl::desc(
+    "Reject specializations whose latency savings are less than this "
+    "much percent of the original function size"));
+
+static cl::opt<unsigned> MinInliningBonus(
+    "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
+    "Reject specializations whose inlining bonus is less than this "
+    "much percent of the original function size"));
+
 static cl::opt<bool> SpecializeOnAddress(
     "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc(
     "Enable function specialization on the address of global values"));
@@ -180,6 +195,22 @@ Bonus InstCostVisitor::getBonusFromPendingPHIs() {
   return B;
 }
 
+/// Compute a bonus for replacing argument \p A with constant \p C.
+Bonus InstCostVisitor::getSpecializationBonus(Argument *A, Constant *C) {
+  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
+                    << C->getNameOrAsOperand() << "\n");
+  Bonus B;
+  for (auto *U : A->users())
+    if (auto *UI = dyn_cast<Instruction>(U))
+      if (isBlockExecutable(UI->getParent()))
+        B += getUserBonus(UI, A, C);
+
+  LLVM_DEBUG(dbgs() << "FnSpecialization:   Accumulated bonus {CodeSize = "
+                    << B.CodeSize << ", Latency = " << B.Latency
+                    << "} for argument " << *A << "\n");
+  return B;
+}
+
 Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
   // We have already propagated a constant for this user.
   if (KnownConstants.contains(User))
@@ -589,15 +620,15 @@ bool FunctionSpecializer::run() {
     int64_t Sz = *Metrics.NumInsts.getValue();
     assert(Sz > 0 && "CodeSize should be positive");
     // It is safe to down cast from int64_t, NumInsts is always positive.
-    unsigned SpecCost = static_cast<unsigned>(Sz);
+    unsigned FuncSize = static_cast<unsigned>(Sz);
 
     LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
-                      << F.getName() << " is " << SpecCost << "\n");
+                      << F.getName() << " is " << FuncSize << "\n");
 
     if (Inserted && Metrics.isRecursive)
       promoteConstantStackValues(&F);
 
-    if (!findSpecializations(&F, SpecCost, AllSpecs, SM)) {
+    if (!findSpecializations(&F, FuncSize, AllSpecs, SM)) {
       LLVM_DEBUG(
           dbgs() << "FnSpecialization: No possible specializations found for "
                  << F.getName() << "\n");
@@ -732,7 +763,7 @@ static Function *cloneCandidateFunction(Function *F) {
   return Clone;
 }
 
-bool FunctionSpecializer::findSpecializations(Function *F, unsigned SpecCost,
+bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                                               SmallVectorImpl<Spec> &AllSpecs,
                                               SpecMap &SM) {
   // A mapping from a specialisation signature to the index of the respective
@@ -799,21 +830,42 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned SpecCost,
     } else {
       // Calculate the specialisation gain.
       Bonus B;
+      unsigned Score = 0;
       InstCostVisitor Visitor = getInstCostVisitorFor(F);
-      for (ArgInfo &A : S.Args)
-        B += getSpecializationBonus(A.Formal, A.Actual, Visitor);
+      for (ArgInfo &A : S.Args) {
+        B += Visitor.getSpecializationBonus(A.Formal, A.Actual);
+        Score += getInliningBonus(A.Formal, A.Actual);
+      }
       B += Visitor.getBonusFromPendingPHIs();
 
-      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization score {CodeSize = "
+
+      LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization bonus {CodeSize = "
                         << B.CodeSize << ", Latency = " << B.Latency
-                        << "}\n");
+                        << ", Inlining = " << Score << "}\n");
+
+      auto IsProfitable = [&FuncSize](Bonus &B, unsigned Score) -> bool {
+        // No check required.
+        if (ForceSpecialization)
+          return true;
+        // Minimum inlining bonus.
+        if (Score > MinInliningBonus * FuncSize / 100)
+          return true;
+        // Minimum codesize savings.
+        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
+          return false;
+        // Minimum latency savings.
+        if (B.Latency < MinLatencySavings * FuncSize / 100)
+          return false;
+        return true;
+      };
 
       // Discard unprofitable specialisations.
-      if (!ForceSpecialization && B.Latency <= SpecCost - B.CodeSize)
+      if (!IsProfitable(B, Score))
         continue;
 
       // Create a new specialisation entry.
-      auto &Spec = AllSpecs.emplace_back(F, S, B.Latency);
+      Score += std::max(B.CodeSize, B.Latency);
+      auto &Spec = AllSpecs.emplace_back(F, S, Score);
       if (CS.getFunction() != F)
         Spec.CallSites.push_back(&CS);
       const unsigned Index = AllSpecs.size() - 1;
@@ -879,31 +931,14 @@ Function *FunctionSpecializer::createSpecialization(Function *F,
   return Clone;
 }
 
-/// Compute a bonus for replacing argument \p A with constant \p C.
-Bonus FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
-                                                 InstCostVisitor &Visitor) {
-  LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
-                    << C->getNameOrAsOperand() << "\n");
-
-  Bonus B;
-  for (auto *U : A->users())
-    if (auto *UI = dyn_cast<Instruction>(U))
-      if (Visitor.isBlockExecutable(UI->getParent()))
-        B += Visitor.getUserBonus(UI, A, C);
-
-  LLVM_DEBUG(dbgs() << "FnSpecialization:   Accumulated bonus {CodeSize = "
-                    << B.CodeSize << ", Latency = " << B.Latency
-                    << "} for argument " << *A << "\n");
-
-  // The below heuristic is only concerned with exposing inlining
-  // opportunities via indirect call promotion. If the argument is not a
-  // (potentially casted) function pointer, give up.
-  //
-  // TODO: Perhaps we should consider checking such inlining opportunities
-  // while traversing the users of the specialization arguments ?
+/// Compute the inlining bonus for replacing argument \p A with constant \p C.
+/// The below heuristic is only concerned with exposing inlining
+/// opportunities via indirect call promotion. If the argument is not a
+/// (potentially casted) function pointer, give up.
+unsigned FunctionSpecializer::getInliningBonus(Argument *A, Constant *C) {
   Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
   if (!CalledFunction)
-    return B;
+    return 0;
 
   // Get TTI for the called function (used for the inline cost).
   auto &CalleeTTI = (GetTTI)(*CalledFunction);
@@ -948,7 +983,7 @@ Bonus FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
                       << " for user " << *U << "\n");
   }
 
-  return B += {0, InliningBonus};
+  return InliningBonus > 0 ? static_cast<unsigned>(InliningBonus) : 0;
 }
 
 /// Determine if it is possible to specialise the function for constant values

diff  --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index 8225d3525ff887..643e8972cd7505 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -168,13 +168,13 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
 
   // mul
   Bonus Ref = getInstCost(Mul);
-  Bonus Test = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor);
+  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
   // and + or + add
   Ref = getInstCost(And) + getInstCost(Or) + getInstCost(Add);
-  Test = Specializer.getSpecializationBonus(F->getArg(1), One, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
@@ -183,7 +183,7 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
         getInstCost(Sdiv, /*SizeOnly =*/ true) +
         getInstCost(BrBB2, /*SizeOnly =*/ true) +
         getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Specializer.getSpecializationBonus(F->getArg(2), One, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 }
@@ -235,13 +235,13 @@ TEST_F(FunctionSpecializationTest, BranchInst) {
 
   // mul
   Bonus Ref = getInstCost(Mul);
-  Bonus Test = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor);
+  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
   // add
   Ref = getInstCost(Add);
-  Test = Specializer.getSpecializationBonus(F->getArg(1), One, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
@@ -252,7 +252,7 @@ TEST_F(FunctionSpecializationTest, BranchInst) {
         getInstCost(Sdiv, /*SizeOnly =*/ true) +
         getInstCost(BrBB2, /*SizeOnly =*/ true) +
         getInstCost(BrLoop, /*SizeOnly =*/ true);
-  Test = Specializer.getSpecializationBonus(F->getArg(2), False, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(2), False);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 }
@@ -301,24 +301,24 @@ TEST_F(FunctionSpecializationTest, Misc) {
 
   // icmp + zext
   Bonus Ref = getInstCost(Icmp) + getInstCost(Zext);
-  Bonus Test = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor);
+  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
   // select
   Ref = getInstCost(Select);
-  Test = Specializer.getSpecializationBonus(F->getArg(1), True, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(1), True);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
   // gep + load + freeze + smax
   Ref = getInstCost(Gep) + getInstCost(Load) + getInstCost(Freeze) +
         getInstCost(Smax);
-  Test = Specializer.getSpecializationBonus(F->getArg(2), GV, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(2), GV);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);
 
-  Test = Specializer.getSpecializationBonus(F->getArg(3), Undef, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(3), Undef);
   EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
 }
 
@@ -369,17 +369,17 @@ TEST_F(FunctionSpecializationTest, PhiNode) {
   Instruction &Icmp = *++BB.begin();
   Instruction &Branch = BB.back();
 
-  Bonus Test = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor);
+  Bonus Test = Visitor.getSpecializationBonus(F->getArg(0), One);
   EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
 
-  Test = Specializer.getSpecializationBonus(F->getArg(1), One, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(1), One);
   EXPECT_TRUE(Test.CodeSize == 0 && Test.Latency == 0);
 
   // switch + phi + br
   Bonus Ref = getInstCost(Switch) +
               getInstCost(PhiCase2, /*SizeOnly =*/ true) +
               getInstCost(BrBB, /*SizeOnly =*/ true);
-  Test = Specializer.getSpecializationBonus(F->getArg(2), One, Visitor);
+  Test = Visitor.getSpecializationBonus(F->getArg(2), One);
   EXPECT_EQ(Test, Ref);
   EXPECT_TRUE(Test.CodeSize > 0 && Test.Latency > 0);