[llvm] [FuncSpec] Update function specialization to handle phi-chains (PR #71442)

Mats Petersson via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 7 13:13:13 PST 2023


https://github.com/Leporacanthicus updated https://github.com/llvm/llvm-project/pull/71442

>From 966954ca7bf9cdfeb1307fe0ddbd044300bd3680 Mon Sep 17 00:00:00 2001
From: Mats Petersson <mats.petersson at arm.com>
Date: Mon, 6 Nov 2023 19:14:53 +0000
Subject: [PATCH 1/2] [FuncSpec] Update function specialization to handle
 phi-chains

When using the LLVM flang compiler with alias analysis (AA) enabled,
SPEC2017:548.exchange2_r was running significantly slower than
wihtout the AA.

This was caused by the GVN pass replacing many of the loads in the
pre-AA code with phi-nodes that form a long chain of dependencies,
which the function specialization was unable to follow.

This adds a function to follow phi-nodes when they are a strongly
connected component, with some limitations to avoid spending ages
analysing phi-nodes.

The minimum latency savings also had to be lowered - fewer load
instructions means less saving.

Adding some more prints to help debugging the isProfitable decision.

No significant change in compile time or generated code-size.

Co-authored-by: Alexandros Lamprineas <alexandros.lamprineas at arm.com>
---
 .../Transforms/IPO/FunctionSpecialization.h   |   4 +
 .../Transforms/IPO/FunctionSpecialization.cpp | 133 ++++++++++++++----
 .../discover-strongly-connected-phis.ll       |  87 ++++++++++++
 3 files changed, 198 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/Transforms/FunctionSpecialization/discover-strongly-connected-phis.ll

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index 50f9aae73dc53e2..f35543cb8411b35 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -183,6 +183,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   DenseSet<BasicBlock *> DeadBlocks;
   // PHI nodes we have visited before.
   DenseSet<Instruction *> VisitedPHIs;
+  // PHI nodes forming a strongly connected component.
+  DenseSet<PHINode *> StronglyConnectedPHIs;
   // PHI nodes we have visited once without successfully constant folding them.
   // Once the InstCostVisitor has processed all the specialization arguments,
   // it should be possible to determine whether those PHIs can be folded
@@ -217,6 +219,8 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
   Cost estimateSwitchInst(SwitchInst &I);
   Cost estimateBranchInst(BranchInst &I);
 
+  void discoverStronglyConnectedComponent(PHINode *PN, unsigned Depth);
+
   Constant *visitInstruction(Instruction &I) { return nullptr; }
   Constant *visitPHINode(PHINode &I);
   Constant *visitFreezeInst(FreezeInst &I);
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index b75ca7761a60b62..23e665a1901b5e1 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -39,10 +39,15 @@ static cl::opt<unsigned> MaxClones(
     "The maximum number of clones allowed for a single function "
     "specialization"));
 
+static cl::opt<unsigned> MaxDiscoveryDepth(
+    "funcspec-max-discovery-depth", cl::init(10), cl::Hidden,
+    cl::desc("The maximum recursion depth allowed when searching for strongly "
+             "connected phis"));
+
 static cl::opt<unsigned> MaxIncomingPhiValues(
-    "funcspec-max-incoming-phi-values", cl::init(4), cl::Hidden, cl::desc(
-    "The maximum number of incoming values a PHI node can have to be "
-    "considered during the specialization bonus estimation"));
+    "funcspec-max-incoming-phi-values", cl::init(8), cl::Hidden,
+    cl::desc("The maximum number of incoming values a PHI node can have to be "
+             "considered during the specialization bonus estimation"));
 
 static cl::opt<unsigned> MaxBlockPredecessors(
     "funcspec-max-block-predecessors", cl::init(2), cl::Hidden, cl::desc(
@@ -64,9 +69,9 @@ static cl::opt<unsigned> MinCodeSizeSavings(
     "much percent of the original function size"));
 
 static cl::opt<unsigned> MinLatencySavings(
-    "funcspec-min-latency-savings", cl::init(70), cl::Hidden, cl::desc(
-    "Reject specializations whose latency savings are less than this"
-    "much percent of the original function size"));
+    "funcspec-min-latency-savings", cl::init(45), cl::Hidden,
+    cl::desc("Reject specializations whose latency savings are less than this"
+             "much percent of the original function size"));
 
 static cl::opt<unsigned> MinInliningBonus(
     "funcspec-min-inlining-bonus", cl::init(300), cl::Hidden, cl::desc(
@@ -262,30 +267,86 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
   return estimateBasicBlocks(WorkList);
 }
 
+void InstCostVisitor::discoverStronglyConnectedComponent(PHINode *PN,
+                                                         unsigned Depth) {
+  if (Depth > MaxDiscoveryDepth)
+    return;
+
+  if (PN->getNumIncomingValues() > MaxIncomingPhiValues)
+    return;
+
+  if (!StronglyConnectedPHIs.insert(PN).second)
+    return;
+
+  for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
+    Value *V = PN->getIncomingValue(I);
+    if (auto *Phi = dyn_cast<PHINode>(V)) {
+      if (Phi == PN || DeadBlocks.contains(PN->getIncomingBlock(I)))
+        continue;
+      discoverStronglyConnectedComponent(Phi, Depth + 1);
+    }
+  }
+}
+
 Constant *InstCostVisitor::visitPHINode(PHINode &I) {
   if (I.getNumIncomingValues() > MaxIncomingPhiValues)
     return nullptr;
 
   bool Inserted = VisitedPHIs.insert(&I).second;
   Constant *Const = nullptr;
+  SmallVector<PHINode *, 8> UnknownIncomingValues;
 
-  for (unsigned Idx = 0, E = I.getNumIncomingValues(); Idx != E; ++Idx) {
-    Value *V = I.getIncomingValue(Idx);
-    if (auto *Inst = dyn_cast<Instruction>(V))
-      if (Inst == &I || DeadBlocks.contains(I.getIncomingBlock(Idx)))
-        continue;
-    Constant *C = findConstantFor(V, KnownConstants);
-    if (!C) {
-      if (Inserted)
-        PendingPHIs.push_back(&I);
-      return nullptr;
+  auto CanConstantFoldPhi = [&](PHINode *PN) -> bool {
+    UnknownIncomingValues.clear();
+
+    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
+      Value *V = PN->getIncomingValue(I);
+
+      // Disregard self-references and dead incoming values.
+      if (auto *Inst = dyn_cast<Instruction>(V))
+        if (Inst == PN || DeadBlocks.contains(PN->getIncomingBlock(I)))
+          continue;
+
+      if (Constant *C = findConstantFor(V, KnownConstants)) {
+        if (!Const)
+          Const = C;
+        // Not all incoming values are the same constant. Bail immediately.
+        else if (C != Const)
+          return false;
+      } else if (auto *Phi = dyn_cast<PHINode>(V)) {
+        // It's not a strongly connected phi. Collect it and bail at the end.
+        if (!StronglyConnectedPHIs.contains(Phi))
+          UnknownIncomingValues.push_back(Phi);
+      } else {
+        // We can't reason about anything else.
+        return false;
+      }
+    }
+    return UnknownIncomingValues.empty();
+  };
+
+  if (CanConstantFoldPhi(&I))
+    return Const;
+
+  if (Inserted) {
+    // First time we are seeing this phi. We'll retry later, after all
+    // the constant arguments have been propagated. Bail for now.
+    PendingPHIs.push_back(&I);
+    return nullptr;
+  }
+
+  for (PHINode *Phi : UnknownIncomingValues)
+    discoverStronglyConnectedComponent(Phi, 1);
+
+  bool CannotConstantFoldPhi = false;
+  for (PHINode *Phi : StronglyConnectedPHIs) {
+    if (!CanConstantFoldPhi(Phi)) {
+      CannotConstantFoldPhi = true;
+      break;
     }
-    if (!Const)
-      Const = C;
-    else if (C != Const)
-      return nullptr;
   }
-  return Const;
+  StronglyConnectedPHIs.clear();
+  return CannotConstantFoldPhi ? nullptr : Const;
 }
 
 Constant *InstCostVisitor::visitFreezeInst(FreezeInst &I) {
@@ -809,20 +870,40 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
       auto IsProfitable = [](Bonus &B, unsigned Score, unsigned FuncSize,
                              unsigned FuncGrowth) -> bool {
         // No check required.
-        if (ForceSpecialization)
+        if (ForceSpecialization) {
+          LLVM_DEBUG(dbgs() << "Force is on\n");
           return true;
+        }
         // Minimum inlining bonus.
-        if (Score > MinInliningBonus * FuncSize / 100)
+        if (Score > MinInliningBonus * FuncSize / 100) {
+          LLVM_DEBUG(dbgs()
+                     << "FnSpecialization: Min inliningbous: Score = " << Score
+                     << " > " << MinInliningBonus * FuncSize / 100 << "\n");
           return true;
+        }
         // Minimum codesize savings.
-        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100)
+        if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) {
+          LLVM_DEBUG(dbgs()
+                     << "FnSpecialization: Min CodeSize Saving: CodeSize = "
+                     << B.CodeSize << " > "
+                     << MinCodeSizeSavings * FuncSize / 100 << "\n");
           return false;
+        }
         // Minimum latency savings.
-        if (B.Latency < MinLatencySavings * FuncSize / 100)
+        if (B.Latency < MinLatencySavings * FuncSize / 100) {
+          LLVM_DEBUG(dbgs()
+                     << "FnSpecialization: Min Latency Saving: Latency = "
+                     << B.Latency << " > " << MinLatencySavings * FuncSize / 100
+                     << "\n");
           return false;
+        }
         // Maximum codesize growth.
-        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth)
+        if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) {
+          LLVM_DEBUG(dbgs() << "FnSpecialization: Max Func Growth: CodeSize = "
+                            << FuncGrowth / FuncSize << " > "
+                            << MaxCodeSizeGrowth << "\n");
           return false;
+        }
         return true;
       };
 
diff --git a/llvm/test/Transforms/FunctionSpecialization/discover-strongly-connected-phis.ll b/llvm/test/Transforms/FunctionSpecialization/discover-strongly-connected-phis.ll
new file mode 100644
index 000000000000000..3463ddb6f066de8
--- /dev/null
+++ b/llvm/test/Transforms/FunctionSpecialization/discover-strongly-connected-phis.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+;
+; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -S < %s | FileCheck %s --check-prefix=FUNCSPEC
+; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -funcspec-max-discovery-depth=5 -S < %s | FileCheck %s --check-prefix=NOFUNCSPEC
+
+define i64 @bar(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
+; FUNCSPEC-LABEL: define i64 @bar(
+; FUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
+; FUNCSPEC-NEXT:  entry:
+; FUNCSPEC-NEXT:    [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
+; FUNCSPEC-NEXT:    [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG1:![0-9]+]]
+; FUNCSPEC-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
+; FUNCSPEC-NEXT:    ret i64 [[ADD]]
+;
+; NOFUNCSPEC-LABEL: define i64 @bar(
+; NOFUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
+; NOFUNCSPEC-NEXT:  entry:
+; NOFUNCSPEC-NEXT:    [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
+; NOFUNCSPEC-NEXT:    [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0]]
+; NOFUNCSPEC-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
+; NOFUNCSPEC-NEXT:    ret i64 [[ADD]]
+;
+entry:
+  %f1 = call i64 @foo(i64 3, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10)
+  %f2 = call i64 @foo(i64 4, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10)
+  %add = add i64 %f1, %f2
+  ret i64 %add
+}
+
+define internal i64 @foo(i64 %n, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
+entry:
+  br i1 %c1, label %l1, label %l9
+
+l1:
+  %phi1 = phi i64 [ %n, %entry ], [ %phi2, %l2 ]
+  %add = add i64 %phi1, 1
+  %div = sdiv i64 %add, 2
+  br i1 %c2, label %l1_5, label %exit
+
+l1_5:
+  br i1 %c3, label %l1_75, label %l6
+
+l1_75:
+  br i1 %c4, label %l2, label %l3
+
+l2:
+  %phi2 = phi i64 [ %phi1, %l1_75 ], [ %phi3, %l3 ]
+  br label %l1
+
+l3:
+  %phi3 = phi i64 [ %phi1, %l1_75 ], [ %phi4, %l4 ]
+  br label %l2
+
+l4:
+  %phi4 = phi i64 [ %phi5, %l5 ], [ %phi6, %l6 ]
+  br i1 %c5, label %l3, label %l6
+
+l5:
+  %phi5 = phi i64 [ %phi6, %l6_5 ], [ %phi7, %l7 ]
+  br label %l4
+
+l6:
+  %phi6 = phi i64 [ %phi4, %l4 ], [ %phi1, %l1_5 ]
+  br i1 %c6, label %l4, label %l6_5
+
+l6_5:
+  br i1 %c7, label %l5, label %l8
+
+l7:
+  %phi7 = phi i64 [ %phi9, %l9 ], [ %phi8, %l8 ]
+  br i1 %c8, label %l5, label %l8
+
+l8:
+  %phi8 = phi i64 [ %phi6, %l6_5 ], [ %phi7, %l7 ]
+  br i1 %c9, label %l7, label %l9
+
+l9:
+  %phi9 = phi i64 [ %n, %entry ], [ %phi8, %l8 ]
+  %sub = sub i64 %phi9, 1
+  %mul = mul i64 %sub, 2
+  br i1 %c10, label %l7, label %exit
+
+exit:
+  %res = phi i64 [ %div, %l1 ], [ %mul, %l9]
+  ret i64 %res
+}
+

>From 5f83d4aec7656bdb15e2c0c105d4cf69cba04486 Mon Sep 17 00:00:00 2001
From: Mats Petersson <mats.petersson at arm.com>
Date: Tue, 7 Nov 2023 21:12:33 +0000
Subject: [PATCH 2/2] fix review comments

---
 .../Transforms/IPO/FunctionSpecialization.cpp | 58 +++++++++++++++----
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 23e665a1901b5e1..98f7de3eed98a71 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -267,6 +267,38 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
   return estimateBasicBlocks(WorkList);
 }
 
+// This function is determining if a PHINode is part of a strongly
+// connected component - that is a chain or graph of PHINodes that all
+// link to each other. That means, if the original input to the chain
+// is a constant, all the other values are also that constant.
+//
+// For example:
+//
+// L1:
+// %a = load %0
+//
+// L4:
+// %c = phi [%a, %L1], [%d, %L2]
+//
+// L2:
+// %d = phi [%e, L3] [%c, L4]
+// L3:
+// %e = phi [%c, L4],[%f, L5]
+// L5:
+// %f = phi [%j, L6], [%h, L7]
+// L6:
+// %j = phi [%h:L7, %j, L6]
+// L7:
+// %h = phi [%g:L6, %c:L4]
+//
+// This is only showing the labels and PHINodes, not the branches that
+// choose the different paths.
+// If some input is not part of the graph, then it's not a strongly
+// connected component.
+//
+// A depth limit is used to avoid extreme recurusion.
+// A max number of incoming phi values ensures that expensive searches
+// are avoided.
 void InstCostVisitor::discoverStronglyConnectedComponent(PHINode *PN,
                                                          unsigned Depth) {
   if (Depth > MaxDiscoveryDepth)
@@ -296,6 +328,10 @@ Constant *InstCostVisitor::visitPHINode(PHINode &I) {
   Constant *Const = nullptr;
   SmallVector<PHINode *, 8> UnknownIncomingValues;
 
+  // A PHINode can be constantfolded if all of these conditions are true:
+  // - It has a constant input.
+  // - It is always the SAME constnat.
+  // - It is a strongly connected component (see above)
   auto CanConstantFoldPhi = [&](PHINode *PN) -> bool {
     UnknownIncomingValues.clear();
 
@@ -325,8 +361,10 @@ Constant *InstCostVisitor::visitPHINode(PHINode &I) {
     return UnknownIncomingValues.empty();
   };
 
-  if (CanConstantFoldPhi(&I))
+  if (CanConstantFoldPhi(&I)) {
+    assert(Const && "CanConstantfoldPhi expected to have set Const when returing true"); 
     return Const;
+  }
 
   if (Inserted) {
     // First time we are seeing this phi. We'll retry later, after all
@@ -871,37 +909,37 @@ bool FunctionSpecializer::findSpecializations(Function *F, unsigned FuncSize,
                              unsigned FuncGrowth) -> bool {
         // No check required.
         if (ForceSpecialization) {
-          LLVM_DEBUG(dbgs() << "Force is on\n");
+          LLVM_DEBUG(dbgs() << "FnSpecialization: Force is on\n");
           return true;
         }
         // Minimum inlining bonus.
         if (Score > MinInliningBonus * FuncSize / 100) {
           LLVM_DEBUG(dbgs()
-                     << "FnSpecialization: Min inliningbous: Score = " << Score
-                     << " > " << MinInliningBonus * FuncSize / 100 << "\n");
+                     << "FnSpecialization: Sufficient inlining bonus( " << Score
+                     << " > " << MinInliningBonus * FuncSize / 100 << ")\n");
           return true;
         }
         // Minimum codesize savings.
         if (B.CodeSize < MinCodeSizeSavings * FuncSize / 100) {
           LLVM_DEBUG(dbgs()
-                     << "FnSpecialization: Min CodeSize Saving: CodeSize = "
+                     << "FnSpecialization: Insufficinet CodeSize Saving("
                      << B.CodeSize << " > "
-                     << MinCodeSizeSavings * FuncSize / 100 << "\n");
+                     << MinCodeSizeSavings * FuncSize / 100 << ")\n");
           return false;
         }
         // Minimum latency savings.
         if (B.Latency < MinLatencySavings * FuncSize / 100) {
           LLVM_DEBUG(dbgs()
-                     << "FnSpecialization: Min Latency Saving: Latency = "
+                     << "FnSpecialization: Insufficinet Latency Saving("
                      << B.Latency << " > " << MinLatencySavings * FuncSize / 100
-                     << "\n");
+                     << ")\n");
           return false;
         }
         // Maximum codesize growth.
         if (FuncGrowth / FuncSize > MaxCodeSizeGrowth) {
-          LLVM_DEBUG(dbgs() << "FnSpecialization: Max Func Growth: CodeSize = "
+          LLVM_DEBUG(dbgs() << "FnSpecialization: Func Growth exceeds threshold("
                             << FuncGrowth / FuncSize << " > "
-                            << MaxCodeSizeGrowth << "\n");
+                            << MaxCodeSizeGrowth << ")\n");
           return false;
         }
         return true;



More information about the llvm-commits mailing list