[llvm] [SimplifyCFG] Avoid introducing of too many phi nodes during sinking (PR #102465)

Thu Aug 8 06:31:47 PDT 2024

https://github.com/nikic created https://github.com/llvm/llvm-project/pull/102465

Currently, the sinking heuristic considers sinking profitable if we're introducing at most one phi per sunk instruction.

In practice, this usually works out fairly nicely, because we usually end up trading one original phi for one new phi, with the exception of void instructions where we do have to introduce a phi to allow non-trivial sinking at all.

However, there are pathological cases like the one in `@many_indirect_phis`, where we really do end up adding an extra phi for each sunk instruction, which is non-profitable.

This patch changes the heuristic to keep track of the number of phis that will be needed depending on how many instructions are sunk, and do not allow sinking if the number of phis increases.

Unfortunately, this is not entirely accurate either, because a phi count reduction in one place can effectively cancel out a phi count increase in another. E.g. if we sink some instruction sequence rooted at a void instruction in a way that ultimately requires no phis at all, the bonus this case gets may be consumed by something else (this is the `@store_and_unrelated_many_phi_add2` case).

Fixes https://github.com/llvm/llvm-project/issues/96838.

>From 5f7551b1066f26840126179dffde34f5c343b443 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Wed, 7 Aug 2024 11:47:13 +0200
Subject: [PATCH] [SimplifyCFG] Avoid introducing of too many phi nodes during
 sinking

Currently, the sinking heuristic considers sinking profitable if
we're introducing at most one phi per sunk instruction.

In practice, this usually works out fairly nicely, because we
usually end up trading one original phi for one new phi, with the
exception of void instructions where we do have to introduce a
phi to allow non-trivial sinking at all.

However, there are pathological cases like the one in
`@many_indirect_phis`, where we really do end up adding an extra
phi for each sunk instruction, which is non-profitable.

This patch changes the heuristic to keep track of the number of
phis that will be needed depending on how many instructions are
sunk, and do not allow sinking if the number of phis increases.

Unfortunately, this is not entirely accurate either, because a
phi count reduction in one place can effectively cancel out a phi
count increase in another. E.g. if we sink some instruction
sequence rooted at a void instruction in a way that ultimately
requires no phis at all, the bonus this case gets may be consumed
by something else (this is the `@store_and_unrelated_many_phi_add2`
case).
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 112 ++++--------------
 .../PGOProfile/cspgo_profile_summary.ll       |   8 +-
 .../SimplifyCFG/X86/sink-common-code.ll       |  71 +++++------
 3 files changed, 63 insertions(+), 128 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index ccdfe47ef81e7e..c337e78c8729d6 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1932,7 +1932,8 @@ static bool replacingOperandWithVariableIsCheap(const Instruction *I,
 // PHI node (because an operand varies in each input block), add to PHIOperands.
 static bool canSinkInstructions(
     ArrayRef<Instruction *> Insts,
-    DenseMap<const Use *, SmallVector<Value *, 4>> &PHIOperands) {
+    DenseMap<const Use *, SmallVector<Value *, 4>> &PHIOperands,
+    SmallVectorImpl<int> &NeededPHIs) {
   // Prune out obviously bad instructions to move. Each instruction must have
   // the same number of uses, and we check later that the uses are consistent.
   std::optional<unsigned> NumUses;
@@ -1985,6 +1986,7 @@ static bool canSinkInstructions(
   // then the other phi operands must match the instructions from Insts. This
   // also has to hold true for any phi nodes that would be created as a result
   // of sinking. Both of these cases are represented by PhiOperands.
+  int NeededPHIsAdjustment = 0;
   for (const Use &U : I0->uses()) {
     auto It = PHIOperands.find(&U);
     if (It == PHIOperands.end())
@@ -1992,6 +1994,9 @@ static bool canSinkInstructions(
       return false;
     if (!equal(Insts, It->second))
       return false;
+    // If we sink this instruction, we're not going to need the phi for it
+    // anymore.
+    --NeededPHIsAdjustment;
   }
 
   // Because SROA can't handle speculating stores of selects, try not to sink
@@ -2060,8 +2065,16 @@ static bool canSinkInstructions(
       auto &Ops = PHIOperands[&I0->getOperandUse(OI)];
       for (auto *I : Insts)
         Ops.push_back(I->getOperand(OI));
+      ++NeededPHIsAdjustment;
     }
   }
+
+  // Allow void instructions to introduce phi.
+  if (NumUses == 0 && NeededPHIsAdjustment > 0)
+    --NeededPHIsAdjustment;
+
+  int OldNeededPHIs = NeededPHIs.empty() ? 0 : NeededPHIs.back();
+  NeededPHIs.push_back(OldNeededPHIs + NeededPHIsAdjustment);
   return true;
 }
 
@@ -2306,13 +2319,13 @@ static bool sinkCommonCodeFromPredecessors(BasicBlock *BB,
   }
 
   int ScanIdx = 0;
-  SmallPtrSet<Value*,4> InstructionsToSink;
+  // Number of new PHIs introduced if sinking the first N instructions.
+  SmallVector<int> NeededPHIs;
   LockstepReverseIterator LRI(UnconditionalPreds);
   while (LRI.isValid() &&
-         canSinkInstructions(*LRI, PHIOperands)) {
+         canSinkInstructions(*LRI, PHIOperands, NeededPHIs)) {
     LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
                       << "\n");
-    InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
     ++ScanIdx;
     --LRI;
   }
@@ -2327,82 +2340,12 @@ static bool sinkCommonCodeFromPredecessors(BasicBlock *BB,
     // Okay, we *could* sink last ScanIdx instructions. But how many can we
     // actually sink before encountering instruction that is unprofitable to
     // sink?
-    auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
-      unsigned NumPHIInsts = 0;
-      for (Use &U : (*LRI)[0]->operands()) {
-        auto It = PHIOperands.find(&U);
-        if (It != PHIOperands.end() && !all_of(It->second, [&](Value *V) {
-              return InstructionsToSink.contains(V);
-            })) {
-          ++NumPHIInsts;
-          // FIXME: this check is overly optimistic. We may end up not sinking
-          // said instruction, due to the very same profitability check.
-          // See @creating_too_many_phis in sink-common-code.ll.
-        }
-      }
-      LLVM_DEBUG(dbgs() << "SINK: #phi insts: " << NumPHIInsts << "\n");
-      return NumPHIInsts <= 1;
-    };
-
-    // We've determined that we are going to sink last ScanIdx instructions,
-    // and recorded them in InstructionsToSink. Now, some instructions may be
-    // unprofitable to sink. But that determination depends on the instructions
-    // that we are going to sink.
-
-    // First, forward scan: find the first instruction unprofitable to sink,
-    // recording all the ones that are profitable to sink.
-    // FIXME: would it be better, after we detect that not all are profitable.
-    // to either record the profitable ones, or erase the unprofitable ones?
-    // Maybe we need to choose (at runtime) the one that will touch least
-    // instrs?
-    LRI.reset();
-    int Idx = 0;
-    SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
-    while (Idx < ScanIdx) {
-      if (!ProfitableToSinkInstruction(LRI)) {
-        // Too many PHIs would be created.
-        LLVM_DEBUG(
-            dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+    for (; ScanIdx > 0; --ScanIdx) {
+      // Avoid increasing the overall number of phis. Note that we may still
+      // introduce extras phis when sinking void instructions, as they get a
+      // bonus when producing this count.
+      if (NeededPHIs[ScanIdx - 1] <= 0)
         break;
-      }
-      InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
-      --LRI;
-      ++Idx;
-    }
-
-    // If no instructions can be sunk, early-return.
-    if (Idx == 0)
-      return false;
-
-    // Did we determine that (only) some instructions are unprofitable to sink?
-    if (Idx < ScanIdx) {
-      // Okay, some instructions are unprofitable.
-      ScanIdx = Idx;
-      InstructionsToSink = InstructionsProfitableToSink;
-
-      // But, that may make other instructions unprofitable, too.
-      // So, do a backward scan, do any earlier instructions become
-      // unprofitable?
-      assert(
-          !ProfitableToSinkInstruction(LRI) &&
-          "We already know that the last instruction is unprofitable to sink");
-      ++LRI;
-      --Idx;
-      while (Idx >= 0) {
-        // If we detect that an instruction becomes unprofitable to sink,
-        // all earlier instructions won't be sunk either,
-        // so preemptively keep InstructionsProfitableToSink in sync.
-        // FIXME: is this the most performant approach?
-        for (auto *I : *LRI)
-          InstructionsProfitableToSink.erase(I);
-        if (!ProfitableToSinkInstruction(LRI)) {
-          // Everything starting with this instruction won't be sunk.
-          ScanIdx = Idx;
-          InstructionsToSink = InstructionsProfitableToSink;
-        }
-        ++LRI;
-        --Idx;
-      }
     }
 
     // If no instructions can be sunk, early-return.
@@ -2445,16 +2388,7 @@ static bool sinkCommonCodeFromPredecessors(BasicBlock *BB,
 
   // Now that we've analyzed all potential sinking candidates, perform the
   // actual sink. We iteratively sink the last non-terminator of the source
-  // blocks into their common successor unless doing so would require too
-  // many PHI instructions to be generated (currently only one PHI is allowed
-  // per sunk instruction).
-  //
-  // We can use InstructionsToSink to discount values needing PHI-merging that will
-  // actually be sunk in a later iteration. This allows us to be more
-  // aggressive in what we sink. This does allow a false positive where we
-  // sink presuming a later value will also be sunk, but stop half way through
-  // and never actually sink it which means we produce more PHIs than intended.
-  // This is unlikely in practice though.
+  // blocks into their common successor.
   int SinkIdx = 0;
   for (; SinkIdx != ScanIdx; ++SinkIdx) {
     LLVM_DEBUG(dbgs() << "SINK: Sink: "
diff --git a/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll b/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
index 7040bac6a4c43f..072bdb315fc356 100644
--- a/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
+++ b/llvm/test/Transforms/PGOProfile/cspgo_profile_summary.ll
@@ -66,10 +66,10 @@ for.end:
   ret void
 }
 ; PGOSUMMARY-LABEL: @bar
-; PGOSUMMARY: %even.odd = select i1 %tobool{{[0-9]*}}, ptr @even, ptr @odd
+; PGOSUMMARY: br i1 %tobool, label %if.else, label %if.then
 ; PGOSUMMARY-SAME: !prof ![[BW_PGO_BAR:[0-9]+]]
 ; CSPGOSUMMARY-LABEL: @bar
-; CSPGOSUMMARY: %even.odd = select i1 %tobool{{[0-9]*}}, ptr @even, ptr @odd
+; CSPGOSUMMARY: br i1 %tobool, label %if.else, label %if.then
 ; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR:[0-9]+]]
 
 define internal fastcc i32 @cond(i32 %i) {
@@ -102,9 +102,9 @@ for.end:
   ret void
 }
 ; CSPGOSUMMARY-LABEL: @foo
-; CSPGOSUMMARY: %even.odd.i = select i1 %tobool.i{{[0-9]*}}, ptr @even, ptr @odd
+; CSPGOSUMMARY: br i1 %tobool.i, label %if.else.i, label %if.then.i
 ; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR]]
-; CSPGOSUMMARY: %even.odd.i2 = select i1 %tobool.i{{[0-9]*}}, ptr @odd, ptr @even
+; CSPGOSUMMARY: br i1 %tobool.i, label %if.then.i2, label %if.else.i21
 ; CSPGOSUMMARY-SAME: !prof ![[BW_CSPGO_BAR]]
 
 declare dso_local i32 @bar_m(i32)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll b/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll
index cb2bbb8e0b9317..66fd00aa86aa09 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll
@@ -518,24 +518,26 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !11 = !DILocation(line: 1, column: 14, scope: !8)
 
 
-; The load should be commoned.
+; The load should not be commoned, as this would increase the number of phis.
 define i32 @test15(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, ptr %s) {
 ; CHECK-LABEL: @test15(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[FLAG:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    call void @bar(i32 1)
+; CHECK-NEXT:    [[SV1:%.*]] = load i32, ptr [[S:%.*]], align 4
+; CHECK-NEXT:    [[EXT1:%.*]] = zext i32 [[SV1]] to i64
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 [[EXT1]], 56
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
 ; CHECK:       if.else:
 ; CHECK-NEXT:    call void @bar(i32 4)
-; CHECK-NEXT:    [[GEPB:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[S:%.*]], i32 0, i32 1
+; CHECK-NEXT:    [[GEPB:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[SV2:%.*]] = load i32, ptr [[GEPB]], align 4
+; CHECK-NEXT:    [[EXT2:%.*]] = zext i32 [[SV2]] to i64
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 [[EXT2]], 57
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[GEPB_SINK:%.*]] = phi ptr [ [[GEPB]], [[IF_ELSE]] ], [ [[S]], [[IF_THEN]] ]
-; CHECK-NEXT:    [[DOTSINK:%.*]] = phi i64 [ 57, [[IF_ELSE]] ], [ 56, [[IF_THEN]] ]
-; CHECK-NEXT:    [[SV2:%.*]] = load i32, ptr [[GEPB_SINK]], align 4
-; CHECK-NEXT:    [[EXT2:%.*]] = zext i32 [[SV2]] to i64
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 [[EXT2]], [[DOTSINK]]
+; CHECK-NEXT:    [[P:%.*]] = phi i1 [ [[CMP1]], [[IF_THEN]] ], [ [[CMP2]], [[IF_ELSE]] ]
 ; CHECK-NEXT:    ret i32 1
 ;
 entry:
@@ -2035,22 +2037,25 @@ join:
 
 define i32 @many_indirect_phis(i1 %cond, i32 %a, i32 %b) {
 ; CHECK-LABEL: @many_indirect_phis(
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @dummy()
+; CHECK-NEXT:    [[ADD_0_A:%.*]] = add i32 [[A:%.*]], 1
+; CHECK-NEXT:    [[ADD_1_A:%.*]] = add i32 [[ADD_0_A]], 10
+; CHECK-NEXT:    [[ADD_2_A:%.*]] = add i32 [[ADD_1_A]], 20
+; CHECK-NEXT:    [[ADD_3_A:%.*]] = add i32 [[ADD_2_A]], 30
+; CHECK-NEXT:    [[ADD_4_A:%.*]] = add i32 [[ADD_3_A]], 40
+; CHECK-NEXT:    br label [[JOIN:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[ADD_0_B:%.*]] = add i32 [[B:%.*]], 1
+; CHECK-NEXT:    [[ADD_1_B:%.*]] = add i32 [[ADD_0_B]], 11
+; CHECK-NEXT:    [[ADD_2_B:%.*]] = add i32 [[ADD_1_B]], 21
+; CHECK-NEXT:    [[ADD_3_B:%.*]] = add i32 [[ADD_2_B]], 31
+; CHECK-NEXT:    [[ADD_4_B:%.*]] = add i32 [[ADD_3_B]], 41
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    [[B_SINK:%.*]] = phi i32 [ [[A:%.*]], [[IF]] ], [ [[B:%.*]], [[TMP0:%.*]] ]
-; CHECK-NEXT:    [[DOTSINK3:%.*]] = phi i32 [ 10, [[IF]] ], [ 11, [[TMP0]] ]
-; CHECK-NEXT:    [[DOTSINK2:%.*]] = phi i32 [ 20, [[IF]] ], [ 21, [[TMP0]] ]
-; CHECK-NEXT:    [[DOTSINK1:%.*]] = phi i32 [ 30, [[IF]] ], [ 31, [[TMP0]] ]
-; CHECK-NEXT:    [[DOTSINK:%.*]] = phi i32 [ 40, [[IF]] ], [ 41, [[TMP0]] ]
-; CHECK-NEXT:    [[ADD_0_B:%.*]] = add i32 [[B_SINK]], 1
-; CHECK-NEXT:    [[ADD_1_B:%.*]] = add i32 [[ADD_0_B]], [[DOTSINK3]]
-; CHECK-NEXT:    [[ADD_2_B:%.*]] = add i32 [[ADD_1_B]], [[DOTSINK2]]
-; CHECK-NEXT:    [[ADD_3_B:%.*]] = add i32 [[ADD_2_B]], [[DOTSINK1]]
-; CHECK-NEXT:    [[ADD_4_B:%.*]] = add i32 [[ADD_3_B]], [[DOTSINK]]
-; CHECK-NEXT:    ret i32 [[ADD_4_B]]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[ADD_4_A]], [[IF]] ], [ [[ADD_4_B]], [[ELSE]] ]
+; CHECK-NEXT:    ret i32 [[PHI]]
 ;
   br i1 %cond, label %if, label %else
 
@@ -2111,19 +2116,16 @@ join:
 
 define i32 @store_and_related_many_phi_add(i1 %cond, ptr %p, i32 %a, i32 %b) {
 ; CHECK-LABEL: @store_and_related_many_phi_add(
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @dummy()
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[A:%.*]], 2
-; CHECK-NEXT:    br label [[JOIN:%.*]]
-; CHECK:       else:
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[B:%.*]], 3
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    [[ADD_2_SINK:%.*]] = phi i32 [ [[ADD_2]], [[ELSE]] ], [ [[ADD_1]], [[IF]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ]
-; CHECK-NEXT:    store i32 [[ADD_2_SINK]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    ret i32 [[PHI]]
+; CHECK-NEXT:    [[DOTSINK:%.*]] = phi i32 [ 2, [[IF]] ], [ 3, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[B_SINK:%.*]] = phi i32 [ [[A:%.*]], [[IF]] ], [ [[B:%.*]], [[TMP0]] ]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[B_SINK]], [[DOTSINK]]
+; CHECK-NEXT:    store i32 [[ADD_2]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    ret i32 [[ADD_2]]
 ;
   br i1 %cond, label %if, label %else
 
@@ -2143,21 +2145,20 @@ join:
   ret i32 %phi
 }
 
+; FIXME: It would be better not to sink in this case.
 define i32 @store_and_unrelated_many_phi_add2(i1 %cond, ptr %p, i32 %a, i32 %b) {
 ; CHECK-LABEL: @store_and_unrelated_many_phi_add2(
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF:%.*]], label [[JOIN:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    call void @dummy()
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[A:%.*]], 2
-; CHECK-NEXT:    br label [[JOIN:%.*]]
-; CHECK:       else:
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[B:%.*]], 3
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ]
+; CHECK-NEXT:    [[DOTSINK:%.*]] = phi i32 [ 2, [[IF]] ], [ 3, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[B_SINK:%.*]] = phi i32 [ [[A:%.*]], [[IF]] ], [ [[B:%.*]], [[TMP0]] ]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[B_SINK]], [[DOTSINK]]
 ; CHECK-NEXT:    [[ADD_A_2:%.*]] = add i32 [[A]], 1
 ; CHECK-NEXT:    store i32 [[ADD_A_2]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    ret i32 [[PHI]]
+; CHECK-NEXT:    ret i32 [[ADD_2]]
 ;
   br i1 %cond, label %if, label %else