[llvm] b582202 - [SimplifyCFG] 'Fold branch to common dest': don't overestimate the cost

Tue Mar 23 08:30:57 PDT 2021

Author: Roman Lebedev
Date: 2021-03-23T18:30:26+03:00
New Revision: b5822026dd727e23a6ed6270f9844e0021141607

URL: https://github.com/llvm/llvm-project/commit/b5822026dd727e23a6ed6270f9844e0021141607
DIFF: https://github.com/llvm/llvm-project/commit/b5822026dd727e23a6ed6270f9844e0021141607.diff

LOG: [SimplifyCFG] 'Fold branch to common dest': don't overestimate the cost

`FoldBranchToCommonDest()` has a certain budget (`-bonus-inst-threshold=`)
for bonus instruction duplication. And currently it calculates the cost
as-if it will actually duplicate into each predecessor.

But ignoring the budget, it won't always duplicate into each predecessor,
there are some correctness and profitability checks.
So when calculating the cost, we should first check into which blocks
will we *actually* duplicate, and only then use that block count
to do budgeting.

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/SimplifyCFG.cpp
    llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0b1215e597cd..92a16119c767 100644

--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3033,8 +3033,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
 
   BasicBlock *BB = BI->getParent();
 
-  const unsigned PredCount = pred_size(BB);
-
   bool Changed = false;
 
   TargetTransformInfo::TargetCostKind CostKind =
@@ -3047,32 +3045,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
       Cond->getParent() != BB || !Cond->hasOneUse())
     return Changed;
 
-  // Only allow this transformation if computing the condition doesn't involve
-  // too many instructions and these involved instructions can be executed
-  // unconditionally. We denote all involved instructions except the condition
-  // as "bonus instructions", and only allow this transformation when the
-  // number of the bonus instructions we'll need to create when cloning into
-  // each predecessor does not exceed a certain threshold.
-  unsigned NumBonusInsts = 0;
-  for (Instruction &I : *BB) {
-    // Don't check the branch condition comparison itself.
-    if (&I == Cond)
-      continue;
-    // Ignore dbg intrinsics, and the terminator.
-    if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
-      continue;
-    // I must be safe to execute unconditionally.
-    if (!isSafeToSpeculativelyExecute(&I))
-      return Changed;
-
-    // Account for the cost of duplicating this instruction into each
-    // predecessor.
-    NumBonusInsts += PredCount;
-    // Early exits once we reach the limit.
-    if (NumBonusInsts > BonusInstThreshold)
-      return Changed;
-  }
-
   // Cond is known to be a compare or binary operator.  Check to make sure that
   // neither operand is a potentially-trapping constant expression.
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
@@ -3086,6 +3058,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
   if (is_contained(successors(BB), BB))
     return Changed;
 
+  // With which predecessors will we want to deal with?
+  SmallVector<BasicBlock *, 8> Preds;
   for (BasicBlock *PredBlock : predecessors(BB)) {
     BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
 
@@ -3116,6 +3090,40 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
         continue;
     }
 
+    // Ok, we do want to deal with this predecessor. Record it.
+    Preds.emplace_back(PredBlock);
+  }
+
+  const unsigned PredCount = Preds.size();
+  // Only allow this transformation if computing the condition doesn't involve
+  // too many instructions and these involved instructions can be executed
+  // unconditionally. We denote all involved instructions except the condition
+  // as "bonus instructions", and only allow this transformation when the
+  // number of the bonus instructions we'll need to create when cloning into
+  // each predecessor does not exceed a certain threshold.
+  unsigned NumBonusInsts = 0;
+  for (Instruction &I : *BB) {
+    // Don't check the branch condition comparison itself.
+    if (&I == Cond)
+      continue;
+    // Ignore dbg intrinsics, and the terminator.
+    if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
+      continue;
+    // I must be safe to execute unconditionally.
+    if (!isSafeToSpeculativelyExecute(&I))
+      return Changed;
+
+    // Account for the cost of duplicating this instruction into each
+    // predecessor.
+    NumBonusInsts += PredCount;
+    // Early exits once we reach the limit.
+    if (NumBonusInsts > BonusInstThreshold)
+      return Changed;
+  }
+
+  // Ok, we have the budget. Perform the transformation.
+  for (BasicBlock *PredBlock : Preds) {
+    auto *PBI = cast<BranchInst>(PredBlock->getTerminator());
     return performBranchToCommonDestFolding(BI, PBI, DTU, MSSAU, PoisonSafe,
                                             TTI);
   }

diff  --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll
index f0d6f43478da..450ab5fcc799 100644
--- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll
+++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-two-preds-cost.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=THR1 %s
-; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=THR2 %s
+; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=1 | FileCheck --check-prefixes=ALL,THR1 %s
+; RUN: opt < %s -S -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=2 | FileCheck --check-prefixes=ALL,THR2 %s
 
 declare void @sideeffect0()
 declare void @sideeffect1()
@@ -82,50 +82,29 @@ final_right:
 ; But, we aren't going to clone it into one of the predecessors,
 ; because that isn't profitable. So we should not use it in cost calculation.
 define void @two_preds_with_extra_op_and_branchweights(i8 %v0, i8 %v1, i8 %v2, i8 %v3) {
-; THR1-LABEL: @two_preds_with_extra_op_and_branchweights(
-; THR1-NEXT:  entry:
-; THR1-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; THR1-NEXT:    br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
-; THR1:       pred0:
-; THR1-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
-; THR1-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
-; THR1:       pred1:
-; THR1-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
-; THR1-NEXT:    br i1 [[C2]], label [[DISPATCH]], label [[FINAL_RIGHT:%.*]]
-; THR1:       dispatch:
-; THR1-NEXT:    [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
-; THR1-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
-; THR1-NEXT:    br i1 [[C3]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
-; THR1:       final_left:
-; THR1-NEXT:    call void @sideeffect0()
-; THR1-NEXT:    ret void
-; THR1:       final_right:
-; THR1-NEXT:    call void @sideeffect1()
-; THR1-NEXT:    ret void
-;
-; THR2-LABEL: @two_preds_with_extra_op_and_branchweights(
-; THR2-NEXT:  entry:
-; THR2-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; THR2-NEXT:    br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
-; THR2:       pred0:
-; THR2-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
-; THR2-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
-; THR2:       pred1:
-; THR2-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
-; THR2-NEXT:    [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
-; THR2-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
-; THR2-NEXT:    [[OR_COND:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false
-; THR2-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT]], label [[FINAL_RIGHT:%.*]]
-; THR2:       dispatch:
-; THR2-NEXT:    [[V3_ADJ_OLD:%.*]] = add i8 [[V1]], [[V2]]
-; THR2-NEXT:    [[C3_OLD:%.*]] = icmp eq i8 [[V3_ADJ_OLD]], 0
-; THR2-NEXT:    br i1 [[C3_OLD]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
-; THR2:       final_left:
-; THR2-NEXT:    call void @sideeffect0()
-; THR2-NEXT:    ret void
-; THR2:       final_right:
-; THR2-NEXT:    call void @sideeffect1()
-; THR2-NEXT:    ret void
+; ALL-LABEL: @two_preds_with_extra_op_and_branchweights(
+; ALL-NEXT:  entry:
+; ALL-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
+; ALL-NEXT:    br i1 [[C0]], label [[PRED0:%.*]], label [[PRED1:%.*]]
+; ALL:       pred0:
+; ALL-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
+; ALL-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[DISPATCH:%.*]], !prof !0
+; ALL:       pred1:
+; ALL-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2:%.*]], 0
+; ALL-NEXT:    [[V3_ADJ:%.*]] = add i8 [[V1]], [[V2]]
+; ALL-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3_ADJ]], 0
+; ALL-NEXT:    [[OR_COND:%.*]] = select i1 [[C2]], i1 [[C3]], i1 false
+; ALL-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT]], label [[FINAL_RIGHT:%.*]]
+; ALL:       dispatch:
+; ALL-NEXT:    [[V3_ADJ_OLD:%.*]] = add i8 [[V1]], [[V2]]
+; ALL-NEXT:    [[C3_OLD:%.*]] = icmp eq i8 [[V3_ADJ_OLD]], 0
+; ALL-NEXT:    br i1 [[C3_OLD]], label [[FINAL_LEFT]], label [[FINAL_RIGHT]]
+; ALL:       final_left:
+; ALL-NEXT:    call void @sideeffect0()
+; ALL-NEXT:    ret void
+; ALL:       final_right:
+; ALL-NEXT:    call void @sideeffect1()
+; ALL-NEXT:    ret void
 ;
 entry:
   %c0 = icmp eq i8 %v0, 0