[llvm] [BranchFolding] Fold fallthroughs into conditional tailcalls if profitable (PR #140476)

Nabeel Omer via llvm-commits llvm-commits at lists.llvm.org
Sun May 18 13:34:59 PDT 2025


https://github.com/omern1 updated https://github.com/llvm/llvm-project/pull/140476

>From 8948b7244be63d32b3f3fae19160672e094f2014 Mon Sep 17 00:00:00 2001
From: Nabeel Omer <Nabeel.Omer at sony.com>
Date: Fri, 16 May 2025 16:24:04 +0100
Subject: [PATCH 1/2] [BranchFolding] Fold fallthroughs into conditional
 tailcalls if profitable

This patch makes BranchFolding take branch probability information into
account when creating conditional tailcalls.

It also enables folding fallthrough blocks into conditional tailcalls
when profile information is available and the fallthrough edge is cold.

This should fix #126363.
---
 llvm/lib/CodeGen/BranchFolding.cpp            |  53 +++--
 llvm/test/CodeGen/X86/conditional-tailcall.ll | 187 ++++++++++++++++++
 2 files changed, 225 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 6f5afbd2a996a..af2c40005081e 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/BranchFoldingPass.h"
 #include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -1547,32 +1548,54 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
     MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
     if (TII->isUnconditionalTailCall(TailCall)) {
       SmallVector<MachineBasicBlock *> PredsChanged;
-      for (auto &Pred : MBB->predecessors()) {
+      for (auto *Pred : MBB->predecessors()) {
+        bool IsPGOInfoAvailable = false;
+        for (MachineBasicBlock *const PredSucc : Pred->successors()) {
+          IsPGOInfoAvailable |= MBPI.isEdgeHot(Pred, PredSucc);
+        }
+
         MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
         SmallVector<MachineOperand, 4> PredCond;
         bool PredAnalyzable =
             !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
 
-        // Only eliminate if MBB == TBB (Taken Basic Block)
-        if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB &&
-            PredTBB != PredFBB) {
-          // The predecessor has a conditional branch to this block which
-          // consists of only a tail call. Try to fold the tail call into the
-          // conditional branch.
+        bool IsEdgeCold = !MBPI.isEdgeHot(Pred, MBB);
+        bool CanFoldFallThrough =
+            IsPGOInfoAvailable && IsEdgeCold &&
+            (MBB == PredFBB ||
+             (PredFBB == nullptr && Pred->getFallThrough() == MBB));
+        bool CanFoldTakenBlock =
+            (MBB == PredTBB && (IsPGOInfoAvailable ? IsEdgeCold : true));
+
+        // When we have PGO (or equivalent) information, we want to fold the
+        // fallthrough if it's cold. Folding a fallthrough puts it behind a
+        // conditional branch which isn't desirable if it's hot. When there
+        // isn't any PGO information available we want to fold the taken block
+        // if it's possible and we never want to fold the fallthrough as we
+        // don't know if that is desirable.
+        if (PredAnalyzable && !PredCond.empty() && PredTBB != PredFBB &&
+            (CanFoldTakenBlock || CanFoldFallThrough)) {
+          SmallVector<MachineOperand, 4> ReversedCond(PredCond);
+          if (CanFoldFallThrough) {
+            DebugLoc Dl = MBB->findBranchDebugLoc();
+            TII->reverseBranchCondition(ReversedCond);
+            TII->removeBranch(*Pred);
+            TII->insertBranch(*Pred, MBB, PredTBB, ReversedCond, Dl);
+          }
+          
+          PredAnalyzable =
+              !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
+
           if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
-            // TODO: It would be nice if analyzeBranch() could provide a pointer
-            // to the branch instruction so replaceBranchWithTailCall() doesn't
-            // have to search for it.
+            // TODO: It would be nice if analyzeBranch() could provide a
+            // pointer to the branch instruction so
+            // replaceBranchWithTailCall() doesn't have to search for it.
             TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
             PredsChanged.push_back(Pred);
           }
         }
-        // If the predecessor is falling through to this block, we could reverse
-        // the branch condition and fold the tail call into that. However, after
-        // that we might have to re-arrange the CFG to fall through to the other
-        // block and there is a high risk of regressing code size rather than
-        // improving it.
       }
+
       if (!PredsChanged.empty()) {
         NumTailCalls += PredsChanged.size();
         for (auto &Pred : PredsChanged)
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 4c990d81810be..b851840e167da 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -597,3 +597,190 @@ cleanup.thread:                                   ; preds = %cleanup.thread.loop
   %6 = phi i1 [ %cmp37, %5 ], [ %call34, %if.else28 ], [ false, %cleanup.thread.loopexit ]
   ret i1 %6
 }
+
+define void @true_likely(i1 noundef zeroext %0) {
+; CHECK32-LABEL: true_likely:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x04,0x00]
+; CHECK32-NEXT:    je func_false # TAILCALL
+; CHECK32-NEXT:    # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: func_false-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    jmp func_true # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: func_true-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: true_likely:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; CHECK64-NEXT:    je func_false # TAILCALL
+; CHECK64-NEXT:    # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: func_false-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1:
+; CHECK64-NEXT:    jmp func_true # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: func_true-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: true_likely:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    testb %cl, %cl # encoding: [0x84,0xc9]
+; WIN64-NEXT:    je func_false # TAILCALL
+; WIN64-NEXT:    # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: func_false-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1:
+; WIN64-NEXT:    jmp func_true # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: func_true-1, kind: FK_PCRel_1
+  br i1 %0, label %2, label %3, !prof !6
+
+2:
+  tail call void @func_true()
+  br label %4
+
+3:
+  tail call void @func_false()
+  br label %4
+
+4:
+  ret void
+}
+
+define void @false_likely(i1 noundef zeroext %0) {
+; CHECK32-LABEL: false_likely:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x04,0x00]
+; CHECK32-NEXT:    jne func_true # TAILCALL
+; CHECK32-NEXT:    # encoding: [0x75,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: func_true-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    jmp func_false # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: func_false-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: false_likely:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; CHECK64-NEXT:    jne func_true # TAILCALL
+; CHECK64-NEXT:    # encoding: [0x75,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: func_true-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1:
+; CHECK64-NEXT:    jmp func_false # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: func_false-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: false_likely:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    testb %cl, %cl # encoding: [0x84,0xc9]
+; WIN64-NEXT:    jne func_true # TAILCALL
+; WIN64-NEXT:    # encoding: [0x75,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: func_true-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1:
+; WIN64-NEXT:    jmp func_false # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: func_false-1, kind: FK_PCRel_1
+  br i1 %0, label %2, label %3, !prof !7
+
+2:
+  tail call void @func_true()
+  br label %4
+
+3:
+  tail call void @func_false()
+  br label %4
+
+4:
+  ret void
+}
+
+
+define void @edge_is_hot_but_not_fallthrough(i1 noundef zeroext %0) {
+; CHECK32-LABEL: edge_is_hot_but_not_fallthrough:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    subl $12, %esp # encoding: [0x83,0xec,0x0c]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x10,0x00]
+; CHECK32-NEXT:    je .LBB6_1 # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.3:
+; CHECK32-NEXT:    addl $12, %esp # encoding: [0x83,0xc4,0x0c]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl # encoding: [0xc3]
+; CHECK32-NEXT:  .LBB6_1:
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    calll func_true # encoding: [0xe8,A,A,A,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: func_true-4, kind: FK_PCRel_4
+; CHECK32-NEXT:    .p2align 4
+; CHECK32-NEXT:  .LBB6_2: # =>This Inner Loop Header: Depth=1
+; CHECK32-NEXT:    calll func_false # encoding: [0xe8,A,A,A,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: func_false-4, kind: FK_PCRel_4
+; CHECK32-NEXT:    jmp .LBB6_2 # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB6_2-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: edge_is_hot_but_not_fallthrough:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    pushq %rax # encoding: [0x50]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; CHECK64-NEXT:    je .LBB6_1 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.3:
+; CHECK64-NEXT:    popq %rax # encoding: [0x58]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    retq # encoding: [0xc3]
+; CHECK64-NEXT:  .LBB6_1:
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    callq func_true # encoding: [0xe8,A,A,A,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: func_true-4, kind: reloc_branch_4byte_pcrel
+; CHECK64-NEXT:    .p2align 4
+; CHECK64-NEXT:  .LBB6_2: # =>This Inner Loop Header: Depth=1
+; CHECK64-NEXT:    callq func_false # encoding: [0xe8,A,A,A,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: func_false-4, kind: reloc_branch_4byte_pcrel
+; CHECK64-NEXT:    jmp .LBB6_2 # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB6_2-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: edge_is_hot_but_not_fallthrough:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $40, %rsp # encoding: [0x48,0x83,0xec,0x28]
+; WIN64-NEXT:    .seh_stackalloc 40
+; WIN64-NEXT:    .seh_endprologue
+; WIN64-NEXT:    testb %cl, %cl # encoding: [0x84,0xc9]
+; WIN64-NEXT:    je .LBB6_1 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.3:
+; WIN64-NEXT:    .seh_startepilogue
+; WIN64-NEXT:    addq $40, %rsp # encoding: [0x48,0x83,0xc4,0x28]
+; WIN64-NEXT:    .seh_endepilogue
+; WIN64-NEXT:    retq # encoding: [0xc3]
+; WIN64-NEXT:  .LBB6_1:
+; WIN64-NEXT:    callq func_true # encoding: [0xe8,A,A,A,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: func_true-4, kind: reloc_branch_4byte_pcrel
+; WIN64-NEXT:    .p2align 4
+; WIN64-NEXT:  .LBB6_2: # =>This Inner Loop Header: Depth=1
+; WIN64-NEXT:    callq func_false # encoding: [0xe8,A,A,A,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: func_false-4, kind: reloc_branch_4byte_pcrel
+; WIN64-NEXT:    jmp .LBB6_2 # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB6_2-1, kind: FK_PCRel_1
+; WIN64-NEXT:    .seh_endproc
+  br i1 %0, label %2, label %3, !prof !6
+2:
+  %and6 = and i1 %0, 1
+  br label %5
+
+3:
+  tail call void @func_true()
+  br label %4
+
+4:
+  tail call void @func_false()
+  br label %4
+
+5:
+  ret void
+}
+
+!6 = !{!"branch_weights", !"expected", i32 2000, i32 1}
+!7 = !{!"branch_weights", !"expected", i32 1, i32 2000}
+
+
+declare dso_local void @func_true()
+declare dso_local void @func_false()

>From cd4084f4885b5e0f60c4af1ce8b6d28724188833 Mon Sep 17 00:00:00 2001
From: Nabeel Omer <Nabeel.Omer at sony.com>
Date: Sun, 18 May 2025 21:34:48 +0100
Subject: [PATCH 2/2] Fix formatting

---
 llvm/lib/CodeGen/BranchFolding.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index af2c40005081e..17c6fd095e687 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1582,7 +1582,7 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
             TII->removeBranch(*Pred);
             TII->insertBranch(*Pred, MBB, PredTBB, ReversedCond, Dl);
           }
-          
+
           PredAnalyzable =
               !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
 



More information about the llvm-commits mailing list