[llvm] [CodeLayout] Do not flip branch condition when using optsize (PR #114607)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 1 14:39:02 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Ellis Hoag (ellishg)
<details>
<summary>Changes</summary>
* Do not use profile data when flipping a branch condition when optimizing for size. This should improve outlining and ICF due to more uniform instruction sequences.
* Refactor `optimizeBranches()` to use early `continue`s
* Use the correct debug location for `insertBranch()`
---
Full diff: https://github.com/llvm/llvm-project/pull/114607.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+25-19)
- (added) llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll (+88)
- (modified) llvm/test/CodeGen/X86/conditional-tailcall.ll (+6-6)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index d1dced9ef28dca..bdad63f368dfec 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2906,7 +2906,7 @@ void MachineBlockPlacement::buildCFGChains() {
void MachineBlockPlacement::optimizeBranches() {
BlockChain &FunctionChain = *BlockToChain[&F->front()];
- SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
// Now that all the basic blocks in the chain have the proper layout,
// make a final call to analyzeBranch with AllowModify set.
@@ -2916,24 +2916,30 @@ void MachineBlockPlacement::optimizeBranches() {
// a fallthrough when it occurs after predicated terminators.
for (MachineBasicBlock *ChainBB : FunctionChain) {
Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
- if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
- // If PrevBB has a two-way branch, try to re-order the branches
- // such that we branch to the successor with higher probability first.
- if (TBB && !Cond.empty() && FBB &&
- MBPI->getEdgeProbability(ChainBB, FBB) >
- MBPI->getEdgeProbability(ChainBB, TBB) &&
- !TII->reverseBranchCondition(Cond)) {
- LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
- << getBlockName(ChainBB) << "\n");
- LLVM_DEBUG(dbgs() << " Edge probability: "
- << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
- << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
- DebugLoc dl; // FIXME: this is nowhere
- TII->removeBranch(*ChainBB);
- TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
- }
- }
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
+ continue;
+ if (!TBB || !FBB || Cond.empty())
+ continue;
+ // If we are optimizing for size we do not consider the runtime performance.
+ // Instead, we retain the original branch condition so we have more uniform
+ // instructions which will benefit ICF.
+ if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
+ continue;
+ // If ChainBB has a two-way branch, try to re-order the branches
+ // such that we branch to the successor with higher probability first.
+ if (MBPI->getEdgeProbability(ChainBB, TBB) >=
+ MBPI->getEdgeProbability(ChainBB, FBB))
+ continue;
+ if (TII->reverseBranchCondition(Cond))
+ continue;
+ LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
+ << getBlockName(ChainBB) << "\n");
+ LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
+ << "\n");
+ auto Dl = ChainBB->findBranchDebugLoc();
+ TII->removeBranch(*ChainBB);
+ TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
}
}
diff --git a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
new file mode 100644
index 00000000000000..3645718968f9e3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; When consuming profile data we sometimes flip a branch to improve runtime
+; performance. If we are optimizing for size, we avoid changing the branch to
+; improve outlining and ICF.
+
+define i8 @foo_optsize(i32 %v4) optsize {
+; CHECK-LABEL: foo_optsize:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbz wzr, .LBB0_2
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_2: // %b1
+; CHECK-NEXT: cbnz w0, .LBB0_4
+; CHECK-NEXT: .LBB0_3: // %b2
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_4: // %b1
+; CHECK-NEXT: cmp w0, #1
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.5: // %b3
+; CHECK-NEXT: cbz wzr, .LBB0_1
+; CHECK-NEXT: b .LBB0_3
+entry:
+ %v2 = icmp eq i32 0, 0
+ br i1 %v2, label %b1, label %b4
+
+b1:
+ switch i32 %v4, label %b4 [
+ i32 1, label %b3
+ i32 0, label %b2
+ ], !prof !0
+
+b2:
+ br label %b4
+
+b3:
+ %v3 = icmp eq i32 0, 0
+ br i1 %v3, label %b4, label %b2
+
+b4:
+ %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ]
+ ret i8 %v16
+}
+
+define i8 @foo_optspeed(i32 %v4) {
+; CHECK-LABEL: foo_optspeed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbz wzr, .LBB1_2
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2: // %b1
+; CHECK-NEXT: cbnz w0, .LBB1_4
+; CHECK-NEXT: .LBB1_3: // %b2
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_4: // %b1
+; CHECK-NEXT: cmp w0, #1
+; CHECK-NEXT: b.ne .LBB1_1
+; CHECK-NEXT: // %bb.5: // %b3
+; CHECK-NEXT: cbnz wzr, .LBB1_3
+; CHECK-NEXT: b .LBB1_1
+entry:
+ %v2 = icmp eq i32 0, 0
+ br i1 %v2, label %b1, label %b4
+
+b1:
+ switch i32 %v4, label %b4 [
+ i32 1, label %b3
+ i32 0, label %b2
+ ], !prof !0
+
+b2:
+ br label %b4
+
+b3:
+ %v3 = icmp eq i32 0, 0
+ br i1 %v3, label %b4, label %b2
+
+b4:
+ %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ]
+ ret i8 %v16
+}
+
+!0 = !{!"branch_weights", i32 5, i32 5, i32 100}
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 88a132d3850d1d..9e0a19f9a504f2 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -303,10 +303,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; CHECK32-NEXT: .LBB3_8: # %if.else
; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1
; CHECK32-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3]
-; CHECK32-NEXT: jb .LBB3_11 # encoding: [0x72,A]
-; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
-; CHECK32-NEXT: jmp .LBB3_9 # encoding: [0xeb,A]
+; CHECK32-NEXT: jae .LBB3_9 # encoding: [0x73,A]
; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
+; CHECK32-NEXT: jmp .LBB3_11 # encoding: [0xeb,A]
+; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; CHECK32-NEXT: .LBB3_12: # %sw.bb22
; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1
; CHECK32-NEXT: movzbl (%eax), %ebx # encoding: [0x0f,0xb6,0x18]
@@ -483,10 +483,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; WIN64-NEXT: # %bb.6: # %sw.bb
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d]
-; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A]
-; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; WIN64-NEXT: jmp .LBB3_8 # encoding: [0xeb,A]
+; WIN64-NEXT: jne .LBB3_8 # encoding: [0x75,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
+; WIN64-NEXT: jmp .LBB3_10 # encoding: [0xeb,A]
+; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
; WIN64-NEXT: .LBB3_7: # %sw.bb14
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114607
More information about the llvm-commits
mailing list