[llvm] [CodeLayout] Do not flip branch condition when using optsize (PR #114607)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 1 14:39:01 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Ellis Hoag (ellishg)

<details>
<summary>Changes</summary>

* Do not use profile data when flipping a branch condition when optimizing for size. This should improve outlining and ICF due to more uniform instruction sequences.
* Refactor `optimizeBranches()` to use early `continue`s
* Use the correct debug location for `insertBranch()`

---
Full diff: https://github.com/llvm/llvm-project/pull/114607.diff


3 Files Affected:

- (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+25-19) 
- (added) llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll (+88) 
- (modified) llvm/test/CodeGen/X86/conditional-tailcall.ll (+6-6) 


``````````diff
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index d1dced9ef28dca..bdad63f368dfec 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2906,7 +2906,7 @@ void MachineBlockPlacement::buildCFGChains() {
 
 void MachineBlockPlacement::optimizeBranches() {
   BlockChain &FunctionChain = *BlockToChain[&F->front()];
-  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+  SmallVector<MachineOperand, 4> Cond;
 
   // Now that all the basic blocks in the chain have the proper layout,
   // make a final call to analyzeBranch with AllowModify set.
@@ -2916,24 +2916,30 @@ void MachineBlockPlacement::optimizeBranches() {
   // a fallthrough when it occurs after predicated terminators.
   for (MachineBasicBlock *ChainBB : FunctionChain) {
     Cond.clear();
-    MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
-    if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
-      // If PrevBB has a two-way branch, try to re-order the branches
-      // such that we branch to the successor with higher probability first.
-      if (TBB && !Cond.empty() && FBB &&
-          MBPI->getEdgeProbability(ChainBB, FBB) >
-              MBPI->getEdgeProbability(ChainBB, TBB) &&
-          !TII->reverseBranchCondition(Cond)) {
-        LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
-                          << getBlockName(ChainBB) << "\n");
-        LLVM_DEBUG(dbgs() << "    Edge probability: "
-                          << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
-                          << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
-        DebugLoc dl; // FIXME: this is nowhere
-        TII->removeBranch(*ChainBB);
-        TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
-      }
-    }
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
+      continue;
+    if (!TBB || !FBB || Cond.empty())
+      continue;
+    // If we are optimizing for size we do not consider the runtime performance.
+    // Instead, we retain the original branch condition so we have more uniform
+    // instructions which will benefit ICF.
+    if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
+      continue;
+    // If ChainBB has a two-way branch, try to re-order the branches
+    // such that we branch to the successor with higher probability first.
+    if (MBPI->getEdgeProbability(ChainBB, TBB) >=
+        MBPI->getEdgeProbability(ChainBB, FBB))
+      continue;
+    if (TII->reverseBranchCondition(Cond))
+      continue;
+    LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
+                      << getBlockName(ChainBB) << "\n");
+    LLVM_DEBUG(dbgs() << "  " << getBlockName(TBB) << " < " << getBlockName(FBB)
+                      << "\n");
+    auto Dl = ChainBB->findBranchDebugLoc();
+    TII->removeBranch(*ChainBB);
+    TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
   }
 }
 
diff --git a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
new file mode 100644
index 00000000000000..3645718968f9e3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; When consuming profile data we sometimes flip a branch to improve runtime
+; performance. If we are optimizing for size, we avoid changing the branch to
+; improve outlining and ICF.
+
+define i8 @foo_optsize(i32 %v4) optsize {
+; CHECK-LABEL: foo_optsize:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cbz wzr, .LBB0_2
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %b1
+; CHECK-NEXT:    cbnz w0, .LBB0_4
+; CHECK-NEXT:  .LBB0_3: // %b2
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_4: // %b1
+; CHECK-NEXT:    cmp w0, #1
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.5: // %b3
+; CHECK-NEXT:    cbz wzr, .LBB0_1
+; CHECK-NEXT:    b .LBB0_3
+entry:
+  %v2 = icmp eq i32 0, 0
+  br i1 %v2, label %b1, label %b4
+
+b1:
+  switch i32 %v4, label %b4 [
+    i32 1, label %b3
+    i32 0, label %b2
+  ], !prof !0
+
+b2:
+  br label %b4
+
+b3:
+  %v3 = icmp eq i32 0, 0
+  br i1 %v3, label %b4, label %b2
+
+b4:
+  %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ]
+  ret i8 %v16
+}
+
+define i8 @foo_optspeed(i32 %v4) {
+; CHECK-LABEL: foo_optspeed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cbz wzr, .LBB1_2
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_2: // %b1
+; CHECK-NEXT:    cbnz w0, .LBB1_4
+; CHECK-NEXT:  .LBB1_3: // %b2
+; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_4: // %b1
+; CHECK-NEXT:    cmp w0, #1
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  // %bb.5: // %b3
+; CHECK-NEXT:    cbnz wzr, .LBB1_3
+; CHECK-NEXT:    b .LBB1_1
+entry:
+  %v2 = icmp eq i32 0, 0
+  br i1 %v2, label %b1, label %b4
+
+b1:
+  switch i32 %v4, label %b4 [
+    i32 1, label %b3
+    i32 0, label %b2
+  ], !prof !0
+
+b2:
+  br label %b4
+
+b3:
+  %v3 = icmp eq i32 0, 0
+  br i1 %v3, label %b4, label %b2
+
+b4:
+  %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ]
+  ret i8 %v16
+}
+
+!0 = !{!"branch_weights", i32 5, i32 5, i32 100}
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 88a132d3850d1d..9e0a19f9a504f2 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -303,10 +303,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
 ; CHECK32-NEXT:  .LBB3_8: # %if.else
 ; CHECK32-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK32-NEXT:    movl %esi, %ebx # encoding: [0x89,0xf3]
-; CHECK32-NEXT:    jb .LBB3_11 # encoding: [0x72,A]
-; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
-; CHECK32-NEXT:    jmp .LBB3_9 # encoding: [0xeb,A]
+; CHECK32-NEXT:    jae .LBB3_9 # encoding: [0x73,A]
 ; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
+; CHECK32-NEXT:    jmp .LBB3_11 # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
 ; CHECK32-NEXT:  .LBB3_12: # %sw.bb22
 ; CHECK32-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK32-NEXT:    movzbl (%eax), %ebx # encoding: [0x0f,0xb6,0x18]
@@ -483,10 +483,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
 ; WIN64-NEXT:  # %bb.6: # %sw.bb
 ; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d]
-; WIN64-NEXT:    je .LBB3_10 # encoding: [0x74,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; WIN64-NEXT:    jmp .LBB3_8 # encoding: [0xeb,A]
+; WIN64-NEXT:    jne .LBB3_8 # encoding: [0x75,A]
 ; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
+; WIN64-NEXT:    jmp .LBB3_10 # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
 ; WIN64-NEXT:  .LBB3_7: # %sw.bb14
 ; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]

``````````

</details>


https://github.com/llvm/llvm-project/pull/114607


More information about the llvm-commits mailing list