[llvm] AArch64: Add TBZ/TBNZ matcher for x & (1 << y). (PR #172962)
Peter Collingbourne via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 18 23:37:54 PST 2025
https://github.com/pcc created https://github.com/llvm/llvm-project/pull/172962
x & (1 << y) is InstCombine's canonical form of a bit test, but it is
currently code generated literally, missing the opportunity to use
TBZ/TBNZ on bit 0 of x >> y, which generally yields an instruction
sequence that is two instructions shorter. Implement this optimization.
On my machine this results in a 0.05% reduction in clang binary size and
a 0.25% reduction in dynamic instruction count when compiling
AArch64ISelLowering.cpp.
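To illustrate, here is a rough sketch of the affected pattern; the
function names f and g are made up, and the instruction sequences in the
comments are approximate, following the test updates below:

  // Hypothetical example: a bit test followed by a branch. InstCombine
  // canonicalizes the condition to (x & (1ull << y)) != 0.
  void g();  // hypothetical callee
  void f(unsigned long long x, unsigned long long y) {
    // Before: roughly  mov x8, #1; lsl x8, x8, x1; tst x8, x0; b.eq <skip>
    // After:  roughly  lsr x8, x0, x1; tbz w8, #0, <skip>
    if (x & (1ull << y))
      g();
  }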
From 0c2f9b463e2013710ba90e9b8a19f8e9b9fd693f Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc at google.com>
Date: Thu, 18 Dec 2025 23:37:39 -0800
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.6-beta.1
---
.../Target/AArch64/AArch64ISelLowering.cpp | 66 +++++++++++--------
.../AArch64/switch-cases-to-branch-and.ll | 40 +++++------
2 files changed, 56 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 837393b0cbdcd..476a455f2e506 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11177,6 +11177,40 @@ std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
return {Val, Val.getValueSizeInBits() - 1};
}
+// Op is an SDValue that is being compared to 0. If the comparison is a bit
+// test, optimize it to a TBZ or TBNZ.
+static SDValue optimizeBitTest(SDValue Op, SDValue Chain, SDValue Dest,
+ unsigned Opcode, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+
+ if (Op.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // See if we can use a TBZ to fold in an AND as well.
+ // TBZ has a smaller branch displacement than CBZ. If the offset is
+ // out of bounds, a late MI-layer pass rewrites branches.
+ // 403.gcc is an example that hits this case.
+ if (isa<ConstantSDNode>(Op.getOperand(1)) &&
+ isPowerOf2_64(Op.getConstantOperandVal(1))) {
+ SDValue Test = Op.getOperand(0);
+ uint64_t Mask = Op.getConstantOperandVal(1);
+ return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
+ DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
+ }
+
+ if (Op.getOperand(0).getOpcode() == ISD::SHL) {
+ auto Op00 = Op.getOperand(0).getOperand(0);
+ if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
+ auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
+ Op.getOperand(1), Op.getOperand(0).getOperand(1));
+ return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
+ DAG.getConstant(0, DL, MVT::i64), Dest);
+ }
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -11236,35 +11270,15 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
if (CC == ISD::SETEQ) {
- // See if we can use a TBZ to fold in an AND as well.
- // TBZ has a smaller branch displacement than CBZ. If the offset is
- // out of bounds, a late MI-layer pass rewrites branches.
- // 403.gcc is an example that hits this case.
- if (LHS.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(LHS.getOperand(1)) &&
- isPowerOf2_64(LHS.getConstantOperandVal(1))) {
- SDValue Test = LHS.getOperand(0);
- uint64_t Mask = LHS.getConstantOperandVal(1);
- return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
- DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
- Dest);
- }
+ if (SDValue Result =
+ optimizeBitTest(LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
+ return Result;
return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
- // See if we can use a TBZ to fold in an AND as well.
- // TBZ has a smaller branch displacement than CBZ. If the offset is
- // out of bounds, a late MI-layer pass rewrites branches.
- // 403.gcc is an example that hits this case.
- if (LHS.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(LHS.getOperand(1)) &&
- isPowerOf2_64(LHS.getConstantOperandVal(1))) {
- SDValue Test = LHS.getOperand(0);
- uint64_t Mask = LHS.getConstantOperandVal(1);
- return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
- DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
- Dest);
- }
+ if (SDValue Result =
+ optimizeBitTest(LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
+ return Result;
return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
diff --git a/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll b/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll
index 775ab3fe110e0..cb59a8d976eda 100644
--- a/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll
+++ b/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll
@@ -422,25 +422,23 @@ e2:
define i32 @switch_in_loop_with_matching_dests_0_and_pow2_4_cases(ptr %start) {
; CHECK-LABEL: switch_in_loop_with_matching_dests_0_and_pow2_4_cases:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x10, #32769 ; =0x8001
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov x8, #32769 ; =0x8001
; CHECK-NEXT: add x9, x0, #1
-; CHECK-NEXT: movk x10, #1, lsl #32
+; CHECK-NEXT: movk x8, #1, lsl #32
; CHECK-NEXT: b LBB5_2
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: cmp w11, #124
+; CHECK-NEXT: cmp w10, #124
; CHECK-NEXT: b.eq LBB5_5
; CHECK-NEXT: LBB5_2: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrb w11, [x9], #1
-; CHECK-NEXT: cmp w11, #32
+; CHECK-NEXT: ldrb w10, [x9], #1
+; CHECK-NEXT: cmp w10, #32
; CHECK-NEXT: b.hi LBB5_1
; CHECK-NEXT: ; %bb.3: ; %loop
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: lsl x12, x8, x11
-; CHECK-NEXT: tst x12, x10
-; CHECK-NEXT: b.eq LBB5_1
+; CHECK-NEXT: lsr x11, x8, x10
+; CHECK-NEXT: tbz w11, #0, LBB5_1
; CHECK-NEXT: ; %bb.4: ; %e1
; CHECK-NEXT: mov w0, #-1 ; =0xffffffff
; CHECK-NEXT: ret
@@ -608,10 +606,9 @@ exit:
define i64 @consecutive_match_both(ptr %p, i32 %param) {
; CHECK-LABEL: consecutive_match_both:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov w8, #249 ; =0xf9
; CHECK-NEXT: mov w9, #100 ; =0x64
-; CHECK-NEXT: mov w10, #249 ; =0xf9
-; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: lsr w8, w8, w1
; CHECK-NEXT: b LBB8_2
; CHECK-NEXT: LBB8_1: ; %loop.latch
; CHECK-NEXT: ; in Loop: Header=BB8_2 Depth=1
@@ -623,8 +620,7 @@ define i64 @consecutive_match_both(ptr %p, i32 %param) {
; CHECK-NEXT: b.hi LBB8_1
; CHECK-NEXT: ; %bb.3: ; %loop.header
; CHECK-NEXT: ; in Loop: Header=BB8_2 Depth=1
-; CHECK-NEXT: tst w8, w10
-; CHECK-NEXT: b.eq LBB8_1
+; CHECK-NEXT: tbz w8, #0, LBB8_1
; CHECK-NEXT: ; %bb.4: ; %e0
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
@@ -688,10 +684,9 @@ e1:
define i64 @consecutive_match_before(ptr %p, i32 %param) {
; CHECK-LABEL: consecutive_match_before:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov w8, #25 ; =0x19
; CHECK-NEXT: mov w9, #100 ; =0x64
-; CHECK-NEXT: mov w10, #25 ; =0x19
-; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: lsr w8, w8, w1
; CHECK-NEXT: b LBB9_2
; CHECK-NEXT: LBB9_1: ; %loop.latch
; CHECK-NEXT: ; in Loop: Header=BB9_2 Depth=1
@@ -703,8 +698,7 @@ define i64 @consecutive_match_before(ptr %p, i32 %param) {
; CHECK-NEXT: b.hi LBB9_1
; CHECK-NEXT: ; %bb.3: ; %loop.header
; CHECK-NEXT: ; in Loop: Header=BB9_2 Depth=1
-; CHECK-NEXT: tst w8, w10
-; CHECK-NEXT: b.eq LBB9_1
+; CHECK-NEXT: tbz w8, #0, LBB9_1
; CHECK-NEXT: ; %bb.4: ; %e0
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
@@ -765,10 +759,9 @@ e1:
define i64 @consecutive_match_after(ptr %p, i32 %param) {
; CHECK-LABEL: consecutive_match_after:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov w8, #49 ; =0x31
; CHECK-NEXT: mov w9, #100 ; =0x64
-; CHECK-NEXT: mov w10, #49 ; =0x31
-; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: lsr w8, w8, w1
; CHECK-NEXT: b LBB10_2
; CHECK-NEXT: LBB10_1: ; %loop.latch
; CHECK-NEXT: ; in Loop: Header=BB10_2 Depth=1
@@ -780,8 +773,7 @@ define i64 @consecutive_match_after(ptr %p, i32 %param) {
; CHECK-NEXT: b.hi LBB10_1
; CHECK-NEXT: ; %bb.3: ; %loop.header
; CHECK-NEXT: ; in Loop: Header=BB10_2 Depth=1
-; CHECK-NEXT: tst w8, w10
-; CHECK-NEXT: b.eq LBB10_1
+; CHECK-NEXT: tbz w8, #0, LBB10_1
; CHECK-NEXT: ; %bb.4: ; %e0
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret