[llvm] AArch64: Add TBZ/TBNZ matcher for x & (1 << y). (PR #172962)
Peter Collingbourne via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 18 23:37:54 PST 2025
https://github.com/pcc created https://github.com/llvm/llvm-project/pull/172962
x & (1 << y) is InstCombine's canonical form of a bit test, but it is
currently code generated literally, missing the opportunity to use
TBZ/TBNZ on bit 0 of x >> y, which generally yields an instruction
sequence that is two instructions shorter. Implement this optimization.
On my machine this results in a 0.05% reduction in clang binary size and
a 0.25% reduction in dynamic instruction count when compiling
AArch64ISelLowering.cpp.
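To illustrate, here is a rough sketch of the affected pattern; the
function names f and g are made up, and the instruction sequences in the
comments are approximate, following the test updates below:

  // Hypothetical example: a bit test followed by a branch. InstCombine
  // canonicalizes the condition to (x & (1ull << y)) != 0.
  void g();  // hypothetical callee
  void f(unsigned long long x, unsigned long long y) {
    // Before: roughly  mov x8, #1; lsl x8, x8, x1; tst x8, x0; b.eq <skip>
    // After:  roughly  lsr x8, x0, x1; tbz w8, #0, <skip>
    if (x & (1ull << y))
      g();
  }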
From 0c2f9b463e2013710ba90e9b8a19f8e9b9fd693f Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc at google.com>
Date: Thu, 18 Dec 2025 23:37:39 -0800
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.6-beta.1
---
.../Target/AArch64/AArch64ISelLowering.cpp | 66 +++++++++++--------
.../AArch64/switch-cases-to-branch-and.ll | 40 +++++------
2 files changed, 56 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 837393b0cbdcd..476a455f2e506 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11177,6 +11177,40 @@ std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
return {Val, Val.getValueSizeInBits() - 1};
}
+// Op is an SDValue that is being compared to 0. If the comparison is a bit
+// test, optimize it to a TBZ or TBNZ.
+static SDValue optimizeBitTest(SDValue Op, SDValue Chain, SDValue Dest,
+ unsigned Opcode, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+
+ if (Op.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // See if we can use a TBZ to fold in an AND as well.
+ // TBZ has a smaller branch displacement than CBZ. If the offset is
+ // out of bounds, a late MI-layer pass rewrites branches.
+ // 403.gcc is an example that hits this case.
+ if (isa<ConstantSDNode>(Op.getOperand(1)) &&
+ isPowerOf2_64(Op.getConstantOperandVal(1))) {
+ SDValue Test = Op.getOperand(0);
+ uint64_t Mask = Op.getConstantOperandVal(1);
+ return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
+ DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
+ }
+
+ if (Op.getOperand(0).getOpcode() == ISD::SHL) {
+ auto Op00 = Op.getOperand(0).getOperand(0);
+ if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
+ auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
+ Op.getOperand(1), Op.getOperand(0).getOperand(1));
+ return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
+ DAG.getConstant(0, DL, MVT::i64), Dest);
+ }
+ }
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -11236,35 +11270,15 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
if (CC == ISD::SETEQ) {
- // See if we can use a TBZ to fold in an AND as well.
- // TBZ has a smaller branch displacement than CBZ. If the offset is
- // out of bounds, a late MI-layer pass rewrites branches.
- // 403.gcc is an example that hits this case.
- if (LHS.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(LHS.getOperand(1)) &&
- isPowerOf2_64(LHS.getConstantOperandVal(1))) {
- SDValue Test = LHS.getOperand(0);
- uint64_t Mask = LHS.getConstantOperandVal(1);
- return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
- DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
- Dest);
- }
+ if (SDValue Result =
+ optimizeBitTest(LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
+ return Result;
return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
- // See if we can use a TBZ to fold in an AND as well.
- // TBZ has a smaller branch displacement than CBZ. If the offset is
- // out of bounds, a late MI-layer pass rewrites branches.
- // 403.gcc is an example that hits this case.
- if (LHS.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(LHS.getOperand(1)) &&
- isPowerOf2_64(LHS.getConstantOperandVal(1))) {
- SDValue Test = LHS.getOperand(0);
- uint64_t Mask = LHS.getConstantOperandVal(1);
- return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
- DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
- Dest);
- }
+ if (SDValue Result =
+ optimizeBitTest(LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
+ return Result;
return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
diff --git a/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll b/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll
index 775ab3fe110e0..cb59a8d976eda 100644
--- a/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll
+++ b/llvm/test/CodeGen/AArch64/switch-cases-to-branch-and.ll
@@ -422,25 +422,23 @@ e2:
define i32 @switch_in_loop_with_matching_dests_0_and_pow2_4_cases(ptr %start) {
; CHECK-LABEL: switch_in_loop_with_matching_dests_0_and_pow2_4_cases:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x10, #32769 ; =0x8001
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov x8, #32769 ; =0x8001
; CHECK-NEXT: add x9, x0, #1
-; CHECK-NEXT: movk x10, #1, lsl #32
+; CHECK-NEXT: movk x8, #1, lsl #32
; CHECK-NEXT: b LBB5_2
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: cmp w11, #124
+; CHECK-NEXT: cmp w10, #124
; CHECK-NEXT: b.eq LBB5_5
; CHECK-NEXT: LBB5_2: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrb w11, [x9], #1
-; CHECK-NEXT: cmp w11, #32
+; CHECK-NEXT: ldrb w10, [x9], #1
+; CHECK-NEXT: cmp w10, #32
; CHECK-NEXT: b.hi LBB5_1
; CHECK-NEXT: ; %bb.3: ; %loop
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: lsl x12, x8, x11
-; CHECK-NEXT: tst x12, x10
-; CHECK-NEXT: b.eq LBB5_1
+; CHECK-NEXT: lsr x11, x8, x10
+; CHECK-NEXT: tbz w11, #0, LBB5_1
; CHECK-NEXT: ; %bb.4: ; %e1
; CHECK-NEXT: mov w0, #-1 ; =0xffffffff
; CHECK-NEXT: ret
@@ -608,10 +606,9 @@ exit:
define i64 @consecutive_match_both(ptr %p, i32 %param) {
; CHECK-LABEL: consecutive_match_both:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov w8, #249 ; =0xf9
; CHECK-NEXT: mov w9, #100 ; =0x64
-; CHECK-NEXT: mov w10, #249 ; =0xf9
-; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: lsr w8, w8, w1
; CHECK-NEXT: b LBB8_2
; CHECK-NEXT: LBB8_1: ; %loop.latch
; CHECK-NEXT: ; in Loop: Header=BB8_2 Depth=1
@@ -623,8 +620,7 @@ define i64 @consecutive_match_both(ptr %p, i32 %param) {
; CHECK-NEXT: b.hi LBB8_1
; CHECK-NEXT: ; %bb.3: ; %loop.header
; CHECK-NEXT: ; in Loop: Header=BB8_2 Depth=1
-; CHECK-NEXT: tst w8, w10
-; CHECK-NEXT: b.eq LBB8_1
+; CHECK-NEXT: tbz w8, #0, LBB8_1
; CHECK-NEXT: ; %bb.4: ; %e0
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
@@ -688,10 +684,9 @@ e1:
define i64 @consecutive_match_before(ptr %p, i32 %param) {
; CHECK-LABEL: consecutive_match_before:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov w8, #25 ; =0x19
; CHECK-NEXT: mov w9, #100 ; =0x64
-; CHECK-NEXT: mov w10, #25 ; =0x19
-; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: lsr w8, w8, w1
; CHECK-NEXT: b LBB9_2
; CHECK-NEXT: LBB9_1: ; %loop.latch
; CHECK-NEXT: ; in Loop: Header=BB9_2 Depth=1
@@ -703,8 +698,7 @@ define i64 @consecutive_match_before(ptr %p, i32 %param) {
; CHECK-NEXT: b.hi LBB9_1
; CHECK-NEXT: ; %bb.3: ; %loop.header
; CHECK-NEXT: ; in Loop: Header=BB9_2 Depth=1
-; CHECK-NEXT: tst w8, w10
-; CHECK-NEXT: b.eq LBB9_1
+; CHECK-NEXT: tbz w8, #0, LBB9_1
; CHECK-NEXT: ; %bb.4: ; %e0
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
@@ -765,10 +759,9 @@ e1:
define i64 @consecutive_match_after(ptr %p, i32 %param) {
; CHECK-LABEL: consecutive_match_after:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov w8, #1 ; =0x1
+; CHECK-NEXT: mov w8, #49 ; =0x31
; CHECK-NEXT: mov w9, #100 ; =0x64
-; CHECK-NEXT: mov w10, #49 ; =0x31
-; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: lsr w8, w8, w1
; CHECK-NEXT: b LBB10_2
; CHECK-NEXT: LBB10_1: ; %loop.latch
; CHECK-NEXT: ; in Loop: Header=BB10_2 Depth=1
@@ -780,8 +773,7 @@ define i64 @consecutive_match_after(ptr %p, i32 %param) {
; CHECK-NEXT: b.hi LBB10_1
; CHECK-NEXT: ; %bb.3: ; %loop.header
; CHECK-NEXT: ; in Loop: Header=BB10_2 Depth=1
-; CHECK-NEXT: tst w8, w10
-; CHECK-NEXT: b.eq LBB10_1
+; CHECK-NEXT: tbz w8, #0, LBB10_1
; CHECK-NEXT: ; %bb.4: ; %e0
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret