[llvm] [AArch64] Enable preferZeroCompareBranch for AArch64 when we don't have fused cmp+br (PR #150045)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 22 08:39:10 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: AZero13 (AZero13)
Changes:
Enable preferZeroCompareBranch for AArch64 when the subtarget does not have fused compare-and-branch (Armv9.6 CB) instructions. We also cannot do this when speculative load hardening is enabled, so the hook now takes the BranchInst, letting targets inspect the containing function's attributes.
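For context, the heuristic this hook gates rewrites a power-of-two range check into a branch on a shift result that is needed anyway. A minimal sketch in LLVM IR (the function @sketch is made up; the shape mirrors test_lshr2 in the new test file):

```llvm
declare void @use(i32)

define void @sketch(i32 %n) {
entry:
  ; %n < 4 is equivalent to (%n >> 2) == 0
  %cmp = icmp ult i32 %n, 4
  br i1 %cmp, label %exit, label %body

body:
  ; the shifted value is needed here anyway
  %shr = lshr i32 %n, 2
  call void @use(i32 %shr)
  br label %exit

exit:
  ret void
}
```

When preferZeroCompareBranch returns true for the branch, CodeGenPrepare moves the lshr ahead of the branch and compares its result against zero instead:

```llvm
  ; entry block after the rewrite (sketch)
  %shr = lshr i32 %n, 2
  %cmp = icmp eq i32 %shr, 0
  br i1 %cmp, label %exit, label %body
```

On AArch64 without CMPBR this lets the backend select lsr + cbz, as the CHECK lines in the new test show, instead of a separate compare and conditional branch.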
---
Full diff: https://github.com/llvm/llvm-project/pull/150045.diff
8 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+1-1)
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+1-1)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+8)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+2)
- (modified) llvm/lib/Target/ARM/ARMISelLowering.h (+1-1)
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.h (+1-1)
- (modified) llvm/lib/Target/SystemZ/SystemZISelLowering.h (+1-1)
- (added) llvm/test/CodeGen/AArch64/branch-on-zero.ll (+156)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 084b788d51828..993b75256fa23 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -749,7 +749,7 @@ class LLVM_ABI TargetLoweringBase {
/// Return true if the heuristic to prefer icmp eq zero should be used in code
/// gen prepare.
- virtual bool preferZeroCompareBranch() const { return false; }
+ virtual bool preferZeroCompareBranch(BranchInst *) const { return false; }
/// Return true if it is cheaper to split the store of a merged int val
/// from a pair of smaller values into multiple stores.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d9d41f1d72e35..90ad9949fb772 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8630,7 +8630,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
// br %c, bla, blb
// Creating the cmp to zero can be better for the backend, especially if the
// lshr produces flags that can be used automatically.
- if (!TLI.preferZeroCompareBranch() || !Branch->isConditional())
+ if (!TLI.preferZeroCompareBranch(Branch) || !Branch->isConditional())
return false;
ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff23f76fadccd..12a7bf60401e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28508,6 +28508,14 @@ Register AArch64TargetLowering::getExceptionSelectorRegister(
return AArch64::X1;
}
+bool AArch64TargetLowering::preferZeroCompareBranch(BranchInst *Branch) const {
+ // If we can use Armv9.6 CB instructions, prefer that over zero compare branches.
+
+ // If we have speculative load hardening enabled, we cannot use
+ // zero compare branches.
+ return !Subtarget->hasCMPBR() && !Branch->getFunction()->hasFnAttribute(Attribute::SpeculativeLoadHardening);
+}
+
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
// Only sink 'and' mask to cmp use block if it is masking a single bit, since
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 7b1de3d3254f2..26fa599655a48 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -366,6 +366,8 @@ class AArch64TargetLowering : public TargetLowering {
return true;
}
+ bool preferZeroCompareBranch(BranchInst *) const override;
+
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue V) const override {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef55b22c9..ff55dd8d1b06d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -605,7 +605,7 @@ class VectorType;
Sched::Preference getSchedulingPreference(SDNode *N) const override;
- bool preferZeroCompareBranch() const override { return true; }
+ bool preferZeroCompareBranch(BranchInst *) const override { return true; }
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a5d735c407e5c..71dd861b17759 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -225,7 +225,7 @@ class RISCVTargetLowering : public TargetLowering {
unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override;
- bool preferZeroCompareBranch() const override { return true; }
+ bool preferZeroCompareBranch(BranchInst *) const override { return true; }
// Note that one specific case requires fence insertion for an
// AtomicCmpXchgInst but is handled via the RISCVZacasABIFix pass rather
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 1866962e17587..b5e497d773bd9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -471,7 +471,7 @@ class SystemZTargetLowering : public TargetLowering {
}
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
- bool preferZeroCompareBranch() const override { return true; }
+ bool preferZeroCompareBranch(BranchInst *) const override { return true; }
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override {
ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
return Mask && Mask->getValue().isIntN(16);
diff --git a/llvm/test/CodeGen/AArch64/branch-on-zero.ll b/llvm/test/CodeGen/AArch64/branch-on-zero.ll
new file mode 100644
index 0000000000000..efd4d2b319c55
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/branch-on-zero.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i32 @test_lshr(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-SD-LABEL: test_lshr:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr w8, w2, #2
+; CHECK-SD-NEXT: cbz w8, .LBB0_2
+; CHECK-SD-NEXT: .LBB0_1: // %while.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr w9, [x1], #4
+; CHECK-SD-NEXT: subs w8, w8, #1
+; CHECK-SD-NEXT: lsl w9, w9, #1
+; CHECK-SD-NEXT: str w9, [x0], #4
+; CHECK-SD-NEXT: b.ne .LBB0_1
+; CHECK-SD-NEXT: .LBB0_2: // %while.end
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_lshr:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr w8, w2, #2
+; CHECK-GI-NEXT: cbz w8, .LBB0_2
+; CHECK-GI-NEXT: .LBB0_1: // %while.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w9, [x1], #4
+; CHECK-GI-NEXT: add x10, x0, #4
+; CHECK-GI-NEXT: subs w8, w8, #1
+; CHECK-GI-NEXT: lsl w9, w9, #1
+; CHECK-GI-NEXT: str w9, [x0]
+; CHECK-GI-NEXT: mov x0, x10
+; CHECK-GI-NEXT: b.ne .LBB0_1
+; CHECK-GI-NEXT: .LBB0_2: // %while.end
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: ret
+entry:
+ %shr = lshr i32 %n, 2
+ %tobool.not4 = icmp eq i32 %shr, 0
+ br i1 %tobool.not4, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.07 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+ %x.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %x, %entry ]
+ %y.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %y, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, ptr %y.addr.05, i32 1
+ %0 = load i32, ptr %y.addr.05, align 4
+ %mul = shl nsw i32 %0, 1
+ %incdec.ptr1 = getelementptr inbounds i32, ptr %x.addr.06, i32 1
+ store i32 %mul, ptr %x.addr.06, align 4
+ %dec = add nsw i32 %c.07, -1
+ %tobool.not = icmp eq i32 %dec, 0
+ br i1 %tobool.not, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret i32 0
+}
+
+define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-SD-LABEL: test_lshr2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr w8, w2, #2
+; CHECK-SD-NEXT: cbz w8, .LBB1_2
+; CHECK-SD-NEXT: .LBB1_1: // %while.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr w9, [x1], #4
+; CHECK-SD-NEXT: subs w8, w8, #1
+; CHECK-SD-NEXT: lsl w9, w9, #1
+; CHECK-SD-NEXT: str w9, [x0], #4
+; CHECK-SD-NEXT: b.ne .LBB1_1
+; CHECK-SD-NEXT: .LBB1_2: // %while.end
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_lshr2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr w8, w2, #2
+; CHECK-GI-NEXT: cbz w8, .LBB1_2
+; CHECK-GI-NEXT: .LBB1_1: // %while.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w9, [x1], #4
+; CHECK-GI-NEXT: add x10, x0, #4
+; CHECK-GI-NEXT: subs w8, w8, #1
+; CHECK-GI-NEXT: lsl w9, w9, #1
+; CHECK-GI-NEXT: str w9, [x0]
+; CHECK-GI-NEXT: mov x0, x10
+; CHECK-GI-NEXT: b.ne .LBB1_1
+; CHECK-GI-NEXT: .LBB1_2: // %while.end
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: ret
+entry:
+ %tobool.not4 = icmp ult i32 %n, 4
+ br i1 %tobool.not4, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ %shr = lshr i32 %n, 2
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %c.07 = phi i32 [ %dec, %while.body ], [ %shr, %while.body.preheader ]
+ %x.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %x, %while.body.preheader ]
+ %y.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %y, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, ptr %y.addr.05, i32 1
+ %0 = load i32, ptr %y.addr.05, align 4
+ %mul = shl nsw i32 %0, 1
+ %incdec.ptr1 = getelementptr inbounds i32, ptr %x.addr.06, i32 1
+ store i32 %mul, ptr %x.addr.06, align 4
+ %dec = add nsw i32 %c.07, -1
+ %tobool.not = icmp eq i32 %dec, 0
+ br i1 %tobool.not, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret i32 0
+}
+
+
+define i32 @lshr(i32 %u) {
+; CHECK-LABEL: lshr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: mov w19, w0
+; CHECK-NEXT: lsr w0, w0, #4
+; CHECK-NEXT: mov w8, w19
+; CHECK-NEXT: cbz w0, .LBB2_2
+; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: bl use
+; CHECK-NEXT: add w8, w19, w19, lsl #1
+; CHECK-NEXT: .LBB2_2: // %if.end
+; CHECK-NEXT: sub w9, w19, #7
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %cmp.not = icmp ult i32 %u, 16
+ br i1 %cmp.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %shr = lshr i32 %u, 4
+ tail call void @use(i32 noundef %shr)
+ %mul = mul i32 %u, 3
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %u.addr.0 = phi i32 [ %mul, %if.then ], [ %u, %entry ]
+ %sub = add i32 %u, -7
+ %cmp1 = icmp ugt i32 %u.addr.0, %sub
+ %conv = zext i1 %cmp1 to i32
+ ret i32 %conv
+}
+
+declare void @use(i32)
+
``````````
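The AArch64 override above gates on two things: the subtarget must not have CMPBR (so Armv9.6 CB compare-and-branch instructions are preferred when available), and the containing function must not use speculative load hardening. A hedged sketch of the second case (the function name is made up; the attribute spelling is the IR form of Attribute::SpeculativeLoadHardening):

```llvm
declare void @use(i32)

; With this patch, preferZeroCompareBranch returns false for branches in this
; function because it carries the SLH attribute, so CodeGenPrepare leaves the
; ult compare and conditional branch untouched.
define void @slh_example(i32 %n) speculative_load_hardening {
entry:
  %cmp = icmp ult i32 %n, 4
  br i1 %cmp, label %exit, label %body

body:
  %shr = lshr i32 %n, 2
  call void @use(i32 %shr)
  br label %exit

exit:
  ret void
}
```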
https://github.com/llvm/llvm-project/pull/150045
More information about the llvm-commits mailing list