[llvm] Implement hasOrNot (PR #163995)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 17 10:18:35 PDT 2025
https://github.com/AZero13 created https://github.com/llvm/llvm-project/pull/163995
None
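A short sketch of the change, drawn from the patch and its new tests below: it adds a TargetLowering::hasOrNot() hook (analogous to hasAndNot()) and uses it in foldVSelectToSignBitSplatMask() so that

  (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)

is only formed when the target can fold the invert into an or-not instruction. AArch64 (scalar and vector 'orn', plus scalable vectors) and ARM (Thumb2 scalar 'orn', NEON/MVE 'vorn') override the hook to return true. As an illustration, the new ARM test expects the <4 x i32> case to lower to a sign-bit shift plus a single 'vorn':

  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
  ; armv7 NEON codegen (from the CHECK lines):
  ;   vshr.s32 q8, q8, #31
  ;   vorn q8, q9, q8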
From b96a886a0f034abcc259a13a74b10b8d66323101 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Fri, 17 Oct 2025 13:16:39 -0400
Subject: [PATCH] Implement hasOrNot
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 12 ++
llvm/lib/Target/ARM/ARMISelLowering.cpp | 10 ++
llvm/lib/Target/ARM/ARMISelLowering.h | 2 +
.../AArch64/dagcombine-vselect-signbit-orn.ll | 127 ++++++++++++++++
.../ARM/dagcombine-vselect-signbit-orn.ll | 143 ++++++++++++++++++
7 files changed, 309 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
create mode 100644 llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..322c7b9068255 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -827,6 +827,11 @@ class LLVM_ABI TargetLoweringBase {
return hasAndNotCompare(X);
}
+ /// Return true if the target has a bitwise or-not operation: X = A | ~B.
+ virtual bool hasOrNot(SDValue X) const {
+ return false;
+ }
+
/// Return true if the target has a bit-test instruction:
/// (X & (1 << Y)) ==/!= 0
/// This knowledge can be used to prevent breaking the pattern,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..737471bc9d9e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12257,9 +12257,16 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
}
- // TODO: There's another pattern in this family, but it may require
- // implementing hasOrNot() to check for profitability:
- // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+ // If we have to invert the sign bit mask, only do that transform if the
+ // target has a bitwise 'or not' instruction (the invert is free).
+ // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+ if (isAllOnesOrAllOnesSplat(N2) && TLI.hasOrNot(N2)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getShiftAmountConstant(EltSizeInBits - 1, VT, DL);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ SDValue Not = DAG.getNOT(DL, Sra, VT);
+ return DAG.getNode(ISD::OR, DL, VT, Not, DAG.getFreeze(N1));
+ }
return SDValue();
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fdc8e48e..3488f361f37e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -414,6 +414,18 @@ class AArch64TargetLowering : public TargetLowering {
return VT.getFixedSizeInBits() >= 64; // vector 'bic'
}
+ bool hasOrNot(SDValue X) const override {
+ EVT VT = X.getValueType();
+
+ if (!VT.isVector())
+ return VT == MVT::i32 || VT == MVT::i64; // scalar 'orn'
+
+ if (VT.isScalableVector())
+ return true;
+
+ return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+ }
+
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd3df792..1092d0117a58a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17788,6 +17788,16 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
+bool ARMTargetLowering::hasOrNot(SDValue Y) const {
+ // Scalar 'orn' is only available in Thumb2.
+ EVT VT = Y.getValueType();
+ if (!VT.isVector())
+ return Subtarget->isThumb2() && VT == MVT::i32; // scalar 'orn'
+ if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps())
+ return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+ return false;
+}
+
// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
// split into multiple extending loads, which are simpler to deal with than an
// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 70aa001a41885..993b63f21c07a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -614,6 +614,8 @@ class VectorType;
return V.getValueType().isScalarInteger();
}
+ bool hasOrNot(SDValue Y) const override;
+
bool
isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
diff --git a/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..c60d140cd6049
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; On AArch64 the inverted sign-splat folds into 'cmge #0', so the vector cases lower to cmge + orr.
+
+define <4 x i32> @vselect_signbit_orn_scalar(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_scalar64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.2d, v0.2d, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+ %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.2d, v0.2d, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+ %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_scalar_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: orr v0.4s, #42
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+ ret <4 x i32> %sel
+}
+
+; Inverse pattern: (Cond0 s> -1) ? 0 : N2 should use the existing sign-splat AND fold instead.
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %result = or <4 x i32> %x, %not_y
+ ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmn w0, #1
+; CHECK-NEXT: csinv w0, w1, wzr, le
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %x, -1
+ %sel = select i1 %cmp, i32 -1, i32 %y
+ ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmn x0, #1
+; CHECK-NEXT: csinv x0, x1, xzr, le
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %x, -1
+ %sel = select i1 %cmp, i64 -1, i64 %y
+ ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orn w0, w0, w1
+; CHECK-NEXT: ret
+ %not_y = xor i32 %y, -1
+ %result = or i32 %x, %not_y
+ ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orn x0, x0, x1
+; CHECK-NEXT: ret
+ %not_y = xor i64 %y, -1
+ %result = or i64 %x, %not_y
+ ret i64 %result
+}
+
diff --git a/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..1a0d63745dfa9
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=armv7-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; The DAGCombiner fold produces ~(Cond0 s>> BW-1) | freeze(N2), and the
+; NOT + OR pair then selects as a single 'vorn' instruction.
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vshr.s32 q8, q8, #31
+; CHECK-NEXT: vorn q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vshr.s64 q8, q8, #63
+; CHECK-NEXT: vorn q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+ %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_const:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vshr.s32 q8, q8, #31
+; CHECK-NEXT: vmvn q8, q8
+; CHECK-NEXT: vorr.i32 q8, #0x2a
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+ ret <4 x i32> %sel
+}
+
+; Inverse pattern: (Cond0 s> -1) ? 0 : N2 should use the existing sign-splat AND fold instead.
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vshr.s32 q8, q8, #31
+; CHECK-NEXT: vand q8, q8, q9
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+; Test to demonstrate that the 'orn' instruction is selected when the pattern matches directly.
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vorn q8, q8, q9
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %result = or <4 x i32> %x, %not_y
+ ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: cmn r0, #1
+; CHECK-NEXT: mvngt r1, #0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt i32 %x, -1
+ %sel = select i1 %cmp, i32 -1, i32 %y
+ ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: cmn r1, #1
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mvngt r3, #0
+; CHECK-NEXT: mvngt r0, #0
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt i64 %x, -1
+ %sel = select i1 %cmp, i64 -1, i64 %y
+ ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mvn r1, r1
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: bx lr
+ %not_y = xor i32 %y, -1
+ %result = or i32 %x, %not_y
+ ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mvn r2, r2
+; CHECK-NEXT: orr r0, r0, r2
+; CHECK-NEXT: mvn r2, r3
+; CHECK-NEXT: orr r1, r1, r2
+; CHECK-NEXT: bx lr
+ %not_y = xor i64 %y, -1
+ %result = or i64 %x, %not_y
+ ret i64 %result
+}