[llvm] [DAG] isKnownNeverZero - add ISD::SHL DemandedElts handling and tests (PR #183772)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 07:07:01 PST 2026
https://github.com/Joeljm1 updated https://github.com/llvm/llvm-project/pull/183772
>From 9b3c3d5100cf124af25b571d0cdb4fc712a7dfc5 Mon Sep 17 00:00:00 2001
From: Joel Joseph Mathews <joeljosephcl10 at gmail.com>
Date: Fri, 27 Feb 2026 22:28:38 +0530
Subject: [PATCH 1/8] [DAG] isKnownNeverZero - add ISD::SHL DemandedElts
handling and tests
Added DemandedElts to the ISD::SHL case in isKnownNeverZero, along with the
required tests.
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +-
.../AArch64/AArch64SelectionDAGTest.cpp | 50 +++++++++++++++++++
2 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4a2bd811b5214..b1e2b07f0c1fb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6209,13 +6209,13 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, const APInt &DemandedElts,
case ISD::SHL: {
if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
- return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+ return isKnownNeverZero(Op.getOperand(0),DemandedElts, Depth + 1);
KnownBits ValKnown = computeKnownBits(Op.getOperand(0), Depth + 1);
// 1 << X is never zero.
if (ValKnown.One[0])
return true;
// If max shift cnt of known ones is non-zero, result is non-zero.
- APInt MaxCnt = computeKnownBits(Op.getOperand(1), Depth + 1).getMaxValue();
+ APInt MaxCnt = computeKnownBits(Op.getOperand(1),DemandedElts ,Depth + 1).getMaxValue();
if (MaxCnt.ult(ValKnown.getBitWidth()) &&
!ValKnown.One.shl(MaxCnt).isZero())
return true;
diff --git a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
index b0c48e8c97995..90a765dc16433 100644
--- a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
@@ -1574,4 +1574,54 @@ TEST_F(AArch64SelectionDAGTest, KnownNeverZero_Select) {
EXPECT_FALSE(DAG->isKnownNeverZero(VSelect444Big, DemandAll));
EXPECT_TRUE(DAG->isKnownNeverZero(VSelect4444, DemandAll));
}
+
+TEST_F(AArch64SelectionDAGTest, KnownNeverZero_SHL_DemandedElts) {
+ SDLoc Loc;
+ EVT VT = EVT::getVectorVT(Context, MVT::i32, 4);
+
+ // Vector: < -2147483648, 1, 0 -8 >
+ SDValue V1 = DAG->getConstant(-2147483648, Loc, MVT::i32);
+ SDValue V2 = DAG->getConstant(1, Loc, MVT::i32);
+ SDValue V3 = DAG->getConstant(0, Loc, MVT::i32);
+ SDValue V4 = DAG->getConstant(-8, Loc, MVT::i32);
+
+ SDValue Vec = DAG->getBuildVector(VT, Loc, {V1, V2, V3, V4});
+
+ SDValue Shift1 = DAG->getConstant(1, Loc, VT);
+ SDValue Shift31 = DAG->getConstant(31, Loc, VT);
+ SDValue ShiftOverflow = DAG->getConstant(32, Loc, VT);
+ SDValue ShiftUnkown = DAG->getExternalSymbol("unknown", VT);
+
+ SDValue Op1 = DAG->getNode(ISD::SHL, Loc, VT, Vec, Shift1);
+ SDValue Op31 = DAG->getNode(ISD::SHL, Loc, VT, Vec, Shift31);
+ SDValue Op32 = DAG->getNode(ISD::SHL, Loc, VT, Vec, ShiftOverflow);
+ SDValue OpUnkown = DAG->getNode(ISD::SHL, Loc, VT, Vec, ShiftUnkown);
+ // Lane 0: -2147483648 << 1 = 0
+ APInt Lane0(4, 1);
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op1, Lane0));
+ EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, Lane0));
+ // Lane 1: 1 << 31 = -2147483648
+ APInt Lane1(4, 2);
+ EXPECT_TRUE(DAG->isKnownNeverZero(Op31, Lane1));
+ // Lane 2: 0 << 1 = 0
+ APInt Lane2(4, 4);
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op1, Lane2));
+ EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, Lane2));
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op32, Lane2));
+ // Lane 3: -8 << 1 = -16
+ APInt Lane3(4, 8);
+ EXPECT_TRUE(DAG->isKnownNeverZero(Op1, Lane3));
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op32, Lane3));
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op31, Lane3));
+ // lane 2 and 4
+ APInt Lane1AND4(4, -6);
+ EXPECT_TRUE(DAG->isKnownNeverZero(Op1, Lane1AND4));
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op31, Lane1AND4));
+ EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, Lane1AND4));
+ // all lanes
+ APInt LaneAll(4, -1);
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op1, LaneAll));
+ EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, LaneAll));
+ EXPECT_FALSE(DAG->isKnownNeverZero(Op32, LaneAll));
+}
} // end namespace llvm
>From cedf95e471e32fd9c00cafefc3de1796bd2b9824 Mon Sep 17 00:00:00 2001
From: Joeljm1 <143259392+Joeljm1 at users.noreply.github.com>
Date: Fri, 27 Feb 2026 20:54:52 +0300
Subject: [PATCH 2/8] Update llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Add a missing DemandedElts in isKnownNeverZero ISD::SHL case
Co-authored-by: Simon Pilgrim <git at redking.me.uk>
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b1e2b07f0c1fb..34a8e3c6d1cc5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6210,7 +6210,7 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, const APInt &DemandedElts,
case ISD::SHL: {
if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
return isKnownNeverZero(Op.getOperand(0),DemandedElts, Depth + 1);
- KnownBits ValKnown = computeKnownBits(Op.getOperand(0), Depth + 1);
+ KnownBits ValKnown = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// 1 << X is never zero.
if (ValKnown.One[0])
return true;
>From 00bc8adfa8425ccab4ba23639ff94ec0bf3b476b Mon Sep 17 00:00:00 2001
From: Joel Joseph Mathews <joeljosephcl10 at gmail.com>
Date: Fri, 6 Mar 2026 09:29:46 +0530
Subject: [PATCH 3/8] removed unit test for SHL and regenerated
known-never-zero.ll
---
llvm/test/CodeGen/X86/known-never-zero.ll | 95 ++++++++++++-------
.../AArch64/AArch64SelectionDAGTest.cpp | 50 ----------
2 files changed, 61 insertions(+), 84 deletions(-)
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index bbc1d9477c6a9..726fcc868e840 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -134,7 +134,9 @@ define i32 @extractelt_nonzero_vec(<4 x i32> %a0, ptr %p1, i32 %a2) {
; X86-NEXT: por %xmm0, %xmm1
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: extractelt_nonzero_vec:
@@ -142,8 +144,9 @@ define i32 @extractelt_nonzero_vec(<4 x i32> %a0, ptr %p1, i32 %a2) {
; X64-NEXT: vmovaps {{.*#+}} xmm1 = [8,4294967295,4294967295,4294967295]
; X64-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; X64-NEXT: vmovaps %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%cmp = icmp sgt <4 x i32> zeroinitializer, %a0
%sel = select <4 x i1> %cmp, <4 x i32> <i32 4, i32 0, i32 0, i32 0>, <4 x i32> <i32 8, i32 -1, i32 -1, i32 -1>
@@ -218,7 +221,7 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
; X86-NEXT: pslld $23, %xmm0
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: cvttps2dq %xmm0, %xmm0
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [123,0,0,0]
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: movdqa %xmm1, (%eax)
@@ -233,7 +236,7 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
; X64-NEXT: vpslld $23, %xmm0, %xmm0
; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vcvttps2dq %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [123,0,0,0]
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
; X64-NEXT: vmovd %xmm0, %ecx
; X64-NEXT: movl $32, %eax
@@ -431,15 +434,18 @@ define i32 @uaddsat_known_nonzero_vec(<16 x i8> %x, ptr %p) {
; X86-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: movzbl (%eax), %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: uaddsat_known_nonzero_vec:
; X64: # %bb.0:
; X64-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrb $0, %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vpextrb $0, %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
store <16 x i8> %z, ptr %p
@@ -970,14 +976,17 @@ define i32 @smax_known_never_zero_vec_element(<4 x i32> %x) {
; X86-NEXT: pandn %xmm1, %xmm2
; X86-NEXT: por %xmm0, %xmm2
; X86-NEXT: movd %xmm2, %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: smax_known_never_zero_vec_element:
; X64: # %bb.0:
; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> <i32 54, i32 -23, i32 -12, i32 -1>)
%el = extractelement <4 x i32> %z, i32 0
@@ -1276,7 +1285,9 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: sra_known_nonzero_exact_vec:
@@ -1285,8 +1296,9 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X64-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vpsrad %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrd $1, %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vpextrd $1, %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
%y = or <4 x i32> %yy, <i32 0, i32 256, i32 0, i32 0>
@@ -1354,7 +1366,9 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: srl_known_nonzero_sign_bit_set_vec:
@@ -1363,8 +1377,9 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,65535,2147606891,0]
; X64-NEXT: vpsrld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrd $2, %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vpextrd $2, %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
%z = lshr <4 x i32> <i32 0, i32 65535, i32 2147606891, i32 0>, %x.splat
@@ -1409,7 +1424,9 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: srl_known_nonzero_exact_vec:
@@ -1418,8 +1435,9 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X64-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vpsrld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrd $3, %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
%y = or <4 x i32> %yy, <i32 0, i32 0, i32 0, i32 256>
@@ -1766,15 +1784,18 @@ define i32 @add_nuw_known_nonzero_vec(<4 x i32> %xx, ptr %p) {
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: add_nuw_known_nonzero_vec:
; X64: # %bb.0:
; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
; X64-NEXT: retq
%z = add nuw <4 x i32> %xx, <i32 1, i32 0, i32 0, i32 0>
store <4 x i32> %z, ptr %p
@@ -1982,8 +2003,7 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
define i32 @bitcast_known_nonzero(<2 x i16> %xx) {
; X86-LABEL: bitcast_known_nonzero:
; X86: # %bb.0:
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-NEXT: pslld $23, %xmm0
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: cvttps2dq %xmm0, %xmm0
@@ -2058,6 +2078,7 @@ define i32 @zext_known_nonzero(i16 %xx) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
@@ -2067,6 +2088,7 @@ define i32 @zext_known_nonzero(i16 %xx) {
; X64-NEXT: movl $256, %eax # imm = 0x100
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shll %cl, %eax
+; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x = shl nuw nsw i16 256, %xx
@@ -2101,6 +2123,7 @@ define i32 @sext_known_nonzero(i16 %xx) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
@@ -2110,6 +2133,7 @@ define i32 @sext_known_nonzero(i16 %xx) {
; X64-NEXT: movl $256, %eax # imm = 0x100
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shll %cl, %eax
+; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x = shl nuw nsw i16 256, %xx
@@ -2169,8 +2193,9 @@ define i32 @test_zext_demanded_elts(<4 x i32> %a0, ptr %p) {
; X64-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT: vmovaps %xmm2, 16(%rdi)
; X64-NEXT: vmovdqa %xmm1, (%rdi)
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: rep bsfq %rax, %rax
+; X64-NEXT: vmovd %xmm0, %ecx
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: rep bsfq %rcx, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%cmp = icmp sgt <4 x i32> zeroinitializer, %a0
@@ -2203,11 +2228,12 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
; X86-NEXT: movdqa %xmm0, 16(%eax)
; X86-NEXT: movdqa %xmm2, (%eax)
; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: rep bsfl %ecx, %edx
-; X86-NEXT: rep bsfl %eax, %eax
-; X86-NEXT: addl $32, %eax
-; X86-NEXT: testl %ecx, %ecx
-; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: bsfl %eax, %eax
+; X86-NEXT: movl $32, %edx
+; X86-NEXT: cmovnel %eax, %edx
+; X86-NEXT: addl $32, %edx
+; X86-NEXT: bsfl %ecx, %eax
+; X86-NEXT: cmovel %edx, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_sext_demanded_elts:
@@ -2219,8 +2245,9 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
; X64-NEXT: vpmovsxdq %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
; X64-NEXT: vmovdqa %xmm1, 16(%rdi)
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: rep bsfq %rax, %rax
+; X64-NEXT: vmovq %xmm0, %rcx
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: rep bsfq %rcx, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%cmp = icmp sgt <4 x i32> zeroinitializer, %a0
diff --git a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
index 90a765dc16433..b0c48e8c97995 100644
--- a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
@@ -1574,54 +1574,4 @@ TEST_F(AArch64SelectionDAGTest, KnownNeverZero_Select) {
EXPECT_FALSE(DAG->isKnownNeverZero(VSelect444Big, DemandAll));
EXPECT_TRUE(DAG->isKnownNeverZero(VSelect4444, DemandAll));
}
-
-TEST_F(AArch64SelectionDAGTest, KnownNeverZero_SHL_DemandedElts) {
- SDLoc Loc;
- EVT VT = EVT::getVectorVT(Context, MVT::i32, 4);
-
- // Vector: < -2147483648, 1, 0 -8 >
- SDValue V1 = DAG->getConstant(-2147483648, Loc, MVT::i32);
- SDValue V2 = DAG->getConstant(1, Loc, MVT::i32);
- SDValue V3 = DAG->getConstant(0, Loc, MVT::i32);
- SDValue V4 = DAG->getConstant(-8, Loc, MVT::i32);
-
- SDValue Vec = DAG->getBuildVector(VT, Loc, {V1, V2, V3, V4});
-
- SDValue Shift1 = DAG->getConstant(1, Loc, VT);
- SDValue Shift31 = DAG->getConstant(31, Loc, VT);
- SDValue ShiftOverflow = DAG->getConstant(32, Loc, VT);
- SDValue ShiftUnkown = DAG->getExternalSymbol("unknown", VT);
-
- SDValue Op1 = DAG->getNode(ISD::SHL, Loc, VT, Vec, Shift1);
- SDValue Op31 = DAG->getNode(ISD::SHL, Loc, VT, Vec, Shift31);
- SDValue Op32 = DAG->getNode(ISD::SHL, Loc, VT, Vec, ShiftOverflow);
- SDValue OpUnkown = DAG->getNode(ISD::SHL, Loc, VT, Vec, ShiftUnkown);
- // Lane 0: -2147483648 << 1 = 0
- APInt Lane0(4, 1);
- EXPECT_FALSE(DAG->isKnownNeverZero(Op1, Lane0));
- EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, Lane0));
- // Lane 1: 1 << 31 = -2147483648
- APInt Lane1(4, 2);
- EXPECT_TRUE(DAG->isKnownNeverZero(Op31, Lane1));
- // Lane 2: 0 << 1 = 0
- APInt Lane2(4, 4);
- EXPECT_FALSE(DAG->isKnownNeverZero(Op1, Lane2));
- EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, Lane2));
- EXPECT_FALSE(DAG->isKnownNeverZero(Op32, Lane2));
- // Lane 3: -8 << 1 = -16
- APInt Lane3(4, 8);
- EXPECT_TRUE(DAG->isKnownNeverZero(Op1, Lane3));
- EXPECT_FALSE(DAG->isKnownNeverZero(Op32, Lane3));
- EXPECT_FALSE(DAG->isKnownNeverZero(Op31, Lane3));
- // lane 2 and 4
- APInt Lane1AND4(4, -6);
- EXPECT_TRUE(DAG->isKnownNeverZero(Op1, Lane1AND4));
- EXPECT_FALSE(DAG->isKnownNeverZero(Op31, Lane1AND4));
- EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, Lane1AND4));
- // all lanes
- APInt LaneAll(4, -1);
- EXPECT_FALSE(DAG->isKnownNeverZero(Op1, LaneAll));
- EXPECT_FALSE(DAG->isKnownNeverZero(OpUnkown, LaneAll));
- EXPECT_FALSE(DAG->isKnownNeverZero(Op32, LaneAll));
-}
} // end namespace llvm
>From f9b182d7d1271e176968d800bb6a976c8f47f09d Mon Sep 17 00:00:00 2001
From: Joel Joseph Mathews <joeljosephcl10 at gmail.com>
Date: Fri, 6 Mar 2026 16:29:45 +0530
Subject: [PATCH 4/8] regenerate known-never-zero.ll
---
llvm/test/CodeGen/X86/known-never-zero.ll | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 726fcc868e840..eae6d20be7c85 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -221,7 +221,7 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
; X86-NEXT: pslld $23, %xmm0
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: cvttps2dq %xmm0, %xmm0
-; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [123,0,0,0]
; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: movdqa %xmm1, (%eax)
@@ -236,7 +236,7 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
; X64-NEXT: vpslld $23, %xmm0, %xmm0
; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vcvttps2dq %xmm0, %xmm0
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [123,0,0,0]
; X64-NEXT: vmovdqa %xmm0, (%rdi)
; X64-NEXT: vmovd %xmm0, %ecx
; X64-NEXT: movl $32, %eax
@@ -2003,7 +2003,8 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
define i32 @bitcast_known_nonzero(<2 x i16> %xx) {
; X86-LABEL: bitcast_known_nonzero:
; X86: # %bb.0:
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: pslld $23, %xmm0
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: cvttps2dq %xmm0, %xmm0
@@ -2078,7 +2079,6 @@ define i32 @zext_known_nonzero(i16 %xx) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
@@ -2088,7 +2088,6 @@ define i32 @zext_known_nonzero(i16 %xx) {
; X64-NEXT: movl $256, %eax # imm = 0x100
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shll %cl, %eax
-; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x = shl nuw nsw i16 256, %xx
@@ -2123,7 +2122,6 @@ define i32 @sext_known_nonzero(i16 %xx) {
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $256, %eax # imm = 0x100
; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
@@ -2133,7 +2131,6 @@ define i32 @sext_known_nonzero(i16 %xx) {
; X64-NEXT: movl $256, %eax # imm = 0x100
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shll %cl, %eax
-; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x = shl nuw nsw i16 256, %xx
>From 8f14b889b75c1fd6c6cb78a27e8b9e897e70ad0d Mon Sep 17 00:00:00 2001
From: Joeljm1 <143259392+Joeljm1 at users.noreply.github.com>
Date: Fri, 6 Mar 2026 14:37:45 +0300
Subject: [PATCH 5/8] Update llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Co-authored-by: Simon Pilgrim <git at redking.me.uk>
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3cd57029fe3a6..61ccfbe45c8ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6262,7 +6262,7 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, const APInt &DemandedElts,
if (ValKnown.One[0])
return true;
// If max shift cnt of known ones is non-zero, result is non-zero.
- APInt MaxCnt = computeKnownBits(Op.getOperand(1),DemandedElts ,Depth + 1).getMaxValue();
+ APInt MaxCnt = computeKnownBits(Op.getOperand(1), DemandedElts,Depth + 1).getMaxValue();
if (MaxCnt.ult(ValKnown.getBitWidth()) &&
!ValKnown.One.shl(MaxCnt).isZero())
return true;
>From d505977ff44606abd35ae90444f1bc09c9b2a7c8 Mon Sep 17 00:00:00 2001
From: Joeljm1 <143259392+Joeljm1 at users.noreply.github.com>
Date: Fri, 6 Mar 2026 14:38:00 +0300
Subject: [PATCH 6/8] Update llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Co-authored-by: Simon Pilgrim <git at redking.me.uk>
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 61ccfbe45c8ad..138d6e971b908 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6256,7 +6256,7 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, const APInt &DemandedElts,
case ISD::SHL: {
if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
- return isKnownNeverZero(Op.getOperand(0),DemandedElts, Depth + 1);
+ return isKnownNeverZero(Op.getOperand(0), DemandedElts, Depth + 1);
KnownBits ValKnown = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// 1 << X is never zero.
if (ValKnown.One[0])
>From ce5a498cbc830f887d9fd80fe0ae16f3a90d0604 Mon Sep 17 00:00:00 2001
From: Joel Joseph Mathews <joeljosephcl10 at gmail.com>
Date: Fri, 6 Mar 2026 18:42:27 +0530
Subject: [PATCH 7/8] clang-format only shl case in isKnownNeverZero fn
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 138d6e971b908..c4cd1b88f7802 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6257,12 +6257,14 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, const APInt &DemandedElts,
case ISD::SHL: {
if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
return isKnownNeverZero(Op.getOperand(0), DemandedElts, Depth + 1);
- KnownBits ValKnown = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ KnownBits ValKnown =
+ computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// 1 << X is never zero.
if (ValKnown.One[0])
return true;
// If max shift cnt of known ones is non-zero, result is non-zero.
- APInt MaxCnt = computeKnownBits(Op.getOperand(1), DemandedElts,Depth + 1).getMaxValue();
+ APInt MaxCnt = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1)
+ .getMaxValue();
if (MaxCnt.ult(ValKnown.getBitWidth()) &&
!ValKnown.One.shl(MaxCnt).isZero())
return true;
>From cd99a17a865861e2e6ca383b75609d787fabf726 Mon Sep 17 00:00:00 2001
From: Joel Joseph Mathews <joeljosephcl10 at gmail.com>
Date: Fri, 6 Mar 2026 20:36:18 +0530
Subject: [PATCH 8/8] fixed tests for known-never-zero.ll
---
llvm/test/CodeGen/X86/known-never-zero.ll | 120 ++++++++--------------
1 file changed, 42 insertions(+), 78 deletions(-)
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index eae6d20be7c85..3953f9f4ab93f 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -134,9 +134,7 @@ define i32 @extractelt_nonzero_vec(<4 x i32> %a0, ptr %p1, i32 %a2) {
; X86-NEXT: por %xmm0, %xmm1
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: extractelt_nonzero_vec:
@@ -144,9 +142,8 @@ define i32 @extractelt_nonzero_vec(<4 x i32> %a0, ptr %p1, i32 %a2) {
; X64-NEXT: vmovaps {{.*#+}} xmm1 = [8,4294967295,4294967295,4294967295]
; X64-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; X64-NEXT: vmovaps %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%cmp = icmp sgt <4 x i32> zeroinitializer, %a0
%sel = select <4 x i1> %cmp, <4 x i32> <i32 4, i32 0, i32 0, i32 0>, <4 x i32> <i32 8, i32 -1, i32 -1, i32 -1>
@@ -226,9 +223,7 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
; X86-NEXT: pand %xmm0, %xmm1
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: shl_known_nonzero_1s_bit_set_vec:
@@ -238,9 +233,8 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
; X64-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [123,0,0,0]
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%z = shl <4 x i32> <i32 123, i32 0, i32 0, i32 0>, %x
store <4 x i32> %z, ptr %p
@@ -290,9 +284,7 @@ define i32 @shl_known_nonzero_nsw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; X86-NEXT: movdqa %xmm3, (%eax)
; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: shl_known_nonzero_nsw_vec:
@@ -303,9 +295,8 @@ define i32 @shl_known_nonzero_nsw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X64-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%y = or <4 x i32> %yy, <i32 256, i32 0, i32 0, i32 0>
%z = shl nsw <4 x i32> %y, %x
@@ -356,9 +347,7 @@ define i32 @shl_known_nonzero_nuw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; X86-NEXT: movdqa %xmm3, (%eax)
; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: shl_known_nonzero_nuw_vec:
@@ -369,9 +358,8 @@ define i32 @shl_known_nonzero_nuw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X64-NEXT: vcvttps2dq %xmm0, %xmm0
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%y = or <4 x i32> %yy, <i32 256, i32 0, i32 0, i32 0>
%z = shl nuw <4 x i32> %y, %x
@@ -434,18 +422,15 @@ define i32 @uaddsat_known_nonzero_vec(<16 x i8> %x, ptr %p) {
; X86-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: movzbl (%eax), %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: uaddsat_known_nonzero_vec:
; X64: # %bb.0:
; X64-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrb $0, %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vpextrb $0, %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
store <16 x i8> %z, ptr %p
@@ -540,9 +525,7 @@ define i32 @umax_known_nonzero_vec(<16 x i8> %x, ptr %p) {
; X86-NEXT: pmaxub %xmm0, %xmm1
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: movzbl (%eax), %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: umax_known_nonzero_vec:
@@ -559,9 +542,8 @@ define i32 @umax_known_nonzero_vec(<16 x i8> %x, ptr %p) {
; X64-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrb $0, %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vpextrb $0, %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%yy = shl nuw <16 x i8> <i8 4, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %x
%z = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %x, <16 x i8> %yy)
@@ -976,17 +958,14 @@ define i32 @smax_known_never_zero_vec_element(<4 x i32> %x) {
; X86-NEXT: pandn %xmm1, %xmm2
; X86-NEXT: por %xmm0, %xmm2
; X86-NEXT: movd %xmm2, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: smax_known_never_zero_vec_element:
; X64: # %bb.0:
; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> <i32 54, i32 -23, i32 -12, i32 -1>)
%el = extractelement <4 x i32> %z, i32 0
@@ -1285,9 +1264,7 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: sra_known_nonzero_exact_vec:
@@ -1296,9 +1273,8 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X64-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vpsrad %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrd $1, %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vpextrd $1, %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
%y = or <4 x i32> %yy, <i32 0, i32 256, i32 0, i32 0>
@@ -1366,9 +1342,7 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: srl_known_nonzero_sign_bit_set_vec:
@@ -1377,9 +1351,8 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,65535,2147606891,0]
; X64-NEXT: vpsrld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrd $2, %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vpextrd $2, %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
%z = lshr <4 x i32> <i32 0, i32 65535, i32 2147606891, i32 0>, %x.splat
@@ -1424,9 +1397,7 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X86-NEXT: movdqa %xmm1, (%eax)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: srl_known_nonzero_exact_vec:
@@ -1435,9 +1406,8 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
; X64-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vpsrld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vpextrd $3, %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vpextrd $3, %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
%y = or <4 x i32> %yy, <i32 0, i32 0, i32 0, i32 256>
@@ -1784,18 +1754,15 @@ define i32 @add_nuw_known_nonzero_vec(<4 x i32> %xx, ptr %p) {
; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT: movdqa %xmm0, (%eax)
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: bsfl %eax, %ecx
-; X86-NEXT: movl $32, %eax
-; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: rep bsfl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: add_nuw_known_nonzero_vec:
; X64: # %bb.0:
; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $32, %eax
-; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
%z = add nuw <4 x i32> %xx, <i32 1, i32 0, i32 0, i32 0>
store <4 x i32> %z, ptr %p
@@ -2190,9 +2157,8 @@ define i32 @test_zext_demanded_elts(<4 x i32> %a0, ptr %p) {
; X64-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-NEXT: vmovaps %xmm2, 16(%rdi)
; X64-NEXT: vmovdqa %xmm1, (%rdi)
-; X64-NEXT: vmovd %xmm0, %ecx
-; X64-NEXT: movl $64, %eax
-; X64-NEXT: rep bsfq %rcx, %rax
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: rep bsfq %rax, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%cmp = icmp sgt <4 x i32> zeroinitializer, %a0
@@ -2225,12 +2191,11 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
; X86-NEXT: movdqa %xmm0, 16(%eax)
; X86-NEXT: movdqa %xmm2, (%eax)
; X86-NEXT: movd %xmm1, %eax
-; X86-NEXT: bsfl %eax, %eax
-; X86-NEXT: movl $32, %edx
-; X86-NEXT: cmovnel %eax, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: bsfl %ecx, %eax
-; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: rep bsfl %ecx, %edx
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: addl $32, %eax
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: cmovnel %edx, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_sext_demanded_elts:
@@ -2242,9 +2207,8 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
; X64-NEXT: vpmovsxdq %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdi)
; X64-NEXT: vmovdqa %xmm1, 16(%rdi)
-; X64-NEXT: vmovq %xmm0, %rcx
-; X64-NEXT: movl $64, %eax
-; X64-NEXT: rep bsfq %rcx, %rax
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: rep bsfq %rax, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%cmp = icmp sgt <4 x i32> zeroinitializer, %a0
More information about the llvm-commits
mailing list