[llvm] [DAG] isKnownNeverZero: Add DemandedElts handling for ROTL/ROTR/BITREVERSE/BSWAP/CTPOP/ABS (PR #184033)

Ayush Kumar Gaur via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 6 08:15:05 PST 2026


https://github.com/Ayush3941 updated https://github.com/llvm/llvm-project/pull/184033

>From 2859336292085d42da26be9ecc5fb898de5bb14a Mon Sep 17 00:00:00 2001
From: Ayush3941 <ayushkgaur1 at gmail.com>
Date: Sun, 1 Mar 2026 14:54:12 -0500
Subject: [PATCH 1/5] [DAG] isKnownNeverZero: Add DemandedElts handling for
 ROTL/ROTR/BITREVERSE/BSWAP/CTPOP/ABS

---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  2 +-
 .../X86/known-never-zero-demanded-elts.ll     | 78 +++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 31a83dd6e0ec0..c5750b36dc629 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6290,7 +6290,7 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, const APInt &DemandedElts,
   case ISD::BSWAP:
   case ISD::CTPOP:
   case ISD::ABS:
-    return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+    return isKnownNeverZero(Op.getOperand(0), DemandedElts, Depth + 1);
 
   case ISD::SRA:
   case ISD::SRL: {
diff --git a/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll b/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
new file mode 100644
index 0000000000000..97725d0e4dddd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
@@ -0,0 +1,78 @@
+; RUN: llc -O2 -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
+
+; Helper: build vector with lane0 set, other lanes undef.
+
+declare <4 x i8>  @llvm.ctpop.v4i8(<4 x i8>)
+declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
+declare <4 x i8>  @llvm.bitreverse.v4i8(<4 x i8>)
+declare <4 x i8>  @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
+declare <4 x i8>  @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
+declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
+
+define i1 @ctpop_lane0_nonzero() {
+; CHECK-LABEL: ctpop_lane0_nonzero:
+; CHECK: mov{{.*}}1
+; CHECK: ret
+  %v0 = insertelement <4 x i8> undef, i8 8, i64 0          ; lane0 = 8 (non-zero)
+  %w  = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %v0)        ; ctpop(8)=1 -> non-zero
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @bswap_lane0_nonzero() {
+; CHECK-LABEL: bswap_lane0_nonzero:
+; CHECK: mov{{.*}}1
+; CHECK: ret
+  %v0 = insertelement <4 x i16> undef, i16 1, i64 0         ; lane0 = 1
+  %w  = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v0)      ; bswap(1)=256 -> non-zero
+  %e0 = extractelement <4 x i16> %w, i64 0
+  %cmp = icmp ne i16 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @bitreverse_lane0_nonzero() {
+; CHECK-LABEL: bitreverse_lane0_nonzero:
+; CHECK: mov{{.*}}1
+; CHECK: ret
+  %v0 = insertelement <4 x i8> undef, i8 1, i64 0           ; lane0 = 1
+  %w  = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %v0)    ; bitreverse(1)=0x80 -> non-zero
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @rotl_lane0_nonzero() {
+; CHECK-LABEL: rotl_lane0_nonzero:
+; CHECK: mov{{.*}}1
+; CHECK: ret
+  %x  = insertelement <4 x i8> undef, i8 2, i64 0           ; lane0 = 2 (non-zero)
+  %k  = insertelement <4 x i8> undef, i8 1, i64 0           ; rotate by 1
+  %w  = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @rotr_lane0_nonzero() {
+; CHECK-LABEL: rotr_lane0_nonzero:
+; CHECK: mov{{.*}}1
+; CHECK: ret
+  %x  = insertelement <4 x i8> undef, i8 2, i64 0
+  %k  = insertelement <4 x i8> undef, i8 1, i64 0
+  %w  = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @abs_lane0_nonzero() {
+; CHECK-LABEL: abs_lane0_nonzero:
+; CHECK: mov{{.*}}1
+; CHECK: ret
+  %v0 = insertelement <4 x i8> undef, i8 -2, i64 0   ; lane0 = -2 (non-zero)
+  %w  = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %v0, i1 false)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
\ No newline at end of file

>From e1022800edd244b75efb32b4dae5636afb7462d6 Mon Sep 17 00:00:00 2001
From: Ayush3941 <ayushkgaur1 at gmail.com>
Date: Sun, 1 Mar 2026 15:07:07 -0500
Subject: [PATCH 2/5] [DAG] isKnownNeverZero: Add DemandedElts handling for
 ROTL/ROTR/BITREVERSE/BSWAP/CTPOP/ABS v2

---
 .../X86/known-never-zero-demanded-elts.ll     | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll b/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
index 97725d0e4dddd..34942bf8d9ab5 100644
--- a/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
@@ -1,20 +1,20 @@
 ; RUN: llc -O2 -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
 
-; Helper: build vector with lane0 set, other lanes undef.
+; Helper: build vector with lane0 set, other lanes poison.
 
 declare <4 x i8>  @llvm.ctpop.v4i8(<4 x i8>)
 declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
 declare <4 x i8>  @llvm.bitreverse.v4i8(<4 x i8>)
 declare <4 x i8>  @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
 declare <4 x i8>  @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
-declare <4 x i8> @llvm.abs.v4i8(<4 x i8>, i1)
+declare <4 x i8>  @llvm.abs.v4i8(<4 x i8>, i1)
 
 define i1 @ctpop_lane0_nonzero() {
 ; CHECK-LABEL: ctpop_lane0_nonzero:
 ; CHECK: mov{{.*}}1
 ; CHECK: ret
-  %v0 = insertelement <4 x i8> undef, i8 8, i64 0          ; lane0 = 8 (non-zero)
-  %w  = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %v0)        ; ctpop(8)=1 -> non-zero
+  %v0 = insertelement <4 x i8> poison, i8 8, i64 0
+  %w  = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %v0)
   %e0 = extractelement <4 x i8> %w, i64 0
   %cmp = icmp ne i8 %e0, 0
   ret i1 %cmp
@@ -24,8 +24,8 @@ define i1 @bswap_lane0_nonzero() {
 ; CHECK-LABEL: bswap_lane0_nonzero:
 ; CHECK: mov{{.*}}1
 ; CHECK: ret
-  %v0 = insertelement <4 x i16> undef, i16 1, i64 0         ; lane0 = 1
-  %w  = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v0)      ; bswap(1)=256 -> non-zero
+  %v0 = insertelement <4 x i16> poison, i16 1, i64 0
+  %w  = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v0)
   %e0 = extractelement <4 x i16> %w, i64 0
   %cmp = icmp ne i16 %e0, 0
   ret i1 %cmp
@@ -35,8 +35,8 @@ define i1 @bitreverse_lane0_nonzero() {
 ; CHECK-LABEL: bitreverse_lane0_nonzero:
 ; CHECK: mov{{.*}}1
 ; CHECK: ret
-  %v0 = insertelement <4 x i8> undef, i8 1, i64 0           ; lane0 = 1
-  %w  = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %v0)    ; bitreverse(1)=0x80 -> non-zero
+  %v0 = insertelement <4 x i8> poison, i8 1, i64 0
+  %w  = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %v0)
   %e0 = extractelement <4 x i8> %w, i64 0
   %cmp = icmp ne i8 %e0, 0
   ret i1 %cmp
@@ -46,8 +46,8 @@ define i1 @rotl_lane0_nonzero() {
 ; CHECK-LABEL: rotl_lane0_nonzero:
 ; CHECK: mov{{.*}}1
 ; CHECK: ret
-  %x  = insertelement <4 x i8> undef, i8 2, i64 0           ; lane0 = 2 (non-zero)
-  %k  = insertelement <4 x i8> undef, i8 1, i64 0           ; rotate by 1
+  %x  = insertelement <4 x i8> poison, i8 2, i64 0
+  %k  = insertelement <4 x i8> poison, i8 1, i64 0
   %w  = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
   %e0 = extractelement <4 x i8> %w, i64 0
   %cmp = icmp ne i8 %e0, 0
@@ -58,8 +58,8 @@ define i1 @rotr_lane0_nonzero() {
 ; CHECK-LABEL: rotr_lane0_nonzero:
 ; CHECK: mov{{.*}}1
 ; CHECK: ret
-  %x  = insertelement <4 x i8> undef, i8 2, i64 0
-  %k  = insertelement <4 x i8> undef, i8 1, i64 0
+  %x  = insertelement <4 x i8> poison, i8 2, i64 0
+  %k  = insertelement <4 x i8> poison, i8 1, i64 0
   %w  = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
   %e0 = extractelement <4 x i8> %w, i64 0
   %cmp = icmp ne i8 %e0, 0
@@ -70,9 +70,9 @@ define i1 @abs_lane0_nonzero() {
 ; CHECK-LABEL: abs_lane0_nonzero:
 ; CHECK: mov{{.*}}1
 ; CHECK: ret
-  %v0 = insertelement <4 x i8> undef, i8 -2, i64 0   ; lane0 = -2 (non-zero)
+  %v0 = insertelement <4 x i8> poison, i8 -2, i64 0
   %w  = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %v0, i1 false)
   %e0 = extractelement <4 x i8> %w, i64 0
   %cmp = icmp ne i8 %e0, 0
   ret i1 %cmp
-}
\ No newline at end of file
+}

>From 89e5b5f5efa40a30b7fee2fe1e002d4f19e83e6f Mon Sep 17 00:00:00 2001
From: Ayush3941 <ayushkgaur1 at gmail.com>
Date: Fri, 6 Mar 2026 10:42:10 -0500
Subject: [PATCH 3/5] [DAG] isKnownNeverZero: Add DemandedElts handling for
 ROTL/ROTR/BITREVERSE/BSWAP/CTPOP/ABS - add lane-0 tests to
 known-never-zero.ll

---
 llvm/test/CodeGen/X86/known-never-zero.ll | 110 ++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 12bb486d8dceb..a058d54bade25 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -20,6 +20,12 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
 declare i32 @llvm.abs.i32(i32, i1)
 declare i32 @llvm.fshl.i32(i32, i32, i32)
 declare i32 @llvm.fshr.i32(i32, i32, i32)
+declare <4 x i8>  @llvm.ctpop.v4i8(<4 x i8>)
+declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
+declare <4 x i8>  @llvm.bitreverse.v4i8(<4 x i8>)
+declare <4 x i8>  @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
+declare <4 x i8>  @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
+declare <4 x i8>  @llvm.abs.v4i8(<4 x i8>, i1)
 
 define i32 @or_known_nonzero(i32 %x) {
 ; X86-LABEL: or_known_nonzero:
@@ -1465,3 +1471,107 @@ define i32 @sext_maybe_zero(i16 %x) {
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
   ret i32 %r
 }
+
+define i1 @ctpop_lane0_nonzero() {
+; X86-LABEL: ctpop_lane0_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: ctpop_lane0_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %v0 = insertelement <4 x i8> poison, i8 8, i64 0
+  %w  = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %v0)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @bswap_lane0_nonzero() {
+; X86-LABEL: bswap_lane0_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: bswap_lane0_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %v0 = insertelement <4 x i16> poison, i16 1, i64 0
+  %w  = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v0)
+  %e0 = extractelement <4 x i16> %w, i64 0
+  %cmp = icmp ne i16 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @bitreverse_lane0_nonzero() {
+; X86-LABEL: bitreverse_lane0_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: bitreverse_lane0_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %v0 = insertelement <4 x i8> poison, i8 1, i64 0
+  %w  = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %v0)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @rotl_lane0_nonzero() {
+; X86-LABEL: rotl_lane0_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: rotl_lane0_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %x  = insertelement <4 x i8> poison, i8 2, i64 0
+  %k  = insertelement <4 x i8> poison, i8 1, i64 0
+  %w  = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @rotr_lane0_nonzero() {
+; X86-LABEL: rotr_lane0_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: rotr_lane0_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %x  = insertelement <4 x i8> poison, i8 2, i64 0
+  %k  = insertelement <4 x i8> poison, i8 1, i64 0
+  %w  = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}
+
+define i1 @abs_lane0_nonzero() {
+; X86-LABEL: abs_lane0_nonzero:
+; X86:       # %bb.0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: abs_lane0_nonzero:
+; X64:       # %bb.0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
+  %v0 = insertelement <4 x i8> poison, i8 -2, i64 0
+  %w  = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %v0, i1 false)
+  %e0 = extractelement <4 x i8> %w, i64 0
+  %cmp = icmp ne i8 %e0, 0
+  ret i1 %cmp
+}

>From 11e0095a18d07cc025e10cccaa2b2f4bbc572d73 Mon Sep 17 00:00:00 2001
From: Ayush3941 <ayushkgaur1 at gmail.com>
Date: Fri, 6 Mar 2026 10:44:31 -0500
Subject: [PATCH 4/5] [DAG] isKnownNeverZero: Add DemandedElts handling for
 ROTL/ROTR/BITREVERSE/BSWAP/CTPOP/ABS - delete standalone
 known-never-zero-demanded-elts.ll test

---
 .../X86/known-never-zero-demanded-elts.ll     | 78 -------------------
 1 file changed, 78 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll

diff --git a/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll b/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
deleted file mode 100644
index 34942bf8d9ab5..0000000000000
--- a/llvm/test/CodeGen/X86/known-never-zero-demanded-elts.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; RUN: llc -O2 -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
-
-; Helper: build vector with lane0 set, other lanes poison.
-
-declare <4 x i8>  @llvm.ctpop.v4i8(<4 x i8>)
-declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
-declare <4 x i8>  @llvm.bitreverse.v4i8(<4 x i8>)
-declare <4 x i8>  @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
-declare <4 x i8>  @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
-declare <4 x i8>  @llvm.abs.v4i8(<4 x i8>, i1)
-
-define i1 @ctpop_lane0_nonzero() {
-; CHECK-LABEL: ctpop_lane0_nonzero:
-; CHECK: mov{{.*}}1
-; CHECK: ret
-  %v0 = insertelement <4 x i8> poison, i8 8, i64 0
-  %w  = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %v0)
-  %e0 = extractelement <4 x i8> %w, i64 0
-  %cmp = icmp ne i8 %e0, 0
-  ret i1 %cmp
-}
-
-define i1 @bswap_lane0_nonzero() {
-; CHECK-LABEL: bswap_lane0_nonzero:
-; CHECK: mov{{.*}}1
-; CHECK: ret
-  %v0 = insertelement <4 x i16> poison, i16 1, i64 0
-  %w  = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v0)
-  %e0 = extractelement <4 x i16> %w, i64 0
-  %cmp = icmp ne i16 %e0, 0
-  ret i1 %cmp
-}
-
-define i1 @bitreverse_lane0_nonzero() {
-; CHECK-LABEL: bitreverse_lane0_nonzero:
-; CHECK: mov{{.*}}1
-; CHECK: ret
-  %v0 = insertelement <4 x i8> poison, i8 1, i64 0
-  %w  = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %v0)
-  %e0 = extractelement <4 x i8> %w, i64 0
-  %cmp = icmp ne i8 %e0, 0
-  ret i1 %cmp
-}
-
-define i1 @rotl_lane0_nonzero() {
-; CHECK-LABEL: rotl_lane0_nonzero:
-; CHECK: mov{{.*}}1
-; CHECK: ret
-  %x  = insertelement <4 x i8> poison, i8 2, i64 0
-  %k  = insertelement <4 x i8> poison, i8 1, i64 0
-  %w  = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
-  %e0 = extractelement <4 x i8> %w, i64 0
-  %cmp = icmp ne i8 %e0, 0
-  ret i1 %cmp
-}
-
-define i1 @rotr_lane0_nonzero() {
-; CHECK-LABEL: rotr_lane0_nonzero:
-; CHECK: mov{{.*}}1
-; CHECK: ret
-  %x  = insertelement <4 x i8> poison, i8 2, i64 0
-  %k  = insertelement <4 x i8> poison, i8 1, i64 0
-  %w  = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %x, <4 x i8> %x, <4 x i8> %k)
-  %e0 = extractelement <4 x i8> %w, i64 0
-  %cmp = icmp ne i8 %e0, 0
-  ret i1 %cmp
-}
-
-define i1 @abs_lane0_nonzero() {
-; CHECK-LABEL: abs_lane0_nonzero:
-; CHECK: mov{{.*}}1
-; CHECK: ret
-  %v0 = insertelement <4 x i8> poison, i8 -2, i64 0
-  %w  = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %v0, i1 false)
-  %e0 = extractelement <4 x i8> %w, i64 0
-  %cmp = icmp ne i8 %e0, 0
-  ret i1 %cmp
-}

>From 4641bc741c8022d4fa082e7ea223b0c806b26c9c Mon Sep 17 00:00:00 2001
From: Ayush3941 <ayushkgaur1 at gmail.com>
Date: Fri, 6 Mar 2026 11:14:20 -0500
Subject: [PATCH 5/5] [DAG] isKnownNeverZero: Add DemandedElts handling for
 ROTL/ROTR/BITREVERSE/BSWAP/CTPOP/ABS v3

---
 llvm/test/CodeGen/X86/known-never-zero.ll | 132 ++++++++++++----------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 2cd91a3cc6fc6..2c39760030228 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -5,27 +5,6 @@
 ;; Use cttz to test if we properly prove never-zero. There is a very
 ;; simple transform from cttz -> cttz_zero_undef if its operand is
 ;; known never zero.
-declare i32 @llvm.cttz.i32(i32, i1)
-declare i32 @llvm.uadd.sat.i32(i32, i32)
-declare i32 @llvm.umax.i32(i32, i32)
-declare i32 @llvm.umin.i32(i32, i32)
-declare i32 @llvm.smin.i32(i32, i32)
-declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
-declare i32 @llvm.smax.i32(i32, i32)
-declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
-declare i32 @llvm.bswap.i32(i32)
-declare i32 @llvm.bitreverse.i32(i32)
-declare i32 @llvm.ctpop.i32(i32)
-declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
-declare i32 @llvm.abs.i32(i32, i1)
-declare i32 @llvm.fshl.i32(i32, i32, i32)
-declare i32 @llvm.fshr.i32(i32, i32, i32)
-declare <4 x i8>  @llvm.ctpop.v4i8(<4 x i8>)
-declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
-declare <4 x i8>  @llvm.bitreverse.v4i8(<4 x i8>)
-declare <4 x i8>  @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
-declare <4 x i8>  @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>)
-declare <4 x i8>  @llvm.abs.v4i8(<4 x i8>, i1)
 
 define i32 @or_known_nonzero(i32 %x) {
 ; X86-LABEL: or_known_nonzero:
@@ -52,15 +31,18 @@ define i32 @or_known_nonzero_vec(<4 x i32> %x, ptr %p) {
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_known_nonzero_vec:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vmovd %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %z = or <4 x i32> %x, <i32 1, i32 0, i32 0, i32 0>
   store <4 x i32> %z, ptr %p
@@ -452,15 +434,18 @@ define i32 @uaddsat_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 ; X86-NEXT:    paddusb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movzbl (%eax), %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uaddsat_known_nonzero_vec:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vpextrb $0, %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vpextrb $0, %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %z = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
   store <16 x i8> %z, ptr %p
@@ -854,14 +839,17 @@ define i32 @smin_known_never_zero_vec_element(<4 x i32> %x) {
 ; X86-NEXT:    por %xmm0, %xmm2
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smin_known_never_zero_vec_element:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpextrd $1, %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vpextrd $1, %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %z = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %x, <4 x i32> <i32 54, i32 -23, i32 12, i32 1>)
   %el = extractelement <4 x i32> %z, i32 1
@@ -991,14 +979,17 @@ define i32 @smax_known_never_zero_vec_element(<4 x i32> %x) {
 ; X86-NEXT:    pandn %xmm1, %xmm2
 ; X86-NEXT:    por %xmm0, %xmm2
 ; X86-NEXT:    movd %xmm2, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smax_known_never_zero_vec_element:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vmovd %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> <i32 54, i32 -23, i32 -12, i32 -1>)
   %el = extractelement <4 x i32> %z, i32 0
@@ -1242,7 +1233,9 @@ define i32 @sra_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X86-NEXT:    psrad %xmm1, %xmm0
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sra_known_nonzero_sign_bit_set_vec:
@@ -1251,8 +1244,9 @@ define i32 @sra_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [2147606891,65535,1,0]
 ; X64-NEXT:    vpsrad %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vmovd %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %xx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %z = ashr <4 x i32> <i32 2147606891, i32 65535, i32 1, i32 0>, %xx
@@ -1297,7 +1291,9 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sra_known_nonzero_exact_vec:
@@ -1306,8 +1302,9 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X64-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; X64-NEXT:    vpsrad %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vpextrd $1, %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vpextrd $1, %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %y = or <4 x i32> %yy, <i32 0, i32 256, i32 0, i32 0>
@@ -1375,7 +1372,9 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: srl_known_nonzero_sign_bit_set_vec:
@@ -1384,8 +1383,9 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,65535,2147606891,0]
 ; X64-NEXT:    vpsrld %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vpextrd $2, %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vpextrd $2, %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %z = lshr <4 x i32> <i32 0, i32 65535, i32 2147606891, i32 0>, %x.splat
@@ -1430,7 +1430,9 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: srl_known_nonzero_exact_vec:
@@ -1439,8 +1441,9 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X64-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; X64-NEXT:    vpsrld %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vpextrd $3, %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vpextrd $3, %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %x.splat = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
   %y = or <4 x i32> %yy, <i32 0, i32 0, i32 0, i32 256>
@@ -1787,15 +1790,18 @@ define i32 @add_nuw_known_nonzero_vec(<4 x i32> %xx, ptr %p) {
 ; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: add_nuw_known_nonzero_vec:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vmovd %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %z = add nuw <4 x i32> %xx, <i32 1, i32 0, i32 0, i32 0>
   store <4 x i32> %z, ptr %p
@@ -1911,7 +1917,9 @@ define i32 @sub_known_nonzero_ne_vec(<4 x i32> %xx, ptr %p) {
 ; X86-NEXT:    psubd %xmm0, %xmm1
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    bsfl %eax, %ecx
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub_known_nonzero_ne_vec:
@@ -1920,8 +1928,9 @@ define i32 @sub_known_nonzero_ne_vec(<4 x i32> %xx, ptr %p) {
 ; X64-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [2,0]
 ; X64-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    vmovd %xmm0, %ecx
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rep bsfl %ecx, %eax
 ; X64-NEXT:    retq
   %u = or <4 x i32> %xx, <i32 1, i32 0, i32 0, i32 0>
   %z = sub <4 x i32> <i32 2, i32 0, i32 0, i32 0>, %u
@@ -2293,8 +2302,9 @@ define i32 @test_zext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X64-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X64-NEXT:    vmovaps %xmm2, 16(%rdi)
 ; X64-NEXT:    vmovdqa %xmm1, (%rdi)
-; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    vmovd %xmm0, %ecx
+; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    rep bsfq %rcx, %rax
 ; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %cmp = icmp sgt <4 x i32> zeroinitializer, %a0
@@ -2327,11 +2337,12 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X86-NEXT:    movdqa %xmm0, 16(%eax)
 ; X86-NEXT:    movdqa %xmm2, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
-; X86-NEXT:    rep bsfl %ecx, %edx
-; X86-NEXT:    rep bsfl %eax, %eax
-; X86-NEXT:    addl $32, %eax
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    cmovnel %eax, %edx
+; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_sext_demanded_elts:
@@ -2343,8 +2354,9 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X64-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rdi)
 ; X64-NEXT:    vmovdqa %xmm1, 16(%rdi)
-; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    vmovq %xmm0, %rcx
+; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    rep bsfq %rcx, %rax
 ; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %cmp = icmp sgt <4 x i32> zeroinitializer, %a0



More information about the llvm-commits mailing list