[llvm] goldsteinn/known vec reduce add mul (PR #88410)

Thu Apr 11 09:29:46 PDT 2024

https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/88410

- **[ValueTracking] Add tests for `computeKnownBits` of `llvm.vector.reduce.{add,mul}`; NFC**
- **[ValueTracking] Implement `computeKnownBits` for `llvm.vector.reduce.{add,mul}`**
- **[ValueTracking] Add tests for `isKnownNonZero` of `llvm.vector.reduce.{add,mul}`; NFC**
- **[ValueTracking] Implement `isKnownNonZero` for `llvm.vector.reduce.{add,mul}`**


>From 7a242961415580cc41a95614f79aa88fa7d14b7a Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 10 Apr 2024 15:30:43 -0500
Subject: [PATCH 1/4] [ValueTracking] Add tests for `computeKnownBits` of
 `llvm.vector.reduce.{add,mul}`; NFC

---
 .../test/Transforms/InstCombine/known-bits.ll | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index d210b19bb7faf2..fde3e22143333b 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -999,5 +999,189 @@ define i1 @extract_value_smul_fail(i8 %xx, i8 %yy) {
   ret i1 %r
 }
 
+define i8 @known_reduce_add(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add(
+; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <2 x i8> %xx, <i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = and i8 %v, 8
+  ret i8 %r
+}
+
+define i8 @known_reduce_add_fail(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add_fail(
+; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 4
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <2 x i8> %xx, <i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = and i8 %v, 4
+  ret i8 %r
+}
+
+define i8 @known_reduce_add_fail2(<4 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add_fail2(
+; CHECK-NEXT:    [[X:%.*]] = and <4 x i8> [[XX:%.*]], <i8 3, i8 3, i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <4 x i8> %xx, <i8 3, i8 3, i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %x)
+  %r = and i8 %v, 8
+  ret i8 %r
+}
+
+define i8 @known_reduce_add2(<4 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add2(
+; CHECK-NEXT:    [[X:%.*]] = and <4 x i8> [[XX:%.*]], <i8 3, i8 3, i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <4 x i8> %xx, <i8 3, i8 3, i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %x)
+  %r = and i8 %v, 32
+  ret i8 %r
+}
+
+define i8 @known_reduce_add3(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add3(
+; CHECK-NEXT:    [[X:%.*]] = or <2 x i8> [[XX:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = or <2 x i8> %xx, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = and i8 %v, 1
+  ret i8 %r
+}
+
+define i8 @known_reduce_add33(<3 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add33(
+; CHECK-NEXT:    [[X:%.*]] = or <3 x i8> [[XX:%.*]], <i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = or <3 x i8> %xx, <i8 1, i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %x)
+  %r = and i8 %v, 1
+  ret i8 %r
+}
+
+define i8 @known_reduce_add34(<4 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add34(
+; CHECK-NEXT:    [[X:%.*]] = or <4 x i8> [[XX:%.*]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = or <4 x i8> %xx, <i8 1, i8 1, i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %x)
+  %r = and i8 %v, 1
+  ret i8 %r
+}
+
+define i8 @known_reduce_add4(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add4(
+; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 2, i8 2>
+; CHECK-NEXT:    [[X:%.*]] = or disjoint <2 x i8> [[X0]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 2
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x0 = and <2 x i8> %xx, <i8 3, i8 3>
+  %x = or <2 x i8> %x0, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = and i8 %v, 2
+  ret i8 %r
+}
+
+define i8 @known_reduce_add4_fail(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_add4_fail(
+; CHECK-NEXT:    [[X:%.*]] = or <2 x i8> [[XX:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 2
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = or <2 x i8> %xx, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = and i8 %v, 2
+  ret i8 %r
+}
+
+define i8 @known_reduce_mul(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_mul(
+; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 16
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <2 x i8> %xx, <i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
+  %r = and i8 %v, 16
+  ret i8 %r
+}
+
+define i8 @known_reduce_mul_fail(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_mul_fail(
+; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <2 x i8> %xx, <i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
+  %r = and i8 %v, 8
+  ret i8 %r
+}
+
+define i8 @known_reduce_mul_fail2(<3 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_mul_fail2(
+; CHECK-NEXT:    [[X:%.*]] = and <3 x i8> [[XX:%.*]], <i8 3, i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 32
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <3 x i8> %xx, <i8 3, i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> %x)
+  %r = and i8 %v, 32
+  ret i8 %r
+}
+
+define i8 @known_reduce_mul2(<3 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_mul2(
+; CHECK-NEXT:    [[X:%.*]] = and <3 x i8> [[XX:%.*]], <i8 3, i8 3, i8 3>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 64
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = and <3 x i8> %xx, <i8 3, i8 3, i8 3>
+  %v = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> %x)
+  %r = and i8 %v, 64
+  ret i8 %r
+}
+
+define i8 @known_reduce_mul3(<2 x i8> %xx) {
+; CHECK-LABEL: @known_reduce_mul3(
+; CHECK-NEXT:    [[X:%.*]] = or <2 x i8> [[XX:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %x = or <2 x i8> %xx, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
+  %r = and i8 %v, 1
+  ret i8 %r
+}
+
 declare void @use(i1)
 declare void @sink(i8)

>From 03936f5984493c4a845a3f8122ffdbe14330cad3 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 10 Apr 2024 14:24:18 -0500
Subject: [PATCH 2/4] [ValueTracking] Implement `computeKnownBits` for
 `llvm.vector.reduce.{add,mul}`

---
 llvm/lib/Analysis/ValueTracking.cpp           | 20 ++++++++++
 .../test/Transforms/InstCombine/known-bits.ll | 40 ++++---------------
 2 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 3a10de72a27562..8bda957cb0a6a5 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1629,6 +1629,26 @@ static void computeKnownBitsFromOperator(const Operator *I,
       case Intrinsic::vector_reduce_smin:
         computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
         break;
+      case Intrinsic::vector_reduce_mul:
+      case Intrinsic::vector_reduce_add:
+        // We compute the common bits for all elements then apply the reduce op
+        // NumEle times. This is mostly useful for known high zeros.
+        if (auto *VecTy =
+                dyn_cast<FixedVectorType>(I->getOperand(0)->getType())) {
+          computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+          KnownBits SingleKnown = Known;
+          for (unsigned i = 1, e = VecTy->getNumElements(); i < e; ++i) {
+            if (Known.isUnknown())
+              break;
+            if (II->getIntrinsicID() == Intrinsic::vector_reduce_add)
+              Known = KnownBits::computeForAddSub(
+                  /*Add=*/true, /*NSW=*/false, /*NUW=*/false, SingleKnown,
+                  Known);
+            else
+              Known = KnownBits::mul(SingleKnown, Known);
+          }
+        }
+        break;
       case Intrinsic::umin:
         computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
         computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index fde3e22143333b..4eca9ea87c6c4e 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -1001,10 +1001,7 @@ define i1 @extract_value_smul_fail(i8 %xx, i8 %yy) {
 
 define i8 @known_reduce_add(<2 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_add(
-; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 8
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
   %x = and <2 x i8> %xx, <i8 3, i8 3>
   %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
@@ -1040,10 +1037,7 @@ define i8 @known_reduce_add_fail2(<4 x i8> %xx) {
 
 define i8 @known_reduce_add2(<4 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_add2(
-; CHECK-NEXT:    [[X:%.*]] = and <4 x i8> [[XX:%.*]], <i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 32
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
   %x = and <4 x i8> %xx, <i8 3, i8 3, i8 3, i8 3>
   %v = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %x)
@@ -1053,10 +1047,7 @@ define i8 @known_reduce_add2(<4 x i8> %xx) {
 
 define i8 @known_reduce_add3(<2 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_add3(
-; CHECK-NEXT:    [[X:%.*]] = or <2 x i8> [[XX:%.*]], <i8 1, i8 1>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
   %x = or <2 x i8> %xx, <i8 1, i8 1>
   %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
@@ -1066,10 +1057,7 @@ define i8 @known_reduce_add3(<2 x i8> %xx) {
 
 define i8 @known_reduce_add33(<3 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_add33(
-; CHECK-NEXT:    [[X:%.*]] = or <3 x i8> [[XX:%.*]], <i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 1
 ;
   %x = or <3 x i8> %xx, <i8 1, i8 1, i8 1>
   %v = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %x)
@@ -1079,10 +1067,7 @@ define i8 @known_reduce_add33(<3 x i8> %xx) {
 
 define i8 @known_reduce_add34(<4 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_add34(
-; CHECK-NEXT:    [[X:%.*]] = or <4 x i8> [[XX:%.*]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
   %x = or <4 x i8> %xx, <i8 1, i8 1, i8 1, i8 1>
   %v = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %x)
@@ -1120,10 +1105,7 @@ define i8 @known_reduce_add4_fail(<2 x i8> %xx) {
 
 define i8 @known_reduce_mul(<2 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_mul(
-; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 16
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
   %x = and <2 x i8> %xx, <i8 3, i8 3>
   %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
@@ -1159,10 +1141,7 @@ define i8 @known_reduce_mul_fail2(<3 x i8> %xx) {
 
 define i8 @known_reduce_mul2(<3 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_mul2(
-; CHECK-NEXT:    [[X:%.*]] = and <3 x i8> [[XX:%.*]], <i8 3, i8 3, i8 3>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 64
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
   %x = and <3 x i8> %xx, <i8 3, i8 3, i8 3>
   %v = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> %x)
@@ -1172,10 +1151,7 @@ define i8 @known_reduce_mul2(<3 x i8> %xx) {
 
 define i8 @known_reduce_mul3(<2 x i8> %xx) {
 ; CHECK-LABEL: @known_reduce_mul3(
-; CHECK-NEXT:    [[X:%.*]] = or <2 x i8> [[XX:%.*]], <i8 1, i8 1>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[V]], 1
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 1
 ;
   %x = or <2 x i8> %xx, <i8 1, i8 1>
   %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)

>From 57789b825f2de6d4d8ed9d3d8012f15f4ff41778 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 10 Apr 2024 15:29:02 -0500
Subject: [PATCH 3/4] [ValueTracking] Add tests for `isKnownNonZero` of
 `llvm.vector.reduce.{add,mul}`; NFC

---
 .../Transforms/InstSimplify/known-non-zero.ll | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/llvm/test/Transforms/InstSimplify/known-non-zero.ll b/llvm/test/Transforms/InstSimplify/known-non-zero.ll
index d9b8f5eed32390..fd2febe6ea26c2 100644
--- a/llvm/test/Transforms/InstSimplify/known-non-zero.ll
+++ b/llvm/test/Transforms/InstSimplify/known-non-zero.ll
@@ -377,3 +377,120 @@ define <2 x i1> @insert_nonzero_any_idx_fail(<2 x i8> %xx, i8 %yy, i32 %idx) {
   %r = icmp eq <2 x i8> %ins, zeroinitializer
   ret <2 x i1> %r
 }
+
+define i1 @nonzero_reduce_add(<2 x i8> %xx) {
+; CHECK-LABEL: @nonzero_reduce_add(
+; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[X:%.*]] = add <2 x i8> [[X0]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x0 = and <2 x i8> %xx, <i8 3, i8 3>
+  %x = add <2 x i8> %x0, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = icmp eq i8 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_add_fail(<2 x i8> %xx) {
+; CHECK-LABEL: @nonzero_reduce_add_fail(
+; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[X:%.*]] = add <2 x i8> [[X0]], <i8 1, i8 0>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x0 = and <2 x i8> %xx, <i8 3, i8 3>
+  %x = add <2 x i8> %x0, <i8 1, i8 0>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = icmp eq i8 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_add_fail2(<2 x i8> %xx) {
+; CHECK-LABEL: @nonzero_reduce_add_fail2(
+; CHECK-NEXT:    [[X:%.*]] = add nuw <2 x i8> [[XX:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x = add nuw <2 x i8> %xx, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.add(<2 x i8> %x)
+  %r = icmp eq i8 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_add_fail3(<18 x i4> %xx) {
+; CHECK-LABEL: @nonzero_reduce_add_fail3(
+; CHECK-NEXT:    [[X0:%.*]] = and <18 x i4> [[XX:%.*]], <i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3>
+; CHECK-NEXT:    [[X:%.*]] = add <18 x i4> [[X0]], <i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1>
+; CHECK-NEXT:    [[V:%.*]] = call i4 @llvm.vector.reduce.add.v18i4(<18 x i4> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i4 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x0 = and <18 x i4> %xx, <i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3, i4 3>
+  %x = add <18 x i4> %x0, <i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1, i4 1>
+  %v = call i4 @llvm.vector.reduce.add.v18i4(<18 x i4> %x)
+  %r = icmp eq i4 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_mul(<2 x i8> %xx) {
+; CHECK-LABEL: @nonzero_reduce_mul(
+; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
+; CHECK-NEXT:    [[X:%.*]] = add <2 x i8> [[X0]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x0 = and <2 x i8> %xx, <i8 3, i8 3>
+  %x = add <2 x i8> %x0, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
+  %r = icmp eq i8 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_mul2(<3 x i16> %xx) {
+; CHECK-LABEL: @nonzero_reduce_mul2(
+; CHECK-NEXT:    [[X0:%.*]] = and <3 x i16> [[XX:%.*]], <i16 3, i16 3, i16 3>
+; CHECK-NEXT:    [[X:%.*]] = add <3 x i16> [[X0]], <i16 1, i16 1, i16 1>
+; CHECK-NEXT:    [[V:%.*]] = call i16 @llvm.vector.reduce.mul.v3i16(<3 x i16> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x0 = and <3 x i16> %xx, <i16 3, i16 3, i16 3>
+  %x = add <3 x i16> %x0, <i16 1, i16 1, i16 1>
+  %v = call i16 @llvm.vector.reduce.mul.v3i16(<3 x i16> %x)
+  %r = icmp eq i16 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_mul_fail(<2 x i8> %xx) {
+; CHECK-LABEL: @nonzero_reduce_mul_fail(
+; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 15, i8 15>
+; CHECK-NEXT:    [[X:%.*]] = add <2 x i8> [[X0]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x0 = and <2 x i8> %xx, <i8 15, i8 15>
+  %x = add <2 x i8> %x0, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
+  %r = icmp eq i8 %v, 0
+  ret i1 %r
+}
+
+define i1 @nonzero_reduce_mul_fail2(<2 x i8> %xx) {
+; CHECK-LABEL: @nonzero_reduce_mul_fail2(
+; CHECK-NEXT:    [[X:%.*]] = add nuw <2 x i8> [[XX:%.*]], <i8 1, i8 1>
+; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x = add nuw <2 x i8> %xx, <i8 1, i8 1>
+  %v = call i8 @llvm.vector.reduce.mul(<2 x i8> %x)
+  %r = icmp eq i8 %v, 0
+  ret i1 %r
+}
+

>From f03faacf8d2130ab58f7ec5163f93c99e10ccd62 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 10 Apr 2024 15:36:33 -0500
Subject: [PATCH 4/4] [ValueTracking] Implement `isKnownNonZero` for
 `llvm.vector.reduce.{add,mul}`

Proof for bespoke non-zero logic: https://alive2.llvm.org/ce/z/P6HRvw
---
 llvm/lib/Analysis/ValueTracking.cpp           | 35 +++++++++++++++++++
 .../Transforms/InstSimplify/known-non-zero.ll | 18 ++--------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 8bda957cb0a6a5..32ffe5adc41f51 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2924,6 +2924,41 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
       case Intrinsic::vector_reduce_smax:
       case Intrinsic::vector_reduce_smin:
         return isKnownNonZero(II->getArgOperand(0), Depth, Q);
+        // If we know the reduction doesn't overflow and all elements are
+        // non-zero, the reduction is non-zero.
+      case Intrinsic::vector_reduce_mul:
+      case Intrinsic::vector_reduce_add:
+        if (computeKnownBits(I, Depth + 1, Q).isNonZero())
+          return true;
+
+        if (auto *VecTy =
+                dyn_cast<FixedVectorType>(I->getOperand(0)->getType())) {
+          bool Overflow;
+          if (II->getIntrinsicID() == Intrinsic::vector_reduce_add) {
+            APInt NumEle(BitWidth, VecTy->getNumElements());
+            // If we can't store num ele in bitwidth, the result is either
+            // known-zero or we won't get anything useful.
+            if (NumEle.getZExtValue() != VecTy->getNumElements())
+              break;
+            APInt MaxVal =
+                computeKnownBits(II->getArgOperand(0), Depth, Q).getMaxValue();
+            MaxVal = MaxVal.umul_ov(NumEle, Overflow);
+          } else {
+            APInt MaxVal =
+                computeKnownBits(II->getArgOperand(0), Depth, Q).getMaxValue();
+            APInt SingleVal = MaxVal;
+            for (unsigned i = 1, e = VecTy->getNumElements(); i < e; ++i) {
+              MaxVal = MaxVal.umul_ov(SingleVal, Overflow);
+              if (Overflow)
+                break;
+            }
+          }
+
+          if (Overflow)
+            break;
+          return isKnownNonZero(II->getArgOperand(0), Depth, Q);
+        }
+        break;
       case Intrinsic::umax:
       case Intrinsic::uadd_sat:
         return isKnownNonZero(II->getArgOperand(1), DemandedElts, Depth, Q) ||
diff --git a/llvm/test/Transforms/InstSimplify/known-non-zero.ll b/llvm/test/Transforms/InstSimplify/known-non-zero.ll
index fd2febe6ea26c2..f620ecd8d853dc 100644
--- a/llvm/test/Transforms/InstSimplify/known-non-zero.ll
+++ b/llvm/test/Transforms/InstSimplify/known-non-zero.ll
@@ -380,11 +380,7 @@ define <2 x i1> @insert_nonzero_any_idx_fail(<2 x i8> %xx, i8 %yy, i32 %idx) {
 
 define i1 @nonzero_reduce_add(<2 x i8> %xx) {
 ; CHECK-LABEL: @nonzero_reduce_add(
-; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
-; CHECK-NEXT:    [[X:%.*]] = add <2 x i8> [[X0]], <i8 1, i8 1>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %x0 = and <2 x i8> %xx, <i8 3, i8 3>
   %x = add <2 x i8> %x0, <i8 1, i8 1>
@@ -438,11 +434,7 @@ define i1 @nonzero_reduce_add_fail3(<18 x i4> %xx) {
 
 define i1 @nonzero_reduce_mul(<2 x i8> %xx) {
 ; CHECK-LABEL: @nonzero_reduce_mul(
-; CHECK-NEXT:    [[X0:%.*]] = and <2 x i8> [[XX:%.*]], <i8 3, i8 3>
-; CHECK-NEXT:    [[X:%.*]] = add <2 x i8> [[X0]], <i8 1, i8 1>
-; CHECK-NEXT:    [[V:%.*]] = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[V]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %x0 = and <2 x i8> %xx, <i8 3, i8 3>
   %x = add <2 x i8> %x0, <i8 1, i8 1>
@@ -453,11 +445,7 @@ define i1 @nonzero_reduce_mul(<2 x i8> %xx) {
 
 define i1 @nonzero_reduce_mul2(<3 x i16> %xx) {
 ; CHECK-LABEL: @nonzero_reduce_mul2(
-; CHECK-NEXT:    [[X0:%.*]] = and <3 x i16> [[XX:%.*]], <i16 3, i16 3, i16 3>
-; CHECK-NEXT:    [[X:%.*]] = add <3 x i16> [[X0]], <i16 1, i16 1, i16 1>
-; CHECK-NEXT:    [[V:%.*]] = call i16 @llvm.vector.reduce.mul.v3i16(<3 x i16> [[X]])
-; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[V]], 0
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 false
 ;
   %x0 = and <3 x i16> %xx, <i16 3, i16 3, i16 3>
   %x = add <3 x i16> %x0, <i16 1, i16 1, i16 1>