[llvm] [InstCombine] Fold icmp (vreduce_(or|and) %x), (0|-1) (PR #182684)

Ramkumar Ramachandra via llvm-commits llvm-commits at lists.llvm.org
Sat Feb 21 07:54:51 PST 2026


https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/182684

When the result of a vector.reduce.or is compared for equality against Zero, or the result of a vector.reduce.and against AllOnes, we can instead bitcast the vector operand to an integer of the same total bit width and compare that value directly against Zero or AllOnes.

It is profitable on all major targets: https://godbolt.org/z/o7ecKjbsK
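For example (value names are illustrative; this mirrors the vreduce_or_eq test below), the fold rewrites

  %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
  %cmp = icmp eq i8 %red, 0

into

  %bc = bitcast <4 x i8> %v to i32
  %cmp = icmp eq i32 %bc, 0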

From e75d258cee8f08caffb8da9979c9945abac9d6bf Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Sat, 21 Feb 2026 15:17:21 +0000
Subject: [PATCH 1/2] [InstCombine] Pre-commit icmp-vector-bitwise-reductions
 tests

---
 .../icmp-vector-bitwise-reductions.ll         | 102 ++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll

diff --git a/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll b/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
new file mode 100644
index 0000000000000..744f222937e5f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p instcombine -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i1 @vreduce_and_eq(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_and_eq(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+  %cmp = icmp eq i8 %red, -1
+  ret i1 %cmp
+}
+
+define i1 @vreduce_and_ne(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_and_ne(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[RED]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+  %cmp = icmp ne i8 %red, -1
+  ret i1 %cmp
+}
+
+define i1 @vreduce_or_eq(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_or_eq(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
+  %cmp = icmp eq i8 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @vreduce_or_ne(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_or_ne(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[RED]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
+  %cmp = icmp ne i8 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @loaded_value(ptr %p) {
+; CHECK-LABEL: define i1 @loaded_value(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V:%.*]] = load <4 x i8>, ptr [[P]], align 4
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %v = load <4 x i8>, ptr %p
+  %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+  %cmp = icmp eq i8 %red, -1
+  ret i1 %cmp
+}
+
+define i1 @vector_elt_type_legality(<4 x i32> %v) {
+; CHECK-LABEL: define i1 @vector_elt_type_legality(
+; CHECK-SAME: <4 x i32> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RED]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
+  %cmp = icmp ne i32 %red, 0
+  ret i1 %cmp
+}
+
+define i1 @vreduce_and_sgt(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_and_sgt(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[RED]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+  %cmp = icmp sgt i8 %red, -1
+  ret i1 %cmp
+}
+
+define i1 @scalable(<vscale x 4 x i8> %v) {
+; CHECK-LABEL: define i1 @scalable(
+; CHECK-SAME: <vscale x 4 x i8> [[V:%.*]]) {
+; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> [[V]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %red = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> %v)
+  %cmp = icmp eq i8 %red, -1
+  ret i1 %cmp
+}

From cd7b959c757b85d49e4886a7bb7b35a3c5b457d3 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <artagnon at tenstorrent.com>
Date: Sat, 21 Feb 2026 15:18:10 +0000
Subject: [PATCH 2/2] [InstCombine] Fold icmp (vreduce_(or|and) %x), (0|-1)

When the result of a vector.reduce.or is compared for equality against
Zero, or the result of a vector.reduce.and against AllOnes, we can
instead bitcast the vector operand to an integer of the same total bit
width and compare that value directly against Zero or AllOnes.

It is profitable on all major targets: https://godbolt.org/z/o7ecKjbsK
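
The fold is gated on the legality of the new integer type: for example,
under the test datalayout (n8:16:32:64), the compare below is left
unfolded, since it would require an illegal i128 compare (see the
vector_elt_type_legality test):

  %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
  %cmp = icmp ne i32 %red, 0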
---
 .../InstCombine/InstCombineCompares.cpp       | 41 +++++++++++++++++++
 .../icmp-vector-bitwise-reductions.ll         | 21 +++++-----
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ec6ac25bb8b9c..546b3e11a075f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -7523,6 +7523,44 @@ static Instruction *foldICmpInvariantGroup(ICmpInst &I) {
   return nullptr;
 }
 
+static Instruction *foldICmpOfVectorReduce(ICmpInst &I, const DataLayout &DL,
+                                           IRBuilderBase &Builder) {
+  if (!ICmpInst::isEquality(I.getPredicate()))
+    return nullptr;
+
+  // The caller puts constants after non-constants.
+  Value *Op = I.getOperand(0);
+  Value *Const = I.getOperand(1);
+
+  // For an equality predicate, fold
+  //
+  //   icmp (eq|ne) (vreduce_(or|and) Vec), (Zero|AllOnes) ->
+  //   icmp (eq|ne) (bitcast Vec to iN), (Zero|AllOnes)
+  //
+  // where iN is a legal integer of the vector's total bit width.
+  Value *Vec;
+  if ((match(Const, m_ZeroInt()) &&
+       match(Op, m_Intrinsic<Intrinsic::vector_reduce_or>(m_Value(Vec)))) ||
+      (match(Const, m_AllOnes()) &&
+       match(Op, m_Intrinsic<Intrinsic::vector_reduce_and>(m_Value(Vec))))) {
+    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+    if (!VecTy)
+      return nullptr;
+    Type *VecEltTy = VecTy->getElementType();
+    unsigned ScalarBW =
+        DL.getTypeSizeInBits(VecEltTy) * VecTy->getNumElements();
+    if (!DL.fitsInLegalInteger(ScalarBW))
+      return nullptr;
+    Type *ScalarTy = IntegerType::get(I.getContext(), ScalarBW);
+    Value *NewConst = match(Const, m_ZeroInt())
+                          ? ConstantInt::get(ScalarTy, 0)
+                          : ConstantInt::getAllOnesValue(ScalarTy);
+    return CmpInst::Create(Instruction::ICmp, I.getPredicate(),
+                           Builder.CreateBitCast(Vec, ScalarTy), NewConst);
+  }
+  return nullptr;
+}
+
 /// This function folds patterns produced by lowering of reduce idioms, such as
 /// llvm.vector.reduce.and which are lowered into instruction chains. This code
 /// attempts to generate fewer number of scalar comparisons instead of vector
@@ -7998,6 +8036,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
   if (Instruction *Res = foldICmpOfUAddOv(I))
     return Res;
 
+  if (Instruction *Res = foldICmpOfVectorReduce(I, DL, Builder))
+    return Res;
+
   // The 'cmpxchg' instruction returns an aggregate containing the old value and
   // an i1 which indicates whether or not we successfully did the swap.
   //
diff --git a/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll b/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
index 744f222937e5f..09da979b7e873 100644
--- a/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
@@ -6,8 +6,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define i1 @vreduce_and_eq(<4 x i8> %v) {
 ; CHECK-LABEL: define i1 @vreduce_and_eq(
 ; CHECK-SAME: <4 x i8> [[V:%.*]]) {
-; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], -1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
@@ -18,8 +18,8 @@ define i1 @vreduce_and_eq(<4 x i8> %v) {
 define i1 @vreduce_and_ne(<4 x i8> %v) {
 ; CHECK-LABEL: define i1 @vreduce_and_ne(
 ; CHECK-SAME: <4 x i8> [[V:%.*]]) {
-; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[RED]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP1]], -1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
@@ -30,8 +30,8 @@ define i1 @vreduce_and_ne(<4 x i8> %v) {
 define i1 @vreduce_or_eq(<4 x i8> %v) {
 ; CHECK-LABEL: define i1 @vreduce_or_eq(
 ; CHECK-SAME: <4 x i8> [[V:%.*]]) {
-; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[V]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
@@ -42,8 +42,8 @@ define i1 @vreduce_or_eq(<4 x i8> %v) {
 define i1 @vreduce_or_ne(<4 x i8> %v) {
 ; CHECK-LABEL: define i1 @vreduce_or_ne(
 ; CHECK-SAME: <4 x i8> [[V:%.*]]) {
-; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[V]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[RED]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
@@ -54,9 +54,8 @@ define i1 @vreduce_or_ne(<4 x i8> %v) {
 define i1 @loaded_value(ptr %p) {
 ; CHECK-LABEL: define i1 @loaded_value(
 ; CHECK-SAME: ptr [[P:%.*]]) {
-; CHECK-NEXT:    [[V:%.*]] = load <4 x i8>, ptr [[P]], align 4
-; CHECK-NEXT:    [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[RED]], -1
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[V1]], -1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %v = load <4 x i8>, ptr %p


