[llvm] [InstCombine] Fold icmp (vreduce_(or|and) %x), (0|-1) (PR #182684)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 21 07:55:23 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Ramkumar Ramachandra (artagnon)
<details>
<summary>Changes</summary>
When an equality compare tests the result of a vector_reduce_or against zero (or a vector_reduce_and against all-ones), we can bitcast the whole vector to a scalar integer and compare that value directly against Zero or AllOnes.
It is profitable on all major targets: https://godbolt.org/z/o7ecKjbsK
---
Full diff: https://github.com/llvm/llvm-project/pull/182684.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp (+41)
- (added) llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll (+101)
``````````diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ec6ac25bb8b9c..546b3e11a075f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -7523,6 +7523,44 @@ static Instruction *foldICmpInvariantGroup(ICmpInst &I) {
return nullptr;
}
+// Fold an equality icmp of a bitwise vector reduction against the value the
+// reduction can only produce when every lane agrees:
+//
+//   icmp eq/ne (vector_reduce_or  X), 0  -> icmp eq/ne (bitcast X to iN), 0
+//   icmp eq/ne (vector_reduce_and X), -1 -> icmp eq/ne (bitcast X to iN), -1
+//
+// This is sound because the OR-reduction is zero iff every bit of X is zero,
+// and the AND-reduction is all-ones iff every bit of X is one.  Returns the
+// replacement compare, or nullptr if the pattern does not match or the
+// bitcast target type would not be a legal integer for the target.
+static Instruction *foldICmpOfVectorReduce(ICmpInst &I, const DataLayout &DL,
+ IRBuilderBase &Builder) {
+ if (!ICmpInst::isEquality(I.getPredicate()))
+ return nullptr;
+
+ // The caller puts constants after non-constants.
+ Value *Op = I.getOperand(0);
+ Value *Const = I.getOperand(1);
+
+ // For Cond an equality condition, fold
+ //
+ // icmp (eq|ne) (vreduce_(or|and) Op), (Zero|AllOnes) ->
+ // icmp (eq|ne) Op, (Zero|AllOnes)
+ //
+ // with a bitcast.
+ Value *Vec;
+ if ((match(Const, m_ZeroInt()) &&
+ match(Op, m_Intrinsic<Intrinsic::vector_reduce_or>(m_Value(Vec)))) ||
+ (match(Const, m_AllOnes()) &&
+ match(Op, m_Intrinsic<Intrinsic::vector_reduce_and>(m_Value(Vec))))) {
+ // Scalable vectors are rejected here: their total bit width is unknown at
+ // compile time, so there is no fixed-width integer to bitcast to.
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy)
+ return nullptr;
+ Type *VecEltTy = VecTy->getElementType();
+ unsigned ScalarBW =
+ DL.getTypeSizeInBits(VecEltTy) * VecTy->getNumElements();
+ // Only fold when the whole-vector-wide integer compare is legal for the
+ // target, so the scalar compare stays a single cheap operation.
+ if (!DL.fitsInLegalInteger(ScalarBW))
+ return nullptr;
+ Type *ScalarTy = IntegerType::get(I.getContext(), ScalarBW);
+ // Re-matching m_ZeroInt() distinguishes the reduce_or/zero arm from the
+ // reduce_and/all-ones arm; the new constant mirrors the original one at
+ // the widened bit width.
+ Value *NewConst = match(Const, m_ZeroInt())
+ ? ConstantInt::get(ScalarTy, 0)
+ : ConstantInt::getAllOnesValue(ScalarTy);
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(),
+ Builder.CreateBitCast(Vec, ScalarTy), NewConst);
+ }
+ return nullptr;
+}
+
/// This function folds patterns produced by lowering of reduce idioms, such as
/// llvm.vector.reduce.and which are lowered into instruction chains. This code
/// attempts to generate fewer number of scalar comparisons instead of vector
@@ -7998,6 +8036,9 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldICmpOfUAddOv(I))
return Res;
+ if (Instruction *Res = foldICmpOfVectorReduce(I, DL, Builder))
+ return Res;
+
// The 'cmpxchg' instruction returns an aggregate containing the old value and
// an i1 which indicates whether or not we successfully did the swap.
//
diff --git a/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll b/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
new file mode 100644
index 0000000000000..09da979b7e873
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-vector-bitwise-reductions.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p instcombine -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; reduce.and == -1 holds iff every bit of %v is set, so the fold bitcasts the
+; whole vector to i32 and compares against -1.
+define i1 @vreduce_and_eq(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_and_eq(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+ %cmp = icmp eq i8 %red, -1
+ ret i1 %cmp
+}
+
+; Same fold with the inverted (ne) predicate.
+define i1 @vreduce_and_ne(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_and_ne(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+ %cmp = icmp ne i8 %red, -1
+ ret i1 %cmp
+}
+
+; reduce.or == 0 holds iff every bit of %v is clear, so compare the bitcast
+; against zero.
+define i1 @vreduce_or_eq(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_or_eq(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
+ %cmp = icmp eq i8 %red, 0
+ ret i1 %cmp
+}
+
+; Same fold with the inverted (ne) predicate.
+define i1 @vreduce_or_ne(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_or_ne(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[V]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
+ %cmp = icmp ne i8 %red, 0
+ ret i1 %cmp
+}
+
+; The vector load plus the new bitcast is further folded into a scalar i32
+; load, leaving a single scalar compare.
+define i1 @loaded_value(ptr %p) {
+; CHECK-LABEL: define i1 @loaded_value(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %v = load <4 x i8>, ptr %p
+ %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+ %cmp = icmp eq i8 %red, -1
+ ret i1 %cmp
+}
+
+; Negative test: <4 x i32> is 128 bits, which does not fit a legal integer
+; for the n8:16:32:64 datalayout above, so the fold must not fire.
+define i1 @vector_elt_type_legality(<4 x i32> %v) {
+; CHECK-LABEL: define i1 @vector_elt_type_legality(
+; CHECK-SAME: <4 x i32> [[V:%.*]]) {
+; CHECK-NEXT: [[RED:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[V]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[RED]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
+ %cmp = icmp ne i32 %red, 0
+ ret i1 %cmp
+}
+
+; Negative test: sgt is not an equality predicate, so the fold must not fire.
+define i1 @vreduce_and_sgt(<4 x i8> %v) {
+; CHECK-LABEL: define i1 @vreduce_and_sgt(
+; CHECK-SAME: <4 x i8> [[V:%.*]]) {
+; CHECK-NEXT: [[RED:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[V]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[RED]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
+ %cmp = icmp sgt i8 %red, -1
+ ret i1 %cmp
+}
+
+; Negative test: a scalable vector has no compile-time-known bit width to
+; bitcast to, so the fold must not fire.
+define i1 @scalable(<vscale x 4 x i8> %v) {
+; CHECK-LABEL: define i1 @scalable(
+; CHECK-SAME: <vscale x 4 x i8> [[V:%.*]]) {
+; CHECK-NEXT: [[RED:%.*]] = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> [[V]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[RED]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %red = call i8 @llvm.vector.reduce.and.nxv4i8(<vscale x 4 x i8> %v)
+ %cmp = icmp eq i8 %red, -1
+ ret i1 %cmp
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/182684
More information about the llvm-commits
mailing list