[llvm] [VectorCombine] Fold binary op of reductions. (PR #121567)

Thu Feb 20 00:55:21 PST 2025

https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/121567

>From 9753c8d89b888e14a755811616d5e3f5996abdf0 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Thu, 2 Jan 2025 08:26:22 -0800
Subject: [PATCH 1/2] [InstCombine] Fold binary op of reductions.

Replace binary op of two reductions with one reduction of the binary op
applied to vectors. For example:

```
%v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
%v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
%res = add i32 %v0_red, %v1_red
```
gets transformed to:

```
%1 = add <16 x i32> %v0, %v1
%res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
```
---
 .../InstCombine/InstCombineAddSub.cpp         | 18 ++----
 .../InstCombine/InstCombineAndOrXor.cpp       |  9 +++
 .../InstCombine/InstCombineInternal.h         |  1 +
 .../InstCombine/InstCombineMulDivRem.cpp      |  3 +
 .../InstCombine/InstructionCombining.cpp      | 57 +++++++++++++++++++
 .../VectorCombine/fold-binop-of-reductions.ll | 40 ++++++-------
 6 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 658bbbc569766..29cf75a6e318d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1528,6 +1528,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
+  if (Instruction *X = foldBinopOfReductions(I))
+    return replaceInstUsesWith(I, X);
+
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
@@ -2387,19 +2390,8 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     }
   }
 
-  auto m_AddRdx = [](Value *&Vec) {
-    return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(Vec)));
-  };
-  Value *V0, *V1;
-  if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
-      V0->getType() == V1->getType()) {
-    // Difference of sums is sum of differences:
-    // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
-    Value *Sub = Builder.CreateSub(V0, V1);
-    Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add,
-                                         {Sub->getType()}, {Sub});
-    return replaceInstUsesWith(I, Rdx);
-  }
+  if (Instruction *X = foldBinopOfReductions(I))
+    return replaceInstUsesWith(I, X);
 
   if (Constant *C = dyn_cast<Constant>(Op0)) {
     Value *X;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4616ea6ab5487..4df70716692a3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2385,6 +2385,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
+  if (Instruction *X = foldBinopOfReductions(I))
+    return replaceInstUsesWith(I, X);
+
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
@@ -3565,6 +3568,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
+  if (Instruction *X = foldBinopOfReductions(I))
+    return replaceInstUsesWith(I, X);
+
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
@@ -4688,6 +4694,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
+  if (Instruction *X = foldBinopOfReductions(I))
+    return replaceInstUsesWith(I, X);
+
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 83e1da98deeda..40a03cc24817d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -594,6 +594,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
+  Instruction *foldBinopOfReductions(BinaryOperator &Inst);
   Instruction *foldVectorSelect(SelectInst &Sel);
   Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index c7023eb79b04e..f2dc65c2051ca 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -199,6 +199,9 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
+  if (Instruction *X = foldBinopOfReductions(I))
+    return replaceInstUsesWith(I, X);
+
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 5621511570b58..01bd918e493bd 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2318,6 +2318,63 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
   return nullptr;
 }
 
+static Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc) {
+  switch (Opc) {
+  default:
+    break;
+  case Instruction::Add:
+    return Intrinsic::vector_reduce_add;
+  case Instruction::Mul:
+    return Intrinsic::vector_reduce_mul;
+  case Instruction::And:
+    return Intrinsic::vector_reduce_and;
+  case Instruction::Or:
+    return Intrinsic::vector_reduce_or;
+  case Instruction::Xor:
+    return Intrinsic::vector_reduce_xor;
+  }
+  return Intrinsic::not_intrinsic;
+}
+
+Instruction *InstCombinerImpl::foldBinopOfReductions(BinaryOperator &Inst) {
+  Instruction::BinaryOps BinOpOpc = Inst.getOpcode();
+  Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
+  if (BinOpOpc == Instruction::Sub)
+    ReductionIID = Intrinsic::vector_reduce_add;
+  if (ReductionIID == Intrinsic::not_intrinsic)
+    return nullptr;
+
+  auto checkIntrinsicAndGetItsArgument = [](Value *V,
+                                            Intrinsic::ID IID) -> Value * {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(V);
+    if (!II)
+      return nullptr;
+    if (II->getIntrinsicID() == IID && II->hasOneUse())
+      return II->getArgOperand(0);
+    return nullptr;
+  };
+
+  Value *V0 = checkIntrinsicAndGetItsArgument(Inst.getOperand(0), ReductionIID);
+  if (!V0)
+    return nullptr;
+  Value *V1 = checkIntrinsicAndGetItsArgument(Inst.getOperand(1), ReductionIID);
+  if (!V1)
+    return nullptr;
+
+  Type *VTy = V0->getType();
+  if (V1->getType() != VTy)
+    return nullptr;
+
+  Value *VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
+
+  if (PossiblyDisjointInst *PDInst = dyn_cast<PossiblyDisjointInst>(&Inst))
+    if (auto *PDVectorBO = dyn_cast<PossiblyDisjointInst>(VectorBO))
+      PDVectorBO->setIsDisjoint(PDInst->isDisjoint());
+
+  Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
+  return Rdx;
+}
+
 /// Try to narrow the width of a binop if at least 1 operand is an extend of
 /// of a value. This requires a potentially expensive known bits check to make
 /// sure the narrow op does not overflow.
diff --git a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
index 86f17cdfb79b4..cc88db03fbe2c 100644
--- a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
@@ -4,9 +4,8 @@
 define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @add_of_reduce_add(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -31,9 +30,8 @@ define i32 @sub_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @mul_of_reduce_mul(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = mul i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
@@ -45,9 +43,8 @@ define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @and_of_reduce_and(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = and i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v0)
@@ -59,9 +56,8 @@ define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @or_of_reduce_or(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = or i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -73,9 +69,8 @@ define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @xor_of_reduce_xor(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @xor_of_reduce_xor(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = xor i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v0)
@@ -161,9 +156,8 @@ define i32 @multiple_use_of_reduction_1(<16 x i32> %v0, <16 x i32> %v1, ptr %p)
 define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @do_not_preserve_overflow_flags(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = add nuw nsw i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
@@ -175,9 +169,8 @@ define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @preserve_disjoint_flags(
 ; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = or disjoint i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
@@ -189,9 +182,8 @@ define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1) {
 define i32 @add_of_reduce_add_vscale(<vscale x 16 x i32> %v0, <vscale x 16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @add_of_reduce_add_vscale(
 ; CHECK-SAME: <vscale x 16 x i32> [[V0:%.*]], <vscale x 16 x i32> [[V1:%.*]]) {
-; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V0]])
-; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[V1]])
-; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <vscale x 16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %v0_red = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %v0)

>From bc1a1988ed247d74daf144b19d0c5044c7473d43 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Wed, 29 Jan 2025 03:58:19 -0800
Subject: [PATCH 2/2] Improve cost model

---
 .../include/llvm/Transforms/Utils/LoopUtils.h |   2 +
 .../InstCombine/InstCombineAddSub.cpp         |  18 ++-
 .../InstCombine/InstCombineAndOrXor.cpp       |   9 --
 .../InstCombine/InstCombineInternal.h         |   1 -
 .../InstCombine/InstCombineMulDivRem.cpp      |   3 -
 .../InstCombine/InstructionCombining.cpp      |  57 ---------
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |  20 +++
 .../Transforms/Vectorize/VectorCombine.cpp    | 117 ++++++++++++++++++
 .../ARM/fold-binop-of-reductions.ll           |  93 ++++++++++++++
 .../VectorCombine/fold-binop-of-reductions.ll |   2 +-
 10 files changed, 246 insertions(+), 76 deletions(-)
 create mode 100644 llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index b4cd52fef70fd..1007b9d48fb72 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -365,6 +365,8 @@ constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
 
 /// Returns the arithmetic instruction opcode used when expanding a reduction.
 unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
+/// Returns the reduction intrinsic id corresponding to the binary operation.
+Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc);
 
 /// Returns the min/max intrinsic used when expanding a min/max reduction.
 Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 29cf75a6e318d..658bbbc569766 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1528,9 +1528,6 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
-  if (Instruction *X = foldBinopOfReductions(I))
-    return replaceInstUsesWith(I, X);
-
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
@@ -2390,8 +2387,19 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
     }
   }
 
-  if (Instruction *X = foldBinopOfReductions(I))
-    return replaceInstUsesWith(I, X);
+  auto m_AddRdx = [](Value *&Vec) {
+    return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(Vec)));
+  };
+  Value *V0, *V1;
+  if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
+      V0->getType() == V1->getType()) {
+    // Difference of sums is sum of differences:
+    // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
+    Value *Sub = Builder.CreateSub(V0, V1);
+    Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add,
+                                         {Sub->getType()}, {Sub});
+    return replaceInstUsesWith(I, Rdx);
+  }
 
   if (Constant *C = dyn_cast<Constant>(Op0)) {
     Value *X;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4df70716692a3..4616ea6ab5487 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2385,9 +2385,6 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
-  if (Instruction *X = foldBinopOfReductions(I))
-    return replaceInstUsesWith(I, X);
-
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
@@ -3568,9 +3565,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
-  if (Instruction *X = foldBinopOfReductions(I))
-    return replaceInstUsesWith(I, X);
-
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
@@ -4694,9 +4688,6 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
-  if (Instruction *X = foldBinopOfReductions(I))
-    return replaceInstUsesWith(I, X);
-
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 40a03cc24817d..83e1da98deeda 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -594,7 +594,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
 
   /// Canonicalize the position of binops relative to shufflevector.
   Instruction *foldVectorBinop(BinaryOperator &Inst);
-  Instruction *foldBinopOfReductions(BinaryOperator &Inst);
   Instruction *foldVectorSelect(SelectInst &Sel);
   Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index f2dc65c2051ca..c7023eb79b04e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -199,9 +199,6 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
   if (Instruction *X = foldVectorBinop(I))
     return X;
 
-  if (Instruction *X = foldBinopOfReductions(I))
-    return replaceInstUsesWith(I, X);
-
   if (Instruction *Phi = foldBinopWithPhiOperands(I))
     return Phi;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 01bd918e493bd..5621511570b58 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2318,63 +2318,6 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
   return nullptr;
 }
 
-static Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc) {
-  switch (Opc) {
-  default:
-    break;
-  case Instruction::Add:
-    return Intrinsic::vector_reduce_add;
-  case Instruction::Mul:
-    return Intrinsic::vector_reduce_mul;
-  case Instruction::And:
-    return Intrinsic::vector_reduce_and;
-  case Instruction::Or:
-    return Intrinsic::vector_reduce_or;
-  case Instruction::Xor:
-    return Intrinsic::vector_reduce_xor;
-  }
-  return Intrinsic::not_intrinsic;
-}
-
-Instruction *InstCombinerImpl::foldBinopOfReductions(BinaryOperator &Inst) {
-  Instruction::BinaryOps BinOpOpc = Inst.getOpcode();
-  Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
-  if (BinOpOpc == Instruction::Sub)
-    ReductionIID = Intrinsic::vector_reduce_add;
-  if (ReductionIID == Intrinsic::not_intrinsic)
-    return nullptr;
-
-  auto checkIntrinsicAndGetItsArgument = [](Value *V,
-                                            Intrinsic::ID IID) -> Value * {
-    IntrinsicInst *II = dyn_cast<IntrinsicInst>(V);
-    if (!II)
-      return nullptr;
-    if (II->getIntrinsicID() == IID && II->hasOneUse())
-      return II->getArgOperand(0);
-    return nullptr;
-  };
-
-  Value *V0 = checkIntrinsicAndGetItsArgument(Inst.getOperand(0), ReductionIID);
-  if (!V0)
-    return nullptr;
-  Value *V1 = checkIntrinsicAndGetItsArgument(Inst.getOperand(1), ReductionIID);
-  if (!V1)
-    return nullptr;
-
-  Type *VTy = V0->getType();
-  if (V1->getType() != VTy)
-    return nullptr;
-
-  Value *VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
-
-  if (PossiblyDisjointInst *PDInst = dyn_cast<PossiblyDisjointInst>(&Inst))
-    if (auto *PDVectorBO = dyn_cast<PossiblyDisjointInst>(VectorBO))
-      PDVectorBO->setIsDisjoint(PDInst->isDisjoint());
-
-  Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
-  return Rdx;
-}
-
 /// Try to narrow the width of a binop if at least 1 operand is an extend of
 /// of a value. This requires a potentially expensive known bits check to make
 /// sure the narrow op does not overflow.
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 45915c10107b2..0506ea915a23f 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -957,6 +957,7 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
   }
 }
 
+// This is the inverse to getReductionForBinop
 unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   switch (RdxID) {
   case Intrinsic::vector_reduce_fadd:
@@ -986,6 +987,25 @@ unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   }
 }
 
+// This is the inverse to getArithmeticReductionInstruction
+Intrinsic::ID llvm::getReductionForBinop(Instruction::BinaryOps Opc) {
+  switch (Opc) {
+  default:
+    break;
+  case Instruction::Add:
+    return Intrinsic::vector_reduce_add;
+  case Instruction::Mul:
+    return Intrinsic::vector_reduce_mul;
+  case Instruction::And:
+    return Intrinsic::vector_reduce_and;
+  case Instruction::Or:
+    return Intrinsic::vector_reduce_or;
+  case Instruction::Xor:
+    return Intrinsic::vector_reduce_xor;
+  }
+  return Intrinsic::not_intrinsic;
+}
+
 Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID) {
   switch (RdxID) {
   default:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 746742e14d080..87476977ddfcb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -114,6 +114,7 @@ class VectorCombine {
   bool scalarizeBinopOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
+  bool foldBinopOfReductions(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
   bool foldConcatOfBoolMasks(Instruction &I);
@@ -1242,6 +1243,121 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   return true;
 }
 
+static void analyzeCostOfVecReduction(const IntrinsicInst &II,
+                                      TTI::TargetCostKind CostKind,
+                                      const TargetTransformInfo &TTI,
+                                      InstructionCost &CostBeforeReduction,
+                                      InstructionCost &CostAfterReduction) {
+  Instruction *Op0, *Op1;
+  auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
+  auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
+  unsigned ReductionOpc =
+      getArithmeticReductionInstruction(II.getIntrinsicID());
+  if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
+    bool IsUnsigned = isa<ZExtInst>(RedOp);
+    auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
+
+    CostBeforeReduction =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+    CostAfterReduction =
+        TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
+                                     ExtType, FastMathFlags(), CostKind);
+    return;
+  }
+  if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
+      match(RedOp,
+            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
+      match(Op0, m_ZExtOrSExt(m_Value())) &&
+      Op0->getOpcode() == Op1->getOpcode() &&
+      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
+    // Matched reduce.add(ext(mul(ext(A), ext(B))))
+    bool IsUnsigned = isa<ZExtInst>(Op0);
+    auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
+    VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
+
+    InstructionCost ExtCost =
+        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
+                             TTI::CastContextHint::None, CostKind, Op0);
+    InstructionCost MulCost =
+        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
+    InstructionCost Ext2Cost =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+
+    CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
+    CostAfterReduction =
+        TTI.getMulAccReductionCost(IsUnsigned, II.getType(), ExtType, CostKind);
+    return;
+  }
+  CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
+                                                      std::nullopt, CostKind);
+  return;
+}
+
+bool VectorCombine::foldBinopOfReductions(Instruction &I) {
+  Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
+  Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
+  if (BinOpOpc == Instruction::Sub)
+    ReductionIID = Intrinsic::vector_reduce_add;
+  if (ReductionIID == Intrinsic::not_intrinsic)
+    return false;
+
+  auto checkIntrinsicAndGetItsArgument = [](Value *V,
+                                            Intrinsic::ID IID) -> Value * {
+    auto *II = dyn_cast<IntrinsicInst>(V);
+    if (!II)
+      return nullptr;
+    if (II->getIntrinsicID() == IID && II->hasOneUse())
+      return II->getArgOperand(0);
+    return nullptr;
+  };
+
+  Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
+  if (!V0)
+    return false;
+  Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
+  if (!V1)
+    return false;
+
+  auto *VTy = cast<VectorType>(V0->getType());
+  if (V1->getType() != VTy)
+    return false;
+  const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
+  const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
+  unsigned ReductionOpc =
+      getArithmeticReductionInstruction(II0.getIntrinsicID());
+
+  InstructionCost OldCost = 0;
+  InstructionCost NewCost = 0;
+  InstructionCost CostOfRedOperand0 = 0;
+  InstructionCost CostOfRed0 = 0;
+  InstructionCost CostOfRedOperand1 = 0;
+  InstructionCost CostOfRed1 = 0;
+  analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
+  analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
+  OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
+  NewCost =
+      CostOfRedOperand0 + CostOfRedOperand1 +
+      TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
+      TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
+  if (NewCost >= OldCost || !NewCost.isValid())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
+                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+  Value *VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
+  if (auto *PDInst = dyn_cast<PossiblyDisjointInst>(&I))
+    if (auto *PDVectorBO = dyn_cast<PossiblyDisjointInst>(VectorBO))
+      PDVectorBO->setIsDisjoint(PDInst->isDisjoint());
+
+  Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
+  replaceValue(I, *Rdx);
+  return true;
+}
+
 // Check if memory loc modified between two instrs in the same BB
 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                  BasicBlock::iterator End,
@@ -3382,6 +3498,7 @@ bool VectorCombine::run() {
         if (Instruction::isBinaryOp(Opcode)) {
           MadeChange |= foldExtractExtract(I);
           MadeChange |= foldExtractedCmps(I);
+          MadeChange |= foldBinopOfReductions(I);
         }
         break;
       }
diff --git a/llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll
new file mode 100644
index 0000000000000..ad362ef2bf900
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/ARM/fold-binop-of-reductions.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+mve.fp -passes=vector-combine -S | FileCheck %s
+
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+define i16 @add_of_reduce_add(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: define i16 @add_of_reduce_add(
+; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT:    ret i16 [[RES]]
+;
+  %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
+  %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
+  %res = add i16 %v0_red, %v1_red
+  ret i16 %res
+}
+
+define i16 @reduce_zext_0(<8 x i8> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: define i16 @reduce_zext_0(
+; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i16> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
+; CHECK-NEXT:    [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i16 [[RES]]
+;
+  %zext_ = zext <8 x i8> %v0 to <8 x i16>
+  %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
+  %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v1)
+  %res = add i16 %v0_red, %v1_red
+  ret i16 %res
+}
+
+define i16 @reduce_zext_1(<8 x i16> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: define i16 @reduce_zext_1(
+; CHECK-SAME: <8 x i16> [[V0:%.*]], <8 x i8> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
+; CHECK-NEXT:    [[V0_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[ZEXT_]])
+; CHECK-NEXT:    [[RES:%.*]] = add i16 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i16 [[RES]]
+;
+  %zext_ = zext <8 x i8> %v1 to <8 x i16>
+  %v0_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v0)
+  %v1_red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %zext_)
+  %res = add i16 %v0_red, %v1_red
+  ret i16 %res
+}
+
+define i32 @mul_acc_pattern_0(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: define i32 @mul_acc_pattern_0(
+; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
+; CHECK-NEXT:    [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
+; CHECK-NEXT:    [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
+; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
+; CHECK-NEXT:    [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[RED_MUL_ACC_PATTERN]], [[RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
+  %inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
+  %mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
+  %zext_ = zext <8 x i16> %mul_ to <8 x i32>
+  %red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
+  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
+  %res = add i32 %red_mul_acc_pattern, %red
+  ret i32 %res
+}
+
+define i32 @mul_acc_pattern_1(<8 x i8> %v0, <8 x i8> %v1, <8 x i32> %v2) {
+; CHECK-LABEL: define i32 @mul_acc_pattern_1(
+; CHECK-SAME: <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[INNER_ZEXT_0:%.*]] = zext <8 x i8> [[V0]] to <8 x i16>
+; CHECK-NEXT:    [[INNER_ZEXT_1:%.*]] = zext <8 x i8> [[V1]] to <8 x i16>
+; CHECK-NEXT:    [[MUL_:%.*]] = mul <8 x i16> [[INNER_ZEXT_0]], [[INNER_ZEXT_1]]
+; CHECK-NEXT:    [[ZEXT_:%.*]] = zext <8 x i16> [[MUL_]] to <8 x i32>
+; CHECK-NEXT:    [[RED_MUL_ACC_PATTERN:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[ZEXT_]])
+; CHECK-NEXT:    [[RED:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V2]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[RED]], [[RED_MUL_ACC_PATTERN]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %inner_zext_0 = zext <8 x i8> %v0 to <8 x i16>
+  %inner_zext_1 = zext <8 x i8> %v1 to <8 x i16>
+  %mul_ = mul <8 x i16> %inner_zext_0, %inner_zext_1
+  %zext_ = zext <8 x i16> %mul_ to <8 x i32>
+  %red_mul_acc_pattern = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %zext_)
+  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v2)
+  %res = add i32 %red, %red_mul_acc_pattern
+  ret i32 %res
+}
diff --git a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
index cc88db03fbe2c..5f29af9de5a39 100644
--- a/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
+++ b/llvm/test/Transforms/VectorCombine/fold-binop-of-reductions.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
 
 define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
 ; CHECK-LABEL: define i32 @add_of_reduce_add(