[llvm] [VPlan] Add VPBundleRecipe, replacing extended reduction recipes. (PR #144281)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 20 00:23:04 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/144281
>From 736357e6154d4a044ecf577d21d96e0898ea4a9d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 13 Jun 2025 21:07:04 +0100
Subject: [PATCH 1/4] VPBundleRecipe
This patch adds a new recipe to combine multiple recipes into a 'bundle'
recipe, which should be considered as a single entity for cost-modeling and
transforms. The recipe needs to be 'unbundled', i.e. replaced by its
individual recipes, before execution.
This subsumes VPExtendedReductionRecipe and
VPMulAccumulateReductionRecipe and should make it easier to extend to
include more types of bundled patterns, like e.g. extends folded into
loads or various arithmetic instructions, if supported by the target.
It allows avoiding re-creating the original recipes when converting to
concrete recipes, together with removing the need to record various
information. The current version of the patch still retains the original
printing matching VPExtendedReductionRecipe and VPMulAccumulateReductionRecipe,
but this specialized print could be replaced with printing the bundled recipes
directly.
Currently the unbundle implementation is a bit more complicated than
necessary, as we need to fold the extends across ops to match the
current behavior, but there's quite possibly a better place to do so.
---
llvm/lib/Transforms/Vectorize/VPlan.h | 335 +++++++-----------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 204 +++++++----
.../Transforms/Vectorize/VPlanTransforms.cpp | 133 ++-----
llvm/lib/Transforms/Vectorize/VPlanValue.h | 5 +-
.../LoopVectorize/ARM/mve-reductions.ll | 27 +-
.../vplan-printing-reductions.ll | 12 +-
7 files changed, 303 insertions(+), 418 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5a3c4a514a5dd..256706deb0977 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -525,14 +525,13 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
static inline bool classof(const VPRecipeBase *R) {
switch (R->getVPDefID()) {
+ case VPRecipeBase::VPBundleSC:
case VPRecipeBase::VPDerivedIVSC:
case VPRecipeBase::VPEVLBasedIVPHISC:
case VPRecipeBase::VPExpandSCEVSC:
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
- case VPRecipeBase::VPMulAccumulateReductionSC:
- case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
case VPRecipeBase::VPScalarIVStepsSC:
case VPRecipeBase::VPVectorPointerSC:
@@ -852,9 +851,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
- R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
+ R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
}
static inline bool classof(const VPUser *U) {
@@ -2431,29 +2428,6 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
}
setUnderlyingValue(I);
}
-
- /// For VPExtendedReductionRecipe.
- /// Note that the debug location is from the extend.
- VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
- ArrayRef<VPValue *> Operands, VPValue *CondOp,
- bool IsOrdered, DebugLoc DL)
- : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind),
- IsOrdered(IsOrdered), IsConditional(CondOp) {
- if (CondOp)
- addOperand(CondOp);
- }
-
- /// For VPMulAccumulateReductionRecipe.
- /// Note that the NUW/NSW flags and the debug location are from the Mul.
- VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind,
- ArrayRef<VPValue *> Operands, VPValue *CondOp,
- bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL)
- : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind),
- IsOrdered(IsOrdered), IsConditional(CondOp) {
- if (CondOp)
- addOperand(CondOp);
- }
-
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
@@ -2479,9 +2453,7 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
- R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC;
+ R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
}
static inline bool classof(const VPUser *U) {
@@ -2620,190 +2592,6 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
}
};
-/// A recipe to represent inloop extended reduction operations, performing a
-/// reduction on a extended vector operand into a scalar value, and adding the
-/// result to a chain. This recipe is abstract and needs to be lowered to
-/// concrete recipes before codegen. The operands are {ChainOp, VecOp,
-/// [Condition]}.
-class VPExtendedReductionRecipe : public VPReductionRecipe {
- /// Opcode of the extend for VecOp.
- Instruction::CastOps ExtOp;
-
- /// The scalar type after extending.
- Type *ResultTy;
-
- /// For cloning VPExtendedReductionRecipe.
- VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed)
- : VPReductionRecipe(
- VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(),
- {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(),
- ExtRed->isOrdered(), ExtRed->getDebugLoc()),
- ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) {
- transferFlags(*ExtRed);
- setUnderlyingValue(ExtRed->getUnderlyingValue());
- }
-
-public:
- VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext)
- : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(),
- {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(),
- R->isOrdered(), Ext->getDebugLoc()),
- ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) {
- assert((ExtOp == Instruction::CastOps::ZExt ||
- ExtOp == Instruction::CastOps::SExt) &&
- "VPExtendedReductionRecipe only supports zext and sext.");
-
- transferFlags(*Ext);
- setUnderlyingValue(R->getUnderlyingValue());
- }
-
- ~VPExtendedReductionRecipe() override = default;
-
- VPExtendedReductionRecipe *clone() override {
- return new VPExtendedReductionRecipe(this);
- }
-
- VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC);
-
- void execute(VPTransformState &State) override {
- llvm_unreachable("VPExtendedReductionRecipe should be transform to "
- "VPExtendedRecipe + VPReductionRecipe before execution.");
- };
-
- /// Return the cost of VPExtendedReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- /// The scalar type after extending.
- Type *getResultType() const { return ResultTy; }
-
- /// Is the extend ZExt?
- bool isZExt() const { return getExtOpcode() == Instruction::ZExt; }
-
- /// Get the opcode of the extend for VecOp.
- Instruction::CastOps getExtOpcode() const { return ExtOp; }
-};
-
-/// A recipe to represent inloop MulAccumulateReduction operations, multiplying
-/// the vector operands (which may be extended), performing a reduction.add on
-/// the result, and adding the scalar result to a chain. This recipe is abstract
-/// and needs to be lowered to concrete recipes before codegen. The operands are
-/// {ChainOp, VecOp1, VecOp2, [Condition]}.
-class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
- /// Opcode of the extend for VecOp1 and VecOp2.
- Instruction::CastOps ExtOp;
-
- /// Non-neg flag of the extend recipe.
- bool IsNonNeg = false;
-
- /// The scalar type after extending.
- Type *ResultTy = nullptr;
-
- /// For cloning VPMulAccumulateReductionRecipe.
- VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc)
- : VPReductionRecipe(
- VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(),
- {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()},
- MulAcc->getCondOp(), MulAcc->isOrdered(),
- WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()),
- MulAcc->getDebugLoc()),
- ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()),
- ResultTy(MulAcc->getResultType()) {
- transferFlags(*MulAcc);
- setUnderlyingValue(MulAcc->getUnderlyingValue());
- }
-
-public:
- VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
- VPWidenCastRecipe *Ext0,
- VPWidenCastRecipe *Ext1, Type *ResultTy)
- : VPReductionRecipe(
- VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
- {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)},
- R->getCondOp(), R->isOrdered(),
- WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
- R->getDebugLoc()),
- ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) {
- assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
- Instruction::Add &&
- "The reduction instruction in MulAccumulateteReductionRecipe must "
- "be Add");
- assert((ExtOp == Instruction::CastOps::ZExt ||
- ExtOp == Instruction::CastOps::SExt) &&
- "VPMulAccumulateReductionRecipe only supports zext and sext.");
- setUnderlyingValue(R->getUnderlyingValue());
- // Only set the non-negative flag if the original recipe contains.
- if (Ext0->hasNonNegFlag())
- IsNonNeg = Ext0->isNonNeg();
- }
-
- VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul,
- Type *ResultTy)
- : VPReductionRecipe(
- VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(),
- {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)},
- R->getCondOp(), R->isOrdered(),
- WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()),
- R->getDebugLoc()),
- ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) {
- assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) ==
- Instruction::Add &&
- "The reduction instruction in MulAccumulateReductionRecipe must be "
- "Add");
- setUnderlyingValue(R->getUnderlyingValue());
- }
-
- ~VPMulAccumulateReductionRecipe() override = default;
-
- VPMulAccumulateReductionRecipe *clone() override {
- return new VPMulAccumulateReductionRecipe(this);
- }
-
- VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
-
- void execute(VPTransformState &State) override {
- llvm_unreachable("VPMulAccumulateReductionRecipe should transform to "
- "VPWidenCastRecipe + "
- "VPWidenRecipe + VPReductionRecipe before execution");
- }
-
- /// Return the cost of VPMulAccumulateReductionRecipe.
- InstructionCost computeCost(ElementCount VF,
- VPCostContext &Ctx) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-#endif
-
- Type *getResultType() const { return ResultTy; }
-
- /// The first vector value to be extended and reduced.
- VPValue *getVecOp0() const { return getOperand(1); }
-
- /// The second vector value to be extended and reduced.
- VPValue *getVecOp1() const { return getOperand(2); }
-
- /// Return true if this recipe contains extended operands.
- bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; }
-
- /// Return the opcode of the extends for the operands.
- Instruction::CastOps getExtOpcode() const { return ExtOp; }
-
- /// Return if the operands are zero-extended.
- bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; }
-
- /// Return true if the operand extends have the non-negative flag.
- bool isNonNeg() const { return IsNonNeg; }
-};
-
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to be
@@ -2922,6 +2710,123 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
}
};
+/// A recipe to combine multiple recipes into a 'bundle' recipe, which should be
+/// considered as a single entity for cost-modeling and transforms. The recipe
+/// needs to be 'unbundled', i.e. replaced by its individual recipes, before
+/// execution.
+class VPBundleRecipe : public VPSingleDefRecipe {
+ enum class BundleTypes {
+ ExtendedReduction,
+ MulAccumulateReduction,
+ };
+
+ /// Recipes bundled together in this VPBundleRecipe.
+ SmallVector<VPSingleDefRecipe *> BundledOps;
+
+ /// Temporary VPValues used for external operands of the bundle, i.e. operands
+ /// not defined by recipes in the bundle.
+ SmallVector<VPValue *> TmpValues;
+
+ /// Type of the bundle.
+ BundleTypes BundleType;
+
+ VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle)
+ : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledOps(ToBundle),
+ BundleType(BundleType) {
+ // Bundle up the operand recipes.
+ SmallPtrSet<VPUser *, 4> BundledUsers;
+ for (auto *R : ToBundle)
+ BundledUsers.insert(R);
+
+  // Recipes in the bundle, except the last one, must only be used inside the
+  // bundle. If there are other external users, clone the recipes for the bundle.
+ for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
+ if (all_of(R->users(), [&BundledUsers](VPUser *U) {
+ return BundledUsers.contains(U);
+ })) {
+ if (R->getParent())
+ R->removeFromParent();
+ continue;
+ }
+    // There are users external to the bundle. Clone the recipe for use in the
+    // bundle and update all its in-bundle users.
+ this->BundledOps[Idx] = R->clone();
+ BundledUsers.insert(this->BundledOps[Idx]);
+ R->replaceUsesWithIf(this->BundledOps[Idx],
+ [&BundledUsers](VPUser &U, unsigned) {
+ return BundledUsers.contains(&U);
+ });
+ }
+ BundledOps.back()->removeFromParent();
+
+    // Internalize all external operands to the bundled operations. To do so,
+    // create new temporary VPValues for all operands not defined by a recipe in
+    // the bundle. The original operands are added as operands of the
+    // VPBundleRecipe.
+ for (auto *R : this->BundledOps) {
+ for (const auto &[Idx, Op] : enumerate(R->operands())) {
+ auto *Def = Op->getDefiningRecipe();
+ if (Def && BundledUsers.contains(Def))
+ continue;
+ addOperand(Op);
+ TmpValues.push_back(new VPValue());
+ R->setOperand(Idx, TmpValues.back());
+ }
+ }
+ }
+
+public:
+ VPBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+ VPBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
+ VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenRecipe *Mul, VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
+ {Ext0, Ext1, Mul, Red}) {}
+ VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+ VPWidenRecipe *Mul, VPWidenCastRecipe *Ext2,
+ VPReductionRecipe *Red)
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction,
+ {Ext0, Ext1, Mul, Ext2, Red}) {}
+
+ ~VPBundleRecipe() override {
+ SmallPtrSet<VPRecipeBase *, 4> Seen;
+ for (auto *R : reverse(BundledOps))
+ if (Seen.insert(R).second)
+ delete R;
+ for (VPValue *T : TmpValues)
+ delete T;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPBundleSC)
+
+ VPBundleRecipe *clone() override {
+ return new VPBundleRecipe(BundleType, BundledOps);
+ }
+
+ /// Return the VPSingleDefRecipe producing the final result of the bundled
+ /// recipe.
+ VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+
+ void unbundle();
+
+  /// VPBundleRecipe is abstract: it must be unbundled (via unbundle()) and
+  /// removed before VPlan execution, so executing it directly is an error.
+ void execute(VPTransformState &State) override {
+ llvm_unreachable("recipe must be removed before execute");
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
/// control converges back from a Branch-on-Mask. The phi nodes are needed in
/// order to merge values that are set under such a branch and feed their uses.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..c8336e7b3f92c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -267,6 +267,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
+ .Case<VPBundleRecipe>([this](const auto *R) {
+ return inferScalarType(R->getOperand(R->getNumOperands() - 2));
+ })
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
@@ -296,8 +299,6 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
// TODO: Use info from interleave group.
return V->getUnderlyingValue()->getType();
})
- .Case<VPExtendedReductionRecipe, VPMulAccumulateReductionRecipe>(
- [](const auto *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 048286d7a97bc..392d6c3d32c87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -73,8 +73,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
- case VPExtendedReductionSC:
- case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -123,8 +121,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
- case VPExtendedReductionSC:
- case VPMulAccumulateReductionSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
case VPWidenCastSC:
@@ -163,8 +159,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPBlendSC:
case VPReductionEVLSC:
case VPReductionSC:
- case VPExtendedReductionSC:
- case VPMulAccumulateReductionSC:
case VPScalarIVStepsSC:
case VPVectorPointerSC:
case VPWidenCanonicalIVSC:
@@ -2582,30 +2576,142 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
Ctx.CostKind);
}
-InstructionCost
-VPExtendedReductionRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
- Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
- assert(RedTy->isIntegerTy() &&
- "ExtendedReduction only support integer type currently.");
- return Ctx.TTI.getExtendedReductionCost(Opcode, isZExt(), RedTy, SrcVecTy,
- std::nullopt, Ctx.CostKind);
+void VPBundleRecipe::unbundle() {
+ for (auto *Op : BundledOps)
+ if (!Op->getParent())
+ Op->insertBefore(this);
+
+ for (const auto &[Idx, Op] : enumerate(operands()))
+ TmpValues[Idx]->replaceAllUsesWith(Op);
+
+ replaceAllUsesWith(getResultOp());
+
+ if (BundleType == BundleTypes::MulAccumulateReduction &&
+ BundledOps.size() == 5) {
+ // Note that we will drop the extend after mul which transforms
+ // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+ auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
+ auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
+ auto *Op0 =
+ new VPWidenCastRecipe(Ext0->getOpcode(), Ext0->getOperand(0),
+ Ext2->getResultType(), *Ext0, getDebugLoc());
+ Op0->insertBefore(Ext0);
+
+ VPSingleDefRecipe *Op1 = Op0;
+ if (Ext0 != Ext1) {
+ Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
+ Ext2->getResultType(), *Ext1, getDebugLoc());
+ Op1->insertBefore(Ext0);
+ }
+ auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
+ auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
+ Mul->setOperand(0, Op0);
+ Mul->setOperand(1, Op1);
+ Red->setOperand(1, Mul);
+ Ext0->eraseFromParent();
+ Ext2->eraseFromParent();
+ if (Ext0 != Ext1)
+ Ext1->eraseFromParent();
+ }
+ BundledOps.clear();
}
-InstructionCost
-VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *RedTy = Ctx.Types.inferScalarType(this);
- auto *SrcVecTy =
- cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
- return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy,
- Ctx.CostKind);
+ auto *SrcVecTy = cast<VectorType>(
+ toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
+ assert(RedTy->isIntegerTy() &&
+ "ExtendedReduction only support integer type currently.");
+ switch (BundleType) {
+ case BundleTypes::ExtendedReduction: {
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(BundledOps[1])->getRecurrenceKind());
+ return Ctx.TTI.getExtendedReductionCost(
+ Opcode,
+ cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ Instruction::ZExt,
+ RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
+ }
+ case BundleTypes::MulAccumulateReduction:
+ return Ctx.TTI.getMulAccReductionCost(
+ BundledOps.size() > 2
+ ? cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ Instruction::ZExt
+ : false,
+ RedTy, SrcVecTy, Ctx.CostKind);
+ }
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "BUNDLE ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ auto *Red = cast<VPReductionRecipe>(BundledOps.back());
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
+
+ switch (BundleType) {
+ case BundleTypes::ExtendedReduction: {
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ O << " +";
+ O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ Red->printFlags(O);
+
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+ O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+ << *Ext0->getResultType();
+ if (Red->isConditional()) {
+ O << ", ";
+ Red->getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ break;
+ }
+ case BundleTypes::MulAccumulateReduction: {
+ getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+ O << " + ";
+ O << "reduce."
+ << Instruction::getOpcodeName(
+ RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+ << " (";
+ O << "mul";
+ auto *Mul = cast<VPWidenRecipe>(BundledOps.size() == 2 ? BundledOps[0]
+ : BundledOps[2]);
+ Mul->printFlags(O);
+ bool IsExtended = BundledOps.size() > 2;
+ if (IsExtended)
+ O << "(";
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ if (IsExtended) {
+ auto *Ext0 = cast<VPWidenCastRecipe>(
+ BundledOps.size() == 5 ? BundledOps[3] : BundledOps[0]);
+ O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+ << *Ext0->getResultType() << "), (";
+ } else {
+ O << ", ";
+ }
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ if (IsExtended) {
+ auto *Ext1 = cast<VPWidenCastRecipe>(
+ BundledOps.size() == 5 ? BundledOps[3] : BundledOps[1]);
+ O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+ << *Ext1->getResultType() << ")";
+ }
+ if (Red->isConditional()) {
+ O << ", ";
+ Red->getCondOp()->printAsOperand(O, SlotTracker);
+ }
+ O << ")";
+ break;
+ }
+ }
+}
+
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "REDUCE ";
@@ -2648,58 +2754,6 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
O << ")";
}
-void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "EXTENDED-REDUCE ";
- printAsOperand(O, SlotTracker);
- O << " = ";
- getChainOp()->printAsOperand(O, SlotTracker);
- O << " +";
- O << " reduce."
- << Instruction::getOpcodeName(
- RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
- << " (";
- getVecOp()->printAsOperand(O, SlotTracker);
- printFlags(O);
- O << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType();
- if (isConditional()) {
- O << ", ";
- getCondOp()->printAsOperand(O, SlotTracker);
- }
- O << ")";
-}
-
-void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << Indent << "MULACC-REDUCE ";
- printAsOperand(O, SlotTracker);
- O << " = ";
- getChainOp()->printAsOperand(O, SlotTracker);
- O << " + ";
- O << "reduce."
- << Instruction::getOpcodeName(
- RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
- << " (";
- O << "mul";
- printFlags(O);
- if (isExtended())
- O << "(";
- getVecOp0()->printAsOperand(O, SlotTracker);
- if (isExtended())
- O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType()
- << "), (";
- else
- O << ", ";
- getVecOp1()->printAsOperand(O, SlotTracker);
- if (isExtended())
- O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType()
- << ")";
- if (isConditional()) {
- O << ", ";
- getCondOp()->printAsOperand(O, SlotTracker);
- }
- O << ")";
-}
#endif
/// A helper function to scalarize a single Instruction in the innermost loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 05a0e15f9a199..0b4cd10f35252 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1779,9 +1779,9 @@ void VPlanTransforms::truncateToMinimalBitwidths(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
- VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
- &R))
+ if (!isa<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
+ VPReplicateRecipe, VPWidenSelectRecipe, VPWidenLoadRecipe,
+ VPWidenIntrinsicRecipe>(&R))
continue;
VPValue *ResultVPV = R.getVPSingleValue();
@@ -2530,83 +2530,6 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
R->dissolveToCFGLoop();
}
-// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe.
-static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
- VPWidenCastRecipe *Ext;
- // Only ZExt contains non-neg flags.
- if (ExtRed->isZExt())
- Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
- ExtRed->getResultType(), *ExtRed,
- ExtRed->getDebugLoc());
- else
- Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
- ExtRed->getResultType(), {},
- ExtRed->getDebugLoc());
-
- auto *Red = new VPReductionRecipe(
- ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext,
- ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc());
- Ext->insertBefore(ExtRed);
- Red->insertBefore(ExtRed);
- ExtRed->replaceAllUsesWith(Red);
- ExtRed->eraseFromParent();
-}
-
-// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) +
-// VPReductionRecipe (reduce.add)
-// + VPWidenCastRecipe (optional).
-static void
-expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
- // Generate inner VPWidenCastRecipes if necessary.
- // Note that we will drop the extend after mul which transforms
- // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
- VPValue *Op0, *Op1;
- if (MulAcc->isExtended()) {
- Type *RedTy = MulAcc->getResultType();
- if (MulAcc->isZExt())
- Op0 = new VPWidenCastRecipe(
- MulAcc->getExtOpcode(), MulAcc->getVecOp0(), RedTy,
- VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), MulAcc->getDebugLoc());
- else
- Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
- RedTy, {}, MulAcc->getDebugLoc());
- Op0->getDefiningRecipe()->insertBefore(MulAcc);
- // Prevent reduce.add(mul(ext(A), ext(A))) generate duplicate
- // VPWidenCastRecipe.
- if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) {
- Op1 = Op0;
- } else {
- if (MulAcc->isZExt())
- Op1 = new VPWidenCastRecipe(
- MulAcc->getExtOpcode(), MulAcc->getVecOp1(), RedTy,
- VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()),
- MulAcc->getDebugLoc());
- else
- Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
- RedTy, {}, MulAcc->getDebugLoc());
- Op1->getDefiningRecipe()->insertBefore(MulAcc);
- }
- } else {
- // No extends in this MulAccRecipe.
- Op0 = MulAcc->getVecOp0();
- Op1 = MulAcc->getVecOp1();
- }
-
- std::array<VPValue *, 2> MulOps = {Op0, Op1};
- auto *Mul = new VPWidenRecipe(
- Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(),
- MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc());
- Mul->insertBefore(MulAcc);
-
- auto *Red = new VPReductionRecipe(
- MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul,
- MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc());
- Red->insertBefore(MulAcc);
-
- MulAcc->replaceAllUsesWith(Red);
- MulAcc->eraseFromParent();
-}
-
void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
Type &CanonicalIVTy) {
using namespace llvm::VPlanPatternMatch;
@@ -2666,12 +2589,10 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
ToRemove.push_back(VPI);
}
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(&R)) {
- expandVPExtendedReduction(ExtRed);
- continue;
+ if (auto *Bundle = dyn_cast<VPBundleRecipe>(&R)) {
+ Bundle->unbundle();
+ Bundle->eraseFromParent();
}
- if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(&R))
- expandVPMulAccumulateReduction(MulAcc);
}
}
@@ -2771,10 +2692,10 @@ void VPlanTransforms::handleUncountableEarlyExit(
}
/// This function tries convert extended in-loop reductions to
-/// VPExtendedReductionRecipe and clamp the \p Range if it is beneficial and
-/// valid. The created recipe must be lowered to concrete
+/// VPBundleRecipe and clamp the \p Range if it is beneficial and
+/// valid. The created recipe must be unbundled to its constituent
/// recipes before execution.
-static VPExtendedReductionRecipe *
+static VPBundleRecipe *
tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
VFRange &Range) {
using namespace VPlanPatternMatch;
@@ -2808,19 +2729,19 @@ tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
cast<VPWidenCastRecipe>(VecOp)->getOpcode() ==
Instruction::CastOps::ZExt,
Ctx.Types.inferScalarType(A)))
- return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(VecOp));
+ return new VPBundleRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
return nullptr;
}
/// This function tries convert extended in-loop reductions to
-/// VPMulAccumulateReductionRecipe and clamp the \p Range if it is beneficial
-/// and valid. The created VPExtendedReductionRecipe must be lower to concrete
-/// recipes before execution. Patterns of MulAccumulateReduction:
+/// VPBundleRecipe and clamp the \p Range if it is beneficial
+/// and valid. The created VPBundleRecipe must be unbundled to its constituent
+/// recipes before execution. Patterns of the VPBundleRecipe:
/// reduce.add(mul(...)),
/// reduce.add(mul(ext(A), ext(B))),
/// reduce.add(ext(mul(ext(A), ext(B)))).
-static VPMulAccumulateReductionRecipe *
+static VPBundleRecipe *
tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
VPCostContext &Ctx, VFRange &Range) {
using namespace VPlanPatternMatch;
@@ -2876,12 +2797,13 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
Instruction::CastOps::ZExt,
- Mul, RecipeA, RecipeB, nullptr))
- return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB,
- RecipeA->getResultType());
+ Mul, RecipeA, RecipeB, nullptr)) {
+ return new VPBundleRecipe(RecipeA, RecipeB, Mul, Red);
+ }
// Match reduce.add(mul).
- if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
- return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy);
+ if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) {
+ return new VPBundleRecipe(Mul, Red);
+ }
}
// Match reduce.add(ext(mul(ext(A), ext(B)))).
// All extend recipes must have same opcode or A == B
@@ -2898,9 +2820,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
Ext0->getOpcode() == Ext1->getOpcode() &&
IsMulAccValidAndClampRange(Ext0->getOpcode() ==
Instruction::CastOps::ZExt,
- Mul, Ext0, Ext1, Ext))
- return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1,
- Ext->getResultType());
+ Mul, Ext0, Ext1, Ext)) {
+ return new VPBundleRecipe(Ext0, Ext1, Mul, Ext, Red);
+ }
}
return nullptr;
}
@@ -2910,8 +2832,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
VPCostContext &Ctx,
VFRange &Range) {
- VPReductionRecipe *AbstractR = nullptr;
-
+ VPBundleRecipe *AbstractR = nullptr;
+ auto IP = std::next(Red->getIterator());
+ auto *VPBB = Red->getParent();
if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
AbstractR = MulAcc;
else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
@@ -2920,7 +2843,7 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
if (!AbstractR)
return;
- AbstractR->insertBefore(Red);
+ AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
}
@@ -2928,7 +2851,7 @@ void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
- for (VPRecipeBase &R : *VPBB) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..7246cb9a75ed8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -32,6 +32,7 @@ namespace llvm {
// Forward declarations.
class raw_ostream;
class Value;
+class VPBundleRecipe;
class VPDef;
struct VPDoubleValueDef;
class VPSlotTracker;
@@ -49,6 +50,7 @@ class VPValue {
friend struct VPDoubleValueDef;
friend class VPInterleaveRecipe;
friend class VPlan;
+ friend class VPBundleRecipe;
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
@@ -328,6 +330,7 @@ class VPDef {
/// type identification.
using VPRecipeTy = enum {
VPBranchOnMaskSC,
+ VPBundleSC,
VPDerivedIVSC,
VPExpandSCEVSC,
VPIRInstructionSC,
@@ -335,8 +338,6 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
- VPMulAccumulateReductionSC,
- VPExtendedReductionSC,
VPPartialReductionSC,
VPReplicateSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index f179a3ae04d23..212340fdcda26 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483632
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
@@ -1526,7 +1526,8 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 978f1b80d26da..3cd37851ec725 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -287,12 +287,12 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx>
; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]>
-; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -332,7 +332,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -340,7 +340,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>)
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
@@ -382,7 +382,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT: vector.body:
; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]>
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, ir<[[RDX_NEXT:%.+]]>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]>
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]>
@@ -390,7 +390,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado
; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]>
; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]>
-; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
+; CHECK-NEXT: BUNDLE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64))
; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
; CHECK-NEXT: No successors
>From 3c3f9e4ab79f1914923db6df0b2cc71c18c31e89 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Jun 2025 11:56:40 +0100
Subject: [PATCH 2/4] !fixup address comments, thanks!
---
llvm/lib/Transforms/Vectorize/VPlan.h | 24 +++++++++++--------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 ++++-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++-
.../LoopVectorize/ARM/mve-reductions.ll | 24 +++++++++----------
4 files changed, 33 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 256706deb0977..00412fb70aa0a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2735,10 +2735,10 @@ class VPBundleRecipe : public VPSingleDefRecipe {
BundleType(BundleType) {
// Bundle up the operand recipes.
SmallPtrSet<VPUser *, 4> BundledUsers;
- for (auto *R : ToBundle)
+ for (auto *R : BundledOps)
BundledUsers.insert(R);
- // Recipes in the bundle, expect the last one, must only be used inside the
+ // Recipes in the bundle, except the last one, must only be used inside the
// bundle. If there other external users, clone the recipes for the bundle.
for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
if (all_of(R->users(), [&BundledUsers](VPUser *U) {
@@ -2748,14 +2748,14 @@ class VPBundleRecipe : public VPSingleDefRecipe {
R->removeFromParent();
continue;
}
- // There users external to the bundle. Clone the recipe for use in the
+ // The users external to the bundle. Clone the recipe for use in the
// bundle and update all its in-bundle users.
- this->BundledOps[Idx] = R->clone();
- BundledUsers.insert(this->BundledOps[Idx]);
- R->replaceUsesWithIf(this->BundledOps[Idx],
- [&BundledUsers](VPUser &U, unsigned) {
- return BundledUsers.contains(&U);
- });
+ VPSingleDefRecipe *Copy = R->clone();
+ BundledOps[Idx] = Copy;
+ BundledUsers.insert(Copy);
+ R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+ return BundledUsers.contains(&U);
+ });
}
BundledOps.back()->removeFromParent();
@@ -2763,7 +2763,7 @@ class VPBundleRecipe : public VPSingleDefRecipe {
// create new temporary VPValues for all operands not defined by recipe in
// the bundle. The original operands are added as operands of the
// VPBundleRecipe.
- for (auto *R : this->BundledOps) {
+ for (auto *R : BundledOps) {
for (const auto &[Idx, Op] : enumerate(R->operands())) {
auto *Def = Op->getDefiningRecipe();
if (Def && BundledUsers.contains(Def))
@@ -2802,6 +2802,7 @@ class VPBundleRecipe : public VPSingleDefRecipe {
VP_CLASSOF_IMPL(VPDef::VPBundleSC)
VPBundleRecipe *clone() override {
+ assert(!BundledOps.empty() && "empty bundles should be removed");
return new VPBundleRecipe(BundleType, BundledOps);
}
@@ -2809,6 +2810,9 @@ class VPBundleRecipe : public VPSingleDefRecipe {
/// recipe.
VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+ /// Insert the bundled recipes back into the VPlan, directly before the
+ /// current recipe. Leaves the bundle recipe empty and the recipe must be
+ /// removed before codegen.
void unbundle();
/// Generate the extraction of the appropriate bit from the block mask and the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index c8336e7b3f92c..1e2961c5beb56 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -268,7 +268,11 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPBundleRecipe>([this](const auto *R) {
- return inferScalarType(R->getOperand(R->getNumOperands() - 2));
+ unsigned RdxOpIdxOffset =
+ cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
+ : 1;
+ return inferScalarType(
+ R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
})
.Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 392d6c3d32c87..c6bcb1491ee4f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2590,6 +2590,7 @@ void VPBundleRecipe::unbundle() {
BundledOps.size() == 5) {
// Note that we will drop the extend after mul which transforms
// reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
+ // TODO: This transform should be done separately from bundling/unbundling.
auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
@@ -2602,7 +2603,7 @@ void VPBundleRecipe::unbundle() {
if (Ext0 != Ext1) {
Op1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
Ext2->getResultType(), *Ext1, getDebugLoc());
- Op1->insertBefore(Ext0);
+ Op1->insertBefore(Ext1);
}
auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index 212340fdcda26..e8af144498659 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -115,20 +115,20 @@ define i64 @add_i16_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483640
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
@@ -180,20 +180,20 @@ define i64 @add_i8_i64(ptr nocapture readonly %x, i32 %n) #0 {
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483632
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP1]])
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
>From 1b7bf4b989c7e0a775bf55d85d1ce4223ce21a74 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 17 Jun 2025 12:17:22 +0100
Subject: [PATCH 3/4] !fixup fix formatting
---
llvm/lib/Transforms/Vectorize/VPlan.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 00412fb70aa0a..5f560530400ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2428,6 +2428,7 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
}
setUnderlyingValue(I);
}
+
public:
VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
>From ac1a2dca83311b99441d697c322ccdbc46923be2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 20 Jun 2025 08:08:07 +0100
Subject: [PATCH 4/4] !fixup deep-clone whole bundle
---
llvm/lib/Transforms/Vectorize/VPlan.h | 85 +++++++----------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 5 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 94 ++++++++++++++-----
3 files changed, 108 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 39e4edeab869b..86b055f682052 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2697,7 +2697,10 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
/// A recipe to combine multiple recipes into a 'bundle' recipe, which should be
/// considered as single entity for cost-modeling and transforms. The recipe
/// needs to be 'unbundled', i.e. replaced by its individual recipes before
-/// execute.
+/// execute. The bundled recipes are completely disconnected from the def-use graph
+/// outside the bundled recipes. Operands not defined by recipes in the bundle
+/// are added as operands of the VPBundleRecipe and the users of the result
+/// recipe must be updated to use the VPBundleRecipe.
class VPBundleRecipe : public VPSingleDefRecipe {
enum class BundleTypes {
ExtendedReduction,
@@ -2705,7 +2708,7 @@ class VPBundleRecipe : public VPSingleDefRecipe {
};
/// Recipes bundled together in this VPBundleRecipe.
- SmallVector<VPSingleDefRecipe *> BundledOps;
+ SmallVector<VPSingleDefRecipe *> BundledRecipes;
/// Temporary VPValues used for external operands of the bundle, i.e. operands
/// not defined by recipes in the bundle.
@@ -2714,69 +2717,39 @@ class VPBundleRecipe : public VPSingleDefRecipe {
/// Type of the bundle.
BundleTypes BundleType;
- VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle)
- : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledOps(ToBundle),
+ VPBundleRecipe(BundleTypes BundleType, ArrayRef<VPSingleDefRecipe *> ToBundle,
+ ArrayRef<VPValue *> Operands)
+ : VPSingleDefRecipe(VPDef::VPBundleSC, {}, {}), BundledRecipes(ToBundle),
BundleType(BundleType) {
- // Bundle up the operand recipes.
- SmallPtrSet<VPUser *, 4> BundledUsers;
- for (auto *R : BundledOps)
- BundledUsers.insert(R);
-
- // Recipes in the bundle, except the last one, must only be used inside the
- // bundle. If there other external users, clone the recipes for the bundle.
- for (const auto &[Idx, R] : enumerate(drop_end(ToBundle))) {
- if (all_of(R->users(), [&BundledUsers](VPUser *U) {
- return BundledUsers.contains(U);
- })) {
- if (R->getParent())
- R->removeFromParent();
- continue;
- }
- // The users external to the bundle. Clone the recipe for use in the
- // bundle and update all its in-bundle users.
- VPSingleDefRecipe *Copy = R->clone();
- BundledOps[Idx] = Copy;
- BundledUsers.insert(Copy);
- R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
- return BundledUsers.contains(&U);
- });
- }
- BundledOps.back()->removeFromParent();
-
- // Internalize all external operands to the bundled operations. To do so,
- // create new temporary VPValues for all operands not defined by recipe in
- // the bundle. The original operands are added as operands of the
- // VPBundleRecipe.
- for (auto *R : BundledOps) {
- for (const auto &[Idx, Op] : enumerate(R->operands())) {
- auto *Def = Op->getDefiningRecipe();
- if (Def && BundledUsers.contains(Def))
- continue;
- addOperand(Op);
- TmpValues.push_back(new VPValue());
- R->setOperand(Idx, TmpValues.back());
- }
- }
+ bundle(Operands);
}
+ /// Internalize recipes in BundledRecipes. External operands (i.e. not defined
+ /// by another recipe in the bundle) are replaced by temporary VPValues and
+ /// the original operands are transferred to the VPBundleRecipe itself. Clone
+ /// recipes as needed to ensure they are only used by other recipes in the
+ /// bundle. If \p Operands is not empty, use it as operands for the new
+ /// VPBundleRecipe (used when cloning the recipe).
+ void bundle(ArrayRef<VPValue *> Operands);
+
public:
VPBundleRecipe(VPWidenCastRecipe *Ext, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}) {}
+ : VPBundleRecipe(BundleTypes::ExtendedReduction, {Ext, Red}, {}) {}
VPBundleRecipe(VPWidenRecipe *Mul, VPReductionRecipe *Red)
- : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}) {}
+ : VPBundleRecipe(BundleTypes::MulAccumulateReduction, {Mul, Red}, {}) {}
VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
VPWidenRecipe *Mul, VPReductionRecipe *Red)
: VPBundleRecipe(BundleTypes::MulAccumulateReduction,
- {Ext0, Ext1, Mul, Red}) {}
+ {Ext0, Ext1, Mul, Red}, {}) {}
VPBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
VPWidenRecipe *Mul, VPWidenCastRecipe *Ext2,
VPReductionRecipe *Red)
: VPBundleRecipe(BundleTypes::MulAccumulateReduction,
- {Ext0, Ext1, Mul, Ext2, Red}) {}
+ {Ext0, Ext1, Mul, Ext2, Red}, {}) {}
~VPBundleRecipe() override {
SmallPtrSet<VPRecipeBase *, 4> Seen;
- for (auto *R : reverse(BundledOps))
+ for (auto *R : reverse(BundledRecipes))
if (Seen.insert(R).second)
delete R;
for (VPValue *T : TmpValues)
@@ -2786,13 +2759,21 @@ class VPBundleRecipe : public VPSingleDefRecipe {
VP_CLASSOF_IMPL(VPDef::VPBundleSC)
VPBundleRecipe *clone() override {
- assert(!BundledOps.empty() && "empty bundles should be removed");
- return new VPBundleRecipe(BundleType, BundledOps);
+ assert(!BundledRecipes.empty() && "empty bundles should be removed");
+ SmallVector<VPSingleDefRecipe *> NewBundledRecipes;
+ for (auto *R : BundledRecipes)
+ NewBundledRecipes.push_back(R->clone());
+ for (auto *New : NewBundledRecipes) {
+ for (const auto &[Idx, Old] : enumerate(BundledRecipes)) {
+ New->replaceUsesOfWith(Old, NewBundledRecipes[Idx]);
+ }
+ }
+ return new VPBundleRecipe(BundleType, NewBundledRecipes, operands());
}
/// Return the VPSingleDefRecipe producing the final result of the bundled
/// recipe.
- VPSingleDefRecipe *getResultOp() const { return BundledOps.back(); }
+ VPSingleDefRecipe *getResultRecipe() const { return BundledRecipes.back(); }
/// Insert the bundled recipes back into the VPlan, directly before the
/// current recipe. Leaves the bundle recipe empty and the recipe must be
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 1e2961c5beb56..21f90d96ed9ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -269,8 +269,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPBundleRecipe>([this](const auto *R) {
unsigned RdxOpIdxOffset =
- cast<VPReductionRecipe>(R->getResultOp())->isConditional() ? 2
- : 1;
+ cast<VPReductionRecipe>(R->getResultRecipe())->isConditional()
+ ? 2
+ : 1;
return inferScalarType(
R->getOperand(R->getNumOperands() - RdxOpIdxOffset));
})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8931fcb80d51e..4d6ac4e5a1205 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2440,24 +2440,74 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
Ctx.CostKind);
}
+void VPBundleRecipe::bundle(ArrayRef<VPValue *> Operands) {
+ assert(!BundledRecipes.empty() && "Nothing to bundle?");
+
+ // Bundle up the operand recipes.
+ SmallPtrSet<VPUser *, 4> BundledUsers;
+ for (auto *R : BundledRecipes)
+ BundledUsers.insert(R);
+
+ // Recipes in the bundle, except the last one, must only be used inside the
+ // bundle. If there are other external users, clone the recipes for the bundle.
+ for (unsigned Idx = 0; Idx != BundledRecipes.size() - 1; ++Idx) {
+ VPSingleDefRecipe *R = BundledRecipes[Idx];
+ if (all_of(R->users(), [&BundledUsers](VPUser *U) {
+ return BundledUsers.contains(U);
+ })) {
+ if (R->getParent())
+ R->removeFromParent();
+ continue;
+ }
+ // There are users external to the bundle. Clone the recipe for use in the
+ // bundle and update all its in-bundle users.
+ VPSingleDefRecipe *Copy = R->clone();
+ BundledRecipes[Idx] = Copy;
+ BundledUsers.insert(Copy);
+ R->replaceUsesWithIf(Copy, [&BundledUsers](VPUser &U, unsigned) {
+ return BundledUsers.contains(&U);
+ });
+ }
+ if (BundledRecipes.back()->getParent())
+ BundledRecipes.back()->removeFromParent();
+
+ // Internalize all external operands to the bundled operations. To do so,
+ // create new temporary VPValues for all operands not defined by recipe in
+ // the bundle. The original operands are added as operands of the
+ // VPBundleRecipe.
+ for (auto *R : BundledRecipes) {
+ for (const auto &[Idx, Op] : enumerate(R->operands())) {
+ auto *Def = Op->getDefiningRecipe();
+ if (Def && BundledUsers.contains(Def))
+ continue;
+ if (Operands.empty())
+ addOperand(Op);
+ else
+ addOperand(Operands[TmpValues.size()]);
+ TmpValues.push_back(new VPValue());
+ R->setOperand(Idx, TmpValues.back());
+ }
+ }
+}
+
void VPBundleRecipe::unbundle() {
- for (auto *Op : BundledOps)
- if (!Op->getParent())
- Op->insertBefore(this);
+ for (auto *R : BundledRecipes)
+ if (!R->getParent())
+ R->insertBefore(this);
for (const auto &[Idx, Op] : enumerate(operands()))
TmpValues[Idx]->replaceAllUsesWith(Op);
- replaceAllUsesWith(getResultOp());
+ replaceAllUsesWith(getResultRecipe());
if (BundleType == BundleTypes::MulAccumulateReduction &&
- BundledOps.size() == 5) {
+ BundledRecipes.size() == 5) {
// Note that we will drop the extend after mul which transforms
// reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
// TODO: This transform should be done separately from bundling/unbundling.
- auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
- auto *Ext1 = cast<VPWidenCastRecipe>(BundledOps[1]);
- auto *Ext2 = cast<VPWidenCastRecipe>(BundledOps[3]);
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
+ auto *Ext1 = cast<VPWidenCastRecipe>(BundledRecipes[1]);
+ auto *Ext2 = cast<VPWidenCastRecipe>(BundledRecipes[3]);
auto *Op0 =
new VPWidenCastRecipe(Ext0->getOpcode(), Ext0->getOperand(0),
Ext2->getResultType(), *Ext0, getDebugLoc());
@@ -2469,8 +2519,8 @@ void VPBundleRecipe::unbundle() {
Ext2->getResultType(), *Ext1, getDebugLoc());
Op1->insertBefore(Ext1);
}
- auto *Mul = cast<VPWidenRecipe>(BundledOps[2]);
- auto *Red = cast<VPReductionRecipe>(BundledOps[4]);
+ auto *Mul = cast<VPWidenRecipe>(BundledRecipes[2]);
+ auto *Red = cast<VPReductionRecipe>(BundledRecipes[4]);
Mul->setOperand(0, Op0);
Mul->setOperand(1, Op1);
Red->setOperand(1, Mul);
@@ -2479,7 +2529,7 @@ void VPBundleRecipe::unbundle() {
if (Ext0 != Ext1)
Ext1->eraseFromParent();
}
- BundledOps.clear();
+ BundledRecipes.clear();
}
InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
@@ -2492,17 +2542,17 @@ InstructionCost VPBundleRecipe::computeCost(ElementCount VF,
switch (BundleType) {
case BundleTypes::ExtendedReduction: {
unsigned Opcode = RecurrenceDescriptor::getOpcode(
- cast<VPReductionRecipe>(BundledOps[1])->getRecurrenceKind());
+ cast<VPReductionRecipe>(BundledRecipes[1])->getRecurrenceKind());
return Ctx.TTI.getExtendedReductionCost(
Opcode,
- cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ cast<VPWidenCastRecipe>(BundledRecipes.front())->getOpcode() ==
Instruction::ZExt,
RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
}
case BundleTypes::MulAccumulateReduction:
return Ctx.TTI.getMulAccReductionCost(
- BundledOps.size() > 2
- ? cast<VPWidenCastRecipe>(BundledOps.front())->getOpcode() ==
+ BundledRecipes.size() > 2
+ ? cast<VPWidenCastRecipe>(BundledRecipes.front())->getOpcode() ==
Instruction::ZExt
: false,
RedTy, SrcVecTy, Ctx.CostKind);
@@ -2516,7 +2566,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "BUNDLE ";
printAsOperand(O, SlotTracker);
O << " = ";
- auto *Red = cast<VPReductionRecipe>(BundledOps.back());
+ auto *Red = cast<VPReductionRecipe>(BundledRecipes.back());
unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
switch (BundleType) {
@@ -2527,7 +2577,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(0)->printAsOperand(O, SlotTracker);
Red->printFlags(O);
- auto *Ext0 = cast<VPWidenCastRecipe>(BundledOps[0]);
+ auto *Ext0 = cast<VPWidenCastRecipe>(BundledRecipes[0]);
O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType();
if (Red->isConditional()) {
@@ -2545,16 +2595,16 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
<< " (";
O << "mul";
- auto *Mul = cast<VPWidenRecipe>(BundledOps.size() == 2 ? BundledOps[0]
- : BundledOps[2]);
+ auto *Mul = cast<VPWidenRecipe>(
+ BundledRecipes.size() == 2 ? BundledRecipes[0] : BundledRecipes[2]);
Mul->printFlags(O);
- bool IsExtended = BundledOps.size() > 2;
+ bool IsExtended = BundledRecipes.size() > 2;
if (IsExtended)
O << "(";
getOperand(0)->printAsOperand(O, SlotTracker);
if (IsExtended) {
auto *Ext0 = cast<VPWidenCastRecipe>(
- BundledOps.size() == 5 ? BundledOps[3] : BundledOps[0]);
+ BundledRecipes.size() == 5 ? BundledRecipes[3] : BundledRecipes[0]);
O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
<< *Ext0->getResultType() << "), (";
} else {
@@ -2563,7 +2613,7 @@ void VPBundleRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(1)->printAsOperand(O, SlotTracker);
if (IsExtended) {
auto *Ext1 = cast<VPWidenCastRecipe>(
- BundledOps.size() == 5 ? BundledOps[3] : BundledOps[1]);
+ BundledRecipes.size() == 5 ? BundledRecipes[3] : BundledRecipes[1]);
O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
<< *Ext1->getResultType() << ")";
}
More information about the llvm-commits
mailing list